linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH][RFC][4/4] IB: userspace verbs Kconfig/Makefile changes
  2005-04-04 22:09     ` [PATCH][RFC][3/4] IB: userspace verbs mthca changes Roland Dreier
@ 2005-04-04 22:09       ` Roland Dreier
  2005-04-04 22:49       ` [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes Tom Duffy
  2005-04-21  0:37       ` [PATCH][MTHCA] fix sparc build WAS: " Tom Duffy
  2 siblings, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-04 22:09 UTC (permalink / raw)
  To: linux-kernel, openib-general

Hook userspace verbs up to Kconfig and Makefile.

Signed-off-by: Roland Dreier <roland@topspin.com>

--- linux-export.orig/drivers/infiniband/Kconfig	2005-04-04 14:58:53.397756926 -0700
+++ linux-export/drivers/infiniband/Kconfig	2005-04-04 15:01:08.716332258 -0700
@@ -7,6 +7,14 @@
 	  any protocols you wish to use as well as drivers for your
 	  InfiniBand hardware.
 
+config INFINIBAND_USER_VERBS
+	tristate "InfiniBand userspace verbs support"
+	depends on INFINIBAND
+	---help---
+	  Userspace InfiniBand verbs support.  This is the kernel side
+	  of userspace verbs.  You will also need libibverbs and a
+	  hardware driver library from <http://www.openib.org>.
+
 source "drivers/infiniband/hw/mthca/Kconfig"
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
--- linux-export.orig/drivers/infiniband/core/Makefile	2005-04-04 14:58:53.398756709 -0700
+++ linux-export/drivers/infiniband/core/Makefile	2005-04-04 15:00:44.933503748 -0700
@@ -1,7 +1,8 @@
 EXTRA_CFLAGS += -Idrivers/infiniband/include
 
-obj-$(CONFIG_INFINIBAND) +=	ib_core.o ib_mad.o ib_ping.o \
-				ib_cm.o ib_sa.o ib_umad.o ib_ucm.o
+obj-$(CONFIG_INFINIBAND) +=		ib_core.o ib_mad.o ib_ping.o \
+					ib_cm.o ib_sa.o ib_umad.o ib_ucm.o
+obj-$(CONFIG_INFINIBAND_USER_VERBS) +=	ib_uverbs.o
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
 				device.o fmr_pool.o cache.o
@@ -16,4 +17,6 @@
 
 ib_umad-y :=			user_mad.o
 
+ib_uverbs-y :=			uverbs_main.o uverbs_cmd.o uverbs_mem.o
+
 ib_ucm-y :=			ucm.o


^ permalink raw reply	[flat|nested] 144+ messages in thread

* [PATCH][RFC][1/4] IB: core changes for userspace verbs
  2005-04-04 22:09 [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Roland Dreier
@ 2005-04-04 22:09 ` Roland Dreier
  2005-04-04 22:09   ` [PATCH][RFC][2/4] IB: userspace verbs main module Roland Dreier
  2005-04-11 14:22 ` [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Troy Benjegerdes
  1 sibling, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-04 22:09 UTC (permalink / raw)
  To: linux-kernel, openib-general

Add new structs and struct members required by userspace verbs to IB core.

Signed-off-by: Roland Dreier <roland@topspin.com>

--- linux-export.orig/drivers/infiniband/core/verbs.c	2005-01-11 09:35:27.046388000 -0800
+++ linux-export/drivers/infiniband/core/verbs.c	2005-04-04 14:50:59.579791210 -0700
@@ -47,10 +47,11 @@
 {
 	struct ib_pd *pd;
 
-	pd = device->alloc_pd(device);
+	pd = device->alloc_pd(device, NULL, NULL, 0);
 
 	if (!IS_ERR(pd)) {
-		pd->device = device;
+		pd->device  = device;
+		pd->uobject = NULL;
 		atomic_set(&pd->usecnt, 0);
 	}
 
@@ -76,8 +77,9 @@
 	ah = pd->device->create_ah(pd, ah_attr);
 
 	if (!IS_ERR(ah)) {
-		ah->device = pd->device;
-		ah->pd     = pd;
+		ah->device  = pd->device;
+		ah->pd      = pd;
+		ah->uobject = NULL;
 		atomic_inc(&pd->usecnt);
 	}
 
@@ -122,7 +124,7 @@
 {
 	struct ib_qp *qp;
 
-	qp = pd->device->create_qp(pd, qp_init_attr);
+	qp = pd->device->create_qp(pd, qp_init_attr, NULL, 0);
 
 	if (!IS_ERR(qp)) {
 		qp->device     	  = pd->device;
@@ -130,6 +132,7 @@
 		qp->send_cq    	  = qp_init_attr->send_cq;
 		qp->recv_cq    	  = qp_init_attr->recv_cq;
 		qp->srq	       	  = qp_init_attr->srq;
+		qp->uobject       = NULL;
 		qp->event_handler = qp_init_attr->event_handler;
 		qp->qp_context    = qp_init_attr->qp_context;
 		qp->qp_type	  = qp_init_attr->qp_type;
@@ -197,10 +200,11 @@
 {
 	struct ib_cq *cq;
 
-	cq = device->create_cq(device, cqe);
+	cq = device->create_cq(device, cqe, NULL, NULL, 0);
 
 	if (!IS_ERR(cq)) {
 		cq->device        = device;
+		cq->uobject       = NULL;
 		cq->comp_handler  = comp_handler;
 		cq->event_handler = event_handler;
 		cq->cq_context    = cq_context;
@@ -245,8 +249,9 @@
 	mr = pd->device->get_dma_mr(pd, mr_access_flags);
 
 	if (!IS_ERR(mr)) {
-		mr->device = pd->device;
-		mr->pd     = pd;
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
 		atomic_inc(&pd->usecnt);
 		atomic_set(&mr->usecnt, 0);
 	}
@@ -267,8 +272,9 @@
 				     mr_access_flags, iova_start);
 
 	if (!IS_ERR(mr)) {
-		mr->device = pd->device;
-		mr->pd     = pd;
+		mr->device  = pd->device;
+		mr->pd      = pd;
+		mr->uobject = NULL;
 		atomic_inc(&pd->usecnt);
 		atomic_set(&mr->usecnt, 0);
 	}
@@ -344,8 +350,9 @@
 
 	mw = pd->device->alloc_mw(pd);
 	if (!IS_ERR(mw)) {
-		mw->device = pd->device;
-		mw->pd     = pd;
+		mw->device  = pd->device;
+		mw->pd      = pd;
+		mw->uobject = NULL;
 		atomic_inc(&pd->usecnt);
 	}
 
--- linux-export.orig/drivers/infiniband/include/ib_verbs.h	2005-02-22 10:14:06.623746000 -0800
+++ linux-export/drivers/infiniband/include/ib_verbs.h	2005-04-04 14:50:42.054602327 -0700
@@ -41,7 +41,9 @@
 
 #include <linux/types.h>
 #include <linux/device.h>
+
 #include <asm/atomic.h>
+#include <asm/scatterlist.h>
 
 union ib_gid {
 	u8	raw[16];
@@ -618,29 +620,78 @@
 	u8	page_size;
 };
 
+struct ib_ucontext {
+	struct ib_device       *device;
+	struct list_head	pd_list;
+	struct list_head	mr_list;
+	struct list_head	mw_list;
+	struct list_head	cq_list;
+	struct list_head	qp_list;
+	struct list_head	srq_list;
+	struct list_head	ah_list;
+	spinlock_t              lock;
+};
+
+struct ib_uobject {
+	u64			user_handle;	/* handle given to us by userspace */
+	struct ib_ucontext     *context;	/* associated user context */
+	struct list_head	list;		/* link to context's list */
+	u32			id;		/* index into kernel idr */
+};
+
+struct ib_umem {
+	unsigned long		user_base;
+	unsigned long		virt_base;
+	size_t			length;
+	int			offset;
+	int			page_size;
+	struct list_head	chunk_list;
+};
+
+struct ib_umem_chunk {
+	struct list_head	list;
+	int                     nents;
+	int                     nmap;
+	struct scatterlist      page_list[0];
+};
+
+#define IB_UMEM_MAX_PAGE_CHUNK						\
+	((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) /	\
+	 ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] -	\
+	  (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
+
+struct ib_umem_object {
+	struct ib_uobject	uobject;
+	struct ib_umem		umem;
+};
+
 struct ib_pd {
-	struct ib_device *device;
-	atomic_t          usecnt; /* count all resources */
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	atomic_t          	usecnt; /* count all resources */
 };
 
 struct ib_ah {
 	struct ib_device	*device;
 	struct ib_pd		*pd;
+	struct ib_uobject      *uobject;
 };
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
 struct ib_cq {
-	struct ib_device *device;
-	ib_comp_handler   comp_handler;
-	void             (*event_handler)(struct ib_event *, void *);
-	void *            cq_context;
-	int               cqe;
-	atomic_t          usecnt; /* count number of work queues */
+	struct ib_device       *device;
+	struct ib_uobject      *uobject;
+	ib_comp_handler   	comp_handler;
+	void                  (*event_handler)(struct ib_event *, void *);
+	void *            	cq_context;
+	int               	cqe;
+	atomic_t          	usecnt; /* count number of work queues */
 };
 
 struct ib_srq {
 	struct ib_device	*device;
+	struct ib_uobject	*uobject;
 	struct ib_pd		*pd;
 	void			*srq_context;
 	atomic_t		usecnt;
@@ -652,6 +703,7 @@
 	struct ib_cq	       *send_cq;
 	struct ib_cq	       *recv_cq;
 	struct ib_srq	       *srq;
+	struct ib_uobject      *uobject;
 	void                  (*event_handler)(struct ib_event *, void *);
 	void		       *qp_context;
 	u32			qp_num;
@@ -659,16 +711,18 @@
 };
 
 struct ib_mr {
-	struct ib_device *device;
-	struct ib_pd     *pd;
-	u32		  lkey;
-	u32		  rkey;
-	atomic_t          usecnt; /* count number of MWs */
+	struct ib_device  *device;
+	struct ib_pd	  *pd;
+	struct ib_uobject *uobject;
+	u32		   lkey;
+	u32		   rkey;
+	atomic_t	   usecnt; /* count number of MWs */
 };
 
 struct ib_mw {
 	struct ib_device	*device;
 	struct ib_pd		*pd;
+	struct ib_uobject	*uobject;
 	u32			rkey;
 };
 
@@ -737,7 +791,12 @@
 	int		           (*modify_port)(struct ib_device *device,
 						  u8 port_num, int port_modify_mask,
 						  struct ib_port_modify *port_modify);
-	struct ib_pd *             (*alloc_pd)(struct ib_device *device);
+	struct ib_ucontext *       (*alloc_ucontext)(struct ib_device *device,
+						     const void __user *udata, int udatalen);
+	int                        (*dealloc_ucontext)(struct ib_ucontext *context);
+	struct ib_pd *             (*alloc_pd)(struct ib_device *device,
+					       struct ib_ucontext *context,
+					       const void __user *udata, int udatalen);
 	int                        (*dealloc_pd)(struct ib_pd *pd);
 	struct ib_ah *             (*create_ah)(struct ib_pd *pd,
 						struct ib_ah_attr *ah_attr);
@@ -747,7 +806,8 @@
 					       struct ib_ah_attr *ah_attr);
 	int                        (*destroy_ah)(struct ib_ah *ah);
 	struct ib_qp *             (*create_qp)(struct ib_pd *pd,
-						struct ib_qp_init_attr *qp_init_attr);
+						struct ib_qp_init_attr *qp_init_attr,
+						const void __user *udata, int udatalen);
 	int                        (*modify_qp)(struct ib_qp *qp,
 						struct ib_qp_attr *qp_attr,
 						int qp_attr_mask);
@@ -762,8 +822,9 @@
 	int                        (*post_recv)(struct ib_qp *qp,
 						struct ib_recv_wr *recv_wr,
 						struct ib_recv_wr **bad_recv_wr);
-	struct ib_cq *             (*create_cq)(struct ib_device *device,
-						int cqe);
+	struct ib_cq *             (*create_cq)(struct ib_device *device, int cqe,
+						struct ib_ucontext *context,
+						const void __user *udata, int udatalen);
 	int                        (*destroy_cq)(struct ib_cq *cq);
 	int                        (*resize_cq)(struct ib_cq *cq, int *cqe);
 	int                        (*poll_cq)(struct ib_cq *cq, int num_entries,
@@ -780,6 +841,11 @@
 						  int num_phys_buf,
 						  int mr_access_flags,
 						  u64 *iova_start);
+	struct ib_mr *             (*reg_user_mr)(struct ib_pd *pd,
+						  struct ib_umem *region,
+						  int mr_access_flags,
+						  const void __user *udata,
+						  int udatalen);
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
@@ -816,7 +882,10 @@
 						  struct ib_grh *in_grh,
 						  struct ib_mad *in_mad,
 						  struct ib_mad *out_mad);
+	int                        (*mmap)(struct ib_ucontext *context,
+					   struct vm_area_struct *vma);
 
+	struct module               *owner;
 	struct class_device          class_dev;
 	struct kobject               ports_parent;
 	struct list_head             port_list;


^ permalink raw reply	[flat|nested] 144+ messages in thread

* [PATCH][RFC][2/4] IB: userspace verbs main module
  2005-04-04 22:09 ` [PATCH][RFC][1/4] IB: core changes for userspace verbs Roland Dreier
@ 2005-04-04 22:09   ` Roland Dreier
  2005-04-04 22:09     ` [PATCH][RFC][3/4] IB: userspace verbs mthca changes Roland Dreier
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-04 22:09 UTC (permalink / raw)
  To: linux-kernel, openib-general

Add device-independent userspace verbs support (ib_uverbs module).

Signed-off-by: Roland Dreier <roland@topspin.com>

--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/core/uverbs.h	2005-04-04 14:55:10.496227053 -0700
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: uverbs.h 2001 2005-03-16 04:15:41Z roland $
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+/* Include device.h and fs.h until cdev.h is self-sufficient */
+#include <linux/fs.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+
+#include <ib_verbs.h>
+#include <ib_user_verbs.h>
+
+struct ib_uverbs_device {
+	int                 devnum;
+	struct cdev         dev;
+	struct class_device class_dev;
+	struct ib_device   *ib_dev;
+	int                 num_comp;
+};
+
+struct ib_uverbs_event_file {
+	struct ib_uverbs_file *uverbs_file;
+	spinlock_t             lock;
+	int                    fd;
+	int                    is_async;
+	wait_queue_head_t      poll_wait;
+	struct list_head       event_list;
+};
+
+struct ib_uverbs_file {
+	struct kref                 ref;
+	struct ib_uverbs_device    *device;
+	struct ib_ucontext         *ucontext;
+	struct ib_event_handler     event_handler;
+	struct ib_uverbs_event_file async_file; 
+	struct ib_uverbs_event_file comp_file[1]; 
+};
+
+struct ib_uverbs_async_event {
+	struct ib_uverbs_async_event_desc desc;
+	struct list_head                  list;
+};
+
+struct ib_uverbs_comp_event {
+	struct ib_uverbs_comp_event_desc desc;
+	struct list_head                 list;
+};
+
+struct ib_uobject_mr {
+	struct ib_uobject   uobj;
+	struct page        *page_list;
+	struct scatterlist *sg_list;
+};
+
+extern struct semaphore ib_uverbs_idr_mutex;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+
+int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
+		void *addr, size_t size);
+void ib_umem_release(struct ib_device *dev, struct ib_umem *umem);
+
+#define IB_UVERBS_DECLARE_CMD(name)					\
+	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\
+				 const char __user *buf, int in_len,	\
+				 int out_len)
+
+IB_UVERBS_DECLARE_CMD(query_params);
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+
+#endif /* UVERBS_H */
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/core/uverbs_cmd.c	2005-04-04 14:53:12.136965074 -0700
@@ -0,0 +1,790 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: uverbs_cmd.c 1995 2005-03-15 19:25:10Z roland $
+ */
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+ssize_t ib_uverbs_query_params(struct ib_uverbs_file *file,
+			       const char __user *buf,
+			       int in_len, int out_len)
+{
+	struct ib_uverbs_query_params      cmd;
+	struct ib_uverbs_query_params_resp resp;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	resp.num_cq_events = file->device->num_comp;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp))
+	    return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+			      const char __user *buf,
+			      int in_len, int out_len)
+{
+	struct ib_uverbs_get_context       cmd;
+	struct ib_uverbs_get_context_resp *resp;
+	struct ib_device                  *ibdev = file->device->ib_dev;
+	int outsz;
+	int i;
+	int ret = in_len;
+
+	outsz = sizeof *resp + (file->device->num_comp - 1) * sizeof (__u32);
+
+	if (out_len < outsz)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	resp = kmalloc(outsz, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	file->ucontext = ibdev->alloc_ucontext(ibdev, buf + sizeof cmd,
+					       in_len - sizeof cmd -
+					       sizeof (struct ib_uverbs_cmd_hdr));
+	if (IS_ERR(file->ucontext)) {
+		ret = PTR_ERR(file->ucontext);
+		file->ucontext = NULL;
+		kfree(resp);
+		return ret;
+	}
+
+	file->ucontext->device = ibdev;
+	INIT_LIST_HEAD(&file->ucontext->pd_list);
+	INIT_LIST_HEAD(&file->ucontext->mr_list);
+	INIT_LIST_HEAD(&file->ucontext->mw_list);
+	INIT_LIST_HEAD(&file->ucontext->cq_list);
+	INIT_LIST_HEAD(&file->ucontext->qp_list);
+	INIT_LIST_HEAD(&file->ucontext->srq_list);
+	INIT_LIST_HEAD(&file->ucontext->ah_list);
+	spin_lock_init(&file->ucontext->lock);
+
+	resp->async_fd  = file->async_file.fd;
+	for (i = 0; i < file->device->num_comp; ++i)
+		resp->cq_fd[i] = file->comp_file[i].fd;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, outsz)) {
+		ibdev->dealloc_ucontext(file->ucontext);
+		file->ucontext = NULL;
+		ret = -EFAULT;
+	}
+
+	kfree(resp);
+	return ret;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_query_port      cmd;
+	struct ib_uverbs_query_port_resp resp;
+	struct ib_port_attr              attr;
+	int                              ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	if (ret)
+		return ret;
+
+	resp.state 	     = attr.state;
+	resp.max_mtu 	     = attr.max_mtu;
+	resp.active_mtu      = attr.active_mtu;
+	resp.gid_tbl_len     = attr.gid_tbl_len;
+	resp.port_cap_flags  = attr.port_cap_flags;
+	resp.max_msg_sz      = attr.max_msg_sz;
+	resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
+	resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
+	resp.pkey_tbl_len    = attr.pkey_tbl_len;
+	resp.lid 	     = attr.lid;
+	resp.sm_lid 	     = attr.sm_lid;
+	resp.lmc 	     = attr.lmc;
+	resp.max_vl_num      = attr.max_vl_num;
+	resp.sm_sl 	     = attr.sm_sl;
+	resp.subnet_timeout  = attr.subnet_timeout;
+	resp.init_type_reply = attr.init_type_reply;
+	resp.active_width    = attr.active_width;
+	resp.active_speed    = attr.active_speed;
+	resp.phys_state      = attr.phys_state;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+			   const char __user *buf,
+			   int in_len, int out_len)
+{
+	struct ib_uverbs_alloc_pd      cmd;
+	struct ib_uverbs_alloc_pd_resp resp;
+	struct ib_uobject             *uobj;
+	struct ib_pd                  *pd;
+	int                            ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	uobj->context = file->ucontext;
+
+	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+					    file->ucontext, buf + sizeof cmd,
+					    in_len - sizeof cmd -
+					    sizeof (struct ib_uverbs_cmd_hdr));
+	if (IS_ERR(pd)) {
+		ret = PTR_ERR(pd);
+		goto err;
+	}
+
+	pd->device  = file->device->ib_dev;
+	pd->uobject = uobj;
+	atomic_set(&pd->usecnt, 0);
+
+retry:
+	if (!idr_pre_get(&ib_uverbs_pd_idr, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto err_pd;
+	}
+
+	down(&ib_uverbs_idr_mutex);
+	ret = idr_get_new(&ib_uverbs_pd_idr, pd, &uobj->id);
+	up(&ib_uverbs_idr_mutex);
+
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		goto err_pd;
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_add_tail(&uobj->list, &file->ucontext->pd_list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	resp.pd_handle = uobj->id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_list;
+	}
+
+	return in_len;
+
+err_list:
+ 	spin_lock_irq(&file->ucontext->lock);
+	list_del(&uobj->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	down(&ib_uverbs_idr_mutex);
+	idr_remove(&ib_uverbs_pd_idr, uobj->id);
+	up(&ib_uverbs_idr_mutex);
+
+err_pd:
+	ib_dealloc_pd(pd);
+
+err:
+	kfree(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_dealloc_pd cmd;
+	struct ib_pd               *pd;
+	int                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	down(&ib_uverbs_idr_mutex);
+
+	pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle);
+	if (!pd || pd->uobject->context != file->ucontext)
+		goto out;
+
+	ret = ib_dealloc_pd(pd);
+	if (ret)
+		goto out;
+
+	idr_remove(&ib_uverbs_pd_idr, cmd.pd_handle);
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_del(&pd->uobject->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	kfree(pd->uobject);
+
+out:
+	up(&ib_uverbs_idr_mutex);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+			 const char __user *buf, int in_len,
+			 int out_len)
+{
+	struct ib_uverbs_reg_mr      cmd;
+	struct ib_uverbs_reg_mr_resp resp;
+	struct ib_umem_object       *obj;
+	struct ib_pd                *pd;
+	struct ib_mr                *mr;
+	int                          ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+		return -EINVAL;
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	obj->uobject.context = file->ucontext;
+
+	ret = ib_umem_get(file->device->ib_dev, &obj->umem,
+			  (void *) (unsigned long) cmd.start,
+			  cmd.length);
+	if (ret)
+		goto err_free;
+
+	obj->umem.virt_base = cmd.hca_va;
+
+	down(&ib_uverbs_idr_mutex);
+
+	pd = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle);
+	if (!pd || pd->uobject->context != file->ucontext) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+
+	if (!pd->device->reg_user_mr) {
+		ret = -ENOSYS;
+		goto err_up;
+	}
+
+	mr = pd->device->reg_user_mr(pd, &obj->umem,
+				     cmd.access_flags,
+				     buf + sizeof cmd,
+				     in_len - sizeof cmd -
+				     sizeof (struct ib_uverbs_cmd_hdr));
+	if (IS_ERR(mr)) {
+		ret = PTR_ERR(mr);
+		goto err_up;
+	}
+
+	mr->device  = pd->device;
+	mr->pd      = pd;
+	mr->uobject = &obj->uobject;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&mr->usecnt, 0);
+
+	resp.lkey = mr->lkey;
+	resp.rkey = mr->rkey;
+
+retry:
+	if (!idr_pre_get(&ib_uverbs_mr_idr, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto err_unreg;
+	}
+
+	ret = idr_get_new(&ib_uverbs_mr_idr, mr, &obj->uobject.id);
+
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		goto err_unreg;
+
+	resp.mr_handle = obj->uobject.id;
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_add_tail(&obj->uobject.list, &file->ucontext->mr_list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_list;
+	}
+
+	up(&ib_uverbs_idr_mutex);
+
+	return in_len;
+
+err_list:
+	spin_lock_irq(&file->ucontext->lock);
+	list_del(&obj->uobject.list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+err_unreg:
+	ib_dereg_mr(mr);
+
+err_up:
+	up(&ib_uverbs_idr_mutex);
+
+	ib_umem_release(file->device->ib_dev, &obj->umem);
+
+err_free:
+	kfree(obj);
+	return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_dereg_mr cmd;
+	struct ib_mr             *mr;
+	struct ib_umem_object    *memobj;
+	int                       ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	down(&ib_uverbs_idr_mutex);
+
+	mr = idr_find(&ib_uverbs_mr_idr, cmd.mr_handle);
+	if (!mr || mr->uobject->context != file->ucontext)
+		goto out;
+	
+	ret = ib_dereg_mr(mr);
+	if (ret)
+		goto out;
+
+	idr_remove(&ib_uverbs_mr_idr, cmd.mr_handle);
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_del(&mr->uobject->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	memobj = container_of(mr->uobject, struct ib_umem_object, uobject);
+	ib_umem_release(file->device->ib_dev, &memobj->umem);
+	kfree(memobj);
+
+out:
+	up(&ib_uverbs_idr_mutex);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_cq      cmd;
+	struct ib_uverbs_create_cq_resp resp;
+	struct ib_uobject              *uobj;
+	struct ib_cq                   *cq;
+	int                             ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	uobj->user_handle = cmd.user_handle;
+	uobj->context     = file->ucontext;
+
+	cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+					     file->ucontext, buf + sizeof cmd,
+					     in_len - sizeof cmd -
+					     sizeof (struct ib_uverbs_cmd_hdr));
+	if (IS_ERR(cq)) {
+		ret = PTR_ERR(cq);
+		goto err;
+	}
+
+	cq->device        = file->device->ib_dev;
+	cq->uobject       = uobj;
+	cq->comp_handler  = ib_uverbs_comp_handler;
+	cq->event_handler = ib_uverbs_cq_event_handler;
+	cq->cq_context    = file;
+	atomic_set(&cq->usecnt, 0);
+
+retry:
+	if (!idr_pre_get(&ib_uverbs_cq_idr, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto err_cq;
+	}
+
+	down(&ib_uverbs_idr_mutex);
+	ret = idr_get_new(&ib_uverbs_cq_idr, cq, &uobj->id);
+	up(&ib_uverbs_idr_mutex);
+
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		goto err_cq;
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_add_tail(&uobj->list, &file->ucontext->cq_list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	resp.cq_handle = uobj->id;
+	resp.cqe       = cq->cqe;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_list;
+	}
+
+	return in_len;
+
+err_list:
+ 	spin_lock_irq(&file->ucontext->lock);
+	list_del(&uobj->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	down(&ib_uverbs_idr_mutex);
+	idr_remove(&ib_uverbs_cq_idr, uobj->id);
+	up(&ib_uverbs_idr_mutex);
+
+err_cq:
+	ib_destroy_cq(cq);
+
+err:
+	kfree(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_cq cmd;
+	struct ib_cq               *cq;
+	int                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	down(&ib_uverbs_idr_mutex);
+
+	cq = idr_find(&ib_uverbs_cq_idr, cmd.cq_handle);
+	if (!cq || cq->uobject->context != file->ucontext)
+		goto out;
+
+	ret = ib_destroy_cq(cq);
+	if (ret)
+		goto out;
+
+	idr_remove(&ib_uverbs_cq_idr, cmd.cq_handle);
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_del(&cq->uobject->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	kfree(cq->uobject);
+
+out:
+	up(&ib_uverbs_idr_mutex);
+
+	return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_create_qp      cmd;
+	struct ib_uverbs_create_qp_resp resp;
+	struct ib_uobject              *uobj;
+	struct ib_pd                   *pd;
+	struct ib_cq                   *scq, *rcq;
+	struct ib_qp                   *qp;
+	struct ib_qp_init_attr          attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+	if (!uobj)
+		return -ENOMEM;
+
+	down(&ib_uverbs_idr_mutex);
+
+	pd  = idr_find(&ib_uverbs_pd_idr, cmd.pd_handle);
+	scq = idr_find(&ib_uverbs_cq_idr, cmd.send_cq_handle);
+	rcq = idr_find(&ib_uverbs_cq_idr, cmd.recv_cq_handle);
+
+	if (!pd  || pd->uobject->context  != file->ucontext ||
+	    !scq || scq->uobject->context != file->ucontext ||
+	    !rcq || rcq->uobject->context != file->ucontext) {
+		ret = -EINVAL;
+		goto err_up;
+	}
+
+	attr.event_handler = ib_uverbs_qp_event_handler;
+	attr.qp_context    = file;
+	attr.send_cq       = scq;
+	attr.recv_cq       = rcq;
+	attr.srq           = NULL;
+	attr.sq_sig_type   = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+	attr.qp_type       = cmd.qp_type;
+
+	attr.cap.max_send_wr     = cmd.max_send_wr;
+	attr.cap.max_recv_wr     = cmd.max_recv_wr;
+	attr.cap.max_send_sge    = cmd.max_send_sge;
+	attr.cap.max_recv_sge    = cmd.max_recv_sge;
+	attr.cap.max_inline_data = cmd.max_inline_data;
+
+	uobj->user_handle = cmd.user_handle;
+	uobj->context     = file->ucontext;
+
+	qp = pd->device->create_qp(pd, &attr, buf + sizeof cmd,
+				   in_len - sizeof cmd -
+				   sizeof (struct ib_uverbs_cmd_hdr));
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto err_up;
+	}
+
+	qp->device     	  = pd->device;
+	qp->pd         	  = pd;
+	qp->send_cq    	  = attr.send_cq;
+	qp->recv_cq    	  = attr.recv_cq;
+	qp->srq	       	  = attr.srq;
+	qp->uobject       = uobj;
+	qp->event_handler = attr.event_handler;
+	qp->qp_context    = attr.qp_context;
+	qp->qp_type	  = attr.qp_type;
+	atomic_inc(&pd->usecnt);
+	atomic_inc(&attr.send_cq->usecnt);
+	atomic_inc(&attr.recv_cq->usecnt);
+	if (attr.srq)
+		atomic_inc(&attr.srq->usecnt);
+
+	resp.qpn = qp->qp_num;
+
+retry:
+	if (!idr_pre_get(&ib_uverbs_qp_idr, GFP_KERNEL)) {
+		ret = -ENOMEM;
+		goto err_destroy;
+	}
+
+	ret = idr_get_new(&ib_uverbs_qp_idr, qp, &uobj->id);
+
+	if (ret == -EAGAIN)
+		goto retry;
+	if (ret)
+		goto err_destroy;
+
+	resp.qp_handle = uobj->id;
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_add_tail(&uobj->list, &file->ucontext->qp_list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_list;
+	}
+
+	up(&ib_uverbs_idr_mutex);
+
+	return in_len;
+
+err_list:
+	spin_lock_irq(&file->ucontext->lock);
+	list_del(&uobj->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+err_destroy:
+	ib_destroy_qp(qp);
+
+err_up:
+	up(&ib_uverbs_idr_mutex);
+
+	kfree(uobj);
+	return ret;
+}
+
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_modify_qp cmd;
+	struct ib_qp              *qp;
+	struct ib_qp_attr         *attr;
+	int                        ret;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	down(&ib_uverbs_idr_mutex);
+
+	qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle);
+	if (!qp || qp->uobject->context != file->ucontext) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	attr->qp_state 		  = cmd.qp_state;
+	attr->cur_qp_state 	  = cmd.cur_qp_state;
+	attr->path_mtu 		  = cmd.path_mtu;
+	attr->path_mig_state 	  = cmd.path_mig_state;
+	attr->qkey 		  = cmd.qkey;
+	attr->rq_psn 		  = cmd.rq_psn;
+	attr->sq_psn 		  = cmd.sq_psn;
+	attr->dest_qp_num 	  = cmd.dest_qp_num;
+	attr->qp_access_flags 	  = cmd.qp_access_flags;
+	attr->pkey_index 	  = cmd.pkey_index;
+	attr->alt_pkey_index 	  = cmd.pkey_index;
+	attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+	attr->max_rd_atomic 	  = cmd.max_rd_atomic;
+	attr->max_dest_rd_atomic  = cmd.max_dest_rd_atomic;
+	attr->min_rnr_timer 	  = cmd.min_rnr_timer;
+	attr->port_num 		  = cmd.port_num;
+	attr->timeout 		  = cmd.timeout;
+	attr->retry_cnt 	  = cmd.retry_cnt;
+	attr->rnr_retry 	  = cmd.rnr_retry;
+	attr->alt_port_num 	  = cmd.alt_port_num;
+	attr->alt_timeout 	  = cmd.alt_timeout;
+
+	memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+	attr->ah_attr.grh.flow_label        = cmd.dest.flow_label;
+	attr->ah_attr.grh.sgid_index        = cmd.dest.sgid_index;
+	attr->ah_attr.grh.hop_limit         = cmd.dest.hop_limit;
+	attr->ah_attr.grh.traffic_class     = cmd.dest.traffic_class;
+	attr->ah_attr.dlid 	    	    = cmd.dest.dlid;
+	attr->ah_attr.sl   	    	    = cmd.dest.sl;
+	attr->ah_attr.src_path_bits 	    = cmd.dest.src_path_bits;
+	attr->ah_attr.static_rate   	    = cmd.dest.static_rate;
+	attr->ah_attr.ah_flags 	    	    = cmd.dest.is_global ? IB_AH_GRH : 0;
+	attr->ah_attr.port_num 	    	    = cmd.dest.port_num;
+
+	memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+	attr->alt_ah_attr.grh.flow_label    = cmd.alt_dest.flow_label;
+	attr->alt_ah_attr.grh.sgid_index    = cmd.alt_dest.sgid_index;
+	attr->alt_ah_attr.grh.hop_limit     = cmd.alt_dest.hop_limit;
+	attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+	attr->alt_ah_attr.dlid 	    	    = cmd.alt_dest.dlid;
+	attr->alt_ah_attr.sl   	    	    = cmd.alt_dest.sl;
+	attr->alt_ah_attr.src_path_bits     = cmd.alt_dest.src_path_bits;
+	attr->alt_ah_attr.static_rate       = cmd.alt_dest.static_rate;
+	attr->alt_ah_attr.ah_flags 	    = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+	attr->alt_ah_attr.port_num 	    = cmd.alt_dest.port_num;
+
+	ret = ib_modify_qp(qp, attr, cmd.attr_mask);
+	if (ret)
+		goto out;
+
+	ret = in_len;
+
+out:
+	up(&ib_uverbs_idr_mutex);
+	kfree(attr);
+
+	return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_destroy_qp cmd;
+	struct ib_qp               *qp;
+	int                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	down(&ib_uverbs_idr_mutex);
+
+	qp = idr_find(&ib_uverbs_qp_idr, cmd.qp_handle);
+	if (!qp || qp->uobject->context != file->ucontext)
+		goto out;
+
+	ret = ib_destroy_qp(qp);
+	if (ret)
+		goto out;
+
+	idr_remove(&ib_uverbs_qp_idr, cmd.qp_handle);
+
+	spin_lock_irq(&file->ucontext->lock);
+	list_del(&qp->uobject->list);
+	spin_unlock_irq(&file->ucontext->lock);
+
+	kfree(qp->uobject);
+
+out:
+	up(&ib_uverbs_idr_mutex);
+
+	return ret ? ret : in_len;
+}
+
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/core/uverbs_main.c	2005-04-04 14:53:17.824728218 -0700
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: uverbs_main.c 2109 2005-04-04 21:10:34Z roland $
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define INFINIBANDEVENTFS_MAGIC	0x49426576	/* "IBev" */
+
+enum {
+	IB_UVERBS_MAJOR       = 231,
+	IB_UVERBS_BASE_MINOR  = 128,
+	IB_UVERBS_MAX_DEVICES = 32
+};
+
+#define IB_UVERBS_BASE_DEV	MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+DECLARE_MUTEX(ib_uverbs_idr_mutex);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+
+static spinlock_t map_lock;
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+				     const char __user *buf, int in_len,
+				     int out_len) = {
+	[IB_USER_VERBS_CMD_QUERY_PARAMS]  = ib_uverbs_query_params,
+	[IB_USER_VERBS_CMD_GET_CONTEXT]   = ib_uverbs_get_context,
+	[IB_USER_VERBS_CMD_QUERY_PORT]    = ib_uverbs_query_port,
+	[IB_USER_VERBS_CMD_ALLOC_PD]      = ib_uverbs_alloc_pd,
+	[IB_USER_VERBS_CMD_DEALLOC_PD]    = ib_uverbs_dealloc_pd,
+	[IB_USER_VERBS_CMD_REG_MR]        = ib_uverbs_reg_mr,
+	[IB_USER_VERBS_CMD_DEREG_MR]      = ib_uverbs_dereg_mr,
+	[IB_USER_VERBS_CMD_CREATE_CQ]     = ib_uverbs_create_cq,
+	[IB_USER_VERBS_CMD_DESTROY_CQ]    = ib_uverbs_destroy_cq,
+	[IB_USER_VERBS_CMD_CREATE_QP]     = ib_uverbs_create_qp,
+	[IB_USER_VERBS_CMD_MODIFY_QP]     = ib_uverbs_modify_qp,
+	[IB_USER_VERBS_CMD_DESTROY_QP]    = ib_uverbs_destroy_qp,
+};
+
+static struct vfsmount *uverbs_event_mnt;
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+static int ib_dealloc_ucontext(struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj, *tmp;
+
+	if (!context)
+		return 0;
+
+	/* Free AHs */
+
+	list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+		struct ib_qp *qp = idr_find(&ib_uverbs_qp_idr, uobj->id);
+		idr_remove(&ib_uverbs_qp_idr, uobj->id);
+		ib_destroy_qp(qp);
+		list_del(&uobj->list);
+		kfree(uobj);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+		struct ib_cq *cq = idr_find(&ib_uverbs_cq_idr, uobj->id);
+		idr_remove(&ib_uverbs_cq_idr, uobj->id);
+		ib_destroy_cq(cq);
+		list_del(&uobj->list);
+		kfree(uobj);
+	}
+
+	/* XXX Free SRQs */
+	/* XXX Free MWs */
+
+	list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+		struct ib_mr *mr = idr_find(&ib_uverbs_mr_idr, uobj->id);
+		struct ib_umem_object *memobj;
+
+		memobj = container_of(uobj, struct ib_umem_object, uobject);
+		ib_umem_release(mr->device, &memobj->umem);
+
+		idr_remove(&ib_uverbs_mr_idr, uobj->id);
+		ib_dereg_mr(mr);
+		list_del(&uobj->list);
+		kfree(memobj);
+	}
+
+	list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+		struct ib_pd *pd = idr_find(&ib_uverbs_pd_idr, uobj->id);
+		idr_remove(&ib_uverbs_pd_idr, uobj->id);
+		ib_dealloc_pd(pd);
+		list_del(&uobj->list);
+		kfree(uobj);
+	}
+
+	return context->device->dealloc_ucontext(context);
+}
+
+static void ib_uverbs_release_file(struct kref *ref)
+{
+	struct ib_uverbs_file *file = 
+		container_of(ref, struct ib_uverbs_file, ref);
+
+	module_put(file->device->ib_dev->owner);
+	kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+				    size_t count, loff_t *pos)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+	void *event;
+	int eventsz;
+	int ret = 0;
+
+	spin_lock_irq(&file->lock);
+
+	while (list_empty(&file->event_list) && file->fd >= 0) {
+		spin_unlock_irq(&file->lock);
+
+		if (filp->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		if (wait_event_interruptible(file->poll_wait,
+					     !list_empty(&file->event_list) ||
+					     file->fd < 0))
+			return -ERESTARTSYS;
+
+		spin_lock_irq(&file->lock);
+	}
+
+	if (file->fd < 0) {
+		spin_unlock_irq(&file->lock);
+		return -ENODEV;
+	}
+
+	if (file->is_async) {
+		event   = list_entry(file->event_list.next,
+				     struct ib_uverbs_async_event, list);
+		eventsz = sizeof (struct ib_uverbs_async_event_desc);
+	} else {
+		event   = list_entry(file->event_list.next,
+				     struct ib_uverbs_comp_event, list);
+		eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+	}
+
+	if (eventsz > count) {
+		ret   = -EINVAL;
+		event = NULL;
+	} else
+		list_del(file->event_list.next);
+
+	spin_unlock_irq(&file->lock);
+
+	if (event) {
+		if (copy_to_user(buf, event, eventsz))
+			ret = -EFAULT;
+		else
+			ret = eventsz;
+	}
+
+	kfree(event);
+
+	return ret;
+}
+
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+					 struct poll_table_struct *wait)
+{
+	unsigned int pollflags = 0;
+	struct ib_uverbs_event_file *file = filp->private_data;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	spin_lock_irq(&file->lock);
+	if (file->fd < 0)
+		pollflags = POLLERR;
+	else if (!list_empty(&file->event_list))
+		pollflags = POLLIN | POLLRDNORM;
+	spin_unlock_irq(&file->lock);
+
+	return pollflags;
+}
+
+static void ib_uverbs_event_release(struct ib_uverbs_event_file *file)
+{
+	struct list_head *entry, *tmp;
+	int put = 0;
+
+	spin_lock_irq(&file->lock);
+	if (file->fd != -1) {
+		put      = 1;
+		file->fd = -1;
+		list_for_each_safe(entry, tmp, &file->event_list)
+			if (file->is_async)
+				kfree(list_entry(entry, struct ib_uverbs_async_event, list));
+			else
+				kfree(list_entry(entry, struct ib_uverbs_comp_event, list));
+	}
+	spin_unlock_irq(&file->lock);
+
+	if (put)
+		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+
+}
+
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_event_file *file = filp->private_data;
+
+	ib_uverbs_event_release(file);
+
+	return 0;
+}
+
+static struct file_operations uverbs_event_fops = {
+	/*
+	 * No .owner field since we artificially create event files,
+	 * so there is no increment to the module reference count in
+	 * the open path.  All event files come from a uverbs command
+	 * file, which already takes a module reference, so this is OK.
+	 */
+	.read 	 = ib_uverbs_event_read,
+	.poll    = ib_uverbs_event_poll,
+	.release = ib_uverbs_event_close
+};
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct ib_uverbs_file       *file = cq_context;
+	struct ib_uverbs_comp_event *entry;
+	unsigned long                flags;
+
+	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	if (!entry)
+		return;
+
+	entry->desc.cq_handle = cq->uobject->user_handle;
+
+	spin_lock_irqsave(&file->comp_file[0].lock, flags);
+	list_add_tail(&entry->list, &file->comp_file[0].event_list);
+	spin_unlock_irqrestore(&file->comp_file[0].lock, flags);
+
+	wake_up_interruptible(&file->comp_file[0].poll_wait);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+
+}
+
+static void ib_uverbs_event_handler(struct ib_event_handler *handler,
+				    struct ib_event *event)
+{
+	struct ib_uverbs_file *file =
+		container_of(handler, struct ib_uverbs_file, event_handler);
+	struct ib_uverbs_async_event *entry;
+	unsigned long flags;
+
+	entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+	if (!entry)
+		return;
+
+	entry->desc.event_type = event->event;
+	entry->desc.element    = event->element.port_num;
+
+	spin_lock_irqsave(&file->async_file.lock, flags);
+	list_add_tail(&entry->list, &file->async_file.event_list);
+	spin_unlock_irqrestore(&file->async_file.lock, flags);
+
+	wake_up_interruptible(&file->async_file.poll_wait);
+}
+
+static int ib_uverbs_event_init(struct ib_uverbs_event_file *file,
+				struct ib_uverbs_file *uverbs_file)
+{
+	struct file *filp;
+
+	spin_lock_init(&file->lock);
+	INIT_LIST_HEAD(&file->event_list);
+	init_waitqueue_head(&file->poll_wait);
+	file->uverbs_file = uverbs_file;
+
+	file->fd = get_unused_fd();
+	if (file->fd < 0)
+		return file->fd;
+
+	filp = get_empty_filp();
+	if (!filp) {
+		put_unused_fd(file->fd);
+		return -ENFILE;
+	}
+
+	filp->f_op 	   = &uverbs_event_fops;
+	filp->f_vfsmnt 	   = mntget(uverbs_event_mnt);
+	filp->f_dentry 	   = dget(uverbs_event_mnt->mnt_root);
+	filp->f_mapping    = filp->f_dentry->d_inode->i_mapping;
+	filp->f_flags      = O_RDONLY;
+	filp->f_mode       = FMODE_READ;
+	filp->private_data = file;
+
+	fd_install(file->fd, filp);
+
+	return 0;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_cmd_hdr hdr;
+
+	if (count < sizeof hdr)
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof hdr))
+		return -EFAULT;
+
+	if (hdr.in_words * 4 != count)
+		return -EINVAL;
+
+	if (hdr.command < 0 || hdr.command >= ARRAY_SIZE(uverbs_cmd_table))
+		return -EINVAL;
+
+	if (!file->ucontext                               &&
+	    hdr.command != IB_USER_VERBS_CMD_QUERY_PARAMS &&
+	    hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT)
+		return -EINVAL;
+
+	return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr,
+					     hdr.in_words * 4, hdr.out_words * 4);
+}
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+
+	return file->device->ib_dev->mmap(file->ucontext, vma);
+}
+
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_device *dev =
+		container_of(inode->i_cdev, struct ib_uverbs_device, dev);
+	struct ib_uverbs_file *file;
+	int i = 0;
+	int ret;
+
+	if (!try_module_get(dev->ib_dev->owner))
+		return -ENODEV;
+
+	file = kmalloc(sizeof *file +
+		       (dev->num_comp - 1) * sizeof (struct ib_uverbs_event_file),
+		       GFP_KERNEL);
+	if (!file)
+		return -ENOMEM;
+
+	file->device = dev;
+	kref_init(&file->ref);
+
+	file->ucontext = NULL;
+
+	ret = ib_uverbs_event_init(&file->async_file, file);
+	if (ret)
+		goto err;
+
+	file->async_file.is_async = 1;
+
+	kref_get(&file->ref);
+
+	for (i = 0; i < dev->num_comp; ++i) {
+		ret = ib_uverbs_event_init(&file->comp_file[i], file);
+		if (ret)
+			goto err_async;
+		kref_get(&file->ref);
+		file->comp_file[i].is_async = 0;
+	}
+
+
+	filp->private_data = file;
+
+	INIT_IB_EVENT_HANDLER(&file->event_handler, dev->ib_dev,
+			      ib_uverbs_event_handler);
+	if (ib_register_event_handler(&file->event_handler))
+		goto err_async;
+
+	return 0;
+
+err_async:
+	while (i--)
+		ib_uverbs_event_release(&file->comp_file[i]);
+
+	ib_uverbs_event_release(&file->async_file);
+
+err:
+	kref_put(&file->ref, ib_uverbs_release_file);
+
+	return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_file *file = filp->private_data;
+	int i;
+
+	ib_unregister_event_handler(&file->event_handler);
+	ib_uverbs_event_release(&file->async_file);
+	ib_dealloc_ucontext(file->ucontext);
+
+	for (i = 0; i < file->device->num_comp; ++i)
+		ib_uverbs_event_release(&file->comp_file[i]);
+
+	kref_put(&file->ref, ib_uverbs_release_file);
+
+	return 0;
+}
+
+static struct file_operations uverbs_fops = {
+	.owner 	 = THIS_MODULE,
+	.write 	 = ib_uverbs_write,
+	.open 	 = ib_uverbs_open,
+	.release = ib_uverbs_close
+};
+
+static struct file_operations uverbs_mmap_fops = {
+	.owner 	 = THIS_MODULE,
+	.write 	 = ib_uverbs_write,
+	.mmap    = ib_uverbs_mmap,
+	.open 	 = ib_uverbs_open,
+	.release = ib_uverbs_close
+};
+
+static struct ib_client uverbs_client = {
+	.name   = "uverbs",
+	.add    = ib_uverbs_add_one,
+	.remove = ib_uverbs_remove_one
+};
+
+static ssize_t show_dev(struct class_device *class_dev, char *buf)
+{
+	struct ib_uverbs_device *dev =
+		container_of(class_dev, struct ib_uverbs_device, class_dev);
+
+	return print_dev_t(buf, dev->dev.dev);
+}
+static CLASS_DEVICE_ATTR(dev, S_IRUGO, show_dev, NULL);
+
+static ssize_t show_ibdev(struct class_device *class_dev, char *buf)
+{
+	struct ib_uverbs_device *dev =
+		container_of(class_dev, struct ib_uverbs_device, class_dev);
+
+	return sprintf(buf, "%s\n", dev->ib_dev->name);
+}
+static CLASS_DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static void ib_uverbs_release_class_dev(struct class_device *class_dev)
+{
+	struct ib_uverbs_device *dev =
+		container_of(class_dev, struct ib_uverbs_device, class_dev);
+
+	cdev_del(&dev->dev);
+	clear_bit(dev->devnum, dev_map);
+	kfree(dev);
+}
+
+static struct class uverbs_class = {
+	.name    = "infiniband_verbs",
+	.release = ib_uverbs_release_class_dev
+};
+
+static ssize_t show_abi_version(struct class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION);
+}
+static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uverbs_dev;
+
+	if (!device->alloc_ucontext)
+		return;
+
+	uverbs_dev = kmalloc(sizeof *uverbs_dev, GFP_KERNEL);
+	if (!uverbs_dev)
+		return;
+
+	memset(uverbs_dev, 0, sizeof *uverbs_dev);
+
+	spin_lock(&map_lock);
+	uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+	if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+		spin_unlock(&map_lock);
+		goto err;
+	}
+	set_bit(uverbs_dev->devnum, dev_map);
+	spin_unlock(&map_lock);
+
+	uverbs_dev->ib_dev   = device;
+	uverbs_dev->num_comp = 1;
+
+	if (device->mmap)
+		cdev_init(&uverbs_dev->dev, &uverbs_mmap_fops);
+	else
+		cdev_init(&uverbs_dev->dev, &uverbs_fops);
+	uverbs_dev->dev.owner = THIS_MODULE;
+	kobject_set_name(&uverbs_dev->dev.kobj, "uverbs%d", uverbs_dev->devnum);
+	if (cdev_add(&uverbs_dev->dev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+		goto err;
+
+	uverbs_dev->class_dev.class = &uverbs_class;
+	uverbs_dev->class_dev.dev   = device->dma_device;
+	snprintf(uverbs_dev->class_dev.class_id, BUS_ID_SIZE, "uverbs%d", uverbs_dev->devnum);
+	if (class_device_register(&uverbs_dev->class_dev))
+		goto err_cdev;
+
+	if (class_device_create_file(&uverbs_dev->class_dev, &class_device_attr_dev))
+		goto err_class;
+	if (class_device_create_file(&uverbs_dev->class_dev, &class_device_attr_ibdev))
+		goto err_class;
+
+	ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+	return;
+
+err_class:
+	class_device_unregister(&uverbs_dev->class_dev);
+
+err_cdev:
+	cdev_del(&uverbs_dev->dev);
+	clear_bit(uverbs_dev->devnum, dev_map);
+
+err:
+	kfree(uverbs_dev);
+	return;
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+
+	if (!uverbs_dev)
+		return;
+
+	class_device_unregister(&uverbs_dev->class_dev);
+}
+
+static struct super_block *uverbs_event_get_sb(struct file_system_type *fs_type, int flags,
+					       const char *dev_name, void *data)
+{
+	return get_sb_pseudo(fs_type, "infinibandevent:", NULL,
+			     INFINIBANDEVENTFS_MAGIC);
+}
+
+static struct file_system_type uverbs_event_fs = {
+	/* No owner field so module can be unloaded */
+	.name    = "infinibandeventfs",
+	.get_sb  = uverbs_event_get_sb,
+	.kill_sb = kill_litter_super
+};
+
+static int __init ib_uverbs_init(void)
+{
+	int ret;
+
+	spin_lock_init(&map_lock);
+
+	ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+				     "infiniband_verbs");
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register device number\n");
+		goto out;
+	}
+
+	ret = class_register(&uverbs_class);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+		goto out_chrdev;
+	}
+
+	ret = class_create_file(&uverbs_class, &class_attr_abi_version);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+		goto out_class;
+	}
+
+	ret = register_filesystem(&uverbs_event_fs);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register infinibandeventfs\n");
+		goto out_class;
+	}
+
+	uverbs_event_mnt = kern_mount(&uverbs_event_fs);
+	if (IS_ERR(uverbs_event_mnt)) {
+		ret = PTR_ERR(uverbs_event_mnt);
+		printk(KERN_ERR "user_verbs: couldn't mount infinibandeventfs\n");
+		goto out_fs;
+	}
+
+	ret = ib_register_client(&uverbs_client);
+	if (ret) {
+		printk(KERN_ERR "user_verbs: couldn't register client\n");
+		goto out_mnt;
+	}
+
+	return 0;
+
+out_mnt:
+	mntput(uverbs_event_mnt);
+
+out_fs:
+	unregister_filesystem(&uverbs_event_fs);
+
+out_class:
+	class_unregister(&uverbs_class);
+
+out_chrdev:
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+	return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+	ib_unregister_client(&uverbs_client);
+	unregister_filesystem(&uverbs_event_fs);
+	mntput(uverbs_event_mnt);
+	class_unregister(&uverbs_class);
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/core/uverbs_mem.c	2005-04-04 14:53:17.825728001 -0700
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: uverbs_mem.c 1979 2005-03-11 21:17:00Z roland $
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include "uverbs.h"
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem)
+{
+	struct ib_umem_chunk *chunk, *tmp;
+	int i;
+
+	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+		dma_unmap_sg(dev->dma_device, chunk->page_list,
+			     chunk->nents, DMA_BIDIRECTIONAL);
+		for (i = 0; i < chunk->nents; ++i) {
+			set_page_dirty_lock(chunk->page_list[i].page);
+			put_page(chunk->page_list[i].page);
+		}
+
+		kfree(chunk);
+	}
+}
+
+static void __ib_umem_unmark(struct ib_umem *umem, struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	unsigned long cur_base;
+
+	vma = find_vma(mm, umem->user_base);
+
+	for (cur_base = umem->user_base;
+	     cur_base < umem->user_base + umem->length;
+	     cur_base = vma->vm_end) {
+		if (!vma || vma->vm_start > umem->user_base + umem->length)
+			break;
+
+		if (!(vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+			vma->vm_flags &= ~VM_DONTCOPY;
+
+		vma = vma->vm_next;
+	}
+}
+
+int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
+		void *addr, size_t size)
+{
+	struct page **page_list;
+	struct vm_area_struct *vma;
+	struct ib_umem_chunk *chunk;
+	unsigned long cur_base;
+	int npages;
+	int ret = 0;
+	int off;
+	int i;
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	mem->user_base = (unsigned long) addr;
+	mem->length    = size;
+	mem->offset    = (unsigned long) addr & ~PAGE_MASK;
+	mem->page_size = PAGE_SIZE;
+
+	INIT_LIST_HEAD(&mem->chunk_list);
+
+	npages   = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	vma = find_vma(current->mm, mem->user_base);
+
+	for (cur_base = mem->user_base;
+	     cur_base < mem->user_base + size;
+	     cur_base = vma->vm_end) {
+		if (!vma || vma->vm_start > cur_base) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (!(vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+			vma->vm_flags |= VM_DONTCOPY;
+
+		vma = vma->vm_next;
+	}
+
+	cur_base = (unsigned long) addr & PAGE_MASK;
+
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+				     min_t(int, npages,
+					   PAGE_SIZE / sizeof (struct page *)),
+				     1, 0, page_list, NULL);
+
+		if (ret < 0)
+			goto out;
+
+		cur_base += ret * PAGE_SIZE;
+		npages   -= ret;
+
+		off = 0;
+
+		while (ret) {
+			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
+					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
+					GFP_KERNEL);
+			if (!chunk) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+			for (i = 0; i < chunk->nents; ++i) {
+				chunk->page_list[i].page   = page_list[i + off];
+				chunk->page_list[i].offset = 0;
+				chunk->page_list[i].length = PAGE_SIZE;
+			}
+
+			chunk->nmap = dma_map_sg(dev->dma_device,
+						 &chunk->page_list[0],
+						 chunk->nents,
+						 DMA_BIDIRECTIONAL);
+			if (chunk->nmap <= 0) {
+				for (i = 0; i < chunk->nents; ++i)
+					put_page(chunk->page_list[i].page);
+				kfree(chunk);
+
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret -= chunk->nents;
+			off += chunk->nents;
+			list_add_tail(&chunk->list, &mem->chunk_list);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (ret < 0) {
+		__ib_umem_unmark(mem, current->mm);
+		__ib_umem_release(dev, mem);
+	}
+
+	up_write(&current->mm->mmap_sem);
+	free_page((unsigned long) page_list);
+
+	return ret;
+}
+
+void ib_umem_release(struct ib_device *dev, struct ib_umem *umem)
+{
+	struct mm_struct *mm;
+
+	mm = get_task_mm(current);
+
+	if (mm) {
+		down_write(&mm->mmap_sem);
+		__ib_umem_unmark(umem, mm);
+	}
+
+	__ib_umem_release(dev, umem);
+
+	if (mm) {
+		up_write(&current->mm->mmap_sem);
+		mmput(mm);
+	}
+}
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/include/ib_user_verbs.h	2005-04-04 14:55:47.946083444 -0700
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: ib_user_verbs.h 2001 2005-03-16 04:15:41Z roland $
+ */
+
+#ifndef IB_USER_VERBS_H
+#define IB_USER_VERBS_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IB_USER_VERBS_ABI_VERSION	1
+
+enum {
+	IB_USER_VERBS_CMD_QUERY_PARAMS,
+	IB_USER_VERBS_CMD_GET_CONTEXT,
+	IB_USER_VERBS_CMD_QUERY_PORT,
+	IB_USER_VERBS_CMD_ALLOC_PD,
+	IB_USER_VERBS_CMD_DEALLOC_PD,
+	IB_USER_VERBS_CMD_REG_MR,
+	IB_USER_VERBS_CMD_DEREG_MR,
+	IB_USER_VERBS_CMD_CREATE_CQ,
+	IB_USER_VERBS_CMD_DESTROY_CQ,
+	IB_USER_VERBS_CMD_CREATE_QP,
+	IB_USER_VERBS_CMD_MODIFY_QP,
+	IB_USER_VERBS_CMD_DESTROY_QP,
+};
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct ib_uverbs_async_event_desc {
+	__u64 element;
+	__u32 event_type;	/* enum ib_event_type */
+	__u32 reserved;
+};
+
+struct ib_uverbs_comp_event_desc {
+	__u64 cq_handle;
+};
+
+/*
+ * All commands from userspace should start with a __u32 command field
+ * followed by __u16 in_words and out_words fields (which give the
+ * length of the command block and response buffer if any in 32-bit
+ * words).  The kernel driver will read these fields first and read
+ * the rest of the command struct based on these value.
+ */
+
+struct ib_uverbs_cmd_hdr {
+	__u32 command;
+	__u16 in_words;
+	__u16 out_words;
+};
+
+/*
+ * No driver_data for "query params" command, since this is intended
+ * to be a core function with no possible device dependence.
+ */
+struct ib_uverbs_query_params {
+	__u64 response;
+};
+
+struct ib_uverbs_query_params_resp {
+	__u32 num_cq_events;
+};
+
+struct ib_uverbs_get_context {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_get_context_resp {
+	__u32 async_fd;
+	__u32 cq_fd[1];
+};
+
+struct ib_uverbs_query_port {
+	__u64 response;
+	__u8  port_num;
+	__u8  reserved[7];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_query_port_resp {
+	__u32 port_cap_flags;
+	__u32 max_msg_sz;
+	__u32 bad_pkey_cntr;
+	__u32 qkey_viol_cntr;
+	__u32 gid_tbl_len;
+	__u16 pkey_tbl_len;
+	__u16 lid;
+	__u16 sm_lid;
+	__u8  state;
+	__u8  max_mtu;
+	__u8  active_mtu;
+	__u8  lmc;
+	__u8  max_vl_num;
+	__u8  sm_sl;
+	__u8  subnet_timeout;
+	__u8  init_type_reply;
+	__u8  active_width;
+	__u8  active_speed;
+	__u8  phys_state;
+	__u8  reserved[3];
+};
+
+struct ib_uverbs_alloc_pd {
+	__u64 response;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_alloc_pd_resp {
+	__u32 pd_handle;
+};
+
+struct ib_uverbs_dealloc_pd {
+	__u32 pd_handle;
+};
+
+struct ib_uverbs_reg_mr {
+	__u64 response;
+	__u64 start;
+	__u64 length;
+	__u64 hca_va;
+	__u32 pd_handle;
+	__u32 access_flags;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_reg_mr_resp {
+	__u32 mr_handle;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct ib_uverbs_dereg_mr {
+	__u32 mr_handle;
+};
+
+struct ib_uverbs_create_cq {
+	__u64 response;
+	__u64 user_handle;
+	__u32 cqe;
+	__u32 reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_cq_resp {
+	__u32 cq_handle;
+	__u32 cqe;
+};
+
+struct ib_uverbs_destroy_cq {
+	__u32 cq_handle;
+};
+
+struct ib_uverbs_create_qp {
+	__u64 response;
+	__u64 user_handle;
+	__u32 pd_handle;
+	__u32 send_cq_handle;
+	__u32 recv_cq_handle;
+	__u32 srq_handle;
+	__u32 max_send_wr;
+	__u32 max_recv_wr;
+	__u32 max_send_sge;
+	__u32 max_recv_sge;
+	__u32 max_inline_data;
+	__u8  sq_sig_all;
+	__u8  qp_type;
+	__u8  is_srq;
+	__u8  reserved;
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_create_qp_resp {
+	__u32 qp_handle;
+	__u32 qpn;
+};
+
+/*
+ * This struct needs to remain a multiple of 8 bytes to keep the
+ * alignment of the modify QP parameters.
+ */
+struct ib_uverbs_qp_dest {
+	__u8  dgid[16];
+	__u32 flow_label;
+	__u16 dlid;
+	__u16 reserved;
+	__u8  sgid_index;
+	__u8  hop_limit;
+	__u8  traffic_class;
+	__u8  sl;
+	__u8  src_path_bits;
+	__u8  static_rate;
+	__u8  is_global;
+	__u8  port_num;
+};
+
+struct ib_uverbs_modify_qp {
+	struct ib_uverbs_qp_dest dest;
+	struct ib_uverbs_qp_dest alt_dest;
+	__u32 qp_handle;
+	__u32 attr_mask;
+	__u32 qkey;
+	__u32 rq_psn;
+	__u32 sq_psn;
+	__u32 dest_qp_num;
+	__u32 qp_access_flags;
+	__u16 pkey_index;
+	__u16 alt_pkey_index;
+	__u8  qp_state;
+	__u8  cur_qp_state;
+	__u8  path_mtu;
+	__u8  path_mig_state;
+	__u8  en_sqd_async_notify;
+	__u8  max_rd_atomic;
+	__u8  max_dest_rd_atomic;
+	__u8  min_rnr_timer;
+	__u8  port_num;
+	__u8  timeout;
+	__u8  retry_cnt;
+	__u8  rnr_retry;
+	__u8  alt_port_num;
+	__u8  alt_timeout;
+	__u8  reserved[2];
+	__u64 driver_data[0];
+};
+
+struct ib_uverbs_modify_qp_resp {
+};
+
+struct ib_uverbs_destroy_qp {
+	__u32 qp_handle;
+};
+
+#endif /* IB_USER_VERBS_H */


^ permalink raw reply	[flat|nested] 144+ messages in thread

* [PATCH][RFC][3/4] IB: userspace verbs mthca changes
  2005-04-04 22:09   ` [PATCH][RFC][2/4] IB: userspace verbs main module Roland Dreier
@ 2005-04-04 22:09     ` Roland Dreier
  2005-04-04 22:09       ` [PATCH][RFC][4/4] IB: userspace verbs Kconfig/Makefile changes Roland Dreier
                         ` (2 more replies)
  0 siblings, 3 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-04 22:09 UTC (permalink / raw)
  To: linux-kernel, openib-general

Add Mellanox HCA-specific userspace verbs support to mthca.

Signed-off-by: Roland Dreier <roland@topspin.com>

--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_cq.c	2005-04-04 14:57:12.228756073 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_cq.c	2005-04-04 14:58:12.364679525 -0700
@@ -743,6 +743,7 @@
 }
 
 int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
 		  struct mthca_cq *cq)
 {
 	int size = nent * MTHCA_CQ_ENTRY_SIZE;
@@ -754,30 +755,33 @@
 
 	might_sleep();
 
-	cq->ibcq.cqe = nent - 1;
+	cq->ibcq.cqe  = nent - 1;
+	cq->is_kernel = !ctx;
 
 	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
 	if (cq->cqn == -1)
 		return -ENOMEM;
 
 	if (mthca_is_memfree(dev)) {
-		cq->arm_sn = 1;
-
 		err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
 		if (err)
 			goto err_out;
 
-		err = -ENOMEM;
+		if (cq->is_kernel) {
+			cq->arm_sn = 1;
+
+			err = -ENOMEM;
 
-		cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
-						     cq->cqn, &cq->set_ci_db);
-		if (cq->set_ci_db_index < 0)
-			goto err_out_icm;
-
-		cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
-						  cq->cqn, &cq->arm_db);
-		if (cq->arm_db_index < 0)
-			goto err_out_ci;
+			cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+							     cq->cqn, &cq->set_ci_db);
+			if (cq->set_ci_db_index < 0)
+				goto err_out_icm;
+
+			cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+							  cq->cqn, &cq->arm_db);
+			if (cq->arm_db_index < 0)
+				goto err_out_ci;
+		}
 	}
 
 	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
@@ -787,12 +791,14 @@
 
 	cq_context = MAILBOX_ALIGN(mailbox);
 
-	err = mthca_alloc_cq_buf(dev, size, cq);
-	if (err)
-		goto err_out_mailbox;
+	if (cq->is_kernel) {
+		err = mthca_alloc_cq_buf(dev, size, cq);
+		if (err)
+			goto err_out_mailbox;
 
-	for (i = 0; i < nent; ++i)
-		set_cqe_hw(get_cqe(cq, i));
+		for (i = 0; i < nent; ++i)
+			set_cqe_hw(get_cqe(cq, i));
+	}
 
 	spin_lock_init(&cq->lock);
 	atomic_set(&cq->refcount, 1);
@@ -803,11 +809,14 @@
 						  MTHCA_CQ_STATE_DISARMED |
 						  MTHCA_CQ_FLAG_TR);
 	cq_context->start           = cpu_to_be64(0);
-	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
-						  dev->driver_uar.index);
+	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+	if (ctx)
+		cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
+	else
+		cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
 	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
 	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
-	cq_context->pd              = cpu_to_be32(dev->driver_pd.pd_num);
+	cq_context->pd              = cpu_to_be32(pdn);
 	cq_context->lkey            = cpu_to_be32(cq->mr.ibmr.lkey);
 	cq_context->cqn             = cpu_to_be32(cq->cqn);
 
@@ -845,17 +854,19 @@
 	return 0;
 
 err_out_free_mr:
-	mthca_free_mr(dev, &cq->mr);
-	mthca_free_cq_buf(dev, cq);
+	if (cq->is_kernel) {
+		mthca_free_mr(dev, &cq->mr);
+		mthca_free_cq_buf(dev, cq);
+	}
 
 err_out_mailbox:
 	kfree(mailbox);
 
-	if (mthca_is_memfree(dev))
+	if (cq->is_kernel && mthca_is_memfree(dev))
 		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
 
 err_out_ci:
-	if (mthca_is_memfree(dev))
+	if (cq->is_kernel)
 		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
 
 err_out_icm:
@@ -895,7 +906,8 @@
 		int j;
 
 		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
-		       cq->cqn, cq->cons_index, !!next_cqe_sw(cq));
+		       cq->cqn, cq->cons_index,
+		       cq->is_kernel ? !!next_cqe_sw(cq) : 0);
 		for (j = 0; j < 16; ++j)
 			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
 	}
@@ -913,15 +925,17 @@
 	atomic_dec(&cq->refcount);
 	wait_event(cq->wait, !atomic_read(&cq->refcount));
 
-	mthca_free_mr(dev, &cq->mr);
-	mthca_free_cq_buf(dev, cq);
-
-	if (mthca_is_memfree(dev)) {
-		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
-		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
-		mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+	if (cq->is_kernel) {
+		mthca_free_mr(dev, &cq->mr);
+		mthca_free_cq_buf(dev, cq);
+		if (mthca_is_memfree(dev)) {
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+		}
 	}
 
+	if (mthca_is_memfree(dev))
+		mthca_table_put(dev, dev->cq_table.table, cq->cqn);
 	mthca_free(&dev->cq_table.alloc, cq->cqn);
 	kfree(mailbox);
 }
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h	2005-04-04 14:57:12.254750421 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h	2005-04-04 14:58:12.411669307 -0700
@@ -49,14 +49,6 @@
 #define DRV_VERSION	"0.06-pre"
 #define DRV_RELDATE	"November 8, 2004"
 
-/* XXX remove once SINAI defines make it into kernel.org */
-#ifndef PCI_DEVICE_ID_MELLANOX_SINAI_OLD
-#define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c
-#endif
-#ifndef PCI_DEVICE_ID_MELLANOX_SINAI
-#define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274
-#endif
-
 enum {
 	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
 	MTHCA_FLAG_SRQ        = 1 << 2,
@@ -413,6 +405,7 @@
 int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
 int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify);
 int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
 		  struct mthca_cq *cq);
 void mthca_free_cq(struct mthca_dev *dev,
 		   struct mthca_cq *cq);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_memfree.c	2005-04-04 14:57:12.256749986 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_memfree.c	2005-04-04 14:58:12.412669090 -0700
@@ -45,6 +45,15 @@
 	MTHCA_TABLE_CHUNK_SIZE = 1 << 18
 };
 
+struct mthca_user_db_table {
+	struct semaphore mutex;
+	struct {
+		u64                uvirt;
+		struct scatterlist mem;
+		int                refcount;
+	}                page[0];
+};
+
 void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm)
 {
 	struct mthca_icm_chunk *chunk, *tmp;
@@ -334,13 +343,132 @@
 	kfree(table);
 }
 
-static u64 mthca_uarc_virt(struct mthca_dev *dev, int page)
+static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page)
 {
 	return dev->uar_table.uarc_base +
-		dev->driver_uar.index * dev->uar_table.uarc_size +
+		uar->index * dev->uar_table.uarc_size +
 		page * 4096;
 }
 
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+		      struct mthca_user_db_table *db_tab, int index, u64 uaddr)
+{
+	int ret = 0;
+	u8 status;
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return 0;
+
+	if (index < 0 || index > dev->uar_table.uarc_size / 8)
+		return -EINVAL;
+
+	down(&db_tab->mutex);
+
+	i = index / MTHCA_DB_REC_PER_PAGE;
+
+	if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE)       ||
+	    (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+	    (uaddr & 4095)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (db_tab->page[i].refcount) {
+		++db_tab->page[i].refcount;
+		goto out;
+	}
+
+	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
+			     &db_tab->page[i].mem.page, NULL);
+	if (ret < 0)
+		goto out;
+
+	db_tab->page[i].mem.offset = uaddr & ~PAGE_MASK;
+
+	ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+	if (ret < 0) {
+		put_page(db_tab->page[i].mem.page);
+		goto out;
+	}
+
+	ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+				 mthca_uarc_virt(dev, uar, i), &status);
+	if (!ret && status)
+		ret = -EINVAL;
+	if (ret) {
+		pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+		put_page(db_tab->page[i].mem.page);
+		goto out;
+	}
+
+	db_tab->page[i].uvirt    = uaddr;
+	db_tab->page[i].refcount = 1;
+
+out:
+	up(&db_tab->mutex);
+	return ret;
+}
+
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+			 struct mthca_user_db_table *db_tab, int index)
+{
+	if (!mthca_is_memfree(dev))
+		return;
+
+	/*
+	 * To make our bookkeeping simpler, we don't unmap DB
+	 * pages until we clean up the whole db table.
+	 */
+
+	down(&db_tab->mutex);
+
+	--db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount;
+
+	up(&db_tab->mutex);
+}
+
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev)
+{
+	struct mthca_user_db_table *db_tab;
+	int npages;
+	int i;
+
+	if (!mthca_is_memfree(dev))
+		return NULL;
+
+	npages = dev->uar_table.uarc_size / 4096;
+	db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL);
+	if (!db_tab)
+		return ERR_PTR(-ENOMEM);
+
+	init_MUTEX(&db_tab->mutex);
+	for (i = 0; i < npages; ++i) {
+		db_tab->page[i].refcount = 0;
+		db_tab->page[i].uvirt    = 0;
+	}
+
+	return db_tab;
+}
+
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+			       struct mthca_user_db_table *db_tab)
+{
+	int i;
+	u8 status;
+
+	if (!mthca_is_memfree(dev))
+		return;
+
+	for (i = 0; i < dev->uar_table.uarc_size / 4096; ++i) {
+		if (db_tab->page[i].uvirt) {
+			mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1, &status);
+			pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+			put_page(db_tab->page[i].mem.page);
+		}
+	}
+}
+
 int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db)
 {
 	int group;
@@ -397,7 +525,8 @@
 	}
 	memset(page->db_rec, 0, 4096);
 
-	ret = mthca_MAP_ICM_page(dev, page->mapping, mthca_uarc_virt(dev, i), &status);
+	ret = mthca_MAP_ICM_page(dev, page->mapping,
+				 mthca_uarc_virt(dev, &dev->driver_uar, i), &status);
 	if (!ret && status)
 		ret = -EINVAL;
 	if (ret) {
@@ -451,7 +580,7 @@
 
 	if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
 	    i >= dev->db_tab->max_group1 - 1) {
-		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1, &status);
 		
 		dma_free_coherent(&dev->pdev->dev, 4096,
 				  page->db_rec, page->mapping);
@@ -520,7 +649,7 @@
 		if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
 			mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
 
-		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, i), 1, &status);
+		mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1, &status);
 		
 		dma_free_coherent(&dev->pdev->dev, 4096,
 				  dev->db_tab->page[i].db_rec,
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_memfree.h	2005-04-04 14:57:12.256749986 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_memfree.h	2005-04-04 14:58:12.413668872 -0700
@@ -148,7 +148,7 @@
 	struct semaphore      mutex;
 };
 
-enum {
+enum mthca_db_type {
 	MTHCA_DB_TYPE_INVALID   = 0x0,
 	MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
 	MTHCA_DB_TYPE_CQ_ARM    = 0x2,
@@ -158,6 +158,17 @@
 	MTHCA_DB_TYPE_GROUP_SEP = 0x7
 };
 
+struct mthca_user_db_table;
+struct mthca_uar;
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+		      struct mthca_user_db_table *db_tab, int index, u64 uaddr);
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+			 struct mthca_user_db_table *db_tab, int index);
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+			       struct mthca_user_db_table *db_tab);
+
 int mthca_init_db_tab(struct mthca_dev *dev);
 void mthca_cleanup_db_tab(struct mthca_dev *dev);
 int mthca_alloc_db(struct mthca_dev *dev, int type, u32 qn, u32 **db);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.c	2005-04-04 14:57:12.286743464 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.c	2005-04-04 14:58:12.444662133 -0700
@@ -29,13 +29,17 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
- * $Id: mthca_provider.c 2100 2005-03-31 20:43:01Z roland $
+ * $Id: mthca_provider.c 2109 2005-04-04 21:10:34Z roland $
  */
 
+#include <asm/uaccess.h>
+
 #include <ib_smi.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
+#include "mthca_user.h"
+#include "mthca_memfree.h"
 
 static int mthca_query_device(struct ib_device *ibdev,
 			      struct ib_device_attr *props)
@@ -283,11 +287,78 @@
 	return err;
 }
 
-static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev)
+static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
+						const void __user *udata, int udatalen)
+{
+	struct mthca_alloc_ucontext      ucmd;
+	struct mthca_alloc_ucontext_resp uresp;
+	struct mthca_ucontext           *context;
+	int                              err;
+
+	if (copy_from_user(&ucmd, udata, sizeof ucmd))
+		return ERR_PTR(-EFAULT);
+
+	uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
+	if (mthca_is_memfree(to_mdev(ibdev)))
+		uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size;
+	else
+		uresp.uarc_size = 0;
+
+	context = kmalloc(sizeof *context, GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
+	if (err) {
+		kfree(context);
+		return ERR_PTR(err);
+	}
+
+	context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
+	if (IS_ERR(context->db_tab)) {
+		err = PTR_ERR(context->db_tab);
+		mthca_uar_free(to_mdev(ibdev), &context->uar);
+		kfree(context);
+		return ERR_PTR(err);
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) ucmd.respbuf,
+			 &uresp, sizeof uresp)) {
+		mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
+		mthca_uar_free(to_mdev(ibdev), &context->uar);
+		kfree(context);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return &context->ibucontext;
+}
+
+static int mthca_dealloc_ucontext(struct ib_ucontext *context)
 {
+	mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
+				  to_mucontext(context)->db_tab);
+	mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
+	kfree(to_mucontext(context));
+
+	return 0;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
+				    struct ib_ucontext *context,
+				    const void __user *udata, int udatalen)
+{
+	struct mthca_alloc_pd ucmd;
 	struct mthca_pd *pd;
 	int err;
 
+	if (context) {
+		if (udatalen != sizeof ucmd)
+			return ERR_PTR(-EINVAL);
+
+		if (copy_from_user(&ucmd, udata, sizeof ucmd))
+			return ERR_PTR(-EFAULT);
+	}
+
 	pd = kmalloc(sizeof *pd, GFP_KERNEL);
 	if (!pd)
 		return ERR_PTR(-ENOMEM);
@@ -298,6 +369,14 @@
 		return ERR_PTR(err);
 	}
 
+	if (context) {
+		if (put_user(pd->pd_num, (u32 __user *) (unsigned long) ucmd.pdnbuf)) {
+			mthca_pd_free(to_mdev(ibdev), pd);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+	}
+
 	return &pd->ibpd;
 }
 
@@ -337,8 +416,10 @@
 }
 
 static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
-				     struct ib_qp_init_attr *init_attr)
+				     struct ib_qp_init_attr *init_attr,
+				     const void __user *udata, int udatalen)
 {
+	struct mthca_create_qp ucmd;
 	struct mthca_qp *qp;
 	int err;
 
@@ -347,10 +428,48 @@
 	case IB_QPT_UC:
 	case IB_QPT_UD:
 	{
+		struct mthca_ucontext *context;
+
 		qp = kmalloc(sizeof *qp, GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
 
+		if (pd->uobject) {
+			context = to_mucontext(pd->uobject->context);
+
+			if (udatalen != sizeof ucmd)
+				return ERR_PTR(-EINVAL);
+
+			if (copy_from_user(&ucmd, udata, sizeof ucmd))
+				return ERR_PTR(-EFAULT);
+
+			err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+						context->db_tab,
+						ucmd.sq_db_index, ucmd.sq_db_page);
+			if (err) {
+				kfree(qp);
+				return ERR_PTR(err);
+			}
+
+			err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+						context->db_tab,
+						ucmd.rq_db_index, ucmd.rq_db_page);
+			if (err) {
+				mthca_unmap_user_db(to_mdev(pd->device),
+						    &context->uar,
+						    context->db_tab,
+						    ucmd.sq_db_index);
+				kfree(qp);
+				return ERR_PTR(err);
+			}
+		}
+
+		if (pd->uobject) {
+			qp->mr.ibmr.lkey = ucmd.lkey;
+			qp->sq.db_index  = ucmd.sq_db_index;
+			qp->rq.db_index  = ucmd.rq_db_index;
+		}
+
 		qp->sq.max    = init_attr->cap.max_send_wr;
 		qp->rq.max    = init_attr->cap.max_recv_wr;
 		qp->sq.max_gs = init_attr->cap.max_send_sge;
@@ -361,12 +480,30 @@
 				     to_mcq(init_attr->recv_cq),
 				     init_attr->qp_type, init_attr->sq_sig_type,
 				     qp);
+
+		if (err && pd->uobject) {
+			context = to_mucontext(pd->uobject->context);
+
+			mthca_unmap_user_db(to_mdev(pd->device),
+					    &context->uar,
+					    context->db_tab,
+					    ucmd.sq_db_index);
+			mthca_unmap_user_db(to_mdev(pd->device),
+					    &context->uar,
+					    context->db_tab,
+					    ucmd.rq_db_index);
+		}
+
 		qp->ibqp.qp_num = qp->qpn;
 		break;
 	}
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	{
+		/* Don't allow userspace to create special QPs */
+		if (pd->uobject)
+			return ERR_PTR(-EINVAL);
+
 		qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
@@ -396,42 +533,116 @@
 		return ERR_PTR(err);
 	}
 
-        init_attr->cap.max_inline_data = 0;
+	init_attr->cap.max_inline_data = 0;
+	init_attr->cap.max_send_wr     = qp->sq.max;
+	init_attr->cap.max_recv_wr     = qp->rq.max;
 
 	return &qp->ibqp;
 }
 
 static int mthca_destroy_qp(struct ib_qp *qp)
 {
+	if (qp->uobject) {
+		mthca_unmap_user_db(to_mdev(qp->device),
+				    &to_mucontext(qp->uobject->context)->uar,
+				    to_mucontext(qp->uobject->context)->db_tab,
+				    to_mqp(qp)->sq.db_index);
+		mthca_unmap_user_db(to_mdev(qp->device),
+				    &to_mucontext(qp->uobject->context)->uar,
+				    to_mucontext(qp->uobject->context)->db_tab,
+				    to_mqp(qp)->rq.db_index);
+	}
 	mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
 	kfree(qp);
 	return 0;
 }
 
-static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries)
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
+				     struct ib_ucontext *context,
+				     const void __user *udata, int udatalen)
 {
+	struct mthca_create_cq ucmd;
 	struct mthca_cq *cq;
 	int nent;
 	int err;
 
+	if (context) {
+		if (udatalen != sizeof ucmd)
+			return ERR_PTR(-EINVAL);
+
+		if (copy_from_user(&ucmd, udata, sizeof ucmd))
+			return ERR_PTR(-EFAULT);
+
+		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+					to_mucontext(context)->db_tab,
+					ucmd.set_db_index, ucmd.set_db_page);
+		if (err)
+			return ERR_PTR(err);
+
+		err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+					to_mucontext(context)->db_tab,
+					ucmd.arm_db_index, ucmd.arm_db_page);
+		if (err)
+			goto err_unmap_set;
+	}
+
 	cq = kmalloc(sizeof *cq, GFP_KERNEL);
-	if (!cq)
-		return ERR_PTR(-ENOMEM);
+	if (!cq) {
+		err = -ENOMEM;
+		goto err_unmap_arm;
+	}
+
+	if (context) {
+		cq->mr.ibmr.lkey    = ucmd.lkey;
+		cq->set_ci_db_index = ucmd.set_db_index;
+		cq->arm_db_index    = ucmd.arm_db_index;
+	}
 
 	for (nent = 1; nent <= entries; nent <<= 1)
 		; /* nothing */
 
-	err = mthca_init_cq(to_mdev(ibdev), nent, cq);
-	if (err) {
-		kfree(cq);
-		cq = ERR_PTR(err);
+	err = mthca_init_cq(to_mdev(ibdev), nent, 
+			    context ? to_mucontext(context) : NULL,
+			    context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+			    cq);
+	if (err)
+		goto err_free;
+
+	if (context && put_user(cq->cqn, (u32 __user *) (unsigned long) ucmd.cqnbuf)) {
+		mthca_free_cq(to_mdev(ibdev), cq);
+		goto err_free;
 	}
 
 	return &cq->ibcq;
+
+err_free:
+	kfree(cq);
+
+err_unmap_arm:
+	if (context)
+		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+				    to_mucontext(context)->db_tab, ucmd.arm_db_index);
+
+err_unmap_set:
+	if (context)
+		mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+				    to_mucontext(context)->db_tab, ucmd.set_db_index);
+
+	return ERR_PTR(err);
 }
 
 static int mthca_destroy_cq(struct ib_cq *cq)
 {
+	if (cq->uobject) {
+		mthca_unmap_user_db(to_mdev(cq->device),
+				    &to_mucontext(cq->uobject->context)->uar,
+				    to_mucontext(cq->uobject->context)->db_tab,
+				    to_mcq(cq)->arm_db_index);
+		mthca_unmap_user_db(to_mdev(cq->device),
+				    &to_mucontext(cq->uobject->context)->uar,
+				    to_mucontext(cq->uobject->context)->db_tab,
+				    to_mcq(cq)->set_ci_db_index);
+	}
 	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
 	kfree(cq);
 
@@ -558,6 +769,57 @@
 				  convert_access(acc), mr);
 
 	if (err) {
+		kfree(page_list);
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	kfree(page_list);
+	return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
+				       int acc, const void __user *udata, int udatalen)
+{
+	struct ib_umem_chunk *chunk;
+	int npages = 0;
+	u64 *page_list;
+	struct mthca_mr *mr;
+	int shift;
+	int i, j, k;
+	int err;
+
+	shift = ffs(region->page_size) - 1;
+
+	mr = kmalloc(sizeof *mr, GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+	
+	list_for_each_entry(chunk, &region->chunk_list, list)
+		npages += chunk->nents;
+
+	page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+	if (!page_list) {
+		kfree(mr);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	i = 0;
+
+	list_for_each_entry(chunk, &region->chunk_list, list)
+		for (j = 0; j < chunk->nmap; ++j)
+			for (k = 0; k < sg_dma_len(&chunk->page_list[j]) >> shift; ++k)
+				page_list[i++] = sg_dma_address(&chunk->page_list[j]) +
+					region->page_size * k;
+
+	err = mthca_mr_alloc_phys(to_mdev(pd->device),
+				  to_mpd(pd)->pd_num,
+				  page_list, shift, npages,
+				  region->virt_base, region->length,
+				  convert_access(acc), mr);
+
+	if (err) {
+		kfree(page_list);
 		kfree(mr);
 		return ERR_PTR(err);
 	}
@@ -574,6 +836,22 @@
 	return 0;
 }
 
+static int mthca_mmap_uar(struct ib_ucontext *context,
+			  struct vm_area_struct *vma)
+{
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	if (remap_pfn_range(vma, vma->vm_start,
+			    to_mucontext(context)->uar.pfn,
+			    PAGE_SIZE, vma->vm_page_prot))
+		return -EAGAIN;
+
+	return 0;
+}
+
 static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 				      struct ib_fmr_attr *fmr_attr)
 {
@@ -690,6 +968,8 @@
 	int i;
 
 	strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.owner                = THIS_MODULE;
+
 	dev->ib_dev.node_type            = IB_NODE_CA;
 	dev->ib_dev.phys_port_cnt        = dev->limits.num_ports;
 	dev->ib_dev.dma_device           = &dev->pdev->dev;
@@ -699,6 +979,8 @@
 	dev->ib_dev.modify_port          = mthca_modify_port;
 	dev->ib_dev.query_pkey           = mthca_query_pkey;
 	dev->ib_dev.query_gid            = mthca_query_gid;
+	dev->ib_dev.alloc_ucontext       = mthca_alloc_ucontext;
+	dev->ib_dev.dealloc_ucontext     = mthca_dealloc_ucontext;
 	dev->ib_dev.alloc_pd             = mthca_alloc_pd;
 	dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
 	dev->ib_dev.create_ah            = mthca_ah_create;
@@ -711,6 +993,7 @@
 	dev->ib_dev.poll_cq              = mthca_poll_cq;
 	dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
 	dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
+	dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
 	dev->ib_dev.dereg_mr             = mthca_dereg_mr;
 
 	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
@@ -726,6 +1009,7 @@
 	dev->ib_dev.attach_mcast         = mthca_multicast_attach;
 	dev->ib_dev.detach_mcast         = mthca_multicast_detach;
 	dev->ib_dev.process_mad          = mthca_process_mad;
+	dev->ib_dev.mmap                 = mthca_mmap_uar;
 
 	if (mthca_is_memfree(dev)) {
 		dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_provider.h	2005-04-04 14:57:12.287743246 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_provider.h	2005-04-04 14:58:12.445661916 -0700
@@ -54,6 +54,14 @@
 	int           index;
 };
 
+struct mthca_user_db_table;
+
+struct mthca_ucontext {
+	struct ib_ucontext          ibucontext;
+	struct mthca_uar            uar;
+	struct mthca_user_db_table *db_tab;
+};
+
 struct mthca_mr {
 	struct ib_mr ibmr;
 	int order;
@@ -167,6 +175,7 @@
 	int                    cqn;
 	u32                    cons_index;
 	int                    is_direct;
+	int                    is_kernel;
 
 	/* Next fields are Arbel only */
 	int                    set_ci_db_index;
@@ -236,6 +245,11 @@
 	dma_addr_t      header_dma;
 };
 
+static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct mthca_ucontext, ibucontext);
+}
+
 static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
 {
 	return container_of(ibmr, struct mthca_fmr, ibmr);
--- linux-export.orig/drivers/infiniband/hw/mthca/mthca_qp.c	2005-04-04 14:57:12.320736072 -0700
+++ linux-export/drivers/infiniband/hw/mthca/mthca_qp.c	2005-04-04 14:58:12.491651915 -0700
@@ -652,7 +652,11 @@
 
 	/* leave arbel_sched_queue as 0 */
 
-	qp_context->usr_page   = cpu_to_be32(dev->driver_uar.index);
+	if (qp->ibqp.uobject)
+		qp_context->usr_page =
+			cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+	else
+		qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
 	qp_context->local_qpn  = cpu_to_be32(qp->qpn);
 	if (attr_mask & IB_QP_DEST_QPN) {
 		qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
@@ -917,6 +921,15 @@
 
 	qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
 				    1 << qp->sq.wqe_shift);
+
+	/*
+	 * If this is a userspace QP, we don't actually have to
+	 * allocate anything.  All we need is to calculate the WQE
+	 * sizes and the send_wqe_offset, so we're done now.
+	 */
+	if (pd->ibpd.uobject)
+		return 0;
+
 	size = PAGE_ALIGN(qp->send_wqe_offset +
 			  (qp->sq.max << qp->sq.wqe_shift));
 
@@ -1015,10 +1028,33 @@
 	return err;
 }
 
-static int mthca_alloc_memfree(struct mthca_dev *dev,
+static void mthca_free_wqe_buf(struct mthca_dev *dev,
 			       struct mthca_qp *qp)
 {
-	int ret = 0;
+	int i;
+	int size = PAGE_ALIGN(qp->send_wqe_offset +
+			      (qp->sq.max << qp->sq.wqe_shift));
+
+	if (qp->is_direct) {
+		pci_free_consistent(dev->pdev, size,
+				    qp->queue.direct.buf,
+				    pci_unmap_addr(&qp->queue.direct, mapping));
+	} else {
+		for (i = 0; i < size / PAGE_SIZE; ++i) {
+			pci_free_consistent(dev->pdev, PAGE_SIZE,
+					    qp->queue.page_list[i].buf,
+					    pci_unmap_addr(&qp->queue.page_list[i],
+							   mapping));
+		}
+	}
+
+	kfree(qp->wrid);
+}
+
+static int mthca_map_memfree(struct mthca_dev *dev,
+			     struct mthca_qp *qp)
+{
+	int ret;
 
 	if (mthca_is_memfree(dev)) {
 		ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
@@ -1029,35 +1065,15 @@
 		if (ret)
 			goto err_qpc;
 
-		ret = mthca_table_get(dev, dev->qp_table.rdb_table,
-				      qp->qpn << dev->qp_table.rdb_shift);
-		if (ret)
-			goto err_eqpc;
-
-		qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
-						 qp->qpn, &qp->rq.db);
-		if (qp->rq.db_index < 0) {
-			ret = -ENOMEM;
-			goto err_rdb;
-		}
+ 		ret = mthca_table_get(dev, dev->qp_table.rdb_table,
+ 				      qp->qpn << dev->qp_table.rdb_shift);
+ 		if (ret)
+ 			goto err_eqpc;
 
-		qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
-						 qp->qpn, &qp->sq.db);
-		if (qp->sq.db_index < 0) {
-			ret = -ENOMEM;
-			goto err_rq_db;
-		}
 	}
 
 	return 0;
 
-err_rq_db:
-	mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
-
-err_rdb:
-	mthca_table_put(dev, dev->qp_table.rdb_table,
-			qp->qpn << dev->qp_table.rdb_shift);
-
 err_eqpc:
 	mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
 
@@ -1067,16 +1083,43 @@
 	return ret;
 }
 
+static void mthca_unmap_memfree(struct mthca_dev *dev,
+				struct mthca_qp *qp)
+{
+	if (mthca_is_memfree(dev)) {
+ 		mthca_table_put(dev, dev->qp_table.rdb_table,
+ 				qp->qpn << dev->qp_table.rdb_shift);
+		mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+		mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+	}
+}
+
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+			       struct mthca_qp *qp)
+{
+	int ret = 0;
+
+	if (mthca_is_memfree(dev)) {
+		qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+						 qp->qpn, &qp->rq.db);
+		if (qp->rq.db_index < 0)
+			return ret;
+
+		qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+						 qp->qpn, &qp->sq.db);
+		if (qp->sq.db_index < 0)
+			mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+	}
+
+	return ret;
+}
+
 static void mthca_free_memfree(struct mthca_dev *dev,
 			       struct mthca_qp *qp)
 {
 	if (mthca_is_memfree(dev)) {
 		mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
 		mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
-		mthca_table_put(dev, dev->qp_table.rdb_table,
-				qp->qpn << dev->qp_table.rdb_shift);
-		mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
-		mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
 	}
 }
 
@@ -1108,13 +1151,28 @@
 	mthca_wq_init(&qp->sq);
 	mthca_wq_init(&qp->rq);
 
-	ret = mthca_alloc_memfree(dev, qp);
+	ret = mthca_map_memfree(dev, qp);
 	if (ret)
 		return ret;
 
 	ret = mthca_alloc_wqe_buf(dev, pd, qp);
 	if (ret) {
-		mthca_free_memfree(dev, qp);
+		mthca_unmap_memfree(dev, qp);
+		return ret;
+	}
+
+	/*
+	 * If this is a userspace QP, we're done now.  The doorbells
+	 * will be allocated and buffers will be initialized in
+	 * userspace.
+	 */
+	if (pd->ibpd.uobject)
+		return 0;
+
+	ret = mthca_alloc_memfree(dev, qp);
+	if (ret) {
+		mthca_free_wqe_buf(dev, qp);
+		mthca_unmap_memfree(dev, qp);
 		return ret;
 	}
 
@@ -1274,8 +1332,6 @@
 		   struct mthca_qp *qp)
 {
 	u8 status;
-	int size;
-	int i;
 	struct mthca_cq *send_cq;
 	struct mthca_cq *recv_cq;
 
@@ -1305,31 +1361,22 @@
 	if (qp->state != IB_QPS_RESET)
 		mthca_MODIFY_QP(dev, MTHCA_TRANS_ANY2RST, qp->qpn, 0, NULL, 0, &status);
 
-	mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn);
-	if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
-		mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn);
-
-	mthca_free_mr(dev, &qp->mr);
-
-	size = PAGE_ALIGN(qp->send_wqe_offset +
-			  (qp->sq.max << qp->sq.wqe_shift));
+	/*
+	 * If this is a userspace QP, the buffers, MR, CQs and so on
+	 * will be cleaned up in userspace, so all we have to do is
+	 * unref the mem-free tables and free the QPN in our table.
+	 */
+	if (!qp->ibqp.uobject) {
+		mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq)->cqn, qp->qpn);
+		if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+			mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq)->cqn, qp->qpn);
 
-	if (qp->is_direct) {
-		pci_free_consistent(dev->pdev, size,
-				    qp->queue.direct.buf,
-				    pci_unmap_addr(&qp->queue.direct, mapping));
-	} else {
-		for (i = 0; i < size / PAGE_SIZE; ++i) {
-			pci_free_consistent(dev->pdev, PAGE_SIZE,
-					    qp->queue.page_list[i].buf,
-					    pci_unmap_addr(&qp->queue.page_list[i],
-							   mapping));
-		}
+		mthca_free_mr(dev, &qp->mr);
+		mthca_free_memfree(dev, qp);
+		mthca_free_wqe_buf(dev, qp);
 	}
 
-	kfree(qp->wrid);
-
-	mthca_free_memfree(dev, qp);
+	mthca_unmap_memfree(dev, qp);
 
 	if (is_sqp(dev, qp)) {
 		atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-export/drivers/infiniband/hw/mthca/mthca_user.h	2005-04-04 14:58:12.491651915 -0700
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef MTHCA_USER_H
+#define MTHCA_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mthca_alloc_ucontext {
+	__u64 respbuf;
+};
+
+struct mthca_alloc_ucontext_resp {
+	__u32 qp_tab_size;
+	__u32 uarc_size;
+};
+
+struct mthca_alloc_pd {
+	__u64 pdnbuf;
+};
+
+struct mthca_alloc_pd_resp {
+	__u32 pdn;
+	__u32 reserved;
+};
+
+struct mthca_create_cq {
+	__u64 cqnbuf;
+	__u32 lkey;
+	__u32 pdn;
+	__u64 arm_db_page;
+	__u64 set_db_page;
+	__u32 arm_db_index;
+	__u32 set_db_index;
+};
+
+struct mthca_create_cq_resp {
+	__u32 cqn;
+	__u32 reserved;
+};
+
+struct mthca_create_qp {
+	__u32 lkey;
+	__u32 reserved;
+	__u64 sq_db_page;
+	__u64 rq_db_page;
+	__u32 sq_db_index;
+	__u32 rq_db_index;
+};
+
+#endif /* MTHCA_USER_H */


^ permalink raw reply	[flat|nested] 144+ messages in thread

* [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
@ 2005-04-04 22:09 Roland Dreier
  2005-04-04 22:09 ` [PATCH][RFC][1/4] IB: core changes for userspace verbs Roland Dreier
  2005-04-11 14:22 ` [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Troy Benjegerdes
  0 siblings, 2 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-04 22:09 UTC (permalink / raw)
  To: linux-kernel, openib-general

Here is an initial implementation of InfiniBand userspace verbs.  I
plan to commit this code to the OpenIB repository shortly, and submit
it for inclusion during the 2.6.13 cycle, so I am posting it early for
comments.

This code, in conjunction with the libibverbs and libmthca userspace
libraries available from the subversion trees at

    https://openib.org/svn/gen2/branches/roland-uverbs/src/userspace/libibverbs
    https://openib.org/svn/gen2/branches/roland-uverbs/src/userspace/libmthca

enables userspace processes to access InfiniBand HCAs directly.

For those not familiar with the InfiniBand architecture, this
so-called "userspace verbs" support allows userspace to post data path
commands directly to the HCA.  Resource allocation and other control
path operations still go through the kernel driver.

Please take a look at this code if you have a chance.  I would
appreciate high-level criticism of the design and implementation as
well as nitpicky complaints about coding style and typos.

In particular, the memory pinning code in in uverbs_mem.c could stand
a looking over.  In addition, a sanity check of the write()-based
scheme for passing commands into the kernel in uverbs_main.c and
uverbs_cmd.c is probably worthwhile.

Thanks,
  Roland



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes
  2005-04-04 22:09     ` [PATCH][RFC][3/4] IB: userspace verbs mthca changes Roland Dreier
  2005-04-04 22:09       ` [PATCH][RFC][4/4] IB: userspace verbs Kconfig/Makefile changes Roland Dreier
@ 2005-04-04 22:49       ` Tom Duffy
  2005-04-04 23:34         ` Roland Dreier
  2005-04-21  0:37       ` [PATCH][MTHCA] fix sparc build WAS: " Tom Duffy
  2 siblings, 1 reply; 144+ messages in thread
From: Tom Duffy @ 2005-04-04 22:49 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linux-kernel, openib-general

[-- Attachment #1: Type: text/plain, Size: 900 bytes --]

On Mon, 2005-04-04 at 15:09 -0700, Roland Dreier wrote:
> --- linux-export.orig/drivers/infiniband/hw/mthca/mthca_dev.h	2005-04-04 14:57:12.254750421 -0700
> +++ linux-export/drivers/infiniband/hw/mthca/mthca_dev.h	2005-04-04 14:58:12.411669307 -0700
> @@ -49,14 +49,6 @@
>  #define DRV_VERSION	"0.06-pre"
>  #define DRV_RELDATE	"November 8, 2004"
>  
> -/* XXX remove once SINAI defines make it into kernel.org */
> -#ifndef PCI_DEVICE_ID_MELLANOX_SINAI_OLD
> -#define PCI_DEVICE_ID_MELLANOX_SINAI_OLD 0x5e8c
> -#endif
> -#ifndef PCI_DEVICE_ID_MELLANOX_SINAI
> -#define PCI_DEVICE_ID_MELLANOX_SINAI 0x6274
> -#endif
> -
>  enum {
>  	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
>  	MTHCA_FLAG_SRQ        = 1 << 2,

Now, you are really gonna hate me for asking you to put this in as you
probably did not want to include this in the patch to lkml.

So, maybe Grant was right ;-)

-tduffy

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes
  2005-04-04 22:49       ` [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes Tom Duffy
@ 2005-04-04 23:34         ` Roland Dreier
  0 siblings, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-04 23:34 UTC (permalink / raw)
  To: Tom Duffy; +Cc: linux-kernel, openib-general

    Tom> Now, you are really gonna hate me for asking you to put this
    Tom> in as you probably did not want to include this in the patch
    Tom> to lkml.

    Tom> So, maybe Grant was right ;-)

Oh well, I didn't read the patches over carefully enough.  Fortunately
it was just my "for review" version.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-04 22:09 [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Roland Dreier
  2005-04-04 22:09 ` [PATCH][RFC][1/4] IB: core changes for userspace verbs Roland Dreier
@ 2005-04-11 14:22 ` Troy Benjegerdes
  2005-04-11 15:34   ` Roland Dreier
  1 sibling, 1 reply; 144+ messages in thread
From: Troy Benjegerdes @ 2005-04-11 14:22 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linux-kernel, openib-general

> In particular, the memory pinning code in in uverbs_mem.c could stand
> a looking over.  In addition, a sanity check of the write()-based
> scheme for passing commands into the kernel in uverbs_main.c and
> uverbs_cmd.c is probably worthwhile.

How is memory pinning handled? (I haven't had time to read all the code,
so please excuse my ignorance of something obvious).

The old mellanox drivers used to have a hack to call 'sys_mlock', and
promiscuously lock memory any old userspace application asked for. What
is the API for the new uverbs memory registration, and how will things
like memory hotplug and NUMA page migration be able to unpin pages
locked by a user program?

I have applications that would benefit from being able to register 15GB
of memory on a machine with 16GB. Right now, MPI and other possible
users of infiniband in userspace have to play cacheing games and limit
what they can register. But locking all that memory without providing
the kernel a way to unlock it under memory pressure or for page
migration seems like a bad idea.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 14:22 ` [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Troy Benjegerdes
@ 2005-04-11 15:34   ` Roland Dreier
  2005-04-11 16:33     ` Troy Benjegerdes
  2005-04-18 16:09     ` Timur Tabi
  0 siblings, 2 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-11 15:34 UTC (permalink / raw)
  To: Troy Benjegerdes; +Cc: linux-kernel, openib-general

    Troy> How is memory pinning handled? (I haven't had time to read
    Troy> all the code, so please excuse my ignorance of something
    Troy> obvious).

The userspace library calls mlock() and then the kernel does
get_user_pages().

    Troy> The old mellanox drivers used to have a hack to call
    Troy> 'sys_mlock', and promiscuously lock memory any old userspace
    Troy> application asked for. What is the API for the new uverbs
    Troy> memory registration, and how will things like memory hotplug
    Troy> and NUMA page migration be able to unpin pages locked by a
    Troy> user program?

The API for uverbs memory registration is ibv_reg_mr(), and right now
the memory is pinned and that's it.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 15:34   ` Roland Dreier
@ 2005-04-11 16:33     ` Troy Benjegerdes
  2005-04-11 16:56       ` Roland Dreier
  2005-04-18 16:09     ` Timur Tabi
  1 sibling, 1 reply; 144+ messages in thread
From: Troy Benjegerdes @ 2005-04-11 16:33 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linux-kernel, openib-general

On Mon, Apr 11, 2005 at 08:34:19AM -0700, Roland Dreier wrote:
>     Troy> How is memory pinning handled? (I haven't had time to read
>     Troy> all the code, so please excuse my ignorance of something
>     Troy> obvious).
> 
> The userspace library calls mlock() and then the kernel does
> get_user_pages().

Is there a check in the kernel that the memory is actually mlock()ed?

What if a malicious (or broken) application does ibv_reg_mr() but
doesn't lock the memory? Does the IB card get a physical address for a
page that might get swapped out?

>     Troy> The old mellanox drivers used to have a hack to call
>     Troy> 'sys_mlock', and promiscuously lock memory any old userspace
>     Troy> application asked for. What is the API for the new uverbs
>     Troy> memory registration, and how will things like memory hotplug
>     Troy> and NUMA page migration be able to unpin pages locked by a
>     Troy> user program?
> 
> The API for uverbs memory registration is ibv_reg_mr(), and right now
> the memory is pinned and that's it.
> 
>  - R.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 16:33     ` Troy Benjegerdes
@ 2005-04-11 16:56       ` Roland Dreier
  2005-04-11 18:01         ` Troy Benjegerdes
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-11 16:56 UTC (permalink / raw)
  To: Troy Benjegerdes; +Cc: linux-kernel, openib-general

    Troy> Is there a check in the kernel that the memory is actually
    Troy> mlock()ed?

No.

    Troy> What if a malicious (or broken) application does
    Troy> ibv_reg_mr() but doesn't lock the memory? Does the IB card
    Troy> get a physical address for a page that might get swapped
    Troy> out?

No, the kernel does get_user_pages().  So the pages that the HCA gets
will not be swapped or used for anything else.  The only thing a
malicious userspace app can do is screw itself up.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 16:56       ` Roland Dreier
@ 2005-04-11 18:01         ` Troy Benjegerdes
  2005-04-11 18:03           ` Roland Dreier
  0 siblings, 1 reply; 144+ messages in thread
From: Troy Benjegerdes @ 2005-04-11 18:01 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linux-kernel, openib-general

On Mon, Apr 11, 2005 at 09:56:53AM -0700, Roland Dreier wrote:
>     Troy> Is there a check in the kernel that the memory is actually
>     Troy> mlock()ed?
> 
> No.
> 
>     Troy> What if a malicious (or broken) application does
>     Troy> ibv_reg_mr() but doesn't lock the memory? Does the IB card
>     Troy> get a physical address for a page that might get swapped
>     Troy> out?
> 
> No, the kernel does get_user_pages().  So the pages that the HCA gets
> will not be swapped or used for anything else.  The only thing a
> malicious userspace app can do is screw itself up.
> 
>  - R.

Do we even need the mlock in userspace then?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 18:01         ` Troy Benjegerdes
@ 2005-04-11 18:03           ` Roland Dreier
  2005-04-12  0:13             ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-11 18:03 UTC (permalink / raw)
  To: Troy Benjegerdes; +Cc: linux-kernel, openib-general

    Troy> Do we even need the mlock in userspace then?

Yes, because the kernel may go through and unmap pages from userspace
while trying to swap.  Since we have the page locked in the kernel,
the physical page won't go anywhere, but userspace might end up with a
different page mapped at the same virtual address.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 18:03           ` Roland Dreier
@ 2005-04-12  0:13             ` Andrew Morton
  2005-04-12  0:21               ` Roland Dreier
                                 ` (2 more replies)
  0 siblings, 3 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-12  0:13 UTC (permalink / raw)
  To: Roland Dreier; +Cc: hozer, linux-kernel, openib-general

Roland Dreier <roland@topspin.com> wrote:
>
>     Troy> Do we even need the mlock in userspace then?
> 
> Yes, because the kernel may go through and unmap pages from userspace
> while trying to swap.  Since we have the page locked in the kernel,
> the physical page won't go anywhere, but userspace might end up with a
> different page mapped at the same virtual address.

That shouldn't happen.  If get_user_pages() has elevated the refcount on a
page then the following can happen:

- The VM may decide to add the page to swapcache (if it's not mmapped
  from a file).

- Once the page is backed by either swapcache of a (mmapped) file, the VM
  may decide the unmap the application's pte's.  A later minor fault by the
  app will cause the same physical page to be remapped.

- The VM may decide to try to write the page to its backing file or swap.
   If it does, the page is still in core, but is now clean.

- Once all pte's are unmapped and the page is clean, the VM may decide to
  try to reclaim the page.  The VM will then see the elevated refcount and
  will bale out, leaving the page in core.

- If your code was doing a read-from-disk (modifying memory), then your
  code should run set_page_dirty() or set_page_dirty_lock() against the
  page before dropping the refcount which get_user_pages() added.  Once the
  page is dirty, the VM can't reclaim it until it has been been written to
  swap or mmapped backing file.

IOW: while the page has an elevated refcount from get_user_pages(), that
physical page is 100% pinned.  Once you've done the
set_page_dirty+put_page(), the page is again under control of the VM.

There should be no need to run mlock() from userspace.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-12  0:13             ` Andrew Morton
@ 2005-04-12  0:21               ` Roland Dreier
  2005-04-12 18:23                 ` Michael S. Tsirkin
  2005-04-13  1:04               ` [openib-general] " Libor Michalek
  2005-04-18 16:22               ` Timur Tabi
  2 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-12  0:21 UTC (permalink / raw)
  To: Andrew Morton, libor; +Cc: hozer, linux-kernel, openib-general

    Roland> Yes, because the kernel may go through and unmap pages
    Roland> from userspace while trying to swap.  Since we have the
    Roland> page locked in the kernel, the physical page won't go
    Roland> anywhere, but userspace might end up with a different page
    Roland> mapped at the same virtual address.

    Andrew> That shouldn't happen.  If get_user_pages() has elevated
    Andrew> the refcount on a page then the following can happen:

    ...

    Andrew> IOW: while the page has an elevated refcount from
    Andrew> get_user_pages(), that physical page is 100% pinned.
    Andrew> Once you've done the set_page_dirty+put_page(), the page
    Andrew> is again under control of the VM.

Hmm... I've never tested it first hand but Libor assures me there is a
something like what I said.  Libor, did I get the explanation right?

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-12  0:21               ` Roland Dreier
@ 2005-04-12 18:23                 ` Michael S. Tsirkin
  2005-04-13 18:28                   ` Roland Dreier
  0 siblings, 1 reply; 144+ messages in thread
From: Michael S. Tsirkin @ 2005-04-12 18:23 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Andrew Morton, libor, linux-kernel, openib-general

Quoting r. Roland Dreier <roland@topspin.com>:
> Subject: Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
> 
>     Roland> Yes, because the kernel may go through and unmap pages
>     Roland> from userspace while trying to swap.  Since we have the
>     Roland> page locked in the kernel, the physical page won't go
>     Roland> anywhere, but userspace might end up with a different page
>     Roland> mapped at the same virtual address.
> 
>     Andrew> That shouldn't happen.  If get_user_pages() has elevated
>     Andrew> the refcount on a page then the following can happen:
> 
>     ...
> 
>     Andrew> IOW: while the page has an elevated refcount from
>     Andrew> get_user_pages(), that physical page is 100% pinned.
>     Andrew> Once you've done the set_page_dirty+put_page(), the page
>     Andrew> is again under control of the VM.
> 
> Hmm... I've never tested it first hand but Libor assures me there is a
> something like what I said.  Libor, did I get the explanation right?
> 
>  - R.

Roland, is it possible that what you describe is the behaviour of older kernels?

Digging around in rmap.c, I see the following code in try_to_unmap_one:

        /*
         * Don't pull an anonymous page out from under get_user_pages.
         * GUP carefully breaks COW and raises page count (while holding
         * page_table_lock, as we have here) to make sure that the page
         * cannot be freed.  If we unmap that page here, a user write
         * access to the virtual address will bring back the page, but
         * its raised count will (ironically) be taken to mean it's not
         * an exclusive swap page, do_wp_page will replace it by a copy
         * page, and the user never get to see the data GUP was holding
         * the original page for.
         *
         * This test is also useful for when swapoff (unuse_process) has
         * to drop page lock: its reference to the page stops existing
         * ptes from being unmapped, so swapoff can make progress.
         */
        if (PageSwapCache(page) &&
            page_count(page) != page_mapcount(page) + 2) {
                ret = SWAP_FAIL;
                goto out_unmap;
        }

This was added in http://linus.bkbits.net:8080/linux-2.5/patch@1.1722.120.6
on 2004-06-05 , i.e. as far as I can see around 2.6.7, and the comment says:

>>>>>>>>>>>>>>>>>>>>>>
> [PATCH] mm: get_user_pages vs. try_to_unmap
> 
> Andrea Arcangeli's fix to an ironic weakness with get_user_pages. 
> 
> try_to_unmap_one must check page_count against page->mapcount before unmapping
> a swapcache page: because the raised pagecount by which get_user_pages ensures
> the page cannot be freed, will cause any write fault to see that page as not
> exclusively owned, and therefore a copy page will be substituted for it - the
> reverse of what's intended.
> 
> rmap.c was entirely free of such page_count heuristics before, I tried hard to
> avoid putting this in.  But Andrea's fix rarely gives a false positive; and
> although it might be nicer to change exclusive_swap_page etc.  to rely on
> page->mapcount instead, it seems likely that we'll want to get rid of
> page->mapcount later, so better not to entrench its use.
> 
> Signed-off-by: Hugh Dickins <hugh@veritas.com>
> Signed-off-by: Andrew Morton <akpm@osdl.org>
> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
>>>>>>>>>>>>>>>>>>>>>>

Seems quite like the situation that you described. Does my analysis make sence?

Since this case seems to be explicitly handled,
it is probably safe to rely on this behaviour or try_to_unmap,
avoiding the need for mlock, is it not?

-- 
MST - Michael S. Tsirkin

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-12  0:13             ` Andrew Morton
  2005-04-12  0:21               ` Roland Dreier
@ 2005-04-13  1:04               ` Libor Michalek
  2005-04-18 17:15                 ` Timur Tabi
  2005-04-26  3:31                 ` Libor Michalek
  2005-04-18 16:22               ` Timur Tabi
  2 siblings, 2 replies; 144+ messages in thread
From: Libor Michalek @ 2005-04-13  1:04 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Roland Dreier, linux-kernel, openib-general

On Mon, Apr 11, 2005 at 05:13:47PM -0700, Andrew Morton wrote:
> Roland Dreier <roland@topspin.com> wrote:
> >
> >     Troy> Do we even need the mlock in userspace then?
> > 
> > Yes, because the kernel may go through and unmap pages from userspace
> > while trying to swap.  Since we have the page locked in the kernel,
> > the physical page won't go anywhere, but userspace might end up with a
> > different page mapped at the same virtual address.

With the last few kernels I haven't had a chance to retest the problem
that pushed us in the direction of using mlock. I will go back and do
so with the latest kernel. Below I've given a quick description of the
issue.

> That shouldn't happen.  If get_user_pages() has elevated the refcount on a
> page then the following can happen:
> 
> - The VM may decide to add the page to swapcache (if it's not mmapped
>   from a file).
> 
> - Once the page is backed by either swapcache of a (mmapped) file, the VM
>   may decide the unmap the application's pte's.  A later minor fault by the
>   app will cause the same physical page to be remapped.

The driver did use get_user_pages() to elevated the refcount on all the
pages it was going to use for IO, as well as call set_page_dirty() since
the pages were going to have data written to them from the device.

The problem we were seeing is that the minor fault by the app resulted
in a new physical page getting mapped for the application. The page that
had the elevated refcount was still waiting for the data to be written
to by the driver at the time that the app accessed the page causing the
minor fault. Obviously since the app had a new mapping the data written
by the driver was lost.

It looks like code was added to try_to_unmap_one() to address this, so
hopefully it's no longer an issue...


-Libor

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-12 18:23                 ` Michael S. Tsirkin
@ 2005-04-13 18:28                   ` Roland Dreier
  2005-04-13 19:32                     ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-13 18:28 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Andrew Morton, libor, linux-kernel, openib-general

OK, I'm by no means an expert on this, but Libor and I looked at
rmap.c a little more, and there is code:

	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
			ptep_clear_flush_young(vma, address, pte)) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

before the check

	if (PageSwapCache(page) &&
	    page_count(page) != page_mapcount(page) + 2) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

If userspace allocates some memory but doesn't touch it aside from
passing the address in to the kernel, which does get_user_pages(), the
PTE will be young in that first test, right?  Does that mean that
the userspace mapping will be cleared and userspace will get a
different physical page if it faults that address back in?

 - R.



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-13 18:28                   ` Roland Dreier
@ 2005-04-13 19:32                     ` Andrew Morton
  0 siblings, 0 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-13 19:32 UTC (permalink / raw)
  To: Roland Dreier; +Cc: mst, libor, linux-kernel, openib-general

Roland Dreier <roland@topspin.com> wrote:
>
> OK, I'm by no means an expert on this, but Libor and I looked at
> rmap.c a little more, and there is code:
> 
> 	if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) ||
> 			ptep_clear_flush_young(vma, address, pte)) {
> 		ret = SWAP_FAIL;
> 		goto out_unmap;
> 	}
> 
> before the check
> 
> 	if (PageSwapCache(page) &&
> 	    page_count(page) != page_mapcount(page) + 2) {
> 		ret = SWAP_FAIL;
> 		goto out_unmap;
> 	}
> 
> If userspace allocates some memory but doesn't touch it aside from
> passing the address in to the kernel, which does get_user_pages(), the
> PTE will be young in that first test, right?

If get_user_pages() was called with write=1, get_user_pages() will fault in
a real page and yes, I guess it'll be pte_young.

If get_user_pages() was called with write=0, get_user_pages() will fault
in a mapping of the zero page and we'd never get this far.

> Does that mean that
> the userspace mapping will be cleared and userspace will get a
> different physical page if it faults that address back in? 
>

We won't try to unmap a page's ptes until that page has file-or-swapcache
backing.

If the pte is then cleared, a subsequent minor fault will reestablish the
mapping to the same physical page.  A major fault cannot happen because the
page was pinned by get_user_pages().


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-11 15:34   ` Roland Dreier
  2005-04-11 16:33     ` Troy Benjegerdes
@ 2005-04-18 16:09     ` Timur Tabi
  2005-04-18 16:12       ` Roland Dreier
  2005-04-18 16:16       ` Arjan van de Ven
  1 sibling, 2 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 16:09 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Troy Benjegerdes, linux-kernel, openib-general

Roland Dreier wrote:
>     Troy> How is memory pinning handled? (I haven't had time to read
>     Troy> all the code, so please excuse my ignorance of something
>     Troy> obvious).
> 
> The userspace library calls mlock() and then the kernel does
> get_user_pages().

Why do you call mlock() and get_user_pages()?  In our code, we only call mlock(), and the 
memory is pinned.  We have a test case that fails if only get_user_pages() is called, but 
it passes if only mlock() is called.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:09     ` Timur Tabi
@ 2005-04-18 16:12       ` Roland Dreier
  2005-04-18 16:50         ` Timur Tabi
  2005-04-18 16:16       ` Arjan van de Ven
  1 sibling, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-18 16:12 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Troy Benjegerdes, linux-kernel, openib-general

    Timur> Why do you call mlock() and get_user_pages()?  In our code,
    Timur> we only call mlock(), and the memory is pinned.  We have a
    Timur> test case that fails if only get_user_pages() is called,
    Timur> but it passes if only mlock() is called.

What if a buggy/malicious userspace program doesn't call mlock()?

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:09     ` Timur Tabi
  2005-04-18 16:12       ` Roland Dreier
@ 2005-04-18 16:16       ` Arjan van de Ven
  2005-04-18 16:25         ` Timur Tabi
                           ` (2 more replies)
  1 sibling, 3 replies; 144+ messages in thread
From: Arjan van de Ven @ 2005-04-18 16:16 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

On Mon, 2005-04-18 at 11:09 -0500, Timur Tabi wrote:
> Roland Dreier wrote:
> >     Troy> How is memory pinning handled? (I haven't had time to read
> >     Troy> all the code, so please excuse my ignorance of something
> >     Troy> obvious).
> > 
> > The userspace library calls mlock() and then the kernel does
> > get_user_pages().
> 
> Why do you call mlock() and get_user_pages()?  In our code, we only call mlock(), and the 
> memory is pinned. 

this is a myth; linux is free to move the page about in physical memory
even if it's mlock()ed!!

And even then, the user can munlock the memory from another thread etc
etc. Not a good idea.

get_user_pages() is used from AIO and other parts of the kernel for
similar purposes and in fact is designed for it, so it better work. If
it has bugs those should be fixed, not worked around!



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-12  0:13             ` Andrew Morton
  2005-04-12  0:21               ` Roland Dreier
  2005-04-13  1:04               ` [openib-general] " Libor Michalek
@ 2005-04-18 16:22               ` Timur Tabi
  2005-04-18 16:43                 ` Christoph Hellwig
  2 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 16:22 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Roland Dreier, hozer, linux-kernel, openib-general

Andrew Morton wrote:
> Roland Dreier <roland@topspin.com> wrote:
> 
>>    Troy> Do we even need the mlock in userspace then?
>>
>>Yes, because the kernel may go through and unmap pages from userspace
>>while trying to swap.  Since we have the page locked in the kernel,
>>the physical page won't go anywhere, but userspace might end up with a
>>different page mapped at the same virtual address.
> 
> 
> That shouldn't happen.  If get_user_pages() has elevated the refcount on a
> page then the following can happen:
> 
> - The VM may decide to add the page to swapcache (if it's not mmapped
>   from a file).
> 
> - Once the page is backed by either swapcache of a (mmapped) file, the VM
>   may decide the unmap the application's pte's.  A later minor fault by the
>   app will cause the same physical page to be remapped.

That's not what we're seeing.  We have hardware that does DMA over the network (much like 
the Infiniband stuff), and we have a testcase that fails if get_user_pages() is used, but 
not if mlock() is used.  Consider two computers on a network, X and Y.  Both have our 
hardware, which can transfer a page of memory from a given physical address on X to a 
physical address on Y.

1) Application on X allocates a block of memory, and passes the virtual address to the driver.
2) Driver on X calls get_user_pages() and then obtains a physical address for the memory.
3) Application and driver on Y do the same thing.
4) App X fills memory with some data D.
5) App X then allocates as much memory as it possibly can.  It touches every page in this 
memory, and then frees the memory.  This will force other pages to be swapped out, 
including the supposedly pinned memory.
6) App X then tells Driver X to transfer data D to computer Y.
7) App Y compares data D and finds that it doesn't match with it's supposed to.

Conclusion: during step 5, the data in pinned memory is swapped out or something.  I'm not 
sure where it goes.

We can only demonstrate this problem using our hardware, because you need the ability to 
transfer memory without using the CPU.  We were going to prepare a test case and ship same 
hardware to a few kernel developers to prove our point, but now that we're able to call 
mlock() in non-user processes, we decided it wasn't worth our time.  Actually, I 
discovered that I can call cap_raise() and set the ulimit structure, which gives me the 
ability to call mlock() on any amount of memory from any process in 2.4 and 2.6 kernels, 
which we need to support.  If I had thought of that earlier, I wouldn't have needed all 
those hacks to call sys_mlock() from the driver.




-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:16       ` Arjan van de Ven
@ 2005-04-18 16:25         ` Timur Tabi
  2005-04-18 19:40           ` Arjan van de Ven
  2005-04-22 17:55         ` Timur Tabi
  2005-04-29  0:56         ` Andrew Morton
  2 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 16:25 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

Arjan van de Ven wrote:

> this is a myth; linux is free to move the page about in physical memory
> even if it's mlock()ed!!

Then Linux has a very odd definition of the word "locked".

> And even then, the user can munlock the memory from another thread etc
> etc. Not a good idea.

Well, that's okay, because then the app is doing something stupid, so we don't worry about 
that.

> get_user_pages() is used from AIO and other parts of the kernel for
> similar purposes and in fact is designed for it, so it better work. If
> it has bugs those should be fixed, not worked around!

I've been complaining about get_user_pages() not working for a long time now, but I can 
only demonstrate the problem with our hardware.  See my other post in this thread for details.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:22               ` Timur Tabi
@ 2005-04-18 16:43                 ` Christoph Hellwig
  2005-04-18 16:45                   ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Christoph Hellwig @ 2005-04-18 16:43 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Andrew Morton, Roland Dreier, hozer, linux-kernel, openib-general

On Mon, Apr 18, 2005 at 11:22:29AM -0500, Timur Tabi wrote:
> That's not what we're seeing.  We have hardware that does DMA over the 
> network (much like the Infiniband stuff), and we have a testcase that fails 
> if get_user_pages() is used, but not if mlock() is used.

If you don't share your testcase it's unlikely to be fixed.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:43                 ` Christoph Hellwig
@ 2005-04-18 16:45                   ` Timur Tabi
  2005-04-24  2:44                     ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 16:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrew Morton, Roland Dreier, hozer, linux-kernel, openib-general

Christoph Hellwig wrote:
> On Mon, Apr 18, 2005 at 11:22:29AM -0500, Timur Tabi wrote:
> 
>>That's not what we're seeing.  We have hardware that does DMA over the 
>>network (much like the Infiniband stuff), and we have a testcase that fails 
>>if get_user_pages() is used, but not if mlock() is used.
> 
> 
> If you don't share your testcase it's unlikely to be fixed.

As I said, the testcase only works with our hardware, and it's also very large.  It's one 
small test that's part of a huge test suite.  It takes a couple hours just to install the 
damn thing.

We want to produce a simpler test case that demonstrates the problem in an 
easy-to-understand manner, but we don't have time to do that now.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:12       ` Roland Dreier
@ 2005-04-18 16:50         ` Timur Tabi
  2005-04-21 19:47           ` Pavel Machek
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 16:50 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Troy Benjegerdes, linux-kernel, openib-general

Roland Dreier wrote:
>     Timur> Why do you call mlock() and get_user_pages()?  In our code,
>     Timur> we only call mlock(), and the memory is pinned.  We have a
>     Timur> test case that fails if only get_user_pages() is called,
>     Timur> but it passes if only mlock() is called.
> 
> What if a buggy/malicious userspace program doesn't call mlock()?

Our library calls mlock() when the apps requests memory to be "registered".  We then call 
munlock() when the app requests the memory to be unregistered.  All apps talk to our 
library for all services.  No apps talk to the driver directly.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-13  1:04               ` [openib-general] " Libor Michalek
@ 2005-04-18 17:15                 ` Timur Tabi
  2005-04-26  3:31                 ` Libor Michalek
  1 sibling, 0 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 17:15 UTC (permalink / raw)
  To: Libor Michalek; +Cc: Andrew Morton, Roland Dreier, linux-kernel, openib-general

Libor Michalek wrote:

> The problem we were seeing is that the minor fault by the app resulted
> in a new physical page getting mapped for the application. The page that
> had the elevated refcount was still waiting for the data to be written
> to by the driver at the time that the app accessed the page causing the
> minor fault. Obviously since the app had a new mapping the data written
> by the driver was lost.

Thanks Libor, this is much better explanation of the problem than what I posted.

> It looks like code was added to try_to_unmap_one() to address this, so
> hopefully it's no longer an issue...

I doubt it.  I tried this with an earlier 2.6 kernel, and get_user_pages() was still not 
enough to really pin the memory down.  Maybe it works in 2.6.12, but that doesn't help me 
any, because our driver needs to support all 2.4 and 2.6 kernels.  Currently, mlock() 
alone seems to be good enough, but I'm going to add calls to get_user_pages() just to be sure.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:25         ` Timur Tabi
@ 2005-04-18 19:40           ` Arjan van de Ven
  2005-04-18 20:00             ` Timur Tabi
  2005-04-18 20:07             ` [openib-general] " Bernhard Fischer
  0 siblings, 2 replies; 144+ messages in thread
From: Arjan van de Ven @ 2005-04-18 19:40 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

On Mon, 2005-04-18 at 11:25 -0500, Timur Tabi wrote:
> Arjan van de Ven wrote:
> 
> > this is a myth; linux is free to move the page about in physical memory
> > even if it's mlock()ed!!
> 
> Then Linux has a very odd definition of the word "locked".
> 
> > And even then, the user can munlock the memory from another thread etc
> > etc. Not a good idea.
> 
> Well, that's okay, because then the app is doing something stupid, so we don't worry about 
> that.

you should since that physical page can be reused, say by a root
process, and you'd be majorly screwed



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 19:40           ` Arjan van de Ven
@ 2005-04-18 20:00             ` Timur Tabi
  2005-04-18 20:05               ` Arjan van de Ven
  2005-04-18 20:07             ` [openib-general] " Bernhard Fischer
  1 sibling, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 20:00 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

Arjan van de Ven wrote:

> you should since that physical page can be reused, say by a root
> process, and you'd be majorly screwed

I don't understand what you mean by "reused".  The whole point behind pinning the memory 
is that it stays where it is.  It doesn't get moved around and it doesn't get swapped out.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 20:00             ` Timur Tabi
@ 2005-04-18 20:05               ` Arjan van de Ven
  2005-04-18 20:19                 ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Arjan van de Ven @ 2005-04-18 20:05 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

On Mon, 2005-04-18 at 15:00 -0500, Timur Tabi wrote:
> Arjan van de Ven wrote:
> 
> > you should since that physical page can be reused, say by a root
> > process, and you'd be majorly screwed
> 
> I don't understand what you mean by "reused".  The whole point behind pinning the memory 
> is that it stays where it is.  It doesn't get moved around and it doesn't get swapped out.
> 
you just said that you didn't care that it got munlock'd. So you don't
care that it gets freed either. And then reused.



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 19:40           ` Arjan van de Ven
  2005-04-18 20:00             ` Timur Tabi
@ 2005-04-18 20:07             ` Bernhard Fischer
  2005-04-21  2:17               ` Troy Benjegerdes
  1 sibling, 1 reply; 144+ messages in thread
From: Bernhard Fischer @ 2005-04-18 20:07 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Timur Tabi, linux-kernel, openib-general

On Mon, Apr 18, 2005 at 09:40:40PM +0200, Arjan van de Ven wrote:
>On Mon, 2005-04-18 at 11:25 -0500, Timur Tabi wrote:
>> Arjan van de Ven wrote:
>> 
>> > this is a myth; linux is free to move the page about in physical memory
>> > even if it's mlock()ed!!
darn, yes, this is true.
I know people who introduced
#define VM_RESERVED     0x00080000      /* Don't unmap it from swap_out
*/
to vm_flags just because of this. I'll just hold my breath and won't
delve further.
>> 
>> Then Linux has a very odd definition of the word "locked".
>> 
>> > And even then, the user can munlock the memory from another thread etc
>> > etc. Not a good idea.
>> 
>> Well, that's okay, because then the app is doing something stupid, so we don't worry about 
>> that.
>
>you should since that physical page can be reused, say by a root
>process, and you'd be majorly screwed

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 20:05               ` Arjan van de Ven
@ 2005-04-18 20:19                 ` Timur Tabi
  0 siblings, 0 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-18 20:19 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

Arjan van de Ven wrote:

> you just said that you didn't care that it got munlock'd. So you don't
> care that it gets freed either. And then reused.

Well, I can live with the app being able to call munlock(), because the apps that our 
customers use don't call munlock().  What I can't live with is a bug in the kernel that 
causes pinned pages to be swapped or moved.

Obviously, I would rather call get_user_pages() instead of mlock(), but I can't, because 
get_user_pages doesn't work.  The page doesn't stay pinned at the physical address, but it 
does if I call mlock() and get_user_pages().

Actually, in our tests, calling mlock() appears to be good enough, but I'll update our 
code to call get_user_pages() as well.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

^ permalink raw reply	[flat|nested] 144+ messages in thread

* [PATCH][MTHCA] fix sparc build WAS: Re: [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes
  2005-04-04 22:09     ` [PATCH][RFC][3/4] IB: userspace verbs mthca changes Roland Dreier
  2005-04-04 22:09       ` [PATCH][RFC][4/4] IB: userspace verbs Kconfig/Makefile changes Roland Dreier
  2005-04-04 22:49       ` [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes Tom Duffy
@ 2005-04-21  0:37       ` Tom Duffy
  2005-04-21  0:38         ` David S. Miller
  2 siblings, 1 reply; 144+ messages in thread
From: Tom Duffy @ 2005-04-21  0:37 UTC (permalink / raw)
  To: Roland Dreier; +Cc: David S. Miller, linux-kernel, openib-general

On Mon, 2005-04-04 at 15:09 -0700, Roland Dreier wrote:
> @@ -574,6 +836,22 @@
>         return 0;
>  }
>  
> +static int mthca_mmap_uar(struct ib_ucontext *context,
> +                         struct vm_area_struct *vma)
> +{
> +       if (vma->vm_end - vma->vm_start != PAGE_SIZE)
> +               return -EINVAL;
> +
> +       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +       if (remap_pfn_range(vma, vma->vm_start,
> +                           to_mucontext(context)->uar.pfn,
> +                           PAGE_SIZE, vma->vm_page_prot))
> +               return -EAGAIN;
> +
> +       return 0;
> +}
> +

This breaks building on sparc64:

  CC [M]  drivers/infiniband/hw/mthca/mthca_provider.o
/build1/tduffy/openib-work/linux-2.6.11-openib/drivers/infiniband/hw/mthca/mthca_provider.c: In function `mthca_mmap_uar':
/build1/tduffy/openib-work/linux-2.6.11-openib/drivers/infiniband/hw/mthca/mthca_provider.c:352: warning: implicit declaration of function `pgprot_noncached'
/build1/tduffy/openib-work/linux-2.6.11-openib/drivers/infiniband/hw/mthca/mthca_provider.c:352: error: incompatible types in assignment
make[3]: *** [drivers/infiniband/hw/mthca/mthca_provider.o] Error 1
make[2]: *** [drivers/infiniband/hw/mthca] Error 2
make[1]: *** [_module_drivers/infiniband] Error 2
make: *** [_all] Error 2

This is ugly, but fixes the build.  Perhaps sparc needs
pgprot_noncached() to be a noop?

Signed-off-by: Tom Duffy <tduffy@sun.com>

Index: linux-2.6.11-openib/drivers/infiniband/hw/mthca/mthca_provider.c
===================================================================
--- linux-2.6.11-openib/drivers/infiniband/hw/mthca/mthca_provider.c	(revision 2202)
+++ linux-2.6.11-openib/drivers/infiniband/hw/mthca/mthca_provider.c	(working copy)
@@ -349,7 +349,9 @@ static int mthca_mmap_uar(struct ib_ucon
 	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 		return -EINVAL;
 
+#ifdef pgprot_noncached
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+#endif
 
 	if (remap_pfn_range(vma, vma->vm_start,
 			    to_mucontext(context)->uar.pfn,


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][MTHCA] fix sparc build WAS: Re: [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes
  2005-04-21  0:37       ` [PATCH][MTHCA] fix sparc build WAS: " Tom Duffy
@ 2005-04-21  0:38         ` David S. Miller
  0 siblings, 0 replies; 144+ messages in thread
From: David S. Miller @ 2005-04-21  0:38 UTC (permalink / raw)
  To: Tom Duffy; +Cc: roland, linux-kernel, openib-general

On Wed, 20 Apr 2005 17:37:11 -0700
Tom Duffy <tduffy@sun.com> wrote:

> This breaks building on sparc64:
 ...
> This is ugly, but fixes the build.  Perhaps sparc needs
> pgprot_noncached() to be a noop?

No, it should actually do something, like so:

include/asm-sparc64/pgtable.h: af9bf175a223cf44310293287d50302e0fd3f9e9
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -416,6 +416,11 @@ extern int io_remap_pfn_range(struct vm_
 			       unsigned long pfn,
 			       unsigned long size, pgprot_t prot);
 
+/* Clear virtual and physical cachability, set side-effect bit.  */
+#define pgprot_noncached(prot) \
+	(__pgprot((pgprot_val(prot) & ~(_PAGE_CP | _PAGE_CV)) | \
+	 _PAGE_E))
+
 /*
  * For sparc32&64, the pfn in io_remap_pfn_range() carries <iospace> in
  * its high 4 bits.  These macros/functions put it there or get it from there.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 20:07             ` [openib-general] " Bernhard Fischer
@ 2005-04-21  2:17               ` Troy Benjegerdes
  2005-04-21  3:07                 ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Troy Benjegerdes @ 2005-04-21  2:17 UTC (permalink / raw)
  To: Bernhard Fischer
  Cc: Arjan van de Ven, Timur Tabi, linux-kernel, openib-general

On Mon, Apr 18, 2005 at 10:07:12PM +0200, Bernhard Fischer wrote:
> On Mon, Apr 18, 2005 at 09:40:40PM +0200, Arjan van de Ven wrote:
> >On Mon, 2005-04-18 at 11:25 -0500, Timur Tabi wrote:
> >> Arjan van de Ven wrote:
> >> 
> >> > this is a myth; linux is free to move the page about in physical memory
> >> > even if it's mlock()ed!!
> darn, yes, this is true.
> I know people who introduced
> #define VM_RESERVED     0x00080000      /* Don't unmap it from swap_out
> */

Someone (aka Tospin, infinicon, and Amasso) should probably post a patch
adding '#define VM_REGISTERD 0x01000000', and some extensions to
something like 'madvise' to set pages to be registered.

My preference is said patch will also allow a way for the kernel to
reclaim registered memory from an application under memory pressure.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21  2:17               ` Troy Benjegerdes
@ 2005-04-21  3:07                 ` Timur Tabi
  2005-04-21 17:38                   ` Andy Isaacson
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-21  3:07 UTC (permalink / raw)
  To: Troy Benjegerdes
  Cc: Bernhard Fischer, Arjan van de Ven, linux-kernel, openib-general

Troy Benjegerdes wrote:

> Someone (aka Tospin, infinicon, and Amasso) should probably post a patch
> adding '#define VM_REGISTERD 0x01000000', and some extensions to
> something like 'madvise' to set pages to be registered.
> 
> My preference is said patch will also allow a way for the kernel to
> reclaim registered memory from an application under memory pressure.

I don't know if VM_REGISTERED is a good idea or not, but it should be absolutely 
impossible for the kernel to reclaim "registered" (aka pinned) memory, no matter what. 
For RDMA services (such as Infiniband, iWARP, etc), it's normal for non-root processes to 
pin hundreds of megabytes of memory, and that memory better be locked to those physical 
pages until the application deregisters them.

If kernel really thinks it needs to unpin those pages, then at the very least it should 
kill the process, and the syslog better have a very clear message indicating why.  That 
way, the application doesn't continue thinking that everything's still going to work.  If 
those pages become unpinned, the applications are going to experience serious data corruption.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21  3:07                 ` Timur Tabi
@ 2005-04-21 17:38                   ` Andy Isaacson
  2005-04-21 18:39                     ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Andy Isaacson @ 2005-04-21 17:38 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Troy Benjegerdes, Bernhard Fischer, Arjan van de Ven,
	linux-kernel, openib-general

On Wed, Apr 20, 2005 at 10:07:45PM -0500, Timur Tabi wrote:
> Troy Benjegerdes wrote:
> >Someone (aka Tospin, infinicon, and Amasso) should probably post a patch
> >adding '#define VM_REGISTERD 0x01000000', and some extensions to
> >something like 'madvise' to set pages to be registered.
> >
> >My preference is said patch will also allow a way for the kernel to
> >reclaim registered memory from an application under memory pressure.
> 
> I don't know if VM_REGISTERED is a good idea or not, but it should be 
> absolutely impossible for the kernel to reclaim "registered" (aka pinned) 
> memory, no matter what. For RDMA services (such as Infiniband, iWARP, etc), 
> it's normal for non-root processes to pin hundreds of megabytes of memory, 
> and that memory better be locked to those physical pages until the 
> application deregisters them.

If you take the hardline position that "the app is the only thing that
matters", your code is unlikely to get merged.  Linux is a
general-purpose OS.

I don't think that Troy was suggesting the kernel should deregister
memory without notifying the application.  Personally, I envision
something like the NetBSD Scheduler Activations (SA) work, where the
kernel can notify the app of changes to its state in a very efficient
manner.  (According to the NetBSD design whitepaper, the kernel does an
upcall whenever the multithreaded app gains or loses a CPU!)

In a Linux context, I doubt that fullblown SA is necessary or
appropriate.  Rather, I'd suggest two new signals, SIGMEMLOW and
SIGMEMCRIT.  The userland comms library registers handlers for both.
When the kernel decides that it needs to reclaim some memory from the
app, it sends SIGMEMLOW.  The comms library then has the responsibility
to un-reserve some memory in an orderly fashion.  If a reasonable [1]
time has expired since SIGMEMLOW and the kernel is still hungry, the
kernel sends SIGMEMCRIT.  At this point, the comms lib *must* unregister
some memory [2] even if it has to drop state to do so; if it returns
from the signal handler without having unregistered the memory, the
kernel will SIGKILL.

[1] Part of the interface spec should cover the expectation as to how
    long the library is allowed to take; I'd guess that 2 timeslices
    should suffice.
[2] Is there a way for the kernel to pass down to userspace how many
    pages it wants, maybe in the sigcontext?

> If kernel really thinks it needs to unpin those pages, then at the very 
> least it should kill the process, and the syslog better have a very clear 
> message indicating why.  That way, the application doesn't continue 
> thinking that everything's still going to work.  If those pages become 
> unpinned, the applications are going to experience serious data corruption.

You might want to consider what happens with your communication system
in a machine running power-saving modes (in the limit, suspend-to-disk).
Of course most machines with Infiniband adapters aren't running swsusp,
but it's not inconceivable that blade servers might sleep to lower power
and cooling costs.

-andy

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 17:38                   ` Andy Isaacson
@ 2005-04-21 18:39                     ` Timur Tabi
  2005-04-21 19:56                       ` Andy Isaacson
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-21 18:39 UTC (permalink / raw)
  To: Andy Isaacson
  Cc: Troy Benjegerdes, Bernhard Fischer, Arjan van de Ven,
	linux-kernel, openib-general

Andy Isaacson wrote:

> If you take the hardline position that "the app is the only thing that
> matters", your code is unlikely to get merged.  Linux is a
> general-purpose OS.

The problem is that our driver and library implement an API that we don't fully control. 
The API states that the application allocates the memory and tells the library to register 
it.  The app then goes on its merry way until it's done, at which point it tells the 
library to deregister the memory.  Neither the app nor the API has any provision for the 
app to be notified that the memory is no longer pinned and therefore can't be trusted. 
That would be considered a critical failure from the app's perspective, so the kernel 
would be doing it a favor by killing the process.

> You might want to consider what happens with your communication system
> in a machine running power-saving modes (in the limit, suspend-to-disk).
> Of course most machines with Infiniband adapters aren't running swsusp,
> but it's not inconceivable that blade servers might sleep to lower power
> and cooling costs.

Any application that registers memory, will in all likelihood be running at 100% CPU 
non-stop.  The computer is not going to be doing anything else but whatever that app is 
trying to do.  The application could conceiveable register gigabytes of RAM, and if even a 
single page becomes unpinned, the whole thing is worthless.  The application cannot do 
anything meaningful if it gets a message saying that some of the memory has become 
unpinned and should not be used.

So the real question is: how important is it to the kernel developers that Linux support 
these kinds of enterprise-class applications?

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:50         ` Timur Tabi
@ 2005-04-21 19:47           ` Pavel Machek
  0 siblings, 0 replies; 144+ messages in thread
From: Pavel Machek @ 2005-04-21 19:47 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

Hi!

> >    Timur> Why do you call mlock() and get_user_pages()?  In our 
> >    code,
> >    Timur> we only call mlock(), and the memory is pinned.  We have a
> >    Timur> test case that fails if only get_user_pages() is called,
> >    Timur> but it passes if only mlock() is called.
> >
> >What if a buggy/malicious userspace program doesn't call mlock()?
> 
> Our library calls mlock() when the apps requests memory to be 
> "registered".  We then call munlock() when the app requests the 
> memory to be unregistered.  All apps talk to our library for all 
> services.  No apps talk to the driver directly.

That does not cover "malicious" part.
				Pavel
-- 
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms         


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 18:39                     ` Timur Tabi
@ 2005-04-21 19:56                       ` Andy Isaacson
  2005-04-21 20:07                         ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Andy Isaacson @ 2005-04-21 19:56 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Troy Benjegerdes, Bernhard Fischer, Arjan van de Ven,
	linux-kernel, openib-general

On Thu, Apr 21, 2005 at 01:39:35PM -0500, Timur Tabi wrote:
> Andy Isaacson wrote:
> >If you take the hardline position that "the app is the only thing that
> >matters", your code is unlikely to get merged.  Linux is a
> >general-purpose OS.
> 
> The problem is that our driver and library implement an API that we don't 
> fully control. The API states that the application allocates the memory and 
> tells the library to register it.  The app then goes on its merry way until 
> it's done, at which point it tells the library to deregister the memory.  
> Neither the app nor the API has any provision for the app to be notified 
> that the memory is no longer pinned and therefore can't be trusted. That 
> would be considered a critical failure from the app's perspective, so the 
> kernel would be doing it a favor by killing the process.

I'm familiar with MPI 1.0 and 2.0, but I haven't been following the
development of modern messaging APIs, so I might not make sense here...

Assuming that the app calls into the library on a fairly regular basis,
you could implement a fast-path/slow-path scheme where the library
normally operates in go-fast mode, but if a "unregister" event has
occurred, the library falls back to a less performant mode.

But now having written that I'm thinking that it's not worth the bother
- if you've got a 512P MPP job, it's basically equivalent to job death
for one of the nodes to go away in this manner -- even if the process is
still running on the node, the fact that you took a giant performance
hiccup is unacceptable.  Therefore, cluster admins are going to do their
darndest to avoid this behavior, so we might as well just kill the job
and make it explicit.

> >You might want to consider what happens with your communication system
> >in a machine running power-saving modes (in the limit, suspend-to-disk).
> >Of course most machines with Infiniband adapters aren't running swsusp,
> >but it's not inconceivable that blade servers might sleep to lower power
> >and cooling costs.
> 
> Any application that registers memory, will in all likelihood be running at 
> 100% CPU non-stop.  The computer is not going to be doing anything else but 
> whatever that app is trying to do.  The application could conceiveable 
> register gigabytes of RAM, and if even a single page becomes unpinned, the 
> whole thing is worthless.  The application cannot do anything meaningful if 
> it gets a message saying that some of the memory has become unpinned and 
> should not be used.
> 
> So the real question is: how important is it to the kernel developers that 
> Linux support these kinds of enterprise-class applications?

While I understand your arguments, this kind of rhetoric is more likely
to harden ears than to convince people you're right.  I refer you to the
"Live Patching Function" thread.

*You* need to come up with a solution that looks good to *the community*
if you want it merged.  In the long run, this process is likely to
result in *your* systems working better than if you had just gone off
and done your thing.  If you have to do something that "tastes bad" to
the average l-k hacker, *justify* it by addressing the alternatives and
explaining why your solution is the right one.

I'm leaning towards agreeing that mlock()-alicious code is the right way
to solve this problem, and it's not clear to me what the benefit of
adding a new VM_REGISTERED flag would be.

Do you guys simply raise RLIMIT_MEMLOCK to allow apps to lock their
pages?  Or are you doing something more nasty?

(Oh, I see that Libor has contributed to the other branch of this
thread... off to read...)

-andy

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 19:56                       ` Andy Isaacson
@ 2005-04-21 20:07                         ` Timur Tabi
  2005-04-21 20:12                           ` Chris Wright
  2005-04-22  6:14                           ` Greg KH
  0 siblings, 2 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-21 20:07 UTC (permalink / raw)
  To: Andy Isaacson
  Cc: Troy Benjegerdes, Bernhard Fischer, Arjan van de Ven,
	linux-kernel, openib-general

Andy Isaacson wrote:

> I'm familiar with MPI 1.0 and 2.0, but I haven't been following the
> development of modern messaging APIs, so I might not make sense here...
> 
> Assuming that the app calls into the library on a fairly regular basis,

Not really.  The whole point is to have the adapter DMA the data directly from memory to 
the network.  That's why it's called RDMA - remote DMA.

> Therefore, cluster admins are going to do their
> darndest to avoid this behavior, so we might as well just kill the job
> and make it explicit.

Yes, and if it turns out that the same MPI application dies on Linux but not on Solaris 
because Linux doesn't really care if the memory stays pinned, then we're going to see a 
lot of MPI customers transitioning away from Linux.

> *You* need to come up with a solution that looks good to *the community*
> if you want it merged.  

True, but I'm not going to waste my time adding this support if the consensus I get from 
the kernel developers that they don't want Linux to behave this way.

> Do you guys simply raise RLIMIT_MEMLOCK to allow apps to lock their
> pages?  Or are you doing something more nasty?

A little more nasty.  I raise RLIMIT_MEMLOCK in the driver to "unlimited" and also set 
cap_raise(IPC_LOCK).  I do this because I need to support all 2.4 and 2.6 kernel versions 
with the same driver, but only 2.6.10 and later have any support for non-root mlock().

If and when our driver is submitted to the official kernel, that nastiness will be removed 
of course.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 20:07                         ` Timur Tabi
@ 2005-04-21 20:12                           ` Chris Wright
  2005-04-21 20:14                             ` Timur Tabi
  2005-04-22  6:14                           ` Greg KH
  1 sibling, 1 reply; 144+ messages in thread
From: Chris Wright @ 2005-04-21 20:12 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Andy Isaacson, Troy Benjegerdes, Bernhard Fischer,
	Arjan van de Ven, linux-kernel, openib-general

* Timur Tabi (timur.tabi@ammasso.com) wrote:
> Andy Isaacson wrote:
> >Do you guys simply raise RLIMIT_MEMLOCK to allow apps to lock their
> >pages?  Or are you doing something more nasty?
> 
> A little more nasty.  I raise RLIMIT_MEMLOCK in the driver to "unlimited" 
> and also set cap_raise(IPC_LOCK).  I do this because I need to support all 
> 2.4 and 2.6 kernel versions with the same driver, but only 2.6.10 and later 
> have any support for non-root mlock().

FYI, that will not work on all 2.6 kernels.  Specifically anything that's
not using capabilities.

thanks,
-chris
-- 
Linux Security Modules     http://lsm.immunix.org     http://lsm.bkbits.net

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 20:12                           ` Chris Wright
@ 2005-04-21 20:14                             ` Timur Tabi
  2005-04-21 20:25                               ` Chris Wright
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-21 20:14 UTC (permalink / raw)
  To: Chris Wright
  Cc: Andy Isaacson, Troy Benjegerdes, Bernhard Fischer,
	Arjan van de Ven, linux-kernel, openib-general

Chris Wright wrote:

> FYI, that will not work on all 2.6 kernels.  Specifically anything that's
> not using capabilities.

It works with every kernel I've tried.  I'm sure there are plenty of kernel configuration 
options that will break our driver.  But as long as all the distros our customers use 
work, as well as reasonably-configured custom kernels, we're happy.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 20:14                             ` Timur Tabi
@ 2005-04-21 20:25                               ` Chris Wright
  2005-04-21 20:30                                 ` Arjan van de Ven
  0 siblings, 1 reply; 144+ messages in thread
From: Chris Wright @ 2005-04-21 20:25 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Chris Wright, Andy Isaacson, Troy Benjegerdes, Bernhard Fischer,
	Arjan van de Ven, linux-kernel, openib-general

* Timur Tabi (timur.tabi@ammasso.com) wrote:
> It works with every kernel I've tried.  I'm sure there are plenty of kernel 
> configuration options that will break our driver.  But as long as all the 
> distros our customers use work, as well as reasonably-configured custom 
> kernels, we're happy.
> 

Hey, if you're happy (and, as you said, you don't intend to merge that
bit), I'm happy ;-)

thanks,
-chris
-- 
Linux Security Modules     http://lsm.immunix.org     http://lsm.bkbits.net

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 20:25                               ` Chris Wright
@ 2005-04-21 20:30                                 ` Arjan van de Ven
  0 siblings, 0 replies; 144+ messages in thread
From: Arjan van de Ven @ 2005-04-21 20:30 UTC (permalink / raw)
  To: Chris Wright
  Cc: Timur Tabi, Andy Isaacson, Troy Benjegerdes, Bernhard Fischer,
	linux-kernel, openib-general

On Thu, 2005-04-21 at 13:25 -0700, Chris Wright wrote:
> * Timur Tabi (timur.tabi@ammasso.com) wrote:
> > It works with every kernel I've tried.  I'm sure there are plenty of kernel 
> > configuration options that will break our driver.  But as long as all the 
> > distros our customers use work, as well as reasonably-configured custom 
> > kernels, we're happy.
> > 
> 
> Hey, if you're happy (and, as you said, you don't intend to merge that
> bit), I'm happy ;-)


yeah... drivers giving unprivileged processes more privs belong on
bugtraq though, not in the core kernel :)



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-21 20:07                         ` Timur Tabi
  2005-04-21 20:12                           ` Chris Wright
@ 2005-04-22  6:14                           ` Greg KH
  1 sibling, 0 replies; 144+ messages in thread
From: Greg KH @ 2005-04-22  6:14 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Andy Isaacson, Troy Benjegerdes, Bernhard Fischer,
	Arjan van de Ven, linux-kernel, openib-general

On Thu, Apr 21, 2005 at 03:07:42PM -0500, Timur Tabi wrote:
> >*You* need to come up with a solution that looks good to *the community*
> >if you want it merged.  
> 
> True, but I'm not going to waste my time adding this support if the 
> consensus I get from the kernel developers that they don't want Linux to 
> behave this way.

I think we have been giving you that consensus from the very
beginning :)

The very fact that you tried to trot out the "enterprise" card should
have raised a huge flag...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:16       ` Arjan van de Ven
  2005-04-18 16:25         ` Timur Tabi
@ 2005-04-22 17:55         ` Timur Tabi
  2005-04-22 18:12           ` Arjan van de Ven
  2005-04-29  0:56         ` Andrew Morton
  2 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-22 17:55 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

Arjan van de Ven wrote:
> On Mon, 2005-04-18 at 11:09 -0500, Timur Tabi wrote:
> 
>>Roland Dreier wrote:
>>
>>>    Troy> How is memory pinning handled? (I haven't had time to read
>>>    Troy> all the code, so please excuse my ignorance of something
>>>    Troy> obvious).
>>>
>>>The userspace library calls mlock() and then the kernel does
>>>get_user_pages().
>>
>>Why do you call mlock() and get_user_pages()?  In our code, we only call mlock(), and the 
>>memory is pinned. 
> 
> 
> this is a myth; linux is free to move the page about in physical memory
> even if it's mlock()ed!!

Can you tell me when Linux actually does this?  I know in theory it can happen, but I've 
never seen it.  Does the code to implement moving of data from one physical page to 
another even exist in any version of Linux?

Also, what would be the point?  What reason would there be to move some data from one 
physical page to another, while keeping the same virtual address?

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-22 17:55         ` Timur Tabi
@ 2005-04-22 18:12           ` Arjan van de Ven
  0 siblings, 0 replies; 144+ messages in thread
From: Arjan van de Ven @ 2005-04-22 18:12 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Roland Dreier, Troy Benjegerdes, linux-kernel, openib-general

On Fri, 2005-04-22 at 12:55 -0500, Timur Tabi wrote:
> Arjan van de Ven wrote:
> > On Mon, 2005-04-18 at 11:09 -0500, Timur Tabi wrote:
> > 
> >>Roland Dreier wrote:
> >>
> >>>    Troy> How is memory pinning handled? (I haven't had time to read
> >>>    Troy> all the code, so please excuse my ignorance of something
> >>>    Troy> obvious).
> >>>
> >>>The userspace library calls mlock() and then the kernel does
> >>>get_user_pages().
> >>
> >>Why do you call mlock() and get_user_pages()?  In our code, we only call mlock(), and the 
> >>memory is pinned. 
> > 
> > 
> > this is a myth; linux is free to move the page about in physical memory
> > even if it's mlock()ed!!
> 
> Can you tell me when Linux actually does this?  I know in theory it can happen, but I've 
> never seen it.  Does the code to implement moving of data from one physical page to 
> another even exist in any version of Linux?

hot(un)plug memory.

> 
> Also, what would be the point?  What reason would there be to move some data from one 
> physical page to another, while keeping the same virtual address?

so that you can hot unplug the dimm in question.

I guess that's a bit of a high end though though... so maybe you don't
care about it.



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:45                   ` Timur Tabi
@ 2005-04-24  2:44                     ` Andrew Morton
  2005-04-24 14:23                       ` Timur Tabi
  2005-04-25 19:11                       ` Andy Isaacson
  0 siblings, 2 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-24  2:44 UTC (permalink / raw)
  To: Timur Tabi; +Cc: hch, roland, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Christoph Hellwig wrote:
> > On Mon, Apr 18, 2005 at 11:22:29AM -0500, Timur Tabi wrote:
> > 
> >>That's not what we're seeing.  We have hardware that does DMA over the 
> >>network (much like the Infiniband stuff), and we have a testcase that fails 
> >>if get_user_pages() is used, but not if mlock() is used.
> > 
> > 
> > If you don't share your testcase it's unlikely to be fixed.
> 
> As I said, the testcase only works with our hardware, and it's also very large.  It's one 
> small test that's part of a huge test suite.  It takes a couple hours just to install the 
> damn thing.
> 
> We want to produce a simpler test case that demonstrates the problem in an 
> easy-to-understand manner, but we don't have time to do that now.

If your theory is correct then it should be able to demonstrate this
problem without any special hardware at all: pin some user memory, then
generate memory pressure then check the contents of those pinned pages.

But if, for the DMA transfer, you're using the array of page*'s which were
originally obtained from get_user_pages() then it's rather hard to see how
the kernel could alter the page's contents.

Then again, if mlock() fixes it then something's up.  Very odd.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-24  2:44                     ` Andrew Morton
@ 2005-04-24 14:23                       ` Timur Tabi
  2005-04-24 20:53                         ` Greg KH
  2005-04-25 13:15                         ` Roland Dreier
  2005-04-25 19:11                       ` Andy Isaacson
  1 sibling, 2 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-24 14:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: hch, roland, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> If your theory is correct then it should be able to demonstrate this
> problem without any special hardware at all: pin some user memory, then
> generate memory pressure then check the contents of those pinned pages.

I tried that, but I couldn't get it to fail.  But that was a while ago, and I've learned a 
few things since then, so I'll try again.

> But if, for the DMA transfer, you're using the array of page*'s which were
> originally obtained from get_user_pages() then it's rather hard to see how
> the kernel could alter the page's contents.
> 
> Then again, if mlock() fixes it then something's up.  Very odd.

With mlock(), we don't need to use get_user_pages() at all.  Arjan tells me the only time 
an mlocked page can move is with hot (un)plug of memory, but that isn't supported on the 
systems that we support.  We actually prefer mlock() over get_user_pages(), because if the 
process dies, the locks automatically go away too.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-24 14:23                       ` Timur Tabi
@ 2005-04-24 20:53                         ` Greg KH
  2005-04-24 21:52                           ` Timur Tabi
  2005-04-25 13:15                         ` Roland Dreier
  1 sibling, 1 reply; 144+ messages in thread
From: Greg KH @ 2005-04-24 20:53 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Andrew Morton, hch, roland, hozer, linux-kernel, openib-general

On Sun, Apr 24, 2005 at 09:23:48AM -0500, Timur Tabi wrote:
> Andrew Morton wrote:
> 
> >If your theory is correct then it should be able to demonstrate this
> >problem without any special hardware at all: pin some user memory, then
> >generate memory pressure then check the contents of those pinned pages.
> 
> I tried that, but I couldn't get it to fail.  But that was a while ago, and 
> I've learned a few things since then, so I'll try again.
> 
> >But if, for the DMA transfer, you're using the array of page*'s which were
> >originally obtained from get_user_pages() then it's rather hard to see how
> >the kernel could alter the page's contents.
> >
> >Then again, if mlock() fixes it then something's up.  Very odd.
> 
> With mlock(), we don't need to use get_user_pages() at all.  Arjan tells me 
> the only time an mlocked page can move is with hot (un)plug of memory, but 
> that isn't supported on the systems that we support.

You don't "support" i386 or ia64 or x86-64 or ppc64 systems?  What
hardware do you support?  And what about the fact that you are aiming to
get this code into mainline, right?  If not, why are you asking here?
:)

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-24 20:53                         ` Greg KH
@ 2005-04-24 21:52                           ` Timur Tabi
  2005-04-25  1:03                             ` Greg KH
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-24 21:52 UTC (permalink / raw)
  To: Greg KH; +Cc: Andrew Morton, hch, roland, hozer, linux-kernel, openib-general

Greg KH wrote:

> You don't "support" i386 or ia64 or x86-64 or ppc64 systems?  What
> hardware do you support? 

I've never seen or heard of any x86-32 or x86-64 system that supports hot-swap RAM. Our 
hardware does not support PPC, and our software doesn't support ia-64.

 > And what about the fact that you are aiming to
> get this code into mainline, right?  If not, why are you asking here?
> :)

Well, our primary concern is getting our stuff to work.  Since get_user_pages() doesn't 
work, but mlock() does, that's what we use.  I don't know how to fix get_user_pages(), and 
I don't have the time right now to figure it out.  I know that technically mlock() is not 
the right way to do it, and so we're not going to be submitting our code for the mainline 
until get_user_pages() works and our code uses it instead of mlock().


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-24 21:52                           ` Timur Tabi
@ 2005-04-25  1:03                             ` Greg KH
  2005-04-25  4:12                               ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Greg KH @ 2005-04-25  1:03 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Andrew Morton, hch, roland, hozer, linux-kernel, openib-general

On Sun, Apr 24, 2005 at 04:52:31PM -0500, Timur Tabi wrote:
> Greg KH wrote:
> 
> >You don't "support" i386 or ia64 or x86-64 or ppc64 systems?  What
> >hardware do you support? 
> 
> I've never seen or heard of any x86-32 or x86-64 system that supports 
> hot-swap RAM.

I know of at least 1 x86-32 box from a three-letter-named company with
this feature that has been shipping for a few _years_ now.  That box is
pretty much everywhere now, and I know that other versions of it are
also quite popular (despite the high cost...)

> Our hardware does not support PPC, and our software doesn't support
> ia-64.

Your hardware is just a pci card, right?  Why wouldn't it work on ppc64
and ia64 then?

> > And what about the fact that you are aiming to
> >get this code into mainline, right?  If not, why are you asking here?
> >:)
> 
> Well, our primary concern is getting our stuff to work.  Since 
> get_user_pages() doesn't work, but mlock() does, that's what we use.  I 
> don't know how to fix get_user_pages(), and I don't have the time right now 
> to figure it out.  I know that technically mlock() is not the right way to 
> do it, and so we're not going to be submitting our code for the mainline 
> until get_user_pages() works and our code uses it instead of mlock().

Wait, what _is_ "your stuff"?  The open-ib code?  Or some other, private
fork?  Any pointers to this stuff?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25  1:03                             ` Greg KH
@ 2005-04-25  4:12                               ` Timur Tabi
  2005-04-25 13:30                                 ` Dave Hansen
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-25  4:12 UTC (permalink / raw)
  To: Greg KH; +Cc: Andrew Morton, hch, roland, hozer, linux-kernel, openib-general

Greg KH wrote:

> I know of at least 1 x86-32 box from a three-letter-named company with
> this feature that has been shipping for a few _years_ now.  That box is
> pretty much everywhere now, and I know that other versions of it are
> also quite popular (despite the high cost...)

Hmm... Well, I think we were already planning on telling our customers that we don't 
support hot-swap RAM.  Is there a CONFIG option for that feature?

> Your hardware is just a pci card, right?  Why wouldn't it work on ppc64
> and ia64 then?

It's PCI-X, actually, and I don't think we've ever actually plugged it into a PPC box. 
Isn't Open Firmware support required for all PPC boxes, anyway?  Our PCI card is not OF 
compatible, AFAIK.

As for IA64, well, we could support it, but it's not a high enough priority.  We do have 
some CPU-specific code in our driver that we would need to port to IA-64.

> Wait, what _is_ "your stuff"?  The open-ib code?

No, if anything, it's the competition to IB.  It's called iWARP (RDMA over TCP/IP), and 
it's similar to IB except it uses gigabit ethernet instead of whatever hardware IB uses. 
Because we also support RMDA, we have the same problems as OpenIB, however, we would 
prefer that the kernel support OpenRDMA instead, since it's more generic.

 >  Or some other, private
> fork?  Any pointers to this stuff?

http://ammasso.com/support.html

The current version of the code calls sys_mlock() directly from the driver.  We haven't 
released yet the version that calls mlock().

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-24 14:23                       ` Timur Tabi
  2005-04-24 20:53                         ` Greg KH
@ 2005-04-25 13:15                         ` Roland Dreier
  2005-04-25 13:17                           ` Christoph Hellwig
  2005-04-25 20:54                           ` Andrew Morton
  1 sibling, 2 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-25 13:15 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Andrew Morton, hch, hozer, linux-kernel, openib-general

    Timur> With mlock(), we don't need to use get_user_pages() at all.
    Timur> Arjan tells me the only time an mlocked page can move is
    Timur> with hot (un)plug of memory, but that isn't supported on
    Timur> the systems that we support.  We actually prefer mlock()
    Timur> over get_user_pages(), because if the process dies, the
    Timur> locks automatically go away too.

There actually is another way pages can move, with both
get_user_pages() and mlock(): copy-on-write after a fork().  If
userspace does a fork(), then all PTEs are marked read-only, and if
the original process touches the page after the fork(), a new page
will be allocated and mapped at the original virtual address.

This is actually a pretty big pain, because the only good solution
seems to be for the kernel to mark these registered regions as
VM_DONTCOPY.  Right now this means that driver code ends up monkeying
with vm_flags for user vmas.

Does it seem reasonable to add a new system call to let userspace mark
memory it doesn't want copied into forked processes?  Something like

	long sys_mark_nocopy(unsigned long addr, size_t len, int mark)

which would set VM_DONTCOPY if mark != 0, and clear it if mark == 0.
A better name would be gratefully accepted...

Then to register memory for RDMA, userspace would call
sys_mark_nocopy() (with appropriate accounting to handle possibly
overlapping regions) and the kernel would call get_user_pages().  The
get_user_pages() is of course required because the kernel can't trust
userspace to keep the pages locked.  mlock() would no longer be
necessary.  We can trust userspace to call sys_mark_nocopy() as
needed, because a process can only hurt itself and its children by
misusing the sys_mark_nocopy() call.

If this seems reasonable then I can code a patch.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 13:15                         ` Roland Dreier
@ 2005-04-25 13:17                           ` Christoph Hellwig
  2005-04-25 14:16                             ` Roland Dreier
  2005-04-25 20:54                           ` Andrew Morton
  1 sibling, 1 reply; 144+ messages in thread
From: Christoph Hellwig @ 2005-04-25 13:17 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Timur Tabi, Andrew Morton, hch, hozer, linux-kernel, openib-general

On Mon, Apr 25, 2005 at 06:15:10AM -0700, Roland Dreier wrote:
> Does it seem reasonable to add a new system call to let userspace mark
> memory it doesn't want copied into forked processes?  Something like
> 
> 	long sys_mark_nocopy(unsigned long addr, size_t len, int mark)
> 
> which would set VM_DONTCOPY if mark != 0, and clear it if mark == 0.
> A better name would be gratefully accepted...

add a new MAP_DONTCOPY flag and accept it in mmap and mprotect?


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25  4:12                               ` Timur Tabi
@ 2005-04-25 13:30                                 ` Dave Hansen
  0 siblings, 0 replies; 144+ messages in thread
From: Dave Hansen @ 2005-04-25 13:30 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Greg KH, Andrew Morton, hch, roland, hozer,
	Linux Kernel Mailing List, openib-general

On Sun, 2005-04-24 at 23:12 -0500, Timur Tabi wrote:
> Greg KH wrote:
> > I know of at least 1 x86-32 box from a three-letter-named company with
> > this feature that has been shipping for a few _years_ now.  That box is
> > pretty much everywhere now, and I know that other versions of it are
> > also quite popular (despite the high cost...)
> 
> Hmm... Well, I think we were already planning on telling our customers that we don't 
> support hot-swap RAM.  Is there a CONFIG option for that feature?

The driver to do the ACPI portion of both add and remove is in the
kernel today, so it's certainly a feature that's coming relatively soon.

There is a large variety of x86_64, ppc64, ia64 and ia64 hardware that
will be doing memory hotplug.  I believe that every POWER5 system is
capable of supporting it, at least virtually.

I don't think your concerns end with memory hotplug.  The same
approaches to moving memory around will be used for NUMA memory
balancing and for memory defragmentation.  Can you say that your cards
will never be used on a system which has memory which becomes
fragmented?

-- Dave


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 13:17                           ` Christoph Hellwig
@ 2005-04-25 14:16                             ` Roland Dreier
  0 siblings, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-25 14:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Timur Tabi, Andrew Morton, hozer, linux-kernel, openib-general

    Roland> Does it seem reasonable to add a new system call to let
    Roland> userspace mark memory it doesn't want copied into forked
    Roland> processes?  Something like

    Roland> long sys_mark_nocopy(unsigned long addr, size_t len, int
    Roland> mark)

    Roland> which would set VM_DONTCOPY if mark != 0, and clear it if
    Roland> mark == 0.  A better name would be gratefully accepted...

    Christoph> add a new MAP_DONTCOPY flag and accept it in mmap and
    Christoph> mprotect?

That is much better, thanks.  But I think it would need to be
PROT_DONTCOPY to work with mprotect(), right?

 - R.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-24  2:44                     ` Andrew Morton
  2005-04-24 14:23                       ` Timur Tabi
@ 2005-04-25 19:11                       ` Andy Isaacson
  1 sibling, 0 replies; 144+ messages in thread
From: Andy Isaacson @ 2005-04-25 19:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Timur Tabi, hch, roland, hozer, linux-kernel, openib-general

On Sat, Apr 23, 2005 at 07:44:21PM -0700, Andrew Morton wrote:
> Timur Tabi <timur.tabi@ammasso.com> wrote:
> > As I said, the testcase only works with our hardware, and it's also
> > very large.  It's one small test that's part of a huge test suite.
> > It takes a couple hours just to install the damn thing.
> > 
> > We want to produce a simpler test case that demonstrates the problem in an 
> > easy-to-understand manner, but we don't have time to do that now.
> 
> If your theory is correct then it should be able to demonstrate this
> problem without any special hardware at all: pin some user memory, then
> generate memory pressure then check the contents of those pinned pages.
> 
> But if, for the DMA transfer, you're using the array of page*'s which were
> originally obtained from get_user_pages() then it's rather hard to see how
> the kernel could alter the page's contents.
> 
> Then again, if mlock() fixes it then something's up.  Very odd.

Andrew,

Libor Michalek posted a much more reasonable (to my limited
understanding) bug description in <20050412180447.E6958@topspin.com>.

(And I'd love to provide a URL, but damned if I can figure out how to
find that message on gmane.  Clue-bat applications gladly accepted.)

Libor Michalek wrote:
# The driver did use get_user_pages() to elevated the refcount on all the
# pages it was going to use for IO, as well as call set_page_dirty() since
# the pages were going to have data written to them from the device.
# 
# The problem we were seeing is that the minor fault by the app resulted
# in a new physical page getting mapped for the application. The page that
# had the elevated refcount was still waiting for the data to be written
# to by the driver at the time that the app accessed the page causing the
# minor fault. Obviously since the app had a new mapping the data written
# by the driver was lost.
# 
# It looks like code was added to try_to_unmap_one() to address this, so   
# hopefully it's no longer an issue...

Which makes me think that Timur's bug is just an
insufficiently-understood version of Libor's.

-andy

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 13:15                         ` Roland Dreier
  2005-04-25 13:17                           ` Christoph Hellwig
@ 2005-04-25 20:54                           ` Andrew Morton
  2005-04-25 21:12                             ` Roland Dreier
  1 sibling, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 20:54 UTC (permalink / raw)
  To: Roland Dreier; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

Roland Dreier <roland@topspin.com> wrote:
>
>     Timur> With mlock(), we don't need to use get_user_pages() at all.
>      Timur> Arjan tells me the only time an mlocked page can move is
>      Timur> with hot (un)plug of memory, but that isn't supported on
>      Timur> the systems that we support.  We actually prefer mlock()
>      Timur> over get_user_pages(), because if the process dies, the
>      Timur> locks automatically go away too.
> 
>  There actually is another way pages can move, with both
>  get_user_pages() and mlock(): copy-on-write after a fork().  If
>  userspace does a fork(), then all PTEs are marked read-only, and if
>  the original process touches the page after the fork(), a new page
>  will be allocated and mapped at the original virtual address.

Do we care about that?  A straightforward scenario under which this can
happen is:

a) app starts some read I/O in an asynchronous manner
b) app forks
c) child writes to one of the pages which is still under read I/O
d) the read I/O completes
e) the child is left with the old data plus the child's modification instead
   of the new data

which is a very silly application which is giving itself unpredictable
memory contents anyway.

I assume there's a more sensible scenario?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 20:54                           ` Andrew Morton
@ 2005-04-25 21:12                             ` Roland Dreier
  2005-04-25 22:14                               ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-25 21:12 UTC (permalink / raw)
  To: Andrew Morton; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

    Andrew> Do we care about that?  A straightforward scenario under
    Andrew> which this can happen is:

    Andrew> a) app starts some read I/O in an asynchronous manner
    Andrew> b) app forks
    Andrew> c) child writes to one of the pages which is still under read I/O
    Andrew> d) the read I/O completes
    Andrew> e) the child is left with the old data plus the child's modification instead
    Andrew>    of the new data

    Andrew> which is a very silly application which is giving itself
    Andrew> unpredictable memory contents anyway.

    Andrew> I assume there's a more sensible scenario?

You're right, that is a silly scenario ;)  In fact if we mark vmas
with VM_DONTCOPY, then the child just crashes with a seg fault.

The type of thing I'm worried about is something like, for example:

a) app registers memory region with RDMA hardware -- in other words,
   loads the device's translation table for future I/O
b) app forks
c) app writes to the registered memory region, and the kernel breaks
   the COW for the (now read-only) page by mapping a new page
d) app starts an I/O that will do a DMA read from the region
e) device reads using the wrong, old mapping

This can be pretty insiduous because for example fork() + immediate
exec() or just using system() still leaves the parent with PTEs marked
read-only.  If an application does overlapping memory registrations so
get_user_pages() is called a lot, then as far as I can see
can_share_swap_page() will always return 0 and the COW will happen
even if the child process has thrown out its original vmas.

Or if the counts are in the correct range, then there's a small window
between fork() and exec() where the parent process can screw itself
up, so most of the time the app works, until it doesn't.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 21:12                             ` Roland Dreier
@ 2005-04-25 22:14                               ` Andrew Morton
  2005-04-25 22:21                                 ` Timur Tabi
                                                   ` (2 more replies)
  0 siblings, 3 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 22:14 UTC (permalink / raw)
  To: Roland Dreier; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

Roland Dreier <roland@topspin.com> wrote:
>
>     Andrew> Do we care about that?  A straightforward scenario under
>     Andrew> which this can happen is:
> 
>     Andrew> a) app starts some read I/O in an asynchronous manner
>     Andrew> b) app forks
>     Andrew> c) child writes to one of the pages which is still under read I/O
>     Andrew> d) the read I/O completes
>     Andrew> e) the child is left with the old data plus the child's modification instead
>     Andrew>    of the new data
> 
>     Andrew> which is a very silly application which is giving itself
>     Andrew> unpredictable memory contents anyway.
> 
>     Andrew> I assume there's a more sensible scenario?
> 
> You're right, that is a silly scenario ;)  In fact if we mark vmas
> with VM_DONTCOPY, then the child just crashes with a seg fault.
> 
> The type of thing I'm worried about is something like, for example:
> 
> a) app registers memory region with RDMA hardware -- in other words,
>    loads the device's translation table for future I/O

Whoa, hang on.

The way we expect get_user_pages() to be used is that the kernel will use
get_user_pages() once per application I/O request.

Are you saying that RDMA clients will semi-permanently own pages which were
pinned by get_user_pages()?  That those pages will be used for multiple
separate I/O operations?

If so, then that's a significant design departure and it would be good to
hear why it is necessary.

> b) app forks
> c) app writes to the registered memory region, and the kernel breaks
>    the COW for the (now read-only) page by mapping a new page
> d) app starts an I/O that will do a DMA read from the region
> e) device reads using the wrong, old mapping

Sure.  But such an app could be declared to be buggy...

> This can be pretty insiduous because for example fork() + immediate
> exec() or just using system() still leaves the parent with PTEs marked
> read-only.  If an application does overlapping memory registrations so
> get_user_pages() is called a lot, then as far as I can see
> can_share_swap_page() will always return 0 and the COW will happen
> even if the child process has thrown out its original vmas.
> 
> Or if the counts are in the correct range, then there's a small window
> between fork() and exec() where the parent process can screw itself
> up, so most of the time the app works, until it doesn't.
> 
>  - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:14                               ` Andrew Morton
@ 2005-04-25 22:21                                 ` Timur Tabi
  2005-04-25 22:32                                   ` Andrew Morton
  2005-04-25 22:23                                 ` Timur Tabi
  2005-04-26  0:02                                 ` Roland Dreier
  2 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-25 22:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Roland Dreier, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> The way we expect get_user_pages() to be used is that the kernel will use
> get_user_pages() once per application I/O request.
> 
> Are you saying that RDMA clients will semi-permanently own pages which were
> pinned by get_user_pages()?  That those pages will be used for multiple
> separate I/O operations?

Yes, absolutely!

The memory buffer is allocated by the process (usually just via malloc) and 
registed/pinned by the driver.  It then stays pinned for the life of the process (typically).

> If so, then that's a significant design departure and it would be good to
> hear why it is necessary.

That's just how RMDA works.  Once the memory is pinned, if the app wants to send data to 
another node, it does two things:

1) Puts the data into its buffer
2) Sends a "work request" to the driver with (among other things) the offset and length of 
the data.

This is a time-critical operation.  It must occurs as fast as possible, which means the 
memory must have already been pinned.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:14                               ` Andrew Morton
  2005-04-25 22:21                                 ` Timur Tabi
@ 2005-04-25 22:23                                 ` Timur Tabi
  2005-04-25 22:35                                   ` Andrew Morton
  2005-04-26  0:02                                 ` Roland Dreier
  2 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-25 22:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Roland Dreier, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> The way we expect get_user_pages() to be used is that the kernel will use
> get_user_pages() once per application I/O request.

Are you saying that the mapping obtained by get_user_pages() is valid only within the 
context of the IOCtl call?  That once the driver returns from the IOCtl, the mapping 
should no longer be used?

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:21                                 ` Timur Tabi
@ 2005-04-25 22:32                                   ` Andrew Morton
  2005-04-25 23:58                                     ` Roland Dreier
  0 siblings, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 22:32 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, hch, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Andrew Morton wrote:
> 
> > The way we expect get_user_pages() to be used is that the kernel will use
> > get_user_pages() once per application I/O request.
> > 
> > Are you saying that RDMA clients will semi-permanently own pages which were
> > pinned by get_user_pages()?  That those pages will be used for multiple
> > separate I/O operations?
> 
> Yes, absolutely!
> 
> The memory buffer is allocated by the process (usually just via malloc) and 
> registed/pinned by the driver.  It then stays pinned for the life of the process (typically).

ug.  What stops the memory from leaking if the process exits?

I hope this is a privileged operation?

> > If so, then that's a significant design departure and it would be good to
> > hear why it is necessary.
> 
> That's just how RMDA works.  Once the memory is pinned, if the app wants to send data to 
> another node, it does two things:
> 
> 1) Puts the data into its buffer
> 2) Sends a "work request" to the driver with (among other things) the offset and length of 
> the data.
> 
> This is a time-critical operation.  It must occurs as fast as possible, which means the 
> memory must have already been pinned.

It would be better to obtain this memory via a mmap() of some special
device node, so we can perform appropriate permission checking and clean
everything up on unclean application exit.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:23                                 ` Timur Tabi
@ 2005-04-25 22:35                                   ` Andrew Morton
  2005-04-25 22:42                                     ` Timur Tabi
                                                       ` (2 more replies)
  0 siblings, 3 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 22:35 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, hch, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Andrew Morton wrote:
> 
> > The way we expect get_user_pages() to be used is that the kernel will use
> > get_user_pages() once per application I/O request.
> 
> Are you saying that the mapping obtained by get_user_pages() is valid only within the 
> context of the IOCtl call?  That once the driver returns from the IOCtl, the mapping 
> should no longer be used?

Yes, we expect that all the pages which get_user_pages() pinned will become
unpinned within the context of the syscall which pinned the pages.  Or
shortly after, in the case of async I/O.

This is because there is no file descriptor or anything else associated
with the pages which permits the kernel to clean stuff up on unclean
application exit.  Also there are the obvious issues with permitting
pinning of unbounded amounts of memory.



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:35                                   ` Andrew Morton
@ 2005-04-25 22:42                                     ` Timur Tabi
  2005-04-25 23:13                                       ` Andrew Morton
  2005-04-25 22:51                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation Bob Woodruff
  2005-04-25 23:17                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
  2 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-25 22:42 UTC (permalink / raw)
  To: Andrew Morton; +Cc: roland, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> This is because there is no file descriptor or anything else associated
> with the pages which permits the kernel to clean stuff up on unclean
> application exit.  Also there are the obvious issues with permitting
> pinning of unbounded amounts of memory.

Then that might explain the "bug" that we're seeing with get_user_pages().  We've been 
assuming that get_user_pages() mappings are permanent.

Well, I was just about to re-implement get_user_pages() support in our driver to 
demonstrate the bug.  I guess I'll hold off on that.

If you look at the Infiniband code that was recently submitted, I think you'll see it does 
exactly that: after calling mlock(), the driver calls get_user_pages(), and it stores the 
page mappings for future use.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* RE: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation
  2005-04-25 22:35                                   ` Andrew Morton
  2005-04-25 22:42                                     ` Timur Tabi
@ 2005-04-25 22:51                                     ` Bob Woodruff
  2005-04-25 23:13                                       ` Timur Tabi
  2005-04-25 23:17                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
  2 siblings, 1 reply; 144+ messages in thread
From: Bob Woodruff @ 2005-04-25 22:51 UTC (permalink / raw)
  To: 'Andrew Morton', Timur Tabi, Davis, Arlin R
  Cc: hch, linux-kernel, openib-general

 Andrew Morton wrote,
>Yes, we expect that all the pages which get_user_pages() pinned will become
>unpinned within the context of the syscall which pinned the pages.  Or
>shortly after, in the case of async I/O.

>This is because there is no file descriptor or anything else associated
>with the pages which permits the kernel to clean stuff up on unclean
>application exit.  Also there are the obvious issues with permitting
>pinning of unbounded amounts of memory.

There definitely needs to be a mechanism to prevent people from pinning
too much memory. We saw issues in the sourceforge stack and some of the
vendors stacks where we could lock memory till the system hung. 
In the sourceforge InfiniBand stack, we put in a 
check to make sure that people did not pin too much memory. 
It was sort of a crude/bruit force mechanism, but effective. I think that we
limited people from locking down more that 1/2 of kernel memory or
70 % of all memory (it was tunable with a module option) and if they
exceeded
the limit, their requests to register memory would begin to fail. 
Arlin can provide details on how we did it or people can look at the 
IBAL code for an example. 

woody




^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation
  2005-04-25 22:51                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation Bob Woodruff
@ 2005-04-25 23:13                                       ` Timur Tabi
  2005-04-25 23:17                                         ` Andrew Morton
  2005-04-25 23:29                                         ` Bob Woodruff
  0 siblings, 2 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-25 23:13 UTC (permalink / raw)
  To: Bob Woodruff
  Cc: 'Andrew Morton',
	Davis, Arlin R, hch, linux-kernel, openib-general

Bob Woodruff wrote:

> There definitely needs to be a mechanism to prevent people from pinning
> too much memory. 

Any limit would have to be very high - definitely more than just half.  What if the 
application needs to pin 2GB?  The customer is not going to buy 4+ GB of RAM just because 
Linux doesn't like pinning more than half.  In an x86-32 system, that would required PAE 
support and slow everything down.

Off the top of my head, I'd say Linux would need to allow all but 512MB to be pinned.  So 
you have 3GB of RAM, Linux should allow you to pin 2.5GB.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:42                                     ` Timur Tabi
@ 2005-04-25 23:13                                       ` Andrew Morton
  2005-04-25 23:21                                         ` Timur Tabi
  2005-04-26  0:08                                         ` Roland Dreier
  0 siblings, 2 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 23:13 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, hch, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Andrew Morton wrote:
> 
> > This is because there is no file descriptor or anything else associated
> > with the pages which permits the kernel to clean stuff up on unclean
> > application exit.  Also there are the obvious issues with permitting
> > pinning of unbounded amounts of memory.
> 
> Then that might explain the "bug" that we're seeing with get_user_pages().  We've been 
> assuming that get_user_pages() mappings are permanent.

They are permanent until someone runs put_page() against all the pages. 
What I'm saying is that all current callers of get_user_pages() _do_ run
put_page() within the same syscall or upon I/O termination.

> Well, I was just about to re-implement get_user_pages() support in our driver to 
> demonstrate the bug.  I guess I'll hold off on that.
> 
> If you look at the Infiniband code that was recently submitted, I think you'll see it does 
> exactly that: after calling mlock(), the driver calls get_user_pages(), and it stores the 
> page mappings for future use.

Where?

bix:/usr/src/linux-2.6.12-rc3> grep -rl get_user_pages .
./arch/i386/lib/usercopy.c
./arch/sparc64/kernel/ptrace.c
./drivers/video/pvr2fb.c
./drivers/media/video/video-buf.c
./drivers/scsi/sg.c
./drivers/scsi/st.c
./include/asm-ia64/pgtable.h
./include/linux/mm.h
./include/asm-um/archparam-i386.h
./include/asm-i386/fixmap.h
./fs/nfs/direct.c
./fs/aio.c
./fs/binfmt_elf.c
./fs/bio.c
./fs/direct-io.c
./kernel/futex.c
./kernel/ptrace.c
./mm/memory.c
./mm/nommu.c
./mm/rmap.c
./mm/mempolicy.c

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:35                                   ` Andrew Morton
  2005-04-25 22:42                                     ` Timur Tabi
  2005-04-25 22:51                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation Bob Woodruff
@ 2005-04-25 23:17                                     ` Libor Michalek
  2005-04-25 23:24                                       ` Andrew Morton
  2 siblings, 1 reply; 144+ messages in thread
From: Libor Michalek @ 2005-04-25 23:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Timur Tabi, hch, linux-kernel, openib-general

On Mon, Apr 25, 2005 at 03:35:42PM -0700, Andrew Morton wrote:
> Timur Tabi <timur.tabi@ammasso.com> wrote:
> >
> > Andrew Morton wrote:
> > 
> > > The way we expect get_user_pages() to be used is that the kernel will use
> > > get_user_pages() once per application I/O request.
> > 
> > Are you saying that the mapping obtained by get_user_pages() is valid only within the 
> > context of the IOCtl call?  That once the driver returns from the IOCtl, the mapping 
> > should no longer be used?
> 
> Yes, we expect that all the pages which get_user_pages() pinned will become
> unpinned within the context of the syscall which pinned the pages.  Or
> shortly after, in the case of async I/O.

  When a network protocol is making use of async I/O the amount of time
between posting the read request and getting the completion for that
request is unbounded since it depends on the other half of the connection
sending some data. In this case the buffer that was pinned during the
io_submit() may be pinned, and holding the pages, for a long time. During
this time the process might fork, at this point any data received will be
placed into the wrong spot. 

> This is because there is no file descriptor or anything else associated
> with the pages which permits the kernel to clean stuff up on unclean
> application exit.  Also there are the obvious issues with permitting
> pinning of unbounded amounts of memory.

  Correct, the driver must be able to determine that the process has died
and clean up after it, so the pinned region in most implementations is
associated with an open file descriptor.

-Libor

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation
  2005-04-25 23:13                                       ` Timur Tabi
@ 2005-04-25 23:17                                         ` Andrew Morton
  2005-04-25 23:29                                         ` Bob Woodruff
  1 sibling, 0 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 23:17 UTC (permalink / raw)
  To: Timur Tabi
  Cc: robert.j.woodruff, arlin.r.davis, hch, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Bob Woodruff wrote:
> 
> > There definitely needs to be a mechanism to prevent people from pinning
> > too much memory. 
> 
> Any limit would have to be very high - definitely more than just half.  What if the 
> application needs to pin 2GB?  The customer is not going to buy 4+ GB of RAM just because 
> Linux doesn't like pinning more than half.  In an x86-32 system, that would required PAE 
> support and slow everything down.
> 
> Off the top of my head, I'd say Linux would need to allow all but 512MB to be pinned.  So 
> you have 3GB of RAM, Linux should allow you to pin 2.5GB.
> 

You can pin the whole darn lot *if you have the correct privileges*.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:13                                       ` Andrew Morton
@ 2005-04-25 23:21                                         ` Timur Tabi
  2005-04-25 23:27                                           ` Andrew Morton
  2005-04-26  0:08                                         ` Roland Dreier
  1 sibling, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-25 23:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: roland, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> They are permanent until someone runs put_page() against all the pages. 
> What I'm saying is that all current callers of get_user_pages() _do_ run
> put_page() within the same syscall or upon I/O termination.

Oh, okay then.  I guess I'll get back to work!

Actually, with RDMA, "I/O termination" technically doesn't happen until the memory is 
deregistered.  When the memory is registered, all that means is that it's should be pinned 
and the virtual-to-physical should be stored.  No actual I/O occurs at that point.

>>If you look at the Infiniband code that was recently submitted, I think you'll see it does 
>>exactly that: after calling mlock(), the driver calls get_user_pages(), and it stores the 
>>page mappings for future
 >
> Where?

I was talking about the code that Roland mentioned in the first message of this thread - 
the user-space verbs support.  He said the code calls mlock() and get_user_pages().

FYI, our driver detects the process termination and cleans up everything itself.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:17                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
@ 2005-04-25 23:24                                       ` Andrew Morton
  2005-04-25 23:37                                         ` Caitlin Bestler
  2005-04-26  3:55                                         ` Libor Michalek
  0 siblings, 2 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 23:24 UTC (permalink / raw)
  To: Libor Michalek; +Cc: timur.tabi, hch, linux-kernel, openib-general

Libor Michalek <libor@topspin.com> wrote:
>
> On Mon, Apr 25, 2005 at 03:35:42PM -0700, Andrew Morton wrote:
> > Timur Tabi <timur.tabi@ammasso.com> wrote:
> > >
> > > Andrew Morton wrote:
> > > 
> > > > The way we expect get_user_pages() to be used is that the kernel will use
> > > > get_user_pages() once per application I/O request.
> > > 
> > > Are you saying that the mapping obtained by get_user_pages() is valid only within the 
> > > context of the IOCtl call?  That once the driver returns from the IOCtl, the mapping 
> > > should no longer be used?
> > 
> > Yes, we expect that all the pages which get_user_pages() pinned will become
> > unpinned within the context of the syscall which pinned the pages.  Or
> > shortly after, in the case of async I/O.
> 
>   When a network protocol is making use of async I/O the amount of time
> between posting the read request and getting the completion for that
> request is unbounded since it depends on the other half of the connection
> sending some data. In this case the buffer that was pinned during the
> io_submit() may be pinned, and holding the pages, for a long time.

Sure.

> During
> this time the process might fork, at this point any data received will be
> placed into the wrong spot. 

Well the data is placed in _a_ spot.  That's only the "wrong" spot because
you've defined it to be wrong!

IOW: what behaviour are you actually looking for here, and why, and does it
matter?

> > This is because there is no file descriptor or anything else associated
> > with the pages which permits the kernel to clean stuff up on unclean
> > application exit.  Also there are the obvious issues with permitting
> > pinning of unbounded amounts of memory.
> 
>   Correct, the driver must be able to determine that the process has died
> and clean up after it, so the pinned region in most implementations is
> associated with an open file descriptor.

How is that association created?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:21                                         ` Timur Tabi
@ 2005-04-25 23:27                                           ` Andrew Morton
  0 siblings, 0 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-25 23:27 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, hch, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> FYI, our driver detects the process termination and cleans up everything itself.

How is this implemented?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* RE: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation
  2005-04-25 23:13                                       ` Timur Tabi
  2005-04-25 23:17                                         ` Andrew Morton
@ 2005-04-25 23:29                                         ` Bob Woodruff
  1 sibling, 0 replies; 144+ messages in thread
From: Bob Woodruff @ 2005-04-25 23:29 UTC (permalink / raw)
  To: 'Timur Tabi'
  Cc: 'Andrew Morton',
	Davis, Arlin R, hch, linux-kernel, openib-general

Timur Tabi wrote,
 
>Any limit would have to be very high - definitely more than just half.
What if the 
>application needs to pin 2GB?  The customer is not going to buy 4+ GB of
RAM just 
>because 
>Linux doesn't like pinning more than half.  In an x86-32 system, that would
required >PAE 
>support and slow everything down.

>Off the top of my head, I'd say Linux would need to allow all but 512MB to
be pinned.  >So 
>you have 3GB of RAM, Linux should allow you to pin 2.5GB.

That is why we made it tunable, so that people could decide how to allow.

There is probably a better way to do it than some hard limit, but 
that would take a little more understanding of the VM system than we had,
and that is why some of the core kernel folks maybe able to help us come up
with a better solution.

woody


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:24                                       ` Andrew Morton
@ 2005-04-25 23:37                                         ` Caitlin Bestler
  2005-04-26  0:10                                           ` Andrew Morton
  2005-04-26  3:55                                         ` Libor Michalek
  1 sibling, 1 reply; 144+ messages in thread
From: Caitlin Bestler @ 2005-04-25 23:37 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Libor Michalek, hch, linux-kernel, openib-general, timur.tabi

On 4/25/05, Andrew Morton <akpm@osdl.org> wrote:

> 
> > > This is because there is no file descriptor or anything else associated
> > > with the pages which permits the kernel to clean stuff up on unclean
> > > application exit.  Also there are the obvious issues with permitting
> > > pinning of unbounded amounts of memory.
> >
> >   Correct, the driver must be able to determine that the process has died
> > and clean up after it, so the pinned region in most implementations is
> > associated with an open file descriptor.
> 
> How is that association created?


There is not a file descrptor, but there is an rnic handle. Both DAPL
and IT-API require that process death will result in the handle and all
of its dependent objects being released.

The rnic handle can always be declared to be a "file descriptor" if
that makes it follow normal OS conventions more precisiely.

There is also a need for some form of resource manager to approve
creation of Memory Regions. Obviously you cannot have multiple
applications claiming half of physical memory.

But if you merely require the user to have root privileges in order
to create a Memory Region, and then take a first-come first-served
attitude, I don't think you end up with something that is truly a
general purpose capability.

A general purpose RDMA capability requires the ability to indefinitely
pin large portions of user memory. It makes sense to integrate that
with OS policy control over resource utilization and to integrate it with
memory suspend/resume capabilities so that hotplug memory can
be supported. What you can't do is downgrade a Memory Region so
that it is no longer a memory region. Doing that means that you are
not truly supporting RDMA.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:32                                   ` Andrew Morton
@ 2005-04-25 23:58                                     ` Roland Dreier
  2005-04-26  0:11                                       ` Andrew Morton
  2005-04-26  2:03                                       ` IWAMOTO Toshihiro
  0 siblings, 2 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-25 23:58 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Timur Tabi, hch, hozer, linux-kernel, openib-general

    Andrew> ug.  What stops the memory from leaking if the process
    Andrew> exits?

    Andrew> I hope this is a privileged operation?

I don't think it has to be privileged.  In my implementation, the
driver keeps a per-process list of registered memory regions and
unpins/cleans up on process exit.

    Andrew> It would be better to obtain this memory via a mmap() of
    Andrew> some special device node, so we can perform appropriate
    Andrew> permission checking and clean everything up on unclean
    Andrew> application exit.

This seems to interact poorly with how applications want to use RDMA,
ie typically through a library interface such as MPI.  People doing
HPC don't want to recode their apps to use a new allocator, they just
want to link to a new MPI library and have the app go fast.

 - R.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 22:14                               ` Andrew Morton
  2005-04-25 22:21                                 ` Timur Tabi
  2005-04-25 22:23                                 ` Timur Tabi
@ 2005-04-26  0:02                                 ` Roland Dreier
  2005-04-26  6:12                                   ` Christoph Hellwig
  2 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-26  0:02 UTC (permalink / raw)
  To: Andrew Morton; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

    Andrew> Whoa, hang on.

    Andrew> The way we expect get_user_pages() to be used is that the
    Andrew> kernel will use get_user_pages() once per application I/O
    Andrew> request.

    Andrew> Are you saying that RDMA clients will semi-permanently own
    Andrew> pages which were pinned by get_user_pages()?  That those
    Andrew> pages will be used for multiple separate I/O operations?

    Andrew> If so, then that's a significant design departure and it
    Andrew> would be good to hear why it is necessary.

The idea is that applications manage the lifetime of pinned memory
regions.  They can do things like post multiple I/O operations without
any page-walking overhead, or pass a buffer descriptor to a remote
host who will send data at some indeterminate time in the future.  In
addition, InfiniBand has the notion of atomic operations, so a cluster
application may be using some memory region to implement a global lock.

This might not be the most kernel-friendly design but it is pretty
deeply ingrained in the design of RDMA transports like InfiniBand and
iWARP (RDMA over IP).

I'm also not opposed to implementing some other mechanism to make this
work, but the combiniation of get_user_pages() in the kernel and
extending mprotect() to allow setting VM_DONTCOPY seems to work fine.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:13                                       ` Andrew Morton
  2005-04-25 23:21                                         ` Timur Tabi
@ 2005-04-26  0:08                                         ` Roland Dreier
  1 sibling, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-26  0:08 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Timur Tabi, hch, hozer, linux-kernel, openib-general

    Timur> If you look at the Infiniband code that was recently
    Timur> submitted, I think you'll see it does exactly that: after
    Timur> calling mlock(), the driver calls get_user_pages(), and it
    Timur> stores the page mappings for future use.

    Andrew> Where?

The code isn't merged yet.  I sent a version to lkml for review -- in
fact it was this very thread that we're in now.  The code in question
is in http://lkml.org/lkml/2005/4/4/266

This implements a "userspace verbs" character device that memory
registration goes through.  This means the kernel has a device node
that will be closed when a process dies, and so the memory can be
cleaned up.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:37                                         ` Caitlin Bestler
@ 2005-04-26  0:10                                           ` Andrew Morton
  0 siblings, 0 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-26  0:10 UTC (permalink / raw)
  To: Caitlin Bestler; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

Caitlin Bestler <caitlin.bestler@gmail.com> wrote:
>
> > 
> > > > This is because there is no file descriptor or anything else associated
> > > > with the pages which permits the kernel to clean stuff up on unclean
> > > > application exit.  Also there are the obvious issues with permitting
> > > > pinning of unbounded amounts of memory.
> > >
> > >   Correct, the driver must be able to determine that the process has died
> > > and clean up after it, so the pinned region in most implementations is
> > > associated with an open file descriptor.
> > 
> > How is that association created?
> 
> 
> There is not a file descrptor, but there is an rnic handle. Both DAPL
> and IT-API require that process death will result in the handle and all
> of its dependent objects being released.

What's an "rnic handle", in Linux terms?

> The rnic handle can always be declared to be a "file descriptor" if
> that makes it follow normal OS conventions more precisiely.

Does that mean that the code has not yet been implemented?

Yes, a Linux fd is appropriate.  But we don't have any sane way right now
of saying "you need to run put_page() against all these pages in the
->release() handler".  That'll need to be coded by yourselves.

> There is also a need for some form of resource manager to approve
> creation of Memory Regions. Obviously you cannot have multiple
> applications claiming half of physical memory.

The kernel already has considerable resource management capabilities. 
Please consider using/extending/generalising those before inventing
anything new.  RLIMIT_MEMLOCK would be a starting point.

> But if you merely require the user to have root privileges in order
> to create a Memory Region, and then take a first-come first-served
> attitude, I don't think you end up with something that is truly a
> general purpose capability.

We don't want code in the kernel which will permit hostile unprivileged
users to trivially cause the box to lock up.  RLIMIT_MEMLOCK and, if
necessary, CAP_IPC_LOCK sound appropriate here.

> A general purpose RDMA capability requires the ability to indefinitely
> pin large portions of user memory. It makes sense to integrate that
> with OS policy control over resource utilization and to integrate it with
> memory suspend/resume capabilities so that hotplug memory can
> be supported. What you can't do is downgrade a Memory Region so
> that it is no longer a memory region. Doing that means that you are
> not truly supporting RDMA.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:58                                     ` Roland Dreier
@ 2005-04-26  0:11                                       ` Andrew Morton
  2005-04-26  0:23                                         ` Roland Dreier
  2005-04-26  2:03                                       ` IWAMOTO Toshihiro
  1 sibling, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-26  0:11 UTC (permalink / raw)
  To: Roland Dreier; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

Roland Dreier <roland@topspin.com> wrote:
>
>     Andrew> ug.  What stops the memory from leaking if the process
>     Andrew> exits?
> 
>     Andrew> I hope this is a privileged operation?
> 
> I don't think it has to be privileged.  In my implementation, the
> driver keeps a per-process list of registered memory regions and
> unpins/cleans up on process exit.

How does the driver detect process exit?

>     Andrew> It would be better to obtain this memory via a mmap() of
>     Andrew> some special device node, so we can perform appropriate
>     Andrew> permission checking and clean everything up on unclean
>     Andrew> application exit.
> 
> This seems to interact poorly with how applications want to use RDMA,
> ie typically through a library interface such as MPI.  People doing
> HPC don't want to recode their apps to use a new allocator, they just
> want to link to a new MPI library and have the app go fast.

Fair enough.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  0:11                                       ` Andrew Morton
@ 2005-04-26  0:23                                         ` Roland Dreier
  2005-04-26  0:37                                           ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-26  0:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

    Andrew> How does the driver detect process exit?

I already answered earlier but just to be clear: registration goes
through a character device, and all regions are cleaned up in the
->release() of that device.

I don't currently have any code accounting against RLIMIT_MEMLOCK or
testing CAP_FOO, but I have no problem adding whatever is thought
appropriate.  Userspace also has control over the permissions and
owner/group of the /dev node.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  0:23                                         ` Roland Dreier
@ 2005-04-26  0:37                                           ` Andrew Morton
  2005-04-26  2:21                                             ` Timur Tabi
  2005-04-26 15:31                                             ` Roland Dreier
  0 siblings, 2 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-26  0:37 UTC (permalink / raw)
  To: Roland Dreier; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

Roland Dreier <roland@topspin.com> wrote:
>
>     Andrew> How does the driver detect process exit?
> 
> I already answered earlier but just to be clear: registration goes
> through a character device, and all regions are cleaned up in the
> ->release() of that device.

yup.

> I don't currently have any code accounting against RLIMIT_MEMLOCK or
> testing CAP_FOO, but I have no problem adding whatever is thought
> appropriate.  Userspace also has control over the permissions and
> owner/group of the /dev node.

I guess device node permissions won't be appropriate here, if only because
it sounds like everyone will go and set them to 0666.

RLIMIT_MEMLOCK sounds like the appropriate mechanism.  We cannot rely upon
userspace running mlock(), so perhaps it is appropriate to run sys_mlock()
in-kernel because that gives us the appropriate RLIMIT_MEMLOCK checking.

However an hostile app can just go and run munlock() and then allocate
some more pinned-by-get_user_pages() memory.

umm, how about we

- force the special pages into a separate vma

- run get_user_pages() against it all

- use RLIMIT_MEMLOCK accounting to check whether the user is allowed to
  do this thing

- undo the RMLIMIT_MEMLOCK accounting in ->release

This will all interact with user-initiated mlock/munlock in messy ways. 
Maybe a new kernel-internal vma->vm_flag which works like VM_LOCKED but is
unaffected by mlock/munlock activity is needed.

A bit of generalisation in do_mlock() should suit?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:58                                     ` Roland Dreier
  2005-04-26  0:11                                       ` Andrew Morton
@ 2005-04-26  2:03                                       ` IWAMOTO Toshihiro
  2005-04-26  2:16                                         ` Timur Tabi
  2005-04-26  2:26                                         ` [openib-general] " Stephen Langdon
  1 sibling, 2 replies; 144+ messages in thread
From: IWAMOTO Toshihiro @ 2005-04-26  2:03 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Andrew Morton, Timur Tabi, hch, hozer, linux-kernel, openib-general

At Mon, 25 Apr 2005 16:58:03 -0700,
Roland Dreier wrote:
>     Andrew> It would be better to obtain this memory via a mmap() of
>     Andrew> some special device node, so we can perform appropriate
>     Andrew> permission checking and clean everything up on unclean
>     Andrew> application exit.
> 
> This seems to interact poorly with how applications want to use RDMA,
> ie typically through a library interface such as MPI.  People doing
> HPC don't want to recode their apps to use a new allocator, they just
> want to link to a new MPI library and have the app go fast.

Such HPC users cannot use the memory hotremoval feature, and something
needs to be implemented so that the NUMA migration can handle such
memory properly, but I see your point.

If such memory were allocated by a driver, the memory could be placed
in non-hotremovable areas to avoid the above problems.

--
IWAMOTO Toshihiro

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  2:03                                       ` IWAMOTO Toshihiro
@ 2005-04-26  2:16                                         ` Timur Tabi
  2005-04-26  2:26                                         ` [openib-general] " Stephen Langdon
  1 sibling, 0 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-26  2:16 UTC (permalink / raw)
  To: IWAMOTO Toshihiro
  Cc: Roland Dreier, Andrew Morton, hch, hozer, linux-kernel, openib-general

IWAMOTO Toshihiro wrote:

> If such memory were allocated by a driver, the memory could be placed
> in non-hotremovable areas to avoid the above problems.

How can the driver allocated 3GB of pinned memory on a system with 3.5GB of RAM?  Can 
vmalloc() or get_free_pages() allocate that much memory?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  0:37                                           ` Andrew Morton
@ 2005-04-26  2:21                                             ` Timur Tabi
  2005-04-26  3:16                                               ` Andrew Morton
  2005-04-26 15:31                                             ` Roland Dreier
  1 sibling, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-26  2:21 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Roland Dreier, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> RLIMIT_MEMLOCK sounds like the appropriate mechanism.  We cannot rely upon
> userspace running mlock(), so perhaps it is appropriate to run sys_mlock()
> in-kernel because that gives us the appropriate RLIMIT_MEMLOCK checking.

I don't see what's wrong with relying on userspace to call mlock().  First all, all RDMA 
apps call a third-party API, like DAPL or MPI, to register memory.  The memory needs to be 
registered in order for the driver and adapter to know where it is.  During this 
registration, the memory is also pinned.  That's when we call mlock().

> 
> However an hostile app can just go and run munlock() and then allocate
> some more pinned-by-get_user_pages() memory.

Isn't mlock() on a per-process basis anyway?  How can one process call munlock() on 
another process' memory?

> umm, how about we
> 
> - force the special pages into a separate vma
> 
> - run get_user_pages() against it all
> 
> - use RLIMIT_MEMLOCK accounting to check whether the user is allowed to
>   do this thing
> 
> - undo the RMLIMIT_MEMLOCK accounting in ->release

Isn't this kinda what mlock() does already?  Create a new VMA and then VM_LOCK it?

> This will all interact with user-initiated mlock/munlock in messy ways. 
> Maybe a new kernel-internal vma->vm_flag which works like VM_LOCKED but is
> unaffected by mlock/munlock activity is needed.
> 
> A bit of generalisation in do_mlock() should suit?

Yes, but do_mlock() needs to prevent pages from being moved during memory hotswap.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  2:03                                       ` IWAMOTO Toshihiro
  2005-04-26  2:16                                         ` Timur Tabi
@ 2005-04-26  2:26                                         ` Stephen Langdon
  1 sibling, 0 replies; 144+ messages in thread
From: Stephen Langdon @ 2005-04-26  2:26 UTC (permalink / raw)
  To: IWAMOTO Toshihiro
  Cc: Roland Dreier, Andrew Morton, linux-kernel, openib-general, hch,
	Timur Tabi


[-- Attachment #1.1: Type: text/plain, Size: 8697 bytes --]

I don't think that we should jump to the conclusion that in the long 
term HPC users cannot benefit from support of mechanisms such as 
hotremoval of memory or other forms of page migration in physical 
memory.  In an earlier exchange on the openib-general list Mike Krause 
sent the message quoted below on very much the same topic.  On the other 
hand I am willing to accept that there is practical value to 
implementations which are not (yet) sophisticated to enough to support 
the migration functions.

Steve Langdon

> Michael Krause wrote: At 05:35 PM 3/14/2005, Caitlin Bestler wrote:
>
>>  
>>
>> > -----Original Message-----
>> > From: Troy Benjegerdes [ mailto:hozer@hozed.org]
>> > Sent: Monday, March 14, 2005 5:06 PM
>> > To: Caitlin Bestler
>> > Cc: openib-general@openib.org
>> > Subject: Re: [openib-general] Getting rid of pinned memory requirement
>> >
>> > >
>> > > The key is that the entire operation either has to be fast
>> > > enough so that no connection or application session layer
>> > > time-outs occur, or an end-to-end agreement to suspend the
>> > > connetion is a requirement. The first option seems more
>> > > plausible to me, the second essentially
>> > > reuqires extending the CM protocol. That's a tall order even for
>> > > InfiniBand, and it's even worse for iWARP where the CM
>> > > functionality typically ends when the connection is established.
>> > 
>> > I'll buy the good network design argument.
>
>
> I and others designed InfiniBand RNR (Receiver not ready) operations 
> to allow one to adjust V-to-P mappings (not change the address that 
> was advertised) in order to allow an OS to safely play some games with 
> memory and not drop a connection.  The time values associated with RNR 
> allow a solution to tolerate up to infinite amount of time to perform 
> such operations but the envisioned goal was to do this on the order of 
> a handful or milliseconds in the worse case.  For iWARP, there was no 
> support for defining RNR functionality as indeed many people claimed 
> one could just drop in-bound segments and allow the retransmission 
> protocol to deal with the delay (even if this has performance 
> implications due to back-off algorithms though some claim SACK would 
> minimize this to a large extent).  Again, the idea was to minimize the 
> worse case to milliseconds of down time.  BTW, all of this assumed 
> that the OS would not perform these types of changes that often so the 
> long-term impact on an application would be minimum.
>
>> >
>> > I suppose if the kernel wants to revoke a card's pinned
>> > memory, we should be able to guarantee that it gets new
>> > pinned memory within a bounded time. What sort of timing do
>> > we need? Milliseconds?
>> > Microseconds?
>> >
>> > In the case of iWarp, isn't this just TCP underneath? If so,
>> > can't we just drop any packets in the pipe on the floor and
>> > let them get retransmitted? (I suppose the same argument goes
>> > for infiniband..
>> > what sort of a time window do we have for retransmission?)
>> >
>> > What are the limits on end-to-end flow control in IB and iWarp?
>> >
>>
>> >From the RDMA Provider's perspective, the short answer is "quick 
>> enough so that I don't have to do anything heroic to keep the 
>> connection alive."
>
>
> It should not require anything heroic.  What is does require is a 
> local method to suspend the local QP(s) so that it cannot place or 
> read memory in the effected area.  That can take some time depending 
> upon the implementation.  There is then the time to over write the 
> mappings which again depending upon the implementation and the number 
> of mappings could be milliseconds in length.
>
>> With TCP you also have to add "and healthy". If you've ever had a 
>> long download that got effectively stalled by a burst of noise and 
>> you just hit the 'reload' button on your browser then you know what 
>> I'm talking about.
>>
>> But in transport neutral terms I would think that one RTT is 
>> definitely safe -- that much data could have
>> been dropped by one switch failure or one nasty spike in inbound noise.
>>
>> > >
>> > > Yes, there are limits on how much memory you can mlock, or even
>> > > allocate. Applications are required to reqister memory precisely
>> > > because the required guarantess are not there by default.
>> > Eliminating
>> > > those guarantees *is* effectively rewriting every RDMA application
>> > > without even letting them know.
>> >
>> > Some of this argument is a policy issue, which I would argue
>> > shouldn't be hard-coded in the code or in the network hardware.
>> >
>> > At least in my view, the guarantees are only there to make
>> > applications go fast. We are getting low latency and high
>> > performance with infiniband by making memory registration go
>> > really really slow. If, to make big HPC simulation
>> > applications work, we wind up doing memcpy() to put the data
>> > into a registered buffer because we can't register half of
>> > physical memory, the application isn't going very fast.
>> >
>>
>> What you are looking for is a distinction between registering
>> memory to *enable* the RNIC to optimize local access and
>> registering memory to enable its being advertised to the
>> remote end.
>>
>> Early implementations of RDMA, both IB and iWARP, have not
>> distinquished between the two. But theoretically *applications*
>> do not need memory regions that are not enabled for remote
>> access to be pinned. That is an RNIC requirement that could
>> evolve. But applications themselves *do* need remotely
>> accessible memory regions, portions of which they intend
>> to advertise with RKeys, to be truly available (i.e., pinned).
>>
>> You are also making a policy assumption that an application
>> that actually needs half of physical memory should be using
>> paged memory. Memory is cheap, and if performance is critical
>> why should this memory be swapped out to disk?
>>
>> Is the limitation on not being able to register half of
>> physical memory based upon some assumption that swapping
>> is a requirement? Or is it a limitation in the memory region
>> size? If it's the latter, you need to get the OS to support
>> larger page sizes.
>
>
> For some OS, you can pin very large areas.  I've seen 15/16 of memory 
> being able to be pinned with no adverse impacts on the applications.  
> For these OS, kernel memory is effectively pinned memory.  As such, 
> depending upon the mix of services being provided, the system may 
> operate quite nicely with such large amounts of memory being pinned.  
> As more services are "ported" to operate over RDMA technologies, 
> memory management isn't necessarily any harder; it just becomes 
> something people have to think more about.  Today's VM designs have 
> allowed people to get sloppy as they assume that swapping will occur 
> and since many platforms are not that loaded, they don't see any real 
> adverse impacts.  User-space RDMA applications requires people to 
> think once again about memory management and that swapping isn't a 
> get-out-of-jail card.  One needs to develop resource management tools 
> to determine who obtains specified amounts of resources and their 
> priorities.  For the most part, this is somewhat a re-invention of 
> some thinking that went into the micro-kernel work in past years.  
> These problems are not intractable; they are only constrained by the 
> legacy inertia inherent in all technologies today.
>
> Mike
>
>  
>



IWAMOTO Toshihiro wrote:

>At Mon, 25 Apr 2005 16:58:03 -0700,
>Roland Dreier wrote:
>  
>
>>    Andrew> It would be better to obtain this memory via a mmap() of
>>    Andrew> some special device node, so we can perform appropriate
>>    Andrew> permission checking and clean everything up on unclean
>>    Andrew> application exit.
>>
>>This seems to interact poorly with how applications want to use RDMA,
>>ie typically through a library interface such as MPI.  People doing
>>HPC don't want to recode their apps to use a new allocator, they just
>>want to link to a new MPI library and have the app go fast.
>>    
>>
>
>Such HPC users cannot use the memory hotremoval feature, and something
>needs to be implemented so that the NUMA migration can handle such
>memory properly, but I see your point.
>
>If such memory were allocated by a driver, the memory could be placed
>in non-hotremovable areas to avoid the above problems.
>
>--
>IWAMOTO Toshihiro
>_______________________________________________
>openib-general mailing list
>openib-general@openib.org
>http://openib.org/mailman/listinfo/openib-general
>
>To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>  
>


[-- Attachment #1.2: steve.langdon.vcf --]
[-- Type: text/x-vcard, Size: 348 bytes --]

begin:vcard
fn:Steve Langdon
n:Langdon;Stephen
org:Hewlett-Packard;Consulting & Architecture Group
adr:MS LKG1-3/B19;;550 King Street;Littleton;MA;01460;USA
email;internet:steve.langdon@hp.com
title:Fellow
tel;work:+1 978-506-5771
tel;fax:+1 978-742-1144
tel;home:+1 978-456-8177
tel;cell:+1 978-618-8599
x-mozilla-html:TRUE
version:2.1
end:vcard


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/x-pkcs7-signature, Size: 6189 bytes --]

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  2:21                                             ` Timur Tabi
@ 2005-04-26  3:16                                               ` Andrew Morton
  2005-04-26  3:38                                                 ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-26  3:16 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, hch, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Andrew Morton wrote:
> 
> > RLIMIT_MEMLOCK sounds like the appropriate mechanism.  We cannot rely upon
> > userspace running mlock(), so perhaps it is appropriate to run sys_mlock()
> > in-kernel because that gives us the appropriate RLIMIT_MEMLOCK checking.
> 
> I don't see what's wrong with relying on userspace to call mlock().  First all, all RDMA 
> apps call a third-party API, like DAPL or MPI, to register memory.  The memory needs to be 
> registered in order for the driver and adapter to know where it is.  During this 
> registration, the memory is also pinned.  That's when we call mlock().

All the above refers to well-behaved applications.

Now think about how the syscalls which you provide may be used by
applications which are *designed* to cripple or to compromise the machine.

> > 
> > However an hostile app can just go and run munlock() and then allocate
> > some more pinned-by-get_user_pages() memory.
> 
> Isn't mlock() on a per-process basis anyway?  How can one process call munlock() on 
> another process' memory?

I'm referring to an application which uses your syscalls to obtain pinned
memory and uses munlock() so that it may then use your syscalls to obtain
evem more pinned memory.  With the objective of taking the machine down.

> > umm, how about we
> > 
> > - force the special pages into a separate vma
> > 
> > - run get_user_pages() against it all
> > 
> > - use RLIMIT_MEMLOCK accounting to check whether the user is allowed to
> >   do this thing
> > 
> > - undo the RMLIMIT_MEMLOCK accounting in ->release
> 
> Isn't this kinda what mlock() does already?  Create a new VMA and then VM_LOCK it?

kinda.  But applications can undo the mlock which the kernel did.

> > This will all interact with user-initiated mlock/munlock in messy ways. 
> > Maybe a new kernel-internal vma->vm_flag which works like VM_LOCKED but is
> > unaffected by mlock/munlock activity is needed.
> > 
> > A bit of generalisation in do_mlock() should suit?
> 
> Yes, but do_mlock() needs to prevent pages from being moved during memory hotswap.

I haven't even thought about memory hotswap.  Surely it'll fail if the
pages are pinned by get_user_pages()?


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-13  1:04               ` [openib-general] " Libor Michalek
  2005-04-18 17:15                 ` Timur Tabi
@ 2005-04-26  3:31                 ` Libor Michalek
  2005-05-04 18:27                   ` Timur Tabi
  1 sibling, 1 reply; 144+ messages in thread
From: Libor Michalek @ 2005-04-26  3:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, openib-general

[-- Attachment #1: Type: text/plain, Size: 2775 bytes --]

On Tue, Apr 12, 2005 at 06:04:47PM -0700, Libor Michalek wrote:
> On Mon, Apr 11, 2005 at 05:13:47PM -0700, Andrew Morton wrote:
> > Roland Dreier <roland@topspin.com> wrote:
> > >
> > >     Troy> Do we even need the mlock in userspace then?
> > > 
> > > Yes, because the kernel may go through and unmap pages from userspace
> > > while trying to swap.  Since we have the page locked in the kernel,
> > > the physical page won't go anywhere, but userspace might end up with a
> > > different page mapped at the same virtual address.
> 
> With the last few kernels I haven't had a chance to retest the problem
> that pushed us in the direction of using mlock. I will go back and do
> so with the latest kernel. Below I've given a quick description of the
> issue.
> 
> > That shouldn't happen.  If get_user_pages() has elevated the refcount on a
> > page then the following can happen:
> > 
> > - The VM may decide to add the page to swapcache (if it's not mmapped
> >   from a file).
> > 
> > - Once the page is backed by either swapcache of a (mmapped) file, the VM
> >   may decide the unmap the application's pte's.  A later minor fault by the
> >   app will cause the same physical page to be remapped.
> 
> The driver did use get_user_pages() to elevated the refcount on all the
> pages it was going to use for IO, as well as call set_page_dirty() since
> the pages were going to have data written to them from the device.
> 
> The problem we were seeing is that the minor fault by the app resulted
> in a new physical page getting mapped for the application. The page that
> had the elevated refcount was still waiting for the data to be written
> to by the driver at the time that the app accessed the page causing the
> minor fault. Obviously since the app had a new mapping the data written
> by the driver was lost.
> 
> It looks like code was added to try_to_unmap_one() to address this, so
> hopefully it's no longer an issue...

  I wrote a quick test module and program to confirm that the problem
we saw in older kernels with get_user_pages() no longer exists. The
module creates a character device with three different ioctl commands:

  - Pin the pages of a buffer using get_user_pages()
  - Check the pages by calling get_user_pages() a second time and
    comparing the new and original page list.
  - Relase the pages using put_page()

  The program opens the charcter device file descriptor, pins the pages
and waits for a signal, before checking the pages, which is sent to the
process after running some other program which exercises the VM. On older
kernels the check fails, on my 2.6.11 kernel the check succeeds. So
mlock is not needed on top of get_user_pages() as it was before.

  Thanks for the heads up.

  Module and program attached.

-Libor

[-- Attachment #2: mltest.c --]
[-- Type: text/plain, Size: 7203 bytes --]

/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *	copyright notice, this list of conditions and the following
 *	disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *	copyright notice, this list of conditions and the following
 *	disclaimer in the documentation and/or other materials
 *	provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: $
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/mount.h>
#include <linux/cdev.h>
#include <linux/devfs_fs_kernel.h>

#include <asm/uaccess.h>
#include <asm/highmem.h>

	
MODULE_AUTHOR("Libor Michalek");
MODULE_DESCRIPTION("Get pages test");
MODULE_LICENSE("GPL");

enum {
	TEST_MAJOR = 232,
	TEST_MINOR = 255
};

#define TEST_DEV MKDEV(TEST_MAJOR, TEST_MINOR)

enum {
	TEST_CMD_REGISTER   = 1,
	TEST_CMD_UNREGISTER = 2,
	TEST_CMD_CHECK      = 3
};

struct ioctl_arg {
	__u64 addr;
	__u64 size;
};

struct region_root {
	struct semaphore mutex;
	struct list_head regions; /* list of pending events. */
	struct file *filp;
	int nr_region;
};

struct test_region {
	unsigned long user;
	unsigned long addr;
	unsigned long size;
	int  nr_pages;
	struct page **pages;
	struct region_root *root;
	struct list_head region_list; /* member in root region list */
};

static void test_unlock(struct test_region *region)
{
        long i;

	list_del(&region->region_list);

        for (i = 0; i < region->nr_pages; i++)
                put_page(region->pages[i]);

	printk(KERN_ERR "TEST: Unlocked address <%016lx>\n", region->user);

	kfree(region->pages);
	kfree(region);
}

static struct test_region *test_lookup(struct region_root *root,
				       unsigned long addr)
{
	struct test_region *region;

	list_for_each_entry(region, &root->regions, region_list)
		if (region->user == addr)
			return region;

	return NULL;
}

static int test_lock(struct region_root *root,
		     unsigned long uaddr,
		     unsigned long size)
{
	struct test_region *region;
	int nr_pages;
	int result;

	region = kmalloc(sizeof(*region), GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	region->user = uaddr;
	region->addr = uaddr & PAGE_MASK;
	region->size = PAGE_ALIGN(size + (uaddr & ~PAGE_MASK));
	region->root = root;

        nr_pages = (region->size + PAGE_SIZE-1) >> PAGE_SHIFT;

	region->pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
	if (!region->pages) {

		result = -ENOMEM;
		goto page_err;
	}

        region->nr_pages = get_user_pages(current, current->mm,
					  region->addr,
					  nr_pages,
					  1, 0, 
					  region->pages, NULL);
	if (region->nr_pages != nr_pages) {
		result = -EFAULT;
		goto get_err;
	}

	list_add_tail(&region->region_list, &root->regions);

	printk(KERN_ERR "TEST:   Locked address <%016lx>\n", region->user);

	return 0;
get_err:
	kfree(region->pages);
page_err:
	kfree(region);
	return result;
}

static int test_check(struct test_region *region)
{
	struct page **pages;
	int nr_pages;
	int result = 0;
	int i;

	pages = kmalloc(sizeof(struct page *) * region->nr_pages, GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

        nr_pages = get_user_pages(current, current->mm,
				  region->addr,
				  region->nr_pages,
				  1, 0, 
				  pages, NULL);
	if (region->nr_pages != nr_pages) {
		result = -EFAULT;
		goto get_err;
	}

	for (i = 0; i < nr_pages; i++) {

		if (region->pages[i] != pages[i])
			printk(KERN_ERR "TEST: Check error <%p:%p> "
			       "page <%u> of <%u>\n",
			       pages[i], region->pages[i], i, nr_pages);
		put_page(pages[i]);
	}

get_err:
	kfree(pages);
	return result;
}

static long test_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct region_root *root = filp->private_data;
	struct test_region *region;
	struct ioctl_arg    ureq;
	int result = 0;

	if (!root)
		return -EINVAL;

        if (copy_from_user(&ureq, (void __user *)arg, sizeof(ureq)))
                return -EFAULT;

	down(&root->mutex);

	switch (cmd) {
	case TEST_CMD_REGISTER:

		result = test_lock(root, ureq.addr, ureq.size);
		break;
	case TEST_CMD_UNREGISTER:

		region = test_lookup(root, ureq.addr);
		if (!region)
			result = -ENOENT;
		else
			test_unlock(region);

		break;
	case TEST_CMD_CHECK:

		region = test_lookup(root, ureq.addr);
		if (!region)
			result = -ENOENT;
		else
			result = test_check(region);

		break;
	default:
		result = -ERANGE;
		break;
	}

	up(&root->mutex);
	return result;
}

static int test_open(struct inode *inode, struct file *filp)
{
	struct region_root *root;

	root = kmalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return -ENOMEM;

	memset(root, 0, sizeof(*root));

	INIT_LIST_HEAD(&root->regions);
	init_MUTEX(&root->mutex);

	filp->private_data = root;
	root->filp = filp;

	printk(KERN_ERR "TEST: Created root struct\n");

	return 0;
}

static int test_close(struct inode *inode, struct file *filp)
{
	struct region_root *root = filp->private_data;
	struct test_region *region;

	down(&root->mutex);

	while (!list_empty(&root->regions)) {

		region = list_entry(root->regions.next,
				    struct test_region, region_list);
		test_unlock(region);
	}

	up(&root->mutex);

	kfree(root);

	filp->private_data = NULL;

	printk(KERN_ERR "TEST: Deleted root struct\n");
	return 0;
}

static struct file_operations test_fops = {
	.owner          = THIS_MODULE,
	.open 	        = test_open,
	.release        = test_close,
	.compat_ioctl   = test_ioctl,
	.unlocked_ioctl = test_ioctl,
};


static struct cdev test_cdev;

static int __init test_init(void)
{
	int result;

	result = register_chrdev_region(TEST_DEV, 1, "mltest");
	if (result) {
		printk(KERN_ERR "TEST: Error <%d> registering dev\n", result);
		goto err_chr;
	}

	cdev_init(&test_cdev, &test_fops);

	result = cdev_add(&test_cdev, TEST_DEV, 1);
	if (result) {
		printk(KERN_ERR "TEST: Error <%d> adding cdev\n", result);
		goto err_cdev;
	}

	return 0;
err_cdev:
	unregister_chrdev_region(TEST_DEV, 1);
err_chr:
	return result;
}

static void __exit test_cleanup(void)
{
	cdev_del(&test_cdev);
	unregister_chrdev_region(TEST_DEV, 1);
}

module_init(test_init);
module_exit(test_cleanup);

[-- Attachment #3: umlt.c --]
[-- Type: text/plain, Size: 3356 bytes --]

/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *	copyright notice, this list of conditions and the following
 *	disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *	copyright notice, this list of conditions and the following
 *	disclaimer in the documentation and/or other materials
 *	provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: $
 */

#include <stdlib.h>
#include <string.h>
#include <glob.h>
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <stdint.h>
#include <poll.h>
#include <unistd.h>
#include <signal.h>

#include <linux/types.h>

#define TEST_DEV_PATH "/dev/mltest"
#define TEST_SLEEP_UTIME 50000

struct ioctl_arg {
	__u64 addr;
	__u64 size;
};

enum {
	TEST_CMD_REGISTER   = 1,
	TEST_CMD_UNREGISTER = 2,
	TEST_CMD_CHECK      = 3
};

static int hold = 1;

void sig_usr(int value)
{
	hold = 0;
}

int main(int argc, char **argv)
{
	struct ioctl_arg req;
	void *data;
	int   param_c = 0;
	int   size;
	int   fd;
	int   result;

	if (2 != argc ||
	    0 > (size = atoi(argv[++param_c]))) { 
		
		fprintf(stderr, "usage: %s <size>\n", argv[0]);
		fprintf(stderr, "  size  - allocated region size in bytes.\n");
		
		exit(1);
	}
	signal(SIGUSR1, sig_usr);

	data = malloc(size);
	if (!data) {
		fprintf(stderr, "Failed to allocated region of size <%d>\n",
			size);
		exit(1);
	}
	
	fd = open(TEST_DEV_PATH, O_RDWR);
	if (fd < 0) {
		
		fprintf(stderr, "Error <%d:%d> opening device <%s>\n",
			fd, errno, TEST_DEV_PATH);
		goto open_err;
	}

	req.addr = (unsigned long)data;
	req.size = size;

	result = ioctl(fd, TEST_CMD_REGISTER, &req);
	if (result) {

		fprintf(stderr, "Error <%d:%d> registering region\n",
			result, errno);
		goto ioctl_err;
	}

	fprintf(stdout, 
		"Address <%016lx> registered, process <%d> waiting...\n",
		data, getpid());

	while (hold) {

		usleep(TEST_SLEEP_UTIME);
	}

	fprintf(stdout, "Process continuing, checking address <%016lx>\n",
		data);

	result = ioctl(fd, TEST_CMD_CHECK, &req);
	if (result) {

		fprintf(stderr, "Error <%d:%d> checking region\n",
			result, errno);
		goto ioctl_err;
	}

	result = ioctl(fd, TEST_CMD_UNREGISTER, &req);
	if (result) {

		fprintf(stderr, "Error <%d:%d> unregistering region\n", 
			result, errno);
		goto ioctl_err;
	}

ioctl_err:
	close(fd);
open_err:
	free(data);

	return 0;
}

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  3:16                                               ` Andrew Morton
@ 2005-04-26  3:38                                                 ` Timur Tabi
  2005-04-26  4:33                                                   ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-26  3:38 UTC (permalink / raw)
  To: Andrew Morton; +Cc: roland, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> I'm referring to an application which uses your syscalls to obtain pinned
> memory and uses munlock() so that it may then use your syscalls to obtain
> evem more pinned memory.  With the objective of taking the machine down.

I'm in favor of having drivers call do_mlock() and do_munlock() on behalf of the 
application.  All we need to do is export those functions, and my driver can call them. 
However, that still doesn't prevent an app from calling munlock().

But I don't understand the distinction between having the driver call do_mlock() vs. the 
application calling mlock().  Won't we still have the same problems?  A malicious app can 
just call our driver instead of calling mlock() or munlock(). The driver won't know the 
difference between an authorized app and an unauthorized one.

Besides, isn't the whole point behind RLIMIT_MEMLOCK to limit how much one process can lock?

> I haven't even thought about memory hotswap.  Surely it'll fail if the
> pages are pinned by get_user_pages()?

Any memory registered for RDMA devices obviously can't be swapped out.  Technically, the 
driver could detect that, and reject any attempt to transfer data to those regions until 
everything is remapped to other RAM.  But that's opening a whole new can of worms.  I 
don't know how the memory hotswap mechanism works, so I can't guess what recovery 
mechanisms can be implemented in the driver.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-25 23:24                                       ` Andrew Morton
  2005-04-25 23:37                                         ` Caitlin Bestler
@ 2005-04-26  3:55                                         ` Libor Michalek
  1 sibling, 0 replies; 144+ messages in thread
From: Libor Michalek @ 2005-04-26  3:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: timur.tabi, hch, linux-kernel, openib-general

On Mon, Apr 25, 2005 at 04:24:05PM -0700, Andrew Morton wrote:
> Libor Michalek <libor@topspin.com> wrote:
> > On Mon, Apr 25, 2005 at 03:35:42PM -0700, Andrew Morton wrote:
> >
> > > Yes, we expect that all the pages which get_user_pages() pinned 
> > > will become unpinned within the context of the syscall which pinned
> > > the pages.  Or shortly after, in the case of async I/O.
> > 
> >   When a network protocol is making use of async I/O the amount of time
> > between posting the read request and getting the completion for that
> > request is unbounded since it depends on the other half of the connection
> > sending some data. In this case the buffer that was pinned during the
> > io_submit() may be pinned, and holding the pages, for a long time.
> 
> Sure.
> 
> > During
> > this time the process might fork, at this point any data received will be
> > placed into the wrong spot. 
> 
> Well the data is placed in _a_ spot.  That's only the "wrong" spot because
> you've defined it to be wrong!
> 
> IOW: what behaviour are you actually looking for here, and why, and does it
> matter?

  For example a network server app has an open connection on which it
uses async IO to submit two buffers for a read operation. Both buffers
are pinned using get_user_pages() and the connection waits for data to
arrive. The connection received data, it is written into the first buffer,
the app is notified using async IO, and it retreives the async IO
completion. The app reads the buffer which happens to contain a command
to spawn a child, the app forks a child. Now there is still a buffer
posted for read and if more data arrives on the connection that data is
copied to the pages which were saved when the buffer was pinned. The app
is notified, retrieves the async IO completion, but when it goes to read
that buffer it will not have the new data.
  
> > > This is because there is no file descriptor or anything else associated
> > > with the pages which permits the kernel to clean stuff up on unclean
> > > application exit.  Also there are the obvious issues with permitting
> > > pinning of unbounded amounts of memory.
> > 
> >   Correct, the driver must be able to determine that the process has died
> > and clean up after it, so the pinned region in most implementations is
> > associated with an open file descriptor.
> 
> How is that association created?

  The kernel module which pinned the memory is responsible for unpinning
it if the file descriptor, which was used to deliver the command that
resulted in the pinning, is closed.

-Libor


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  3:38                                                 ` Timur Tabi
@ 2005-04-26  4:33                                                   ` Andrew Morton
  2005-04-26 14:07                                                     ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-26  4:33 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, hch, hozer, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Andrew Morton wrote:
> 
> > I'm referring to an application which uses your syscalls to obtain pinned
> > memory and uses munlock() so that it may then use your syscalls to obtain
> > evem more pinned memory.  With the objective of taking the machine down.
> 
> I'm in favor of having drivers call do_mlock() and do_munlock() on behalf of the 
> application.  All we need to do is export those functions, and my driver can call them. 
> However, that still doesn't prevent an app from calling munlock().

Precisely.  That's why I suggested that we have an alternative vma->vm_flag
bit which behaves in a similar manner to VM_LOCKED (say, VM_LOCKED_KERNEL),
only userspace cannot alter it.

> But I don't understand the distinction between having the driver call do_mlock() vs. the 
> application calling mlock().  Won't we still have the same problems?  A malicious app can 
> just call our driver instead of calling mlock() or munlock(). The driver won't know the 
> difference between an authorized app and an unauthorized one.

The driver will set VM_LOCKED_KERNEL, not VM_LOCKED.

> Besides, isn't the whole point behind RLIMIT_MEMLOCK to limit how much one process can lock?

Sure.  The internal setting of VM_LOCKED_KERNEL should still use
RLIMIT_MEMLOCK accounting.



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  0:02                                 ` Roland Dreier
@ 2005-04-26  6:12                                   ` Christoph Hellwig
  2005-04-26 13:45                                     ` [openib-general] " Caitlin Bestler
  2005-04-26 15:24                                     ` Timur Tabi
  0 siblings, 2 replies; 144+ messages in thread
From: Christoph Hellwig @ 2005-04-26  6:12 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Andrew Morton, timur.tabi, hch, hozer, linux-kernel, openib-general

On Mon, Apr 25, 2005 at 05:02:36PM -0700, Roland Dreier wrote:
> The idea is that applications manage the lifetime of pinned memory
> regions.  They can do things like post multiple I/O operations without
> any page-walking overhead, or pass a buffer descriptor to a remote
> host who will send data at some indeterminate time in the future.  In
> addition, InfiniBand has the notion of atomic operations, so a cluster
> application may be using some memory region to implement a global lock.
> 
> This might not be the most kernel-friendly design but it is pretty
> deeply ingrained in the design of RDMA transports like InfiniBand and
> iWARP (RDMA over IP).

Actuallky, no it isn't.   All these transports would work just fine with
the mmap a character device to hand out memory from the kernel approach
I told you to use multiple times and Andrew mentioned in this thread aswell.
What doesn't work with that design are the braindead designed by comittee
APIs in the RDMA world - but I don't think we should care about them too
much.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  6:12                                   ` Christoph Hellwig
@ 2005-04-26 13:45                                     ` Caitlin Bestler
  2005-04-26 15:24                                     ` Timur Tabi
  1 sibling, 0 replies; 144+ messages in thread
From: Caitlin Bestler @ 2005-04-26 13:45 UTC (permalink / raw)
  To: Christoph Hellwig, Roland Dreier, Andrew Morton, timur.tabi,
	hozer, linux-kernel, openib-general

On 4/25/05, Christoph Hellwig <hch@infradead.org> wrote:
> On Mon, Apr 25, 2005 at 05:02:36PM -0700, Roland Dreier wrote:
> > The idea is that applications manage the lifetime of pinned memory
> > regions.  They can do things like post multiple I/O operations without
> > any page-walking overhead, or pass a buffer descriptor to a remote
> > host who will send data at some indeterminate time in the future.  In
> > addition, InfiniBand has the notion of atomic operations, so a cluster
> > application may be using some memory region to implement a global lock.
> >
> > This might not be the most kernel-friendly design but it is pretty
> > deeply ingrained in the design of RDMA transports like InfiniBand and
> > iWARP (RDMA over IP).
> 
> Actuallky, no it isn't.   All these transports would work just fine with
> the mmap a character device to hand out memory from the kernel approach
> I told you to use multiple times and Andrew mentioned in this thread aswell.
> What doesn't work with that design are the braindead designed by comittee
> APIs in the RDMA world - but I don't think we should care about them too
> much.
> 


RDMA registers and uses the memory the user specifies. That is why byte
granularity and multiple redundant registrations are explicitly specified.

The mechanism by which this requirement is implemented is of course
OS dependent. But the requirements are that the application specifies
what portion of their memory they want registered (or what set of physical
pages if they have sufficient privilege) and that request is either honored
or refused by a resource manager (one preferably as integrated with
general OS resource management as possible).

The other aspect is that remotely enabled memory regions and memory
windows most be enabled for hardware access for the duration of 
the region or window -- indefinitely until process death or explicit
termination by the application layer.

Theoretically there is nothing in the wire protocols that requires source
buffers to be pinned indefinitely, but that is the only way any RDMA
interface has ever worked -- so "brain death" must be pretty widespread.

The fact that this problem must be solved for remotely accessible
buffers, and that for cluster applications like MPI there is no distinction
between buffers used for inbound messages and outbound messages,
might have something to do with this.

User verbs needs to deal with these actual Memory Registration requirements,
including the very real application need for Memory Windows. The solution
should map to existing OS controls as much as possible.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  4:33                                                   ` Andrew Morton
@ 2005-04-26 14:07                                                     ` Timur Tabi
  0 siblings, 0 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-26 14:07 UTC (permalink / raw)
  To: Andrew Morton; +Cc: roland, hch, hozer, linux-kernel, openib-general

Andrew Morton wrote:

> Precisely.  That's why I suggested that we have an alternative vma->vm_flag
> bit which behaves in a similar manner to VM_LOCKED (say, VM_LOCKED_KERNEL),
> only userspace cannot alter it.

How about calling it VM_PINNED?  That way, we can define

Locked - won't be swapped to disk, but can be moved around in memory
Pinned - won't be swapped to disk or moved around in memory


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  6:12                                   ` Christoph Hellwig
  2005-04-26 13:45                                     ` [openib-general] " Caitlin Bestler
@ 2005-04-26 15:24                                     ` Timur Tabi
  1 sibling, 0 replies; 144+ messages in thread
From: Timur Tabi @ 2005-04-26 15:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Roland Dreier, Andrew Morton, hozer, linux-kernel, openib-general

Christoph Hellwig wrote:

> What doesn't work with that design are the braindead designed by comittee
> APIs in the RDMA world - but I don't think we should care about them too
> much.

I think you should.  The whole point behind RDMA is that these APIs exist and are being 
used by real-world applications.  You can't just ignore them because they're inconvenient. 
  If you're not willing to cater to these API's needs, then you may as well tell all the 
RDMA developers to forgot about Linux and port everything to Windows instead.

The APIs are here to stay, and the whole point behind this thread is to discuss how Linux 
can support them.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  0:37                                           ` Andrew Morton
  2005-04-26  2:21                                             ` Timur Tabi
@ 2005-04-26 15:31                                             ` Roland Dreier
  2005-04-26 15:42                                               ` [openib-general] " Libor Michalek
  1 sibling, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-26 15:31 UTC (permalink / raw)
  To: Andrew Morton; +Cc: timur.tabi, hch, hozer, linux-kernel, openib-general

    Andrew> umm, how about we

    Andrew> - force the special pages into a separate vma

    Andrew> - run get_user_pages() against it all

    Andrew> - use RLIMIT_MEMLOCK accounting to check whether the user
    Andrew> is allowed to do this thing

    Andrew> - undo the RMLIMIT_MEMLOCK accounting in ->release

    Andrew> This will all interact with user-initiated mlock/munlock
    Andrew> in messy ways. Maybe a new kernel-internal vma->vm_flag
    Andrew> which works like VM_LOCKED but is unaffected by
    Andrew> mlock/munlock activity is needed.

    Andrew> A bit of generalisation in do_mlock() should suit?

Yes, it seems that modifying do_mlock() to something like

	int do_mlock(unsigned long start, size_t len,
		     unsigned int set, unsigned int clear)

and then exporting a function along the lines of

	int do_mem_pin(unsigned long start, size_t len, int on)

that sets/clears (VM_LOCKED_KERNEL | VM_DONTCOPY) according to the on
flag.

Seem reasonable?  If so I can code this up.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 15:31                                             ` Roland Dreier
@ 2005-04-26 15:42                                               ` Libor Michalek
  2005-04-26 15:49                                                 ` Roland Dreier
  0 siblings, 1 reply; 144+ messages in thread
From: Libor Michalek @ 2005-04-26 15:42 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Andrew Morton, hch, linux-kernel, openib-general, timur.tabi

On Tue, Apr 26, 2005 at 08:31:32AM -0700, Roland Dreier wrote:
>     Andrew> umm, how about we
> 
>     Andrew> - force the special pages into a separate vma
> 
>     Andrew> - run get_user_pages() against it all
> 
>     Andrew> - use RLIMIT_MEMLOCK accounting to check whether the user
>     Andrew> is allowed to do this thing
> 
>     Andrew> - undo the RMLIMIT_MEMLOCK accounting in ->release
> 
>     Andrew> This will all interact with user-initiated mlock/munlock
>     Andrew> in messy ways. Maybe a new kernel-internal vma->vm_flag
>     Andrew> which works like VM_LOCKED but is unaffected by
>     Andrew> mlock/munlock activity is needed.
> 
>     Andrew> A bit of generalisation in do_mlock() should suit?
> 
> Yes, it seems that modifying do_mlock() to something like
> 
> 	int do_mlock(unsigned long start, size_t len,
> 		     unsigned int set, unsigned int clear)
> 
> and then exporting a function along the lines of
> 
> 	int do_mem_pin(unsigned long start, size_t len, int on)
> 
> that sets/clears (VM_LOCKED_KERNEL | VM_DONTCOPY) according to the on
> flag.

  Do you mean that the set/clear parameters to do_mlock() are the
actual flags which are set/cleared by the caller? Also, the issue
remains that the flags are not reference counted which is a problem
if you are dealing with overlapping memory region, or even if one
region ends and another begins on the same page. Since the desire is
to be able to pin any memory that a user can malloc this is a likely
scenario.

-Libor

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 15:42                                               ` [openib-general] " Libor Michalek
@ 2005-04-26 15:49                                                 ` Roland Dreier
  2005-04-26 19:28                                                   ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-26 15:49 UTC (permalink / raw)
  To: Libor Michalek
  Cc: Andrew Morton, hch, linux-kernel, openib-general, timur.tabi

    Libor>   Do you mean that the set/clear parameters to do_mlock()
    Libor> are the actual flags which are set/cleared by the caller? 
    Libor> Also, the issue remains that the flags are not reference
    Libor> counted which is a problem if you are dealing with
    Libor> overlapping memory region, or even if one region ends and
    Libor> another begins on the same page. Since the desire is to be
    Libor> able to pin any memory that a user can malloc this is a
    Libor> likely scenario.

Good point... we need to figure out how to handle:

    a) app registers 0x0000 through 0x17ff
    b) app registers 0x1800 through 0x2fff
    c) app unregisters 0x0000 through 0x17ff
    d) the page at 0x1000 must stay pinned

hmm...

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 15:49                                                 ` Roland Dreier
@ 2005-04-26 19:28                                                   ` Andrew Morton
  2005-04-26 20:14                                                     ` Roland Dreier
  2005-04-27  3:15                                                     ` Caitlin Bestler
  0 siblings, 2 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-26 19:28 UTC (permalink / raw)
  To: Roland Dreier; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

Roland Dreier <roland@topspin.com> wrote:
>
>     Libor>   Do you mean that the set/clear parameters to do_mlock()
>     Libor> are the actual flags which are set/cleared by the caller? 
>     Libor> Also, the issue remains that the flags are not reference
>     Libor> counted which is a problem if you are dealing with
>     Libor> overlapping memory region, or even if one region ends and
>     Libor> another begins on the same page. Since the desire is to be
>     Libor> able to pin any memory that a user can malloc this is a
>     Libor> likely scenario.
> 
> Good point... we need to figure out how to handle:
> 
>     a) app registers 0x0000 through 0x17ff
>     b) app registers 0x1800 through 0x2fff
>     c) app unregisters 0x0000 through 0x17ff
>     d) the page at 0x1000 must stay pinned

The userspace library should be able to track the tree and the overlaps,
etc.  Things might become interesting when the memory is MAP_SHARED
pagecache and multiple independent processes are involved, although I guess
that'd work OK.

But afaict the problem wherein part of a page needs VM_DONTCOPY and the
other part does not cannot be solved.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 19:28                                                   ` Andrew Morton
@ 2005-04-26 20:14                                                     ` Roland Dreier
  2005-04-26 20:18                                                       ` Timur Tabi
  2005-04-26 20:32                                                       ` Andrew Morton
  2005-04-27  3:15                                                     ` Caitlin Bestler
  1 sibling, 2 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-26 20:14 UTC (permalink / raw)
  To: Andrew Morton; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

    Roland>     a) app registers 0x0000 through 0x17ff
    Roland>     b) app registers 0x1800 through 0x2fff
    Roland>     c) app unregisters 0x0000 through 0x17ff
    Roland>     d) the page at 0x1000 must stay pinned

    Andrew> The userspace library should be able to track the tree and
    Andrew> the overlaps, etc.  Things might become interesting when
    Andrew> the memory is MAP_SHARED pagecache and multiple
    Andrew> independent processes are involved, although I guess
    Andrew> that'd work OK.

I used to think I knew how to handle this, but in your scheme where
the kernel is doing accounting for pinned memory by marking vmas with
VM_KERNEL_LOCKED, at step c), I don't see why the kernel won't unlock
vmas covering 0x0000 through 0x1fff and credit 8K back to the
process's pinning count.

Sorry to be so dense but can you spell out what you think should
happen at steps a), b) and c) above?

    Andrew> But afaict the problem wherein part of a page needs
    Andrew> VM_DONTCOPY and the other part does not cannot be solved.

Yes, I agree.  If an app wants to register half a page and pass the
other half to a child process, I think the only answer is "don't do
that then."

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 20:14                                                     ` Roland Dreier
@ 2005-04-26 20:18                                                       ` Timur Tabi
  2005-04-26 20:37                                                         ` Andrew Morton
  2005-04-26 20:32                                                       ` Andrew Morton
  1 sibling, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-04-26 20:18 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Andrew Morton, libor, hch, linux-kernel, openib-general

Roland Dreier wrote:

> Yes, I agree.  If an app wants to register half a page and pass the
> other half to a child process, I think the only answer is "don't do
> that then."

How can the app know that, though?  It would have to allocate I/O buffers with knowledge 
of page boundaries.  Today, the apps just malloc() a bunch of memory and pay no attention 
to whether the beginning or the end of the buffer shares a page with some other, unrelated 
object.  We may as well tell the app that it needs to page-align all I/O buffers.

My point is that we can't just simply say, "Don't do that".  Some entity (the kernel, 
libraries, whatever) should be able to tell the app that its usage of memory is going to 
break in some unpredictable way.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 20:14                                                     ` Roland Dreier
  2005-04-26 20:18                                                       ` Timur Tabi
@ 2005-04-26 20:32                                                       ` Andrew Morton
  2005-04-26 21:23                                                         ` Roland Dreier
  1 sibling, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-26 20:32 UTC (permalink / raw)
  To: Roland Dreier; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

Roland Dreier <roland@topspin.com> wrote:
>
>     Roland>     a) app registers 0x0000 through 0x17ff
>     Roland>     b) app registers 0x1800 through 0x2fff
>     Roland>     c) app unregisters 0x0000 through 0x17ff
>     Roland>     d) the page at 0x1000 must stay pinned
> 
>     Andrew> The userspace library should be able to track the tree and
>     Andrew> the overlaps, etc.  Things might become interesting when
>     Andrew> the memory is MAP_SHARED pagecache and multiple
>     Andrew> independent processes are involved, although I guess
>     Andrew> that'd work OK.
> 
> I used to think I knew how to handle this, but in your scheme where
> the kernel is doing accounting for pinned memory by marking vmas with
> VM_KERNEL_LOCKED, at step c), I don't see why the kernel won't unlock
> vmas covering 0x0000 through 0x1fff and credit 8K back to the
> process's pinning count.
> 
> Sorry to be so dense but can you spell out what you think should
> happen at steps a), b) and c) above?

Well I was vaguely proposing that the userspace library keep track of the
byteranges and the underlying page states.  So in the above scenario
userspace would leave the page at 0x1000 registered until all
registrations against that page have been undone.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 20:18                                                       ` Timur Tabi
@ 2005-04-26 20:37                                                         ` Andrew Morton
  2005-04-29 14:26                                                           ` Bill Jordan
  0 siblings, 1 reply; 144+ messages in thread
From: Andrew Morton @ 2005-04-26 20:37 UTC (permalink / raw)
  To: Timur Tabi; +Cc: roland, libor, hch, linux-kernel, openib-general

Timur Tabi <timur.tabi@ammasso.com> wrote:
>
> Roland Dreier wrote:
> 
>  > Yes, I agree.  If an app wants to register half a page and pass the
>  > other half to a child process, I think the only answer is "don't do
>  > that then."
> 
>  How can the app know that, though?  It would have to allocate I/O buffers with knowledge 
>  of page boundaries.  Today, the apps just malloc() a bunch of memory and pay no attention 
>  to whether the beginning or the end of the buffer shares a page with some other, unrelated 
>  object.  We may as well tell the app that it needs to page-align all I/O buffers.
> 
>  My point is that we can't just simply say, "Don't do that".  Some entity (the kernel, 
>  libraries, whatever) should be able to tell the app that its usage of memory is going to 
>  break in some unpredictable way.

Our point is that contemporary microprocessors cannot electrically do what
you want them to do!

Now, conceeeeeeiveably the kernel could keep track of the state of the
pages down to the byte level, and could keep track of all COWed pages and
could look at faulting addresses at the byte level and could copy sub-page
ranges by hand from one process's address space into another process's
after I/O completion.  I don't think we want to do that.

Methinks your specification is busted.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 20:32                                                       ` Andrew Morton
@ 2005-04-26 21:23                                                         ` Roland Dreier
  2005-04-27  0:05                                                           ` Andrew Morton
  0 siblings, 1 reply; 144+ messages in thread
From: Roland Dreier @ 2005-04-26 21:23 UTC (permalink / raw)
  To: Andrew Morton; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

    Andrew> Well I was vaguely proposing that the userspace library
    Andrew> keep track of the byteranges and the underlying page
    Andrew> states.  So in the above scenario userspace would leave
    Andrew> the page at 0x1000 registered until all registrations
    Andrew> against that page have been undone.

OK, I already have code in userspace that keeps reference counts for
overlapping regions, etc.  However I'm not sure how to tie this in
with reliable accounting of pinned memory -- we don't want malicious
userspace code to be able fool the accounting, right?

So I'm still trying to puzzle out what to do.  I don't want to keep a
complicated data structure in the kernel keeping track of what memory
has been registered.  Right now, I just keep a list of structs, one
for each region, and when a process dies, I just go through region by
region and do a put_page() to balance off the get_user_pages().

However I don't see how to make it work if I put the reference
counting for overlapping regions in userspace but when I want mlock()
accounting in the kernel.  If a buggy/malicious app does:

    a) register from 0x0000 to 0x2fff
    b) register from 0x1000 to 0x1fff
    c) unregister from 0x0000 to 0x2fff

then it seems the kernel is screwed unless it counts how many times a
vma has been pinned.  And adding a pin_count member to vm_struct seems
like a pretty damn major step.

We definitely have to make sure that userspace is never able to either
unpin a page that is still registered with RDMA hardware, because that
can lead to DMA to into memory that someone else owns.  On the other
hand, we don't want userspace to be able to defeat resource accounting
by tricking the kernel into keeping page_count elevated after it
credits the memory back to a process's limit on locked pages.

The limit on the number of locked pages seems like a natural thing to
check against, but perhaps we need a different limit for the number of
pages pinned for use by RDMA hardware.  Sort of the same way that
there's a separate limit on the number of in-flight aios.

 - R.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 21:23                                                         ` Roland Dreier
@ 2005-04-27  0:05                                                           ` Andrew Morton
  2005-04-27  2:13                                                             ` Roland Dreier
  2005-04-27  3:21                                                             ` Caitlin Bestler
  0 siblings, 2 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-27  0:05 UTC (permalink / raw)
  To: Roland Dreier; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

Roland Dreier <roland@topspin.com> wrote:
>
>     Andrew> Well I was vaguely proposing that the userspace library
>     Andrew> keep track of the byteranges and the underlying page
>     Andrew> states.  So in the above scenario userspace would leave
>     Andrew> the page at 0x1000 registered until all registrations
>     Andrew> against that page have been undone.
> 
> OK, I already have code in userspace that keeps reference counts for
> overlapping regions, etc.  However I'm not sure how to tie this in
> with reliable accounting of pinned memory -- we don't want malicious
> userspace code to be able fool the accounting, right?
> 
> So I'm still trying to puzzle out what to do.  I don't want to keep a
> complicated data structure in the kernel keeping track of what memory
> has been registered.  Right now, I just keep a list of structs, one
> for each region, and when a process dies, I just go through region by
> region and do a put_page() to balance off the get_user_pages().
> 
> However I don't see how to make it work if I put the reference
> counting for overlapping regions in userspace but when I want mlock()
> accounting in the kernel.  If a buggy/malicious app does:
> 
>     a) register from 0x0000 to 0x2fff
>     b) register from 0x1000 to 0x1fff
>     c) unregister from 0x0000 to 0x2fff

As far as the kernel is concerned, step b) should be a no-op.  (The kernel
might choose to split the vma, but that's not significant).

> then it seems the kernel is screwed unless it counts how many times a
> vma has been pinned.  And adding a pin_count member to vm_struct seems
> like a pretty damn major step.
> 
> We definitely have to make sure that userspace is never able to either
> unpin a page that is still registered with RDMA hardware, because that
> can lead to DMA to into memory that someone else owns.  On the other
> hand, we don't want userspace to be able to defeat resource accounting
> by tricking the kernel into keeping page_count elevated after it
> credits the memory back to a process's limit on locked pages.

The kernel can simply register and unregister ranges for RDMA.  So
effectively a particular page is in either the registered or unregistered
state.  Kernel accounting counts the number of registered pages and
compares this with rlimits.

On top of all that, your userspace library needs to keep track of when
pages should really be registered and unregistered with the kernel.  Using
overlap logic and per-page refcounting or whatever.

No?

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-27  0:05                                                           ` Andrew Morton
@ 2005-04-27  2:13                                                             ` Roland Dreier
  2005-04-27  3:21                                                             ` Caitlin Bestler
  1 sibling, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-27  2:13 UTC (permalink / raw)
  To: Andrew Morton; +Cc: libor, hch, linux-kernel, openib-general, timur.tabi

    Andrew> The kernel can simply register and unregister ranges for
    Andrew> RDMA.  So effectively a particular page is in either the
    Andrew> registered or unregistered state.  Kernel accounting
    Andrew> counts the number of registered pages and compares this
    Andrew> with rlimits.

    Andrew> On top of all that, your userspace library needs to keep
    Andrew> track of when pages should really be registered and
    Andrew> unregistered with the kernel.  Using overlap logic and
    Andrew> per-page refcounting or whatever.

This is OK as long as userspace is trusted.  However I don't see how
this works when we don't trust userspace.  The problem is that for an
RDMA device (IB HCA or iWARP RNIC), a process can create many memory
regions, each of which a separate virtual to physical translation
map.  For example, an app can do:

    a) register 0x0000 through 0xffff and get memory handle 1
    b) register 0x0000 through 0xffff and get memory handle 2
    c) use memory handle 1 for communication with remote app A
    d) use memory handle 2 for communication with remote app B

Even though memory handles 1 and 2 both refer to exactly the same
memory, they may have different lifetimes, might be attached to
different connections, and so on.

Clearly the memory at 0x0000 must stay pinned as long as the RDMA
device thinks either memory handle 1 or memory handle 2 is valid.
Furthermore, the kernel must be the one keeping track of how many
regions refer to a given page because we can't allow userspace to be
able to tell a device to go DMA to memory it doesn't own any more.

Creation and destruction of these memory handles will always go
through the kernel driver, so this isn't so bad.  And get_user_pages()
is almost exactly what we need: it stacks perfectly, since it operates
on the page_count rather than just setting a bit in vm_flags.  The
main problem is that it doesn't check against RLIMIT_MEMLOCK.

The most reasonable thing to do would seem to be having the IB kernel
memory region code update current->mm->locked_vm and check it against
RLIMIT_MEMLOCK.  I guess it would be good to figure out an appropriate
abstraction to export rather than monkeying with current->mm directly.
We could also put this directly in get_user_pages(), but I'd be
worried about messing with current users.

I just don't see a way to make VM_KERNEL_LOCKED work.

It would also be nice to have a way for apps to set VM_DONTCOPY
appropriately.  Christoph's suggestion of extending mmap() and
mprotect() with PROT_DONTCOPY seems good to me, especially since it
means we don't have to export do_mlock() functionality to modules.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 19:28                                                   ` Andrew Morton
  2005-04-26 20:14                                                     ` Roland Dreier
@ 2005-04-27  3:15                                                     ` Caitlin Bestler
  1 sibling, 0 replies; 144+ messages in thread
From: Caitlin Bestler @ 2005-04-27  3:15 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Roland Dreier, hch, linux-kernel, openib-general, timur.tabi

On 4/26/05, Andrew Morton <akpm@osdl.org> wrote:
> Roland Dreier <roland@topspin.com> wrote:
> >
> >     Libor>   Do you mean that the set/clear parameters to do_mlock()
> >     Libor> are the actual flags which are set/cleared by the caller?
> >     Libor> Also, the issue remains that the flags are not reference
> >     Libor> counted which is a problem if you are dealing with
> >     Libor> overlapping memory region, or even if one region ends and
> >     Libor> another begins on the same page. Since the desire is to be
> >     Libor> able to pin any memory that a user can malloc this is a
> >     Libor> likely scenario.
> >
> > Good point... we need to figure out how to handle:
> >
> >     a) app registers 0x0000 through 0x17ff
> >     b) app registers 0x1800 through 0x2fff
> >     c) app unregisters 0x0000 through 0x17ff
> >     d) the page at 0x1000 must stay pinned
> 
> The userspace library should be able to track the tree and the overlaps,
> etc.  Things might become interesting when the memory is MAP_SHARED
> pagecache and multiple independent processes are involved, although I guess
> that'd work OK.
> 
> But afaict the problem wherein part of a page needs VM_DONTCOPY and the
> other part does not cannot be solved.
> 

Which portion of the userspace library? HCA-dependent code, or common code?

The HCA-dependent code would fail to count when the same memory was
registered to different HCAs (for example to the internal network device and
the external network device).

The vendor-independent code *could* do it, but only by maintaining a 
complete list of all registrations that had been issued but not cancelled.
That data would be redundant with data kept at the verb layer, and by
the kernel.

It *would' work, but maintaining highly redundant data at multiple layers
is something that I generally try to avoid.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-27  0:05                                                           ` Andrew Morton
  2005-04-27  2:13                                                             ` Roland Dreier
@ 2005-04-27  3:21                                                             ` Caitlin Bestler
  1 sibling, 0 replies; 144+ messages in thread
From: Caitlin Bestler @ 2005-04-27  3:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Roland Dreier, hch, linux-kernel, openib-general, timur.tabi

On 4/26/05, Andrew Morton <akpm@osdl.org> wrote:

> >
> > However I don't see how to make it work if I put the reference
> > counting for overlapping regions in userspace but when I want mlock()
> > accounting in the kernel.  If a buggy/malicious app does:
> >
> >     a) register from 0x0000 to 0x2fff
> >     b) register from 0x1000 to 0x1fff
> >     c) unregister from 0x0000 to 0x2fff
> 
> As far as the kernel is concerned, step b) should be a no-op.  (The kernel
> might choose to split the vma, but that's not significant).
> 

If "register" and "unregister" is meant in the RDMA sense then the above
sequence is totally reasonable. The b) registration could be for a different
protection domain that did not require access to all of the larger region.

Unless a full counting lock is available from the kernel, the responsibility
of the collective RDMA components would be to a) pin 0x0000 to 0x2fff,
b) nothing c) unpin 0x000 to 0x0fff and 0x2000 to 0x2fff

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-18 16:16       ` Arjan van de Ven
  2005-04-18 16:25         ` Timur Tabi
  2005-04-22 17:55         ` Timur Tabi
@ 2005-04-29  0:56         ` Andrew Morton
  2 siblings, 0 replies; 144+ messages in thread
From: Andrew Morton @ 2005-04-29  0:56 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: timur.tabi, roland, hozer, linux-kernel, openib-general

Arjan van de Ven <arjan@infradead.org> wrote:
>
> > Why do you call mlock() and get_user_pages()?  In our code, we only call mlock(), and the 
> > memory is pinned. 
> 
> this is a myth; linux is free to move the page about in physical memory
> even if it's mlock()ed!!

eh?  I guess the kernel _is_ free to move the page about, but it doesn't.

We might do at some time in the future for memory hotplug, I guess.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26 20:37                                                         ` Andrew Morton
@ 2005-04-29 14:26                                                           ` Bill Jordan
  2005-04-29 15:56                                                             ` Caitlin Bestler
  0 siblings, 1 reply; 144+ messages in thread
From: Bill Jordan @ 2005-04-29 14:26 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Timur Tabi, hch, linux-kernel, openib-general

On 4/26/05, Andrew Morton <akpm@osdl.org> wrote:

> Our point is that contemporary microprocessors cannot electrically do what
> you want them to do!
> 
> Now, conceeeeeeiveably the kernel could keep track of the state of the
> pages down to the byte level, and could keep track of all COWed pages and
> could look at faulting addresses at the byte level and could copy sub-page
> ranges by hand from one process's address space into another process's
> after I/O completion.  I don't think we want to do that.
> 
> Methinks your specification is busted.

I agree in principal. However, I expect this issue will come up with
more and more new specifications, and if it isn't addressed once in
the linux kernel, it will be kludged and broken many times in many
drivers.

I believe we need an kernel level interface that will pin user pages,
and lock the user vma in a single step. The interface should be used
by drivers when the hardware mappings are done. If the process is
split into a user operation to lock the memory, and a driver operation
to map the hardware, there will always be opportunity for abuse.

Reference counting needs to be done by this interface to allow
different hardware to interoperate.

The interface can't overload the VM_LOCKED flag, or rely on any other
attributes that the user can tinker with via any other interface.

And as much as I hate to admit it, I think on a fork, we will need to
copy parts of pages at the beginning or end of user I/O buffers.

-- 
Bill Jordan
InfiniCon Systems

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-29 14:26                                                           ` Bill Jordan
@ 2005-04-29 15:56                                                             ` Caitlin Bestler
  2005-04-29 16:45                                                               ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Roland Dreier
  2005-04-29 17:04                                                               ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
  0 siblings, 2 replies; 144+ messages in thread
From: Caitlin Bestler @ 2005-04-29 15:56 UTC (permalink / raw)
  To: Bill Jordan; +Cc: Andrew Morton, hch, linux-kernel, openib-general, Timur Tabi

On 4/29/05, Bill Jordan <woodennickel@gmail.com> wrote:
> On 4/26/05, Andrew Morton <akpm@osdl.org> wrote:
> 
> > Our point is that contemporary microprocessors cannot electrically do what
> > you want them to do!
> >
> > Now, conceeeeeeiveably the kernel could keep track of the state of the
> > pages down to the byte level, and could keep track of all COWed pages and
> > could look at faulting addresses at the byte level and could copy sub-page
> > ranges by hand from one process's address space into another process's
> > after I/O completion.  I don't think we want to do that.
> >
> > Methinks your specification is busted.
> 
> I agree in principal. However, I expect this issue will come up with
> more and more new specifications, and if it isn't addressed once in
> the linux kernel, it will be kludged and broken many times in many
> drivers.
> 
> I believe we need an kernel level interface that will pin user pages,
> and lock the user vma in a single step. The interface should be used
> by drivers when the hardware mappings are done. If the process is
> split into a user operation to lock the memory, and a driver operation
> to map the hardware, there will always be opportunity for abuse.
> 
> Reference counting needs to be done by this interface to allow
> different hardware to interoperate.
> 
> The interface can't overload the VM_LOCKED flag, or rely on any other
> attributes that the user can tinker with via any other interface.
> 
> And as much as I hate to admit it, I think on a fork, we will need to
> copy parts of pages at the beginning or end of user I/O buffers.
> 

I agree with all but the last part, in my opinion there is no need to deal
with fork issues as long as solutions do not result in failures. There is
*no* basis for a child process to expect that it will inherit RDMA resources.
A child process that uses such resources will get undefined results, nothing
further needs to be stated, and no heroic efforts are required to avoid them.

What is definitely needed is kernel counting of locks on user pages.
Finer granularity is not expected, it is the RDMA hardware that works
at finer granularity. All it needs is to know what bus address a given
virtual page maps to -- and it needs to know that said mapping will
not change without advance notice.

Further, any revocation of an existing mapping (to deal with hot page
swapping or whatever) cannot expect the RDMA hardware to respond
any faster than it would to invalidating a memory region.

The RDMA hardware has an inherent need to cache translations.
That is why it cannot guarantee that it will cease updating a memory
region the nanosecond that a request is made to invalidate an STag.
Instead it is allowed to block on such a request and only guarantees
to have ceased access when the invalidate request completes.

The same need for a delay exists for any interface that moves memory
around, or requests to reclaim memory from the application.

This also applies on process death. The hardware cannot stop on a dime.
The best it can do is stop promptly, and given an unambiguous indication
to the OS as to when it has stopped.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation)
  2005-04-29 15:56                                                             ` Caitlin Bestler
@ 2005-04-29 16:45                                                               ` Roland Dreier
  2005-04-29 17:23                                                                 ` Libor Michalek
                                                                                   ` (2 more replies)
  2005-04-29 17:04                                                               ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
  1 sibling, 3 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-29 16:45 UTC (permalink / raw)
  To: Caitlin Bestler
  Cc: Bill Jordan, Andrew Morton, hch, linux-kernel, openib-general,
	Timur Tabi

Is there anything wrong with the following plan?

1) For memory registration, use get_user_pages() in the kernel.  Use
   locked_vm and RLIMIT_MEMLOCK to limit the amount of memory pinned
   by a given process.  One disadvantage of this is that the
   accounting will overestimate the amount of pinned memory if a
   process pins the same page twice, but this doesn't seem that bad to
   me -- it errs on the side of safety.

2) For fork() support:

   a) Extend mprotect() with PROT_DONTCOPY so processes can avoid
      copy-on-write problems.

   b) (maybe someday?) Add a VM_ALWAYSCOPY flag and extend mprotect()
      with PROT_ALWAYSCOPY so processes can mark pages to be
      pre-copied into child processes, to handle the case where only
      half a page is registered.

I believe this puts the code that must be trusted into the kernel and
gives userspace primitives that let apps handle the rest.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-29 15:56                                                             ` Caitlin Bestler
  2005-04-29 16:45                                                               ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Roland Dreier
@ 2005-04-29 17:04                                                               ` Libor Michalek
  2005-04-30  0:31                                                                 ` Caitlin Bestler
  1 sibling, 1 reply; 144+ messages in thread
From: Libor Michalek @ 2005-04-29 17:04 UTC (permalink / raw)
  To: Caitlin Bestler
  Cc: Bill Jordan, Andrew Morton, hch, linux-kernel, openib-general,
	Timur Tabi

On Fri, Apr 29, 2005 at 08:56:20AM -0700, Caitlin Bestler wrote:
> On 4/29/05, Bill Jordan <woodennickel@gmail.com> wrote:
> > On 4/26/05, Andrew Morton <akpm@osdl.org> wrote:
> > 
> > > Our point is that contemporary microprocessors cannot electrically
> > > do what you want them to do!
> > >
> > > Now, conceeeeeeiveably the kernel could keep track of the state of the
> > > pages down to the byte level, and could keep track of all COWed pages and
> > > could look at faulting addresses at the byte level and could copy sub-page
> > > ranges by hand from one process's address space into another process's
> > > after I/O completion.  I don't think we want to do that.
> > >
> > > Methinks your specification is busted.
> > 
> > I agree in principal. However, I expect this issue will come up with
> > more and more new specifications, and if it isn't addressed once in
> > the linux kernel, it will be kludged and broken many times in many
> > drivers.
> > 
> > I believe we need an kernel level interface that will pin user pages,
> > and lock the user vma in a single step. The interface should be used
> > by drivers when the hardware mappings are done. If the process is
> > split into a user operation to lock the memory, and a driver operation
> > to map the hardware, there will always be opportunity for abuse.
> > 
> > Reference counting needs to be done by this interface to allow
> > different hardware to interoperate.
> > 
> > The interface can't overload the VM_LOCKED flag, or rely on any other
> > attributes that the user can tinker with via any other interface.
> > 
> > And as much as I hate to admit it, I think on a fork, we will need to
> > copy parts of pages at the beginning or end of user I/O buffers.
> > 
> 
> I agree with all but the last part, in my opinion there is no need to deal
> with fork issues as long as solutions do not result in failures. There is
> *no* basis for a child process to expect that it will inherit RDMA resources.
> A child process that uses such resources will get undefined results, nothing
> further needs to be stated, and no heroic efforts are required to avoid them.

  However, you have a potential problem with registered buffers that
do not begin or end on a page boundary, which is common with malloc.
If the buffer resides on a portion of a page, and you mark the vm
which contains that entire page VM_DONTCOPY, to ensure that the parent
has access to the exact physical page after the fork, the child will
not be able to access anything on that entire page. So if the child
expects to access data on the same page that happens to contain the
registered buffer it will get a segment violation.

The four situations we've discussed are:

  1) Physical page does not get used for anything else.
  2) Processes virtual to physical mapping remains fixed.
  3) Same virtual to physical mapping after forking a child.
  4) Forked child has access to all non-registered memory of
     the parent.

The first two are now taken care of with get_user_pages, (we use to
use VM_LOCKED for the second case) third case is handled by setting
the vm to VM_DONTCOPY, and on the fourth case we've always punted,
but the real answer is to break partial pages into seperate vms and
mark them ALWAYS_COPY.

-Libor



^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation)
  2005-04-29 16:45                                                               ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Roland Dreier
@ 2005-04-29 17:23                                                                 ` Libor Michalek
  2005-04-29 18:22                                                                 ` RDMA memory registration Brice Goglin
  2005-04-29 19:43                                                                 ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Bill Jordan
  2 siblings, 0 replies; 144+ messages in thread
From: Libor Michalek @ 2005-04-29 17:23 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Caitlin Bestler, Andrew Morton, linux-kernel, openib-general,
	hch, Timur Tabi

On Fri, Apr 29, 2005 at 09:45:50AM -0700, Roland Dreier wrote:
> Is there anything wrong with the following plan?
> 
> 1) For memory registration, use get_user_pages() in the kernel.  Use
>    locked_vm and RLIMIT_MEMLOCK to limit the amount of memory pinned
>    by a given process.  One disadvantage of this is that the
>    accounting will overestimate the amount of pinned memory if a
>    process pins the same page twice, but this doesn't seem that bad to
>    me -- it errs on the side of safety.

  I think the overestimate will be fine in practice. If a process is
locking a lot of memory it will most likely be in big chunks, so not
much page overlap there. If the process is locking lots of tiny buffers
with lots of page overlap, the total locked amount will most likely be
small. Although it is odd that you could end up with a total locked
amount larger then the number of physical pages in the system...

> 2) For fork() support:
> 
>    a) Extend mprotect() with PROT_DONTCOPY so processes can avoid
>       copy-on-write problems.
> 
>    b) (maybe someday?) Add a VM_ALWAYSCOPY flag and extend mprotect()
>       with PROT_ALWAYSCOPY so processes can mark pages to be
>       pre-copied into child processes, to handle the case where only
>       half a page is registered.
> 
> I believe this puts the code that must be trusted into the kernel and
> gives userspace primitives that let apps handle the rest.

  I'm assuming that for libibverbs memory registration you plan on hiding
the mprotect in the library? Without reference counting at the kernel
level this could yield unexpected results in a perfectly legitimate app.

  For example if the app is managing a buffer it will pass to another
device, but also want's to move data in/out with RDMA hardware, the user
marks it themselves with DONTCOPY, registers with libibverbs, performs
IO, unregisters with libibverbs. At this point the user expects the buffer
to have DONTCOPY set, but it does not because of the unregister... Not
that it's likely, but it's a valid thing to do. However, since I don't
have a better suggestion, I'm in favour of using mprotect as you outlined. 


-Libor

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: RDMA memory registration
  2005-04-29 16:45                                                               ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Roland Dreier
  2005-04-29 17:23                                                                 ` Libor Michalek
@ 2005-04-29 18:22                                                                 ` Brice Goglin
  2005-04-29 18:31                                                                   ` Roland Dreier
  2005-04-29 19:33                                                                   ` [openib-general] " Grant Grundler
  2005-04-29 19:43                                                                 ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Bill Jordan
  2 siblings, 2 replies; 144+ messages in thread
From: Brice Goglin @ 2005-04-29 18:22 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Caitlin Bestler, Bill Jordan, Andrew Morton, hch, linux-kernel,
	openib-general, David Addison

Roland Dreier a écrit :
> 2) For fork() support:
> 
>    a) Extend mprotect() with PROT_DONTCOPY so processes can avoid
>       copy-on-write problems.
> 
>    b) (maybe someday?) Add a VM_ALWAYSCOPY flag and extend mprotect()
>       with PROT_ALWAYSCOPY so processes can mark pages to be
>       pre-copied into child processes, to handle the case where only
>       half a page is registered.
> 
> I believe this puts the code that must be trusted into the kernel and
> gives userspace primitives that let apps handle the rest.

Do you plan to work with David Addison from Quadrics ?
For sure, your hardware have very different capabilities.
But ioproc_ops is a really nice solution and might help a lot
when dealing with deregistration and fork.

For instance, instead of adding PROT_DONT/ALWAYSCOPY, you may use
an ioproc hook in the fork path. This hook (a function in your driver)
would be called for each registered page. It will decide whether
the page should be pre-copied or not and update the registration
table (or whatever stores address translations in the NIC).
In addition, the driver would probably pre-copy cow pages when
registering them.

It's nice to see these two works coming to LKML at the same time.
It would be great if we could merge them and get a generic solution
that's suitable to both registration based cards (IB/Myri/Ammasso)
and MMU-based cards (Quadrics).

Brice

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: RDMA memory registration
  2005-04-29 18:22                                                                 ` RDMA memory registration Brice Goglin
@ 2005-04-29 18:31                                                                   ` Roland Dreier
  2005-04-29 19:33                                                                   ` [openib-general] " Grant Grundler
  1 sibling, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-29 18:31 UTC (permalink / raw)
  To: Brice Goglin
  Cc: Caitlin Bestler, Bill Jordan, Andrew Morton, hch, linux-kernel,
	openib-general, David Addison

    Brice> Do you plan to work with David Addison from Quadrics ?  For
    Brice> sure, your hardware have very different capabilities.  But
    Brice> ioproc_ops is a really nice solution and might help a lot
    Brice> when dealing with deregistration and fork.

I'm following the discussion with interest.  Some hardware (eg
Mellanox HCAs) has the ability to use these hooks to avoid pinning
pages at all, but in general IB and iWARP need to pin pages so the
mapping doesn't change.

    Brice> For instance, instead of adding PROT_DONT/ALWAYSCOPY, you
    Brice> may use an ioproc hook in the fork path. This hook (a
    Brice> function in your driver) would be called for each
    Brice> registered page. It will decide whether the page should be
    Brice> pre-copied or not and update the registration table (or
    Brice> whatever stores address translations in the NIC).  In
    Brice> addition, the driver would probably pre-copy cow pages when
    Brice> registering them.

This sort of monkeying around with the VM from driver code seems much
more complicated than letting userspace handle it.

 - R.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: RDMA memory registration
  2005-04-29 18:22                                                                 ` RDMA memory registration Brice Goglin
  2005-04-29 18:31                                                                   ` Roland Dreier
@ 2005-04-29 19:33                                                                   ` Grant Grundler
  2005-05-03  8:42                                                                     ` David Addison
  1 sibling, 1 reply; 144+ messages in thread
From: Grant Grundler @ 2005-04-29 19:33 UTC (permalink / raw)
  To: Brice Goglin
  Cc: Roland Dreier, Andrew Morton, linux-kernel, openib-general, hch,
	Caitlin Bestler

On Fri, Apr 29, 2005 at 08:22:24PM +0200, Brice Goglin wrote:
> For instance, instead of adding PROT_DONT/ALWAYSCOPY, you may use
> an ioproc hook in the fork path. This hook (a function in your driver)
> would be called for each registered page. It will decide whether
> the page should be pre-copied or not and update the registration
> table (or whatever stores address translations in the NIC).
> In addition, the driver would probably pre-copy cow pages when
> registering them.

This doesn't scale well as more cards are added to the box.
I think I understand why it's good for single cards though.

> It's nice to see these two works coming to LKML at the same time.
> It would be great if we could merge them and get a generic solution
> that's suitable to both registration based cards (IB/Myri/Ammasso)
> and MMU-based cards (Quadrics).

Aren't the mellanox mem-free cards more or less MMU's as well?
I had that impression after attending Dror Goldberg's talk
though I don't think he asserted that.
Openib.org developers conf (Feb 2005) slideset is here:
	http://www.openib.org/docs/oib_wkshp_022005/memfree-hca-mellanox-dgoldenberg.pdf

Being mostly clueless about Quadrics implementation, I'm probably
missing something that makes Quadrics a MMU but not the IB variants.
Can someone clue me in please?

thanks,
grant

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation)
  2005-04-29 16:45                                                               ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Roland Dreier
  2005-04-29 17:23                                                                 ` Libor Michalek
  2005-04-29 18:22                                                                 ` RDMA memory registration Brice Goglin
@ 2005-04-29 19:43                                                                 ` Bill Jordan
  2005-04-29 19:45                                                                   ` RDMA memory registration Roland Dreier
  2 siblings, 1 reply; 144+ messages in thread
From: Bill Jordan @ 2005-04-29 19:43 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Caitlin Bestler, Andrew Morton, hch, linux-kernel,
	openib-general, Timur Tabi

On 4/29/05, Roland Dreier <roland@topspin.com> wrote:
>   b) (maybe someday?) Add a VM_ALWAYSCOPY flag and extend mprotect()
>      with PROT_ALWAYSCOPY so processes can mark pages to be
>      pre-copied into child processes, to handle the case where only
>      half a page is registered.

Are you suggesting making the partial pages their own VMA, or marking
the entire buffer with this flag? I originally thought the entire
buffer should be copy on fork (instead of copy on write), and I
believe this is the path Mellanox was pursing with the VM_NO_COW flag.
However, if applications are registering gigs of ram, it would be very
bad to have the entire area copied on fork.

On the other hand, I've always wondered about the choice to leave
holes in the child process's address space. I would have chosen to map
the zero page instead.

-- 
Bill Jordan
InfiniCon Systems

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: RDMA memory registration
  2005-04-29 19:43                                                                 ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Bill Jordan
@ 2005-04-29 19:45                                                                   ` Roland Dreier
  0 siblings, 0 replies; 144+ messages in thread
From: Roland Dreier @ 2005-04-29 19:45 UTC (permalink / raw)
  To: bjordan
  Cc: Caitlin Bestler, Andrew Morton, hch, linux-kernel,
	openib-general, Timur Tabi

    Bill> Are you suggesting making the partial pages their own VMA,
    Bill> or marking the entire buffer with this flag? I originally
    Bill> thought the entire buffer should be copy on fork (instead of
    Bill> copy on write), and I believe this is the path Mellanox was
    Bill> pursing with the VM_NO_COW flag.  However, if applications
    Bill> are registering gigs of ram, it would be very bad to have
    Bill> the entire area copied on fork.

It's up to userspace really but I would expect that the partial pages
would be in a vma by themselves.

 - R.


^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-29 17:04                                                               ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
@ 2005-04-30  0:31                                                                 ` Caitlin Bestler
  2005-05-03 18:43                                                                   ` Andy Isaacson
  0 siblings, 1 reply; 144+ messages in thread
From: Caitlin Bestler @ 2005-04-30  0:31 UTC (permalink / raw)
  To: Libor Michalek
  Cc: Bill Jordan, Andrew Morton, hch, linux-kernel, openib-general,
	Timur Tabi

On 4/29/05, Libor Michalek <libor@topspin.com> wrote:

> 
>   However, you have a potential problem with registered buffers that
> do not begin or end on a page boundary, which is common with malloc.
> If the buffer resides on a portion of a page, and you mark the vm
> which contains that entire page VM_DONTCOPY, to ensure that the parent
> has access to the exact physical page after the fork, the child will
> not be able to access anything on that entire page. So if the child
> expects to access data on the same page that happens to contain the
> registered buffer it will get a segment violation.
> 
> The four situations we've discussed are:
> 
>   1) Physical page does not get used for anything else.
>   2) Processes virtual to physical mapping remains fixed.
>   3) Same virtual to physical mapping after forking a child.
>   4) Forked child has access to all non-registered memory of
>      the parent.
> 
> The first two are now taken care of with get_user_pages, (we use to
> use VM_LOCKED for the second case) third case is handled by setting
> the vm to VM_DONTCOPY, and on the fourth case we've always punted,
> but the real answer is to break partial pages into seperate vms and
> mark them ALWAYS_COPY.
> 
> -Libor
> 
> 
Attempting to provide *any* support for applications that fork children
after doing RDMA registrations is a ratshole best avoided. The general
rule that application developers should follow is to do RDMA *only*
in the child processes.

Keep in mind that it is not only the memory regions that must be dealt
with, but control data invisible to the user (the QP context, etc.). This
data frequently is interlinked between kernel residente and user resident
data (such as a QP context has the PD ID somewhere on-chip or in
kernel, which the Send Queue ring needs to be in user memory). Having
two different user processes that both think they have the user half to
this type of split data structure is just asking for trouble, even if you 
manage to get the copy on write bit timing problems all solved.

All of this can be avoided by a simple rule: don't fork after opening
an RDMA device.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: RDMA memory registration
  2005-04-29 19:33                                                                   ` [openib-general] " Grant Grundler
@ 2005-05-03  8:42                                                                     ` David Addison
  2005-05-03 15:36                                                                       ` Grant Grundler
  0 siblings, 1 reply; 144+ messages in thread
From: David Addison @ 2005-05-03  8:42 UTC (permalink / raw)
  To: Grant Grundler
  Cc: Brice Goglin, Andrew Morton, linux-kernel, openib-general, hch,
	Caitlin Bestler

Grant Grundler wrote:
> On Fri, Apr 29, 2005 at 08:22:24PM +0200, Brice Goglin wrote:
>>For instance, instead of adding PROT_DONT/ALWAYSCOPY, you may use
>>an ioproc hook in the fork path. This hook (a function in your driver)
>>would be called for each registered page. It will decide whether
>>the page should be pre-copied or not and update the registration
>>table (or whatever stores address translations in the NIC).
>>In addition, the driver would probably pre-copy cow pages when
>>registering them.
> 
> This doesn't scale well as more cards are added to the box.
> I think I understand why it's good for single cards though.
> 
With the IOPROC patch the device driver hooks are registered on a per process
or perhaps better still, a per VMA basis. And for processes/VMAs where there
are no registrations the overhead is very low.

With multiple cards in a box, all using different device drivers, I guess there
could end up being multiple registrations per process/VMA. But I'm not sure
this will be a common case for RDMA use in real life.

Cheers
Addy.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: RDMA memory registration
  2005-05-03  8:42                                                                     ` David Addison
@ 2005-05-03 15:36                                                                       ` Grant Grundler
  0 siblings, 0 replies; 144+ messages in thread
From: Grant Grundler @ 2005-05-03 15:36 UTC (permalink / raw)
  To: David Addison
  Cc: Brice Goglin, Andrew Morton, linux-kernel, openib-general, hch,
	Caitlin Bestler

On Tue, May 03, 2005 at 09:42:12AM +0100, David Addison wrote:
> >This doesn't scale well as more cards are added to the box.
> >I think I understand why it's good for single cards though.
>
> With the IOPROC patch the device driver hooks are registered on a per 
> process or perhaps better still, a per VMA basis.

I was originally thinking the registrations are global (for all memory)
and not per process. Per process or per VMA seems reasonable to me.

> And for processes/VMAs where there are no registrations the overhead
> is very low.

Yes - thanks. I'm still reading the LKML thread you started:
	http://lkml.org/lkml/2005/4/26/198

In particular, the comments from Brice Goglin:
	http://lkml.org/lkml/2005/4/26/222

openib.org folks can find the IOPROC patch for 2.6.12-rc3 archived here:
	http://lkml.org/lkml/diff/2005/4/26/198/1

> With multiple cards in a box, all using different device drivers,
> I guess there could end up being multiple registrations per process/VMA.
> But I'm not sure this will be a common case for RDMA use in real life.

I agree. Gateways between fabrics is the only case I can think of.
This won't be a problem until someone at a large national lab tries
to connect two "legacy" fabrics together.

thanks,
grant

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-30  0:31                                                                 ` Caitlin Bestler
@ 2005-05-03 18:43                                                                   ` Andy Isaacson
  2005-05-03 19:04                                                                     ` Caitlin Bestler
  2005-05-04 18:22                                                                     ` William Jordan
  0 siblings, 2 replies; 144+ messages in thread
From: Andy Isaacson @ 2005-05-03 18:43 UTC (permalink / raw)
  To: Caitlin Bestler
  Cc: Libor Michalek, Bill Jordan, Andrew Morton, hch, linux-kernel,
	openib-general, Timur Tabi

On Fri, Apr 29, 2005 at 05:31:44PM -0700, Caitlin Bestler wrote:
> Attempting to provide *any* support for applications that fork children
> after doing RDMA registrations is a ratshole best avoided. The general
> rule that application developers should follow is to do RDMA *only*
> in the child processes.

I think it's unreasonable to *prohibit* fork-after-registration; for one
thing, there's lots of code that forks under the covers.  Setuid helpers
like getpty just assume that they're going to be able to fork.  Even
stuff like get*by*(3) can potentially fork.  And with site-configured
stuff like PAM, you end up with things that work on the developer's
system but break in deployment.

I think it's exceedingly reasonable to say "RDMA doesn't work in
children".  But the child should get a sane memory image:  at least
zeros in fully-registered pages, and preferably copies of
partially-registered pages.  Differentiating between fully-registered
and partially-registered pages avoids (I think) the pathological case of
having to copy a GB of data just to system("/bin/ls > /tmp/tmpfile").
You can still go pathological if you've partially-registered gigabytes
of address space (for example a linked list where each node is allocated
with malloc and then registered) but that's a case of "Well, don't do
that then".

Rather than replacing the fully-registered pages with pages of zeros,
you could simply unmap them.

A consistent statement would be

    After fork(2), any regions which were registered are UNDEFINED.
    Region boundaries are byte-accurate; a registration can cover just
    part of a page, in which case the non-registered part of the page
    has normal fork COW semantics.

Probably the most sane solution is to simply unmap the fully-registered
pages at fork time, and copy any partially-registered pages.  But the
statement above does not require this.

> Keep in mind that it is not only the memory regions that must be dealt
> with, but control data invisible to the user (the QP context, etc.). This
> data frequently is interlinked between kernel residente and user resident
> data (such as a QP context has the PD ID somewhere on-chip or in
> kernel, which the Send Queue ring needs to be in user memory). Having
> two different user processes that both think they have the user half to
> this type of split data structure is just asking for trouble, even if you 
> manage to get the copy on write bit timing problems all solved.

Obviously, calling *any* RDMA-userland-stuff in the child is completely
undefined [1].  One place where I can see a potential problem is in
atexit()-type handlers registered by the RDMA library.  Since those
aren't performance-critical they can and should do sanity checks with
getpid() and/or checking with the kernel driver.

[1] You might want to allow the child to start a completely new RDMA
    context, but I don't see that as necessary.

-andy

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-03 18:43                                                                   ` Andy Isaacson
@ 2005-05-03 19:04                                                                     ` Caitlin Bestler
  2005-05-04 18:22                                                                     ` William Jordan
  1 sibling, 0 replies; 144+ messages in thread
From: Caitlin Bestler @ 2005-05-03 19:04 UTC (permalink / raw)
  To: Andy Isaacson
  Cc: Libor Michalek, Bill Jordan, Andrew Morton, hch, linux-kernel,
	openib-general, Timur Tabi

On 5/3/05, Andy Isaacson <adi@hexapodia.org> wrote:

> 
> A consistent statement would be
> 
>     After fork(2), any regions which were registered are UNDEFINED.
>     Region boundaries are byte-accurate; a registration can cover just
>     part of a page, in which case the non-registered part of the page
>     has normal fork COW semantics.
> 

That is a reasonable approach.

> 
> Obviously, calling *any* RDMA-userland-stuff in the child is completely
> undefined [1].  One place where I can see a potential problem is in
> atexit()-type handlers registered by the RDMA library.  Since those
> aren't performance-critical they can and should do sanity checks with
> getpid() and/or checking with the kernel driver.
> 

That is also reasonable. None of the RDMA libraries I have worked on
bothered to use an atexit()-type hook because the user was theoretically
*required* to close the rnic, and driver code was already reuqired to clean
up in case of a total process failure. Adding an intermediate safety-net
for applications that exited cleanly but forget to close just didn't seem
worthwhile. If the application wants the cleanup performed optimally
then it can close the rnic, otherwise it can't complain about forcing
the RNIC vendor to clean up in the driver code.

> [1] You might want to allow the child to start a completely new RDMA
>     context, but I don't see that as necessary.
> 

That should be allowed. It is actually more normal to use the parent
as a dispatcher and to actually manage the connection in a child
process.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-03 18:43                                                                   ` Andy Isaacson
  2005-05-03 19:04                                                                     ` Caitlin Bestler
@ 2005-05-04 18:22                                                                     ` William Jordan
  2005-05-05  1:27                                                                       ` Rik van Riel
  1 sibling, 1 reply; 144+ messages in thread
From: William Jordan @ 2005-05-04 18:22 UTC (permalink / raw)
  To: Andy Isaacson
  Cc: Caitlin Bestler, Andrew Morton, linux-kernel, openib-general,
	hch, Timur Tabi

On 5/3/05, Andy Isaacson <adi@hexapodia.org> wrote:
> Rather than replacing the fully-registered pages with pages of zeros,
> you could simply unmap them.

I don't like this option. It is nearly free to map all of the pages to
the zero-page. You never have to allocate a page if the user never
writes to it.

Buf if you unmap the page, there could be issues. The memory region
could be on the stack, or malloc'ed. In these cases, the child should
be able to return from the function, or free the memory without
setting a timebomb.

-- 
Bill Jordan
InfiniCon Systems

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-04-26  3:31                 ` Libor Michalek
@ 2005-05-04 18:27                   ` Timur Tabi
  2005-05-05 18:48                     ` Timur Tabi
  2005-05-05 23:34                     ` Libor Michalek
  0 siblings, 2 replies; 144+ messages in thread
From: Timur Tabi @ 2005-05-04 18:27 UTC (permalink / raw)
  To: Libor Michalek; +Cc: Andrew Morton, linux-kernel, openib-general

Libor Michalek wrote:

>   The program opens the charcter device file descriptor, pins the pages
> and waits for a signal, before checking the pages, which is sent to the
> process after running some other program which exercises the VM. On older
> kernels the check fails, on my 2.6.11 kernel the check succeeds. So
> mlock is not needed on top of get_user_pages() as it was before.

Libor,

When you say "older", what exactly do you mean?  I have different test that normally fails 
with just get_user_pages(), but it works with 2.6.9 and above.  I haven't been able to get 
any kernel earlier than 2.6.9 to compile or boot properly, so I'm having a hard time 
narrowing down the actual point when get_user_pages() started working.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-04 18:22                                                                     ` William Jordan
@ 2005-05-05  1:27                                                                       ` Rik van Riel
  2005-05-05  1:57                                                                         ` Andy Isaacson
  0 siblings, 1 reply; 144+ messages in thread
From: Rik van Riel @ 2005-05-05  1:27 UTC (permalink / raw)
  To: William Jordan
  Cc: Andy Isaacson, Caitlin Bestler, Andrew Morton, linux-kernel,
	openib-general, hch, Timur Tabi

On Wed, 4 May 2005, William Jordan wrote:
> On 5/3/05, Andy Isaacson <adi@hexapodia.org> wrote:
> > Rather than replacing the fully-registered pages with pages of zeros,
> > you could simply unmap them.
> 
> I don't like this option. It is nearly free to map all of the pages to
> the zero-page. You never have to allocate a page if the user never
> writes to it.

Unmapping should work fine, as long as the VMA flags are
set appropriately.  The page fault handler can take care
of the rest...

-- 
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-05  1:27                                                                       ` Rik van Riel
@ 2005-05-05  1:57                                                                         ` Andy Isaacson
  0 siblings, 0 replies; 144+ messages in thread
From: Andy Isaacson @ 2005-05-05  1:57 UTC (permalink / raw)
  To: Rik van Riel
  Cc: William Jordan, Caitlin Bestler, Andrew Morton, linux-kernel,
	openib-general, hch, Timur Tabi

On Wed, May 04, 2005 at 09:27:21PM -0400, Rik van Riel wrote:
> On Wed, 4 May 2005, William Jordan wrote:
> > On 5/3/05, Andy Isaacson <adi@hexapodia.org> wrote:
> > > Rather than replacing the fully-registered pages with pages of zeros,
> > > you could simply unmap them.
> > 
> > I don't like this option. It is nearly free to map all of the pages to
> > the zero-page. You never have to allocate a page if the user never
> > writes to it.
> 
> Unmapping should work fine, as long as the VMA flags are
> set appropriately.  The page fault handler can take care
> of the rest...

I think there may be a difference in terminology here.  What I
originally proposed (and what I think Bill was reacting to) is the
equivalent of sys_munmap() on the range of registered pages.  That has
the downsides that he mentioned; an address that was valid in the parent
will now result in SIGSEGV or SIGBUS in the child, and it's explicitly
endorsed by the userland APIs (such as MPI2) that it's valid to register
stack addresses (for example).

What I think you're proposing, Rik, is that VMA get destroyed (or split,
if only part of it had been registered) and replaced with an anonymous
one.  That's a very low-overhead way of going about it, I think.  Then
as you say, the page fault handler will automatically give a zero page
to the process when it faults on those addresses.

Did I understand your suggestion correctly?  I think I agree with
Bill that having the child fault on pages which happened to have been
registered by the parent would be a bad thing.

This would, if I understand correctly, be visible in /proc/$$/maps.
Which is OK, if a little bit suprising; but the alternatives are worse.

-andy

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-04 18:27                   ` Timur Tabi
@ 2005-05-05 18:48                     ` Timur Tabi
  2005-05-06 23:08                       ` Timur Tabi
  2005-05-05 23:34                     ` Libor Michalek
  1 sibling, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-05-05 18:48 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Libor Michalek, Andrew Morton, linux-kernel, openib-general

Timur Tabi wrote:

> When you say "older", what exactly do you mean?  I have different test 
> that normally fails with just get_user_pages(), but it works with 2.6.9 
> and above.  I haven't been able to get any kernel earlier than 2.6.9 to 
> compile or boot properly, so I'm having a hard time narrowing down the 
> actual point when get_user_pages() started working.

I haven't gotten a reply to this question, but I've done my own research, and I think I 
found the answer.  Using my own test of get_user_pages(), it appears that the fix was 
placed in 2.6.7.  However, I would like to know specifically what the fix is. 
Unfortunately, tracking this stuff down is beyond my understanding of the Linux VM.

Assuming that the fix is in try_to_unmap_one(), the only significant change I see between
2.6.6 and 2.6.7 is the addition of this code:

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out_unlock;

	pmd = pmd_offset(pgd, address);
	if (!pmd_present(*pmd))
		goto out_unlock;

	pte = pte_offset_map(pmd, address);
	if (!pte_present(*pte))
		goto out_unmap;

	if (page_to_pfn(page) != pte_pfn(*pte))
		goto out_unmap;

Can anyone tell me if this is the actual fix, or at least a major part of the actual fix?

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-04 18:27                   ` Timur Tabi
  2005-05-05 18:48                     ` Timur Tabi
@ 2005-05-05 23:34                     ` Libor Michalek
  1 sibling, 0 replies; 144+ messages in thread
From: Libor Michalek @ 2005-05-05 23:34 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Andrew Morton, linux-kernel, openib-general

On Wed, May 04, 2005 at 01:27:54PM -0500, Timur Tabi wrote:
> Libor Michalek wrote:
> 
> >   The program opens the charcter device file descriptor, pins the pages
> > and waits for a signal, before checking the pages, which is sent to the
> > process after running some other program which exercises the VM. On older
> > kernels the check fails, on my 2.6.11 kernel the check succeeds. So
> > mlock is not needed on top of get_user_pages() as it was before.
> 
> When you say "older", what exactly do you mean? I have different test 
> that normally fails with just get_user_pages(), but it works with 2.6.9
> and above.  I haven't been able to get any kernel earlier than 2.6.9 to
> compile or boot properly, so I'm having a hard time narrowing down the
> actual point when get_user_pages() started working.

  The older kernel I tried was one of the 2.4.21 RHEL 3 kernels. I hadn't
spent much time investigating the issue since this was a new kernel, so it
was a natural one for me to try.

-Libor

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-05 18:48                     ` Timur Tabi
@ 2005-05-06 23:08                       ` Timur Tabi
  2005-05-07 13:18                         ` Hugh Dickins
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-05-06 23:08 UTC (permalink / raw)
  To: Timur Tabi; +Cc: Libor Michalek, Andrew Morton, linux-kernel, openib-general

Timur Tabi wrote:

> I haven't gotten a reply to this question, but I've done my own 
> research, and I think I found the answer.  Using my own test of 
> get_user_pages(), it appears that the fix was placed in 2.6.7.  However, 
> I would like to know specifically what the fix is. Unfortunately, 
> tracking this stuff down is beyond my understanding of the Linux VM.

I'm also still waiting for a reply to this question. Anyone????

Upon doing some more research, I think the fix might be those code instead:

	/*
	 * Don't pull an anonymous page out from under get_user_pages.
	 * GUP carefully breaks COW and raises page count (while holding
	 * page_table_lock, as we have here) to make sure that the page
	 * cannot be freed.  If we unmap that page here, a user write
	 * access to the virtual address will bring back the page, but
	 * its raised count will (ironically) be taken to mean it's not
	 * an exclusive swap page, do_wp_page will replace it by a copy
	 * page, and the user never get to see the data GUP was holding
	 * the original page for.
	 */
	if (PageSwapCache(page) &&
	    page_count(page) != page->mapcount + 2) {
		ret = SWAP_FAIL;
		goto out_unmap;
	}

Both this change and the other one I mentioned are new to 2.6.7.  I suppose I could try 
applying these patches to the 2.6.6 kernel and see if anything improves, but that won't 
help me understand what's really going on.  The above comment makes sounds almost like 
it's a fix, but it talks about copy-on-write, which is has nothing to do with the real 
problem.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-06 23:08                       ` Timur Tabi
@ 2005-05-07 13:18                         ` Hugh Dickins
  2005-05-07 14:45                           ` Timur Tabi
  0 siblings, 1 reply; 144+ messages in thread
From: Hugh Dickins @ 2005-05-07 13:18 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Libor Michalek, Andrew Morton, Andrea Arcangeli, linux-kernel,
	openib-general

Sorry for not replying earlier (indeed, sorry for not joining in the
wider RDMA pinning discussion), concentrating on other stuff at present.

On Fri, 6 May 2005, Timur Tabi wrote:
> Timur Tabi wrote:
> 
> > I haven't gotten a reply to this question, but I've done my own research,
> > and I think I found the answer.  Using my own test of get_user_pages(),
> > it appears that the fix was placed in 2.6.7.  However, I would like to
> > know specifically what the fix is. Unfortunately, tracking this stuff
> > down is beyond my understanding of the Linux VM.
> 
> I'm also still waiting for a reply to this question. Anyone????
> 
> Upon doing some more research, I think the fix might be those code instead:

I believe you're right this time - I was rather puzzled by your earlier
choice, then unhelpfully forgot to reply and point you a few lines further
down to this comment, which does shout "get_user_pages fix" quite loudly.

> /*
> * Don't pull an anonymous page out from under get_user_pages.
> * GUP carefully breaks COW and raises page count (while holding
> * page_table_lock, as we have here) to make sure that the page
> * cannot be freed.  If we unmap that page here, a user write
> * access to the virtual address will bring back the page, but
> * its raised count will (ironically) be taken to mean it's not
> * an exclusive swap page, do_wp_page will replace it by a copy
> * page, and the user never get to see the data GUP was holding
> * the original page for.
> */
> if (PageSwapCache(page) &&
> page_count(page) != page->mapcount + 2) {
> ret = SWAP_FAIL;
> goto out_unmap;
> }
> 
> Both this change and the other one I mentioned are new to 2.6.7.  I suppose I
> could try applying these patches to the 2.6.6 kernel and see if anything
> improves, but that won't help me understand what's really going on.

There's a lot of change in the rmap area between 2.6.6 and 2.6.7, but
you're right that this is an isolated fix, which could in principle be
applied to earlier releases.  Though I don't see it's worth doing now.

> The above comment makes sounds almost like it's a fix,

Almost?  Sorry if my comment doesn't make it obvious it's a fix for a
get_user_pages issue - I rewrote Andrea Arcangeli's original commment.
The analysis and fix are his.

> but it talks about copy-on-write,
> which is has nothing to do with the real problem.

Oh, well, maybe, but what is the real problem?
Are you sure that copy-on-write doesn't come into it?

I haven't reread through the whole thread, but my recollection is
that you never quite said what the real problem is: you'd found some
time ago that get_user_pages sometimes failed to pin the pages for
your complex app, so were forced to mlock too; but couldn't provide
any simple test case for the failure (which can indeed be a lot of
work to devise), so we were all in the dark as to what went wrong.

But you've now found that 2.6.7 and later kernels allow your app to
work correctly without mlock, good.  get_user_pages is certainly the
right tool to use for such pinning.  (On the question of whether
mlock guarantees that user virtual addresses map to the same physical
addresses, I prefer Arjan's view that it does not; but accept that
there might prove to be difficulties in holding that position.)

So, it works now, you've exonerated today's get_user_pages, and you've
identified at least one get_user_pages fix which went in at that time:
do we really need to chase this further?

Oh, in writing of copy-on-write, I've just remembered another fix
for get_user_pages which I made in 2.6.7 (though I've not heard of
anyone seeing the problem fixed): call to do_wp_page in do_swap_page.
get_user_pages assumes that the write fault it generates will break
copy-on-write i.e. will make a private copy page when necessary,
before returning to the caller; but that wasn't happening in the
do_swap_page case.

By the way, please don't be worried when soon the try_to_unmap_one
comment and code that you identified above disappear.  When I'm
back in patch submission mode, I'll be sending Andrew a patch which
removes it, instead reworking can_share_swap_page to rely on the
page_mapcount instead of page_count, which avoids the ironical
behaviour my comment refers to, and allows an awkward page migration
case to proceed (once unpinned).  Andrea and I now both prefer this
page_mapcount approach.

Hugh

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-07 13:18                         ` Hugh Dickins
@ 2005-05-07 14:45                           ` Timur Tabi
  2005-05-07 16:30                             ` Hugh Dickins
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-05-07 14:45 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Libor Michalek, Andrew Morton, Andrea Arcangeli, linux-kernel,
	openib-general

Hugh Dickins wrote:

> Oh, well, maybe, but what is the real problem?
> Are you sure that copy-on-write doesn't come into it?

No, but I do know that my test case doesn't call fork(), so it's reproducible without 
involving COW.  Of course, I'm sure someone's going to tell me now that COW comes into 
effect even without fork().  If so, please explain.

> I haven't reread through the whole thread, but my recollection is
> that you never quite said what the real problem is: you'd found some
> time ago that get_user_pages sometimes failed to pin the pages for
> your complex app, so were forced to mlock too; but couldn't provide
> any simple test case for the failure (which can indeed be a lot of
> work to devise), so we were all in the dark as to what went wrong.

The short answer: under "extreme" memory pressure, the data inside a page pinned by 
get_user_pages() is swapped out, moved, or deleted (I'm not sure which).  Some other data 
is placed into that physical location.

By extreme memory pressure, I mean having the process allocate and touch as much memory as 
possible.  Something like this:

num_bytes = get_amount_of_physical_ram();
char *p = malloc(num_bytes);
for (i=0; i<num_bytes; i+=PAGE_SIZE)
   p[i] = 0;

The above over-simplified code fails on earlier 2.6 kernels (or earlier versions of glibc 
that accompany most distros the use the earlier 2.6 kernels).  Either malloc() returns 
NULL, or the p[i]=0 part causes a segfault.  I haven't bothered to trace down why.  But 
when it does work, the page pinned by get_user_pages() changes.

> But you've now found that 2.6.7 and later kernels allow your app to
> work correctly without mlock, good.  get_user_pages is certainly the
> right tool to use for such pinning.  (On the question of whether
> mlock guarantees that user virtual addresses map to the same physical
> addresses, I prefer Arjan's view that it does not; but accept that
> there might prove to be difficulties in holding that position.)

My understanding is that mlock() could in theory allow the page to be moved, but that 
currently nothing in the kernel would actually move it.  However, that could change in the 
future to allow hot-swapping of RAM.

> So, it works now, you've exonerated today's get_user_pages, and you've
> identified at least one get_user_pages fix which went in at that time:
> do we really need to chase this further?

My driver needs to support all 2.4 and 2.6 kernel versions.  My makefile scans the kernel 
source tree with 'grep' to identify various characterists, and I use #ifdefs to 
conditionally compile code depending on what features are present in the kernel.  I can't 
use the kernel version number, because that's not reliable - distros will incorporate 
patches from future kernels without changing the version ID.

So I need to take into account distro vendors that use an earlier kernel, like 2.6.5, and 
back-port the patch from 2.6.7.  The distro vendor will keep the 2.6.5 version number, 
which is why I can't rely on it.

I need to know exactly what the fix is, so that when I scan mm/rmap.c, I know what to look 
for.  Currently, I look for this regex:

try_to_unmap_one.*vm_area_struct

which seems to work.  However, now I think it's just a coincidence.

> By the way, please don't be worried when soon the try_to_unmap_one
> comment and code that you identified above disappear.  When I'm
> back in patch submission mode, I'll be sending Andrew a patch which
> removes it, instead reworking can_share_swap_page to rely on the
> page_mapcount instead of page_count, which avoids the ironical
> behaviour my comment refers to, and allows an awkward page migration
> case to proceed (once unpinned).  Andrea and I now both prefer this
> page_mapcount approach.

Ugh, that means my regex is probably going to break.  Not only that, but I don't 
understand what you're saying either.  Trying to understand the VM is really hard.

I guess in this specific case, it doesn't really matter, because calling mlock() when I 
should be calling get_user_pages() is not a bad thing.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-07 14:45                           ` Timur Tabi
@ 2005-05-07 16:30                             ` Hugh Dickins
  2005-05-11 20:12                               ` William Jordan
  0 siblings, 1 reply; 144+ messages in thread
From: Hugh Dickins @ 2005-05-07 16:30 UTC (permalink / raw)
  To: Timur Tabi
  Cc: Libor Michalek, Andrew Morton, Andrea Arcangeli, linux-kernel,
	openib-general

On Sat, 7 May 2005, Timur Tabi wrote:
> 
> > Oh, well, maybe, but what is the real problem?
> > Are you sure that copy-on-write doesn't come into it?
> 
> No, but I do know that my test case doesn't call fork(), so it's reproducible
> without involving COW.  Of course, I'm sure someone's going to tell me now
> that COW comes into effect even without fork().  If so, please explain.

I'll try.  COW comes into effect whenever you're sharing a page and
then need to make private changes to it.  Fork is one way of sharing
(with ancestor and descendant processes).  Using the empty zero page
is another way of sharing (with all other processes and parts of your
own address space with a readonly page full of zeroes).  Using a file
page from the page cache is another way of sharing.

None of those is actually your case, but our test for whether a page
is shared has been inadequate: oversimplifying, if page_count is more
than 1 then we have to assume it is shared and do the copy-on-write
(if the modifications are to be private).  But there are various places
where the page_count is temporarily raised (e.g. while paging out),
which we cannot distinguish, so occasionally we'll copy on write even
when it's not necessary, but we lack the information to tell us so.

In particular, of course, get_user_pages raises page_count to pin
the page: so making a page appear shared when it's not shared at all.

> The short answer: under "extreme" memory pressure, the data inside a page
> pinned by get_user_pages() is swapped out, moved, or deleted (I'm not sure
> which).  Some other data is placed into that physical location.
> 
> By extreme memory pressure, I mean having the process allocate and touch as
> much memory as possible.  Something like this:
> 
> num_bytes = get_amount_of_physical_ram();
> char *p = malloc(num_bytes);
> for (i=0; i<num_bytes; i+=PAGE_SIZE)
> p[i] = 0;
> 
> The above over-simplified code fails on earlier 2.6 kernels (or earlier
> versions of glibc that accompany most distros the use the earlier 2.6
> kernels).  Either malloc() returns NULL, or the p[i]=0 part causes a segfault.
> I haven't bothered to trace down why.  But when it does work, the page pinned
> by get_user_pages() changes.

Which has to be a bug with get_user_pages, which has no other purpose
than to pin the pages.  I cannot criticize you for working around it
to get your app working on lots of releases, but what _we_ have to do
is fix get_user_pages - and it appears that Andrea did so a year ago.

I'm surprised if it's as simple as you describe (you do say over-
simplified, maybe the critical points have fallen out), since GUP
users would have complained long ago if it wasn't doing the job in
normal cases of memory pressure.  Andrea's case does involve the
process independently trying to touch a page it has pinned for I/O
with get_user_pages.  Or (and I've only just thought of this, suspect
it might be exactly your case) not touch, but apply get_user_pages
again to a page already so pinned (while memory pressure has caused
try_to_unmap_one temporarily to detach it from the user address space
- the aspect of the problem that Andrea's fix attacks).

> My understanding is that mlock() could in theory allow the page to be moved,
> but that currently nothing in the kernel would actually move it.  However,
> that could change in the future to allow hot-swapping of RAM.

That's my understanding too, that nothing currently does so.  Aside from
hot-swapping RAM, there's also a need to be able to migrate pages around
RAM, either to unfragment memory allowing higher-order allocations to
succeed more often, or to get around extreme dmamem/normal-mem/highmem
imbalances without dedicating huge reserves.  Those would more often
succeed if uninhibited by mlock.

> So I need to take into account distro vendors that use an earlier kernel, like
> 2.6.5, and back-port the patch from 2.6.7.  The distro vendor will keep the
> 2.6.5 version number, which is why I can't rely on it.
> 
> I need to know exactly what the fix is, so that when I scan mm/rmap.c, I know
> what to look for.  Currently, I look for this regex:
> 
> try_to_unmap_one.*vm_area_struct
> 
> which seems to work.  However, now I think it's just a coincidence.

Perhaps any release based on 2.6.7 or above, or any release which
mentions "get_user_pages" in its mm/rmap.c or mm/objrmap.c?

> > By the way, please don't be worried when soon the try_to_unmap_one
> > comment and code that you identified above disappear.  When I'm
> > back in patch submission mode, I'll be sending Andrew a patch which
> > removes it, instead reworking can_share_swap_page to rely on the
> > page_mapcount instead of page_count, which avoids the ironical
> > behaviour my comment refers to, and allows an awkward page migration
> > case to proceed (once unpinned).  Andrea and I now both prefer this
> > page_mapcount approach.
> 
> Ugh, that means my regex is probably going to break.  Not only that, but I
> don't understand what you're saying either.  Trying to understand the VM is
> really hard.

Sorry about that, but suiting your regex is low in our priorities for
VM design!  I was tempted to offer to keep a comment on get_user_pages
in mm/rmap.c after the change, but that's really rather babyish: just
assume 2.6.7 and upwards are fixed (or complain if you find not).

Perhaps I'll manage a clearer explanation when I come to write the
change description for the patch, we'll have to see.

> I guess in this specific case, it doesn't really matter, because calling
> mlock() when I should be calling get_user_pages() is not a bad thing.

If you can afford to keep that amount of memory mlocked, and have to
capability to do so, yes, it should do no harm.  We were just upset
to think that mlock was still needed to get around a get_user_pages
bug which was fixed a year ago.

Hugh

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-07 16:30                             ` Hugh Dickins
@ 2005-05-11 20:12                               ` William Jordan
  2005-05-11 20:42                                 ` Hugh Dickins
  2005-05-11 22:49                                 ` Andrea Arcangeli
  0 siblings, 2 replies; 144+ messages in thread
From: William Jordan @ 2005-05-11 20:12 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Timur Tabi, Andrew Morton, Andrea Arcangeli, linux-kernel,
	openib-general

On 5/7/05, Hugh Dickins <hugh@veritas.com> wrote:
> > My understanding is that mlock() could in theory allow the page to be moved,
> > but that currently nothing in the kernel would actually move it.  However,
> > that could change in the future to allow hot-swapping of RAM.
> 
> That's my understanding too, that nothing currently does so.  Aside from
> hot-swapping RAM, there's also a need to be able to migrate pages around
> RAM, either to unfragment memory allowing higher-order allocations to
> succeed more often, or to get around extreme dmamem/normal-mem/highmem
> imbalances without dedicating huge reserves.  Those would more often
> succeed if uninhibited by mlock.

Hugh,

If I am reading you correctly, you are saying that mlock currently
prevents pages from migrating around to unfragment memory, but
get_user_pages does not prevent this? If this is the case, this could
very easily be the problem Timur was experiencing. He is using
get_user_pages to lock pages long term (for the life of the process,
beyond the bounds of a single system call).

If it is possible for a page to be migrated in physical memory during
extreme virtual memory pressure while the reference count is held with
get_user_pages, that would cause the problem where the hardware is no
longer mapped to the same page as the application.

BTW: In earlier kernels, I experienced the same issues in our IB
drivers when trying to pin pages using only get_user_pages.

-- 
Bill Jordan
InfiniCon Systems

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-11 20:12                               ` William Jordan
@ 2005-05-11 20:42                                 ` Hugh Dickins
  2005-05-11 22:52                                   ` Andrea Arcangeli
  2005-05-11 22:49                                 ` Andrea Arcangeli
  1 sibling, 1 reply; 144+ messages in thread
From: Hugh Dickins @ 2005-05-11 20:42 UTC (permalink / raw)
  To: William Jordan
  Cc: Timur Tabi, Andrew Morton, Andrea Arcangeli, linux-kernel,
	openib-general

On Wed, 11 May 2005, William Jordan wrote:
> On 5/7/05, Hugh Dickins <hugh@veritas.com> wrote:
> > > My understanding is that mlock() could in theory allow the page to be moved,
> > > but that currently nothing in the kernel would actually move it.  However,
> > > that could change in the future to allow hot-swapping of RAM.
> > 
> > That's my understanding too, that nothing currently does so.  Aside from
> > hot-swapping RAM, there's also a need to be able to migrate pages around
> > RAM, either to unfragment memory allowing higher-order allocations to
> > succeed more often, or to get around extreme dmamem/normal-mem/highmem
> > imbalances without dedicating huge reserves.  Those would more often
> > succeed if uninhibited by mlock.
> 
> If I am reading you correctly, you are saying that mlock currently
> prevents pages from migrating around to unfragment memory, but
> get_user_pages does not prevent this?

No, not what I meant at all.  I'm saying that currently (aside from
proposed patches) there is no such migration of pages; that we'd prefer
to implement migration in such a way that mlock does not inhibit it
(though there might prove to be strong arguments defeating that);
and that get_user_pages _must_ prevent migration (and if there
were already such migration, I'd be saying it _does_ prevent it).

Hugh

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-11 20:12                               ` William Jordan
  2005-05-11 20:42                                 ` Hugh Dickins
@ 2005-05-11 22:49                                 ` Andrea Arcangeli
  2005-05-11 22:53                                   ` Timur Tabi
  1 sibling, 1 reply; 144+ messages in thread
From: Andrea Arcangeli @ 2005-05-11 22:49 UTC (permalink / raw)
  To: William Jordan
  Cc: Hugh Dickins, Timur Tabi, Andrew Morton, linux-kernel, openib-general

On Wed, May 11, 2005 at 04:12:41PM -0400, William Jordan wrote:
> If I am reading you correctly, you are saying that mlock currently
> prevents pages from migrating around to unfragment memory, but
> get_user_pages does not prevent this? If this is the case, this could

This is not the case. Infact get_user_pages is a stronger pin than
mlock. But if you call it by hand and you plan to write to the page, you
have to use the "write=1" flag, this is fundamental if you want to write
to the physical page from userland while it's being tracked by IB dma.

In short you should not use mlock and you should use only
get_user_pages(write=1).

If the problem appears again even after the last fix for the COW I did
last year, than it means we've another yet another bug to fix.

Using mlock for this is unnecessary. mlock is a "virtual" pin and it
provides weaker guarantees than what you need. You need _physical_ pin
and get_user_pages(write=1) is the only one that will give it to you.

write=0 is ok too if you're never ever going to write to the page with
the cpu from userland.

In the old days there was the concept that get_user_pages wasn't a
"pte-pin", but that was infact broken in the way COW was working with threads,
but this is fixed now that is really a "pte-pin" again (like in 2.2
which never had the corruption cow bug!) even though the pte may
temporarily be set to swapcache or null. In current 2.6 you're
guaranteed that despite the pte may be temporarly be set to not-present,
the next minor fault will bring into memory the very same physical page
that was there before. At least unless you map the thing writeprotect
(i.e. write=0) and you write to it from userland.. ;).

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-11 20:42                                 ` Hugh Dickins
@ 2005-05-11 22:52                                   ` Andrea Arcangeli
  0 siblings, 0 replies; 144+ messages in thread
From: Andrea Arcangeli @ 2005-05-11 22:52 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: William Jordan, Timur Tabi, Andrew Morton, linux-kernel, openib-general

On Wed, May 11, 2005 at 09:42:24PM +0100, Hugh Dickins wrote:
> proposed patches) there is no such migration of pages; that we'd prefer
> to implement migration in such a way that mlock does not inhibit it
> (though there might prove to be strong arguments defeating that);
> and that get_user_pages _must_ prevent migration (and if there
> were already such migration, I'd be saying it _does_ prevent it).

Indeed, mlock is a virtual pin and as such it won't be guaranteed to
always prevent migration. While get_user_pages is a physical pin on the
physical page so it has to prevent migration.

I think for him the physical pin is better since I guess IB would break
(at least unless you've some method to call to stop IB, adjust the IB
dma tracking, and restart IB, that hotplug can call). For the short term
using only get_user_pages sounds simpler IMHO.

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-11 22:49                                 ` Andrea Arcangeli
@ 2005-05-11 22:53                                   ` Timur Tabi
  2005-05-11 23:05                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 144+ messages in thread
From: Timur Tabi @ 2005-05-11 22:53 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: William Jordan, Hugh Dickins, Andrew Morton, linux-kernel,
	openib-general

Andrea Arcangeli wrote:

> If the problem appears again even after the last fix for the COW I did
> last year, than it means we've another yet another bug to fix.

All of my memory pinning test cases pass when I use get_user_pages() with kernels 2.6.7 
and later.

-- 
Timur Tabi
Staff Software Engineer
timur.tabi@ammasso.com

One thing a Southern boy will never say is,
"I don't think duct tape will fix it."
      -- Ed Smylie, NASA engineer for Apollo 13

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
  2005-05-11 22:53                                   ` Timur Tabi
@ 2005-05-11 23:05                                     ` Andrea Arcangeli
  0 siblings, 0 replies; 144+ messages in thread
From: Andrea Arcangeli @ 2005-05-11 23:05 UTC (permalink / raw)
  To: Timur Tabi
  Cc: William Jordan, Hugh Dickins, Andrew Morton, linux-kernel,
	openib-general

On Wed, May 11, 2005 at 05:53:36PM -0500, Timur Tabi wrote:
> Andrea Arcangeli wrote:
> 
> >If the problem appears again even after the last fix for the COW I did
> >last year, than it means we've another yet another bug to fix.
> 
> All of my memory pinning test cases pass when I use get_user_pages() with 
> kernels 2.6.7 and later.

Well then your problem was the cow bug, that was corrupting userland
with O_DIRECT too...

^ permalink raw reply	[flat|nested] 144+ messages in thread

* Re: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation
       [not found] ` <3VNYt-4M4-15@gated-at.bofh.it>
@ 2005-04-22 13:10   ` Bodo Eggert <harvested.in.lkml@posting.7eggert.dyndns.org>
  0 siblings, 0 replies; 144+ messages in thread
From: Bodo Eggert <harvested.in.lkml@posting.7eggert.dyndns.org> @ 2005-04-22 13:10 UTC (permalink / raw)
  To: Andy Isaacson, Timur Tabi, Troy Benjegerdes, Bernhard Fischer,
	Arjan van de Ven, linux-kernel, openib-general

Andy Isaacson <adi@hexapodia.org> wrote:
> On Wed, Apr 20, 2005 at 10:07:45PM -0500, Timur Tabi wrote:

>> I don't know if VM_REGISTERED is a good idea or not, but it should be
>> absolutely impossible for the kernel to reclaim "registered" (aka pinned)
>> memory, no matter what. For RDMA services (such as Infiniband, iWARP, etc),
>> it's normal for non-root processes to pin hundreds of megabytes of memory,
>> and that memory better be locked to those physical pages until the
>> application deregisters them.
> 
> If you take the hardline position that "the app is the only thing that
> matters", your code is unlikely to get merged.  Linux is a
> general-purpose OS.

All userspace hardware drivers with DMA will require pinned pages (and some
of them will require continuous memory). Since this memory may be scheduled
to be accessed by DMA, reclaiming those pages may (aka. will) result in
"random" memory corruption unless done by the driver itself.

You can't even set a time limit, the driver may have allocated all DMA
memory to queued transfers, and some media needs to get plugged in by
the lazy robot. As soon as the robot arrives - boom. (For the same reason,
this memory MUST NOT be freed if the application terminates abnormally,
e.g. killed by OOM).

In other words, you need to make this memory as unaccessible as the
framebuffer on a graphic card. If that causes a lockup, you better had
prevented that while allocating.

> In a Linux context, I doubt that fullblown SA is necessary or
> appropriate.  Rather, I'd suggest two new signals, SIGMEMLOW and
> SIGMEMCRIT.  The userland comms library registers handlers for both.
> When the kernel decides that it needs to reclaim some memory from the
> app, it sends SIGMEMLOW.  The comms library then has the responsibility
> to un-reserve some memory in an orderly fashion.  If a reasonable [1]
> time has expired since SIGMEMLOW and the kernel is still hungry, the
> kernel sends SIGMEMCRIT.  At this point, the comms lib *must* unregister
> some memory [2] even if it has to drop state to do so; if it returns
> from the signal handler without having unregistered the memory, the
> kernel will SIGKILL.

Choosing Data loss vs. finitely stalled system may sometimes be a bad
decision.

If I designes an application that might get a "gimme memory or die",
I'd reserve an extra bunch of memory with the only purpose of being
released in this situation. If the kernel had done that instead, this
part of memory could have been used e.g. as a read-only disk cache in
the meantime (off cause provided somebody cared to implement that).

> [2] Is there a way for the kernel to pass down to userspace how many
>     pages it wants, maybe in the sigcontext?

Then you'd need only one signal.

I think this interface is usefull, it would e.g. allow a picture viewer
to cache as many decoded and scaled pictures as the RAM permits, freeing
them if the RAM gets full and the swap would have to be used.

-- 
"When the pin is pulled, Mr. Grenade is not our friend.
-U.S. Marine Corps


^ permalink raw reply	[flat|nested] 144+ messages in thread

end of thread, other threads:[~2005-05-11 23:07 UTC | newest]

Thread overview: 144+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-04-04 22:09 [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Roland Dreier
2005-04-04 22:09 ` [PATCH][RFC][1/4] IB: core changes for userspace verbs Roland Dreier
2005-04-04 22:09   ` [PATCH][RFC][2/4] IB: userspace verbs main module Roland Dreier
2005-04-04 22:09     ` [PATCH][RFC][3/4] IB: userspace verbs mthca changes Roland Dreier
2005-04-04 22:09       ` [PATCH][RFC][4/4] IB: userspace verbs Kconfig/Makefile changes Roland Dreier
2005-04-04 22:49       ` [openib-general] [PATCH][RFC][3/4] IB: userspace verbs mthca changes Tom Duffy
2005-04-04 23:34         ` Roland Dreier
2005-04-21  0:37       ` [PATCH][MTHCA] fix sparc build WAS: " Tom Duffy
2005-04-21  0:38         ` David S. Miller
2005-04-11 14:22 ` [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Troy Benjegerdes
2005-04-11 15:34   ` Roland Dreier
2005-04-11 16:33     ` Troy Benjegerdes
2005-04-11 16:56       ` Roland Dreier
2005-04-11 18:01         ` Troy Benjegerdes
2005-04-11 18:03           ` Roland Dreier
2005-04-12  0:13             ` Andrew Morton
2005-04-12  0:21               ` Roland Dreier
2005-04-12 18:23                 ` Michael S. Tsirkin
2005-04-13 18:28                   ` Roland Dreier
2005-04-13 19:32                     ` Andrew Morton
2005-04-13  1:04               ` [openib-general] " Libor Michalek
2005-04-18 17:15                 ` Timur Tabi
2005-04-26  3:31                 ` Libor Michalek
2005-05-04 18:27                   ` Timur Tabi
2005-05-05 18:48                     ` Timur Tabi
2005-05-06 23:08                       ` Timur Tabi
2005-05-07 13:18                         ` Hugh Dickins
2005-05-07 14:45                           ` Timur Tabi
2005-05-07 16:30                             ` Hugh Dickins
2005-05-11 20:12                               ` William Jordan
2005-05-11 20:42                                 ` Hugh Dickins
2005-05-11 22:52                                   ` Andrea Arcangeli
2005-05-11 22:49                                 ` Andrea Arcangeli
2005-05-11 22:53                                   ` Timur Tabi
2005-05-11 23:05                                     ` Andrea Arcangeli
2005-05-05 23:34                     ` Libor Michalek
2005-04-18 16:22               ` Timur Tabi
2005-04-18 16:43                 ` Christoph Hellwig
2005-04-18 16:45                   ` Timur Tabi
2005-04-24  2:44                     ` Andrew Morton
2005-04-24 14:23                       ` Timur Tabi
2005-04-24 20:53                         ` Greg KH
2005-04-24 21:52                           ` Timur Tabi
2005-04-25  1:03                             ` Greg KH
2005-04-25  4:12                               ` Timur Tabi
2005-04-25 13:30                                 ` Dave Hansen
2005-04-25 13:15                         ` Roland Dreier
2005-04-25 13:17                           ` Christoph Hellwig
2005-04-25 14:16                             ` Roland Dreier
2005-04-25 20:54                           ` Andrew Morton
2005-04-25 21:12                             ` Roland Dreier
2005-04-25 22:14                               ` Andrew Morton
2005-04-25 22:21                                 ` Timur Tabi
2005-04-25 22:32                                   ` Andrew Morton
2005-04-25 23:58                                     ` Roland Dreier
2005-04-26  0:11                                       ` Andrew Morton
2005-04-26  0:23                                         ` Roland Dreier
2005-04-26  0:37                                           ` Andrew Morton
2005-04-26  2:21                                             ` Timur Tabi
2005-04-26  3:16                                               ` Andrew Morton
2005-04-26  3:38                                                 ` Timur Tabi
2005-04-26  4:33                                                   ` Andrew Morton
2005-04-26 14:07                                                     ` Timur Tabi
2005-04-26 15:31                                             ` Roland Dreier
2005-04-26 15:42                                               ` [openib-general] " Libor Michalek
2005-04-26 15:49                                                 ` Roland Dreier
2005-04-26 19:28                                                   ` Andrew Morton
2005-04-26 20:14                                                     ` Roland Dreier
2005-04-26 20:18                                                       ` Timur Tabi
2005-04-26 20:37                                                         ` Andrew Morton
2005-04-29 14:26                                                           ` Bill Jordan
2005-04-29 15:56                                                             ` Caitlin Bestler
2005-04-29 16:45                                                               ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Roland Dreier
2005-04-29 17:23                                                                 ` Libor Michalek
2005-04-29 18:22                                                                 ` RDMA memory registration Brice Goglin
2005-04-29 18:31                                                                   ` Roland Dreier
2005-04-29 19:33                                                                   ` [openib-general] " Grant Grundler
2005-05-03  8:42                                                                     ` David Addison
2005-05-03 15:36                                                                       ` Grant Grundler
2005-04-29 19:43                                                                 ` RDMA memory registration (was: [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation) Bill Jordan
2005-04-29 19:45                                                                   ` RDMA memory registration Roland Dreier
2005-04-29 17:04                                                               ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
2005-04-30  0:31                                                                 ` Caitlin Bestler
2005-05-03 18:43                                                                   ` Andy Isaacson
2005-05-03 19:04                                                                     ` Caitlin Bestler
2005-05-04 18:22                                                                     ` William Jordan
2005-05-05  1:27                                                                       ` Rik van Riel
2005-05-05  1:57                                                                         ` Andy Isaacson
2005-04-26 20:32                                                       ` Andrew Morton
2005-04-26 21:23                                                         ` Roland Dreier
2005-04-27  0:05                                                           ` Andrew Morton
2005-04-27  2:13                                                             ` Roland Dreier
2005-04-27  3:21                                                             ` Caitlin Bestler
2005-04-27  3:15                                                     ` Caitlin Bestler
2005-04-26  2:03                                       ` IWAMOTO Toshihiro
2005-04-26  2:16                                         ` Timur Tabi
2005-04-26  2:26                                         ` [openib-general] " Stephen Langdon
2005-04-25 22:23                                 ` Timur Tabi
2005-04-25 22:35                                   ` Andrew Morton
2005-04-25 22:42                                     ` Timur Tabi
2005-04-25 23:13                                       ` Andrew Morton
2005-04-25 23:21                                         ` Timur Tabi
2005-04-25 23:27                                           ` Andrew Morton
2005-04-26  0:08                                         ` Roland Dreier
2005-04-25 22:51                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbsimplementation Bob Woodruff
2005-04-25 23:13                                       ` Timur Tabi
2005-04-25 23:17                                         ` Andrew Morton
2005-04-25 23:29                                         ` Bob Woodruff
2005-04-25 23:17                                     ` [openib-general] Re: [PATCH][RFC][0/4] InfiniBand userspace verbs implementation Libor Michalek
2005-04-25 23:24                                       ` Andrew Morton
2005-04-25 23:37                                         ` Caitlin Bestler
2005-04-26  0:10                                           ` Andrew Morton
2005-04-26  3:55                                         ` Libor Michalek
2005-04-26  0:02                                 ` Roland Dreier
2005-04-26  6:12                                   ` Christoph Hellwig
2005-04-26 13:45                                     ` [openib-general] " Caitlin Bestler
2005-04-26 15:24                                     ` Timur Tabi
2005-04-25 19:11                       ` Andy Isaacson
2005-04-18 16:09     ` Timur Tabi
2005-04-18 16:12       ` Roland Dreier
2005-04-18 16:50         ` Timur Tabi
2005-04-21 19:47           ` Pavel Machek
2005-04-18 16:16       ` Arjan van de Ven
2005-04-18 16:25         ` Timur Tabi
2005-04-18 19:40           ` Arjan van de Ven
2005-04-18 20:00             ` Timur Tabi
2005-04-18 20:05               ` Arjan van de Ven
2005-04-18 20:19                 ` Timur Tabi
2005-04-18 20:07             ` [openib-general] " Bernhard Fischer
2005-04-21  2:17               ` Troy Benjegerdes
2005-04-21  3:07                 ` Timur Tabi
2005-04-21 17:38                   ` Andy Isaacson
2005-04-21 18:39                     ` Timur Tabi
2005-04-21 19:56                       ` Andy Isaacson
2005-04-21 20:07                         ` Timur Tabi
2005-04-21 20:12                           ` Chris Wright
2005-04-21 20:14                             ` Timur Tabi
2005-04-21 20:25                               ` Chris Wright
2005-04-21 20:30                                 ` Arjan van de Ven
2005-04-22  6:14                           ` Greg KH
2005-04-22 17:55         ` Timur Tabi
2005-04-22 18:12           ` Arjan van de Ven
2005-04-29  0:56         ` Andrew Morton
     [not found] <3VAeQ-1To-7@gated-at.bofh.it>
     [not found] ` <3VNYt-4M4-15@gated-at.bofh.it>
2005-04-22 13:10   ` [openib-general] " Bodo Eggert <harvested.in.lkml@posting.7eggert.dyndns.org>

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).