* [PATCH 0/9] staging/rdma/hfi1: Fix bugs and performance issues
From: ira.weiny @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh, devel; +Cc: linux-rdma, dledford, dennis.dalessandro

From: Ira Weiny <ira.weiny@intel.com>

The following are fixes to the hfi1 driver for stability, security, and
performance.

This new series is based on the latest staging-next.  Large patches were split
up, and a new patch was added to remove the offending file pointer macros.

Ira Weiny (1):
  staging/rdma/hfi1: Remove file pointer macros

Mike Marciniszyn (3):
  staging/rdma/hfi1: move hfi1_migrate_qp
  staging/rdma/hfi1: Use parallel workqueue for SDMA engines
  staging/rdma/hfi: pre-compute sc and sde for RC/UC QPs

Mitko Haralanov (5):
  staging/rdma/hfi1: Clean up macro indentation
  staging/rdma/hfi1: Remove unnecessary include files
  staging/rdma/hfi1: Move macros to a common header
  staging/rdma/hfi1: Add function stubs for TID caching
  staging/rdma/hfi1: Implement Expected Receive TID caching

 drivers/staging/rdma/hfi1/Kconfig        |    1 +
 drivers/staging/rdma/hfi1/Makefile       |    2 +-
 drivers/staging/rdma/hfi1/chip.c         |    1 +
 drivers/staging/rdma/hfi1/common.h       |   15 +-
 drivers/staging/rdma/hfi1/file_ops.c     |  598 +++------------
 drivers/staging/rdma/hfi1/hfi.h          |   55 +-
 drivers/staging/rdma/hfi1/init.c         |   18 +-
 drivers/staging/rdma/hfi1/iowait.h       |    6 +-
 drivers/staging/rdma/hfi1/qp.c           |   47 +-
 drivers/staging/rdma/hfi1/qp.h           |   38 +-
 drivers/staging/rdma/hfi1/ruc.c          |   30 +-
 drivers/staging/rdma/hfi1/sdma.c         |    8 +-
 drivers/staging/rdma/hfi1/sdma.h         |    8 +-
 drivers/staging/rdma/hfi1/trace.h        |  132 ++--
 drivers/staging/rdma/hfi1/ud.c           |    1 +
 drivers/staging/rdma/hfi1/user_exp_rcv.c | 1174 ++++++++++++++++++++++++++++++
 drivers/staging/rdma/hfi1/user_exp_rcv.h |   82 +++
 drivers/staging/rdma/hfi1/user_pages.c   |  110 +--
 drivers/staging/rdma/hfi1/user_sdma.c    |   47 +-
 drivers/staging/rdma/hfi1/user_sdma.h    |   10 +-
 drivers/staging/rdma/hfi1/verbs.c        |   34 +-
 drivers/staging/rdma/hfi1/verbs.h        |    6 +-
 include/uapi/rdma/hfi/hfi1_user.h        |   42 +-
 23 files changed, 1670 insertions(+), 795 deletions(-)
 create mode 100644 drivers/staging/rdma/hfi1/user_exp_rcv.c
 create mode 100644 drivers/staging/rdma/hfi1/user_exp_rcv.h

-- 
1.8.2


* [PATCH 1/9] staging/rdma/hfi1: Remove file pointer macros
From: ira.weiny @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh, devel; +Cc: linux-rdma, dledford, dennis.dalessandro

From: Ira Weiny <ira.weiny@intel.com>

Remove the following macros in favor of explicit use of struct hfi1_filedata and
its various sub-structures:

ctxt_fp
subctxt_fp
tidcursor_fp
user_sdma_pkt_fp
user_sdma_comp_fp
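
For reference, every conversion in this patch follows the same basic pattern;
a minimal sketch (example_handler() is a placeholder, not a function in the
driver, and manage_rcvq() is the existing file_ops.c helper):

  static int example_handler(struct file *fp)
  {
          /* was: struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); */
          struct hfi1_filedata *fd = fp->private_data;
          struct hfi1_ctxtdata *uctxt = fd->uctxt;

          /* was: manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val); */
          return manage_rcvq(uctxt, fd->subctxt, 0);
  }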

Reviewed-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/staging/rdma/hfi1/file_ops.c  | 124 ++++++++++++++++++----------------
 drivers/staging/rdma/hfi1/hfi.h       |  12 ----
 drivers/staging/rdma/hfi1/user_sdma.c |  34 ++++++----
 3 files changed, 84 insertions(+), 86 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
index aae9826ec62b..f78bcf673252 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -204,7 +204,8 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 			       size_t count, loff_t *offset)
 {
 	const struct hfi1_cmd __user *ucmd;
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_cmd cmd;
 	struct hfi1_user_info uinfo;
 	struct hfi1_tid_info tinfo;
@@ -338,17 +339,17 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 		ret = exp_tid_free(fp, &tinfo);
 		break;
 	case HFI1_CMD_RECV_CTRL:
-		ret = manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val);
+		ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
 		break;
 	case HFI1_CMD_POLL_TYPE:
 		uctxt->poll_type = (typeof(uctxt->poll_type))user_val;
 		break;
 	case HFI1_CMD_ACK_EVENT:
-		ret = user_event_ack(uctxt, subctxt_fp(fp), user_val);
+		ret = user_event_ack(uctxt, fd->subctxt, user_val);
 		break;
 	case HFI1_CMD_SET_PKEY:
 		if (HFI1_CAP_IS_USET(PKEY_CHECK))
-			ret = set_ctxt_pkey(uctxt, subctxt_fp(fp), user_val);
+			ret = set_ctxt_pkey(uctxt, fd->subctxt, user_val);
 		else
 			ret = -EPERM;
 		break;
@@ -431,13 +432,13 @@ bail:
 
 static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 {
-	struct hfi1_user_sdma_pkt_q *pq;
-	struct hfi1_user_sdma_comp_q *cq;
+	struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
+	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 	int ret = 0, done = 0, reqs = 0;
 	unsigned long dim = from->nr_segs;
 
-	if (!user_sdma_comp_fp(kiocb->ki_filp) ||
-	    !user_sdma_pkt_fp(kiocb->ki_filp)) {
+	if (!cq || !pq) {
 		ret = -EIO;
 		goto done;
 	}
@@ -448,10 +449,7 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 	}
 
 	hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
-		  ctxt_fp(kiocb->ki_filp)->ctxt, subctxt_fp(kiocb->ki_filp),
-		  dim);
-	pq = user_sdma_pkt_fp(kiocb->ki_filp);
-	cq = user_sdma_comp_fp(kiocb->ki_filp);
+		  fd->uctxt->ctxt, fd->subctxt, dim);
 
 	if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) {
 		ret = -ENOSPC;
@@ -476,7 +474,8 @@ done:
 
 static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 {
-	struct hfi1_ctxtdata *uctxt;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd;
 	unsigned long flags, pfn;
 	u64 token = vma->vm_pgoff << PAGE_SHIFT,
@@ -486,7 +485,6 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 	int ret = 0;
 	u16 ctxt;
 
-	uctxt = ctxt_fp(fp);
 	if (!is_valid_mmap(token) || !uctxt ||
 	    !(vma->vm_flags & VM_SHARED)) {
 		ret = -EINVAL;
@@ -496,7 +494,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 	ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
 	subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
 	type = HFI1_MMAP_TOKEN_GET(TYPE, token);
-	if (ctxt != uctxt->ctxt || subctxt != subctxt_fp(fp)) {
+	if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
 		ret = -EINVAL;
 		goto done;
 	}
@@ -660,13 +658,12 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		vmf = 1;
 		break;
 	case SDMA_COMP: {
-		struct hfi1_user_sdma_comp_q *cq;
+		struct hfi1_user_sdma_comp_q *cq = fd->cq;
 
-		if (!user_sdma_comp_fp(fp)) {
+		if (!cq) {
 			ret = -EFAULT;
 			goto done;
 		}
-		cq = user_sdma_comp_fp(fp);
 		memaddr = (u64)cq->comps;
 		memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE);
 		flags |= VM_IO | VM_DONTEXPAND;
@@ -680,7 +677,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 
 	if ((vma->vm_end - vma->vm_start) != memlen) {
 		hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
-			  uctxt->ctxt, subctxt_fp(fp),
+			  uctxt->ctxt, fd->subctxt,
 			  (vma->vm_end - vma->vm_start), memlen);
 		ret = -EINVAL;
 		goto done;
@@ -730,7 +727,7 @@ static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
 	struct hfi1_ctxtdata *uctxt;
 	unsigned pollflag;
 
-	uctxt = ctxt_fp(fp);
+	uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
 	if (!uctxt)
 		pollflag = POLLERR;
 	else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
@@ -930,6 +927,7 @@ static int find_shared_ctxt(struct file *fp,
 {
 	int devmax, ndev, i;
 	int ret = 0;
+	struct hfi1_filedata *fd = fp->private_data;
 
 	devmax = hfi1_count_units(NULL, NULL);
 
@@ -959,10 +957,10 @@ static int find_shared_ctxt(struct file *fp,
 				ret = -EINVAL;
 				goto done;
 			}
-			ctxt_fp(fp) = uctxt;
-			subctxt_fp(fp) = uctxt->cnt++;
-			uctxt->subpid[subctxt_fp(fp)] = current->pid;
-			uctxt->active_slaves |= 1 << subctxt_fp(fp);
+			fd->uctxt = uctxt;
+			fd->subctxt  = uctxt->cnt++;
+			uctxt->subpid[fd->subctxt] = current->pid;
+			uctxt->active_slaves |= 1 << fd->subctxt;
 			ret = 1;
 			goto done;
 		}
@@ -975,6 +973,7 @@ done:
 static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 			 struct hfi1_user_info *uinfo)
 {
+	struct hfi1_filedata *fd = fp->private_data;
 	struct hfi1_ctxtdata *uctxt;
 	unsigned ctxt;
 	int ret;
@@ -1022,7 +1021,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 	 * This has to be done here so the rest of the sub-contexts find the
 	 * proper master.
 	 */
-	if (uinfo->subctxt_cnt && !subctxt_fp(fp)) {
+	if (uinfo->subctxt_cnt && !fd->subctxt) {
 		ret = init_subctxts(uctxt, uinfo);
 		/*
 		 * On error, we don't need to disable and de-allocate the
@@ -1042,7 +1041,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 	spin_lock_init(&uctxt->sdma_qlock);
 	hfi1_stats.sps_ctxts++;
 	dd->freectxts--;
-	ctxt_fp(fp) = uctxt;
+	fd->uctxt = uctxt;
 
 	return 0;
 }
@@ -1106,7 +1105,8 @@ static int user_init(struct file *fp)
 {
 	int ret;
 	unsigned int rcvctrl_ops = 0;
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 
 	/* make sure that the context has already been setup */
 	if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) {
@@ -1118,7 +1118,7 @@ static int user_init(struct file *fp)
 	 * Subctxts don't need to initialize anything since master
 	 * has done it.
 	 */
-	if (subctxt_fp(fp)) {
+	if (fd->subctxt) {
 		ret = wait_event_interruptible(uctxt->wait,
 			!test_bit(HFI1_CTXT_MASTER_UNINIT,
 			&uctxt->event_flags));
@@ -1178,8 +1178,8 @@ done:
 static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
 {
 	struct hfi1_ctxt_info cinfo;
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
 	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	int ret = 0;
 
 	memset(&cinfo, 0, sizeof(cinfo));
@@ -1189,7 +1189,7 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
 	cinfo.num_active = hfi1_count_active_units();
 	cinfo.unit = uctxt->dd->unit;
 	cinfo.ctxt = uctxt->ctxt;
-	cinfo.subctxt = subctxt_fp(fp);
+	cinfo.subctxt = fd->subctxt;
 	cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
 				uctxt->dd->rcv_entries.group_size) +
 		uctxt->expected_count;
@@ -1201,10 +1201,10 @@ static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
 	cinfo.egrtids = uctxt->egrbufs.alloced;
 	cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
 	cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
-	cinfo.sdma_ring_size = user_sdma_comp_fp(fp)->nentries;
+	cinfo.sdma_ring_size = fd->cq->nentries;
 	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
 
-	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, subctxt_fp(fp), cinfo);
+	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
 	if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
 		ret = -EFAULT;
 done:
@@ -1213,7 +1213,8 @@ done:
 
 static int setup_ctxt(struct file *fp)
 {
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 
@@ -1222,7 +1223,7 @@ static int setup_ctxt(struct file *fp)
 	 * programming of eager buffers. This is done if context sharing
 	 * is not requested or by the master process.
 	 */
-	if (!uctxt->subctxt_cnt || !subctxt_fp(fp)) {
+	if (!uctxt->subctxt_cnt || !fd->subctxt) {
 		ret = hfi1_init_ctxt(uctxt->sc);
 		if (ret)
 			goto done;
@@ -1234,7 +1235,7 @@ static int setup_ctxt(struct file *fp)
 		ret = hfi1_setup_eagerbufs(uctxt);
 		if (ret)
 			goto done;
-		if (uctxt->subctxt_cnt && !subctxt_fp(fp)) {
+		if (uctxt->subctxt_cnt && !fd->subctxt) {
 			ret = setup_subctxt(uctxt);
 			if (ret)
 				goto done;
@@ -1277,7 +1278,7 @@ static int setup_ctxt(struct file *fp)
 			uctxt->tidusemap[uctxt->tidmapcnt - 1] =
 				~((1ULL << (uctxt->numtidgroups %
 					    BITS_PER_LONG)) - 1);
-		trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0,
+		trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0,
 				       uctxt->tidusemap, uctxt->tidmapcnt);
 	}
 	ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
@@ -1292,7 +1293,8 @@ done:
 static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
 {
 	struct hfi1_base_info binfo;
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	ssize_t sz;
 	unsigned offset;
@@ -1314,50 +1316,50 @@ static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
 	offset = ((u64)uctxt->sc->hw_free -
 		  (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
 	binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
-					       subctxt_fp(fp), offset);
+						fd->subctxt, offset);
 	binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
-					    subctxt_fp(fp),
+					    fd->subctxt,
 					    uctxt->sc->base_addr);
 	binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
 						uctxt->ctxt,
-						subctxt_fp(fp),
+						fd->subctxt,
 						uctxt->sc->base_addr);
 	binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
-					       subctxt_fp(fp),
+					       fd->subctxt,
 					       uctxt->rcvhdrq);
 	binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
-					       subctxt_fp(fp),
+					       fd->subctxt,
 					       uctxt->egrbufs.rcvtids[0].phys);
 	binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
-						 subctxt_fp(fp), 0);
+						 fd->subctxt, 0);
 	/*
 	 * user regs are at
 	 * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
 	 */
 	binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
-					    subctxt_fp(fp), 0);
+					    fd->subctxt, 0);
 	offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
-		    HFI1_MAX_SHARED_CTXTS) + subctxt_fp(fp)) *
+		    HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
 		  sizeof(*dd->events));
 	binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
-					      subctxt_fp(fp),
+					      fd->subctxt,
 					      offset);
 	binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
-					      subctxt_fp(fp),
+					      fd->subctxt,
 					      dd->status);
 	if (HFI1_CAP_IS_USET(DMA_RTAIL))
 		binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
-						       subctxt_fp(fp), 0);
+						       fd->subctxt, 0);
 	if (uctxt->subctxt_cnt) {
 		binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
 							uctxt->ctxt,
-							subctxt_fp(fp), 0);
+							fd->subctxt, 0);
 		binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
 							 uctxt->ctxt,
-							 subctxt_fp(fp), 0);
+							 fd->subctxt, 0);
 		binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
 							 uctxt->ctxt,
-							 subctxt_fp(fp), 0);
+							 fd->subctxt, 0);
 	}
 	sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
 	if (copy_to_user(ubase, &binfo, sz))
@@ -1368,7 +1370,8 @@ static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
 static unsigned int poll_urgent(struct file *fp,
 				struct poll_table_struct *pt)
 {
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	unsigned pollflag;
 
@@ -1390,7 +1393,8 @@ static unsigned int poll_urgent(struct file *fp,
 static unsigned int poll_next(struct file *fp,
 			      struct poll_table_struct *pt)
 {
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	unsigned pollflag;
 
@@ -1562,7 +1566,8 @@ static inline unsigned num_free_groups(unsigned long map, u16 *start)
 static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 {
 	int ret = 0;
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	unsigned tid, mapped = 0, npages, ngroups, exp_groups,
 		tidpairs = uctxt->expected_count / 2;
@@ -1718,7 +1723,7 @@ static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 						   pages[pmapped], 0,
 						   tidsize, PCI_DMA_FROMDEVICE);
 				trace_hfi1_exp_rcv_set(uctxt->ctxt,
-						       subctxt_fp(fp),
+						       fd->subctxt,
 						       tid, vaddr,
 						       phys[pmapped],
 						       pages[pmapped]);
@@ -1763,7 +1768,7 @@ static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 			   (((useidx & 0xffffff) << 16) |
 			    ((bitidx + bits_used) & 0xffffff)));
 	}
-	trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, uctxt->tidusemap,
+	trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0, uctxt->tidusemap,
 			       uctxt->tidmapcnt);
 
 done:
@@ -1792,7 +1797,8 @@ bail:
 
 static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
 {
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	unsigned long tidmap[uctxt->tidmapcnt];
 	struct page **pages;
@@ -1828,7 +1834,7 @@ static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
 					hfi1_put_tid(dd, tid, PT_INVALID,
 						      0, 0);
 					trace_hfi1_exp_rcv_free(uctxt->ctxt,
-								subctxt_fp(fp),
+								fd->subctxt,
 								tid, phys[i],
 								pages[i]);
 					pci_unmap_page(dd->pcidev, phys[i],
@@ -1845,7 +1851,7 @@ static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
 			map &= ~(1ULL<<bitidx);
 		}
 	}
-	trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 1, uctxt->tidusemap,
+	trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 1, uctxt->tidusemap,
 			       uctxt->tidmapcnt);
 done:
 	return ret;
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
index 190f7a2f6773..755e4d152d41 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -1423,18 +1423,6 @@ int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr,
 void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf,
 			   u64 pbc, const void *from, size_t count);
 
-/* for use in system calls, where we want to know device type, etc. */
-#define ctxt_fp(fp) \
-	(((struct hfi1_filedata *)(fp)->private_data)->uctxt)
-#define subctxt_fp(fp) \
-	(((struct hfi1_filedata *)(fp)->private_data)->subctxt)
-#define tidcursor_fp(fp) \
-	(((struct hfi1_filedata *)(fp)->private_data)->tidcursor)
-#define user_sdma_pkt_fp(fp) \
-	(((struct hfi1_filedata *)(fp)->private_data)->pq)
-#define user_sdma_comp_fp(fp) \
-	(((struct hfi1_filedata *)(fp)->private_data)->cq)
-
 static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd)
 {
 	return ppd->dd;
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index 36c838dcf023..be7a4e53335f 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -352,6 +352,7 @@ static void sdma_kmem_cache_ctor(void *obj)
 
 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 {
+	struct hfi1_filedata *fd;
 	int ret = 0;
 	unsigned memsize;
 	char buf[64];
@@ -365,6 +366,8 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 		goto done;
 	}
 
+	fd = fp->private_data;
+
 	if (!hfi1_sdma_comp_ring_size) {
 		ret = -EINVAL;
 		goto done;
@@ -384,7 +387,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 	INIT_LIST_HEAD(&pq->list);
 	pq->dd = dd;
 	pq->ctxt = uctxt->ctxt;
-	pq->subctxt = subctxt_fp(fp);
+	pq->subctxt = fd->subctxt;
 	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 	pq->state = SDMA_PKT_Q_INACTIVE;
 	atomic_set(&pq->n_reqs, 0);
@@ -393,7 +396,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 		    activate_packet_queue);
 	pq->reqidx = 0;
 	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
-		 subctxt_fp(fp));
+		 fd->subctxt);
 	pq->txreq_cache = kmem_cache_create(buf,
 			       sizeof(struct user_sdma_txreq),
 					    L1_CACHE_BYTES,
@@ -404,7 +407,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 			   uctxt->ctxt);
 		goto pq_txreq_nomem;
 	}
-	user_sdma_pkt_fp(fp) = pq;
+	fd->pq = pq;
 	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 	if (!cq)
 		goto cq_nomem;
@@ -416,7 +419,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
 		goto cq_comps_nomem;
 
 	cq->nentries = hfi1_sdma_comp_ring_size;
-	user_sdma_comp_fp(fp) = cq;
+	fd->cq = cq;
 
 	spin_lock_irqsave(&uctxt->sdma_qlock, flags);
 	list_add(&pq->list, &uctxt->sdma_queues);
@@ -431,7 +434,7 @@ pq_txreq_nomem:
 	kfree(pq->reqs);
 pq_reqs_nomem:
 	kfree(pq);
-	user_sdma_pkt_fp(fp) = NULL;
+	fd->pq = NULL;
 pq_nomem:
 	ret = -ENOMEM;
 done:
@@ -485,9 +488,10 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 				   unsigned long dim, unsigned long *count)
 {
 	int ret = 0, i = 0, sent;
-	struct hfi1_ctxtdata *uctxt = ctxt_fp(fp);
-	struct hfi1_user_sdma_pkt_q *pq = user_sdma_pkt_fp(fp);
-	struct hfi1_user_sdma_comp_q *cq = user_sdma_comp_fp(fp);
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
+	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 	struct hfi1_devdata *dd = pq->dd;
 	unsigned long idx = 0;
 	u8 pcount = initial_pkt_count;
@@ -499,7 +503,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		hfi1_cdbg(
 		   SDMA,
 		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
-		   dd->unit, uctxt->ctxt, subctxt_fp(fp),
+		   dd->unit, uctxt->ctxt, fd->subctxt,
 		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 		ret = -EINVAL;
 		goto done;
@@ -507,15 +511,15 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 	if (ret) {
 		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
-			  dd->unit, uctxt->ctxt, subctxt_fp(fp), ret);
+			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 		ret = -EFAULT;
 		goto done;
 	}
-	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, subctxt_fp(fp),
+	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 				     (u16 *)&info);
 	if (cq->comps[info.comp_idx].status == QUEUED) {
 		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
-			  dd->unit, uctxt->ctxt, subctxt_fp(fp),
+			  dd->unit, uctxt->ctxt, fd->subctxt,
 			  info.comp_idx);
 		ret = -EBADSLT;
 		goto done;
@@ -523,7 +527,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 	if (!info.fragsize) {
 		hfi1_cdbg(SDMA,
 			  "[%u:%u:%u:%u] Request does not specify fragsize",
-			  dd->unit, uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 		ret = -EINVAL;
 		goto done;
 	}
@@ -532,7 +536,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 	 * "allocate" the request entry.
 	 */
 	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
-		  uctxt->ctxt, subctxt_fp(fp), info.comp_idx);
+		  uctxt->ctxt, fd->subctxt, info.comp_idx);
 	req = pq->reqs + info.comp_idx;
 	memset(req, 0, sizeof(*req));
 	/* Mark the request as IN_USE before we start filling it in. */
@@ -659,7 +663,7 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 
 	/* Have to select the engine */
 	req->sde = sdma_select_engine_vl(dd,
-					 (u32)(uctxt->ctxt + subctxt_fp(fp)),
+					 (u32)(uctxt->ctxt + fd->subctxt),
 					 vl);
 	if (!req->sde || !sdma_running(req->sde)) {
 		ret = -ECOMM;
-- 
1.8.2


* [PATCH 2/9] staging/rdma/hfi1: Clean up macro indentation
From: ira.weiny-ral2JQCrhuEAvxtiuMwx3w @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w, Mitko Haralanov,
	Ira Weiny

From: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

In preparation for implementing Expected TID caching, we do some simple cleanup
of the header file macros.

Signed-off-by: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/staging/rdma/hfi1/common.h | 15 ++++++++-------
 include/uapi/rdma/hfi/hfi1_user.h  | 26 +++++++++++++-------------
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h
index 5e203239c5b0..5dd92720faae 100644
--- a/drivers/staging/rdma/hfi1/common.h
+++ b/drivers/staging/rdma/hfi1/common.h
@@ -132,13 +132,14 @@
  * HFI1_CAP_RESERVED_MASK bits.
  */
 #define HFI1_CAP_WRITABLE_MASK   (HFI1_CAP_SDMA_AHG |			\
-				 HFI1_CAP_HDRSUPP |			\
-				 HFI1_CAP_MULTI_PKT_EGR |		\
-				 HFI1_CAP_NODROP_RHQ_FULL |		\
-				 HFI1_CAP_NODROP_EGR_FULL |		\
-				 HFI1_CAP_ALLOW_PERM_JKEY |		\
-				 HFI1_CAP_STATIC_RATE_CTRL |		\
-				 HFI1_CAP_PRINT_UNIMPL)
+				  HFI1_CAP_HDRSUPP |			\
+				  HFI1_CAP_MULTI_PKT_EGR |		\
+				  HFI1_CAP_NODROP_RHQ_FULL |		\
+				  HFI1_CAP_NODROP_EGR_FULL |		\
+				  HFI1_CAP_ALLOW_PERM_JKEY |		\
+				  HFI1_CAP_STATIC_RATE_CTRL |		\
+				  HFI1_CAP_PRINT_UNIMPL |		\
+				  HFI1_CAP_TID_UNMAP)
 /*
  * A set of capability bits that are "global" and are not allowed to be
  * set in the user bitmask.
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index 599562fe5d57..a2fc6cbfe414 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -127,13 +127,13 @@
 #define HFI1_CMD_TID_UPDATE      4	/* update expected TID entries */
 #define HFI1_CMD_TID_FREE        5	/* free expected TID entries */
 #define HFI1_CMD_CREDIT_UPD      6	/* force an update of PIO credit */
-#define HFI1_CMD_SDMA_STATUS_UPD 7       /* force update of SDMA status ring */
+#define HFI1_CMD_SDMA_STATUS_UPD 7      /* force update of SDMA status ring */
 
 #define HFI1_CMD_RECV_CTRL       8	/* control receipt of packets */
 #define HFI1_CMD_POLL_TYPE       9	/* set the kind of polling we want */
 #define HFI1_CMD_ACK_EVENT       10	/* ack & clear user status bits */
-#define HFI1_CMD_SET_PKEY        11      /* set context's pkey */
-#define HFI1_CMD_CTXT_RESET      12      /* reset context's HW send context */
+#define HFI1_CMD_SET_PKEY        11     /* set context's pkey */
+#define HFI1_CMD_CTXT_RESET      12     /* reset context's HW send context */
 /* separate EPROM commands from normal PSM commands */
 #define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
 #define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
@@ -144,18 +144,18 @@
 #define HFI1_CMD_EP_WRITE_P0     70      /* write EPROM partition 0 */
 #define HFI1_CMD_EP_WRITE_P1     71      /* write EPROM partition 1 */
 
-#define _HFI1_EVENT_FROZEN_BIT       0
-#define _HFI1_EVENT_LINKDOWN_BIT     1
-#define _HFI1_EVENT_LID_CHANGE_BIT   2
-#define _HFI1_EVENT_LMC_CHANGE_BIT   3
-#define _HFI1_EVENT_SL2VL_CHANGE_BIT 4
+#define _HFI1_EVENT_FROZEN_BIT         0
+#define _HFI1_EVENT_LINKDOWN_BIT       1
+#define _HFI1_EVENT_LID_CHANGE_BIT     2
+#define _HFI1_EVENT_LMC_CHANGE_BIT     3
+#define _HFI1_EVENT_SL2VL_CHANGE_BIT   4
 #define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_SL2VL_CHANGE_BIT
 
-#define HFI1_EVENT_FROZEN                (1UL << _HFI1_EVENT_FROZEN_BIT)
-#define HFI1_EVENT_LINKDOWN_BIT		(1UL << _HFI1_EVENT_LINKDOWN_BIT)
-#define HFI1_EVENT_LID_CHANGE_BIT	(1UL << _HFI1_EVENT_LID_CHANGE_BIT)
-#define HFI1_EVENT_LMC_CHANGE_BIT	(1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
-#define HFI1_EVENT_SL2VL_CHANGE_BIT	(1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
+#define HFI1_EVENT_FROZEN            (1UL << _HFI1_EVENT_FROZEN_BIT)
+#define HFI1_EVENT_LINKDOWN          (1UL << _HFI1_EVENT_LINKDOWN_BIT)
+#define HFI1_EVENT_LID_CHANGE        (1UL << _HFI1_EVENT_LID_CHANGE_BIT)
+#define HFI1_EVENT_LMC_CHANGE        (1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
+#define HFI1_EVENT_SL2VL_CHANGE      (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
 
 /*
  * These are the status bits readable (in ASCII form, 64bit value)
-- 
1.8.2



* [PATCH 3/9] staging/rdma/hfi1: Remove unnecessary include files
From: ira.weiny @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh, devel; +Cc: linux-rdma, Mitko Haralanov, dledford, dennis.dalessandro

From: Mitko Haralanov <mitko.haralanov@intel.com>

These include files are not used in file_ops.c.

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/staging/rdma/hfi1/file_ops.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
index f78bcf673252..b8aeef0e0d39 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -47,20 +47,10 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
-#include <linux/pci.h>
 #include <linux/poll.h>
 #include <linux/cdev.h>
-#include <linux/swap.h>
 #include <linux/vmalloc.h>
-#include <linux/highmem.h>
 #include <linux/io.h>
-#include <linux/jiffies.h>
-#include <asm/pgtable.h>
-#include <linux/delay.h>
-#include <linux/export.h>
-#include <linux/module.h>
-#include <linux/cred.h>
-#include <linux/uio.h>
 
 #include "hfi.h"
 #include "pio.h"
-- 
1.8.2


* [PATCH 4/9] staging/rdma/hfi1: Move macros to a common header
From: ira.weiny @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh, devel; +Cc: linux-rdma, Mitko Haralanov, dledford, dennis.dalessandro

From: Mitko Haralanov <mitko.haralanov@intel.com>

In preparation for implementing TID caching, move the EXP_TID macros to a
common header, user_exp_rcv.h.
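
For illustration, this is how the moved macros pack a TID entry; a hedged
sketch only (example_pack_tid() and its argument names are made up and are not
part of the patch):

  static u64 example_pack_tid(u32 rcventry, u16 npages)
  {
          u64 tid = 0;

          EXP_TID_RESET(tid, LEN, npages);   /* length of the mapping, in pages */
          EXP_TID_RESET(tid, CTRL, 1);       /* TID control value */
          EXP_TID_RESET(tid, IDX, rcventry); /* RcvArray index */

          /* EXP_TID_GET(tid, LEN) now recovers npages */
          return tid;
  }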

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/staging/rdma/hfi1/file_ops.c     | 13 +-----
 drivers/staging/rdma/hfi1/user_exp_rcv.h | 74 ++++++++++++++++++++++++++++++++
 drivers/staging/rdma/hfi1/user_sdma.h    | 10 +----
 3 files changed, 76 insertions(+), 21 deletions(-)
 create mode 100644 drivers/staging/rdma/hfi1/user_exp_rcv.h

diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
index b8aeef0e0d39..3b15eb4d5bf9 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -58,6 +58,7 @@
 #include "common.h"
 #include "trace.h"
 #include "user_sdma.h"
+#include "user_exp_rcv.h"
 #include "eprom.h"
 
 #undef pr_fmt
@@ -160,18 +161,6 @@ enum mmap_types {
 	HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
 	HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
 
-#define EXP_TID_SET(field, value)			\
-	(((value) & EXP_TID_TID##field##_MASK) <<	\
-	 EXP_TID_TID##field##_SHIFT)
-#define EXP_TID_CLEAR(tid, field) {					\
-		(tid) &= ~(EXP_TID_TID##field##_MASK <<			\
-			   EXP_TID_TID##field##_SHIFT);			\
-			}
-#define EXP_TID_RESET(tid, field, value) do {				\
-		EXP_TID_CLEAR(tid, field);				\
-		(tid) |= EXP_TID_SET(field, value);			\
-	} while (0)
-
 #define dbg(fmt, ...)				\
 	pr_info(fmt, ##__VA_ARGS__)
 
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/staging/rdma/hfi1/user_exp_rcv.h
new file mode 100644
index 000000000000..4f4876e1d353
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.h
@@ -0,0 +1,74 @@
+#ifndef _HFI1_USER_EXP_RCV_H
+#define _HFI1_USER_EXP_RCV_H
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define EXP_TID_TIDLEN_MASK   0x7FFULL
+#define EXP_TID_TIDLEN_SHIFT  0
+#define EXP_TID_TIDCTRL_MASK  0x3ULL
+#define EXP_TID_TIDCTRL_SHIFT 20
+#define EXP_TID_TIDIDX_MASK   0x3FFULL
+#define EXP_TID_TIDIDX_SHIFT  22
+#define EXP_TID_GET(tid, field)	\
+	(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+
+#define EXP_TID_SET(field, value)			\
+	(((value) & EXP_TID_TID##field##_MASK) <<	\
+	 EXP_TID_TID##field##_SHIFT)
+#define EXP_TID_CLEAR(tid, field) ({					\
+		(tid) &= ~(EXP_TID_TID##field##_MASK <<			\
+			   EXP_TID_TID##field##_SHIFT);			\
+		})
+#define EXP_TID_RESET(tid, field, value) do {				\
+		EXP_TID_CLEAR(tid, field);				\
+		(tid) |= EXP_TID_SET(field, (value));			\
+	} while (0)
+
+#endif /* _HFI1_USER_EXP_RCV_H */
diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h
index fa4422553e23..0046ffa774fe 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.h
+++ b/drivers/staging/rdma/hfi1/user_sdma.h
@@ -52,15 +52,7 @@
 
 #include "common.h"
 #include "iowait.h"
-
-#define EXP_TID_TIDLEN_MASK   0x7FFULL
-#define EXP_TID_TIDLEN_SHIFT  0
-#define EXP_TID_TIDCTRL_MASK  0x3ULL
-#define EXP_TID_TIDCTRL_SHIFT 20
-#define EXP_TID_TIDIDX_MASK   0x7FFULL
-#define EXP_TID_TIDIDX_SHIFT  22
-#define EXP_TID_GET(tid, field)	\
-	(((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK)
+#include "user_exp_rcv.h"
 
 extern uint extended_psn;
 
-- 
1.8.2


* [PATCH 5/9] staging/rdma/hfi1: Add function stubs for TID caching
From: ira.weiny @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh, devel; +Cc: linux-rdma, Mitko Haralanov, dledford, dennis.dalessandro

From: Mitko Haralanov <mitko.haralanov@intel.com>

Add MMU notifier helper functions and TID caching function stubs in preparation
for the TID caching implementation.

TID caching makes use of the MMU notifier to allow the driver to respond to the
user freeing memory which is allocated to the HFI.

This patch implements the basic MMU notifier functions needed to insert, find,
and remove buffer pages from memory when the mmu_notifier is invoked.

In addition, it adds stubs for the main entry points that will be used by
follow-on code.

Follow-up patches will complete the implementation of the interaction with user
space and make use of these functions.
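
As a rough sketch of how the notifier defined here would eventually be hooked
up (the container struct and function below are hypothetical; only mn_opts and
the mmu_notifier_register()/RB_ROOT kernel APIs are real):

  struct example_tid_cache {                       /* hypothetical container */
          struct mmu_notifier mn;
          struct rb_root root;
  };

  static int example_register_notifier(struct example_tid_cache *cache)
  {
          cache->root = RB_ROOT;
          cache->mn.ops = &mn_opts;                /* ops added by this patch */
          /* invalidations of current->mm will now call back into mn_opts */
          return mmu_notifier_register(&cache->mn, current->mm);
  }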

Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/staging/rdma/hfi1/Kconfig        |   1 +
 drivers/staging/rdma/hfi1/Makefile       |   2 +-
 drivers/staging/rdma/hfi1/user_exp_rcv.c | 314 +++++++++++++++++++++++++++++++
 drivers/staging/rdma/hfi1/user_exp_rcv.h |   8 +
 4 files changed, 324 insertions(+), 1 deletion(-)
 create mode 100644 drivers/staging/rdma/hfi1/user_exp_rcv.c

diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig
index fd25078ee923..bd0249bcf199 100644
--- a/drivers/staging/rdma/hfi1/Kconfig
+++ b/drivers/staging/rdma/hfi1/Kconfig
@@ -1,6 +1,7 @@
 config INFINIBAND_HFI1
 	tristate "Intel OPA Gen1 support"
 	depends on X86_64
+	select MMU_NOTIFIER
 	default m
 	---help---
 	This is a low-level driver for Intel OPA Gen1 adapter.
diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile
index 2e5daa6cdcc2..00c47b788666 100644
--- a/drivers/staging/rdma/hfi1/Makefile
+++ b/drivers/staging/rdma/hfi1/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
 hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o eprom.o file_ops.o firmware.o \
 	init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \
 	qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \
-	uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o
+	uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs_mcast.o verbs.o
 hfi1-$(CONFIG_DEBUG_FS) += debugfs.o
 
 CFLAGS_trace.o = -I$(src)
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
new file mode 100644
index 000000000000..d8066c646f0c
--- /dev/null
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -0,0 +1,314 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <asm/page.h>
+
+#include "user_exp_rcv.h"
+#include "trace.h"
+
+
+struct mmu_rb_node {
+	struct rb_node rbnode;
+	unsigned long virt;
+	unsigned long phys;
+	unsigned long len;
+	struct tid_group *grp;
+	u32 rcventry;
+	dma_addr_t dma_addr;
+	bool freed;
+	unsigned npages;
+	struct page *pages[0];
+};
+
+enum mmu_call_types {
+	MMU_INVALIDATE_PAGE = 0,
+	MMU_INVALIDATE_RANGE = 1
+};
+
+static const char * const mmu_types[] = {
+	"PAGE",
+	"RANGE"
+};
+
+static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
+			       unsigned long);
+static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *,
+						 unsigned long);
+static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *,
+							 u32);
+static int mmu_rb_insert_by_addr(struct rb_root *, struct mmu_rb_node *);
+static int mmu_rb_insert_by_entry(struct rb_root *, struct mmu_rb_node *);
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *,
+					unsigned long, unsigned long,
+					enum mmu_call_types);
+static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
+				     unsigned long);
+static inline void mmu_notifier_range_start(struct mmu_notifier *,
+					    struct mm_struct *,
+					    unsigned long, unsigned long);
+
+
+static struct mmu_notifier_ops mn_opts = {
+	.invalidate_page = mmu_notifier_page,
+	.invalidate_range_start = mmu_notifier_range_start,
+};
+
+/*
+ * Initialize context and file private data needed for Expected
+ * receive caching. This needs to be done after the context has
+ * been configured with the eager/expected RcvEntry counts.
+ */
+int hfi1_user_exp_rcv_init(struct file *fp)
+{
+	return -EINVAL;
+}
+
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
+{
+	return -EINVAL;
+}
+
+/*
+ * RcvArray entry allocation for Expected Receives is done by the
+ * following algorithm:
+ *
+ * The context keeps 3 lists of groups of RcvArray entries:
+ *   1. List of empty groups - tid_group_list
+ *      This list is created during user context creation and
+ *      contains elements which describe sets (of 8) of empty
+ *      RcvArray entries.
+ *   2. List of partially used groups - tid_used_list
+ *      This list contains sets of RcvArray entries which are
+ *      not completely used up. Another mapping request could
+ *      use some or all of the remaining entries.
+ *   3. List of full groups - tid_full_list
+ *      This is the list where sets that are completely used
+ *      up go.
+ *
+ * An attempt to optimize the usage of RcvArray entries is
+ * made by finding all sets of physically contiguous pages in a
+ * user's buffer.
+ * These physically contiguous sets are further split into
+ * sizes supported by the receive engine of the HFI. The
+ * resulting sets of pages are stored in struct tid_pageset,
+ * which describes the sets as:
+ *    * .count - number of pages in this set
+ *    * .idx - starting index into struct page ** array
+ *                    of this set
+ *
+ * From this point on, the algorithm deals with the page sets
+ * described above. The number of pagesets is divided by the
+ * RcvArray group size to produce the number of full groups
+ * needed.
+ *
+ * Groups from the 3 lists are manipulated using the following
+ * rules:
+ *   1. For each set of 8 pagesets, a complete group from
+ *      tid_group_list is taken, programmed, and moved to
+ *      the tid_full_list list.
+ *   2. For all remaining pagesets:
+ *      2.1 If the tid_used_list is empty and the tid_group_list
+ *          is empty, stop processing pagesets and return only
+ *          what has been programmed up to this point.
+ *      2.2 If the tid_used_list is empty and the tid_group_list
+ *          is not empty, move a group from tid_group_list to
+ *          tid_used_list.
+ *      2.3 For each group in tid_used_list, program as much as
+ *          can fit into the group. If the group becomes fully
+ *          used, move it to tid_full_list.
+ */
+int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+	return -EINVAL;
+}
+
+int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+	return -EINVAL;
+}
+
+int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
+{
+	return -EINVAL;
+}
+
+static inline void mmu_notifier_page(struct mmu_notifier *mn,
+				     struct mm_struct *mm, unsigned long addr)
+{
+	mmu_notifier_mem_invalidate(mn, addr, addr + PAGE_SIZE,
+				    MMU_INVALIDATE_PAGE);
+}
+
+static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
+					    struct mm_struct *mm,
+					    unsigned long start,
+					    unsigned long end)
+{
+	mmu_notifier_mem_invalidate(mn, start, end, MMU_INVALIDATE_RANGE);
+}
+
+static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
+					unsigned long start, unsigned long end,
+					enum mmu_call_types type)
+{
+	/* Stub for now */
+	return;
+}
+
+static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr,
+			       unsigned long len)
+{
+	if ((addr + len) <= node->virt)
+		return -1;
+	else if (addr >= node->virt && addr < (node->virt + node->len))
+		return 0;
+	else
+		return 1;
+}
+
+static inline int mmu_entry_cmp(struct mmu_rb_node *node, u32 entry)
+{
+	if (entry < node->rcventry)
+		return -1;
+	else if (entry > node->rcventry)
+		return 1;
+	else
+		return 0;
+}
+
+static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *root,
+						 unsigned long addr)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct mmu_rb_node *mnode =
+			container_of(node, struct mmu_rb_node, rbnode);
+		/*
+		 * When searching, use at least one page length for size. The
+		 * MMU notifier will not give us anything less than that. We
+		 * also don't need anything more than a page because we are
+		 * guaranteed to have non-overlapping buffers in the tree.
+		 */
+		int result = mmu_addr_cmp(mnode, addr, PAGE_SIZE);
+
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return mnode;
+	}
+	return NULL;
+}
+
+static inline struct mmu_rb_node *mmu_rb_search_by_entry(struct rb_root *root,
+							 u32 index)
+{
+	struct mmu_rb_node *rbnode;
+	struct rb_node *node;
+
+	if (root && !RB_EMPTY_ROOT(root))
+		for (node = rb_first(root); node; node = rb_next(node)) {
+			rbnode = rb_entry(node, struct mmu_rb_node, rbnode);
+			if (rbnode->rcventry == index)
+				return rbnode;
+		}
+	return NULL;
+}
+
+static int mmu_rb_insert_by_entry(struct rb_root *root,
+				  struct mmu_rb_node *node)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	while (*new) {
+		struct mmu_rb_node *this =
+			container_of(*new, struct mmu_rb_node, rbnode);
+		int result = mmu_entry_cmp(this, node->rcventry);
+
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else if (result > 0)
+			new = &((*new)->rb_right);
+		else
+			return 1;
+	}
+
+	rb_link_node(&node->rbnode, parent, new);
+	rb_insert_color(&node->rbnode, root);
+	return 0;
+}
+
+static int mmu_rb_insert_by_addr(struct rb_root *root, struct mmu_rb_node *node)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct mmu_rb_node *this =
+			container_of(*new, struct mmu_rb_node, rbnode);
+		int result = mmu_addr_cmp(this, node->virt, node->len);
+
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else if (result > 0)
+			new = &((*new)->rb_right);
+		else
+			return 1;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&node->rbnode, parent, new);
+	rb_insert_color(&node->rbnode, root);
+
+	return 0;
+}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.h b/drivers/staging/rdma/hfi1/user_exp_rcv.h
index 4f4876e1d353..28ef98a45a1e 100644
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.h
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.h
@@ -50,6 +50,8 @@
  *
  */
 
+#include "hfi.h"
+
 #define EXP_TID_TIDLEN_MASK   0x7FFULL
 #define EXP_TID_TIDLEN_SHIFT  0
 #define EXP_TID_TIDCTRL_MASK  0x3ULL
@@ -71,4 +73,10 @@
 		(tid) |= EXP_TID_SET(field, (value));			\
 	} while (0)
 
+int hfi1_user_exp_rcv_init(struct file *);
+int hfi1_user_exp_rcv_free(struct hfi1_filedata *);
+int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *);
+int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *);
+
 #endif /* _HFI1_USER_EXP_RCV_H */
-- 
1.8.2


* [PATCH v4 6/9] staging/rdma/hfi1: Implement Expected Receive TID caching
From: ira.weiny-ral2JQCrhuEAvxtiuMwx3w @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w, Mitko Haralanov,
	Ira Weiny

From: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Expected receives work by user-space libraries (PSM) calling into the driver
with information about the user's receive buffer; the driver then DMA-maps that
buffer and programs the HFI to receive data directly into it.

This is an expensive operation as it requires the driver to pin the pages which
the user's buffer maps to, DMA-map them, and then program the HFI.

When the receive is complete, user-space libraries have to call into the driver
again so the buffer is removed from the HFI, un-mapped, and the pages unpinned.

All of these operations are expensive, considering that a lot of applications
(especially micro-benchmarks) use the same buffer over and over.

In order to get better performance for user-space applications, it is highly
beneficial that they don't continuously call into the driver to register and
unregister the same buffer. Rather, they can register the buffer and cache it
for future work. The buffer can be unregistered when it is freed by the user.

This change implements such buffer caching by making use of the kernel's MMU
notifier API. User-space libraries call into the driver only when they need to
register a new buffer.
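
For reference, the user-space side of a registration request looks roughly like
the sketch below. It uses the write()-based command interface handled by
hfi1_file_write(); tid_update() itself is hypothetical, and the exact
hfi1_cmd/hfi1_tid_info layouts are assumed from hfi1_user.h:

  #include <stdint.h>
  #include <string.h>
  #include <unistd.h>
  #include <rdma/hfi/hfi1_user.h>

  /* hypothetical helper: program 'buf' into the HFI's expected receive array */
  static int tid_update(int dev_fd, void *buf, size_t len,
                        struct hfi1_tid_info *tinfo, void *tidlist_buf)
  {
          struct hfi1_cmd cmd;

          memset(tinfo, 0, sizeof(*tinfo));
          tinfo->vaddr   = (uintptr_t)buf;         /* user receive buffer */
          tinfo->length  = len;
          tinfo->tidlist = (uintptr_t)tidlist_buf; /* driver returns TIDs here */

          memset(&cmd, 0, sizeof(cmd));
          cmd.type = HFI1_CMD_TID_UPDATE;
          cmd.len  = sizeof(*tinfo);
          cmd.addr = (uintptr_t)tinfo;

          if (write(dev_fd, &cmd, sizeof(cmd)) < 0)
                  return -1;
          return tinfo->tidcnt;                    /* filled in by the driver */
  }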

Once a buffer is registered, it stays programmed into the HFI until the kernel
notifies the driver that the buffer has been freed by the user. At that time,
the user-space library is notified and it can do the necessary work to remove
the buffer from its cache.

Buffers which have been invalidated by the kernel are not automatically removed
from the HFI and do not have their pages unpinned. Buffers are only completely
removed when the user-space libraries call into the driver to free them.  This
is done to ensure that any ongoing transfers into that buffer are complete.
This is important when a buffer is not completely freed but rather shrunk,
since the user-space library could still have uncompleted transfers into the
remaining portion of the buffer.

With this feature, it is important that systems are set up with reasonable
limits for the amount of lockable memory.  Keeping the limit at "unlimited" (as
we've done up to this point) may result in jobs being killed by the kernel's
OOM killer due to them taking up excessive amounts of memory.
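
As an aside, a launcher or test harness could sanity-check the lockable-memory
limit before relying on the cache; a minimal user-space sketch (not part of
this patch):

  #include <stdio.h>
  #include <sys/resource.h>

  static void warn_if_memlock_unlimited(void)
  {
          struct rlimit rl;

          if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0 &&
              rl.rlim_cur == RLIM_INFINITY)
                  fprintf(stderr,
                          "RLIMIT_MEMLOCK is unlimited; consider a finite cap\n");
  }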

Reviewed-by: Arthur Kepner <arthur.kepner-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

---
Changes from V3:
	Reworked based on the removal of the file pointer macros
	Split out some prep patches and code clean up

Changes from V2:
	Fix random Kconfig 0-day build error
	Fix leak of random memory to user space caught by Dan Carpenter
	Separate out pointer bug fix into a previous patch
	Change error checks in case statement per Dan's comments

 drivers/staging/rdma/hfi1/file_ops.c     | 469 ++---------------
 drivers/staging/rdma/hfi1/hfi.h          |  43 +-
 drivers/staging/rdma/hfi1/init.c         |   5 +-
 drivers/staging/rdma/hfi1/trace.h        | 132 +++--
 drivers/staging/rdma/hfi1/user_exp_rcv.c | 874 ++++++++++++++++++++++++++++++-
 drivers/staging/rdma/hfi1/user_pages.c   | 110 +---
 drivers/staging/rdma/hfi1/user_sdma.c    |  13 +
 include/uapi/rdma/hfi/hfi1_user.h        |  14 +-
 8 files changed, 1069 insertions(+), 591 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c
index 3b15eb4d5bf9..746a12f055fe 100644
--- a/drivers/staging/rdma/hfi1/file_ops.c
+++ b/drivers/staging/rdma/hfi1/file_ops.c
@@ -96,9 +96,6 @@ static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
 static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
 static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
 static int vma_fault(struct vm_area_struct *, struct vm_fault *);
-static int exp_tid_setup(struct file *, struct hfi1_tid_info *);
-static int exp_tid_free(struct file *, struct hfi1_tid_info *);
-static void unlock_exp_tids(struct hfi1_ctxtdata *);
 
 static const struct file_operations hfi1_file_ops = {
 	.owner = THIS_MODULE,
@@ -174,8 +171,12 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
 {
 	/* The real work is performed later in assign_ctxt() */
 	fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL);
-	if (fp->private_data) /* no cpu affinity by default */
-		((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1;
+	if (fp->private_data) {
+		struct hfi1_filedata *fd = fp->private_data;
+
+		/* no cpu affinity by default */
+		fd->rec_cpu_num = -1;
+	}
 	return fp->private_data ? 0 : -ENOMEM;
 }
 
@@ -188,6 +189,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 	struct hfi1_cmd cmd;
 	struct hfi1_user_info uinfo;
 	struct hfi1_tid_info tinfo;
+	unsigned long addr;
 	ssize_t consumed = 0, copy = 0, ret = 0;
 	void *dest = NULL;
 	__u64 user_val = 0;
@@ -219,6 +221,7 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 		break;
 	case HFI1_CMD_TID_UPDATE:
 	case HFI1_CMD_TID_FREE:
+	case HFI1_CMD_TID_INVAL_READ:
 		copy = sizeof(tinfo);
 		dest = &tinfo;
 		break;
@@ -297,9 +300,8 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 			sc_return_credits(uctxt->sc);
 		break;
 	case HFI1_CMD_TID_UPDATE:
-		ret = exp_tid_setup(fp, &tinfo);
+		ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
 		if (!ret) {
-			unsigned long addr;
 			/*
 			 * Copy the number of tidlist entries we used
 			 * and the length of the buffer we registered.
@@ -314,8 +316,25 @@ static ssize_t hfi1_file_write(struct file *fp, const char __user *data,
 				ret = -EFAULT;
 		}
 		break;
+	case HFI1_CMD_TID_INVAL_READ:
+		ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
+		if (ret)
+			break;
+		addr = (unsigned long)cmd.addr +
+			offsetof(struct hfi1_tid_info, tidcnt);
+		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+				 sizeof(tinfo.tidcnt)))
+			ret = -EFAULT;
+		break;
 	case HFI1_CMD_TID_FREE:
-		ret = exp_tid_free(fp, &tinfo);
+		ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
+		if (ret)
+			break;
+		addr = (unsigned long)cmd.addr +
+			offsetof(struct hfi1_tid_info, tidcnt);
+		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
+				 sizeof(tinfo.tidcnt)))
+			ret = -EFAULT;
 		break;
 	case HFI1_CMD_RECV_CTRL:
 		ret = manage_rcvq(uctxt, fd->subctxt, (int)user_val);
@@ -736,6 +755,7 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 	mutex_lock(&hfi1_mutex);
 
 	flush_wc();
+
 	/* drain user sdma queue */
 	if (fdata->pq)
 		hfi1_user_sdma_free_queues(fdata);
@@ -785,12 +805,9 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 	uctxt->pionowait = 0;
 	uctxt->event_flags = 0;
 
-	hfi1_clear_tids(uctxt);
+	hfi1_user_exp_rcv_free(fdata);
 	hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
 
-	if (uctxt->tid_pg_list)
-		unlock_exp_tids(uctxt);
-
 	hfi1_stats.sps_ctxts--;
 	dd->freectxts++;
 	mutex_unlock(&hfi1_mutex);
@@ -994,6 +1011,7 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 	ret = sc_enable(uctxt->sc);
 	if (ret)
 		return ret;
+
 	/*
 	 * Setup shared context resources if the user-level has requested
 	 * shared contexts and this is the 'master' process.
@@ -1028,22 +1046,19 @@ static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
 			 const struct hfi1_user_info *uinfo)
 {
-	int ret = 0;
 	unsigned num_subctxts;
 
 	num_subctxts = uinfo->subctxt_cnt;
-	if (num_subctxts > HFI1_MAX_SHARED_CTXTS) {
-		ret = -EINVAL;
-		goto bail;
-	}
+	if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
+		return -EINVAL;
 
 	uctxt->subctxt_cnt = uinfo->subctxt_cnt;
 	uctxt->subctxt_id = uinfo->subctxt_id;
 	uctxt->active_slaves = 1;
 	uctxt->redirect_seq_cnt = 1;
 	set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
-bail:
-	return ret;
+
+	return 0;
 }
 
 static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
@@ -1101,7 +1116,7 @@ static int user_init(struct file *fp)
 		ret = wait_event_interruptible(uctxt->wait,
 			!test_bit(HFI1_CTXT_MASTER_UNINIT,
 			&uctxt->event_flags));
-		goto done;
+		goto expected;
 	}
 
 	/* initialize poll variables... */
@@ -1148,8 +1163,18 @@ static int user_init(struct file *fp)
 		clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
 		wake_up(&uctxt->wait);
 	}
-	ret = 0;
 
+expected:
+	/*
+	 * Expected receive has to be setup for all processes (including
+	 * shared contexts). However, it has to be done after the master
+	 * context has been fully configured as it depends on the
+	 * eager/expected split of the RcvArray entries.
+	 * Setting it up here ensures that the subcontexts will be waiting
+	 * (due to the above wait_event_interruptible()) until the master
+	 * is set up.
+	 */
+	ret = hfi1_user_exp_rcv_init(fp);
 done:
 	return ret;
 }
@@ -1203,6 +1228,7 @@ static int setup_ctxt(struct file *fp)
 	 * is not requested or by the master process.
 	 */
 	if (!uctxt->subctxt_cnt || !fd->subctxt) {
+
 		ret = hfi1_init_ctxt(uctxt->sc);
 		if (ret)
 			goto done;
@@ -1219,46 +1245,6 @@ static int setup_ctxt(struct file *fp)
 			if (ret)
 				goto done;
 		}
-		/* Setup Expected Rcv memories */
-		uctxt->tid_pg_list = vzalloc(uctxt->expected_count *
-					     sizeof(struct page **));
-		if (!uctxt->tid_pg_list) {
-			ret = -ENOMEM;
-			goto done;
-		}
-		uctxt->physshadow = vzalloc(uctxt->expected_count *
-					    sizeof(*uctxt->physshadow));
-		if (!uctxt->physshadow) {
-			ret = -ENOMEM;
-			goto done;
-		}
-		/* allocate expected TID map and initialize the cursor */
-		atomic_set(&uctxt->tidcursor, 0);
-		uctxt->numtidgroups = uctxt->expected_count /
-			dd->rcv_entries.group_size;
-		uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG +
-			!!(uctxt->numtidgroups % BITS_PER_LONG);
-		uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt *
-						sizeof(*uctxt->tidusemap),
-						GFP_KERNEL, uctxt->numa_id);
-		if (!uctxt->tidusemap) {
-			ret = -ENOMEM;
-			goto done;
-		}
-		/*
-		 * In case that the number of groups is not a multiple of
-		 * 64 (the number of groups in a tidusemap element), mark
-		 * the extra ones as used. This will effectively make them
-		 * permanently used and should never be assigned. Otherwise,
-		 * the code which checks how many free groups we have will
-		 * get completely confused about the state of the bits.
-		 */
-		if (uctxt->numtidgroups % BITS_PER_LONG)
-			uctxt->tidusemap[uctxt->tidmapcnt - 1] =
-				~((1ULL << (uctxt->numtidgroups %
-					    BITS_PER_LONG)) - 1);
-		trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0,
-				       uctxt->tidusemap, uctxt->tidmapcnt);
 	}
 	ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
 	if (ret)
@@ -1497,367 +1483,6 @@ static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
 	return 0;
 }
 
-#define num_user_pages(vaddr, len)					\
-	(1 + (((((unsigned long)(vaddr) +				\
-		 (unsigned long)(len) - 1) & PAGE_MASK) -		\
-	       ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
-
-/**
- * tzcnt - count the number of trailing zeros in a 64bit value
- * @value: the value to be examined
- *
- * Returns the number of trailing least significant zeros in the
- * the input value. If the value is zero, return the number of
- * bits of the value.
- */
-static inline u8 tzcnt(u64 value)
-{
-	return value ? __builtin_ctzl(value) : sizeof(value) * 8;
-}
-
-static inline unsigned num_free_groups(unsigned long map, u16 *start)
-{
-	unsigned free;
-	u16 bitidx = *start;
-
-	if (bitidx >= BITS_PER_LONG)
-		return 0;
-	/* "Turn off" any bits set before our bit index */
-	map &= ~((1ULL << bitidx) - 1);
-	free = tzcnt(map) - bitidx;
-	while (!free && bitidx < BITS_PER_LONG) {
-		/* Zero out the last set bit so we look at the rest */
-		map &= ~(1ULL << bitidx);
-		/*
-		 * Account for the previously checked bits and advance
-		 * the bit index. We don't have to check for bitidx
-		 * getting bigger than BITS_PER_LONG here as it would
-		 * mean extra instructions that we don't need. If it
-		 * did happen, it would push free to a negative value
-		 * which will break the loop.
-		 */
-		free = tzcnt(map) - ++bitidx;
-	}
-	*start = bitidx;
-	return free;
-}
-
-static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-	int ret = 0;
-	struct hfi1_filedata *fd = fp->private_data;
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_devdata *dd = uctxt->dd;
-	unsigned tid, mapped = 0, npages, ngroups, exp_groups,
-		tidpairs = uctxt->expected_count / 2;
-	struct page **pages;
-	unsigned long vaddr, tidmap[uctxt->tidmapcnt];
-	dma_addr_t *phys;
-	u32 tidlist[tidpairs], pairidx = 0, tidcursor;
-	u16 useidx, idx, bitidx, tidcnt = 0;
-
-	vaddr = tinfo->vaddr;
-
-	if (offset_in_page(vaddr)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	npages = num_user_pages(vaddr, tinfo->length);
-	if (!npages) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
-		       npages * PAGE_SIZE)) {
-		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-			   (void *)vaddr, npages);
-		ret = -EFAULT;
-		goto bail;
-	}
-
-	memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt);
-	memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs);
-
-	exp_groups = uctxt->expected_count / dd->rcv_entries.group_size;
-	/* which group set do we look at first? */
-	tidcursor = atomic_read(&uctxt->tidcursor);
-	useidx = (tidcursor >> 16) & 0xffff;
-	bitidx = tidcursor & 0xffff;
-
-	/*
-	 * Keep going until we've mapped all pages or we've exhausted all
-	 * RcvArray entries.
-	 * This iterates over the number of tidmaps + 1
-	 * (idx <= uctxt->tidmapcnt) so we check the bitmap which we
-	 * started from one more time for any free bits before the
-	 * starting point bit.
-	 */
-	for (mapped = 0, idx = 0;
-	     mapped < npages && idx <= uctxt->tidmapcnt;) {
-		u64 i, offset = 0;
-		unsigned free, pinned, pmapped = 0, bits_used;
-		u16 grp;
-
-		/*
-		 * "Reserve" the needed group bits under lock so other
-		 * processes can't step in the middle of it. Once
-		 * reserved, we don't need the lock anymore since we
-		 * are guaranteed the groups.
-		 */
-		spin_lock(&uctxt->exp_lock);
-		if (uctxt->tidusemap[useidx] == -1ULL ||
-		    bitidx >= BITS_PER_LONG) {
-			/* no free groups in the set, use the next */
-			useidx = (useidx + 1) % uctxt->tidmapcnt;
-			idx++;
-			bitidx = 0;
-			spin_unlock(&uctxt->exp_lock);
-			continue;
-		}
-		ngroups = ((npages - mapped) / dd->rcv_entries.group_size) +
-			!!((npages - mapped) % dd->rcv_entries.group_size);
-
-		/*
-		 * If we've gotten here, the current set of groups does have
-		 * one or more free groups.
-		 */
-		free = num_free_groups(uctxt->tidusemap[useidx], &bitidx);
-		if (!free) {
-			/*
-			 * Despite the check above, free could still come back
-			 * as 0 because we don't check the entire bitmap but
-			 * we start from bitidx.
-			 */
-			spin_unlock(&uctxt->exp_lock);
-			continue;
-		}
-		bits_used = min(free, ngroups);
-		tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx;
-		uctxt->tidusemap[useidx] |= tidmap[useidx];
-		spin_unlock(&uctxt->exp_lock);
-
-		/*
-		 * At this point, we know where in the map we have free bits.
-		 * properly offset into the various "shadow" arrays and compute
-		 * the RcvArray entry index.
-		 */
-		offset = ((useidx * BITS_PER_LONG) + bitidx) *
-			dd->rcv_entries.group_size;
-		pages = uctxt->tid_pg_list + offset;
-		phys = uctxt->physshadow + offset;
-		tid = uctxt->expected_base + offset;
-
-		/* Calculate how many pages we can pin based on free bits */
-		pinned = min((bits_used * dd->rcv_entries.group_size),
-			     (npages - mapped));
-		/*
-		 * Now that we know how many free RcvArray entries we have,
-		 * we can pin that many user pages.
-		 */
-		ret = hfi1_get_user_pages(vaddr + (mapped * PAGE_SIZE),
-					  pinned, pages);
-		if (ret) {
-			/*
-			 * We can't continue because the pages array won't be
-			 * initialized. This should never happen,
-			 * unless perhaps the user has mpin'ed the pages
-			 * themselves.
-			 */
-			dd_dev_info(dd,
-				    "Failed to lock addr %p, %u pages: errno %d\n",
-				    (void *) vaddr, pinned, -ret);
-			/*
-			 * Let go of the bits that we reserved since we are not
-			 * going to use them.
-			 */
-			spin_lock(&uctxt->exp_lock);
-			uctxt->tidusemap[useidx] &=
-				~(((1ULL << bits_used) - 1) << bitidx);
-			spin_unlock(&uctxt->exp_lock);
-			goto done;
-		}
-		/*
-		 * How many groups do we need based on how many pages we have
-		 * pinned?
-		 */
-		ngroups = (pinned / dd->rcv_entries.group_size) +
-			!!(pinned % dd->rcv_entries.group_size);
-		/*
-		 * Keep programming RcvArray entries for all the <ngroups> free
-		 * groups.
-		 */
-		for (i = 0, grp = 0; grp < ngroups; i++, grp++) {
-			unsigned j;
-			u32 pair_size = 0, tidsize;
-			/*
-			 * This inner loop will program an entire group or the
-			 * array of pinned pages (which ever limit is hit
-			 * first).
-			 */
-			for (j = 0; j < dd->rcv_entries.group_size &&
-				     pmapped < pinned; j++, pmapped++, tid++) {
-				tidsize = PAGE_SIZE;
-				phys[pmapped] = hfi1_map_page(dd->pcidev,
-						   pages[pmapped], 0,
-						   tidsize, PCI_DMA_FROMDEVICE);
-				trace_hfi1_exp_rcv_set(uctxt->ctxt,
-						       fd->subctxt,
-						       tid, vaddr,
-						       phys[pmapped],
-						       pages[pmapped]);
-				/*
-				 * Each RcvArray entry is programmed with one
-				 * page * worth of memory. This will handle
-				 * the 8K MTU as well as anything smaller
-				 * due to the fact that both entries in the
-				 * RcvTidPair are programmed with a page.
-				 * PSM currently does not handle anything
-				 * bigger than 8K MTU, so should we even worry
-				 * about 10K here?
-				 */
-				hfi1_put_tid(dd, tid, PT_EXPECTED,
-					     phys[pmapped],
-					     ilog2(tidsize >> PAGE_SHIFT) + 1);
-				pair_size += tidsize >> PAGE_SHIFT;
-				EXP_TID_RESET(tidlist[pairidx], LEN, pair_size);
-				if (!(tid % 2)) {
-					tidlist[pairidx] |=
-					   EXP_TID_SET(IDX,
-						(tid - uctxt->expected_base)
-						       / 2);
-					tidlist[pairidx] |=
-						EXP_TID_SET(CTRL, 1);
-					tidcnt++;
-				} else {
-					tidlist[pairidx] |=
-						EXP_TID_SET(CTRL, 2);
-					pair_size = 0;
-					pairidx++;
-				}
-			}
-			/*
-			 * We've programmed the entire group (or as much of the
-			 * group as we'll use. Now, it's time to push it out...
-			 */
-			flush_wc();
-		}
-		mapped += pinned;
-		atomic_set(&uctxt->tidcursor,
-			   (((useidx & 0xffffff) << 16) |
-			    ((bitidx + bits_used) & 0xffffff)));
-	}
-	trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 0, uctxt->tidusemap,
-			       uctxt->tidmapcnt);
-
-done:
-	/* If we've mapped anything, copy relevant info to user */
-	if (mapped) {
-		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
-				 tidlist, sizeof(tidlist[0]) * tidcnt)) {
-			ret = -EFAULT;
-			goto done;
-		}
-		/* copy TID info to user */
-		if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap,
-				 tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt))
-			ret = -EFAULT;
-	}
-bail:
-	/*
-	 * Calculate mapped length. New Exp TID protocol does not "unwind" and
-	 * report an error if it can't map the entire buffer. It just reports
-	 * the length that was mapped.
-	 */
-	tinfo->length = mapped * PAGE_SIZE;
-	tinfo->tidcnt = tidcnt;
-	return ret;
-}
-
-static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo)
-{
-	struct hfi1_filedata *fd = fp->private_data;
-	struct hfi1_ctxtdata *uctxt = fd->uctxt;
-	struct hfi1_devdata *dd = uctxt->dd;
-	unsigned long tidmap[uctxt->tidmapcnt];
-	struct page **pages;
-	dma_addr_t *phys;
-	u16 idx, bitidx, tid;
-	int ret = 0;
-
-	if (copy_from_user(&tidmap, (void __user *)(unsigned long)
-			   tinfo->tidmap,
-			   sizeof(tidmap[0]) * uctxt->tidmapcnt)) {
-		ret = -EFAULT;
-		goto done;
-	}
-	for (idx = 0; idx < uctxt->tidmapcnt; idx++) {
-		unsigned long map;
-
-		bitidx = 0;
-		if (!tidmap[idx])
-			continue;
-		map = tidmap[idx];
-		while ((bitidx = tzcnt(map)) < BITS_PER_LONG) {
-			int i, pcount = 0;
-			struct page *pshadow[dd->rcv_entries.group_size];
-			unsigned offset = ((idx * BITS_PER_LONG) + bitidx) *
-				dd->rcv_entries.group_size;
-
-			pages = uctxt->tid_pg_list + offset;
-			phys = uctxt->physshadow + offset;
-			tid = uctxt->expected_base + offset;
-			for (i = 0; i < dd->rcv_entries.group_size;
-			     i++, tid++) {
-				if (pages[i]) {
-					hfi1_put_tid(dd, tid, PT_INVALID,
-						      0, 0);
-					trace_hfi1_exp_rcv_free(uctxt->ctxt,
-								fd->subctxt,
-								tid, phys[i],
-								pages[i]);
-					pci_unmap_page(dd->pcidev, phys[i],
-					      PAGE_SIZE, PCI_DMA_FROMDEVICE);
-					pshadow[pcount] = pages[i];
-					pages[i] = NULL;
-					pcount++;
-					phys[i] = 0;
-				}
-			}
-			flush_wc();
-			hfi1_release_user_pages(pshadow, pcount);
-			clear_bit(bitidx, &uctxt->tidusemap[idx]);
-			map &= ~(1ULL<<bitidx);
-		}
-	}
-	trace_hfi1_exp_tid_map(uctxt->ctxt, fd->subctxt, 1, uctxt->tidusemap,
-			       uctxt->tidmapcnt);
-done:
-	return ret;
-}
-
-static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt)
-{
-	struct hfi1_devdata *dd = uctxt->dd;
-	unsigned tid;
-
-	dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n",
-		    uctxt->ctxt);
-	for (tid = 0; tid < uctxt->expected_count; tid++) {
-		struct page *p = uctxt->tid_pg_list[tid];
-		dma_addr_t phys;
-
-		if (!p)
-			continue;
-
-		phys = uctxt->physshadow[tid];
-		uctxt->physshadow[tid] = 0;
-		uctxt->tid_pg_list[tid] = NULL;
-		pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE);
-		hfi1_release_user_pages(&p, 1);
-	}
-}
-
 static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
 			 u16 pkey)
 {
diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h
index 755e4d152d41..88e52be8d066 100644
--- a/drivers/staging/rdma/hfi1/hfi.h
+++ b/drivers/staging/rdma/hfi1/hfi.h
@@ -65,6 +65,8 @@
 #include <linux/cdev.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rbtree.h>
 
 #include "chip_registers.h"
 #include "common.h"
@@ -166,6 +168,11 @@ struct ctxt_eager_bufs {
 	} *rcvtids;
 };
 
+struct exp_tid_set {
+	struct list_head list;
+	u32 count;
+};
+
 struct hfi1_ctxtdata {
 	/* shadow the ctxt's RcvCtrl register */
 	u64 rcvctrl;
@@ -222,20 +229,13 @@ struct hfi1_ctxtdata {
 	u32 expected_count;
 	/* index of first expected TID entry. */
 	u32 expected_base;
-	/* cursor into the exp group sets */
-	atomic_t tidcursor;
-	/* number of exp TID groups assigned to the ctxt */
-	u16 numtidgroups;
-	/* size of exp TID group fields in tidusemap */
-	u16 tidmapcnt;
-	/* exp TID group usage bitfield array */
-	unsigned long *tidusemap;
-	/* pinned pages for exp sends, allocated at open */
-	struct page **tid_pg_list;
-	/* dma handles for exp tid pages */
-	dma_addr_t *physshadow;
+
+	struct exp_tid_set tid_group_list;
+	struct exp_tid_set tid_used_list;
+	struct exp_tid_set tid_full_list;
+
 	/* lock protecting all Expected TID data */
-	spinlock_t exp_lock;
+	struct mutex exp_lock;
 	/* number of pio bufs for this ctxt (all procs, if shared) */
 	u32 piocnt;
 	/* first pio buffer for this ctxt */
@@ -1094,6 +1094,8 @@ struct hfi1_devdata {
 #define PT_EAGER    1
 #define PT_INVALID  2
 
+struct mmu_rb_node;
+
 /* Private data for file operations */
 struct hfi1_filedata {
 	struct hfi1_ctxtdata *uctxt;
@@ -1102,6 +1104,15 @@ struct hfi1_filedata {
 	struct hfi1_user_sdma_pkt_q *pq;
 	/* for cpu affinity; -1 if none */
 	int rec_cpu_num;
+	struct mmu_notifier mn;
+	struct rb_root tid_rb_root;
+	u32 tid_limit;
+	u32 tid_used;
+	spinlock_t rb_lock;
+	u32 *invalid_tids;
+	u32 invalid_tid_idx;
+	spinlock_t invalid_lock;
+	int (*mmu_rb_insert)(struct rb_root *, struct mmu_rb_node *);
 };
 
 extern struct list_head hfi1_dev_list;
@@ -1558,8 +1569,8 @@ void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val);
  */
 #define DEFAULT_RCVHDR_ENTSIZE 32
 
-int hfi1_get_user_pages(unsigned long, size_t, struct page **);
-void hfi1_release_user_pages(struct page **, size_t);
+int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **);
+void hfi1_release_user_pages(struct page **, size_t, bool);
 
 static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
 {
@@ -1608,8 +1619,6 @@ int get_platform_config_field(struct hfi1_devdata *dd,
 			enum platform_config_table_type_encoding table_type,
 			int table_index, int field_index, u32 *data, u32 len);
 
-dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long,
-			 size_t, int);
 const char *get_unit_name(int unit);
 
 /*
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
index 47a1202fcbdf..060ab566856a 100644
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -219,7 +219,7 @@ struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt)
 		rcd->numa_id = numa_node_id();
 		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
 
-		spin_lock_init(&rcd->exp_lock);
+		mutex_init(&rcd->exp_lock);
 
 		/*
 		 * Calculate the context's RcvArray entry starting point.
@@ -942,13 +942,10 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 	kfree(rcd->egrbufs.buffers);
 
 	sc_free(rcd->sc);
-	vfree(rcd->physshadow);
-	vfree(rcd->tid_pg_list);
 	vfree(rcd->user_event_mask);
 	vfree(rcd->subctxt_uregbase);
 	vfree(rcd->subctxt_rcvegrbuf);
 	vfree(rcd->subctxt_rcvhdr_base);
-	kfree(rcd->tidusemap);
 	kfree(rcd->opstats);
 	kfree(rcd);
 }
diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h
index 57430295c404..db51cf55c538 100644
--- a/drivers/staging/rdma/hfi1/trace.h
+++ b/drivers/staging/rdma/hfi1/trace.h
@@ -153,92 +153,130 @@ TRACE_EVENT(hfi1_receive_interrupt,
 	)
 );
 
-const char *print_u64_array(struct trace_seq *, u64 *, int);
+TRACE_EVENT(hfi1_exp_tid_reg,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr,
+		     u32 npages, unsigned long va, unsigned long pa,
+		     dma_addr_t dma),
+	    TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+	    TP_STRUCT__entry(
+		    __field(unsigned, ctxt)
+		    __field(u16, subctxt)
+		    __field(u32, rarr)
+		    __field(u32, npages)
+		    __field(unsigned long, va)
+		    __field(unsigned long, pa)
+		    __field(dma_addr_t, dma)
+		    ),
+	    TP_fast_assign(
+		    __entry->ctxt = ctxt;
+		    __entry->subctxt = subctxt;
+		    __entry->rarr = rarr;
+		    __entry->npages = npages;
+		    __entry->va = va;
+		    __entry->pa = pa;
+		    __entry->dma = dma;
+		    ),
+	    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+		      __entry->ctxt,
+		      __entry->subctxt,
+		      __entry->rarr,
+		      __entry->npages,
+		      __entry->pa,
+		      __entry->va,
+		      __entry->dma
+		    )
+	);
 
-TRACE_EVENT(hfi1_exp_tid_map,
-	    TP_PROTO(unsigned ctxt, u16 subctxt, int dir,
-		     unsigned long *maps, u16 count),
-	    TP_ARGS(ctxt, subctxt, dir, maps, count),
+TRACE_EVENT(hfi1_exp_tid_unreg,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages,
+		     unsigned long va, unsigned long pa, dma_addr_t dma),
+	    TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
 	    TP_STRUCT__entry(
 		    __field(unsigned, ctxt)
 		    __field(u16, subctxt)
-		    __field(int, dir)
-		    __field(u16, count)
-		    __dynamic_array(unsigned long, maps, sizeof(*maps) * count)
+		    __field(u32, rarr)
+		    __field(u32, npages)
+		    __field(unsigned long, va)
+		    __field(unsigned long, pa)
+		    __field(dma_addr_t, dma)
 		    ),
 	    TP_fast_assign(
 		    __entry->ctxt = ctxt;
 		    __entry->subctxt = subctxt;
-		    __entry->dir = dir;
-		    __entry->count = count;
-		    memcpy(__get_dynamic_array(maps), maps,
-			   sizeof(*maps) * count);
+		    __entry->rarr = rarr;
+		    __entry->npages = npages;
+		    __entry->va = va;
+		    __entry->pa = pa;
+		    __entry->dma = dma;
 		    ),
-	    TP_printk("[%3u:%02u] %s tidmaps %s",
+	    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
 		      __entry->ctxt,
 		      __entry->subctxt,
-		      (__entry->dir ? ">" : "<"),
-		      print_u64_array(p, __get_dynamic_array(maps),
-				      __entry->count)
+		      __entry->rarr,
+		      __entry->npages,
+		      __entry->pa,
+		      __entry->va,
+		      __entry->dma
 		    )
 	);
 
-TRACE_EVENT(hfi1_exp_rcv_set,
-	    TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
-		     unsigned long vaddr, u64 phys_addr, void *page),
-	    TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page),
+TRACE_EVENT(hfi1_exp_tid_inval,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr,
+		     u32 npages, dma_addr_t dma),
+	    TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
 	    TP_STRUCT__entry(
 		    __field(unsigned, ctxt)
 		    __field(u16, subctxt)
-		    __field(u32, tid)
-		    __field(unsigned long, vaddr)
-		    __field(u64, phys_addr)
-		    __field(void *, page)
+		    __field(unsigned long, va)
+		    __field(u32, rarr)
+		    __field(u32, npages)
+		    __field(dma_addr_t, dma)
 		    ),
 	    TP_fast_assign(
 		    __entry->ctxt = ctxt;
 		    __entry->subctxt = subctxt;
-		    __entry->tid = tid;
-		    __entry->vaddr = vaddr;
-		    __entry->phys_addr = phys_addr;
-		    __entry->page = page;
+		    __entry->va = va;
+		    __entry->rarr = rarr;
+		    __entry->npages = npages;
+		    __entry->dma = dma;
 		    ),
-	    TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p",
+	    TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
 		      __entry->ctxt,
 		      __entry->subctxt,
-		      __entry->tid,
-		      __entry->vaddr,
-		      __entry->phys_addr,
-		      __entry->page
+		      __entry->rarr,
+		      __entry->npages,
+		      __entry->va,
+		      __entry->dma
 		    )
 	);
 
-TRACE_EVENT(hfi1_exp_rcv_free,
-	    TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid,
-		     unsigned long phys, void *page),
-	    TP_ARGS(ctxt, subctxt, tid, phys, page),
+TRACE_EVENT(hfi1_mmu_invalidate,
+	    TP_PROTO(unsigned ctxt, u16 subctxt, const char *type,
+		     unsigned long start, unsigned long end),
+	    TP_ARGS(ctxt, subctxt, type, start, end),
 	    TP_STRUCT__entry(
 		    __field(unsigned, ctxt)
 		    __field(u16, subctxt)
-		    __field(u32, tid)
-		    __field(unsigned long, phys)
-		    __field(void *, page)
+		    __string(type, type)
+		    __field(unsigned long, start)
+		    __field(unsigned long, end)
 		    ),
 	    TP_fast_assign(
 		    __entry->ctxt = ctxt;
 		    __entry->subctxt = subctxt;
-		    __entry->tid = tid;
-		    __entry->phys = phys;
-		    __entry->page = page;
+		    __assign_str(type, type);
+		    __entry->start = start;
+		    __entry->end = end;
 		    ),
-	    TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p",
+	    TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx",
 		      __entry->ctxt,
 		      __entry->subctxt,
-		      __entry->tid,
-		      __entry->phys,
-		      __entry->page
+		      __get_str(type),
+		      __entry->start,
+		      __entry->end
 		    )
 	);
+
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM hfi1_tx
 
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
index d8066c646f0c..747a86a459dd 100644
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -52,6 +52,13 @@
 #include "user_exp_rcv.h"
 #include "trace.h"
 
+struct tid_group {
+	struct list_head list;
+	unsigned base;
+	u8 size;
+	u8 used;
+	u8 map;
+};
 
 struct mmu_rb_node {
 	struct rb_node rbnode;
@@ -76,6 +83,25 @@ static const char * const mmu_types[] = {
 	"RANGE"
 };
 
+struct tid_pageset {
+	u16 idx;
+	u16 count;
+};
+
+
+#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
+
+#define num_user_pages(vaddr, len)					\
+	(1 + (((((unsigned long)(vaddr) +				\
+		 (unsigned long)(len) - 1) & PAGE_MASK) -		\
+	       ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
+			    struct rb_root *);
+static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
+static int set_rcvarray_entry(struct file *, unsigned long, u32,
+			      struct tid_group *, struct page **, unsigned);
+
 static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
 			       unsigned long);
 static struct mmu_rb_node *mmu_rb_search_by_addr(struct rb_root *,
@@ -92,7 +118,56 @@ static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
 static inline void mmu_notifier_range_start(struct mmu_notifier *,
 					    struct mm_struct *,
 					    unsigned long, unsigned long);
+static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
+			    struct tid_pageset *, unsigned, u16, struct page **,
+			    u32 *, unsigned *, unsigned *);
+static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
+static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *);
+
+static inline u32 rcventry2tidinfo(u32 rcventry)
+{
+	u32 pair = rcventry & ~0x1;
+
+	return EXP_TID_SET(IDX, pair >> 1) |
+		EXP_TID_SET(CTRL, 1 << (rcventry - pair));
+}
+
+static inline void exp_tid_group_init(struct exp_tid_set *set)
+{
+	INIT_LIST_HEAD(&set->list);
+	set->count = 0;
+}
+
+static inline void tid_group_remove(struct tid_group *grp,
+				    struct exp_tid_set *set)
+{
+	list_del_init(&grp->list);
+	set->count--;
+}
+
+static inline void tid_group_add_tail(struct tid_group *grp,
+				      struct exp_tid_set *set)
+{
+	list_add_tail(&grp->list, &set->list);
+	set->count++;
+}
+
+static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
+{
+	struct tid_group *grp =
+		list_first_entry(&set->list, struct tid_group, list);
+	list_del_init(&grp->list);
+	set->count--;
+	return grp;
+}
 
+static inline void tid_group_move(struct tid_group *group,
+				  struct exp_tid_set *s1,
+				  struct exp_tid_set *s2)
+{
+	tid_group_remove(group, s1);
+	tid_group_add_tail(group, s2);
+}
 
 static struct mmu_notifier_ops mn_opts = {
 	.invalidate_page = mmu_notifier_page,
@@ -106,12 +181,157 @@ static struct mmu_notifier_ops mn_opts = {
  */
 int hfi1_user_exp_rcv_init(struct file *fp)
 {
-	return -EINVAL;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned tidbase;
+	int i, ret = 0;
+
+	INIT_HLIST_NODE(&fd->mn.hlist);
+	spin_lock_init(&fd->rb_lock);
+	spin_lock_init(&fd->invalid_lock);
+	fd->mn.ops = &mn_opts;
+	fd->tid_rb_root = RB_ROOT;
+
+	if (!uctxt->subctxt_cnt || !fd->subctxt) {
+		exp_tid_group_init(&uctxt->tid_group_list);
+		exp_tid_group_init(&uctxt->tid_used_list);
+		exp_tid_group_init(&uctxt->tid_full_list);
+
+		tidbase = uctxt->expected_base;
+		for (i = 0; i < uctxt->expected_count /
+			     dd->rcv_entries.group_size; i++) {
+			struct tid_group *grp;
+
+			grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+			if (!grp) {
+				/*
+				 * If we fail here, the groups already
+				 * allocated will be freed by the close
+				 * call.
+				 */
+				ret = -ENOMEM;
+				goto done;
+			}
+			grp->size = dd->rcv_entries.group_size;
+			grp->base = tidbase;
+			tid_group_add_tail(grp, &uctxt->tid_group_list);
+			tidbase += dd->rcv_entries.group_size;
+		}
+	}
+
+	if (!HFI1_CAP_IS_USET(TID_UNMAP)) {
+		fd->invalid_tid_idx = 0;
+		fd->invalid_tids = kzalloc(uctxt->expected_count *
+					   sizeof(u32), GFP_KERNEL);
+		if (!fd->invalid_tids) {
+			ret = -ENOMEM;
+			goto done;
+		} else {
+			/*
+			 * Register MMU notifier callbacks. If the registration
+			 * fails, continue but turn off the TID caching for
+			 * all user contexts.
+			 */
+			ret = mmu_notifier_register(&fd->mn, current->mm);
+			if (ret) {
+				dd_dev_info(dd,
+					    "Failed MMU notifier registration %d\n",
+					    ret);
+				HFI1_CAP_USET(TID_UNMAP);
+				ret = 0;
+			}
+		}
+	}
+
+	if (HFI1_CAP_IS_USET(TID_UNMAP))
+		fd->mmu_rb_insert = mmu_rb_insert_by_entry;
+	else
+		fd->mmu_rb_insert = mmu_rb_insert_by_addr;
+
+	/*
+	 * PSM does not have a good way to separate, count, and
+	 * effectively enforce a limit on RcvArray entries used by
+	 * subctxts (when context sharing is used) when TID caching
+	 * is enabled. To help with that, we calculate a per-process
+	 * RcvArray entry share and enforce that.
+	 * If TID caching is not in use, PSM deals with usage on its
+	 * own. In that case, we allow any subctxt to take all of the
+	 * entries.
+	 *
+	 * Make sure that we set the tid counts only after successful
+	 * init.
+	 */
+	if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) {
+		u16 remainder;
+
+		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
+		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
+		if (remainder && fd->subctxt < remainder)
+			fd->tid_limit++;
+	} else
+		fd->tid_limit = uctxt->expected_count;
+done:
+	return ret;
 }
 
 int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 {
-	return -EINVAL;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct tid_group *grp, *gptr;
+
+	/*
+	 * The notifier would have been removed when the process's mm
+	 * was freed.
+	 */
+	if (current->mm && !HFI1_CAP_IS_USET(TID_UNMAP))
+		mmu_notifier_unregister(&fd->mn, current->mm);
+
+	kfree(fd->invalid_tids);
+
+	if (!uctxt->cnt) {
+		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+			unlock_exp_tids(uctxt, &uctxt->tid_full_list,
+					&fd->tid_rb_root);
+		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+			unlock_exp_tids(uctxt, &uctxt->tid_used_list,
+					&fd->tid_rb_root);
+		list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
+					 list) {
+			list_del_init(&grp->list);
+			kfree(grp);
+		}
+		spin_lock(&fd->rb_lock);
+		if (!RB_EMPTY_ROOT(&fd->tid_rb_root)) {
+			struct rb_node *node;
+			struct mmu_rb_node *rbnode;
+
+			while ((node = rb_first(&fd->tid_rb_root))) {
+				rbnode = rb_entry(node, struct mmu_rb_node,
+						  rbnode);
+				rb_erase(&rbnode->rbnode, &fd->tid_rb_root);
+				kfree(rbnode);
+			}
+		}
+		spin_unlock(&fd->rb_lock);
+		hfi1_clear_tids(uctxt);
+	}
+	return 0;
+}
+
+/*
+ * Write an "empty" RcvArray entry.
+ * This function exists so the TID registration code can use it
+ * to write to unused/unneeded entries and still take advantage
+ * of the WC performance improvements. The HFI will ignore this
+ * write to the RcvArray entry.
+ */
+static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
+{
+	/* Doing the WC fill writes only makes sense if the device is
+	 * present and the RcvArray has been mapped as WC memory. */
+	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
+		writeq(0, dd->rcvarray_wc + (index * 8));
 }
 
 /*
@@ -165,17 +385,591 @@ int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
  */
 int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 {
-	return -EINVAL;
+	int ret = 0, need_group = 0, pinned;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+		tididx = 0, mapped, mapped_pages = 0;
+	unsigned long vaddr = tinfo->vaddr;
+	struct page **pages = NULL;
+	u32 *tidlist = NULL;
+	struct tid_pageset *pagesets = NULL;
+
+	/* Get the number of pages the user buffer spans */
+	npages = num_user_pages(vaddr, tinfo->length);
+	if (!npages) {
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	if (npages > uctxt->expected_count) {
+		dd_dev_err(dd, "Expected buffer too big\n");
+		ret = -EINVAL;
+		goto bail;
+	}
+
+	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
+			   GFP_KERNEL);
+	if (!pagesets) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/* Verify that access is OK for the user buffer */
+	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+		       npages * PAGE_SIZE)) {
+		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+			   (void *)vaddr, npages);
+		ret = -EFAULT;
+		goto bail;
+	}
+
+	/* Allocate the array of struct page pointers needed for pinning */
+	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/*
+	 * Pin all the pages of the user buffer. If we can't pin all the
+	 * pages, accept the amount pinned so far and program only that.
+	 * User space knows how to deal with partially programmed buffers.
+	 */
+	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+	if (pinned <= 0) {
+		/*
+		 * -EDQUOT has a special meaning (we can't lock any more
+		 * pages), which user space knows how to deal with. We
+		 * don't need an error message.
+		 */
+		if (pinned != -EDQUOT)
+			dd_dev_err(dd,
+				   "Failed to lock addr %p, %u pages: errno %d\n",
+				   (void *) vaddr, npages, pinned);
+		ret = pinned;
+		goto bail;
+	}
+
+	/* Find sets of physically contiguous pages */
+	npagesets = find_phys_blocks(pages, pinned, pagesets);
+
+	/*
+	 * We don't need to access this under a lock since tid_used is per
+	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
+	 * and hfi1_user_exp_rcv_setup() at the same time.
+	 */
+	if (fd->tid_used + npagesets > fd->tid_limit)
+		pageset_count = fd->tid_limit - fd->tid_used;
+	else
+		pageset_count = npagesets;
+
+	if (!pageset_count)
+		goto bail;
+
+	ngroups = pageset_count / dd->rcv_entries.group_size;
+	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
+	if (!tidlist) {
+		ret = -ENOMEM;
+		goto nomem;
+	}
+
+	tididx = 0;
+
+	/* From this point on, we are going to be using shared (between master
+	 * and subcontexts) context resources. We need to take the lock. */
+	mutex_lock(&uctxt->exp_lock);
+	/* The first step is to program the RcvArray entries which are complete
+	 * groups. */
+	while (ngroups && uctxt->tid_group_list.count) {
+		struct tid_group *grp =
+			tid_group_pop(&uctxt->tid_group_list);
+
+		ret = program_rcvarray(fp, vaddr, grp, pagesets,
+				       pageidx, dd->rcv_entries.group_size,
+				       pages, tidlist, &tididx, &mapped);
+		/*
+		 * If there was a failure to program the RcvArray
+		 * entries for the entire group, reset the grp fields
+		 * and add the grp back to the free group list.
+		 */
+		if (ret <= 0) {
+			tid_group_add_tail(grp, &uctxt->tid_group_list);
+			hfi1_cdbg(TID,
+				  "Failed to program RcvArray group %d", ret);
+			goto unlock;
+		}
+
+		tid_group_add_tail(grp, &uctxt->tid_full_list);
+		ngroups--;
+		pageidx += ret;
+		mapped_pages += mapped;
+	}
+
+	while (pageidx < pageset_count) {
+		struct tid_group *grp, *ptr;
+		/*
+		 * If we don't have any partially used tid groups, check
+		 * if we have empty groups. If so, take one from there and
+		 * put in the partially used list.
+		 */
+		if (!uctxt->tid_used_list.count || need_group) {
+			if (!uctxt->tid_group_list.count)
+				goto unlock;
+
+			grp = tid_group_pop(&uctxt->tid_group_list);
+			tid_group_add_tail(grp, &uctxt->tid_used_list);
+			need_group = 0;
+		}
+		/*
+		 * There is an optimization opportunity here - instead of
+		 * fitting as many page sets as we can, check for a group
+		 * later on in the list that could fit all of them.
+		 */
+		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
+					 list) {
+			unsigned use = min_t(unsigned, pageset_count - pageidx,
+					     grp->size - grp->used);
+
+			ret = program_rcvarray(fp, vaddr, grp, pagesets,
+					       pageidx, use, pages, tidlist,
+					       &tididx, &mapped);
+			if (ret < 0) {
+				hfi1_cdbg(TID,
+					  "Failed to program RcvArray entries %d",
+					  ret);
+				ret = -EFAULT;
+				goto unlock;
+			} else if (ret > 0) {
+				if (grp->used == grp->size)
+					tid_group_move(grp,
+						       &uctxt->tid_used_list,
+						       &uctxt->tid_full_list);
+				pageidx += ret;
+				mapped_pages += mapped;
+				need_group = 0;
+				/* Check if we are done so we break out early */
+				if (pageidx >= pageset_count)
+					break;
+			} else if (WARN_ON(ret == 0)) {
+				/*
+				 * If ret is 0, we did not program any entries
+				 * into this group, which can only happen if
+				 * we've screwed up the accounting somewhere.
+				 * Warn and try to continue.
+				 */
+				need_group = 1;
+			}
+		}
+	}
+unlock:
+	mutex_unlock(&uctxt->exp_lock);
+nomem:
+	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
+		  mapped_pages, ret);
+	if (tididx) {
+		fd->tid_used += tididx;
+		tinfo->tidcnt = tididx;
+		tinfo->length = mapped_pages * PAGE_SIZE;
+
+		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+				 tidlist, sizeof(tidlist[0]) * tididx)) {
+			/* On failure to copy to the user level, we need to undo
+			 * everything done so far so we don't leak resources. */
+			tinfo->tidlist = (unsigned long)&tidlist;
+			hfi1_user_exp_rcv_clear(fp, tinfo);
+			tinfo->tidlist = 0;
+			ret = -EFAULT;
+			goto bail;
+		}
+	}
+
+	/*
+	 * If not everything was mapped (due to insufficient RcvArray entries,
+	 * for example), unpin all unmapped pages so we can pin them next time.
+	 */
+	if (mapped_pages != pinned)
+		hfi1_release_user_pages(&pages[mapped_pages],
+					pinned - mapped_pages,
+					false);
+bail:
+	kfree(pagesets);
+	kfree(pages);
+	kfree(tidlist);
+	return ret > 0 ? 0 : ret;
 }
 
 int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
 {
-	return -EINVAL;
+	int ret = 0;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	u32 *tidinfo;
+	unsigned tididx;
+
+	tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
+	if (!tidinfo)
+		return -ENOMEM;
+
+	if (copy_from_user(tidinfo, (void __user *)(unsigned long)
+			   tinfo->tidlist, sizeof(tidinfo[0]) *
+			   tinfo->tidcnt)) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	mutex_lock(&uctxt->exp_lock);
+	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
+		ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
+		if (ret) {
+			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
+				  ret);
+			break;
+		}
+	}
+	fd->tid_used -= tididx;
+	tinfo->tidcnt = tididx;
+	mutex_unlock(&uctxt->exp_lock);
+done:
+	kfree(tidinfo);
+	return ret;
 }
 
 int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
 {
-	return -EINVAL;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	unsigned long *ev = uctxt->dd->events +
+		(((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
+		  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
+	u32 *array;
+	int ret = 0;
+
+	if (!fd->invalid_tids) {
+		ret = -EINVAL;
+		goto done;
+	}
+
+	/*
+	 * copy_to_user() can sleep, which will leave the invalid_lock
+	 * locked and cause the MMU notifier to be blocked on the lock
+	 * for a long time.
+	 * Copy the data to a local buffer so we can release the lock.
+	 */
+	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
+	if (!array) {
+		ret = -EFAULT;
+		goto done;
+	}
+
+	spin_lock(&fd->invalid_lock);
+	if (fd->invalid_tid_idx) {
+		memcpy(array, fd->invalid_tids, sizeof(*array) *
+		       fd->invalid_tid_idx);
+		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
+		       fd->invalid_tid_idx);
+		tinfo->tidcnt = fd->invalid_tid_idx;
+		fd->invalid_tid_idx = 0;
+		/* Reset the user flag while still holding the lock.
+		 * Otherwise, PSM can miss events. */
+		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+	} else
+		tinfo->tidcnt = 0;
+	spin_unlock(&fd->invalid_lock);
+
+	if (tinfo->tidcnt) {
+		if (copy_to_user((void __user *)tinfo->tidlist,
+				 array, sizeof(*array) * tinfo->tidcnt))
+			ret = -EFAULT;
+	}
+	kfree(array);
+done:
+	return ret;
+}
+
+static u32 find_phys_blocks(struct page **pages, unsigned npages,
+			    struct tid_pageset *list)
+{
+	unsigned pagecount, pageidx, setcount = 0, i;
+	unsigned long pfn, this_pfn;
+
+	if (!npages)
+		return 0;
+
+	/*
+	 * Look for sets of physically contiguous pages in the user buffer.
+	 * This will allow us to optimize Expected RcvArray entry usage by
+	 * using the bigger supported sizes.
+	 */
+	pfn = page_to_pfn(pages[0]);
+	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
+
+		/* If the pfn's are not sequential, pages are not physically
+		 * contiguous. */
+		if (this_pfn != ++pfn) {
+			/*
+			 * At this point we have to loop over the set of
+			 * physically contiguous pages and break them down into
+			 * sizes supported by the HW.
+			 * There are two main constraints:
+			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
+			 *        If the total set size is bigger than that
+			 *        program only a MAX_EXPECTED_BUFFER chunk.
+			 *     2. The buffer size has to be a power of two. If
+			 *        it is not, round down to the closest power of
+			 *        2 and program that size.
+			 */
+			while (pagecount) {
+				int maxpages = pagecount;
+				u32 bufsize = pagecount * PAGE_SIZE;
+
+				if (bufsize > MAX_EXPECTED_BUFFER)
+					maxpages =
+						MAX_EXPECTED_BUFFER >>
+						PAGE_SHIFT;
+				else if (!is_power_of_2(bufsize))
+					maxpages =
+						rounddown_pow_of_two(bufsize) >>
+						PAGE_SHIFT;
+
+				list[setcount].idx = pageidx;
+				list[setcount].count = maxpages;
+				pagecount -= maxpages;
+				pageidx += maxpages;
+				setcount++;
+			}
+			pageidx = i;
+			pagecount = 1;
+			pfn = this_pfn;
+		} else
+			pagecount++;
+	}
+	return setcount;
+}
+
+static int program_rcvarray(struct file *fp, unsigned long vaddr,
+			    struct tid_group *grp,
+			    struct tid_pageset *sets,
+			    unsigned start, u16 count, struct page **pages,
+			    u32 *tidlist, unsigned *tididx, unsigned *pmapped)
+{
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+	u16 idx;
+	u32 tidinfo = 0, rcventry, useidx = 0;
+	int mapped = 0;
+
+	/* Count should never be larger than the group size */
+	if (count > grp->size)
+		return -EINVAL;
+
+	/* Find the first unused entry in the group */
+	for (idx = 0; idx < grp->size; idx++) {
+		if (!(grp->map & (1 << idx))) {
+			useidx = idx;
+			break;
+		}
+		rcv_array_wc_fill(dd, grp->base + idx);
+	}
+
+	idx = 0;
+	while (idx < count) {
+		u16 npages, pageidx, setidx = start + idx;
+		int ret = 0;
+
+		/*
+		 * If this entry in the group is used, move to the next one.
+		 * If we go past the end of the group, exit the loop.
+		 */
+		if (useidx >= grp->size)
+			break;
+		else if (grp->map & (1 << useidx)) {
+			rcv_array_wc_fill(dd, grp->base + useidx);
+			useidx++;
+			continue;
+		}
+
+		rcventry = grp->base + useidx;
+		npages = sets[setidx].count;
+		pageidx = sets[setidx].idx;
+
+		ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
+					 rcventry, grp, pages + pageidx,
+					 npages);
+		if (ret)
+			return ret;
+		mapped += npages;
+
+		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
+			EXP_TID_SET(LEN, npages);
+		tidlist[(*tididx)++] = tidinfo;
+		grp->used++;
+		grp->map |= 1 << useidx++;
+		idx++;
+	}
+
+	/* Fill the rest of the group with "blank" writes */
+	for (; useidx < grp->size; useidx++)
+		rcv_array_wc_fill(dd, grp->base + useidx);
+	*pmapped = mapped;
+	return idx;
+}
+
+static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
+			      u32 rcventry, struct tid_group *grp,
+			      struct page **pages, unsigned npages)
+{
+	int ret;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct mmu_rb_node *node;
+	struct hfi1_devdata *dd = uctxt->dd;
+	struct rb_root *root = &fd->tid_rb_root;
+	dma_addr_t phys;
+
+	/*
+	 * Allocate the node first so we can handle a potential
+	 * failure before we've programmed anything.
+	 */
+	node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
+		       GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	phys = pci_map_single(dd->pcidev,
+			      __va(page_to_phys(pages[0])),
+			      npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
+	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
+		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
+			   phys);
+		kfree(node);
+		return -EFAULT;
+	}
+
+	node->virt = vaddr;
+	node->phys = page_to_phys(pages[0]);
+	node->len = npages * PAGE_SIZE;
+	node->npages = npages;
+	node->rcventry = rcventry;
+	node->dma_addr = phys;
+	node->grp = grp;
+	node->freed = false;
+	memcpy(node->pages, pages, sizeof(struct page *) * npages);
+
+	spin_lock(&fd->rb_lock);
+	ret = fd->mmu_rb_insert(root, node);
+	spin_unlock(&fd->rb_lock);
+
+	if (ret) {
+		hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+			  node->rcventry, node->virt, node->phys, ret);
+		pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+				 PCI_DMA_FROMDEVICE);
+		kfree(node);
+		return -EFAULT;
+	}
+	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
+	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry,
+			       npages, node->virt, node->phys, phys);
+	return 0;
+}
+
+static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
+			      struct tid_group **grp)
+{
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+	struct mmu_rb_node *node;
+	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+	u32 tidbase = uctxt->expected_base,
+		tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
+
+	if (tididx > uctxt->expected_count) {
+		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+			   tididx, uctxt->ctxt);
+		return -EINVAL;
+	}
+
+	if (tidctrl == 0x3)
+		return -EINVAL;
+
+	rcventry = tidbase + tididx + (tidctrl - 1);
+
+	spin_lock(&fd->rb_lock);
+	node = mmu_rb_search_by_entry(&fd->tid_rb_root, rcventry);
+	if (!node) {
+		spin_unlock(&fd->rb_lock);
+		return -EBADF;
+	}
+	rb_erase(&node->rbnode, &fd->tid_rb_root);
+	spin_unlock(&fd->rb_lock);
+	if (grp)
+		*grp = node->grp;
+	clear_tid_node(fd, fd->subctxt, node);
+	return 0;
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt,
+			   struct mmu_rb_node *node)
+{
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+
+	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
+				 node->npages, node->virt, node->phys,
+				 node->dma_addr);
+
+	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
+	/* Make sure device has seen the write before we unpin the
+	 * pages */
+	flush_wc();
+
+	pci_unmap_single(dd->pcidev, node->dma_addr, node->len,
+			 PCI_DMA_FROMDEVICE);
+	hfi1_release_user_pages(node->pages, node->npages, true);
+
+	node->grp->used--;
+	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
+
+	if (node->grp->used == node->grp->size - 1)
+		tid_group_move(node->grp, &uctxt->tid_full_list,
+			       &uctxt->tid_used_list);
+	else if (!node->grp->used)
+		tid_group_move(node->grp, &uctxt->tid_used_list,
+			       &uctxt->tid_group_list);
+	kfree(node);
+}
+
+static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
+			    struct exp_tid_set *set, struct rb_root *root)
+{
+	struct tid_group *grp, *ptr;
+	struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata,
+						tid_rb_root);
+	int i;
+
+	list_for_each_entry_safe(grp, ptr, &set->list, list) {
+		list_del_init(&grp->list);
+
+		spin_lock(&fd->rb_lock);
+		for (i = 0; i < grp->size; i++) {
+			if (grp->map & (1 << i)) {
+				u16 rcventry = grp->base + i;
+				struct mmu_rb_node *node;
+
+				node = mmu_rb_search_by_entry(root, rcventry);
+				if (!node)
+					continue;
+				rb_erase(&node->rbnode, root);
+				clear_tid_node(fd, -1, node);
+			}
+		}
+		spin_unlock(&fd->rb_lock);
+	}
 }
 
 static inline void mmu_notifier_page(struct mmu_notifier *mn,
@@ -197,8 +991,74 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn,
 					unsigned long start, unsigned long end,
 					enum mmu_call_types type)
 {
-	/* Stub for now */
-	return;
+	struct hfi1_filedata *fd = container_of(mn, struct hfi1_filedata, mn);
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct rb_root *root = &fd->tid_rb_root;
+	struct mmu_rb_node *node;
+	unsigned long addr = start;
+
+	trace_hfi1_mmu_invalidate(uctxt->ctxt, fd->subctxt, mmu_types[type],
+				  start, end);
+
+	spin_lock(&fd->rb_lock);
+	while (addr < end) {
+		node = mmu_rb_search_by_addr(root, addr);
+
+		if (!node) {
+			/* Didn't find a node at this address. However, the
+			 * range could be bigger than what we have registered
+			 * so we have to keep looking. */
+			addr += PAGE_SIZE;
+			continue;
+		}
+
+		/*
+		 * The next address to be looked up is computed based
+		 * on the node's starting address. This is due to the
+		 * fact that the range where we start might be in the
+		 * middle of the node's buffer so simply incrementing
+		 * the address by the node's size would result in a
+		 * bad address.
+		 */
+		addr = node->virt + (node->npages * PAGE_SIZE);
+		if (node->freed)
+			continue;
+
+		trace_hfi1_exp_tid_inval(uctxt->ctxt, fd->subctxt, node->virt,
+					 node->rcventry, node->npages,
+					 node->dma_addr);
+		node->freed = true;
+
+		spin_lock(&fd->invalid_lock);
+		if (fd->invalid_tid_idx < uctxt->expected_count) {
+			fd->invalid_tids[fd->invalid_tid_idx] =
+				rcventry2tidinfo(node->rcventry -
+						 uctxt->expected_base);
+			fd->invalid_tids[fd->invalid_tid_idx] |=
+				EXP_TID_SET(LEN, node->npages);
+			if (!fd->invalid_tid_idx) {
+				unsigned long *ev;
+
+				/*
+				 * hfi1_set_uevent_bits() sets a user event flag
+				 * for all processes. Because calling into the
+				 * driver to process TID cache invalidations is
+				 * expensive and TID cache invalidations are
+				 * handled on a per-process basis, we can
+				 * optimize this to set the flag only for the
+				 * process in question.
+				 */
+				ev = uctxt->dd->events +
+					(((uctxt->ctxt -
+					   uctxt->dd->first_user_ctxt) *
+					  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
+				set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
+			}
+			fd->invalid_tid_idx++;
+		}
+		spin_unlock(&fd->invalid_lock);
+	}
+	spin_unlock(&fd->rb_lock);
 }
 
 static inline int mmu_addr_cmp(struct mmu_rb_node *node, unsigned long addr,
diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c
index 9071afbd7bf4..ec84cc63743e 100644
--- a/drivers/staging/rdma/hfi1/user_pages.c
+++ b/drivers/staging/rdma/hfi1/user_pages.c
@@ -47,110 +47,48 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
-
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/device.h>
-
 #include "hfi.h"
 
-static void __hfi1_release_user_pages(struct page **p, size_t num_pages,
-				      int dirty)
+int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable,
+			    struct page **pages)
 {
-	size_t i;
-
-	for (i = 0; i < num_pages; i++) {
-		if (dirty)
-			set_page_dirty_lock(p[i]);
-		put_page(p[i]);
-	}
-}
-
-/*
- * Call with current->mm->mmap_sem held.
- */
-static int __hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
-				 struct page **p)
-{
-	unsigned long lock_limit;
-	size_t got;
+	unsigned long pinned, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	bool can_lock = capable(CAP_IPC_LOCK);
 	int ret;
 
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-	if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-
-	for (got = 0; got < num_pages; got += ret) {
-		ret = get_user_pages(current, current->mm,
-				     start_page + got * PAGE_SIZE,
-				     num_pages - got, 1, 1,
-				     p + got, NULL);
-		if (ret < 0)
-			goto bail_release;
-	}
-
-	current->mm->pinned_vm += num_pages;
-
-	ret = 0;
-	goto bail;
-
-bail_release:
-	__hfi1_release_user_pages(p, got, 0);
-bail:
-	return ret;
-}
-
-/**
- * hfi1_map_page - a safety wrapper around pci_map_page()
- *
- */
-dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page,
-			 unsigned long offset, size_t size, int direction)
-{
-	dma_addr_t phys;
+	down_read(&current->mm->mmap_sem);
+	pinned = current->mm->pinned_vm;
+	up_read(&current->mm->mmap_sem);
 
-	phys = pci_map_page(hwdev, page, offset, size, direction);
+	if (pinned + npages > lock_limit && !can_lock)
+		return -EDQUOT;
 
-	return phys;
-}
-
-/**
- * hfi1_get_user_pages - lock user pages into memory
- * @start_page: the start page
- * @num_pages: the number of pages
- * @p: the output page structures
- *
- * This function takes a given start page (page aligned user virtual
- * address) and pins it and the following specified number of pages.  For
- * now, num_pages is always 1, but that will probably change at some point
- * (because caller is doing expected sends on a single virtually contiguous
- * buffer, so we can do all pages at once).
- */
-int hfi1_get_user_pages(unsigned long start_page, size_t num_pages,
-			struct page **p)
-{
-	int ret;
+	ret = get_user_pages_fast(vaddr, npages, writable, pages);
+	if (ret < 0)
+		return ret;
 
 	down_write(&current->mm->mmap_sem);
-
-	ret = __hfi1_get_user_pages(start_page, num_pages, p);
-
+	current->mm->pinned_vm += ret;
 	up_write(&current->mm->mmap_sem);
-
 	return ret;
 }
 
-void hfi1_release_user_pages(struct page **p, size_t num_pages)
+void hfi1_release_user_pages(struct page **p, size_t npages, bool dirty)
 {
-	if (current->mm) /* during close after signal, mm can be NULL */
-		down_write(&current->mm->mmap_sem);
+	size_t i;
 
-	__hfi1_release_user_pages(p, num_pages, 1);
+	for (i = 0; i < npages; i++) {
+		if (dirty)
+			set_page_dirty_lock(p[i]);
+		put_page(p[i]);
+	}
 
-	if (current->mm) {
-		current->mm->pinned_vm -= num_pages;
+	if (current->mm) { /* during close after signal, mm can be NULL */
+		down_write(&current->mm->mmap_sem);
+		current->mm->pinned_vm -= npages;
 		up_write(&current->mm->mmap_sem);
 	}
 }
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index be7a4e53335f..f418c0843659 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -1059,6 +1059,12 @@ static int pin_vector_pages(struct user_sdma_request *req,
 	/* If called by the kernel thread, use the user's mm */
 	if (current->flags & PF_KTHREAD)
 		use_mm(req->user_proc->mm);
+	/*
+	 * We should be calling hfi1_acquire_user_pages() so we can keep
+	 * the number of pinned pages up-to-date. However, we can't do
+	 * that because we can't use hfi1_release_user_pages() (see
+	 * comment in unpin_vector_pages()).
+	 */
 	pinned = get_user_pages_fast(
 		(unsigned long)iovec->iov.iov_base,
 		iovec->npages, 0, iovec->pages);
@@ -1088,6 +1094,13 @@ static void unpin_vector_pages(struct user_sdma_iovec *iovec)
 			  iovec->offset, iovec->iov.iov_len);
 		return;
 	}
+	/*
+	 * We should be calling hfi1_release_user_pages() so we can keep
+	 * the number of pinned pages up-to-date. However, this function
+	 * can be called in IRQ context and this will cause a deadlock
+	 * because hfi1_release_user_pages() takes the mm semaphore,
+	 * (which sleeps).
+	 */
 	for (i = 0; i < iovec->npages; i++)
 		if (iovec->pages[i])
 			put_page(iovec->pages[i]);
diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h
index a2fc6cbfe414..54998e86689b 100644
--- a/include/uapi/rdma/hfi/hfi1_user.h
+++ b/include/uapi/rdma/hfi/hfi1_user.h
@@ -66,7 +66,7 @@
  * The major version changes when data structures change in an incompatible
  * way. The driver must be the same for initialization to succeed.
  */
-#define HFI1_USER_SWMAJOR 4
+#define HFI1_USER_SWMAJOR 5
 
 /*
  * Minor version differences are always compatible
@@ -93,7 +93,7 @@
 #define HFI1_CAP_MULTI_PKT_EGR    (1UL <<  7) /* Enable multi-packet Egr buffs*/
 #define HFI1_CAP_NODROP_RHQ_FULL  (1UL <<  8) /* Don't drop on Hdr Q full */
 #define HFI1_CAP_NODROP_EGR_FULL  (1UL <<  9) /* Don't drop on EGR buffs full */
-#define HFI1_CAP_TID_UNMAP        (1UL << 10) /* Enable Expected TID caching */
+#define HFI1_CAP_TID_UNMAP        (1UL << 10) /* Disable Expected TID caching */
 #define HFI1_CAP_PRINT_UNIMPL     (1UL << 11) /* Show for unimplemented feats */
 #define HFI1_CAP_ALLOW_PERM_JKEY  (1UL << 12) /* Allow use of permissive JKEY */
 #define HFI1_CAP_NO_INTEGRITY     (1UL << 13) /* Enable ctxt integrity checks */
@@ -134,6 +134,7 @@
 #define HFI1_CMD_ACK_EVENT       10	/* ack & clear user status bits */
 #define HFI1_CMD_SET_PKEY        11     /* set context's pkey */
 #define HFI1_CMD_CTXT_RESET      12     /* reset context's HW send context */
+#define HFI1_CMD_TID_INVAL_READ  13     /* read TID cache invalidations */
 /* separate EPROM commands from normal PSM commands */
 #define HFI1_CMD_EP_INFO         64      /* read EPROM device ID */
 #define HFI1_CMD_EP_ERASE_CHIP   65      /* erase whole EPROM */
@@ -149,13 +150,15 @@
 #define _HFI1_EVENT_LID_CHANGE_BIT     2
 #define _HFI1_EVENT_LMC_CHANGE_BIT     3
 #define _HFI1_EVENT_SL2VL_CHANGE_BIT   4
-#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_SL2VL_CHANGE_BIT
+#define _HFI1_EVENT_TID_MMU_NOTIFY_BIT 5
+#define _HFI1_MAX_EVENT_BIT _HFI1_EVENT_TID_MMU_NOTIFY_BIT
 
 #define HFI1_EVENT_FROZEN            (1UL << _HFI1_EVENT_FROZEN_BIT)
 #define HFI1_EVENT_LINKDOWN          (1UL << _HFI1_EVENT_LINKDOWN_BIT)
 #define HFI1_EVENT_LID_CHANGE        (1UL << _HFI1_EVENT_LID_CHANGE_BIT)
 #define HFI1_EVENT_LMC_CHANGE        (1UL << _HFI1_EVENT_LMC_CHANGE_BIT)
 #define HFI1_EVENT_SL2VL_CHANGE      (1UL << _HFI1_EVENT_SL2VL_CHANGE_BIT)
+#define HFI1_EVENT_TID_MMU_NOTIFY    (1UL << _HFI1_EVENT_TID_MMU_NOTIFY_BIT)
 
 /*
  * These are the status bits readable (in ASCII form, 64bit value)
@@ -240,11 +243,6 @@ struct hfi1_tid_info {
 	__u32 tidcnt;
 	/* length of transfer buffer programmed by this request */
 	__u32 length;
-	/*
-	 * pointer to bitmap of TIDs used for this call;
-	 * checked for being large enough at open
-	 */
-	__u64 tidmap;
 };
 
 struct hfi1_cmd {
-- 
1.8.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 7/9] staging/rdma/hfi1: move hfi1_migrate_qp
  2015-10-30 22:41 [PATCH 0/9] staging/rdma/hfi1: Fix bugs and performance issues ira.weiny
                   ` (4 preceding siblings ...)
  2015-10-30 22:41 ` [PATCH 5/9] staging/rdma/hfi1: Add function stubs for TID caching ira.weiny
@ 2015-10-30 22:41 ` ira.weiny
  5 siblings, 0 replies; 14+ messages in thread
From: ira.weiny @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh, devel; +Cc: linux-rdma, dledford, dennis.dalessandro

From: Mike Marciniszyn <mike.marciniszyn@intel.com>

Move hfi1_migrate_qp from ruc.c to qp.[hc] in prep for modifying the QP
workqueue.

Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/staging/rdma/hfi1/qp.c    | 20 ++++++++++++++++++++
 drivers/staging/rdma/hfi1/qp.h    |  2 ++
 drivers/staging/rdma/hfi1/ruc.c   | 20 --------------------
 drivers/staging/rdma/hfi1/verbs.h |  2 --
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
index df1fa56eaf85..6e6c9a8f6b6f 100644
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -1685,3 +1685,23 @@ void qp_comm_est(struct hfi1_qp *qp)
 		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
 	}
 }
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void hfi1_migrate_qp(struct hfi1_qp *qp)
+{
+	struct ib_event ev;
+
+	qp->s_mig_state = IB_MIG_MIGRATED;
+	qp->remote_ah_attr = qp->alt_ah_attr;
+	qp->port_num = qp->alt_ah_attr.port_num;
+	qp->s_pkey_index = qp->s_alt_pkey_index;
+	qp->s_flags |= HFI1_S_AHG_CLEAR;
+
+	ev.device = qp->ibqp.device;
+	ev.element.qp = &qp->ibqp;
+	ev.event = IB_EVENT_PATH_MIG;
+	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
index b9c1575990aa..bacfa9c5e8a8 100644
--- a/drivers/staging/rdma/hfi1/qp.h
+++ b/drivers/staging/rdma/hfi1/qp.h
@@ -247,4 +247,6 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
  */
 void qp_comm_est(struct hfi1_qp *qp);
 
+void hfi1_migrate_qp(struct hfi1_qp *qp);
+
 #endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
index 8614b070545c..947ea27332ff 100644
--- a/drivers/staging/rdma/hfi1/ruc.c
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -241,26 +241,6 @@ bail:
 	return ret;
 }
 
-/*
- * Switch to alternate path.
- * The QP s_lock should be held and interrupts disabled.
- */
-void hfi1_migrate_qp(struct hfi1_qp *qp)
-{
-	struct ib_event ev;
-
-	qp->s_mig_state = IB_MIG_MIGRATED;
-	qp->remote_ah_attr = qp->alt_ah_attr;
-	qp->port_num = qp->alt_ah_attr.port_num;
-	qp->s_pkey_index = qp->s_alt_pkey_index;
-	qp->s_flags |= HFI1_S_AHG_CLEAR;
-
-	ev.device = qp->ibqp.device;
-	ev.element.qp = &qp->ibqp;
-	ev.event = IB_EVENT_PATH_MIG;
-	qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
-}
-
 static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index)
 {
 	if (!index) {
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index e4a8a0d4ccf8..9555aab51530 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -1071,8 +1071,6 @@ int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
 int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only);
 
-void hfi1_migrate_qp(struct hfi1_qp *qp);
-
 int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr,
 		       int has_grh, struct hfi1_qp *qp, u32 bth0);
 
-- 
1.8.2

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH v4 8/9] staging/rdma/hfi1: Use parallel workqueue for SDMA engines
       [not found] ` <1446244918-12089-1-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
  2015-10-30 22:41   ` [PATCH 2/9] staging/rdma/hfi1: Clean up macro indentation ira.weiny-ral2JQCrhuEAvxtiuMwx3w
  2015-10-30 22:41   ` [PATCH v4 6/9] staging/rdma/hfi1: Implement Expected Receive TID caching ira.weiny-ral2JQCrhuEAvxtiuMwx3w
@ 2015-10-30 22:41   ` ira.weiny-ral2JQCrhuEAvxtiuMwx3w
  2015-10-30 22:41   ` [PATCH 9/9] staging/rdma/hfi: pre-compute sc and sde for RC/UC QPs ira.weiny-ral2JQCrhuEAvxtiuMwx3w
  3 siblings, 0 replies; 14+ messages in thread
From: ira.weiny-ral2JQCrhuEAvxtiuMwx3w @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w, Ira Weiny

From: Mike Marciniszyn <mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

The workqueue is currently single-threaded per port, which is acceptable for
a small number of SDMA engines.

For hfi1, there are up to 16 SDMA engines that can be fed descriptors in
parallel.

Use alloc_workqueue() with a max_active limit equal to the number of SDMA
engines, and with WQ_CPU_INTENSIVE and WQ_HIGHPRI specified.

Then change the send path to use the new scheduler, which no longer needs to
take the s_lock.
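
As an aside for readers, here is a minimal sketch of the workqueue pattern
described above.  It is illustrative only, not part of the patch; the names
dd->num_sdma, dd->unit, pidx and sde->cpu are the driver's own, taken from
the diff below:

	#include <linux/workqueue.h>

	/*
	 * Sketch: one workqueue per port whose max_active matches the
	 * number of SDMA engines, so that many work items can run in
	 * parallel.  Each engine's work is queued on the CPU that
	 * services that engine's interrupt.
	 */
	struct workqueue_struct *wq;

	wq = alloc_workqueue("hfi%d_%d",
			     WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
			     dd->num_sdma,		/* max_active */
			     dd->unit, pidx);
	if (!wq)
		return -ENOMEM;

	/* Later, when an engine needs to make progress: */
	queue_work_on(sde->cpu, wq, &wait->iowork);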

Reviewed-by: Dennis Dalessandro <dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

---
Changes from v3:
	Split patch out code refactoring and separate functionality

 drivers/staging/rdma/hfi1/chip.c   |  1 +
 drivers/staging/rdma/hfi1/init.c   | 13 ++++++-------
 drivers/staging/rdma/hfi1/iowait.h |  6 ++++--
 drivers/staging/rdma/hfi1/qp.h     | 35 +++++++++++++++++++++++++++++++++++
 drivers/staging/rdma/hfi1/sdma.c   |  8 +++++---
 drivers/staging/rdma/hfi1/sdma.h   |  8 +++++---
 drivers/staging/rdma/hfi1/verbs.c  | 20 ++++----------------
 drivers/staging/rdma/hfi1/verbs.h  |  1 -
 8 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index e48981994b10..b7c7d1cac4e1 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -9018,6 +9018,7 @@ static int request_msix_irqs(struct hfi1_devdata *dd)
 		if (handler == sdma_interrupt) {
 			dd_dev_info(dd, "sdma engine %d cpu %d\n",
 				sde->this_idx, sdma_cpu);
+			sde->cpu = sdma_cpu;
 			cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
 			sdma_cpu = cpumask_next(sdma_cpu, def);
 			if (sdma_cpu >= nr_cpu_ids)
diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c
index 060ab566856a..3f6166b9c59f 100644
--- a/drivers/staging/rdma/hfi1/init.c
+++ b/drivers/staging/rdma/hfi1/init.c
@@ -601,20 +601,19 @@ static int create_workqueues(struct hfi1_devdata *dd)
 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
 		if (!ppd->hfi1_wq) {
-			char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
-
-			snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
-				 dd->unit, pidx);
 			ppd->hfi1_wq =
-				create_singlethread_workqueue(wq_name);
+				alloc_workqueue(
+				    "hfi%d_%d",
+				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+				    dd->num_sdma,
+				    dd->unit, pidx);
 			if (!ppd->hfi1_wq)
 				goto wq_error;
 		}
 	}
 	return 0;
 wq_error:
-	pr_err("create_singlethread_workqueue failed for port %d\n",
-		pidx + 1);
+	pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
 		if (ppd->hfi1_wq) {
diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h
index fa361b405851..e8ba5606d08d 100644
--- a/drivers/staging/rdma/hfi1/iowait.h
+++ b/drivers/staging/rdma/hfi1/iowait.h
@@ -150,12 +150,14 @@ static inline void iowait_init(
  * iowait_schedule() - initialize wait structure
  * @wait: wait struct to schedule
  * @wq: workqueue for schedule
+ * @cpu: cpu
  */
 static inline void iowait_schedule(
 	struct iowait *wait,
-	struct workqueue_struct *wq)
+	struct workqueue_struct *wq,
+	int cpu)
 {
-	queue_work(wq, &wait->iowork);
+	queue_work_on(cpu, wq, &wait->iowork);
 }
 
 /**
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
index bacfa9c5e8a8..ca70a0e662d0 100644
--- a/drivers/staging/rdma/hfi1/qp.h
+++ b/drivers/staging/rdma/hfi1/qp.h
@@ -247,6 +247,41 @@ void qp_iter_print(struct seq_file *s, struct qp_iter *iter);
  */
 void qp_comm_est(struct hfi1_qp *qp);
 
+/**
+ * _hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress w/o regard to the s_flags.
+ *
+ * It is only used in the post send, which doesn't hold
+ * the s_lock.
+ */
+static inline void _hfi1_schedule_send(struct hfi1_qp *qp)
+{
+	struct hfi1_ibport *ibp =
+		to_iport(qp->ibqp.device, qp->port_num);
+	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+	iowait_schedule(&qp->s_iowait, ppd->hfi1_wq,
+		qp->s_sde ?
+			qp->s_sde->cpu :
+			cpumask_first(cpumask_of_node(dd->assigned_node_id)));
+}
+
+/**
+ * hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress and caller should hold
+ * the s_lock.
+ */
+static inline void hfi1_schedule_send(struct hfi1_qp *qp)
+{
+	if (hfi1_send_ok(qp))
+		_hfi1_schedule_send(qp);
+}
+
 void hfi1_migrate_qp(struct hfi1_qp *qp);
 
 #endif /* _QP_H */
diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c
index 2a1da2189900..ddef98d7e6ac 100644
--- a/drivers/staging/rdma/hfi1/sdma.c
+++ b/drivers/staging/rdma/hfi1/sdma.c
@@ -778,18 +778,19 @@ struct sdma_engine *sdma_select_engine_vl(
 	struct sdma_engine *rval;
 
 	if (WARN_ON(vl > 8))
-		return NULL;
+		return &dd->per_sdma[0];
 
 	rcu_read_lock();
 	m = rcu_dereference(dd->sdma_map);
 	if (unlikely(!m)) {
 		rcu_read_unlock();
-		return NULL;
+		return &dd->per_sdma[0];
 	}
 	e = m->map[vl & m->mask];
 	rval = e->sde[selector & e->mask];
 	rcu_read_unlock();
 
+	rval =  !rval ? &dd->per_sdma[0] : rval;
 	trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
 	return rval;
 }
@@ -1874,7 +1875,7 @@ static void dump_sdma_state(struct sdma_engine *sde)
 }
 
 #define SDE_FMT \
-	"SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+	"SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
 /**
  * sdma_seqfile_dump_sde() - debugfs dump of sde
  * @s: seq file
@@ -1894,6 +1895,7 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
 	head = sde->descq_head & sde->sdma_mask;
 	tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
 	seq_printf(s, SDE_FMT, sde->this_idx,
+		sde->cpu,
 		sdma_state_name(sde->state.current_state),
 		(unsigned long long)read_sde_csr(sde, SD(CTRL)),
 		(unsigned long long)read_sde_csr(sde, SD(STATUS)),
diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h
index cc22d2ee2054..85701eed1585 100644
--- a/drivers/staging/rdma/hfi1/sdma.h
+++ b/drivers/staging/rdma/hfi1/sdma.h
@@ -410,8 +410,6 @@ struct sdma_engine {
 	u64 idle_mask;
 	u64 progress_mask;
 	/* private: */
-	struct workqueue_struct *wq;
-	/* private: */
 	volatile __le64      *head_dma; /* DMA'ed by chip */
 	/* private: */
 	dma_addr_t            head_phys;
@@ -426,6 +424,8 @@ struct sdma_engine {
 	u32 sdma_mask;
 	/* private */
 	struct sdma_state state;
+	/* private */
+	int cpu;
 	/* private: */
 	u8 sdma_shift;
 	/* private: */
@@ -990,7 +990,9 @@ static inline void sdma_iowait_schedule(
 	struct sdma_engine *sde,
 	struct iowait *wait)
 {
-	iowait_schedule(wait, sde->wq);
+	struct hfi1_pportdata *ppd = sde->dd->pport;
+
+	iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
 }
 
 /* for use by interrupt handling */
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index a13a2b135365..f43a8646af1b 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -162,6 +162,8 @@ static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext
 	return container_of(ibucontext, struct hfi1_ucontext, ibucontext);
 }
 
+static inline void _hfi1_schedule_send(struct hfi1_qp *qp);
+
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
@@ -497,9 +499,9 @@ static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		nreq++;
 	}
 bail:
-	if (nreq && !call_send)
-		hfi1_schedule_send(qp);
 	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (nreq && !call_send)
+		_hfi1_schedule_send(qp);
 	if (nreq && call_send)
 		hfi1_do_send(&qp->s_iowait.iowork);
 	return err;
@@ -2125,20 +2127,6 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
 	vfree(dev->lk_table.table);
 }
 
-/*
- * This must be called with s_lock held.
- */
-void hfi1_schedule_send(struct hfi1_qp *qp)
-{
-	if (hfi1_send_ok(qp)) {
-		struct hfi1_ibport *ibp =
-			to_iport(qp->ibqp.device, qp->port_num);
-		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-		iowait_schedule(&qp->s_iowait, ppd->hfi1_wq);
-	}
-}
-
 void hfi1_cnp_rcv(struct hfi1_packet *packet)
 {
 	struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index 9555aab51530..fb8ea5c892c6 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -841,7 +841,6 @@ static inline int hfi1_send_ok(struct hfi1_qp *qp)
 /*
  * This must be called with s_lock held.
  */
-void hfi1_schedule_send(struct hfi1_qp *qp);
 void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
 		    u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
 void hfi1_cap_mask_chg(struct hfi1_ibport *ibp);
-- 
1.8.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 9/9] staging/rdma/hfi: pre-compute sc and sde for RC/UC QPs
       [not found] ` <1446244918-12089-1-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
                     ` (2 preceding siblings ...)
  2015-10-30 22:41   ` [PATCH v4 8/9] staging/rdma/hfi1: Use parallel workqueue for SDMA engines ira.weiny-ral2JQCrhuEAvxtiuMwx3w
@ 2015-10-30 22:41   ` ira.weiny-ral2JQCrhuEAvxtiuMwx3w
  3 siblings, 0 replies; 14+ messages in thread
From: ira.weiny-ral2JQCrhuEAvxtiuMwx3w @ 2015-10-30 22:41 UTC (permalink / raw)
  To: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w, Ira Weiny

From: Mike Marciniszyn <mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Now that we have a multi-threaded workqueue, precompute and store the SC and
SDE on RC and UC QPs for faster access.
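
As an aside, a minimal sketch of the caching described above.  It is
illustrative only; the helpers ah_to_sc() and qp_to_sdma_engine() and the
qp->s_sc / qp->s_sde fields are the driver's own, as seen in the diff below:

	/*
	 * Sketch: derive the service class and SDMA engine once, when
	 * the address handle changes (modify_qp or path migration),
	 * rather than recomputing them for every packet.
	 */
	qp->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
	qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);

	/* The hot send path then simply reads the cached value: */
	tx->sde = qp->s_sde;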

Reviewed-by: Dennis Dalessandro <dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Signed-off-by: Ira Weiny <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
 drivers/staging/rdma/hfi1/qp.c    | 27 +++++++++++++++++++++------
 drivers/staging/rdma/hfi1/qp.h    |  1 -
 drivers/staging/rdma/hfi1/ruc.c   | 10 ++--------
 drivers/staging/rdma/hfi1/ud.c    |  1 +
 drivers/staging/rdma/hfi1/verbs.c | 14 +++-----------
 drivers/staging/rdma/hfi1/verbs.h |  3 ++-
 6 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
index 6e6c9a8f6b6f..87aa49227bf6 100644
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -617,7 +617,7 @@ int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	int mig = 0;
 	int ret;
 	u32 pmtu = 0; /* for gcc warning only */
-	struct hfi1_devdata *dd;
+	struct hfi1_devdata *dd = dd_from_dev(dev);
 
 	spin_lock_irq(&qp->r_lock);
 	spin_lock(&qp->s_lock);
@@ -631,23 +631,35 @@ int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		goto inval;
 
 	if (attr_mask & IB_QP_AV) {
+		u8 sc;
+
 		if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
 			goto inval;
 		if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr))
 			goto inval;
+		sc = ah_to_sc(ibqp->device, &attr->ah_attr);
+		if (!qp_to_sdma_engine(qp, sc) &&
+				dd->flags & HFI1_HAS_SEND_DMA)
+			goto inval;
 	}
 
 	if (attr_mask & IB_QP_ALT_PATH) {
+		u8 sc;
+
 		if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE)
 			goto inval;
 		if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
 			goto inval;
-		if (attr->alt_pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+		if (attr->alt_pkey_index >= hfi1_get_npkeys(dd))
+			goto inval;
+		sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr);
+		if (!qp_to_sdma_engine(qp, sc) &&
+				dd->flags & HFI1_HAS_SEND_DMA)
 			goto inval;
 	}
 
 	if (attr_mask & IB_QP_PKEY_INDEX)
-		if (attr->pkey_index >= hfi1_get_npkeys(dd_from_dev(dev)))
+		if (attr->pkey_index >= hfi1_get_npkeys(dd))
 			goto inval;
 
 	if (attr_mask & IB_QP_MIN_RNR_TIMER)
@@ -792,6 +804,8 @@ int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		qp->remote_ah_attr = attr->ah_attr;
 		qp->s_srate = attr->ah_attr.static_rate;
 		qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+		qp->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
+		qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
 	}
 
 	if (attr_mask & IB_QP_ALT_PATH) {
@@ -806,6 +820,8 @@ int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 			qp->port_num = qp->alt_ah_attr.port_num;
 			qp->s_pkey_index = qp->s_alt_pkey_index;
 			qp->s_flags |= HFI1_S_AHG_CLEAR;
+			qp->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr);
+			qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
 		}
 	}
 
@@ -1528,9 +1544,6 @@ struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5)
 	if (!(dd->flags & HFI1_HAS_SEND_DMA))
 		return NULL;
 	switch (qp->ibqp.qp_type) {
-	case IB_QPT_UC:
-	case IB_QPT_RC:
-		break;
 	case IB_QPT_SMI:
 		return NULL;
 	default:
@@ -1699,6 +1712,8 @@ void hfi1_migrate_qp(struct hfi1_qp *qp)
 	qp->port_num = qp->alt_ah_attr.port_num;
 	qp->s_pkey_index = qp->s_alt_pkey_index;
 	qp->s_flags |= HFI1_S_AHG_CLEAR;
+	qp->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr);
+	qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
 
 	ev.device = qp->ibqp.device;
 	ev.element.qp = &qp->ibqp;
diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h
index ca70a0e662d0..5e1def523f61 100644
--- a/drivers/staging/rdma/hfi1/qp.h
+++ b/drivers/staging/rdma/hfi1/qp.h
@@ -128,7 +128,6 @@ static inline void clear_ahg(struct hfi1_qp *qp)
 	if (qp->s_sde && qp->s_ahgidx >= 0)
 		sdma_ahg_free(qp->s_sde, qp->s_ahgidx);
 	qp->s_ahgidx = -1;
-	qp->s_sde = NULL;
 }
 
 /**
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
index 947ea27332ff..7b11c61ac5d6 100644
--- a/drivers/staging/rdma/hfi1/ruc.c
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -694,11 +694,8 @@ static inline void build_ahg(struct hfi1_qp *qp, u32 npsn)
 		clear_ahg(qp);
 	if (!(qp->s_flags & HFI1_S_AHG_VALID)) {
 		/* first middle that needs copy  */
-		if (qp->s_ahgidx < 0) {
-			if (!qp->s_sde)
-				qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
+		if (qp->s_ahgidx < 0)
 			qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde);
-		}
 		if (qp->s_ahgidx >= 0) {
 			qp->s_ahgpsn = npsn;
 			qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
@@ -741,7 +738,6 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
 	u16 lrh0;
 	u32 nwords;
 	u32 extra_bytes;
-	u8 sc5;
 	u32 bth1;
 
 	/* Construct the header. */
@@ -755,9 +751,7 @@ void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr,
 		lrh0 = HFI1_LRH_GRH;
 		middle = 0;
 	}
-	sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
-	lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
-	qp->s_sc = sc5;
+	lrh0 |= (qp->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
 	/*
 	 * reset s_hdr/AHG fields
 	 *
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
index d40d1a1e10aa..d6b4ba8a811e 100644
--- a/drivers/staging/rdma/hfi1/ud.c
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -383,6 +383,7 @@ int hfi1_make_ud_req(struct hfi1_qp *qp)
 		lrh0 |= (sc5 & 0xf) << 12;
 		qp->s_sc = sc5;
 	}
+	qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc);
 	qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0);
 	qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
 	qp->s_hdr->ibh.lrh[2] =
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index f43a8646af1b..2c7cf783f24d 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -999,7 +999,6 @@ int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
 	struct verbs_txreq *tx;
 	struct sdma_txreq *stx;
 	u64 pbc_flags = 0;
-	struct sdma_engine *sde;
 	u8 sc5 = qp->s_sc;
 	int ret;
 
@@ -1020,12 +1019,7 @@ int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
 	if (IS_ERR(tx))
 		goto bail_tx;
 
-	if (!qp->s_hdr->sde) {
-		tx->sde = sde = qp_to_sdma_engine(qp, sc5);
-		if (!sde)
-			goto bail_no_sde;
-	} else
-		tx->sde = sde = qp->s_hdr->sde;
+	tx->sde = qp->s_sde;
 
 	if (likely(pbc == 0)) {
 		u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
@@ -1040,17 +1034,15 @@ int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr,
 	if (qp->s_rdma_mr)
 		qp->s_rdma_mr = NULL;
 	tx->hdr_dwords = hdrwords + 2;
-	ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc);
+	ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc);
 	if (unlikely(ret))
 		goto bail_build;
 	trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh);
-	ret =  sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq);
+	ret =  sdma_send_txreq(tx->sde, &qp->s_iowait, &tx->txreq);
 	if (unlikely(ret == -ECOMM))
 		goto bail_ecomm;
 	return ret;
 
-bail_no_sde:
-	hfi1_put_txreq(tx);
 bail_ecomm:
 	/* The current one got "sent" */
 	return 0;
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index fb8ea5c892c6..62c6e38cca45 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -436,7 +436,8 @@ struct hfi1_qp {
 	struct hfi1_swqe *s_wq;  /* send work queue */
 	struct hfi1_mmap_info *ip;
 	struct ahg_ib_header *s_hdr;     /* next packet header to send */
-	u8 s_sc;			/* SC[0..4] for next packet */
+	/* sc for UC/RC QPs - based on ah for UD */
+	u8 s_sc;
 	unsigned long timeout_jiffies;  /* computed from timeout */
 
 	enum ib_mtu path_mtu;
-- 
1.8.2


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v4 6/9] staging/rdma/hfi1: Implement Expected Receive TID caching
       [not found]     ` <1446244918-12089-7-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2015-10-30 22:47       ` Greg KH
  0 siblings, 0 replies; 14+ messages in thread
From: Greg KH @ 2015-10-30 22:47 UTC (permalink / raw)
  To: ira.weiny-ral2JQCrhuEAvxtiuMwx3w
  Cc: devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
	dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w,
	mike.marciniszyn-ral2JQCrhuEAvxtiuMwx3w, Mitko Haralanov

Meta-comment: you put "v4" in the subject in a place where I can't sort this
series and expect the patches to be applied in the correct order.  You need a
'v4' in all of the patches (git format-patch will do it automatically for
you).

As this is, it's a pain to apply, please fix up and resend.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 5/9] staging/rdma/hfi1: Add function stubs for TID caching
       [not found]   ` <1446244918-12089-6-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
@ 2015-11-12  5:58     ` Or Gerlitz
  2015-11-20  1:06       ` Greg Kroah-Hartman
  0 siblings, 1 reply; 14+ messages in thread
From: Or Gerlitz @ 2015-11-12  5:58 UTC (permalink / raw)
  To: Weiny, Ira, Greg Kroah-Hartman, Mike Marciniszyn
  Cc: devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b, Doug Ledford,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	dennis.dalessandro-ral2JQCrhuEAvxtiuMwx3w, Mitko Haralanov,
	Kamal Heib

On Sat, Oct 31, 2015 at 12:41 AM,  <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> From: Mitko Haralanov <mitko.haralanov-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>
> Add mmu notify helper functions and TID caching function stubs in preparation
> for the TID caching implementation.
>
> TID caching makes use of the MMU notifier to allow the driver to respond to the
> user freeing memory which is allocated to the HFI.
>
> This patch implements the basic MMU notifier functions to insert, find and
> remove buffer pages from memory based on the mmu_notifier being invoked.
>
> In addition it places stubs in place for the main entry points by follow on
> code.
>
> Follow up patches will complete the implementation of the interaction with user
> space and makes use of these functions.
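
As an aside for readers, here is a minimal sketch of the mmu_notifier hookup
the quoted text describes.  It is illustrative only: the callback signature
follows include/linux/mmu_notifier.h of this era, and the handler body and
names are hypothetical, not the driver's actual implementation:

	#include <linux/mmu_notifier.h>

	/*
	 * Sketch: register a notifier on the user's mm so the driver is
	 * called back when user memory is invalidated and can then drop
	 * any cached, pinned pages in that range.
	 */
	static void sketch_invalidate_range_start(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
	{
		/* hypothetical: find and release cached buffers in [start, end) */
	}

	static const struct mmu_notifier_ops sketch_ops = {
		.invalidate_range_start = sketch_invalidate_range_start,
	};

	static struct mmu_notifier sketch_mn = { .ops = &sketch_ops };

	/* at context open time: */
	ret = mmu_notifier_register(&sketch_mn, current->mm);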

So this is a wholly orthogonal mechanism for memory registrations and
de-registrations versus what is supported by the upstream RDMA stack, to
which this driver attempts to be a HW provider, right?

Ira, Mike - why do that?

2. Greg - I read an earlier comment you made that a driver in staging needs,
first and foremost, to be **fixed** with patches so that the TODO items to
get it out of staging are addressed. I see a long train of features going
into this driver every day; shouldn't the focus be elsewhere, according to
the staging guidelines?

Or.


>  drivers/staging/rdma/hfi1/Kconfig        |   1 +
>  drivers/staging/rdma/hfi1/Makefile       |   2 +-
>  drivers/staging/rdma/hfi1/user_exp_rcv.c | 314 +++++++++++++++++++++++++++++++
>  drivers/staging/rdma/hfi1/user_exp_rcv.h |   8 +
>  4 files changed, 324 insertions(+), 1 deletion(-)

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 5/9] staging/rdma/hfi1: Add function stubs for TID caching
  2015-11-12  5:58     ` Or Gerlitz
@ 2015-11-20  1:06       ` Greg Kroah-Hartman
       [not found]         ` <20151120010630.GA25032-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 14+ messages in thread
From: Greg Kroah-Hartman @ 2015-11-20  1:06 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: devel, linux-rdma, Mitko Haralanov, Doug Ledford, Kamal Heib

On Thu, Nov 12, 2015 at 07:58:54AM +0200, Or Gerlitz wrote:
> On Sat, Oct 31, 2015 at 12:41 AM,  <ira.weiny@intel.com> wrote:
> > From: Mitko Haralanov <mitko.haralanov@intel.com>
> >
> > Add mmu notify helper functions and TID caching function stubs in preparation
> > for the TID caching implementation.
> >
> > TID caching makes use of the MMU notifier to allow the driver to respond to the
> > user freeing memory which is allocated to the HFI.
> >
> > This patch implements the basic MMU notifier functions to insert, find and
> > remove buffer pages from memory based on the mmu_notifier being invoked.
> >
> > In addition it places stubs in place for the main entry points by follow on
> > code.
> >
> > Follow up patches will complete the implementation of the interaction with user
> > space and makes use of these functions.
> 
> So this is a wholly orthogonal mechanism for memory registrations and
> de-registrations versus what is supported by the upstream RDMA stack, to
> which this driver attempts to be a HW provider, right?
> 
> Ira, Mike - why do that?
> 
> 2. Greg - I read an earlier comment you made that a driver in staging
> needs, first and foremost, to be **fixed** with patches so that the TODO
> items to get it out of staging are addressed. I see a long train of
> features going into this driver every day; shouldn't the focus be
> elsewhere, according to the staging guidelines?

I've already complained about this, supposedly these are things that are
needed to get it out of staging...

And yes, you are correct.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 5/9] staging/rdma/hfi1: Add function stubs for TID caching
       [not found]         ` <20151120010630.GA25032-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
@ 2015-11-20  8:21           ` Or Gerlitz
  0 siblings, 0 replies; 14+ messages in thread
From: Or Gerlitz @ 2015-11-20  8:21 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Mike Marciniszyn, Doug Ledford
  Cc: Weiny, Ira, devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Mitko Haralanov, Kamal Heib,
	Dennis Dalessandro

On Fri, Nov 20, 2015 at 3:06 AM, Greg Kroah-Hartman
<gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org> wrote:
> On Thu, Nov 12, 2015 at 07:58:54AM +0200, Or Gerlitz wrote:
>> On Sat, Oct 31, 2015 at 12:41 AM,  <ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

>> So this is a wholly orthogonal mechanism for memory registrations and
>> de-registrations versus what is supported by the upstream RDMA stack, to which
>> this driver attempts to be a HW provider, right?

>> Ira, Mike - why do that?

>> 2. Greg - I read an earlier comment you made that a driver in staging needs, first
>> and foremost, to be **fixed** with patches so that the TODO items to get it out of
>> staging are addressed. I see a long train of features going into this driver every
>> day; shouldn't the focus be elsewhere, according to the staging guidelines?

> I've already complained about this, supposedly these are things that are
> needed to get it out of staging...

Greg,

When the driver was picked up by Doug for staging, the only major comment he
had was the need to come up with shared code that implements the IBTA
transport in SW -- to avoid the code duplication between ipath, qib and hfi1.

Indeed, work has started to do that, but since hfi1 is in staging, none of
the patches applied to it deal with that comment. I guess it is OK that
people send cleanups once the driver is already there, but should it be so
heavily patched with new features? Mike? Doug?

> And yes, you are correct.

I know...

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-11-20  8:21 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-30 22:41 [PATCH 0/9] staging/rdma/hfi1: Fix bugs and performance issues ira.weiny
2015-10-30 22:41 ` [PATCH 1/9] staging/rdma/hfi1: Remove file pointer macros ira.weiny
     [not found] ` <1446244918-12089-1-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-10-30 22:41   ` [PATCH 2/9] staging/rdma/hfi1: Clean up macro indentation ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-30 22:41   ` [PATCH v4 6/9] staging/rdma/hfi1: Implement Expected Receive TID caching ira.weiny-ral2JQCrhuEAvxtiuMwx3w
     [not found]     ` <1446244918-12089-7-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-10-30 22:47       ` Greg KH
2015-10-30 22:41   ` [PATCH v4 8/9] staging/rdma/hfi1: Use parallel workqueue for SDMA engines ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-30 22:41   ` [PATCH 9/9] staging/rdma/hfi: pre-compute sc and sde for RC/UC QPs ira.weiny-ral2JQCrhuEAvxtiuMwx3w
2015-10-30 22:41 ` [PATCH 3/9] staging/rdma/hfi1: Remove unnecessary include files ira.weiny
2015-10-30 22:41 ` [PATCH 4/9] staging/rdma/hfi1: Move macros to a common header ira.weiny
2015-10-30 22:41 ` [PATCH 5/9] staging/rdma/hfi1: Add function stubs for TID caching ira.weiny
     [not found]   ` <1446244918-12089-6-git-send-email-ira.weiny-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-11-12  5:58     ` Or Gerlitz
2015-11-20  1:06       ` Greg Kroah-Hartman
     [not found]         ` <20151120010630.GA25032-U8xfFu+wG4EAvxtiuMwx3w@public.gmane.org>
2015-11-20  8:21           ` Or Gerlitz
2015-10-30 22:41 ` [PATCH 7/9] staging/rdma/hfi1: move hfi1_migrate_qp ira.weiny
