* [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
From: Hannes Hering @ 2009-06-09 13:59 UTC
  To: rdreier; +Cc: ossrosch, alexs, ewg, linux-kernel, linuxppc-dev, raisch

This patch implements toleration of dynamic memory operations and 16 GB
gigantic pages. On module load the driver walks through available system
memory, checks for available memory ranges and then registers the kernel
internal memory region accordingly. The translation of address ranges is
implemented via a 3-level busmap.
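
As an illustration of the translation, here is a minimal stand-alone
sketch (simplified; the index widths follow the EHCA_* defines in the
patch, while the 16 MB section size and the sample address are
assumptions for the example, not driver code):

	#include <stdio.h>
	#include <stdint.h>

	#define SECTSHIFT 24                /* assumes SECTION_SIZE_BITS == 24 */
	#define DIR_SHIFT 13                /* 8k entries per map block */
	#define TOP_SHIFT (2 * DIR_SHIFT)
	#define IDX_MASK  ((1UL << DIR_SHIFT) - 1)

	int main(void)
	{
		uint64_t abs_addr = 0x123456789aULL; /* hypothetical address */

		/* split the section number into top/dir/idx table indices */
		unsigned top = (abs_addr >> (TOP_SHIFT + SECTSHIFT)) & IDX_MASK;
		unsigned dir = (abs_addr >> (DIR_SHIFT + SECTSHIFT)) & IDX_MASK;
		unsigned idx = (abs_addr >> SECTSHIFT) & IDX_MASK;
		/* the offset within the section is carried over unchanged */
		uint64_t off = abs_addr & ((1UL << SECTSHIFT) - 1);

		/* the mapped address is then ent[top][dir][idx] | off */
		printf("top=%u dir=%u idx=%u off=0x%llx\n",
		       top, dir, idx, (unsigned long long)off);
		return 0;
	}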

Signed-off-by: Hannes Hering <hering2@de.ibm.com>

---
This patch is built and tested against infiniband.git. Please apply for 2.6.31.

Regards

Hannes

Index: infiniband/drivers/infiniband/hw/ehca/ehca_main.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/ehca/ehca_main.c	2009-06-09 14:20:37.000000000 +0200
+++ infiniband/drivers/infiniband/hw/ehca/ehca_main.c	2009-06-09 14:20:47.000000000 +0200
@@ -52,7 +52,7 @@
 #include "ehca_tools.h"
 #include "hcp_if.h"
 
-#define HCAD_VERSION "0026"
+#define HCAD_VERSION "0027"
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
@@ -506,6 +506,7 @@
 	shca->ib_device.detach_mcast	    = ehca_detach_mcast;
 	shca->ib_device.process_mad	    = ehca_process_mad;
 	shca->ib_device.mmap		    = ehca_mmap;
+	shca->ib_device.dma_ops		    = &ehca_dma_mapping_ops;
 
 	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
 		shca->ib_device.uverbs_cmd_mask |=
@@ -1028,17 +1029,23 @@
 		goto module_init1;
 	}
 
+	ret = ehca_create_busmap();
+	if (ret) {
+		ehca_gen_err("Cannot create busmap.");
+		goto module_init2;
+	}
+
 	ret = ibmebus_register_driver(&ehca_driver);
 	if (ret) {
 		ehca_gen_err("Cannot register eHCA device driver");
 		ret = -EINVAL;
-		goto module_init2;
+		goto module_init3;
 	}
 
 	ret = register_memory_notifier(&ehca_mem_nb);
 	if (ret) {
 		ehca_gen_err("Failed registering memory add/remove notifier");
-		goto module_init3;
+		goto module_init4;
 	}
 
 	if (ehca_poll_all_eqs != 1) {
@@ -1053,9 +1060,12 @@
 
 	return 0;
 
-module_init3:
+module_init4:
 	ibmebus_unregister_driver(&ehca_driver);
 
+module_init3:
+	ehca_destroy_busmap();
+
 module_init2:
 	ehca_destroy_slab_caches();
 
@@ -1073,6 +1083,8 @@
 
 	unregister_memory_notifier(&ehca_mem_nb);
 
+	ehca_destroy_busmap();
+
 	ehca_destroy_slab_caches();
 
 	ehca_destroy_comp_pool();
Index: infiniband/drivers/infiniband/hw/ehca/ehca_mrmw.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/ehca/ehca_mrmw.c	2009-06-09 14:20:37.000000000 +0200
+++ infiniband/drivers/infiniband/hw/ehca/ehca_mrmw.c	2009-06-09 14:20:47.000000000 +0200
@@ -53,6 +53,39 @@
 /* max number of rpages (per hcall register_rpages) */
 #define MAX_RPAGES 512
 
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
+#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT     34
+#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+#define EHCA_REG_MR 0
+#define EHCA_REG_BUSMAP_MR (~0)
+
+static unsigned long ehca_mr_len;
+/*
+ * Memory map data structures
+ */
+struct ehca_dir_bmap {
+	u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+	struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+	struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
+
+static struct ehca_bmap *ehca_bmap;
+
 static struct kmem_cache *mr_cache;
 static struct kmem_cache *mw_cache;
 
@@ -68,6 +101,8 @@
 #define EHCA_MR_PGSHIFT1M  20
 #define EHCA_MR_PGSHIFT16M 24
 
+static u64 ehca_map_vaddr(void *caddr);
+
 static u32 ehca_encode_hwpage_size(u32 pgsize)
 {
 	int log = ilog2(pgsize);
@@ -135,7 +170,8 @@
 			goto get_dma_mr_exit0;
 		}
 
-		ret = ehca_reg_maxmr(shca, e_maxmr, (u64 *)KERNELBASE,
+		ret = ehca_reg_maxmr(shca, e_maxmr,
+				     (void *)ehca_map_vaddr((void *)KERNELBASE),
 				     mr_access_flags, e_pd,
 				     &e_maxmr->ib.ib_mr.lkey,
 				     &e_maxmr->ib.ib_mr.rkey);
@@ -251,7 +287,7 @@
 
 		ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
 				  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-				  &e_mr->ib.ib_mr.rkey);
+				  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
 		if (ret) {
 			ib_mr = ERR_PTR(ret);
 			goto reg_phys_mr_exit1;
@@ -370,7 +406,7 @@
 
 	ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
 			  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-			  &e_mr->ib.ib_mr.rkey);
+			  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
 	if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
 		ehca_warn(pd->device, "failed to register mr "
 			  "with hwpage_size=%llx", hwpage_size);
@@ -794,7 +830,7 @@
 	ret = ehca_reg_mr(shca, e_fmr, NULL,
 			  fmr_attr->max_pages * (1 << fmr_attr->page_shift),
 			  mr_access_flags, e_pd, &pginfo,
-			  &tmp_lkey, &tmp_rkey);
+			  &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
 	if (ret) {
 		ib_fmr = ERR_PTR(ret);
 		goto alloc_fmr_exit1;
@@ -983,6 +1019,10 @@
 
 /*----------------------------------------------------------------------*/
 
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+				   struct ehca_mr *e_mr,
+				   struct ehca_mr_pginfo *pginfo);
+
 int ehca_reg_mr(struct ehca_shca *shca,
 		struct ehca_mr *e_mr,
 		u64 *iova_start,
@@ -991,7 +1031,8 @@
 		struct ehca_pd *e_pd,
 		struct ehca_mr_pginfo *pginfo,
 		u32 *lkey, /*OUT*/
-		u32 *rkey) /*OUT*/
+		u32 *rkey, /*OUT*/
+		int reg_busmap)
 {
 	int ret;
 	u64 h_ret;
@@ -1015,7 +1056,11 @@
 
 	e_mr->ipz_mr_handle = hipzout.handle;
 
-	ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+	if (reg_busmap)
+		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+	else
+		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+
 	if (ret)
 		goto ehca_reg_mr_exit1;
 
@@ -1316,7 +1361,7 @@
 		e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
 
 		ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
-				  e_pd, pginfo, lkey, rkey);
+				  e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
 		if (ret) {
 			u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
 			memcpy(&e_mr->flags, &(save_mr.flags),
@@ -1409,7 +1454,7 @@
 	ret = ehca_reg_mr(shca, e_fmr, NULL,
 			  (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
 			  e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
-			  &tmp_rkey);
+			  &tmp_rkey, EHCA_REG_MR);
 	if (ret) {
 		u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
 		memcpy(&e_fmr->flags, &(save_mr.flags),
@@ -1478,6 +1523,90 @@
 } /* end ehca_reg_smr() */
 
 /*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+	unsigned long ret = idx;
+	ret |= dir << EHCA_DIR_INDEX_SHIFT;
+	ret |= top << EHCA_TOP_INDEX_SHIFT;
+	return abs_to_virt(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+	((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+			       struct ehca_shca *shca, struct ehca_mr *mr,
+			       struct ehca_mr_pginfo *pginfo)
+{
+	u64 h_ret = 0;
+	unsigned long page = 0;
+	u64 rpage = virt_to_abs(kpage);
+	int page_count;
+
+	void *sectbase = ehca_calc_sectbase(top, dir, idx);
+	if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+		ehca_err(&shca->ib_device, "reg_mr_section will probably fail: "
+					   "hwpage_size does not fit to "
+					   "section start address");
+	}
+	page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
+	while (page < page_count) {
+		u64 rnum;
+		for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+		     rnum++) {
+			void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+			kpage[rnum] = virt_to_abs(pg);
+		}
+
+		h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+			ehca_encode_hwpage_size(pginfo->hwpage_size),
+			0, rpage, rnum);
+
+		if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+			ehca_err(&shca->ib_device, "register_rpage_mr failed");
+			return h_ret;
+		}
+	}
+	return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+				struct ehca_shca *shca, struct ehca_mr *mr,
+				struct ehca_mr_pginfo *pginfo)
+{
+	u64 hret = H_SUCCESS;
+	int idx;
+
+	for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+			continue;
+
+		hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+					   pginfo);
+		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+			return hret;
+	}
+	return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+				    struct ehca_mr *mr,
+				    struct ehca_mr_pginfo *pginfo)
+{
+	u64 hret = H_SUCCESS;
+	int dir;
+
+	for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+			continue;
+
+		hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+			return hret;
+	}
+	return hret;
+}
 
 /* register internal max-MR to internal SHCA */
 int ehca_reg_internal_maxmr(
@@ -1495,6 +1624,11 @@
 	u32 num_hwpages;
 	u64 hw_pgsize;
 
+	if (!ehca_bmap) {
+		ret = -EFAULT;
+		goto ehca_reg_internal_maxmr_exit0;
+	}
+
 	e_mr = ehca_mr_new();
 	if (!e_mr) {
 		ehca_err(&shca->ib_device, "out of memory");
@@ -1504,8 +1638,8 @@
 	e_mr->flags |= EHCA_MR_FLAG_MAXMR;
 
 	/* register internal max-MR on HCA */
-	size_maxmr = (u64)high_memory - PAGE_OFFSET;
-	iova_start = (u64 *)KERNELBASE;
+	size_maxmr = ehca_mr_len;
+	iova_start = (u64 *)ehca_map_vaddr((void *)KERNELBASE);
 	ib_pbuf.addr = 0;
 	ib_pbuf.size = size_maxmr;
 	num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
@@ -1524,7 +1658,7 @@
 
 	ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
 			  &pginfo, &e_mr->ib.ib_mr.lkey,
-			  &e_mr->ib.ib_mr.rkey);
+			  &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
 	if (ret) {
 		ehca_err(&shca->ib_device, "reg of internal max MR failed, "
 			 "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
@@ -2077,8 +2211,8 @@
 		     u64 *iova_start)
 {
 	/* a MR is treated as max-MR only if it fits following: */
-	if ((size == ((u64)high_memory - PAGE_OFFSET)) &&
-	    (iova_start == (void *)KERNELBASE)) {
+	if ((size == ehca_mr_len) &&
+	    (iova_start == (void *)ehca_map_vaddr((void *)KERNELBASE))) {
 		ehca_gen_dbg("this is a max-MR");
 		return 1;
 	} else
@@ -2184,3 +2318,350 @@
 	if (mw_cache)
 		kmem_cache_destroy(mw_cache);
 }
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+				     int dir)
+{
+	if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+		ehca_top_bmap->dir[dir] =
+			kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+		if (!ehca_top_bmap->dir[dir])
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+	}
+	return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+	if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+		ehca_bmap->top[top] =
+			kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+		if (!ehca_bmap->top[top])
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+	}
+	return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+	return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+	int top, dir;
+
+	if (!ehca_bmap)
+		return;
+
+	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]))
+			continue;
+		for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+			if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+				continue;
+
+			kfree(ehca_bmap->top[top]->dir[dir]);
+		}
+
+		kfree(ehca_bmap->top[top]);
+	}
+
+	kfree(ehca_bmap);
+	ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i, start_section, end_section;
+	int top, dir, idx;
+
+	if (!nr_pages)
+		return 0;
+
+	if (!ehca_bmap) {
+		ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+		if (!ehca_bmap)
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+	}
+
+	start_section = phys_to_abs(pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+	end_section = phys_to_abs((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
+	for (i = start_section; i < end_section; i++) {
+		int ret;
+		top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+		dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+		idx = i & EHCA_INDEX_MASK;
+
+		ret = ehca_init_bmap(ehca_bmap, top, dir);
+		if (ret) {
+			ehca_destroy_busmap();
+			return ret;
+		}
+		ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+		ehca_mr_len += EHCA_SECTSIZE;
+	}
+	return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+	int page_order;
+
+	if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+		return 0;
+
+	page_order = compound_order(pfn_to_page(pfn));
+	if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+		return 0;
+
+	return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+				       unsigned long total_nr_pages, void *arg)
+{
+	int ret;
+	unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+	if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+		return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+	/* Given chunk is >= 16GB -> check for hugepages */
+	start_pfn = initial_pfn;
+	end_pfn = initial_pfn + total_nr_pages;
+	pfn = start_pfn;
+
+	while (pfn < end_pfn) {
+		if (ehca_is_hugepage(pfn)) {
+			/* Add mem found in front of the hugepage */
+			nr_pages = pfn - start_pfn;
+			ret = ehca_update_busmap(start_pfn, nr_pages);
+			if (ret)
+				return ret;
+			/* Skip the hugepage */
+			pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+			start_pfn = pfn;
+		} else
+			pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+	}
+
+	/* Add mem found behind the hugepage(s)  */
+	nr_pages = pfn - start_pfn;
+	return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+	int ret;
+
+	ehca_mr_len = 0;
+	ret = walk_memory_resource(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+				   ehca_create_busmap_callback);
+	return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+				   struct ehca_mr *e_mr,
+				   struct ehca_mr_pginfo *pginfo)
+{
+	int top;
+	u64 hret = H_SUCCESS, *kpage;
+
+	kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!kpage) {
+		ehca_err(&shca->ib_device, "kpage alloc failed");
+		return -ENOMEM;
+	}
+	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]))
+			continue;
+		hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+		if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+			break;
+	}
+
+	ehca_free_fw_ctrlblock(kpage);
+
+	if (hret == H_SUCCESS)
+		return 0; /* Everything is fine */
+	else {
+		ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+				 "h_ret=%lli e_mr=%p top=%x lkey=%x "
+				 "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+				 e_mr->ib.ib_mr.lkey,
+				 shca->ipz_hca_handle.handle,
+				 e_mr->ipz_mr_handle.handle);
+		return ehca2ib_return_code(hret);
+	}
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+	int top, dir, idx;
+	unsigned long abs_addr, offset;
+	u64 entry;
+
+	if (!ehca_bmap)
+		return EHCA_INVAL_ADDR;
+
+	abs_addr = virt_to_abs(caddr);
+	top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+	if (!ehca_bmap_valid(ehca_bmap->top[top]))
+		return EHCA_INVAL_ADDR;
+
+	dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+	if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+		return EHCA_INVAL_ADDR;
+
+	idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+	entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+	if (ehca_bmap_valid(entry)) {
+		offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+		return entry | offset;
+	} else
+		return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+			       size_t size, enum dma_data_direction direction)
+{
+	if (cpu_addr)
+		return ehca_map_vaddr(cpu_addr);
+	else
+		return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+				  enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+			     unsigned long offset, size_t size,
+			     enum dma_data_direction direction)
+{
+	u64 addr;
+
+	if (offset + size > PAGE_SIZE)
+		return EHCA_INVAL_ADDR;
+
+	addr = ehca_map_vaddr(page_address(page));
+	if (!ehca_dma_mapping_error(dev, addr))
+		addr += offset;
+
+	return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+				enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+			   int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		u64 addr;
+		addr = ehca_map_vaddr(sg_virt(sg));
+		if (ehca_dma_mapping_error(dev, addr))
+			return 0;
+
+		sg->dma_address = addr;
+		sg->dma_length = sg->length;
+	}
+	return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+			      int nents, enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg)
+{
+	return sg->dma_address;
+}
+
+static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg)
+{
+	return sg->length;
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+					 size_t size,
+					 enum dma_data_direction dir)
+{
+	dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+					    size_t size,
+					    enum dma_data_direction dir)
+{
+	dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				     u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+	u64 dma_addr;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p) {
+		addr = page_address(p);
+		dma_addr = ehca_map_vaddr(addr);
+		if (ehca_dma_mapping_error(dev, dma_addr)) {
+			free_pages((unsigned long)addr, get_order(size));
+			return NULL;
+		}
+		if (dma_handle)
+			*dma_handle = dma_addr;
+		return addr;
+	}
+	return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+				   void *cpu_addr, u64 dma_handle)
+{
+	if (cpu_addr && size)
+		free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+	.mapping_error = ehca_dma_mapping_error,
+	.map_single = ehca_dma_map_single,
+	.unmap_single = ehca_dma_unmap_single,
+	.map_page = ehca_dma_map_page,
+	.unmap_page = ehca_dma_unmap_page,
+	.map_sg = ehca_dma_map_sg,
+	.unmap_sg = ehca_dma_unmap_sg,
+	.dma_address = ehca_dma_address,
+	.dma_len = ehca_dma_len,
+	.sync_single_for_cpu = ehca_dma_sync_single_for_cpu,
+	.sync_single_for_device = ehca_dma_sync_single_for_device,
+	.alloc_coherent = ehca_dma_alloc_coherent,
+	.free_coherent = ehca_dma_free_coherent,
+};
Index: infiniband/drivers/infiniband/hw/ehca/ehca_mrmw.h
===================================================================
--- infiniband.orig/drivers/infiniband/hw/ehca/ehca_mrmw.h	2009-06-09 14:20:37.000000000 +0200
+++ infiniband/drivers/infiniband/hw/ehca/ehca_mrmw.h	2009-06-09 14:20:47.000000000 +0200
@@ -50,7 +50,8 @@
 		struct ehca_pd *e_pd,
 		struct ehca_mr_pginfo *pginfo,
 		u32 *lkey,
-		u32 *rkey);
+		u32 *rkey,
+		int reg_busmap);
 
 int ehca_reg_mr_rpages(struct ehca_shca *shca,
 		       struct ehca_mr *e_mr,
@@ -118,4 +119,9 @@
 
 void ehca_mr_deletenew(struct ehca_mr *mr);
 
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
 #endif  /*_EHCA_MRMW_H_*/

* Re: [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
From: Michael Ellerman @ 2009-06-10  0:02 UTC
  To: Hannes Hering
  Cc: rdreier, alexs, linux-kernel, ewg, linuxppc-dev, raisch, ossrosch

On Tue, 2009-06-09 at 15:59 +0200, Hannes Hering wrote:
> This patch implements toleration of dynamic memory operations and 16 GB
> gigantic pages. On module load the driver walks through available system
> memory, checks for available memory ranges and then registers the kernel
> internal memory region accordingly. The translation of address ranges is
> implemented via a 3-level busmap.

Hi Hannes,

For those of us who haven't read the HEA spec lately, can you give us
some more detail on that? :)

How does it interact with kexec/kdump?

> +static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
> +{
> +	unsigned long i, start_section, end_section;
> +	int top, dir, idx;
> +
> +	if (!nr_pages)
> +		return 0;
> +
> +	if (!ehca_bmap) {
> +		ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
> +		if (!ehca_bmap)
> +			return -ENOMEM;
> +		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
> +		memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
> +	}
> +
> +	start_section = phys_to_abs(pfn * PAGE_SIZE) / EHCA_SECTSIZE;
> +	end_section = phys_to_abs((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;


phys_to_abs() ? As below, or does it come from somewhere else?

 arch/powerpc/include/asm/abs_addr.h:
 static inline unsigned long phys_to_abs(unsigned long pa)
 {
         unsigned long chunk;

         /* This is a no-op on non-iSeries */
         if (!firmware_has_feature(FW_FEATURE_ISERIES))
                 return pa;

         chunk = addr_to_chunk(pa);

         if (chunk < mschunks_map.num_chunks)
                 chunk = mschunks_map.mapping[chunk];

         return chunk_to_addr(chunk) + (pa & MSCHUNKS_OFFSET_MASK);
 }


cheers

* Re: [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
From: Hannes Hering @ 2009-06-10  8:54 UTC
  To: michael; +Cc: rdreier, alexs, linux-kernel, ewg, linuxppc-dev, raisch, ossrosch

Hi Michael,

On Wednesday 10 June 2009 02:02:36 Michael Ellerman wrote:
> For those of us who haven't read the HEA spec lately, can you give us
> some more detail on that? :)
First of all, please note that this patch is actually for the ehca InfiniBand
driver. The ehca driver uses an internal memory region, which is supposed to
contain all physical memory. A memory region maps a virtually contiguous
adapter address space to the physical or, more precisely, absolute address
space. The limitation is that a memory region cannot map a non-contiguous
virtual adapter address space. However, ppc64 machines have a feature to
dynamically add memory to or remove memory from logical partitions at
runtime. These operations scatter the absolute memory so that the kernel
memory has a non-contiguous layout, which cannot be represented in a single
memory region. The purpose of this code is to detect the actual memory layout
so that the memory region can be built from the existing memory chunks. It
also translates kernel addresses into memory region addresses, which is
needed for interaction with the HCA.
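
To illustrate the idea with a toy sketch (hypothetical section numbers
and a simplified flat map; in the patch this is what ehca_update_busmap()
does when it assigns ent[idx] = ehca_mr_len):

	#include <stdio.h>

	#define SECTSIZE (1UL << 24)	/* assumes 16 MB memory sections */

	int main(void)
	{
		/* absolute section numbers still present after dynamic
		 * memory operations -- note the hole at sections 2..6 */
		unsigned long present[] = { 0, 1, 7, 8, 9 };
		unsigned long mr_len = 0;
		int i;

		for (i = 0; i < 5; i++) {
			/* each present section gets the next contiguous
			 * offset, so the MR space has no holes */
			printf("abs 0x%09lx -> mr 0x%09lx\n",
			       present[i] * SECTSIZE, mr_len);
			mr_len += SECTSIZE;
		}
		return 0;
	}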
> How does it interact with kexec/kdump?
We never tested the ehca driver with kexec/kdump. This patch also doesn't
improve anything in this context.

> phys_to_abs() ? As below, or does it come from somewhere else?
You're right, that call isn't actually needed on System p. On the other hand, I
needed to choose one address type as the basis of all mapping. The "busmap"
holds all addresses as absolute addresses, which can then be converted into any
other type (virt, phys).

Regards

Hannes


* Re: [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
From: Roland Dreier @ 2009-06-13  4:50 UTC
  To: Hannes Hering; +Cc: ossrosch, alexs, ewg, linux-kernel, linuxppc-dev, raisch

OK, one major issue with this patch and a few minor nits.

First, the major issue is that I don't see anything in the patch that
changes the code in ehca_mem_notifier() in ehca_main.c:

	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
		/* only ok if no hca is attached to the lpar */
		spin_lock_irqsave(&shca_list_lock, flags);
		if (list_empty(&shca_list)) {
			spin_unlock_irqrestore(&shca_list_lock, flags);
			return NOTIFY_OK;
		} else {
			spin_unlock_irqrestore(&shca_list_lock, flags);
			if (printk_timed_ratelimit(&ehca_dmem_warn_time,
						   30 * 1000))
				ehca_gen_err("DMEM operations are not allowed"
					     "in conjunction with eHCA");
			return NOTIFY_BAD;
		}

But your patch description says:

 > This patch implements toleration of dynamic memory operations....

But it seems you're still going to hit the same NOTIFY_BAD case above
after your patch.  So something doesn't compute for me.  Could you
explain more?

Second, a nit:

 > +#define EHCA_REG_MR 0
 > +#define EHCA_REG_BUSMAP_MR (~0)

and you pass these as the reg_busmap parm in:

 >  int ehca_reg_mr(struct ehca_shca *shca,
 >  		struct ehca_mr *e_mr,
 >  		u64 *iova_start,
 > @@ -991,7 +1031,8 @@
 >  		struct ehca_pd *e_pd,
 >  		struct ehca_mr_pginfo *pginfo,
 >  		u32 *lkey, /*OUT*/
 > -		u32 *rkey) /*OUT*/
 > +		u32 *rkey, /*OUT*/
 > +		int reg_busmap)

and test it as:

 > +	if (reg_busmap)
 > +		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
 > +	else
 > +		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);

So the ~0 for true looks a bit odd.  One option would be to make
reg_busmap a bool, since that's how you're using it, but then you lose
the nice self-documenting macro where you call things.

So I think it would be cleaner to do something like

enum ehca_reg_type {
	EHCA_REG_MR,
	EHCA_REG_BUSMAP_MR
};

and make the "int reg_busmap" parameter into "enum ehca_reg_type reg_type"
and have the code become

+	if (reg_type == EHCA_REG_BUSMAP_MR)
+		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+	else if (reg_type == EHCA_REG_MR)
+		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+	else
+		ret = -EINVAL;

or something like that.

 > +struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
 > +	.mapping_error = ehca_dma_mapping_error,
 > +	.map_single = ehca_dma_map_single,
 > +	.unmap_single = ehca_dma_unmap_single,
 > +	.map_page = ehca_dma_map_page,
 > +	.unmap_page = ehca_dma_unmap_page,
 > +	.map_sg = ehca_dma_map_sg,
 > +	.unmap_sg = ehca_dma_unmap_sg,
 > +	.dma_address = ehca_dma_address,
 > +	.dma_len = ehca_dma_len,
 > +	.sync_single_for_cpu = ehca_dma_sync_single_for_cpu,
 > +	.sync_single_for_device = ehca_dma_sync_single_for_device,
 > +	.alloc_coherent = ehca_dma_alloc_coherent,
 > +	.free_coherent = ehca_dma_free_coherent,
 > +};

I always think structures like this are easier to read if you align the
'=' signs.  But no big deal.

 > +	ret = ehca_create_busmap();
 > +	if (ret) {
 > +		ehca_gen_err("Cannot create busmap.");
 > +		goto module_init2;
 > +	}
 > +
 >  	ret = ibmebus_register_driver(&ehca_driver);
 >  	if (ret) {
 >  		ehca_gen_err("Cannot register eHCA device driver");
 >  		ret = -EINVAL;
 > -		goto module_init2;
 > +		goto module_init3;
 >  	}
 >  
 >  	ret = register_memory_notifier(&ehca_mem_nb);
 >  	if (ret) {
 >  		ehca_gen_err("Failed registering memory add/remove notifier");
 > -		goto module_init3;
 > +		goto module_init4;

Having to renumber unrelated things when something changes is why I
don't like this style of error path labels.  But I think it's well and
truly too late to fix that in ehca.

 - R.

* Re: [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
From: Alexander Schmidt @ 2009-06-16  7:08 UTC
  To: Roland Dreier
  Cc: Hannes Hering, linux-kernel, ewg, linuxppc-dev, raisch, ossrosch,
	Hoang-Nam Nguyen

Hi Roland,

thank you for taking a look at the code!

On Fri, 12 Jun 2009 21:50:58 -0700
Roland Dreier <rdreier@cisco.com> wrote:

> OK, one major issue with this patch and a few minor nits.
> 
> First, the major issue is that I don't see anything in the patch that
> changes the code in ehca_mem_notifier() in ehca_main.c:
> 
> 	case MEM_GOING_ONLINE:
> 	case MEM_GOING_OFFLINE:
> 		/* only ok if no hca is attached to the lpar */
> 		spin_lock_irqsave(&shca_list_lock, flags);
> 		if (list_empty(&shca_list)) {
> 			spin_unlock_irqrestore(&shca_list_lock, flags);
> 			return NOTIFY_OK;
> 		} else {
> 			spin_unlock_irqrestore(&shca_list_lock, flags);
> 			if (printk_timed_ratelimit(&ehca_dmem_warn_time,
> 						   30 * 1000))
> 				ehca_gen_err("DMEM operations are not allowed"
> 					     "in conjunction with eHCA");
> 			return NOTIFY_BAD;
> 		}
> 
> But your patch description says:
> 
>  > This patch implements toleration of dynamic memory operations....
> 
> But it seems you're still going to hit the same NOTIFY_BAD case above
> after your patch.  So something doesn't compute for me.  Could you
> explain more?

Yeah, the notifier code remains untouched as we still do not allow dynamic
memory operations _while_ our module is loaded. The patch allows the driver to
cope with DMEM operations that happened before the module was loaded, which
might result in a non-contiguous memory layout. When the driver registers
its global memory region in the system, the memory layout must be considered.

We chose the term "toleration" instead of "support" to illustrate this.

I'll put some more details into the changelog, incorporate the other comments
and send out a second version of the patch.

Thanks,
Alex

> 
> Second, a nit:
> 
>  > +#define EHCA_REG_MR 0
>  > +#define EHCA_REG_BUSMAP_MR (~0)
> 
> and you pass these as the reg_busmap parm in:
> 
>  >  int ehca_reg_mr(struct ehca_shca *shca,
>  >  		struct ehca_mr *e_mr,
>  >  		u64 *iova_start,
>  > @@ -991,7 +1031,8 @@
>  >  		struct ehca_pd *e_pd,
>  >  		struct ehca_mr_pginfo *pginfo,
>  >  		u32 *lkey, /*OUT*/
>  > -		u32 *rkey) /*OUT*/
>  > +		u32 *rkey, /*OUT*/
>  > +		int reg_busmap)
> 
> and test it as:
> 
>  > +	if (reg_busmap)
>  > +		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
>  > +	else
>  > +		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
> 
> So the ~0 for true looks a bit odd.  One option would be to make
> reg_busmap a bool, since that's how you're using it, but then you lose
> the nice self-documenting macro where you call things.
> 
> So I think it would be cleaner to do something like
> 
> enum ehca_reg_type {
> 	EHCA_REG_MR,
> 	EHCA_REG_BUSMAP_MR
> };
> 
> and make the "int reg_busmap" parameter into "enum ehca_reg_type reg_type"
> and have the code become
> 
> +	if (reg_type == EHCA_REG_BUSMAP_MR)
> +		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
> +	else if (reg_type == EHCA_REG_MR)
> +		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
> +	else
> +		ret = -EINVAL;
> 
> or something like that.
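
Agreed, we will do it like that in the next version; call sites then
read, e.g. (hypothetical lkey/rkey variables for brevity):

	ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
			  e_pd, &pginfo, &lkey, &rkey, EHCA_REG_MR);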
> 
>  > +struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
>  > +	.mapping_error = ehca_dma_mapping_error,
>  > +	.map_single = ehca_dma_map_single,
>  > +	.unmap_single = ehca_dma_unmap_single,
>  > +	.map_page = ehca_dma_map_page,
>  > +	.unmap_page = ehca_dma_unmap_page,
>  > +	.map_sg = ehca_dma_map_sg,
>  > +	.unmap_sg = ehca_dma_unmap_sg,
>  > +	.dma_address = ehca_dma_address,
>  > +	.dma_len = ehca_dma_len,
>  > +	.sync_single_for_cpu = ehca_dma_sync_single_for_cpu,
>  > +	.sync_single_for_device = ehca_dma_sync_single_for_device,
>  > +	.alloc_coherent = ehca_dma_alloc_coherent,
>  > +	.free_coherent = ehca_dma_free_coherent,
>  > +};
> 
> I always think structures like this are easier to read if you align the
> '=' signs.  But no big deal.
> 
>  > +	ret = ehca_create_busmap();
>  > +	if (ret) {
>  > +		ehca_gen_err("Cannot create busmap.");
>  > +		goto module_init2;
>  > +	}
>  > +
>  >  	ret = ibmebus_register_driver(&ehca_driver);
>  >  	if (ret) {
>  >  		ehca_gen_err("Cannot register eHCA device driver");
>  >  		ret = -EINVAL;
>  > -		goto module_init2;
>  > +		goto module_init3;
>  >  	}
>  >  
>  >  	ret = register_memory_notifier(&ehca_mem_nb);
>  >  	if (ret) {
>  >  		ehca_gen_err("Failed registering memory add/remove notifier");
>  > -		goto module_init3;
>  > +		goto module_init4;
> 
> Having to renumber unrelated things when something changes is why I
> don't like this style of error path labels.  But I think it's well and
> truly too late to fix that in ehca.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2.6.31 try 2] ehca: Tolerate dynamic memory operations and huge pages
  2009-06-09 13:59 ` Hannes Hering
@ 2009-06-16  7:10   ` Alexander Schmidt
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Schmidt @ 2009-06-16  7:10 UTC (permalink / raw)
  To: Hannes Hering
  Cc: rdreier, linux-kernel, ewg, linuxppc-dev, raisch, ossrosch,
	Hoang-Nam Nguyen

From: Hannes Hering <hering2@de.ibm.com>

This patch implements toleration of dynamic memory operations and 16 GB
gigantic pages. "Toleration" means that the driver can cope with dynamic
memory operations that happened before the driver was loaded; while the
ehca driver is loaded, dynamic memory operations remain prohibited. On
module load the driver walks through available system memory, determines
the usable memory ranges and then registers the kernel internal memory
region accordingly. The translation of address ranges is implemented via
a 3-level busmap.

Signed-off-by: Hannes Hering <hering2@de.ibm.com>

---
This patch is built and tested against infiniband.git. Please apply for 2.6.31.

 drivers/infiniband/hw/ehca/ehca_main.c |   20 +
 drivers/infiniband/hw/ehca/ehca_mrmw.c |  508 ++++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/ehca/ehca_mrmw.h |   13 
 3 files changed, 523 insertions(+), 18 deletions(-)

--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_main.c
+++ infiniband.git/drivers/infiniband/hw/ehca/ehca_main.c
@@ -52,7 +52,7 @@
 #include "ehca_tools.h"
 #include "hcp_if.h"
 
-#define HCAD_VERSION "0026"
+#define HCAD_VERSION "0027"
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
@@ -506,6 +506,7 @@ static int ehca_init_device(struct ehca_
 	shca->ib_device.detach_mcast	    = ehca_detach_mcast;
 	shca->ib_device.process_mad	    = ehca_process_mad;
 	shca->ib_device.mmap		    = ehca_mmap;
+	shca->ib_device.dma_ops		    = &ehca_dma_mapping_ops;
 
 	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
 		shca->ib_device.uverbs_cmd_mask |=
@@ -1028,17 +1029,23 @@ static int __init ehca_module_init(void)
 		goto module_init1;
 	}
 
+	ret = ehca_create_busmap();
+	if (ret) {
+		ehca_gen_err("Cannot create busmap.");
+		goto module_init2;
+	}
+
 	ret = ibmebus_register_driver(&ehca_driver);
 	if (ret) {
 		ehca_gen_err("Cannot register eHCA device driver");
 		ret = -EINVAL;
-		goto module_init2;
+		goto module_init3;
 	}
 
 	ret = register_memory_notifier(&ehca_mem_nb);
 	if (ret) {
 		ehca_gen_err("Failed registering memory add/remove notifier");
-		goto module_init3;
+		goto module_init4;
 	}
 
 	if (ehca_poll_all_eqs != 1) {
@@ -1053,9 +1060,12 @@ static int __init ehca_module_init(void)
 
 	return 0;
 
-module_init3:
+module_init4:
 	ibmebus_unregister_driver(&ehca_driver);
 
+module_init3:
+	ehca_destroy_busmap();
+
 module_init2:
 	ehca_destroy_slab_caches();
 
@@ -1073,6 +1083,8 @@ static void __exit ehca_module_exit(void
 
 	unregister_memory_notifier(&ehca_mem_nb);
 
+	ehca_destroy_busmap();
+
 	ehca_destroy_slab_caches();
 
 	ehca_destroy_comp_pool();
--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ infiniband.git/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -53,6 +53,38 @@
 /* max number of rpages (per hcall register_rpages) */
 #define MAX_RPAGES 512
 
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT        SECTION_SIZE_BITS
+#define EHCA_SECTSIZE          (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT     34
+#define EHCA_HUGEPAGE_SIZE     (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR        0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13                   /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000)               /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+
+static unsigned long ehca_mr_len;
+
+/*
+ * Memory map data structures
+ */
+struct ehca_dir_bmap {
+	u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+	struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+	struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
+
+static struct ehca_bmap *ehca_bmap;
+
 static struct kmem_cache *mr_cache;
 static struct kmem_cache *mw_cache;
 
@@ -68,6 +100,8 @@ enum ehca_mr_pgsize {
 #define EHCA_MR_PGSHIFT1M  20
 #define EHCA_MR_PGSHIFT16M 24
 
+static u64 ehca_map_vaddr(void *caddr);
+
 static u32 ehca_encode_hwpage_size(u32 pgsize)
 {
 	int log = ilog2(pgsize);
@@ -135,7 +169,8 @@ struct ib_mr *ehca_get_dma_mr(struct ib_
 			goto get_dma_mr_exit0;
 		}
 
-		ret = ehca_reg_maxmr(shca, e_maxmr, (u64 *)KERNELBASE,
+		ret = ehca_reg_maxmr(shca, e_maxmr,
+				     (void *)ehca_map_vaddr((void *)KERNELBASE),
 				     mr_access_flags, e_pd,
 				     &e_maxmr->ib.ib_mr.lkey,
 				     &e_maxmr->ib.ib_mr.rkey);
@@ -251,7 +286,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib
 
 		ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
 				  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-				  &e_mr->ib.ib_mr.rkey);
+				  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
 		if (ret) {
 			ib_mr = ERR_PTR(ret);
 			goto reg_phys_mr_exit1;
@@ -370,7 +405,7 @@ reg_user_mr_fallback:
 
 	ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
 			  e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
-			  &e_mr->ib.ib_mr.rkey);
+			  &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
 	if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
 		ehca_warn(pd->device, "failed to register mr "
 			  "with hwpage_size=%llx", hwpage_size);
@@ -794,7 +829,7 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_
 	ret = ehca_reg_mr(shca, e_fmr, NULL,
 			  fmr_attr->max_pages * (1 << fmr_attr->page_shift),
 			  mr_access_flags, e_pd, &pginfo,
-			  &tmp_lkey, &tmp_rkey);
+			  &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
 	if (ret) {
 		ib_fmr = ERR_PTR(ret);
 		goto alloc_fmr_exit1;
@@ -983,6 +1018,10 @@ free_fmr_exit0:
 
 /*----------------------------------------------------------------------*/
 
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+				   struct ehca_mr *e_mr,
+				   struct ehca_mr_pginfo *pginfo);
+
 int ehca_reg_mr(struct ehca_shca *shca,
 		struct ehca_mr *e_mr,
 		u64 *iova_start,
@@ -991,7 +1030,8 @@ int ehca_reg_mr(struct ehca_shca *shca,
 		struct ehca_pd *e_pd,
 		struct ehca_mr_pginfo *pginfo,
 		u32 *lkey, /*OUT*/
-		u32 *rkey) /*OUT*/
+		u32 *rkey, /*OUT*/
+		enum ehca_reg_type reg_type)
 {
 	int ret;
 	u64 h_ret;
@@ -1015,7 +1055,13 @@ int ehca_reg_mr(struct ehca_shca *shca,
 
 	e_mr->ipz_mr_handle = hipzout.handle;
 
-	ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+	if (reg_type == EHCA_REG_BUSMAP_MR)
+		ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+	else if (reg_type == EHCA_REG_MR)
+		ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+	else
+		ret = -EINVAL;
+
 	if (ret)
 		goto ehca_reg_mr_exit1;
 
@@ -1316,7 +1362,7 @@ int ehca_rereg_mr(struct ehca_shca *shca
 		e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
 
 		ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
-				  e_pd, pginfo, lkey, rkey);
+				  e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
 		if (ret) {
 			u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
 			memcpy(&e_mr->flags, &(save_mr.flags),
@@ -1409,7 +1455,7 @@ int ehca_unmap_one_fmr(struct ehca_shca 
 	ret = ehca_reg_mr(shca, e_fmr, NULL,
 			  (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
 			  e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
-			  &tmp_rkey);
+			  &tmp_rkey, EHCA_REG_MR);
 	if (ret) {
 		u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
 		memcpy(&e_fmr->flags, &(save_mr.flags),
@@ -1478,6 +1524,90 @@ ehca_reg_smr_exit0:
 } /* end ehca_reg_smr() */
 
 /*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+	unsigned long ret = idx;
+	ret |= dir << EHCA_DIR_INDEX_SHIFT;
+	ret |= top << EHCA_TOP_INDEX_SHIFT;
+	return abs_to_virt(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+	((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+			       struct ehca_shca *shca, struct ehca_mr *mr,
+			       struct ehca_mr_pginfo *pginfo)
+{
+	u64 h_ret = 0;
+	unsigned long page = 0;
+	u64 rpage = virt_to_abs(kpage);
+	int page_count;
+
+	void *sectbase = ehca_calc_sectbase(top, dir, idx);
+	if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+		ehca_err(&shca->ib_device, "reg_mr_section will probably fail: "
+					   "hwpage_size does not fit to "
+					   "section start address");
+	}
+	page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
+	while (page < page_count) {
+		u64 rnum;
+		for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+		     rnum++) {
+			void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+			kpage[rnum] = virt_to_abs(pg);
+		}
+
+		h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+			ehca_encode_hwpage_size(pginfo->hwpage_size),
+			0, rpage, rnum);
+
+		if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+			ehca_err(&shca->ib_device, "register_rpage_mr failed");
+			return h_ret;
+		}
+	}
+	return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+				struct ehca_shca *shca, struct ehca_mr *mr,
+				struct ehca_mr_pginfo *pginfo)
+{
+	u64 hret = H_SUCCESS;
+	int idx;
+
+	for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+			continue;
+
+		hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+					   pginfo);
+		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+			return hret;
+	}
+	return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+				    struct ehca_mr *mr,
+				    struct ehca_mr_pginfo *pginfo)
+{
+	u64 hret = H_SUCCESS;
+	int dir;
+
+	for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+			continue;
+
+		hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+		if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+			return hret;
+	}
+	return hret;
+}
 
 /* register internal max-MR to internal SHCA */
 int ehca_reg_internal_maxmr(
@@ -1495,6 +1625,11 @@ int ehca_reg_internal_maxmr(
 	u32 num_hwpages;
 	u64 hw_pgsize;
 
+	if (!ehca_bmap) {
+		ret = -EFAULT;
+		goto ehca_reg_internal_maxmr_exit0;
+	}
+
 	e_mr = ehca_mr_new();
 	if (!e_mr) {
 		ehca_err(&shca->ib_device, "out of memory");
@@ -1504,8 +1639,8 @@ int ehca_reg_internal_maxmr(
 	e_mr->flags |= EHCA_MR_FLAG_MAXMR;
 
 	/* register internal max-MR on HCA */
-	size_maxmr = (u64)high_memory - PAGE_OFFSET;
-	iova_start = (u64 *)KERNELBASE;
+	size_maxmr = ehca_mr_len;
+	iova_start = (u64 *)ehca_map_vaddr((void *)KERNELBASE);
 	ib_pbuf.addr = 0;
 	ib_pbuf.size = size_maxmr;
 	num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
@@ -1524,7 +1659,7 @@ int ehca_reg_internal_maxmr(
 
 	ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
 			  &pginfo, &e_mr->ib.ib_mr.lkey,
-			  &e_mr->ib.ib_mr.rkey);
+			  &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
 	if (ret) {
 		ehca_err(&shca->ib_device, "reg of internal max MR failed, "
 			 "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
@@ -2077,8 +2212,8 @@ int ehca_mr_is_maxmr(u64 size,
 		     u64 *iova_start)
 {
 	/* a MR is treated as max-MR only if it fits following: */
-	if ((size == ((u64)high_memory - PAGE_OFFSET)) &&
-	    (iova_start == (void *)KERNELBASE)) {
+	if ((size == ehca_mr_len) &&
+	    (iova_start == (void *)ehca_map_vaddr((void *)KERNELBASE))) {
 		ehca_gen_dbg("this is a max-MR");
 		return 1;
 	} else
@@ -2184,3 +2319,350 @@ void ehca_cleanup_mrmw_cache(void)
 	if (mw_cache)
 		kmem_cache_destroy(mw_cache);
 }
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+				     int dir)
+{
+	if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+		ehca_top_bmap->dir[dir] =
+			kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+		if (!ehca_top_bmap->dir[dir])
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+	}
+	return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+	if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+		ehca_bmap->top[top] =
+			kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+		if (!ehca_bmap->top[top])
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+	}
+	return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+	return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+	int top, dir;
+
+	if (!ehca_bmap)
+		return;
+
+	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]))
+			continue;
+		for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+			if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+				continue;
+
+			kfree(ehca_bmap->top[top]->dir[dir]);
+		}
+
+		kfree(ehca_bmap->top[top]);
+	}
+
+	kfree(ehca_bmap);
+	ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long i, start_section, end_section;
+	int top, dir, idx;
+
+	if (!nr_pages)
+		return 0;
+
+	if (!ehca_bmap) {
+		ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+		if (!ehca_bmap)
+			return -ENOMEM;
+		/* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+		memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+	}
+
+	start_section = phys_to_abs(pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+	end_section = phys_to_abs((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
+	for (i = start_section; i < end_section; i++) {
+		int ret;
+		top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+		dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+		idx = i & EHCA_INDEX_MASK;
+
+		ret = ehca_init_bmap(ehca_bmap, top, dir);
+		if (ret) {
+			ehca_destroy_busmap();
+			return ret;
+		}
+		ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+		ehca_mr_len += EHCA_SECTSIZE;
+	}
+	return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+	int page_order;
+
+	if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+		return 0;
+
+	page_order = compound_order(pfn_to_page(pfn));
+	if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+		return 0;
+
+	return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+				       unsigned long total_nr_pages, void *arg)
+{
+	int ret;
+	unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+	if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+		return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+	/* Given chunk is >= 16GB -> check for hugepages */
+	start_pfn = initial_pfn;
+	end_pfn = initial_pfn + total_nr_pages;
+	pfn = start_pfn;
+
+	while (pfn < end_pfn) {
+		if (ehca_is_hugepage(pfn)) {
+			/* Add mem found in front of the hugepage */
+			nr_pages = pfn - start_pfn;
+			ret = ehca_update_busmap(start_pfn, nr_pages);
+			if (ret)
+				return ret;
+			/* Skip the hugepage */
+			pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+			start_pfn = pfn;
+		} else
+			pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+	}
+
+	/* Add mem found behind the hugepage(s)  */
+	nr_pages = pfn - start_pfn;
+	return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+	int ret;
+
+	ehca_mr_len = 0;
+	ret = walk_memory_resource(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+				   ehca_create_busmap_callback);
+	return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+				   struct ehca_mr *e_mr,
+				   struct ehca_mr_pginfo *pginfo)
+{
+	int top;
+	u64 hret, *kpage;
+
+	kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	if (!kpage) {
+		ehca_err(&shca->ib_device, "kpage alloc failed");
+		return -ENOMEM;
+	}
+	for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+		if (!ehca_bmap_valid(ehca_bmap->top[top]))
+			continue;
+		hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+		if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+			break;
+	}
+
+	ehca_free_fw_ctrlblock(kpage);
+
+	if (hret == H_SUCCESS)
+		return 0; /* Everything is fine */
+	else {
+		ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+				 "h_ret=%lli e_mr=%p top=%x lkey=%x "
+				 "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+				 e_mr->ib.ib_mr.lkey,
+				 shca->ipz_hca_handle.handle,
+				 e_mr->ipz_mr_handle.handle);
+		return ehca2ib_return_code(hret);
+	}
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+	int top, dir, idx;
+	unsigned long abs_addr, offset;
+	u64 entry;
+
+	if (!ehca_bmap)
+		return EHCA_INVAL_ADDR;
+
+	abs_addr = virt_to_abs(caddr);
+	top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+	if (!ehca_bmap_valid(ehca_bmap->top[top]))
+		return EHCA_INVAL_ADDR;
+
+	dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+	if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+		return EHCA_INVAL_ADDR;
+
+	idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+	entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+	if (ehca_bmap_valid(entry)) {
+		offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+		return entry | offset;
+	} else
+		return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+	return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+			       size_t size, enum dma_data_direction direction)
+{
+	if (cpu_addr)
+		return ehca_map_vaddr(cpu_addr);
+	else
+		return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+				  enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+			     unsigned long offset, size_t size,
+			     enum dma_data_direction direction)
+{
+	u64 addr;
+
+	if (offset + size > PAGE_SIZE)
+		return EHCA_INVAL_ADDR;
+
+	addr = ehca_map_vaddr(page_address(page));
+	if (!ehca_dma_mapping_error(dev, addr))
+		addr += offset;
+
+	return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+				enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+			   int nents, enum dma_data_direction direction)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i) {
+		u64 addr;
+		addr = ehca_map_vaddr(sg_virt(sg));
+		if (ehca_dma_mapping_error(dev, addr))
+			return 0;
+
+		sg->dma_address = addr;
+		sg->dma_length = sg->length;
+	}
+	return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+			      int nents, enum dma_data_direction direction)
+{
+	/* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_address(struct ib_device *dev, struct scatterlist *sg)
+{
+	return sg->dma_address;
+}
+
+static unsigned int ehca_dma_len(struct ib_device *dev, struct scatterlist *sg)
+{
+	return sg->length;
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+					 size_t size,
+					 enum dma_data_direction dir)
+{
+	dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+					    size_t size,
+					    enum dma_data_direction dir)
+{
+	dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+				     u64 *dma_handle, gfp_t flag)
+{
+	struct page *p;
+	void *addr = NULL;
+	u64 dma_addr;
+
+	p = alloc_pages(flag, get_order(size));
+	if (p) {
+		addr = page_address(p);
+		dma_addr = ehca_map_vaddr(addr);
+		if (ehca_dma_mapping_error(dev, dma_addr)) {
+			free_pages((unsigned long)addr, get_order(size));
+			return NULL;
+		}
+		if (dma_handle)
+			*dma_handle = dma_addr;
+		return addr;
+	}
+	return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+				   void *cpu_addr, u64 dma_handle)
+{
+	if (cpu_addr && size)
+		free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+	.mapping_error          = ehca_dma_mapping_error,
+	.map_single             = ehca_dma_map_single,
+	.unmap_single           = ehca_dma_unmap_single,
+	.map_page               = ehca_dma_map_page,
+	.unmap_page             = ehca_dma_unmap_page,
+	.map_sg                 = ehca_dma_map_sg,
+	.unmap_sg               = ehca_dma_unmap_sg,
+	.dma_address            = ehca_dma_address,
+	.dma_len                = ehca_dma_len,
+	.sync_single_for_cpu    = ehca_dma_sync_single_for_cpu,
+	.sync_single_for_device = ehca_dma_sync_single_for_device,
+	.alloc_coherent         = ehca_dma_alloc_coherent,
+	.free_coherent          = ehca_dma_free_coherent,
+};
--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_mrmw.h
+++ infiniband.git/drivers/infiniband/hw/ehca/ehca_mrmw.h
@@ -42,6 +42,11 @@
 #ifndef _EHCA_MRMW_H_
 #define _EHCA_MRMW_H_
 
+enum ehca_reg_type {
+	EHCA_REG_MR,
+	EHCA_REG_BUSMAP_MR
+};
+
 int ehca_reg_mr(struct ehca_shca *shca,
 		struct ehca_mr *e_mr,
 		u64 *iova_start,
@@ -50,7 +55,8 @@ int ehca_reg_mr(struct ehca_shca *shca,
 		struct ehca_pd *e_pd,
 		struct ehca_mr_pginfo *pginfo,
 		u32 *lkey,
-		u32 *rkey);
+		u32 *rkey,
+		enum ehca_reg_type reg_type);
 
 int ehca_reg_mr_rpages(struct ehca_shca *shca,
 		       struct ehca_mr *e_mr,
@@ -118,4 +124,9 @@ void ehca_mrmw_reverse_map_acl(const u32
 
 void ehca_mr_deletenew(struct ehca_mr *mr);
 
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
 #endif  /*_EHCA_MRMW_H_*/

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [ewg] Re: [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
  2009-06-16  7:08     ` Alexander Schmidt
@ 2009-06-16 16:10       ` Roland Dreier
  0 siblings, 0 replies; 19+ messages in thread
From: Roland Dreier @ 2009-06-16 16:10 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: linux-kernel, ewg, linuxppc-dev, raisch, Hoang-Nam Nguyen


 > Yeah, the notifier code remains untouched as we still do not allow dynamic
 > memory operations _while_ our module is loaded. The patch allows the driver to
 > cope with DMEM operations that happened before the module was loaded, which
 > might result in a non-contiguous memory layout. When the driver registers
 > its global memory region in the system, the memory layout must be considered.
 > 
 > We chose the term "toleration" instead of "support" to illustrate this.

I see.  So things just silently broke in some cases when the driver was
loaded after operations you didn't tolerate?

Anyway, thanks for the explanation.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [ewg] Re: [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages
  2009-06-16 16:10       ` Roland Dreier
@ 2009-06-17  6:41       ` Alexander Schmidt
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Schmidt @ 2009-06-17  6:41 UTC (permalink / raw)
  To: Roland Dreier; +Cc: linuxppc-dev, Hoang-Nam Nguyen, raisch, linux-kernel, ewg

On Tue, 16 Jun 2009 09:10:39 -0700
Roland Dreier <rdreier@cisco.com> wrote:

> 
>  > Yeah, the notifier code remains untouched as we still do not allow dynamic
>  > memory operations _while_ our module is loaded. The patch allows the driver to
>  > cope with DMEM operations that happened before the module was loaded, which
>  > might result in a non-contiguous memory layout. When the driver registers
>  > its global memory region in the system, the memory layout must be considered.
>  > 
>  > We chose the term "toleration" instead of "support" to illustrate this.
> 
> I see.  So things just silently broke in some cases when the driver was
> loaded after operations you didn't tolerate?
> 
> Anyway, thanks for the explanation.

Well, things did not break silently. The registration of the MR failed with
an error code which was reported to userspace.
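
To spell out the old failure path (reconstructed from the code the
patch replaces, slightly simplified):

	/* old: the internal max-MR covered one flat range */
	size_maxmr = (u64)high_memory - PAGE_OFFSET;
	iova_start = (u64 *)KERNELBASE;
	ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
			  &pginfo, &lkey, &rkey);
	/* with a DMEM hole inside that range, the register_rpages
	 * hcall fails, ehca_reg_mr() returns an error, and the MR
	 * registration is reported as failed instead of mapping the
	 * wrong pages */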

Will you push the patch for .31 or .32?

Thanks,
Alex

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [ewg] Re: [PATCH 2.6.31 try 2] ehca: Tolerate dynamic memory operations and huge pages
  2009-06-16  7:10   ` Alexander Schmidt
@ 2009-06-23  5:19     ` Roland Dreier
  0 siblings, 0 replies; 19+ messages in thread
From: Roland Dreier @ 2009-06-23  5:19 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: Hannes Hering, linux-kernel, linuxppc-dev, raisch, ewg, Hoang-Nam Nguyen

thanks, applied.

 > -#define HCAD_VERSION "0026"
 > +#define HCAD_VERSION "0027"

the driver version was already 0027 (since bde2cfaf), so I dropped this chunk.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [ewg] Re: [PATCH 2.6.31 try 2] ehca: Tolerate dynamic memory operations and huge pages
  2009-06-23  5:19     ` Roland Dreier
@ 2009-06-23  8:11       ` Alexander Schmidt
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Schmidt @ 2009-06-23  8:11 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Hannes Hering, linux-kernel, linuxppc-dev, raisch, ewg, Hoang-Nam Nguyen

On Mon, 22 Jun 2009 22:19:21 -0700
Roland Dreier <rdreier@cisco.com> wrote:

> thanks, applied.
> 
>  > -#define HCAD_VERSION "0026"
>  > +#define HCAD_VERSION "0027"
> 
> the driver version was already 0027 (since bde2cfaf), so I dropped this chunk.

thank you for applying. We would like to increase the version number for
this patch, so please also apply the following:

ehca: Increment version number for DMEM toleration

Signed-off-by: Alexander Schmidt <alexs@linux.vnet.ibm.com>
---
 drivers/infiniband/hw/ehca/ehca_main.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- infiniband.git.orig/drivers/infiniband/hw/ehca/ehca_main.c
+++ infiniband.git/drivers/infiniband/hw/ehca/ehca_main.c
@@ -52,7 +52,7 @@
 #include "ehca_tools.h"
 #include "hcp_if.h"
 
-#define HCAD_VERSION "0027"
+#define HCAD_VERSION "0028"
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [ewg] Re: [PATCH 2.6.31 try 2] ehca: Tolerate dynamic memory operations and huge pages
  2009-06-23  8:11       ` Alexander Schmidt
@ 2009-06-23 17:30         ` Roland Dreier
  0 siblings, 0 replies; 19+ messages in thread
From: Roland Dreier @ 2009-06-23 17:30 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: Hannes Hering, linux-kernel, linuxppc-dev, raisch, ewg, Hoang-Nam Nguyen

applied...

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2009-06-23 17:31 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-09 13:59 [PATCH 2.6.31] ehca: Tolerate dynamic memory operations and huge pages Hannes Hering
2009-06-10  0:02 ` Michael Ellerman
2009-06-10  8:54   ` Hannes Hering
2009-06-13  4:50 ` Roland Dreier
2009-06-16  7:08   ` Alexander Schmidt
2009-06-16 16:10     ` [ewg] " Roland Dreier
2009-06-17  6:41       ` Alexander Schmidt
2009-06-16  7:10 ` [PATCH 2.6.31 try 2] " Alexander Schmidt
2009-06-23  5:19   ` [ewg] " Roland Dreier
2009-06-23  8:11     ` Alexander Schmidt
2009-06-23 17:30       ` Roland Dreier
