dev.dpdk.org archive mirror
* [PATCH] kni: add IOVA va support for kni
@ 2018-09-27 10:49 Kiran Kumar
  2018-09-27 10:58 ` Burakov, Anatoly
                   ` (2 more replies)
  0 siblings, 3 replies; 251+ messages in thread
From: Kiran Kumar @ 2018-09-27 10:49 UTC (permalink / raw)
  To: ferruh.yigit
  Cc: dev, anatoly.burakov, Jacob,  Jerin, hemant.agrawal,
	jianfeng.tan, Kokkilagadda, Kiran

With the current KNI implementation, the kernel module works only in
IOVA=PA mode. This patch adds support for the kernel module to work
in IOVA=VA mode.

The idea is to maintain a mapping in the KNI module between user pages and
kernel pages, and in the fast path perform a lookup in this table to get
the kernel virtual address for the corresponding user virtual address.

In IOVA=VA mode, the memory allocated to the pool is physically
and virtually contiguous. We take advantage of this and create a
mapping in the kernel. In the kernel we need a mapping for the queues
(tx_q, rx_q, ... slow path) and for the mbuf memory (fast path).

At KNI init time, in the slow path, we create a mapping for the
queues and mbufs using get_user_pages, similar to af_xdp. Using the pool
memory base address, we create a page map table for the mbufs, which
we use in the fast path for kernel page translation.

At KNI init time, we pass the base address of the pool and the size of
the pool to the kernel. In the kernel, using the get_user_pages API, we get
the pages with size PAGE_SIZE and store the mapping and the start address
of the user space region in a table.

In the fast path, for any user address, shift right by PAGE_SHIFT
(user_addr >> PAGE_SHIFT) and subtract the start page from this value;
the result is the index of the kernel page within the page map table.
Adding the in-page offset to this kernel page address gives the kernel
address for the user virtual address.

For example, the user pool base address is X and the size is S, which we
pass to the kernel. In the kernel we create a mapping for this using
get_user_pages. Our page map table will look like
[Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2), ...] and the user start page will be U
(obtained from X >> PAGE_SHIFT).

For any user address Z we get the index into the page map table using
((Z >> PAGE_SHIFT) - U). Adding the offset (Z & (PAGE_SIZE - 1)) to the
page address at that index gives the kernel virtual address.

Signed-off-by: Kiran Kumar <kkokkilagadda@caviumnetworks.com>
---
 kernel/linux/kni/kni_dev.h                    |  37 +++
 kernel/linux/kni/kni_misc.c                   | 211 +++++++++++++++++-
 kernel/linux/kni/kni_net.c                    | 112 ++++++++--
 lib/librte_eal/linuxapp/eal/eal.c             |   9 -
 .../eal/include/exec-env/rte_kni_common.h     |   8 +
 lib/librte_kni/rte_kni.c                      |  21 ++
 6 files changed, 363 insertions(+), 35 deletions(-)

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index 6275ef27f..6a92da497 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -29,10 +29,47 @@
 
 #define MBUF_BURST_SZ 32
 
+struct iova_page_info {
+	/* User to kernel page table map, used for
+	 * fast path lookup
+	 */
+	struct mbuf_page {
+		void *addr;
+	} *page_map;
+
+	/* Page mask */
+	u64 page_mask;
+
+	/* Start page for user address */
+	u64 start_page;
+
+	struct page_info {
+		/* Physical pages returned by get_user_pages */
+		struct page **pgs;
+
+		/* Number of Pages returned by get_user_pages */
+		u32 npgs;
+	} page_info;
+
+	/* Queue info */
+	struct page_info tx_q;
+	struct page_info rx_q;
+	struct page_info alloc_q;
+	struct page_info free_q;
+	struct page_info req_q;
+	struct page_info resp_q;
+	struct page_info sync_va;
+};
+
 /**
  * A structure describing the private information for a kni device.
  */
 struct kni_dev {
+	/* Page info for IOVA=VA mode */
+	struct iova_page_info va_info;
+	/* IOVA mode 0 = PA, 1 = VA */
+	uint8_t iova_mode;
+
 	/* kni list */
 	struct list_head list;
 
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index fa69f8e63..2627c1f69 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -197,6 +197,117 @@ kni_dev_remove(struct kni_dev *dev)
 	return 0;
 }
 
+static void
+kni_unpin_pages(struct page_info *mem)
+{
+	u32 i;
+
+	/* Set the user pages as dirty, so that these pages will not be
+	 * allocated to other applications until we release them.
+	 */
+	for (i = 0; i < mem->npgs; i++) {
+		struct page *page = mem->pgs[i];
+
+		set_page_dirty_lock(page);
+		put_page(page);
+	}
+
+	kfree(mem->pgs);
+	mem->pgs = NULL;
+}
+
+static void
+kni_clean_queue(struct page_info *mem)
+{
+	if (mem->pgs) {
+		set_page_dirty_lock(mem->pgs[0]);
+		put_page(mem->pgs[0]);
+		kfree(mem->pgs);
+		mem->pgs = NULL;
+	}
+}
+
+static void
+kni_cleanup_iova(struct iova_page_info *mem)
+{
+	kni_unpin_pages(&mem->page_info);
+	kfree(mem->page_map);
+	mem->page_map = NULL;
+
+	kni_clean_queue(&mem->tx_q);
+	kni_clean_queue(&mem->rx_q);
+	kni_clean_queue(&mem->alloc_q);
+	kni_clean_queue(&mem->free_q);
+	kni_clean_queue(&mem->req_q);
+	kni_clean_queue(&mem->resp_q);
+	kni_clean_queue(&mem->sync_va);
+}
+
+int
+kni_pin_pages(void *address, size_t size, struct page_info *mem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	/* Get at least one page */
+	if (size < PAGE_SIZE)
+		size = PAGE_SIZE;
+
+	/* Compute number of user pages based on page size */
+	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	/* Allocate memory for the pages */
+	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
+		      GFP_KERNEL | __GFP_NOWARN);
+	if (!mem->pgs) {
+		pr_err("%s: -ENOMEM\n", __func__);
+		return -ENOMEM;
+	}
+
+	down_write(&current->mm->mmap_sem);
+
+	/* Get the user pages from the user address*/
+	npgs = get_user_pages((u64)address, mem->npgs,
+				gup_flags, &mem->pgs[0], NULL);
+	up_write(&current->mm->mmap_sem);
+
+	/* We didn't get all the requested pages, throw error */
+	if (npgs != mem->npgs) {
+		if (npgs >= 0) {
+			mem->npgs = npgs;
+			err = -ENOMEM;
+			pr_err("%s: -ENOMEM\n", __func__);
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+	return 0;
+
+out_pin:
+	kni_unpin_pages(mem);
+out_pgs:
+	kfree(mem->pgs);
+	mem->pgs = NULL;
+	return err;
+}
+
+static void*
+kni_map_queue(struct kni_dev *kni, u64 addr,
+			   struct page_info *mm)
+{
+	/* Map at least 1 page */
+	if (kni_pin_pages((void *)addr, PAGE_SIZE,
+			  mm) != 0) {
+		pr_err("Unable to pin pages\n");
+		return NULL;
+	}
+
+	return (page_address(mm->pgs[0]) +
+			   (addr & kni->va_info.page_mask));
+}
+
 static int
 kni_release(struct inode *inode, struct file *file)
 {
@@ -224,6 +335,11 @@ kni_release(struct inode *inode, struct file *file)
 		}
 
 		kni_dev_remove(dev);
+
+		/* IOVA=VA mode, unpin pages */
+		if (likely(dev->iova_mode == 1))
+			kni_cleanup_iova(&dev->va_info);
+
 		list_del(&dev->list);
 	}
 	up_write(&knet->kni_list_lock);
@@ -364,18 +480,94 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
+	kni->iova_mode = dev_info.iova_mode;
 
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (kni->iova_mode) {
+		u64 mbuf_addr;
+		int i;
+
+		/* map userspace memory info */
+		mbuf_addr = (u64)dev_info.mbuf_va;
+
+		/* Pre compute page mask, used in fast path */
+		kni->va_info.page_mask = (u64)(PAGE_SIZE - 1);
 
+		/* Store the start page address; this is the reference
+		 * for all the user virtual addresses
+		 */
+		kni->va_info.start_page = (mbuf_addr >> PAGE_SHIFT);
+
+		/* Get and pin the user pages */
+		if (kni_pin_pages(dev_info.mbuf_va, dev_info.mbuf_pool_size,
+			      &kni->va_info.page_info) != 0) {
+			pr_err("Unable to pin pages\n");
+			return -1;
+		}
+
+		/* Page map table between user and kernel pages */
+		kni->va_info.page_map = kcalloc(kni->va_info.page_info.npgs,
+						   sizeof(struct mbuf_page),
+						   GFP_KERNEL);
+		if (kni->va_info.page_map == NULL) {
+			pr_err("Out of memory\n");
+			return -ENOMEM;
+		}
+
+		/* Convert the user pages to kernel pages */
+		for (i = 0; i < kni->va_info.page_info.npgs; i++) {
+			kni->va_info.page_map[i].addr =
+				page_address(kni->va_info.page_info.pgs[i]);
+		}
+
+		/* map queues */
+		kni->tx_q = kni_map_queue(kni, dev_info.tx_phys,
+					  &kni->va_info.tx_q);
+		if (kni->tx_q == NULL)
+			goto iova_err;
+
+		kni->rx_q = kni_map_queue(kni, dev_info.rx_phys,
+					  &kni->va_info.rx_q);
+		if (kni->rx_q == NULL)
+			goto iova_err;
+
+		kni->alloc_q = kni_map_queue(kni, dev_info.alloc_phys,
+					     &kni->va_info.alloc_q);
+		if (kni->alloc_q == NULL)
+			goto iova_err;
+
+		kni->free_q = kni_map_queue(kni, dev_info.free_phys,
+					    &kni->va_info.free_q);
+		if (kni->free_q == NULL)
+			goto iova_err;
+
+		kni->req_q = kni_map_queue(kni, dev_info.req_phys,
+					   &kni->va_info.req_q);
+		if (kni->req_q == NULL)
+			goto iova_err;
+
+		kni->resp_q = kni_map_queue(kni, dev_info.resp_phys,
+					    &kni->va_info.resp_q);
+		if (kni->resp_q == NULL)
+			goto iova_err;
+
+		kni->sync_kva = kni_map_queue(kni, dev_info.sync_phys,
+					      &kni->va_info.sync_va);
+		if (kni->sync_kva == NULL)
+			goto iova_err;
+	} else {
+		/* Address translation for IOVA=PA mode */
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	}
+	kni->sync_va = dev_info.sync_va;
 	kni->mbuf_size = dev_info.mbuf_size;
 
+
 	pr_debug("tx_phys:      0x%016llx, tx_q addr:      0x%p\n",
 		(unsigned long long) dev_info.tx_phys, kni->tx_q);
 	pr_debug("rx_phys:      0x%016llx, rx_q addr:      0x%p\n",
@@ -475,6 +667,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	up_write(&knet->kni_list_lock);
 
 	return 0;
+iova_err:
+	kni_cleanup_iova(&kni->va_info);
+	return -1;
 }
 
 static int
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index 7fcfa106c..d6d76a32f 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,25 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+
+/* Get the kernel address from the user address using
+ * page map table. Will be used only in IOVA=VA mode
+ */
+static inline void*
+get_kva(uint64_t usr_addr, struct kni_dev *kni)
+{
+	uint32_t index;
+	/* User page - start user page will give the index
+	 * with in the page map table
+	 */
+	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
+
+	/* Add the offset to the page address */
+	return (kni->va_info.page_map[index].addr +
+		(usr_addr & kni->va_info.page_mask));
+
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -181,7 +200,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = get_kva((u64)(kni->pa[i]), kni);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}
 
@@ -258,8 +280,15 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = get_kva((u64)pkt_pa, kni);
+			data_kva = (uint8_t *)pkt_kva +
+				(sizeof(struct rte_kni_mbuf) +
+				 pkt_kva->data_off);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -330,9 +359,15 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = get_kva((u64)kni->pa[i], kni);
+			data_kva = (uint8_t *)kva +
+				(sizeof(struct rte_kni_mbuf) + kva->data_off);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = dev_alloc_skb(len + 2);
@@ -358,8 +393,17 @@ kni_net_rx_normal(struct kni_dev *kni)
 				if (!kva->next)
 					break;
 
-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = get_kva(
+						(u64)va2pa(kva->next, kva),
+						kni);
+					data_kva = (uint8_t *)kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 kva->data_off);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}
 
@@ -429,14 +473,31 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
-			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
-			kni->va[i] = pa2va(kni->pa[i], kva);
+			if (likely(kni->iova_mode == 1)) {
+				kva = get_kva((u64)(kni->pa[i]), kni);
+				len = kva->pkt_len;
+				data_kva = (uint8_t *)kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 kva->data_off);
+				kni->va[i] = pa2va(kni->pa[i], kva);
+				alloc_kva = get_kva((u64)(kni->alloc_pa[i]),
+						    kni);
+				alloc_data_kva = (uint8_t *)alloc_kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 alloc_kva->data_off);
+				kni->alloc_va[i] = pa2va(kni->alloc_pa[i],
+							 alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				len = kva->pkt_len;
+				data_kva = kva2data_kva(kva);
+				kni->va[i] = pa2va(kni->pa[i], kva);
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
-			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+				kni->alloc_va[i] = pa2va(kni->alloc_pa[i],
+							 alloc_kva);
+			}
 
 			memcpy(alloc_data_kva, data_kva, len);
 			alloc_kva->pkt_len = len;
@@ -502,9 +563,15 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = get_kva((u64)(kni->pa[i]), kni);
+			data_kva = (uint8_t *)kva +
+				(sizeof(struct rte_kni_mbuf) + kva->data_off);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = dev_alloc_skb(len + 2);
@@ -540,8 +607,17 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;
 
-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = get_kva(
+						(u64)(va2pa(kva->next, kva)),
+						kni);
+					data_kva = (uint8_t *)kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 kva->data_off);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index e59ac6577..7d5b2ebfa 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -875,15 +875,6 @@ rte_eal_init(int argc, char **argv)
 	/* autodetect the iova mapping mode (default is iova_pa) */
 	rte_eal_get_configuration()->iova_mode = rte_bus_get_iommu_class();
 
-	/* Workaround for KNI which requires physical address to work */
-	if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
-			rte_eal_check_module("rte_kni") == 1) {
-		rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
-		RTE_LOG(WARNING, EAL,
-			"Some devices want IOVA as VA but PA will be used because.. "
-			"KNI module inserted\n");
-	}
-
 	if (internal_config.no_hugetlbfs == 0) {
 		/* rte_config isn't initialized yet */
 		ret = internal_config.process_type == RTE_PROC_PRIMARY ?
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index cfa9448bd..706756e6c 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -122,6 +122,14 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	char mac_addr[6];
+
+	/* IOVA mode. 1 = VA, 0 = PA */
+	uint8_t iova_mode;
+
+	/* Pool size, will be used in kernel to map the
+	 * user pages
+	 */
+	uint64_t mbuf_pool_size;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 65f6a2b03..1e79e68f1 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -397,6 +397,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	ctx->slot_id = slot->id;
 	ctx->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+	if (dev_info.iova_mode) {
+		struct rte_mempool_memhdr *hdr;
+		uint64_t pool_size = 0;
+
+		/* In each pool header chunk, we will maintain the
+		 * base address of the pool. This chunk is physically and
+		 * virtually contiguous.
+		 * This approach will work, only if the allocated pool
+		 * memory is contiguous, else it won't work
+		 */
+		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
+		dev_info.mbuf_va = (void *)(hdr->addr);
+
+		/* Traverse the list and get the total size of the pool */
+		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
+			pool_size += hdr->len;
+		}
+		dev_info.mbuf_pool_size = pool_size +
+			pktmbuf_pool->mz->len;
+	}
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	KNI_MEM_CHECK(ret < 0);
 
-- 
2.17.1


* Re: [PATCH] kni: add IOVA va support for kni
  2018-09-27 10:49 [PATCH] kni: add IOVA va support for kni Kiran Kumar
@ 2018-09-27 10:58 ` Burakov, Anatoly
  2018-10-02 17:05 ` Ferruh Yigit
  2019-04-01  9:51 ` [PATCH v2] " Kiran Kumar Kokkilagadda
  2 siblings, 0 replies; 251+ messages in thread
From: Burakov, Anatoly @ 2018-09-27 10:58 UTC (permalink / raw)
  To: Kiran Kumar, ferruh.yigit
  Cc: dev, Jacob, Jerin, hemant.agrawal, jianfeng.tan, Kokkilagadda, Kiran

On 27-Sep-18 11:49 AM, Kiran Kumar wrote:
> With current KNI implementation kernel module will work only in
> IOVA=PA mode. This patch will add support for kernel module to work
> with IOVA=VA mode.
> 
> The idea is to maintain a mapping in KNI module between user pages and
> kernel pages and in fast path perform a lookup in this table and get
> the kernel virtual address for corresponding user virtual address.
> 
> In IOVA=VA mode, the memory allocated to the pool is physically
> and virtually contiguous. We will take advantage of this and create a
> mapping in the kernel.In kernel we need mapping for queues
> (tx_q, rx_q,... slow path) and mbuf memory (fast path).
> 
> At the KNI init time, in slow path we will create a mapping for the
> queues and mbuf using get_user_pages similar to af_xdp. Using pool
> memory base address, we will create a page map table for the mbuf,
> which we will use in the fast path for kernel page translation.
> 
> At KNI init time, we will pass the base address of the pool and size of
> the pool to kernel. In kernel, using get_user_pages API, we will get
> the pages with size PAGE_SIZE and store the mapping and start address
> of user space in a table.
> 
> In fast path for any user address perform PAGE_SHIFT
> (user_addr >> PAGE_SHIFT) and subtract the start address from this value,
> we will get the index of the kernel page with in the page map table.
> Adding offset to this kernel page address, we will get the kernel address
> for this user virtual address.
> 
> For example user pool base address is X, and size is S that we passed to
> kernel. In kernel we will create a mapping for this using get_user_pages.
> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2) ....]
> and user start page will be U (we will get it from X >> PAGE_SHIFT).
> 
> For any user address Z we will get the index of the page map table using
> ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to this
> address will give kernel virtual address.
> 
> Signed-off-by: Kiran Kumar <kkokkilagadda@caviumnetworks.com>
> ---

<snip>

> +
> +	/* IOVA mode. 1 = VA, 0 = PA */
> +	uint8_t iova_mode;

Wouldn't it be easier to understand if the same values (e.g. iova_mode 
== RTE_IOVA_VA) were used here as they are in DPDK?

-- 
Thanks,
Anatoly


* Re: [PATCH] kni: add IOVA va support for kni
  2018-09-27 10:49 [PATCH] kni: add IOVA va support for kni Kiran Kumar
  2018-09-27 10:58 ` Burakov, Anatoly
@ 2018-10-02 17:05 ` Ferruh Yigit
  2019-04-01 17:30   ` Jerin Jacob Kollanukkaran
  2019-04-01  9:51 ` [PATCH v2] " Kiran Kumar Kokkilagadda
  2 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2018-10-02 17:05 UTC (permalink / raw)
  To: Kiran Kumar
  Cc: dev, anatoly.burakov, Jacob, Jerin, hemant.agrawal, jianfeng.tan,
	Kokkilagadda, Kiran

On 9/27/2018 11:49 AM, Kiran Kumar wrote:
> With current KNI implementation kernel module will work only in
> IOVA=PA mode. This patch will add support for kernel module to work
> with IOVA=VA mode.
> 
> The idea is to maintain a mapping in KNI module between user pages and
> kernel pages and in fast path perform a lookup in this table and get
> the kernel virtual address for corresponding user virtual address.
> 
> In IOVA=VA mode, the memory allocated to the pool is physically
> and virtually contiguous. We will take advantage of this and create a
> mapping in the kernel.In kernel we need mapping for queues
> (tx_q, rx_q,... slow path) and mbuf memory (fast path).
> 
> At the KNI init time, in slow path we will create a mapping for the
> queues and mbuf using get_user_pages similar to af_xdp. Using pool
> memory base address, we will create a page map table for the mbuf,
> which we will use in the fast path for kernel page translation.
> 
> At KNI init time, we will pass the base address of the pool and size of
> the pool to kernel. In kernel, using get_user_pages API, we will get
> the pages with size PAGE_SIZE and store the mapping and start address
> of user space in a table.
> 
> In fast path for any user address perform PAGE_SHIFT
> (user_addr >> PAGE_SHIFT) and subtract the start address from this value,
> we will get the index of the kernel page with in the page map table.
> Adding offset to this kernel page address, we will get the kernel address
> for this user virtual address.
> 
> For example user pool base address is X, and size is S that we passed to
> kernel. In kernel we will create a mapping for this using get_user_pages.
> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2) ....]
> and user start page will be U (we will get it from X >> PAGE_SHIFT).
> 
> For any user address Z we will get the index of the page map table using
> ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to this
> address will give kernel virtual address.
> 
> Signed-off-by: Kiran Kumar <kkokkilagadda@caviumnetworks.com>

Hi Kiran,

Thanks for the patch but it was later for this release, missed proposal deadline
etc.. Taking the time remaining for integration deadline, this patch will be
considered for next release.

Thanks,
ferruh


* [PATCH v2] kni: add IOVA va support for kni
  2018-09-27 10:49 [PATCH] kni: add IOVA va support for kni Kiran Kumar
  2018-09-27 10:58 ` Burakov, Anatoly
  2018-10-02 17:05 ` Ferruh Yigit
@ 2019-04-01  9:51 ` Kiran Kumar Kokkilagadda
  2019-04-03 16:29   ` Ferruh Yigit
  2019-04-16  4:55   ` [dpdk-dev] [PATCH v3] " kirankumark
  2 siblings, 2 replies; 251+ messages in thread
From: Kiran Kumar Kokkilagadda @ 2019-04-01  9:51 UTC (permalink / raw)
  To: ferruh.yigit; +Cc: dev, Kiran Kumar Kokkilagadda

From: Kiran Kumar K <kirankumark@marvell.com>

With the current KNI implementation, the kernel module works only in
IOVA=PA mode. This patch adds support for the kernel module to work
in IOVA=VA mode.

The idea is to maintain a mapping in the KNI module between user pages and
kernel pages, and in the fast path perform a lookup in this table to get
the kernel virtual address for the corresponding user virtual address.

In IOVA=VA mode, the memory allocated to the pool is physically
and virtually contiguous. We take advantage of this and create a
mapping in the kernel. In the kernel we need a mapping for the queues
(tx_q, rx_q, ... slow path) and for the mbuf memory (fast path).

At KNI init time, in the slow path, we create a mapping for the
queues and mbufs using get_user_pages, similar to af_xdp. Using the pool
memory base address, we create a page map table for the mbufs, which
we use in the fast path for kernel page translation.

At KNI init time, we pass the base address of the pool and the size of
the pool to the kernel. In the kernel, using the get_user_pages API, we get
the pages with size PAGE_SIZE and store the mapping and the start address
of the user space region in a table.

In the fast path, for any user address, shift right by PAGE_SHIFT
(user_addr >> PAGE_SHIFT) and subtract the start page from this value;
the result is the index of the kernel page within the page map table.
Adding the in-page offset to this kernel page address gives the kernel
address for the user virtual address.

For example, the user pool base address is X and the size is S, which we
pass to the kernel. In the kernel we create a mapping for this using
get_user_pages. Our page map table will look like
[Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2), ...] and the user start page will be U
(obtained from X >> PAGE_SHIFT).

For any user address Z we get the index into the page map table using
((Z >> PAGE_SHIFT) - U). Adding the offset (Z & (PAGE_SIZE - 1)) to the
page address at that index gives the kernel virtual address.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---

V2 changes:
* Fixed build issue with older kernel

 kernel/linux/kni/kni_dev.h                    |  37 +++
 kernel/linux/kni/kni_misc.c                   | 215 +++++++++++++++++-
 kernel/linux/kni/kni_net.c                    | 114 ++++++++--
 .../eal/include/exec-env/rte_kni_common.h     |   8 +
 lib/librte_kni/rte_kni.c                      |  21 ++
 5 files changed, 369 insertions(+), 26 deletions(-)

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index 688f574a4..055b8d59e 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -32,10 +32,47 @@
 /* Default carrier state for created KNI network interfaces */
 extern uint32_t dflt_carrier;

+struct iova_page_info {
+	/* User to kernel page table map, used for
+	 * fast path lookup
+	 */
+	struct mbuf_page {
+		void *addr;
+	} *page_map;
+
+	/* Page mask */
+	u64 page_mask;
+
+	/* Start page for user address */
+	u64 start_page;
+
+	struct page_info {
+		/* Physical pages returned by get_user_pages */
+		struct page **pgs;
+
+		/* Number of Pages returned by get_user_pages */
+		u32 npgs;
+	} page_info;
+
+	/* Queue info */
+	struct page_info tx_q;
+	struct page_info rx_q;
+	struct page_info alloc_q;
+	struct page_info free_q;
+	struct page_info req_q;
+	struct page_info resp_q;
+	struct page_info sync_va;
+};
+
 /**
  * A structure describing the private information for a kni device.
  */
 struct kni_dev {
+	/* Page info for IOVA=VA mode */
+	struct iova_page_info va_info;
+	/* IOVA mode 0 = PA, 1 = VA */
+	uint8_t iova_mode;
+
 	/* kni list */
 	struct list_head list;

diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 04c78eb87..0be0e1dfd 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -201,6 +201,122 @@ kni_dev_remove(struct kni_dev *dev)
 	return 0;
 }

+static void
+kni_unpin_pages(struct page_info *mem)
+{
+	u32 i;
+
+	/* Set the user pages as dirty, so that these pages will not be
+	 * allocated to other applications until we release them.
+	 */
+	for (i = 0; i < mem->npgs; i++) {
+		struct page *page = mem->pgs[i];
+
+		set_page_dirty_lock(page);
+		put_page(page);
+	}
+
+	kfree(mem->pgs);
+	mem->pgs = NULL;
+}
+
+static void
+kni_clean_queue(struct page_info *mem)
+{
+	if (mem->pgs) {
+		set_page_dirty_lock(mem->pgs[0]);
+		put_page(mem->pgs[0]);
+		kfree(mem->pgs);
+		mem->pgs = NULL;
+	}
+}
+
+static void
+kni_cleanup_iova(struct iova_page_info *mem)
+{
+	kni_unpin_pages(&mem->page_info);
+	kfree(mem->page_map);
+	mem->page_map = NULL;
+
+	kni_clean_queue(&mem->tx_q);
+	kni_clean_queue(&mem->rx_q);
+	kni_clean_queue(&mem->alloc_q);
+	kni_clean_queue(&mem->free_q);
+	kni_clean_queue(&mem->req_q);
+	kni_clean_queue(&mem->resp_q);
+	kni_clean_queue(&mem->sync_va);
+}
+
+int
+kni_pin_pages(void *address, size_t size, struct page_info *mem)
+{
+	unsigned int gup_flags = FOLL_WRITE;
+	long npgs;
+	int err;
+
+	/* Get at least one page */
+	if (size < PAGE_SIZE)
+		size = PAGE_SIZE;
+
+	/* Compute number of user pages based on page size */
+	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	/* Allocate memory for the pages */
+	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
+		      GFP_KERNEL | __GFP_NOWARN);
+	if (!mem->pgs) {
+		pr_err("%s: -ENOMEM\n", __func__);
+		return -ENOMEM;
+	}
+
+	down_write(&current->mm->mmap_sem);
+
+	/* Get the user pages from the user address*/
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
+	npgs = get_user_pages((u64)address, mem->npgs,
+				gup_flags, &mem->pgs[0], NULL);
+#else
+	npgs = get_user_pages(current, current->mm, (u64)address, mem->npgs,
+				gup_flags, 0, &mem->pgs[0], NULL);
+#endif
+	up_write(&current->mm->mmap_sem);
+
+	/* We didn't get all the requested pages, throw error */
+	if (npgs != mem->npgs) {
+		if (npgs >= 0) {
+			mem->npgs = npgs;
+			err = -ENOMEM;
+			pr_err("%s: -ENOMEM\n", __func__);
+			goto out_pin;
+		}
+		err = npgs;
+		goto out_pgs;
+	}
+	return 0;
+
+out_pin:
+	kni_unpin_pages(mem);
+out_pgs:
+	kfree(mem->pgs);
+	mem->pgs = NULL;
+	return err;
+}
+
+static void*
+kni_map_queue(struct kni_dev *kni, u64 addr,
+			   struct page_info *mm)
+{
+	/* Map at least 1 page */
+	if (kni_pin_pages((void *)addr, PAGE_SIZE,
+			  mm) != 0) {
+		pr_err("Unable to pin pages\n");
+		return NULL;
+	}
+
+	return (page_address(mm->pgs[0]) +
+			   (addr & kni->va_info.page_mask));
+}
+
 static int
 kni_release(struct inode *inode, struct file *file)
 {
@@ -228,6 +344,11 @@ kni_release(struct inode *inode, struct file *file)
 		}

 		kni_dev_remove(dev);
+
+		/* IOVA=VA mode, unpin pages */
+		if (likely(dev->iova_mode == 1))
+			kni_cleanup_iova(&dev->va_info);
+
 		list_del(&dev->list);
 	}
 	up_write(&knet->kni_list_lock);
@@ -368,16 +489,91 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);

 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
+	kni->iova_mode = dev_info.iova_mode;

-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (kni->iova_mode) {
+		u64 mbuf_addr;
+		int i;
+
+		/* map userspace memory info */
+		mbuf_addr = (u64)dev_info.mbuf_va;

+		/* Pre compute page mask, used in fast path */
+		kni->va_info.page_mask = (u64)(PAGE_SIZE - 1);
+
+		/* Store the start page address; this is the reference
+		 * for all the user virtual addresses
+		 */
+		kni->va_info.start_page = (mbuf_addr >> PAGE_SHIFT);
+
+		/* Get and pin the user pages */
+		if (kni_pin_pages(dev_info.mbuf_va, dev_info.mbuf_pool_size,
+			      &kni->va_info.page_info) != 0) {
+			pr_err("Unable to pin pages\n");
+			return -1;
+		}
+
+		/* Page map table between user and kernel pages */
+		kni->va_info.page_map = kcalloc(kni->va_info.page_info.npgs,
+						   sizeof(struct mbuf_page),
+						   GFP_KERNEL);
+		if (kni->va_info.page_map == NULL) {
+			pr_err("Out of memory\n");
+			return -ENOMEM;
+		}
+
+		/* Convert the user pages to kernel pages */
+		for (i = 0; i < kni->va_info.page_info.npgs; i++) {
+			kni->va_info.page_map[i].addr =
+				page_address(kni->va_info.page_info.pgs[i]);
+		}
+
+		/* map queues */
+		kni->tx_q = kni_map_queue(kni, dev_info.tx_phys,
+					  &kni->va_info.tx_q);
+		if (kni->tx_q == NULL)
+			goto iova_err;
+
+		kni->rx_q = kni_map_queue(kni, dev_info.rx_phys,
+					  &kni->va_info.rx_q);
+		if (kni->rx_q == NULL)
+			goto iova_err;
+
+		kni->alloc_q = kni_map_queue(kni, dev_info.alloc_phys,
+					     &kni->va_info.alloc_q);
+		if (kni->alloc_q == NULL)
+			goto iova_err;
+
+		kni->free_q = kni_map_queue(kni, dev_info.free_phys,
+					    &kni->va_info.free_q);
+		if (kni->free_q == NULL)
+			goto iova_err;
+
+		kni->req_q = kni_map_queue(kni, dev_info.req_phys,
+					   &kni->va_info.req_q);
+		if (kni->req_q == NULL)
+			goto iova_err;
+
+		kni->resp_q = kni_map_queue(kni, dev_info.resp_phys,
+					    &kni->va_info.resp_q);
+		if (kni->resp_q == NULL)
+			goto iova_err;
+
+		kni->sync_kva = kni_map_queue(kni, dev_info.sync_phys,
+					      &kni->va_info.sync_va);
+		if (kni->sync_kva == NULL)
+			goto iova_err;
+	} else {
+		/* Address translation for IOVA=PA mode */
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	}
+	kni->sync_va = dev_info.sync_va;
 	kni->mbuf_size = dev_info.mbuf_size;

 	pr_debug("tx_phys:      0x%016llx, tx_q addr:      0x%p\n",
@@ -484,6 +680,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	up_write(&knet->kni_list_lock);

 	return 0;
+iova_err:
+	kni_cleanup_iova(&kni->va_info);
+	return -1;
 }

 static int
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index 7371b6d58..83fbcf6f1 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,25 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;

+
+/* Get the kernel address from the user address using
+ * page map table. Will be used only in IOVA=VA mode
+ */
+static inline void*
+get_kva(uint64_t usr_addr, struct kni_dev *kni)
+{
+	uint32_t index;
+	/* User page - start user page will give the index
+	 * with in the page map table
+	 */
+	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
+
+	/* Add the offset to the page address */
+	return (kni->va_info.page_map[index].addr +
+		(usr_addr & kni->va_info.page_mask));
+
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -186,7 +205,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;

 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = get_kva((u64)(kni->pa[i]), kni);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}

@@ -263,8 +285,16 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;

-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = get_kva((u64)pkt_pa, kni);
+			data_kva = (uint8_t *)pkt_kva +
+				(sizeof(struct rte_kni_mbuf) +
+				 pkt_kva->data_off);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);

 		len = skb->len;
@@ -333,11 +363,18 @@ kni_net_rx_normal(struct kni_dev *kni)
 	if (num_rx == 0)
 		return;

+
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = get_kva((u64)kni->pa[i], kni);
+			data_kva = (uint8_t *)kva +
+				(sizeof(struct rte_kni_mbuf) + kva->data_off);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -363,8 +400,17 @@ kni_net_rx_normal(struct kni_dev *kni)
 				if (!kva->next)
 					break;

-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = get_kva(
+						(u64)va2pa(kva->next, kva),
+						kni);
+					data_kva = (uint8_t *)kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 kva->data_off);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}

@@ -434,14 +480,31 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
-			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
-			kni->va[i] = pa2va(kni->pa[i], kva);
+			if (likely(kni->iova_mode == 1)) {
+				kva = get_kva((u64)(kni->pa[i]), kni);
+				len = kva->pkt_len;
+				data_kva = (uint8_t *)kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 kva->data_off);
+				kni->va[i] = pa2va(kni->pa[i], kva);
+				alloc_kva = get_kva((u64)(kni->alloc_pa[i]),
+						    kni);
+				alloc_data_kva = (uint8_t *)alloc_kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 alloc_kva->data_off);
+				kni->alloc_va[i] = pa2va(kni->alloc_pa[i],
+							 alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				len = kva->pkt_len;
+				data_kva = kva2data_kva(kva);
+				kni->va[i] = pa2va(kni->pa[i], kva);

-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
-			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+				kni->alloc_va[i] = pa2va(kni->alloc_pa[i],
+							 alloc_kva);
+			}

 			memcpy(alloc_data_kva, data_kva, len);
 			alloc_kva->pkt_len = len;
@@ -507,9 +570,15 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)

 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = get_kva((u64)(kni->pa[i]), kni);
+			data_kva = (uint8_t *)kva +
+				(sizeof(struct rte_kni_mbuf) + kva->data_off);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -545,8 +614,17 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;

-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = get_kva(
+						(u64)(va2pa(kva->next, kva)),
+						kni);
+					data_kva = (uint8_t *)kva +
+					(sizeof(struct rte_kni_mbuf) +
+					 kva->data_off);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}

diff --git a/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h
index 5afa08713..897dd956f 100644
--- a/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h
@@ -128,6 +128,14 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	char mac_addr[6];
+
+	/* IOVA mode. 1 = VA, 0 = PA */
+	uint8_t iova_mode;
+
+	/* Pool size, will be used in kernel to map the
+	 * user pages
+	 */
+	uint64_t mbuf_pool_size;
 };

 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 492e207a3..3bf19faa0 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -304,6 +304,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;

+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+	if (dev_info.iova_mode) {
+		struct rte_mempool_memhdr *hdr;
+		uint64_t pool_size = 0;
+
+		/* In each pool header chunk, we will maintain the
+		 * base address of the pool. This chunk is physically and
+		 * virtually contiguous.
+		 * This approach will work, only if the allocated pool
+		 * memory is contiguous, else it won't work
+		 */
+		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
+		dev_info.mbuf_va = (void *)(hdr->addr);
+
+		/* Traverse the list and get the total size of the pool */
+		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
+			pool_size += hdr->len;
+		}
+		dev_info.mbuf_pool_size = pool_size +
+			pktmbuf_pool->mz->len;
+	}
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
--
2.17.1


* Re: [PATCH] kni: add IOVA va support for kni
  2018-10-02 17:05 ` Ferruh Yigit
@ 2019-04-01 17:30   ` Jerin Jacob Kollanukkaran
  2019-04-01 18:20     ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-04-01 17:30 UTC (permalink / raw)
  To: kkokkilagadda, ferruh.yigit
  Cc: Kiran.Kokkilagadda, Jerin.JacobKollanukkaran, dev,
	anatoly.burakov, hemant.agrawal, jianfeng.tan

On Tue, 2018-10-02 at 18:05 +0100, Ferruh Yigit wrote:
> On 9/27/2018 11:49 AM, Kiran Kumar wrote:
> > With current KNI implementation kernel module will work only in
> > IOVA=PA mode. This patch will add support for kernel module to work
> > with IOVA=VA mode.
> > 
> > The idea is to maintain a mapping in KNI module between user pages
> > and
> > kernel pages and in fast path perform a lookup in this table and
> > get
> > the kernel virtual address for corresponding user virtual address.
> > 
> > In IOVA=VA mode, the memory allocated to the pool is physically
> > and virtually contiguous. We will take advantage of this and create
> > a
> > mapping in the kernel.In kernel we need mapping for queues
> > (tx_q, rx_q,... slow path) and mbuf memory (fast path).
> > 
> > At the KNI init time, in slow path we will create a mapping for the
> > queues and mbuf using get_user_pages similar to af_xdp. Using pool
> > memory base address, we will create a page map table for the mbuf,
> > which we will use in the fast path for kernel page translation.
> > 
> > At KNI init time, we will pass the base address of the pool and
> > size of
> > the pool to kernel. In kernel, using get_user_pages API, we will
> > get
> > the pages with size PAGE_SIZE and store the mapping and start
> > address
> > of user space in a table.
> > 
> > In fast path for any user address perform PAGE_SHIFT
> > (user_addr >> PAGE_SHIFT) and subtract the start address from this
> > value,
> > we will get the index of the kernel page with in the page map
> > table.
> > Adding offset to this kernel page address, we will get the kernel
> > address
> > for this user virtual address.
> > 
> > For example user pool base address is X, and size is S that we
> > passed to
> > kernel. In kernel we will create a mapping for this using
> > get_user_pages.
> > Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2)
> > ....]
> > and user start page will be U (we will get it from X >>
> > PAGE_SHIFT).
> > 
> > For any user address Z we will get the index of the page map table
> > using
> > ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to
> > this
> > address will give kernel virtual address.
> > 
> > Signed-off-by: Kiran Kumar <kkokkilagadda@caviumnetworks.com>
> 
> Hi Kiran,
> 
> Thanks for the patch but it was later for this release, missed
> proposal deadline
> etc.. Taking the time remaining for integration deadline, this patch
> will be
> considered for next release.

Hi Ferruh,

This patch was submitted in Oct last year. Could you please review it?
This feature was added based on a community request to support IOVA for
KNI, as the initial IOVA support doesn't cover KNI.



> 
> Thanks,
> ferruh
> 


* Re: [PATCH] kni: add IOVA va support for kni
  2019-04-01 17:30   ` Jerin Jacob Kollanukkaran
@ 2019-04-01 18:20     ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-04-01 18:20 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, kkokkilagadda
  Cc: Kiran.Kokkilagadda, Jerin.JacobKollanukkaran, dev,
	anatoly.burakov, hemant.agrawal, jianfeng.tan

On 4/1/2019 6:30 PM, Jerin Jacob Kollanukkaran wrote:
> On Tue, 2018-10-02 at 18:05 +0100, Ferruh Yigit wrote:
>> On 9/27/2018 11:49 AM, Kiran Kumar wrote:
>>> With current KNI implementation kernel module will work only in
>>> IOVA=PA mode. This patch will add support for kernel module to work
>>> with IOVA=VA mode.
>>>
>>> The idea is to maintain a mapping in KNI module between user pages
>>> and
>>> kernel pages and in fast path perform a lookup in this table and
>>> get
>>> the kernel virtual address for corresponding user virtual address.
>>>
>>> In IOVA=VA mode, the memory allocated to the pool is physically
>>> and virtually contiguous. We will take advantage of this and create
>>> a
>>> mapping in the kernel.In kernel we need mapping for queues
>>> (tx_q, rx_q,... slow path) and mbuf memory (fast path).
>>>
>>> At the KNI init time, in slow path we will create a mapping for the
>>> queues and mbuf using get_user_pages similar to af_xdp. Using pool
>>> memory base address, we will create a page map table for the mbuf,
>>> which we will use in the fast path for kernel page translation.
>>>
>>> At KNI init time, we will pass the base address of the pool and
>>> size of
>>> the pool to kernel. In kernel, using get_user_pages API, we will
>>> get
>>> the pages with size PAGE_SIZE and store the mapping and start
>>> address
>>> of user space in a table.
>>>
>>> In fast path for any user address perform PAGE_SHIFT
>>> (user_addr >> PAGE_SHIFT) and subtract the start address from this
>>> value,
>>> we will get the index of the kernel page with in the page map
>>> table.
>>> Adding offset to this kernel page address, we will get the kernel
>>> address
>>> for this user virtual address.
>>>
>>> For example user pool base address is X, and size is S that we
>>> passed to
>>> kernel. In kernel we will create a mapping for this using
>>> get_user_pages.
>>> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2)
>>> ....]
>>> and user start page will be U (we will get it from X >>
>>> PAGE_SHIFT).
>>>
>>> For any user address Z we will get the index of the page map table
>>> using
>>> ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to
>>> this
>>> address will give kernel virtual address.
>>>
>>> Signed-off-by: Kiran Kumar <kkokkilagadda@caviumnetworks.com>
>>
>> Hi Kiran,
>>
>> Thanks for the patch but it was later for this release, missed
>> proposal deadline
>> etc.. Taking the time remaining for integration deadline, this patch
>> will be
>> considered for next release.
> 
> Hi Ferruh,
> 
> This patch submitted in Oct last year. Could you please review this?
> This feature was added based on community request to support IOVA for
> KNI as Initial IOVA support doesn;t have KNI support.
> 

Hi Jerin,

I will check it, I remember the request, thanks for the patch.


* Re: [PATCH v2] kni: add IOVA va support for kni
  2019-04-01  9:51 ` [PATCH v2] " Kiran Kumar Kokkilagadda
@ 2019-04-03 16:29   ` Ferruh Yigit
  2019-04-04  5:03     ` [dpdk-dev] [EXT] " Kiran Kumar Kokkilagadda
  2019-04-04  9:57     ` Burakov, Anatoly
  2019-04-16  4:55   ` [dpdk-dev] [PATCH v3] " kirankumark
  1 sibling, 2 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-04-03 16:29 UTC (permalink / raw)
  To: Kiran Kumar Kokkilagadda; +Cc: dev, Jerin Jacob

On 4/1/2019 10:51 AM, Kiran Kumar Kokkilagadda wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> With current KNI implementation kernel module will work only in
> IOVA=PA mode. This patch will add support for kernel module to work
> with IOVA=VA mode.

Thanks Kiran for removing the limitation. I have a few questions, can you
please help me understand?

And when this patch is ready, the restriction in 'linux/eal/eal.c', in
'rte_eal_init', should be removed, perhaps with this patch. I assume you are
already doing it to be able to test this patch.

> 
> The idea is to maintain a mapping in KNI module between user pages and
> kernel pages and in fast path perform a lookup in this table and get
> the kernel virtual address for corresponding user virtual address.
> 
> In IOVA=VA mode, the memory allocated to the pool is physically
> and virtually contiguous. We will take advantage of this and create a
> mapping in the kernel.In kernel we need mapping for queues
> (tx_q, rx_q,... slow path) and mbuf memory (fast path).

Is it?
As far as I know a mempool can have multiple chunks, and they can be both
virtually and physically separated.

And even for a single chunk, it will be virtually contiguous, but will it be
physically contiguous?

> 
> At the KNI init time, in slow path we will create a mapping for the
> queues and mbuf using get_user_pages similar to af_xdp. Using pool
> memory base address, we will create a page map table for the mbuf,
> which we will use in the fast path for kernel page translation.
> 
> At KNI init time, we will pass the base address of the pool and size of
> the pool to kernel. In kernel, using get_user_pages API, we will get
> the pages with size PAGE_SIZE and store the mapping and start address
> of user space in a table.
> 
> In fast path for any user address perform PAGE_SHIFT
> (user_addr >> PAGE_SHIFT) and subtract the start address from this value,
> we will get the index of the kernel page with in the page map table.
> Adding offset to this kernel page address, we will get the kernel address
> for this user virtual address.
> 
> For example user pool base address is X, and size is S that we passed to
> kernel. In kernel we will create a mapping for this using get_user_pages.
> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2) ....]
> and user start page will be U (we will get it from X >> PAGE_SHIFT).
> 
> For any user address Z we will get the index of the page map table using
> ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to this
> address will give kernel virtual address.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>

<...>

> +int
> +kni_pin_pages(void *address, size_t size, struct page_info *mem)
> +{
> +	unsigned int gup_flags = FOLL_WRITE;
> +	long npgs;
> +	int err;
> +
> +	/* Get at least one page */
> +	if (size < PAGE_SIZE)
> +		size = PAGE_SIZE;
> +
> +	/* Compute number of user pages based on page size */
> +	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
> +
> +	/* Allocate memory for the pages */
> +	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
> +		      GFP_KERNEL | __GFP_NOWARN);
> +	if (!mem->pgs) {
> +		pr_err("%s: -ENOMEM\n", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	down_write(&current->mm->mmap_sem);
> +
> +	/* Get the user pages from the user address*/
> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
> +	npgs = get_user_pages((u64)address, mem->npgs,
> +				gup_flags, &mem->pgs[0], NULL);
> +#else
> +	npgs = get_user_pages(current, current->mm, (u64)address, mem->npgs,
> +				gup_flags, 0, &mem->pgs[0], NULL);
> +#endif
> +	up_write(&current->mm->mmap_sem);

This should work even if the memory is not physically contiguous, right? Where
exactly is the physically contiguous requirement coming from?

<...>

> +
> +/* Get the kernel address from the user address using
> + * page map table. Will be used only in IOVA=VA mode
> + */
> +static inline void*
> +get_kva(uint64_t usr_addr, struct kni_dev *kni)
> +{
> +	uint32_t index;
> +	/* User page - start user page will give the index
> +	 * with in the page map table
> +	 */
> +	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
> +
> +	/* Add the offset to the page address */
> +	return (kni->va_info.page_map[index].addr +
> +		(usr_addr & kni->va_info.page_mask));
> +
> +}
> +
>  /* physical address to kernel virtual address */
>  static void *
>  pa2kva(void *pa)
> @@ -186,7 +205,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>  			return;
> 
>  		for (i = 0; i < num_rx; i++) {
> -			kva = pa2kva(kni->pa[i]);
> +			if (likely(kni->iova_mode == 1))
> +				kva = get_kva((u64)(kni->pa[i]), kni);

kni->pa[] now has IOVA addresses; for 'get_kva()' to work, shouldn't
'va_info.start_page' be calculated from 'mempool_memhdr->iova' instead of
'mempool_memhdr->addr'?

If this is working I must be missing something, but I am not able to find what
it is.

<...>

> @@ -304,6 +304,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  	kni->group_id = conf->group_id;
>  	kni->mbuf_size = conf->mbuf_size;
> 
> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> +	if (dev_info.iova_mode) {
> +		struct rte_mempool_memhdr *hdr;
> +		uint64_t pool_size = 0;
> +
> +		/* In each pool header chunk, we will maintain the
> +		 * base address of the pool. This chunk is physically and
> +		 * virtually contiguous.
> +		 * This approach will work, only if the allocated pool
> +		 * memory is contiguous, else it won't work
> +		 */
> +		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
> +		dev_info.mbuf_va = (void *)(hdr->addr);
> +
> +		/* Traverse the list and get the total size of the pool */
> +		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
> +			pool_size += hdr->len;
> +		}

This code is aware that there may be multiple chunks, but it assumes they are
all contiguous; I don't know if this assumption is correct.

Also I guess there is another assumption that there will be a single
pktmbuf_pool in the application, which is passed into KNI?
What if there are multiple pktmbuf_pools, like one for each PMD, will this work?
Then some mbufs in the KNI Rx fifo will come from a different pktmbuf_pool whose
pages we don't know, so we won't be able to get their kernel virtual address.
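
As a minimal sketch of such a check (hypothetical helper, not part of the
patch; it walks the same mem_list used in rte_kni_alloc() and assumes the
chunks appear in the list in address order, as the patch itself does):

	/* Return 1 if all chunks of the mempool are virtually contiguous,
	 * 0 otherwise. Needs rte_mempool.h for struct rte_mempool and
	 * struct rte_mempool_memhdr.
	 */
	static int
	kni_pool_is_virt_contig(struct rte_mempool *mp)
	{
		struct rte_mempool_memhdr *hdr;
		char *expected = NULL;

		STAILQ_FOREACH(hdr, &mp->mem_list, next) {
			if (expected != NULL && (char *)hdr->addr != expected)
				return 0;
			expected = (char *)hdr->addr + hdr->len;
		}
		return 1;
	}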


* Re: [dpdk-dev] [EXT] Re: [PATCH v2] kni: add IOVA va support for kni
  2019-04-03 16:29   ` Ferruh Yigit
@ 2019-04-04  5:03     ` Kiran Kumar Kokkilagadda
  2019-04-04 11:20       ` Ferruh Yigit
  2019-04-04  9:57     ` Burakov, Anatoly
  1 sibling, 1 reply; 251+ messages in thread
From: Kiran Kumar Kokkilagadda @ 2019-04-04  5:03 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Jerin Jacob



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Wednesday, April 3, 2019 9:59 PM
> To: Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
> Cc: dev@dpdk.org; Jerin Jacob <jerin.jacob@caviumnetworks.com>
> Subject: [EXT] Re: [dpdk-dev] [PATCH v2] kni: add IOVA va support for kni
> 
> External Email
> 
> ----------------------------------------------------------------------
> On 4/1/2019 10:51 AM, Kiran Kumar Kokkilagadda wrote:
> > From: Kiran Kumar K <kirankumark@marvell.com>
> >
> > With current KNI implementation kernel module will work only in
> > IOVA=PA mode. This patch will add support for kernel module to work
> > with IOVA=VA mode.
> 
> Thanks Kiran for removing the limitation, I have a few questions, can you please
> help me understand.
> 
> And when this patch is ready, the restriction in 'linux/eal/eal.c', in 'rte_eal_init'
> should be removed, perhaps with this patch. I assume you already doing it to be
> able to test this patch.
> 

User can choose the mode by passing --iova-mode=<va/pa>. I will remove the rte_kni module restriction.
 

> >
> > The idea is to maintain a mapping in KNI module between user pages and
> > kernel pages and in fast path perform a lookup in this table and get
> > the kernel virtual address for corresponding user virtual address.
> >
> > In IOVA=VA mode, the memory allocated to the pool is physically and
> > virtually contiguous. We will take advantage of this and create a
> > mapping in the kernel.In kernel we need mapping for queues (tx_q,
> > rx_q,... slow path) and mbuf memory (fast path).
> 
> Is it?
> As far as I know mempool can have multiple chunks and they can be both
> virtually and physically separated.
> 
> And even for a single chunk, that will be virtually continuous, but will it be
> physically continuous?
> 

You are right, it need not be physically contiguous. I will change the description.
 

> >
> > At the KNI init time, in slow path we will create a mapping for the
> > queues and mbuf using get_user_pages similar to af_xdp. Using pool
> > memory base address, we will create a page map table for the mbuf,
> > which we will use in the fast path for kernel page translation.
> >
> > At KNI init time, we will pass the base address of the pool and size
> > of the pool to kernel. In kernel, using get_user_pages API, we will
> > get the pages with size PAGE_SIZE and store the mapping and start
> > address of user space in a table.
> >
> > In fast path for any user address perform PAGE_SHIFT (user_addr >>
> > PAGE_SHIFT) and subtract the start address from this value, we will
> > get the index of the kernel page with in the page map table.
> > Adding offset to this kernel page address, we will get the kernel
> > address for this user virtual address.
> >
> > For example user pool base address is X, and size is S that we passed
> > to kernel. In kernel we will create a mapping for this using get_user_pages.
> > Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2)
> > ....] and user start page will be U (we will get it from X >> PAGE_SHIFT).
> >
> > For any user address Z we will get the index of the page map table
> > using ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to
> > this address will give kernel virtual address.
> >
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> 
> <...>
> 
> > +int
> > +kni_pin_pages(void *address, size_t size, struct page_info *mem) {
> > +	unsigned int gup_flags = FOLL_WRITE;
> > +	long npgs;
> > +	int err;
> > +
> > +	/* Get at least one page */
> > +	if (size < PAGE_SIZE)
> > +		size = PAGE_SIZE;
> > +
> > +	/* Compute number of user pages based on page size */
> > +	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
> > +
> > +	/* Allocate memory for the pages */
> > +	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
> > +		      GFP_KERNEL | __GFP_NOWARN);
> > +	if (!mem->pgs) {
> > +		pr_err("%s: -ENOMEM\n", __func__);
> > +		return -ENOMEM;
> > +	}
> > +
> > +	down_write(&current->mm->mmap_sem);
> > +
> > +	/* Get the user pages from the user address*/ #if
> LINUX_VERSION_CODE
> > +>= KERNEL_VERSION(4,9,0)
> > +	npgs = get_user_pages((u64)address, mem->npgs,
> > +				gup_flags, &mem->pgs[0], NULL);
> > +#else
> > +	npgs = get_user_pages(current, current->mm, (u64)address, mem-
> >npgs,
> > +				gup_flags, 0, &mem->pgs[0], NULL); #endif
> > +	up_write(&current->mm->mmap_sem);
> 
> This should work even memory is physically not continuous, right? Where
> exactly physically continuous requirement is coming from?
> 

Yes, it does not need to be physically contiguous.

 


> <...>
> 
> > +
> > +/* Get the kernel address from the user address using
> > + * page map table. Will be used only in IOVA=VA mode  */ static
> > +inline void* get_kva(uint64_t usr_addr, struct kni_dev *kni) {
> > +	uint32_t index;
> > +	/* User page - start user page will give the index
> > +	 * with in the page map table
> > +	 */
> > +	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
> > +
> > +	/* Add the offset to the page address */
> > +	return (kni->va_info.page_map[index].addr +
> > +		(usr_addr & kni->va_info.page_mask));
> > +
> > +}
> > +
> >  /* physical address to kernel virtual address */  static void *
> > pa2kva(void *pa) @@ -186,7 +205,10 @@ kni_fifo_trans_pa2va(struct
> > kni_dev *kni,
> >  			return;
> >
> >  		for (i = 0; i < num_rx; i++) {
> > -			kva = pa2kva(kni->pa[i]);
> > +			if (likely(kni->iova_mode == 1))
> > +				kva = get_kva((u64)(kni->pa[i]), kni);
> 
> kni->pa[] now has iova addresses, for 'get_kva()' to work shouldn't
> 'va_info.start_page' calculated from 'mempool_memhdr->iova' instead of
> 'mempool_memhdr->addr'
> 
> If this is working I must be missing something but not able to find what it is.
> 
> <...>
> 
 
In IOVA=VA mode, both values will be the same, right?


> > @@ -304,6 +304,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
> >  	kni->group_id = conf->group_id;
> >  	kni->mbuf_size = conf->mbuf_size;
> >
> > +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> > +	if (dev_info.iova_mode) {
> > +		struct rte_mempool_memhdr *hdr;
> > +		uint64_t pool_size = 0;
> > +
> > +		/* In each pool header chunk, we will maintain the
> > +		 * base address of the pool. This chunk is physically and
> > +		 * virtually contiguous.
> > +		 * This approach will work, only if the allocated pool
> > +		 * memory is contiguous, else it won't work
> > +		 */
> > +		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
> > +		dev_info.mbuf_va = (void *)(hdr->addr);
> > +
> > +		/* Traverse the list and get the total size of the pool */
> > +		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
> > +			pool_size += hdr->len;
> > +		}
> 
> This code is aware that there may be multiple chunks, but assumes they are all
> continuous, I don't know if this assumption is correct.
> 
> Also I guess there is another assumption that there will be single pktmbuf_pool
> in the application which passed into kni?
> What if there are multiple pktmbuf_pool, like one for each PMD, will this work?
> Now some mbufs in kni Rx fifo will come from different pktmbuf_pool which we
> don't know their pages, so won't able to get their kernel virtual address.

All these chunks have to be virtually contiguous, otherwise this approach will not work. One thing we can do here is create the mapping for the complete huge page and make sure all the mbuf mempools fall within that range. As the huge page size is large (1 GB on x86_64, 512 MB on ARM), we are unlikely to exceed it even with multiple pktmbuf_pools. If an mbuf pool were created outside this huge page, this approach will not work and we will fall back to PA mode.
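
A rough sketch (not part of the patch; names are illustrative) of how such a
fall-back decision could be made on the library side, by checking whether the
mempool's chunks actually form one contiguous VA range:

#include <stdbool.h>
#include <stdint.h>
#include <rte_mempool.h>

/* Return true only if all chunks of the pool form one contiguous VA range. */
static bool
kni_pool_is_va_contig(const struct rte_mempool *mp)
{
	const struct rte_mempool_memhdr *hdr;
	uintptr_t lo = UINTPTR_MAX, hi = 0;
	size_t total = 0;

	STAILQ_FOREACH(hdr, &mp->mem_list, next) {
		uintptr_t start = (uintptr_t)hdr->addr;

		if (start < lo)
			lo = start;
		if (start + hdr->len > hi)
			hi = start + hdr->len;
		total += hdr->len;
	}

	/* Contiguous only if the covered span equals the summed chunk length;
	 * the caller would fall back to PA mode when this returns false.
	 */
	return total != 0 && (hi - lo) == total;
}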

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [PATCH v2] kni: add IOVA va support for kni
  2019-04-03 16:29   ` Ferruh Yigit
  2019-04-04  5:03     ` [dpdk-dev] [EXT] " Kiran Kumar Kokkilagadda
@ 2019-04-04  9:57     ` Burakov, Anatoly
  2019-04-04 11:21       ` Ferruh Yigit
  1 sibling, 1 reply; 251+ messages in thread
From: Burakov, Anatoly @ 2019-04-04  9:57 UTC (permalink / raw)
  To: Ferruh Yigit, Kiran Kumar Kokkilagadda; +Cc: dev, Jerin Jacob

On 03-Apr-19 5:29 PM, Ferruh Yigit wrote:
> On 4/1/2019 10:51 AM, Kiran Kumar Kokkilagadda wrote:
>> From: Kiran Kumar K <kirankumark@marvell.com>
>>
>> With current KNI implementation kernel module will work only in
>> IOVA=PA mode. This patch will add support for kernel module to work
>> with IOVA=VA mode.
> 
> Thanks Kiran for removing the limitation, I have a few questions, can you please
> help me understand.
> 
> And when this patch is ready, the restriction in 'linux/eal/eal.c', in
> 'rte_eal_init' should be removed, perhaps with this patch. I assume you already
> doing it to be able to test this patch.
> 
>>
>> The idea is to maintain a mapping in KNI module between user pages and
>> kernel pages and in fast path perform a lookup in this table and get
>> the kernel virtual address for corresponding user virtual address.
>>
>> In IOVA=VA mode, the memory allocated to the pool is physically
>> and virtually contiguous. We will take advantage of this and create a
>> mapping in the kernel.In kernel we need mapping for queues
>> (tx_q, rx_q,... slow path) and mbuf memory (fast path).
> 
> Is it?
> As far as I know mempool can have multiple chunks and they can be both virtually
> and physically separated.
> 
> And even for a single chunk, that will be virtually continuous, but will it be
> physically continuous?

Just to clarify.

Within DPDK, we do not make a distinction between physical address and 
IOVA address - we never need the actual physical address, we just need 
the DMA addresses, which can either match the physical address, or be 
completely arbitrary (in our case, they will match VA addresses, but it 
doesn't have to be the case - in fact, 17.11 will, under some 
circumstances, populate IOVA addresses simply starting from address 0).

However, one has to remember that IOVA address *is not a physical 
address*. The pages backing a VA chunk may be *IOVA*-contiguous, but may 
not necessarily be *physically* contiguous. Under normal circumstances 
we really don't care, because the VFIO/IOMMU takes care of the mapping 
between IOVA and PA transparently for the hardware.

So, in IOVA as VA mode, the memory allocated to the mempool will be 
(within a given chunk) both VA and IOVA contiguous - but not necessarily 
*physically* contiguous! In fact, if you try calling rte_mem_virt2phy() 
on the mempool pages, you'll likely find that they aren't (I've seen 
cases where pages were mapped /backwards/!).

Therefore, unless by "physically contiguous" you mean "IOVA-contiguous", 
you *cannot* rely on memory in mempool being *physically* contiguous 
merely based on the fact that it's IOVA-contiguous.
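
To see the distinction concretely, here is a small diagnostic sketch (not from
the patch; output layout is illustrative) that dumps the VA, the IOVA and the
looked-up physical address of each chunk backing a mempool:

#include <inttypes.h>
#include <stdio.h>
#include <rte_memory.h>
#include <rte_mempool.h>

/* Print VA, IOVA and PA for every memory chunk backing the given mempool. */
static void
dump_pool_addresses(const struct rte_mempool *mp)
{
	const struct rte_mempool_memhdr *hdr;
	unsigned int i = 0;

	STAILQ_FOREACH(hdr, &mp->mem_list, next) {
		/* The IOVA comes straight from the memhdr; the PA needs a
		 * lookup and may be RTE_BAD_PHYS_ADDR for unprivileged users.
		 */
		printf("chunk %u: va=%p iova=0x%" PRIx64 " pa=0x%" PRIx64
		       " len=%zu\n",
		       i++, hdr->addr, (uint64_t)hdr->iova,
		       (uint64_t)rte_mem_virt2phy(hdr->addr), hdr->len);
	}
}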

> 
>>
>> At the KNI init time, in slow path we will create a mapping for the
>> queues and mbuf using get_user_pages similar to af_xdp. Using pool
>> memory base address, we will create a page map table for the mbuf,
>> which we will use in the fast path for kernel page translation.
>>
>> At KNI init time, we will pass the base address of the pool and size of
>> the pool to kernel. In kernel, using get_user_pages API, we will get
>> the pages with size PAGE_SIZE and store the mapping and start address
>> of user space in a table.
>>
>> In fast path for any user address perform PAGE_SHIFT
>> (user_addr >> PAGE_SHIFT) and subtract the start address from this value,
>> we will get the index of the kernel page with in the page map table.
>> Adding offset to this kernel page address, we will get the kernel address
>> for this user virtual address.
>>
>> For example user pool base address is X, and size is S that we passed to
>> kernel. In kernel we will create a mapping for this using get_user_pages.
>> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2) ....]
>> and user start page will be U (we will get it from X >> PAGE_SHIFT).
>>
>> For any user address Z we will get the index of the page map table using
>> ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to this
>> address will give kernel virtual address.
>>
>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> 
> <...>
> 
>> +int
>> +kni_pin_pages(void *address, size_t size, struct page_info *mem)
>> +{
>> +	unsigned int gup_flags = FOLL_WRITE;
>> +	long npgs;
>> +	int err;
>> +
>> +	/* Get at least one page */
>> +	if (size < PAGE_SIZE)
>> +		size = PAGE_SIZE;
>> +
>> +	/* Compute number of user pages based on page size */
>> +	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
>> +
>> +	/* Allocate memory for the pages */
>> +	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
>> +		      GFP_KERNEL | __GFP_NOWARN);
>> +	if (!mem->pgs) {
>> +		pr_err("%s: -ENOMEM\n", __func__);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	down_write(&current->mm->mmap_sem);
>> +
>> +	/* Get the user pages from the user address*/
>> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
>> +	npgs = get_user_pages((u64)address, mem->npgs,
>> +				gup_flags, &mem->pgs[0], NULL);
>> +#else
>> +	npgs = get_user_pages(current, current->mm, (u64)address, mem->npgs,
>> +				gup_flags, 0, &mem->pgs[0], NULL);
>> +#endif
>> +	up_write(&current->mm->mmap_sem);
> 
> This should work even memory is physically not continuous, right? Where exactly
> physically continuous requirement is coming from?
> 
> <...>
> 
>> +
>> +/* Get the kernel address from the user address using
>> + * page map table. Will be used only in IOVA=VA mode
>> + */
>> +static inline void*
>> +get_kva(uint64_t usr_addr, struct kni_dev *kni)
>> +{
>> +	uint32_t index;
>> +	/* User page - start user page will give the index
>> +	 * with in the page map table
>> +	 */
>> +	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
>> +
>> +	/* Add the offset to the page address */
>> +	return (kni->va_info.page_map[index].addr +
>> +		(usr_addr & kni->va_info.page_mask));
>> +
>> +}
>> +
>>   /* physical address to kernel virtual address */
>>   static void *
>>   pa2kva(void *pa)
>> @@ -186,7 +205,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>>   			return;
>>
>>   		for (i = 0; i < num_rx; i++) {
>> -			kva = pa2kva(kni->pa[i]);
>> +			if (likely(kni->iova_mode == 1))
>> +				kva = get_kva((u64)(kni->pa[i]), kni);
> 
> kni->pa[] now has iova addresses, for 'get_kva()' to work shouldn't
> 'va_info.start_page' calculated from 'mempool_memhdr->iova' instead of
> 'mempool_memhdr->addr'
> 
> If this is working I must be missing something but not able to find what it is.
> 
> <...>
> 
>> @@ -304,6 +304,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>>   	kni->group_id = conf->group_id;
>>   	kni->mbuf_size = conf->mbuf_size;
>>
>> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
>> +	if (dev_info.iova_mode) {
>> +		struct rte_mempool_memhdr *hdr;
>> +		uint64_t pool_size = 0;
>> +
>> +		/* In each pool header chunk, we will maintain the
>> +		 * base address of the pool. This chunk is physically and
>> +		 * virtually contiguous.
>> +		 * This approach will work, only if the allocated pool
>> +		 * memory is contiguous, else it won't work
>> +		 */
>> +		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
>> +		dev_info.mbuf_va = (void *)(hdr->addr);
>> +
>> +		/* Traverse the list and get the total size of the pool */
>> +		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
>> +			pool_size += hdr->len;
>> +		}
> 
> This code is aware that there may be multiple chunks, but assumes they are all
> continuous, I don't know if this assumption is correct.

It is in fact incorrect - mempool chunks can be located anywhere in memory.

> 
> Also I guess there is another assumption that there will be single pktmbuf_pool
> in the application which passed into kni?
> What if there are multiple pktmbuf_pool, like one for each PMD, will this work?
> Now some mbufs in kni Rx fifo will come from different pktmbuf_pool which we
> don't know their pages, so won't able to get their kernel virtual address.
> 

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [EXT] Re: [PATCH v2] kni: add IOVA va support for kni
  2019-04-04  5:03     ` [dpdk-dev] [EXT] " Kiran Kumar Kokkilagadda
@ 2019-04-04 11:20       ` Ferruh Yigit
  2019-04-04 13:29         ` Burakov, Anatoly
  0 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-04-04 11:20 UTC (permalink / raw)
  To: Kiran Kumar Kokkilagadda; +Cc: dev, Jerin Jacob

On 4/4/2019 6:03 AM, Kiran Kumar Kokkilagadda wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>> Sent: Wednesday, April 3, 2019 9:59 PM
>> To: Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
>> Cc: dev@dpdk.org; Jerin Jacob <jerin.jacob@caviumnetworks.com>
>> Subject: [EXT] Re: [dpdk-dev] [PATCH v2] kni: add IOVA va support for kni
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> On 4/1/2019 10:51 AM, Kiran Kumar Kokkilagadda wrote:
>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>
>>> With current KNI implementation kernel module will work only in
>>> IOVA=PA mode. This patch will add support for kernel module to work
>>> with IOVA=VA mode.
>>
>> Thanks Kiran for removing the limitation, I have a few questions, can you please
>> help me understand.
>>
>> And when this patch is ready, the restriction in 'linux/eal/eal.c', in 'rte_eal_init'
>> should be removed, perhaps with this patch. I assume you already doing it to be
>> able to test this patch.
>>
> 
> User can choose the mode by passing --iova-mode=<va/pa>. I will remove the rte_kni module restriction.
>  
> 
>>>
>>> The idea is to maintain a mapping in KNI module between user pages and
>>> kernel pages and in fast path perform a lookup in this table and get
>>> the kernel virtual address for corresponding user virtual address.
>>>
>>> In IOVA=VA mode, the memory allocated to the pool is physically and
>>> virtually contiguous. We will take advantage of this and create a
>>> mapping in the kernel.In kernel we need mapping for queues (tx_q,
>>> rx_q,... slow path) and mbuf memory (fast path).
>>
>> Is it?
>> As far as I know mempool can have multiple chunks and they can be both
>> virtually and physically separated.
>>
>> And even for a single chunk, that will be virtually continuous, but will it be
>> physically continuous?
>>
> 
> You are right, it need not have to be physically contiguous. Will change the description.
>  
> 
>>>
>>> At the KNI init time, in slow path we will create a mapping for the
>>> queues and mbuf using get_user_pages similar to af_xdp. Using pool
>>> memory base address, we will create a page map table for the mbuf,
>>> which we will use in the fast path for kernel page translation.
>>>
>>> At KNI init time, we will pass the base address of the pool and size
>>> of the pool to kernel. In kernel, using get_user_pages API, we will
>>> get the pages with size PAGE_SIZE and store the mapping and start
>>> address of user space in a table.
>>>
>>> In fast path for any user address perform PAGE_SHIFT (user_addr >>
>>> PAGE_SHIFT) and subtract the start address from this value, we will
>>> get the index of the kernel page with in the page map table.
>>> Adding offset to this kernel page address, we will get the kernel
>>> address for this user virtual address.
>>>
>>> For example user pool base address is X, and size is S that we passed
>>> to kernel. In kernel we will create a mapping for this using get_user_pages.
>>> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2)
>>> ....] and user start page will be U (we will get it from X >> PAGE_SHIFT).
>>>
>>> For any user address Z we will get the index of the page map table
>>> using ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to
>>> this address will give kernel virtual address.
>>>
>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>
>> <...>
>>
>>> +int
>>> +kni_pin_pages(void *address, size_t size, struct page_info *mem) {
>>> +	unsigned int gup_flags = FOLL_WRITE;
>>> +	long npgs;
>>> +	int err;
>>> +
>>> +	/* Get at least one page */
>>> +	if (size < PAGE_SIZE)
>>> +		size = PAGE_SIZE;
>>> +
>>> +	/* Compute number of user pages based on page size */
>>> +	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
>>> +
>>> +	/* Allocate memory for the pages */
>>> +	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
>>> +		      GFP_KERNEL | __GFP_NOWARN);
>>> +	if (!mem->pgs) {
>>> +		pr_err("%s: -ENOMEM\n", __func__);
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	down_write(&current->mm->mmap_sem);
>>> +
>>> +	/* Get the user pages from the user address*/ #if
>> LINUX_VERSION_CODE
>>> +>= KERNEL_VERSION(4,9,0)
>>> +	npgs = get_user_pages((u64)address, mem->npgs,
>>> +				gup_flags, &mem->pgs[0], NULL);
>>> +#else
>>> +	npgs = get_user_pages(current, current->mm, (u64)address, mem-
>>> npgs,
>>> +				gup_flags, 0, &mem->pgs[0], NULL); #endif
>>> +	up_write(&current->mm->mmap_sem);
>>
>> This should work even memory is physically not continuous, right? Where
>> exactly physically continuous requirement is coming from?
>>
> 
> Yes, it is not necessary to be physically contiguous.
> 
>  
> 
> 
>> <...>
>>
>>> +
>>> +/* Get the kernel address from the user address using
>>> + * page map table. Will be used only in IOVA=VA mode  */ static
>>> +inline void* get_kva(uint64_t usr_addr, struct kni_dev *kni) {
>>> +	uint32_t index;
>>> +	/* User page - start user page will give the index
>>> +	 * with in the page map table
>>> +	 */
>>> +	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
>>> +
>>> +	/* Add the offset to the page address */
>>> +	return (kni->va_info.page_map[index].addr +
>>> +		(usr_addr & kni->va_info.page_mask));
>>> +
>>> +}
>>> +
>>>  /* physical address to kernel virtual address */  static void *
>>> pa2kva(void *pa) @@ -186,7 +205,10 @@ kni_fifo_trans_pa2va(struct
>>> kni_dev *kni,
>>>  			return;
>>>
>>>  		for (i = 0; i < num_rx; i++) {
>>> -			kva = pa2kva(kni->pa[i]);
>>> +			if (likely(kni->iova_mode == 1))
>>> +				kva = get_kva((u64)(kni->pa[i]), kni);
>>
>> kni->pa[] now has iova addresses, for 'get_kva()' to work shouldn't
>> 'va_info.start_page' calculated from 'mempool_memhdr->iova' instead of
>> 'mempool_memhdr->addr'
>>
>> If this is working I must be missing something but not able to find what it is.
>>
>> <...>
>>
>  
> In IOVA=VA mode, both the values will be same right?

I don't know for sure, but according to my understanding 'mempool_memhdr->addr' is
the application virtual address, while 'mempool_memhdr->iova' is the device virtual
address and can be anything as long as the IOMMU is configured correctly.
So technically ->addr and ->iova can be the same, but I don't know whether that is
always the case in DPDK.
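
A small defensive sketch (hypothetical, not in the patch) of the check the
library side could do before relying on ->addr: report IOVA=VA to the kernel
module only when every chunk's ->iova actually equals its ->addr:

#include <stdbool.h>
#include <stdint.h>
#include <rte_eal.h>
#include <rte_mempool.h>

/* True only if EAL runs in IOVA=VA mode and every chunk has iova == addr. */
static bool
kni_can_use_va_mapping(const struct rte_mempool *mp)
{
	const struct rte_mempool_memhdr *hdr;

	if (rte_eal_iova_mode() != RTE_IOVA_VA)
		return false;

	STAILQ_FOREACH(hdr, &mp->mem_list, next) {
		if ((uint64_t)(uintptr_t)hdr->addr != (uint64_t)hdr->iova)
			return false; /* e.g. externally allocated memory */
	}
	return true;
}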

> 
> 
>>> @@ -304,6 +304,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>>>  	kni->group_id = conf->group_id;
>>>  	kni->mbuf_size = conf->mbuf_size;
>>>
>>> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
>>> +	if (dev_info.iova_mode) {
>>> +		struct rte_mempool_memhdr *hdr;
>>> +		uint64_t pool_size = 0;
>>> +
>>> +		/* In each pool header chunk, we will maintain the
>>> +		 * base address of the pool. This chunk is physically and
>>> +		 * virtually contiguous.
>>> +		 * This approach will work, only if the allocated pool
>>> +		 * memory is contiguous, else it won't work
>>> +		 */
>>> +		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
>>> +		dev_info.mbuf_va = (void *)(hdr->addr);
>>> +
>>> +		/* Traverse the list and get the total size of the pool */
>>> +		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
>>> +			pool_size += hdr->len;
>>> +		}
>>
>> This code is aware that there may be multiple chunks, but assumes they are all
>> continuous, I don't know if this assumption is correct.
>>
>> Also I guess there is another assumption that there will be single pktmbuf_pool
>> in the application which passed into kni?
>> What if there are multiple pktmbuf_pool, like one for each PMD, will this work?
>> Now some mbufs in kni Rx fifo will come from different pktmbuf_pool which we
>> don't know their pages, so won't able to get their kernel virtual address.
> 
> All these chunks have to be virtually contiguous, otherwise this approach will not work. Here one thing we can do is create the mapping for complete huge page, and make sure all the mbuff_mempools will be with in this offset. As huge page size will be big (x64 it is 1G, ARM 512MB), we may not exceed the size, even with multiple pktmbuf_pools. If mbuf_pools were created outside this huge_page, this approach will not work and we will fall back to PA mode.
> 

Those were also my two points:
- This requires all chunks to be virtually contiguous
- There should be only one pktmbuf_pool in the system for this to work

But both of the above assumptions are incorrect.
And as far as I can see there is no way to detect them and act accordingly.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [PATCH v2] kni: add IOVA va support for kni
  2019-04-04  9:57     ` Burakov, Anatoly
@ 2019-04-04 11:21       ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-04-04 11:21 UTC (permalink / raw)
  To: Burakov, Anatoly, Kiran Kumar Kokkilagadda; +Cc: dev, Jerin Jacob

On 4/4/2019 10:57 AM, Burakov, Anatoly wrote:
> On 03-Apr-19 5:29 PM, Ferruh Yigit wrote:
>> On 4/1/2019 10:51 AM, Kiran Kumar Kokkilagadda wrote:
>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>
>>> With current KNI implementation kernel module will work only in
>>> IOVA=PA mode. This patch will add support for kernel module to work
>>> with IOVA=VA mode.
>>
>> Thanks Kiran for removing the limitation, I have a few questions, can you please
>> help me understand.
>>
>> And when this patch is ready, the restriction in 'linux/eal/eal.c', in
>> 'rte_eal_init' should be removed, perhaps with this patch. I assume you already
>> doing it to be able to test this patch.
>>
>>>
>>> The idea is to maintain a mapping in KNI module between user pages and
>>> kernel pages and in fast path perform a lookup in this table and get
>>> the kernel virtual address for corresponding user virtual address.
>>>
>>> In IOVA=VA mode, the memory allocated to the pool is physically
>>> and virtually contiguous. We will take advantage of this and create a
>>> mapping in the kernel.In kernel we need mapping for queues
>>> (tx_q, rx_q,... slow path) and mbuf memory (fast path).
>>
>> Is it?
>> As far as I know mempool can have multiple chunks and they can be both virtually
>> and physically separated.
>>
>> And even for a single chunk, that will be virtually continuous, but will it be
>> physically continuous?
> 
> Just to clarify.
> 
> Within DPDK, we do not make a distinction between physical address and 
> IOVA address - we never need the actual physical address, we just need 
> the DMA addresses, which can either match the physical address, or be 
> completely arbitrary (in our case, they will match VA addresses, but it 
> doesn't have to be the case - in fact, 17.11 will, under some 
> circumstances, populate IOVA addresses simply starting from address 0).
> 
> However, one has to remember that IOVA address *is not a physical 
> address*. The pages backing a VA chunk may be *IOVA*-contiguous, but may 
> not necessarily be *physically* contiguous. Under normal circumstances 
> we really don't care, because the VFIO/IOMMU takes care of the mapping 
> between IOVA and PA transparently for the hardware.
> 
> So, in IOVA as VA mode, the memory allocated to the mempool will be 
> (within a given chunk) both VA and IOVA contiguous - but not necessarily 
> *physically* contiguous! In fact, if you try calling rte_mem_virt2phy() 
> on the mempool pages, you'll likely find that they aren't (i've seen 
> cases where pages were mapped /backwards/!).
> 
> Therefore, unless by "physically contiguous" you mean "IOVA-contiguous", 
> you *cannot* rely on memory in mempool being *physically* contiguous 
> merely based on the fact that it's IOVA-contiguous.

Thanks for clarification.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [EXT] Re: [PATCH v2] kni: add IOVA va support for kni
  2019-04-04 11:20       ` Ferruh Yigit
@ 2019-04-04 13:29         ` Burakov, Anatoly
  0 siblings, 0 replies; 251+ messages in thread
From: Burakov, Anatoly @ 2019-04-04 13:29 UTC (permalink / raw)
  To: Ferruh Yigit, Kiran Kumar Kokkilagadda; +Cc: dev, Jerin Jacob

On 04-Apr-19 12:20 PM, Ferruh Yigit wrote:
> On 4/4/2019 6:03 AM, Kiran Kumar Kokkilagadda wrote:
>>
>>
>>> -----Original Message-----
>>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>>> Sent: Wednesday, April 3, 2019 9:59 PM
>>> To: Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
>>> Cc: dev@dpdk.org; Jerin Jacob <jerin.jacob@caviumnetworks.com>
>>> Subject: [EXT] Re: [dpdk-dev] [PATCH v2] kni: add IOVA va support for kni
>>>
>>> External Email
>>>
>>> ----------------------------------------------------------------------
>>> On 4/1/2019 10:51 AM, Kiran Kumar Kokkilagadda wrote:
>>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>>
>>>> With current KNI implementation kernel module will work only in
>>>> IOVA=PA mode. This patch will add support for kernel module to work
>>>> with IOVA=VA mode.
>>>
>>> Thanks Kiran for removing the limitation, I have a few questions, can you please
>>> help me understand.
>>>
>>> And when this patch is ready, the restriction in 'linux/eal/eal.c', in 'rte_eal_init'
>>> should be removed, perhaps with this patch. I assume you already doing it to be
>>> able to test this patch.
>>>
>>
>> User can choose the mode by passing --iova-mode=<va/pa>. I will remove the rte_kni module restriction.
>>   
>>
>>>>
>>>> The idea is to maintain a mapping in KNI module between user pages and
>>>> kernel pages and in fast path perform a lookup in this table and get
>>>> the kernel virtual address for corresponding user virtual address.
>>>>
>>>> In IOVA=VA mode, the memory allocated to the pool is physically and
>>>> virtually contiguous. We will take advantage of this and create a
>>>> mapping in the kernel.In kernel we need mapping for queues (tx_q,
>>>> rx_q,... slow path) and mbuf memory (fast path).
>>>
>>> Is it?
>>> As far as I know mempool can have multiple chunks and they can be both
>>> virtually and physically separated.
>>>
>>> And even for a single chunk, that will be virtually continuous, but will it be
>>> physically continuous?
>>>
>>
>> You are right, it need not have to be physically contiguous. Will change the description.
>>   
>>
>>>>
>>>> At the KNI init time, in slow path we will create a mapping for the
>>>> queues and mbuf using get_user_pages similar to af_xdp. Using pool
>>>> memory base address, we will create a page map table for the mbuf,
>>>> which we will use in the fast path for kernel page translation.
>>>>
>>>> At KNI init time, we will pass the base address of the pool and size
>>>> of the pool to kernel. In kernel, using get_user_pages API, we will
>>>> get the pages with size PAGE_SIZE and store the mapping and start
>>>> address of user space in a table.
>>>>
>>>> In fast path for any user address perform PAGE_SHIFT (user_addr >>
>>>> PAGE_SHIFT) and subtract the start address from this value, we will
>>>> get the index of the kernel page with in the page map table.
>>>> Adding offset to this kernel page address, we will get the kernel
>>>> address for this user virtual address.
>>>>
>>>> For example user pool base address is X, and size is S that we passed
>>>> to kernel. In kernel we will create a mapping for this using get_user_pages.
>>>> Our page map table will look like [Y, Y+PAGE_SIZE, Y+(PAGE_SIZE*2)
>>>> ....] and user start page will be U (we will get it from X >> PAGE_SHIFT).
>>>>
>>>> For any user address Z we will get the index of the page map table
>>>> using ((Z >> PAGE_SHIFT) - U). Adding offset (Z & (PAGE_SIZE - 1)) to
>>>> this address will give kernel virtual address.
>>>>
>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>
>>> <...>
>>>
>>>> +int
>>>> +kni_pin_pages(void *address, size_t size, struct page_info *mem) {
>>>> +	unsigned int gup_flags = FOLL_WRITE;
>>>> +	long npgs;
>>>> +	int err;
>>>> +
>>>> +	/* Get at least one page */
>>>> +	if (size < PAGE_SIZE)
>>>> +		size = PAGE_SIZE;
>>>> +
>>>> +	/* Compute number of user pages based on page size */
>>>> +	mem->npgs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
>>>> +
>>>> +	/* Allocate memory for the pages */
>>>> +	mem->pgs = kcalloc(mem->npgs, sizeof(*mem->pgs),
>>>> +		      GFP_KERNEL | __GFP_NOWARN);
>>>> +	if (!mem->pgs) {
>>>> +		pr_err("%s: -ENOMEM\n", __func__);
>>>> +		return -ENOMEM;
>>>> +	}
>>>> +
>>>> +	down_write(&current->mm->mmap_sem);
>>>> +
>>>> +	/* Get the user pages from the user address*/ #if
>>> LINUX_VERSION_CODE
>>>> +>= KERNEL_VERSION(4,9,0)
>>>> +	npgs = get_user_pages((u64)address, mem->npgs,
>>>> +				gup_flags, &mem->pgs[0], NULL);
>>>> +#else
>>>> +	npgs = get_user_pages(current, current->mm, (u64)address, mem-
>>>> npgs,
>>>> +				gup_flags, 0, &mem->pgs[0], NULL); #endif
>>>> +	up_write(&current->mm->mmap_sem);
>>>
>>> This should work even memory is physically not continuous, right? Where
>>> exactly physically continuous requirement is coming from?
>>>
>>
>> Yes, it is not necessary to be physically contiguous.
>>
>>   
>>
>>
>>> <...>
>>>
>>>> +
>>>> +/* Get the kernel address from the user address using
>>>> + * page map table. Will be used only in IOVA=VA mode  */ static
>>>> +inline void* get_kva(uint64_t usr_addr, struct kni_dev *kni) {
>>>> +	uint32_t index;
>>>> +	/* User page - start user page will give the index
>>>> +	 * with in the page map table
>>>> +	 */
>>>> +	index = (usr_addr >> PAGE_SHIFT) - kni->va_info.start_page;
>>>> +
>>>> +	/* Add the offset to the page address */
>>>> +	return (kni->va_info.page_map[index].addr +
>>>> +		(usr_addr & kni->va_info.page_mask));
>>>> +
>>>> +}
>>>> +
>>>>   /* physical address to kernel virtual address */  static void *
>>>> pa2kva(void *pa) @@ -186,7 +205,10 @@ kni_fifo_trans_pa2va(struct
>>>> kni_dev *kni,
>>>>   			return;
>>>>
>>>>   		for (i = 0; i < num_rx; i++) {
>>>> -			kva = pa2kva(kni->pa[i]);
>>>> +			if (likely(kni->iova_mode == 1))
>>>> +				kva = get_kva((u64)(kni->pa[i]), kni);
>>>
>>> kni->pa[] now has iova addresses, for 'get_kva()' to work shouldn't
>>> 'va_info.start_page' calculated from 'mempool_memhdr->iova' instead of
>>> 'mempool_memhdr->addr'
>>>
>>> If this is working I must be missing something but not able to find what it is.
>>>
>>> <...>
>>>
>>   
>> In IOVA=VA mode, both the values will be same right?
> 
> I don't know for sure, but according my understanding 'mempool_memhdr->addr' is
> application virtual address, 'mempool_memhdr->iova' id device virtual address
> and can be anything as long as iommu configured correctly.
> So technically ->addr and ->iova can be same, but I don't know it is always the
> case in DPDK.

In the IOVA as VA case, they will be the same, except when external memory is 
used (in which case all bets are off, as that memory is not managed by 
DPDK and we can't enforce any constraints on it). That is not to say 
that IOVA being different from VA can't happen in the future, but the 
whole notion of "IOVA as VA" implies that IOVA is VA :)

> 
>>
>>
>>>> @@ -304,6 +304,27 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>>>>   	kni->group_id = conf->group_id;
>>>>   	kni->mbuf_size = conf->mbuf_size;
>>>>
>>>> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
>>>> +	if (dev_info.iova_mode) {
>>>> +		struct rte_mempool_memhdr *hdr;
>>>> +		uint64_t pool_size = 0;
>>>> +
>>>> +		/* In each pool header chunk, we will maintain the
>>>> +		 * base address of the pool. This chunk is physically and
>>>> +		 * virtually contiguous.
>>>> +		 * This approach will work, only if the allocated pool
>>>> +		 * memory is contiguous, else it won't work
>>>> +		 */
>>>> +		hdr = STAILQ_FIRST(&pktmbuf_pool->mem_list);
>>>> +		dev_info.mbuf_va = (void *)(hdr->addr);
>>>> +
>>>> +		/* Traverse the list and get the total size of the pool */
>>>> +		STAILQ_FOREACH(hdr, &pktmbuf_pool->mem_list, next) {
>>>> +			pool_size += hdr->len;
>>>> +		}
>>>
>>> This code is aware that there may be multiple chunks, but assumes they are all
>>> continuous, I don't know if this assumption is correct.
>>>
>>> Also I guess there is another assumption that there will be single pktmbuf_pool
>>> in the application which passed into kni?
>>> What if there are multiple pktmbuf_pool, like one for each PMD, will this work?
>>> Now some mbufs in kni Rx fifo will come from different pktmbuf_pool which we
>>> don't know their pages, so won't able to get their kernel virtual address.
>>
>> All these chunks have to be virtually contiguous, otherwise this approach will not work. Here one thing we can do is create the mapping for complete huge page, and make sure all the mbuff_mempools will be with in this offset. As huge page size will be big (x64 it is 1G, ARM 512MB), we may not exceed the size, even with multiple pktmbuf_pools. If mbuf_pools were created outside this huge_page, this approach will not work and we will fall back to PA mode.
>>
> 
> Those were also my two points,
> - This requires all chunks in to be virtually continuous
> - There should be only one pktmbuf_pools in system for this to work
> 
> But both above assumptions are not correct.
> And as far as I can see there is not way to detect them and act accordingly.
> 

Correct, there is no guarantee that an entire mempool will be contained 
within one page, or even within two adjacent pages. There is nothing 
stopping the memchunks from being allocated backwards (i.e. memchunk1 
may have a higher or lower address than memchunk2), or from being 
allocated from pages of different sizes.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3] kni: add IOVA va support for kni
  2019-04-01  9:51 ` [PATCH v2] " Kiran Kumar Kokkilagadda
  2019-04-03 16:29   ` Ferruh Yigit
@ 2019-04-16  4:55   ` kirankumark
  2019-04-19 10:38     ` Thomas Monjalon
  2019-04-22  4:39     ` [dpdk-dev] [PATCH v4] " kirankumark
  1 sibling, 2 replies; 251+ messages in thread
From: kirankumark @ 2019-04-16  4:55 UTC (permalink / raw)
  To: ferruh.yigit; +Cc: dev, Kiran Kumar K

From: Kiran Kumar K <kirankumark@marvell.com>

With current KNI implementation kernel module will work only in
IOVA=PA mode. This patch will add support for kernel module to work
with IOVA=VA mode.

The idea is to get the physical address from the IOVA address using
the iommu_iova_to_phys API, and then use the phys_to_virt API to
convert that physical address to a kernel virtual address.

With this approach we have compared the performance against IOVA=PA
mode and observed no difference; the kernel itself appears to be the
overhead.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---

V3 Changes:
* Add a new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

 kernel/linux/kni/kni_dev.h                    |  4 +
 kernel/linux/kni/kni_misc.c                   | 57 +++++++++++---
 kernel/linux/kni/kni_net.c                    | 76 +++++++++++++++----
 lib/librte_eal/linux/eal/eal.c                |  9 ---
 .../linux/eal/include/rte_kni_common.h        |  1 +
 lib/librte_kni/rte_kni.c                      |  2 +
 6 files changed, 116 insertions(+), 33 deletions(-)

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index df46aa70e..9c4944921 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>

 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -39,6 +40,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;

+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	struct net_device_stats stats;
 	int status;
 	uint16_t group_id;           /* Group ID of a group of KNI devices */
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 31845e10f..de4f6ce41 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -306,10 +306,12 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 #ifdef RTE_KNI_KMOD_ETHTOOL
 	struct pci_dev *found_pci = NULL;
 	struct net_device *lad_dev = NULL;
-	struct pci_dev *pci = NULL;
 #endif

 	pr_info("Creating kni...\n");
@@ -368,15 +370,50 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);

 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+
+	if (dev_info.iova_mode) {
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}

 	kni->mbuf_size = dev_info.mbuf_size;

diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index be9e6b0b9..4d07ba576 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;

+/* iova to kernel virtual address */
+static void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(dma_addr_t)pa));
+}
+
+static void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt((iommu_iova_to_phys(kni->domain,
+					(dma_addr_t)m->buf_physaddr) +
+			     m->data_off));
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;

 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = iova2kva(kni, kni->pa[i]);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}

@@ -263,8 +282,13 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;

-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = iova2kva(kni, pkt_pa);
+			data_kva = iova2data_kva(kni, pkt_kva);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);

 		len = skb->len;
@@ -335,9 +359,14 @@ kni_net_rx_normal(struct kni_dev *kni)

 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -434,13 +463,20 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+
+			if (likely(kni->iova_mode == 1)) {
+				kva = iova2kva(kni, kni->pa[i]);
+				data_kva = iova2data_kva(kni, kva);
+				alloc_kva = iova2kva(kni, kni->alloc_pa[i]);
+				alloc_data_kva = iova2data_kva(kni, alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				data_kva = kva2data_kva(kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+			}
 			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
-
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);

 			memcpy(alloc_data_kva, data_kva, len);
@@ -507,9 +543,15 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)

 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -545,8 +587,14 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;

-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = iova2kva(kni,
+						       va2pa(kva->next, kva));
+					data_kva = iova2data_kva(kni, kva);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}

diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index f7ae62d7b..8fac6707d 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1040,15 +1040,6 @@ rte_eal_init(int argc, char **argv)
 		/* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
 		rte_eal_get_configuration()->iova_mode =
 			rte_bus_get_iommu_class();
-
-		/* Workaround for KNI which requires physical address to work */
-		if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
-			RTE_LOG(WARNING, EAL,
-				"Some devices want IOVA as VA but PA will be used because.. "
-				"KNI module inserted\n");
-		}
 	} else {
 		rte_eal_get_configuration()->iova_mode =
 			internal_config.iova_mode;
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 5afa08713..79ee4bc5a 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -128,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	char mac_addr[6];
+	uint8_t iova_mode;
 };

 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 946459c79..ec8f23694 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -304,6 +304,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;

+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
--
2.17.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v3] kni: add IOVA va support for kni
  2019-04-16  4:55   ` [dpdk-dev] [PATCH v3] " kirankumark
@ 2019-04-19 10:38     ` Thomas Monjalon
  2019-04-22  4:39     ` [dpdk-dev] [PATCH v4] " kirankumark
  1 sibling, 0 replies; 251+ messages in thread
From: Thomas Monjalon @ 2019-04-19 10:38 UTC (permalink / raw)
  To: kirankumark; +Cc: dev, ferruh.yigit

Hi,

16/04/2019 06:55, kirankumark@marvell.com:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> With current KNI implementation kernel module will work only in
> IOVA=PA mode. This patch will add support for kernel module to work
> with IOVA=VA mode.
> 
> The idea is to get the physical address from iova address using
> api iommu_iova_to_phys. Using this API, we will get the physical
> address from iova address and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
> 
> With this approach we have compared the performance with IOVA=PA
> and there is no difference observed. Seems like kernel is the
> overhead.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
> 
> V3 Changes:
> * Add new approach to work kni with IOVA=VA mode using
> iommu_iova_to_phys API.

Compilation test is failing:
http://mails.dpdk.org/archives/test-report/2019-April/080701.html



^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4] kni: add IOVA va support for kni
  2019-04-16  4:55   ` [dpdk-dev] [PATCH v3] " kirankumark
  2019-04-19 10:38     ` Thomas Monjalon
@ 2019-04-22  4:39     ` kirankumark
  2019-04-22  6:15       ` [dpdk-dev] [PATCH v5] " kirankumark
  2019-04-23  8:56       ` [dpdk-dev] [PATCH v4] kni: add IOVA va support for kni Burakov, Anatoly
  1 sibling, 2 replies; 251+ messages in thread
From: kirankumark @ 2019-04-22  4:39 UTC (permalink / raw)
  To: ferruh.yigit; +Cc: dev, Kiran Kumar K

From: Kiran Kumar K <kirankumark@marvell.com>

With current KNI implementation kernel module will work only in
IOVA=PA mode. This patch will add support for kernel module to work
with IOVA=VA mode.

The idea is to get the physical address from the IOVA address using
the iommu_iova_to_phys API, and then use the phys_to_virt API to
convert that physical address to a kernel virtual address.

With this approach we have compared the performance against IOVA=PA
mode and observed no difference; the kernel itself appears to be the
overhead.

This approach will not work with kernel versions older than 4.4.0
because of API compatibility issues.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernels 4.4.0 and above


V3 Changes:
* Add a new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

 kernel/linux/kni/kni_dev.h                    |  4 +
 kernel/linux/kni/kni_misc.c                   | 63 ++++++++++++---
 kernel/linux/kni/kni_net.c                    | 76 +++++++++++++++----
 lib/librte_eal/linux/eal/eal.c                |  9 ---
 .../linux/eal/include/rte_kni_common.h        |  1 +
 lib/librte_kni/rte_kni.c                      |  2 +
 6 files changed, 122 insertions(+), 33 deletions(-)

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index df46aa70e..9c4944921 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>

 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -39,6 +40,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;

+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	struct net_device_stats stats;
 	int status;
 	uint16_t group_id;           /* Group ID of a group of KNI devices */
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 31845e10f..9e90af31b 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -306,10 +306,12 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 #ifdef RTE_KNI_KMOD_ETHTOOL
 	struct pci_dev *found_pci = NULL;
 	struct net_device *lad_dev = NULL;
-	struct pci_dev *pci = NULL;
 #endif

 	pr_info("Creating kni...\n");
@@ -368,15 +370,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);

 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+
+	if (dev_info.iova_mode) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
+		(void)pci;
+		pr_err("Kernel version is not supported\n");
+		return -EINVAL;
+#else
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}

 	kni->mbuf_size = dev_info.mbuf_size;

diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index be9e6b0b9..4d07ba576 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;

+/* iova to kernel virtual address */
+static void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(dma_addr_t)pa));
+}
+
+static void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt((iommu_iova_to_phys(kni->domain,
+					(dma_addr_t)m->buf_physaddr) +
+			     m->data_off));
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;

 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = iova2kva(kni, kni->pa[i]);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}

@@ -263,8 +282,13 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;

-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = iova2kva(kni, pkt_pa);
+			data_kva = iova2data_kva(kni, pkt_kva);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);

 		len = skb->len;
@@ -335,9 +359,14 @@ kni_net_rx_normal(struct kni_dev *kni)

 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -434,13 +463,20 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+
+			if (likely(kni->iova_mode == 1)) {
+				kva = iova2kva(kni, kni->pa[i]);
+				data_kva = iova2data_kva(kni, kva);
+				alloc_kva = iova2kva(kni, kni->alloc_pa[i]);
+				alloc_data_kva = iova2data_kva(kni, alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				data_kva = kva2data_kva(kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+			}
 			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
-
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);

 			memcpy(alloc_data_kva, data_kva, len);
@@ -507,9 +543,15 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)

 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -545,8 +587,14 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;

-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = iova2kva(kni,
+						       va2pa(kva->next, kva));
+					data_kva = iova2data_kva(kni, kva);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}

diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index f7ae62d7b..8fac6707d 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1040,15 +1040,6 @@ rte_eal_init(int argc, char **argv)
 		/* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
 		rte_eal_get_configuration()->iova_mode =
 			rte_bus_get_iommu_class();
-
-		/* Workaround for KNI which requires physical address to work */
-		if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
-			RTE_LOG(WARNING, EAL,
-				"Some devices want IOVA as VA but PA will be used because.. "
-				"KNI module inserted\n");
-		}
 	} else {
 		rte_eal_get_configuration()->iova_mode =
 			internal_config.iova_mode;
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 5afa08713..79ee4bc5a 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -128,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	char mac_addr[6];
+	uint8_t iova_mode;
 };

 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 946459c79..ec8f23694 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -304,6 +304,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;

+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
--
2.17.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v5] kni: add IOVA va support for kni
  2019-04-22  4:39     ` [dpdk-dev] [PATCH v4] " kirankumark
@ 2019-04-22  6:15       ` kirankumark
  2019-04-26  9:11         ` Burakov, Anatoly
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
  2019-04-23  8:56       ` [dpdk-dev] [PATCH v4] kni: add IOVA va support for kni Burakov, Anatoly
  1 sibling, 2 replies; 251+ messages in thread
From: kirankumark @ 2019-04-22  6:15 UTC (permalink / raw)
  To: ferruh.yigit; +Cc: dev, Kiran Kumar K

From: Kiran Kumar K <kirankumark@marvell.com>

With the current KNI implementation, the kernel module works only in
IOVA=PA mode. This patch adds support for the kernel module to work
in IOVA=VA mode.

The idea is to get the physical address behind an IOVA using the
iommu_iova_to_phys API and then use the phys_to_virt API to convert
that physical address to a kernel virtual address.

With this approach we compared the performance against IOVA=PA mode
and observed no difference; the kernel path appears to be the
dominant overhead.

This approach does not work with kernel versions older than 4.4.0
because of API compatibility issues.
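
As an illustrative sketch of the translation chain described above (the
actual helpers are iova2kva()/iova2data_kva() in kni_net.c below; the
helper name here is made up, and it assumes the referenced buffer does
not cross a physical page boundary):

/* Sketch only: resolve an IOVA to a kernel virtual address through
 * the device's IOMMU domain, then through the kernel's linear map.
 */
static void *
kni_iova_to_kva(struct iommu_domain *domain, dma_addr_t iova)
{
	phys_addr_t pa = iommu_iova_to_phys(domain, iova);

	return phys_to_virt(pa);
}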

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
V5 changes:
* Fixed a build issue with the 32-bit build

V4 changes:
* Fixed build issues with older kernel versions
* This approach only works with kernel versions 4.4.0 and newer

V3 Changes:
* Added a new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

 kernel/linux/kni/kni_dev.h                    |  4 +
 kernel/linux/kni/kni_misc.c                   | 63 ++++++++++++---
 kernel/linux/kni/kni_net.c                    | 76 +++++++++++++++----
 lib/librte_eal/linux/eal/eal.c                |  9 ---
 .../linux/eal/include/rte_kni_common.h        |  1 +
 lib/librte_kni/rte_kni.c                      |  2 +
 6 files changed, 122 insertions(+), 33 deletions(-)

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index df46aa70e..9c4944921 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>

 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -39,6 +40,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;

+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	struct net_device_stats stats;
 	int status;
 	uint16_t group_id;           /* Group ID of a group of KNI devices */
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 31845e10f..9e90af31b 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -306,10 +306,12 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 #ifdef RTE_KNI_KMOD_ETHTOOL
 	struct pci_dev *found_pci = NULL;
 	struct net_device *lad_dev = NULL;
-	struct pci_dev *pci = NULL;
 #endif

 	pr_info("Creating kni...\n");
@@ -368,15 +370,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);

 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+
+	if (dev_info.iova_mode) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
+		(void)pci;
+		pr_err("Kernel version is not supported\n");
+		return -EINVAL;
+#else
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}

 	kni->mbuf_size = dev_info.mbuf_size;

diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index be9e6b0b9..e77a28066 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;

+/* iova to kernel virtual address */
+static void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt((iommu_iova_to_phys(kni->domain,
+					(uintptr_t)m->buf_physaddr) +
+			     m->data_off));
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;

 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = iova2kva(kni, kni->pa[i]);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}

@@ -263,8 +282,13 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;

-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = iova2kva(kni, pkt_pa);
+			data_kva = iova2data_kva(kni, pkt_kva);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);

 		len = skb->len;
@@ -335,9 +359,14 @@ kni_net_rx_normal(struct kni_dev *kni)

 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -434,13 +463,20 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+
+			if (likely(kni->iova_mode == 1)) {
+				kva = iova2kva(kni, kni->pa[i]);
+				data_kva = iova2data_kva(kni, kva);
+				alloc_kva = iova2kva(kni, kni->alloc_pa[i]);
+				alloc_data_kva = iova2data_kva(kni, alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				data_kva = kva2data_kva(kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+			}
 			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
-
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);

 			memcpy(alloc_data_kva, data_kva, len);
@@ -507,9 +543,15 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)

 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);

 		skb = dev_alloc_skb(len + 2);
@@ -545,8 +587,14 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;

-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = iova2kva(kni,
+						       va2pa(kva->next, kva));
+					data_kva = iova2data_kva(kni, kva);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}

diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index f7ae62d7b..8fac6707d 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1040,15 +1040,6 @@ rte_eal_init(int argc, char **argv)
 		/* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
 		rte_eal_get_configuration()->iova_mode =
 			rte_bus_get_iommu_class();
-
-		/* Workaround for KNI which requires physical address to work */
-		if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
-			RTE_LOG(WARNING, EAL,
-				"Some devices want IOVA as VA but PA will be used because.. "
-				"KNI module inserted\n");
-		}
 	} else {
 		rte_eal_get_configuration()->iova_mode =
 			internal_config.iova_mode;
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 5afa08713..79ee4bc5a 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -128,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	char mac_addr[6];
+	uint8_t iova_mode;
 };

 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 946459c79..ec8f23694 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -304,6 +304,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;

+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
--
2.17.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v4] kni: add IOVA va support for kni
  2019-04-22  4:39     ` [dpdk-dev] [PATCH v4] " kirankumark
  2019-04-22  6:15       ` [dpdk-dev] [PATCH v5] " kirankumark
@ 2019-04-23  8:56       ` Burakov, Anatoly
  1 sibling, 0 replies; 251+ messages in thread
From: Burakov, Anatoly @ 2019-04-23  8:56 UTC (permalink / raw)
  To: kirankumark, ferruh.yigit; +Cc: dev

On 22-Apr-19 5:39 AM, kirankumark@marvell.com wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> With current KNI implementation kernel module will work only in
> IOVA=PA mode. This patch will add support for kernel module to work
> with IOVA=VA mode.
> 
> The idea is to get the physical address from iova address using
> api iommu_iova_to_phys. Using this API, we will get the physical
> address from iova address and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
> 
> With this approach we have compared the performance with IOVA=PA
> and there is no difference observed. Seems like kernel is the
> overhead.
> 
> This approach will not work with the kernel versions less than 4.4.0
> because of API compatibility issues.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---

<snip>

> +/* iova to kernel virtual address */
> +static void *
> +iova2kva(struct kni_dev *kni, void *pa)
> +{
> +	return phys_to_virt(iommu_iova_to_phys(kni->domain,
> +				(dma_addr_t)pa));
> +}
> +
> +static void *
> +iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
> +{
> +	return phys_to_virt((iommu_iova_to_phys(kni->domain,
> +					(dma_addr_t)m->buf_physaddr) +
> +			     m->data_off));

Does this account for mbufs crossing a page boundary? In IOVA-as-VA mode, 
the mempool is likely allocated in one go, so the mempool allocator will 
not take care to prevent mbufs from crossing a page boundary. The data may 
very well start at the very end of a page and continue through the 
beginning of the next page, which will have a different physical address.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v5] kni: add IOVA va support for kni
  2019-04-22  6:15       ` [dpdk-dev] [PATCH v5] " kirankumark
@ 2019-04-26  9:11         ` Burakov, Anatoly
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
  1 sibling, 0 replies; 251+ messages in thread
From: Burakov, Anatoly @ 2019-04-26  9:11 UTC (permalink / raw)
  To: kirankumark, ferruh.yigit; +Cc: dev

On 22-Apr-19 7:15 AM, kirankumark@marvell.com wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> With current KNI implementation kernel module will work only in
> IOVA=PA mode. This patch will add support for kernel module to work
> with IOVA=VA mode.
> 
> The idea is to get the physical address from iova address using
> api iommu_iova_to_phys. Using this API, we will get the physical
> address from iova address and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
> 
> With this approach we have compared the performance with IOVA=PA
> and there is no difference observed. Seems like kernel is the
> overhead.
> 
> This approach will not work with the kernel versions less than 4.4.0
> because of API compatibility issues.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---

<snip>

> diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
> index be9e6b0b9..e77a28066 100644
> --- a/kernel/linux/kni/kni_net.c
> +++ b/kernel/linux/kni/kni_net.c
> @@ -35,6 +35,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
>   /* kni rx function pointer, with default to normal rx */
>   static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
> 
> +/* iova to kernel virtual address */
> +static void *
> +iova2kva(struct kni_dev *kni, void *pa)
> +{
> +	return phys_to_virt(iommu_iova_to_phys(kni->domain,
> +				(uintptr_t)pa));
> +}
> +
> +static void *
> +iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
> +{
> +	return phys_to_virt((iommu_iova_to_phys(kni->domain,
> +					(uintptr_t)m->buf_physaddr) +
> +			     m->data_off));
> +}
> +

Apologies, I've accidentally responded to the previous version with this 
comment.

I don't see how this could possibly work, because for any 
IOVA-contiguous chunk of memory, mbufs are allowed to cross page 
boundaries. In this function, you're getting the start address of a 
buffer, but there are no guarantees that the end of the buffer is on the 
same physical page.
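
To put the condition formally (a sketch only, with a made-up helper name,
assuming IOMMU mappings at PAGE_SIZE granularity or larger): a single
iommu_iova_to_phys() lookup is only safe when the whole accessed range
stays within one page.

/* Sketch: phys_to_virt(iommu_iova_to_phys(domain, iova) + off) is only
 * valid if [iova + off, iova + off + len) fits in one page; otherwise
 * the tail may sit on a different physical page.
 */
static bool
range_in_one_page(dma_addr_t iova, size_t off, size_t len)
{
	u64 first = (iova + off) >> PAGE_SHIFT;
	u64 last = (iova + off + len - 1) >> PAGE_SHIFT;

	return first == last;
}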

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-04-22  6:15       ` [dpdk-dev] [PATCH v5] " kirankumark
  2019-04-26  9:11         ` Burakov, Anatoly
@ 2019-06-25  3:56         ` vattunuru
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 1/4] lib/mempool: skip populating mempool objs that falls on page boundaries vattunuru
                             ` (5 more replies)
  1 sibling, 6 replies; 251+ messages in thread
From: vattunuru @ 2019-06-25  3:56 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, olivier.matz, arybchenko, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

----
V6 Changes:
* Added a new mempool flag to ensure mbuf memory is not scattered
across page boundaries.
* Added the PCI device information required by the KNI kernel module.
* Modified the KNI example application to create its mempool with the
new mempool flag.

V5 changes:
* Fixed a build issue with the 32-bit build

V4 changes:
* Fixed build issues with older kernel versions
* This approach only works with kernel versions 4.4.0 and newer

V3 Changes:
* Added a new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

Kiran Kumar K (1):
  kernel/linux/kni: add IOVA support in kni module

Vamsi Attunuru (3):
  lib/mempool: skip populating mempool objs that falls on page
    boundaries
  lib/kni: add PCI related information
  example/kni: add IOVA support for kni application

 examples/kni/main.c                               | 53 +++++++++++++++-
 kernel/linux/kni/kni_dev.h                        |  3 +
 kernel/linux/kni/kni_misc.c                       | 62 +++++++++++++++---
 kernel/linux/kni/kni_net.c                        | 76 +++++++++++++++++++----
 lib/librte_eal/linux/eal/eal.c                    |  8 ---
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 +++
 lib/librte_kni/rte_kni.c                          |  7 +++
 lib/librte_mempool/rte_mempool.c                  |  2 +-
 lib/librte_mempool/rte_mempool.h                  |  2 +
 lib/librte_mempool/rte_mempool_ops_default.c      | 30 +++++++++
 10 files changed, 219 insertions(+), 32 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] lib/mempool: skip populating mempool objs that falls on page boundaries
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
@ 2019-06-25  3:56           ` vattunuru
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information vattunuru
                             ` (4 subsequent siblings)
  5 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-06-25  3:56 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, olivier.matz, arybchenko, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

This patch adds a new mempool flag to avoid scattering mbuf memory
across page boundaries. A mempool created with this flag set is
populated only with mbufs that lie entirely within page boundaries.
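
As a minimal usage sketch (illustrative only; the KNI example
application in this series wraps this in a helper, and error handling
plus pktmbuf private-data initialization are omitted):

struct rte_mempool *mp;

mp = rte_mempool_create_empty("kni_pktmbuf_pool", 8192,
		sizeof(struct rte_mbuf) + RTE_MBUF_DEFAULT_BUF_SIZE,
		256, sizeof(struct rte_pktmbuf_pool_private),
		rte_socket_id(), MEMPOOL_F_NO_PAGE_BOUND);
/* With the flag set, rte_mempool_populate_default() then skips any
 * object that would straddle a (huge)page boundary.
 */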

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 lib/librte_mempool/rte_mempool.c             |  2 +-
 lib/librte_mempool/rte_mempool.h             |  2 ++
 lib/librte_mempool/rte_mempool_ops_default.c | 30 ++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 69bd2a6..175a20a 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -338,7 +338,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
 		(char *)vaddr + off,
 		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
-		len - off, mempool_add_elem, NULL);
+		len - off, mempool_add_elem, opaque);
 
 	/* not enough room to store one object */
 	if (i == 0) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a..97a1529 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -263,6 +263,8 @@ struct rte_mempool {
 #define MEMPOOL_F_SC_GET         0x0008 /**< Default get is "single-consumer".*/
 #define MEMPOOL_F_POOL_CREATED   0x0010 /**< Internal: pool is created. */
 #define MEMPOOL_F_NO_IOVA_CONTIG 0x0020 /**< Don't need IOVA contiguous objs. */
+#define MEMPOOL_F_NO_PAGE_BOUND  0x0040
+/**< Don't create objs on page boundaries. */
 #define MEMPOOL_F_NO_PHYS_CONTIG MEMPOOL_F_NO_IOVA_CONTIG /* deprecated */
 
 /**
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc8..c029e9a 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -45,11 +45,29 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object falls on a page boundary, else returns 0 */
+static inline int
+mempool_check_obj_bounds(void *obj, uint64_t hugepage_sz, size_t elt_sz)
+{
+	uintptr_t page_end, elt_addr = (uintptr_t)obj;
+	uint32_t pg_shift = rte_bsf32(hugepage_sz);
+	uint64_t page_mask;
+
+	page_mask =  ~((1ull << pg_shift) - 1);
+	page_end = (elt_addr & page_mask) + hugepage_sz;
+
+	if (elt_addr + elt_sz > page_end)
+		return -1;
+
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
+	struct rte_memzone *mz;
 	size_t total_elt_sz;
 	size_t off;
 	unsigned int i;
@@ -58,6 +76,18 @@ rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+
+		if (mp->flags & MEMPOOL_F_NO_PAGE_BOUND) {
+			mz = (struct rte_memzone *)obj_cb_arg;
+			if (mempool_check_obj_bounds((char *)vaddr + off,
+						    mz->hugepage_sz,
+						    total_elt_sz) < 0) {
+				i--; /* Decrement count & skip this obj */
+				off += total_elt_sz;
+				continue;
+			}
+		}
+
 		off += mp->header_size;
 		obj = (char *)vaddr + off;
 		obj_cb(mp, obj_cb_arg, obj,
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 1/4] lib/mempool: skip populating mempool objs that falls on page boundaries vattunuru
@ 2019-06-25  3:56           ` vattunuru
  2019-06-25 17:41             ` Stephen Hemminger
  2019-07-11 16:22             ` [dpdk-dev] " Ferruh Yigit
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 3/4] example/kni: add IOVA support for kni application vattunuru
                             ` (3 subsequent siblings)
  5 siblings, 2 replies; 251+ messages in thread
From: vattunuru @ 2019-06-25  3:56 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, olivier.matz, arybchenko, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

PCI-related information is needed in the KNI kernel module,
since it requires the IOMMU domain info for address
translations (using the iommu_iova_to_phys() call) when
KNI runs in IOVA = VA mode.
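
As a sketch of how the kernel module can use these fields (the actual
lookup lives in the kni module patch of this series):

struct pci_dev *pdev = NULL;
struct iommu_domain *domain = NULL;

/* Walk devices matching the vendor/device IDs until the BDF passed in
 * dev_info is found, then fetch that device's IOMMU domain for later
 * IOVA translations.
 */
while ((pdev = pci_get_device(dev_info.vendor_id,
			      dev_info.device_id, pdev)) != NULL) {
	if (pdev->bus->number == dev_info.bus &&
	    PCI_SLOT(pdev->devfn) == dev_info.devid &&
	    PCI_FUNC(pdev->devfn) == dev_info.function) {
		domain = iommu_get_domain_for_dev(&pdev->dev);
		break;
	}
}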

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
 lib/librte_kni/rte_kni.c                          | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 91a1c14..5db5a13 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -111,6 +111,13 @@ struct rte_kni_device_info {
 	void * mbuf_va;
 	phys_addr_t mbuf_phys;
 
+	/* PCI info */
+	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+	uint8_t bus;                  /**< Device bus */
+	uint8_t devid;                /**< Device ID */
+	uint8_t function;             /**< Device function. */
+
 	uint16_t group_id;            /**< Group ID */
 	uint32_t core_id;             /**< core ID to bind for kernel thread */
 
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index e29d0cc..99c4bf5 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 		kni->ops.port_id = UINT16_MAX;
 
 	memset(&dev_info, 0, sizeof(dev_info));
+	dev_info.bus = conf->addr.bus;
+	dev_info.devid = conf->addr.devid;
+	dev_info.function = conf->addr.function;
+	dev_info.vendor_id = conf->id.vendor_id;
+	dev_info.device_id = conf->id.device_id;
 	dev_info.core_id = conf->core_id;
 	dev_info.force_bind = conf->force_bind;
 	dev_info.group_id = conf->group_id;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] example/kni: add IOVA support for kni application
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 1/4] lib/mempool: skip populating mempool objs that falls on page boundaries vattunuru
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information vattunuru
@ 2019-06-25  3:56           ` vattunuru
  2019-07-11 16:23             ` Ferruh Yigit
  2019-06-25  3:57           ` [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module vattunuru
                             ` (2 subsequent siblings)
  5 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-06-25  3:56 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, olivier.matz, arybchenko, Vamsi Attunuru, Kiran Kumar K

From: Vamsi Attunuru <vattunuru@marvell.com>

The current KNI implementation operates in IOVA = PA mode.
This patch adds support for IOVA = VA mode by addressing
the issues with page address translations (IOVA <==> KVA).

In this patch the KNI application creates its mempool with the
"MEMPOOL_F_NO_PAGE_BOUND" flag to ensure all mbuf memory
is within page boundaries, and subsequently the kernel KNI
module uses the iommu_iova_to_phys() and phys_to_virt() APIs
to get the kernel virtual addresses.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 examples/kni/main.c                               | 53 ++++++++++++++++++++++-
 lib/librte_eal/linux/eal/eal.c                    |  8 ----
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/rte_kni.c                          |  2 +
 4 files changed, 55 insertions(+), 9 deletions(-)

diff --git a/examples/kni/main.c b/examples/kni/main.c
index 4710d71..13083a7 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -37,6 +37,7 @@
 #include <rte_ethdev.h>
 #include <rte_mempool.h>
 #include <rte_mbuf.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_string_fns.h>
 #include <rte_cycles.h>
 #include <rte_malloc.h>
@@ -945,6 +946,56 @@ kni_free_kni(uint16_t port_id)
 	return 0;
 }
 
+static struct rte_mempool *
+kni_packet_pool_create(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
+	int socket_id)
+{
+	struct rte_pktmbuf_pool_private mbp_priv;
+	const char *mp_ops_name;
+	struct rte_mempool *mp;
+	unsigned int elt_size;
+	int ret;
+
+	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
+		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
+			priv_size);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size +
+		(unsigned int)data_room_size;
+	mbp_priv.mbuf_data_room_size = data_room_size;
+	mbp_priv.mbuf_priv_size = priv_size;
+
+	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
+		 sizeof(struct rte_pktmbuf_pool_private), socket_id,
+		 MEMPOOL_F_NO_PAGE_BOUND);
+	if (mp == NULL)
+		return NULL;
+
+	mp_ops_name = rte_mbuf_best_mempool_ops();
+	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+	rte_pktmbuf_pool_init(mp, &mbp_priv);
+
+	ret = rte_mempool_populate_default(mp);
+	if (ret < 0) {
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+
+	rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
+
+	return mp;
+}
+
 /* Initialise ports/queues etc. and start main loop on each core */
 int
 main(int argc, char** argv)
@@ -975,7 +1026,7 @@ main(int argc, char** argv)
 		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
 
 	/* Create the mbuf pool */
-	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
+	pktmbuf_pool = kni_packet_pool_create("mbuf_pool", NB_MBUF,
 		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
 	if (pktmbuf_pool == NULL) {
 		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 3e1d6eb..d143c49 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1041,14 +1041,6 @@ rte_eal_init(int argc, char **argv)
 		rte_eal_get_configuration()->iova_mode =
 			rte_bus_get_iommu_class();
 
-		/* Workaround for KNI which requires physical address to work */
-		if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
-			RTE_LOG(WARNING, EAL,
-				"Some devices want IOVA as VA but PA will be used because.. "
-				"KNI module inserted\n");
-		}
 	} else {
 		rte_eal_get_configuration()->iova_mode =
 			internal_config.iova_mode;
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 5db5a13..404c85d 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -128,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 99c4bf5..4263f21 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -300,6 +300,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
                             ` (2 preceding siblings ...)
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 3/4] example/kni: add IOVA support for kni application vattunuru
@ 2019-06-25  3:57           ` vattunuru
  2019-07-11 16:30             ` Ferruh Yigit
  2019-07-11 16:43             ` [dpdk-dev] " Stephen Hemminger
  2019-06-25 10:00           ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI Burakov, Anatoly
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
  5 siblings, 2 replies; 251+ messages in thread
From: vattunuru @ 2019-06-25  3:57 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, olivier.matz, arybchenko, Kiran Kumar K, Vamsi Attunuru

From: Kiran Kumar K <kirankumark@marvell.com>

This patch adds support for the kernel module to work in IOVA = VA
mode. The idea is to get the physical address from the IOVA using
the iommu_iova_to_phys API and then use the phys_to_virt API to
convert that physical address to a kernel virtual address.

When compared with IOVA = PA mode, there is no performance
drop with this approach.

This approach does not work with kernel versions older
than 4.4.0 because of API compatibility issues.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 kernel/linux/kni/kni_dev.h  |  3 ++
 kernel/linux/kni/kni_misc.c | 62 ++++++++++++++++++++++++++++++------
 kernel/linux/kni/kni_net.c  | 76 +++++++++++++++++++++++++++++++++++++--------
 3 files changed, 119 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index d57bce6..6ad53c7 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 
 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -39,6 +40,8 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+	struct iommu_domain *domain;
 	struct net_device_stats stats;
 	int status;
 	uint16_t group_id;           /* Group ID of a group of KNI devices */
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 1fc5eeb..b70c827 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -294,6 +294,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 
 	pr_info("Creating kni...\n");
 	/* Check the buffer size, to avoid warning */
@@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
+		(void)pci;
+		pr_err("Kernel version is not supported\n");
+		return -EINVAL;
+#else
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index ad83658..92d5991 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -35,6 +35,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt((iommu_iova_to_phys(kni->domain,
+					(uintptr_t)m->buf_physaddr) +
+			     m->data_off));
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			if (likely(kni->iova_mode == 1))
+				kva = iova2kva(kni, kni->pa[i]);
+			else
+				kva = pa2kva(kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 		}
 
@@ -263,8 +282,13 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		if (likely(kni->iova_mode == 1)) {
+			pkt_kva = iova2kva(kni, pkt_pa);
+			data_kva = iova2data_kva(kni, pkt_kva);
+		} else {
+			pkt_kva = pa2kva(pkt_pa);
+			data_kva = kva2data_kva(pkt_kva);
+		}
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -335,9 +359,14 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = dev_alloc_skb(len + 2);
@@ -434,13 +463,21 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+
+			if (likely(kni->iova_mode == 1)) {
+				kva = iova2kva(kni, kni->pa[i]);
+				data_kva = iova2data_kva(kni, kva);
+				alloc_kva = iova2kva(kni, kni->alloc_pa[i]);
+				alloc_data_kva = iova2data_kva(kni, alloc_kva);
+			} else {
+				kva = pa2kva(kni->pa[i]);
+				data_kva = kva2data_kva(kva);
+				alloc_kva = pa2kva(kni->alloc_pa[i]);
+				alloc_data_kva = kva2data_kva(alloc_kva);
+			}
 			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -507,9 +544,16 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+
+		if (likely(kni->iova_mode == 1)) {
+			kva = iova2kva(kni, kni->pa[i]);
+			data_kva = iova2data_kva(kni, kva);
+		} else {
+			kva = pa2kva(kni->pa[i]);
+			data_kva = kva2data_kva(kva);
+		}
+
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = dev_alloc_skb(len + 2);
@@ -545,8 +589,14 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 				if (!kva->next)
 					break;
 
-				kva = pa2kva(va2pa(kva->next, kva));
-				data_kva = kva2data_kva(kva);
+				if (likely(kni->iova_mode == 1)) {
+					kva = iova2kva(kni,
+						       va2pa(kva->next, kva));
+					data_kva = iova2data_kva(kni, kva);
+				} else {
+					kva = pa2kva(va2pa(kva->next, kva));
+					data_kva = kva2data_kva(kva);
+				}
 			}
 		}
 
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
                             ` (3 preceding siblings ...)
  2019-06-25  3:57           ` [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module vattunuru
@ 2019-06-25 10:00           ` Burakov, Anatoly
  2019-06-25 11:15             ` Jerin Jacob Kollanukkaran
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
  5 siblings, 1 reply; 251+ messages in thread
From: Burakov, Anatoly @ 2019-06-25 10:00 UTC (permalink / raw)
  To: vattunuru, dev; +Cc: ferruh.yigit, olivier.matz, arybchenko

On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> ----
> V6 Changes:
> * Added new mempool flag to ensure mbuf memory is not scattered
> across page boundaries.
> * Added KNI kernel module required PCI device information.
> * Modified KNI example application to create mempool with new
> mempool flag.
> 
Others can chime in, but my 2 cents: this reduces the usefulness of KNI 
because it limits the kinds of mempools one can use it with, and makes 
it so that code that works with every other PMD requires changes to 
work with KNI.

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-06-25 10:00           ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI Burakov, Anatoly
@ 2019-06-25 11:15             ` Jerin Jacob Kollanukkaran
  2019-06-25 11:30               ` Burakov, Anatoly
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-06-25 11:15 UTC (permalink / raw)
  To: Burakov, Anatoly, Vamsi Krishna Attunuru, dev
  Cc: ferruh.yigit, olivier.matz, arybchenko

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
> Sent: Tuesday, June 25, 2019 3:30 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> arybchenko@solarflare.com
> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> 
> On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > ----
> > V6 Changes:
> > * Added new mempool flag to ensure mbuf memory is not scattered across
> > page boundaries.
> > * Added KNI kernel module required PCI device information.
> > * Modified KNI example application to create mempool with new mempool
> > flag.
> >
> Others can chime in, but my 2 cents: this reduces the usefulness of KNI because
> it limits the kinds of mempools one can use them with, and makes it so that the
> code that works with every other PMD requires changes to work with KNI.

# One option is to make this flag the default for packet mempools only (i.e. never allocate an object across a page boundary).
In the real world the overhead will be very minimal, considering the huge page size is 1G or 512M.
# Enable this flag explicitly in the library only in IOVA = VA mode; no need to expose it to the application.
# I don't think there needs to be any PMD-specific change to make KNI work in IOVA = VA mode.
# No preference on whether the flag is passed by the application vs set in the library. But IMO this change would be
needed in mempool to support KNI in IOVA = VA mode.
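
A hypothetical sketch of the library-side option being discussed (not
part of this series): rte_pktmbuf_pool_create() could set the flag
itself when EAL runs in IOVA = VA mode, so applications would need no
change.

unsigned int flags = 0;

/* Hypothetical: let the mbuf pool helper opt in automatically. */
if (rte_eal_iova_mode() == RTE_IOVA_VA)
	flags |= MEMPOOL_F_NO_PAGE_BOUND;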



> 
> --
> Thanks,
> Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-06-25 11:15             ` Jerin Jacob Kollanukkaran
@ 2019-06-25 11:30               ` Burakov, Anatoly
  2019-06-25 13:38                 ` Burakov, Anatoly
  0 siblings, 1 reply; 251+ messages in thread
From: Burakov, Anatoly @ 2019-06-25 11:30 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Vamsi Krishna Attunuru, dev
  Cc: ferruh.yigit, olivier.matz, arybchenko

On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
>> Sent: Tuesday, June 25, 2019 3:30 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
>> arybchenko@solarflare.com
>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>>
>> On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> ----
>>> V6 Changes:
>>> * Added new mempool flag to ensure mbuf memory is not scattered across
>>> page boundaries.
>>> * Added KNI kernel module required PCI device information.
>>> * Modified KNI example application to create mempool with new mempool
>>> flag.
>>>
>> Others can chime in, but my 2 cents: this reduces the usefulness of KNI because
>> it limits the kinds of mempools one can use them with, and makes it so that the
>> code that works with every other PMD requires changes to work with KNI.
> 
> # One option to make this flag as default only for packet mempool(not allow allocate on page boundary).
> In real world the overhead will be very minimal considering Huge page size is 1G or 512M
> # Enable this flag explicitly only IOVA = VA mode in library. Not  need to expose to application
> # I don’t think, there needs to be any PMD specific change to make KNI with IOVA = VA mode
> # No preference on flags to be passed by application vs in library. But IMO this change would be
> needed in mempool support KNI in IOVA = VA mode.
> 

I would be OK with just making it the default behavior to not cross page 
boundaries when allocating buffers. This would solve the problem for KNI 
and for any other use case that would rely on PA-contiguous buffers in 
the face of IOVA-as-VA mode.

We could also add a flag to explicitly allow page crossing without also 
making mbufs IOVA-non-contiguous, but I'm not sure if there are use 
cases that would benefit from this.

> 
> 
>>
>> --
>> Thanks,
>> Anatoly


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-06-25 11:30               ` Burakov, Anatoly
@ 2019-06-25 13:38                 ` Burakov, Anatoly
  2019-06-27  9:34                   ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 251+ messages in thread
From: Burakov, Anatoly @ 2019-06-25 13:38 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Vamsi Krishna Attunuru, dev
  Cc: ferruh.yigit, olivier.matz, arybchenko

On 25-Jun-19 12:30 PM, Burakov, Anatoly wrote:
> On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
>>> -----Original Message-----
>>> From: dev <dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
>>> Sent: Tuesday, June 25, 2019 3:30 PM
>>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>>> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
>>> arybchenko@solarflare.com
>>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>>>
>>> On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
>>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>>
>>>> ----
>>>> V6 Changes:
>>>> * Added new mempool flag to ensure mbuf memory is not scattered across
>>>> page boundaries.
>>>> * Added KNI kernel module required PCI device information.
>>>> * Modified KNI example application to create mempool with new mempool
>>>> flag.
>>>>
>>> Others can chime in, but my 2 cents: this reduces the usefulness of 
>>> KNI because
>>> it limits the kinds of mempools one can use them with, and makes it 
>>> so that the
>>> code that works with every other PMD requires changes to work with KNI.
>>
>> # One option to make this flag as default only for packet mempool(not 
>> allow allocate on page boundary).
>> In real world the overhead will be very minimal considering Huge page 
>> size is 1G or 512M
>> # Enable this flag explicitly only IOVA = VA mode in library. Not  
>> need to expose to application
>> # I don’t think, there needs to be any PMD specific change to make KNI 
>> with IOVA = VA mode
>> # No preference on flags to be passed by application vs in library. 
>> But IMO this change would be
>> needed in mempool support KNI in IOVA = VA mode.
>>
> 
> I would be OK to just make it default behavior to not cross page 
> boundaries when allocating buffers. This would solve the problem for KNI 
> and for any other use case that would rely on PA-contiguous buffers in 
> face of IOVA as VA mode.
> 
> We could also add a flag to explicitly allow page crossing without also 
> making mbufs IOVA-non-contiguous, but i'm not sure if there are use 
> cases that would benefit from this.

On second thought, such a default would break 4K pages in the case of 
packets bigger than the page size (i.e. jumbo frames). Should we care?

> 
>>
>>
>>>
>>> -- 
>>> Thanks,
>>> Anatoly
> 
> 


-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information vattunuru
@ 2019-06-25 17:41             ` Stephen Hemminger
  2019-06-26  3:48               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-07-11 16:22             ` [dpdk-dev] " Ferruh Yigit
  1 sibling, 1 reply; 251+ messages in thread
From: Stephen Hemminger @ 2019-06-25 17:41 UTC (permalink / raw)
  To: vattunuru; +Cc: dev, ferruh.yigit, olivier.matz, arybchenko

On Tue, 25 Jun 2019 09:26:58 +0530
<vattunuru@marvell.com> wrote:

> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> PCI related information is needed in KNI kernel module,
> since it requires iommu domain info for address
> translations(using iommu_iova_to_phys() call) when
> KNI runs in IOVA = VA mode.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> ---
>  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
>  lib/librte_kni/rte_kni.c                          | 5 +++++
>  2 files changed, 12 insertions(+)
> 
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 91a1c14..5db5a13 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>  	void * mbuf_va;
>  	phys_addr_t mbuf_phys;
>  
> +	/* PCI info */
> +	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> +	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> +	uint8_t bus;                  /**< Device bus */
> +	uint8_t devid;                /**< Device ID */
> +	uint8_t function;             /**< Device function. */
> +
>  	uint16_t group_id;            /**< Group ID */
>  	uint32_t core_id;             /**< core ID to bind for kernel thread */
>  
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index e29d0cc..99c4bf5 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  		kni->ops.port_id = UINT16_MAX;
>  
>  	memset(&dev_info, 0, sizeof(dev_info));
> +	dev_info.bus = conf->addr.bus;
> +	dev_info.devid = conf->addr.devid;
> +	dev_info.function = conf->addr.function;
> +	dev_info.vendor_id = conf->id.vendor_id;
> +	dev_info.device_id = conf->id.device_id;
>  	dev_info.core_id = conf->core_id;
>  	dev_info.force_bind = conf->force_bind;
>  	dev_info.group_id = conf->group_id;


NAK

Why is PCI info part of KNI. KNI can be used with non-PCI devices now

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 2/4] lib/kni: add PCI related information
  2019-06-25 17:41             ` Stephen Hemminger
@ 2019-06-26  3:48               ` Vamsi Krishna Attunuru
  2019-06-26 14:58                 ` Stephen Hemminger
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-06-26  3:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, ferruh.yigit, olivier.matz, arybchenko




________________________________
From: Stephen Hemminger <stephen@networkplumber.org>
Sent: Tuesday, June 25, 2019 11:11 PM
To: Vamsi Krishna Attunuru
Cc: dev@dpdk.org; ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
Subject: [EXT] Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information

On Tue, 25 Jun 2019 09:26:58 +0530
<vattunuru@marvell.com> wrote:

> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> PCI related information is needed in KNI kernel module,
> since it requires iommu domain info for address
> translations(using iommu_iova_to_phys() call) when
> KNI runs in IOVA = VA mode.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> ---
>  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
>  lib/librte_kni/rte_kni.c                          | 5 +++++
>  2 files changed, 12 insertions(+)
>
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 91a1c14..5db5a13 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>        void * mbuf_va;
>        phys_addr_t mbuf_phys;
>
> +     /* PCI info */
> +     uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> +     uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> +     uint8_t bus;                  /**< Device bus */
> +     uint8_t devid;                /**< Device ID */
> +     uint8_t function;             /**< Device function. */
> +
>        uint16_t group_id;            /**< Group ID */
>        uint32_t core_id;             /**< core ID to bind for kernel thread */
>
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index e29d0cc..99c4bf5 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>                kni->ops.port_id = UINT16_MAX;
>
>        memset(&dev_info, 0, sizeof(dev_info));
> +     dev_info.bus = conf->addr.bus;
> +     dev_info.devid = conf->addr.devid;
> +     dev_info.function = conf->addr.function;
> +     dev_info.vendor_id = conf->id.vendor_id;
> +     dev_info.device_id = conf->id.device_id;
>        dev_info.core_id = conf->core_id;
>        dev_info.force_bind = conf->force_bind;
>        dev_info.group_id = conf->group_id;


NAK

Why is PCI info part of KNI. KNI can be used with non-PCI devices now

The kernel KNI module needs device info (PCI in this case) to figure out the iommu domain details when working in IOVA=VA mode, and struct rte_kni_device_info is the only structure that carries KNI device info to the kernel module. If not here, where do you suggest I put this info?
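
For context, here is a condensed sketch of what the kernel module does with these fields in the v6 series (taken from patch 4/4 later in this thread, trimmed for illustration only):

	struct pci_dev *pci;
	struct iommu_domain *domain = NULL;

	/* Walk PCI devices matching vendor/device id until the BDF matches */
	pci = pci_get_device(dev_info.vendor_id, dev_info.device_id, NULL);
	while (pci) {
		if (pci->bus->number == dev_info.bus &&
		    PCI_SLOT(pci->devfn) == dev_info.devid &&
		    PCI_FUNC(pci->devfn) == dev_info.function) {
			domain = iommu_get_domain_for_dev(&pci->dev);
			break;
		}
		pci = pci_get_device(dev_info.vendor_id, dev_info.device_id, pci);
	}
	/* The fast path later translates IOVA -> KVA through the stored domain:
	 * kva = phys_to_virt(iommu_iova_to_phys(domain, iova));
	 */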

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 2/4] lib/kni: add PCI related information
  2019-06-26  3:48               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-06-26 14:58                 ` Stephen Hemminger
  2019-06-27  9:43                   ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Stephen Hemminger @ 2019-06-26 14:58 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru; +Cc: dev, ferruh.yigit, olivier.matz, arybchenko

On Wed, 26 Jun 2019 03:48:12 +0000
Vamsi Krishna Attunuru <vattunuru@marvell.com> wrote:

> ________________________________
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Tuesday, June 25, 2019 11:11 PM
> To: Vamsi Krishna Attunuru
> Cc: dev@dpdk.org; ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
> Subject: [EXT] Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information
> 
> External Email
> 
> ----------------------------------------------------------------------
> On Tue, 25 Jun 2019 09:26:58 +0530
> <vattunuru@marvell.com> wrote:
> 
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > PCI related information is needed in KNI kernel module,
> > since it requires iommu domain info for address
> > translations(using iommu_iova_to_phys() call) when
> > KNI runs in IOVA = VA mode.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > ---
> >  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
> >  lib/librte_kni/rte_kni.c                          | 5 +++++
> >  2 files changed, 12 insertions(+)
> >
> > diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > index 91a1c14..5db5a13 100644
> > --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > @@ -111,6 +111,13 @@ struct rte_kni_device_info {
> >        void * mbuf_va;
> >        phys_addr_t mbuf_phys;
> >
> > +     /* PCI info */
> > +     uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> > +     uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> > +     uint8_t bus;                  /**< Device bus */
> > +     uint8_t devid;                /**< Device ID */
> > +     uint8_t function;             /**< Device function. */
> > +
> >        uint16_t group_id;            /**< Group ID */
> >        uint32_t core_id;             /**< core ID to bind for kernel thread */
> >
> > diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> > index e29d0cc..99c4bf5 100644
> > --- a/lib/librte_kni/rte_kni.c
> > +++ b/lib/librte_kni/rte_kni.c
> > @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
> >                kni->ops.port_id = UINT16_MAX;
> >
> >        memset(&dev_info, 0, sizeof(dev_info));
> > +     dev_info.bus = conf->addr.bus;
> > +     dev_info.devid = conf->addr.devid;
> > +     dev_info.function = conf->addr.function;
> > +     dev_info.vendor_id = conf->id.vendor_id;
> > +     dev_info.device_id = conf->id.device_id;
> >        dev_info.core_id = conf->core_id;
> >        dev_info.force_bind = conf->force_bind;
> >        dev_info.group_id = conf->group_id;  
> 
> 
> NAK
> 
> Why is PCI info part of KNI. KNI can be used with non-PCI devices now
> 
> Kernel KNI module needs device info(PCI in this case) to figure out iommu domain details to work with IOVA=VA mode and the struct rte_kni_device_info only carries KNI device info to the kernel module, if not here, where do you suggest me to put this info.

The kernel KNI driver works as is with netvsc PMD (which is vmbus).
Your code may break that. It needs to work for both.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-06-25 13:38                 ` Burakov, Anatoly
@ 2019-06-27  9:34                   ` Jerin Jacob Kollanukkaran
  2019-07-01 13:51                     ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-06-27  9:34 UTC (permalink / raw)
  To: Burakov, Anatoly, Vamsi Krishna Attunuru, dev
  Cc: ferruh.yigit, olivier.matz, arybchenko

> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Tuesday, June 25, 2019 7:09 PM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; dev@dpdk.org
> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> arybchenko@solarflare.com
> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> 
> On 25-Jun-19 12:30 PM, Burakov, Anatoly wrote:
> > On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
> >>> -----Original Message-----
> >>> From: dev <dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
> >>> Sent: Tuesday, June 25, 2019 3:30 PM
> >>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >>> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> >>> arybchenko@solarflare.com
> >>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> >>>
> >>> On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
> >>>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>>
> >>>> ----
> >>>> V6 Changes:
> >>>> * Added new mempool flag to ensure mbuf memory is not scattered
> >>>> across page boundaries.
> >>>> * Added KNI kernel module required PCI device information.
> >>>> * Modified KNI example application to create mempool with new
> >>>> mempool flag.
> >>>>
> >>> Others can chime in, but my 2 cents: this reduces the usefulness of
> >>> KNI because it limits the kinds of mempools one can use them with,
> >>> and makes it so that the code that works with every other PMD
> >>> requires changes to work with KNI.
> >>
> >> # One option to make this flag as default only for packet mempool(not
> >> allow allocate on page boundary).
> >> In real world the overhead will be very minimal considering Huge page
> >> size is 1G or 512M # Enable this flag explicitly only IOVA = VA mode
> >> in library. Not need to expose to application # I don’t think, there
> >> needs to be any PMD specific change to make KNI with IOVA = VA mode #
> >> No preference on flags to be passed by application vs in library.
> >> But IMO this change would be
> >> needed in mempool support KNI in IOVA = VA mode.
> >>
> >
> > I would be OK to just make it default behavior to not cross page
> > boundaries when allocating buffers. This would solve the problem for
> > KNI and for any other use case that would rely on PA-contiguous
> > buffers in face of IOVA as VA mode.
> >
> > We could also add a flag to explicitly allow page crossing without
> > also making mbufs IOVA-non-contiguous, but i'm not sure if there are
> > use cases that would benefit from this.
> 
> On another thought, such a default would break 4K pages in case for packets
> bigger than page size (i.e. jumbo frames). Should we care?

The hugepage size will not be 4K. Right?

Olivier,

As a maintainer, any thoughts on exposing/not exposing the new mempool flag to
skip the page boundaries?

All,
Either option is fine with me; asking for feedback so we can proceed further.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 2/4] lib/kni: add PCI related information
  2019-06-26 14:58                 ` Stephen Hemminger
@ 2019-06-27  9:43                   ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-06-27  9:43 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: dev, ferruh.yigit, olivier.matz, arybchenko,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda




________________________________
From: Stephen Hemminger <stephen@networkplumber.org>
Sent: Wednesday, June 26, 2019 8:28 PM
To: Vamsi Krishna Attunuru
Cc: dev@dpdk.org; ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information

On Wed, 26 Jun 2019 03:48:12 +0000
Vamsi Krishna Attunuru <vattunuru@marvell.com> wrote:

> ________________________________
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Tuesday, June 25, 2019 11:11 PM
> To: Vamsi Krishna Attunuru
> Cc: dev@dpdk.org; ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
> Subject: [EXT] Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information
>
> External Email
>
> ----------------------------------------------------------------------
> On Tue, 25 Jun 2019 09:26:58 +0530
> <vattunuru@marvell.com> wrote:
>
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > PCI related information is needed in KNI kernel module,
> > since it requires iommu domain info for address
> > translations(using iommu_iova_to_phys() call) when
> > KNI runs in IOVA = VA mode.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > ---
> >  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
> >  lib/librte_kni/rte_kni.c                          | 5 +++++
> >  2 files changed, 12 insertions(+)
> >
> > diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > index 91a1c14..5db5a13 100644
> > --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > @@ -111,6 +111,13 @@ struct rte_kni_device_info {
> >        void * mbuf_va;
> >        phys_addr_t mbuf_phys;
> >
> > +     /* PCI info */
> > +     uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> > +     uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> > +     uint8_t bus;                  /**< Device bus */
> > +     uint8_t devid;                /**< Device ID */
> > +     uint8_t function;             /**< Device function. */
> > +
> >        uint16_t group_id;            /**< Group ID */
> >        uint32_t core_id;             /**< core ID to bind for kernel thread */
> >
> > diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> > index e29d0cc..99c4bf5 100644
> > --- a/lib/librte_kni/rte_kni.c
> > +++ b/lib/librte_kni/rte_kni.c
> > @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
> >                kni->ops.port_id = UINT16_MAX;
> >
> >        memset(&dev_info, 0, sizeof(dev_info));
> > +     dev_info.bus = conf->addr.bus;
> > +     dev_info.devid = conf->addr.devid;
> > +     dev_info.function = conf->addr.function;
> > +     dev_info.vendor_id = conf->id.vendor_id;
> > +     dev_info.device_id = conf->id.device_id;
> >        dev_info.core_id = conf->core_id;
> >        dev_info.force_bind = conf->force_bind;
> >        dev_info.group_id = conf->group_id;
>
>
> NAK
>
> Why is PCI info part of KNI. KNI can be used with non-PCI devices now
>
> Kernel KNI module needs device info(PCI in this case) to figure out iommu domain details to work with IOVA=VA mode and the struct rte_kni_device_info only carries KNI device info to the kernel module, if not here, where do you suggest me to put this info.

The kernel KNI driver works as is with netvsc PMD (which is vmbus).
Your code may break that. If needs to work for both.

Vamsi > I do not think these changes would break any existing functionality, as KNI can still operate in PA mode if any PMD or application requires it; the same holds for netvsc as well, right? Please correct me if I am missing any assumptions here.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-06-27  9:34                   ` Jerin Jacob Kollanukkaran
@ 2019-07-01 13:51                     ` Vamsi Krishna Attunuru
  2019-07-04  6:42                       ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-01 13:51 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Burakov, Anatoly, dev
  Cc: ferruh.yigit, olivier.matz, arybchenko

ping..

________________________________
From: Jerin Jacob Kollanukkaran
Sent: Thursday, June 27, 2019 3:04:58 PM
To: Burakov, Anatoly; Vamsi Krishna Attunuru; dev@dpdk.org
Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
Subject: RE: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI

> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Tuesday, June 25, 2019 7:09 PM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; dev@dpdk.org
> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> arybchenko@solarflare.com
> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>
> On 25-Jun-19 12:30 PM, Burakov, Anatoly wrote:
> > On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
> >>> -----Original Message-----
> >>> From: dev <dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
> >>> Sent: Tuesday, June 25, 2019 3:30 PM
> >>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >>> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> >>> arybchenko@solarflare.com
> >>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> >>>
> >>> On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
> >>>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>>
> >>>> ----
> >>>> V6 Changes:
> >>>> * Added new mempool flag to ensure mbuf memory is not scattered
> >>>> across page boundaries.
> >>>> * Added KNI kernel module required PCI device information.
> >>>> * Modified KNI example application to create mempool with new
> >>>> mempool flag.
> >>>>
> >>> Others can chime in, but my 2 cents: this reduces the usefulness of
> >>> KNI because it limits the kinds of mempools one can use them with,
> >>> and makes it so that the code that works with every other PMD
> >>> requires changes to work with KNI.
> >>
> >> # One option to make this flag as default only for packet mempool(not
> >> allow allocate on page boundary).
> >> In real world the overhead will be very minimal considering Huge page
> >> size is 1G or 512M # Enable this flag explicitly only IOVA = VA mode
> >> in library. Not need to expose to application # I don’t think, there
> >> needs to be any PMD specific change to make KNI with IOVA = VA mode #
> >> No preference on flags to be passed by application vs in library.
> >> But IMO this change would be
> >> needed in mempool support KNI in IOVA = VA mode.
> >>
> >
> > I would be OK to just make it default behavior to not cross page
> > boundaries when allocating buffers. This would solve the problem for
> > KNI and for any other use case that would rely on PA-contiguous
> > buffers in face of IOVA as VA mode.
> >
> > We could also add a flag to explicitly allow page crossing without
> > also making mbufs IOVA-non-contiguous, but i'm not sure if there are
> > use cases that would benefit from this.
>
> On another thought, such a default would break 4K pages in case for packets
> bigger than page size (i.e. jumbo frames). Should we care?

The hugepage size will not be 4K. Right?

Olivier,

As a maintainer any thoughts of exposing/not exposing the new mepool flag to
Skip the page boundaries?

All,
Either option is fine, Asking for feedback to processed further?


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-07-01 13:51                     ` Vamsi Krishna Attunuru
@ 2019-07-04  6:42                       ` Vamsi Krishna Attunuru
  2019-07-04  9:48                         ` Jerin Jacob Kollanukkaran
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-04  6:42 UTC (permalink / raw)
  To: dev
  Cc: ferruh.yigit, olivier.matz, arybchenko,
	Jerin Jacob Kollanukkaran, Burakov, Anatoly

Hi All,


Just to summarize, the below items have arisen from the initial review.

1) Can the new mempool flag be made the default for all pools, and would there be a case where the new flag's functionality fails for some page sizes?

2) Does adding HW device info (PCI dev info) to the KNI device structure break KNI on virtual devices in VA or PA mode?


Can someone suggest whether any changes are required to address the above issues?

________________________________
From: dev <dev-bounces@dpdk.org> on behalf of Vamsi Krishna Attunuru <vattunuru@marvell.com>
Sent: Monday, July 1, 2019 7:21:22 PM
To: Jerin Jacob Kollanukkaran; Burakov, Anatoly; dev@dpdk.org
Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
Subject: [EXT] Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI

External Email

----------------------------------------------------------------------
ping..

________________________________
From: Jerin Jacob Kollanukkaran
Sent: Thursday, June 27, 2019 3:04:58 PM
To: Burakov, Anatoly; Vamsi Krishna Attunuru; dev@dpdk.org
Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com
Subject: RE: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI

> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Tuesday, June 25, 2019 7:09 PM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>; dev@dpdk.org
> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> arybchenko@solarflare.com
> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>
> On 25-Jun-19 12:30 PM, Burakov, Anatoly wrote:
> > On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
> >>> -----Original Message-----
> >>> From: dev <dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
> >>> Sent: Tuesday, June 25, 2019 3:30 PM
> >>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >>> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com;
> >>> arybchenko@solarflare.com
> >>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> >>>
> >>> On 25-Jun-19 4:56 AM, vattunuru@marvell.com wrote:
> >>>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>>
> >>>> ----
> >>>> V6 Changes:
> >>>> * Added new mempool flag to ensure mbuf memory is not scattered
> >>>> across page boundaries.
> >>>> * Added KNI kernel module required PCI device information.
> >>>> * Modified KNI example application to create mempool with new
> >>>> mempool flag.
> >>>>
> >>> Others can chime in, but my 2 cents: this reduces the usefulness of
> >>> KNI because it limits the kinds of mempools one can use them with,
> >>> and makes it so that the code that works with every other PMD
> >>> requires changes to work with KNI.
> >>
> >> # One option to make this flag as default only for packet mempool(not
> >> allow allocate on page boundary).
> >> In real world the overhead will be very minimal considering Huge page
> >> size is 1G or 512M # Enable this flag explicitly only IOVA = VA mode
> >> in library. Not need to expose to application # I don’t think, there
> >> needs to be any PMD specific change to make KNI with IOVA = VA mode #
> >> No preference on flags to be passed by application vs in library.
> >> But IMO this change would be
> >> needed in mempool support KNI in IOVA = VA mode.
> >>
> >
> > I would be OK to just make it default behavior to not cross page
> > boundaries when allocating buffers. This would solve the problem for
> > KNI and for any other use case that would rely on PA-contiguous
> > buffers in face of IOVA as VA mode.
> >
> > We could also add a flag to explicitly allow page crossing without
> > also making mbufs IOVA-non-contiguous, but i'm not sure if there are
> > use cases that would benefit from this.
>
> On another thought, such a default would break 4K pages in case for packets
> bigger than page size (i.e. jumbo frames). Should we care?

The hugepage size will not be 4K. Right?

Olivier,

As a maintainer any thoughts of exposing/not exposing the new mepool flag to
Skip the page boundaries?

All,
Either option is fine, Asking for feedback to processed further?


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-07-04  6:42                       ` Vamsi Krishna Attunuru
@ 2019-07-04  9:48                         ` Jerin Jacob Kollanukkaran
  2019-07-11 16:21                           ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2019-07-04  9:48 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: ferruh.yigit, olivier.matz, arybchenko, Burakov, Anatoly

>From: Vamsi Krishna Attunuru 
>Sent: Thursday, July 4, 2019 12:13 PM
>To: dev@dpdk.org
>Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Burakov, Anatoly <anatoly.burakov@intel.com>
>Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>
>Hi All,
>
>Just to summarize, below items have arisen from the initial review.
>1) Can the new mempool flag be made default to all the pools and will there be case that new flag functionality would fail  for some page sizes.?

The minimum huge page size is 2MB and the normal huge page size is 512MB or 1G, so I think the new flag can be the default, as skipping page boundaries for
mempool objects has nearly zero overhead. But I leave the decision to the maintainers.
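
To make the constraint concrete, a minimal sketch (illustration only, not part of the patch) of the per-object check such a flag effectively enforces:

#include <stdint.h>
#include <stddef.h>

/* Return 1 if an object [addr, addr + len) straddles a page of size pg_sz. */
static int
obj_crosses_page(uintptr_t addr, size_t len, size_t pg_sz)
{
	return (addr / pg_sz) != ((addr + len - 1) / pg_sz);
}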

>2) Adding HW device info(pci dev info) to KNI device structure, will it break KNI on virtual devices in VA or PA mode.?

The iommu_domain will be created only when the device is a PCI device and the system runs in IOVA_VA mode. For virtual devices (IOVA_DC (don't care) or
IOVA_PA devices) it still works without the PCI device structure.

It is a useful feature that lets KNI run without root privilege, and it has been pending for a long time. Request to review and close this.

>
>Can someone suggest if any changes required to address above issues. 
________________________________________
From: dev <mailto:dev-bounces@dpdk.org> on behalf of Vamsi Krishna Attunuru <mailto:vattunuru@marvell.com>
Sent: Monday, July 1, 2019 7:21:22 PM
To: Jerin Jacob Kollanukkaran; Burakov, Anatoly; mailto:dev@dpdk.org
Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com; mailto:arybchenko@solarflare.com
Subject: [EXT] Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI 
 
External Email

----------------------------------------------------------------------
ping..

________________________________
From: Jerin Jacob Kollanukkaran
Sent: Thursday, June 27, 2019 3:04:58 PM
To: Burakov, Anatoly; Vamsi Krishna Attunuru; mailto:dev@dpdk.org
Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com; mailto:arybchenko@solarflare.com
Subject: RE: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI

> -----Original Message-----
> From: Burakov, Anatoly <mailto:anatoly.burakov@intel.com>
> Sent: Tuesday, June 25, 2019 7:09 PM
> To: Jerin Jacob Kollanukkaran <mailto:jerinj@marvell.com>; Vamsi Krishna Attunuru
> <mailto:vattunuru@marvell.com>; mailto:dev@dpdk.org
> Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com;
> mailto:arybchenko@solarflare.com
> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>
> On 25-Jun-19 12:30 PM, Burakov, Anatoly wrote:
> > On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
> >>> -----Original Message-----
> >>> From: dev <mailto:dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
> >>> Sent: Tuesday, June 25, 2019 3:30 PM
> >>> To: Vamsi Krishna Attunuru <mailto:vattunuru@marvell.com>; mailto:dev@dpdk.org
> >>> Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com;
> >>> mailto:arybchenko@solarflare.com
> >>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> >>>
> >>> On 25-Jun-19 4:56 AM, mailto:vattunuru@marvell.com wrote:
> >>>> From: Vamsi Attunuru <mailto:vattunuru@marvell.com>
> >>>>
> >>>> ----
> >>>> V6 Changes:
> >>>> * Added new mempool flag to ensure mbuf memory is not scattered
> >>>> across page boundaries.
> >>>> * Added KNI kernel module required PCI device information.
> >>>> * Modified KNI example application to create mempool with new
> >>>> mempool flag.
> >>>>
> >>> Others can chime in, but my 2 cents: this reduces the usefulness of
> >>> KNI because it limits the kinds of mempools one can use them with,
> >>> and makes it so that the code that works with every other PMD
> >>> requires changes to work with KNI.
> >>
> >> # One option to make this flag as default only for packet mempool(not
> >> allow allocate on page boundary).
> >> In real world the overhead will be very minimal considering Huge page
> >> size is 1G or 512M # Enable this flag explicitly only IOVA = VA mode
> >> in library. Not need to expose to application # I don't think, there
> >> needs to be any PMD specific change to make KNI with IOVA = VA mode #
> >> No preference on flags to be passed by application vs in library.
> >> But IMO this change would be
> >> needed in mempool support KNI in IOVA = VA mode.
> >>
> >
> > I would be OK to just make it default behavior to not cross page
> > boundaries when allocating buffers. This would solve the problem for
> > KNI and for any other use case that would rely on PA-contiguous
> > buffers in face of IOVA as VA mode.
> >
> > We could also add a flag to explicitly allow page crossing without
> > also making mbufs IOVA-non-contiguous, but i'm not sure if there are
> > use cases that would benefit from this.
>
> On another thought, such a default would break 4K pages in case for packets
> bigger than page size (i.e. jumbo frames). Should we care?

The hugepage size will not be 4K. Right?

Olivier,

As a maintainer any thoughts of exposing/not exposing the new mepool flag to
Skip the page boundaries?

All,
Either option is fine, Asking for feedback to processed further?

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
  2019-07-04  9:48                         ` Jerin Jacob Kollanukkaran
@ 2019-07-11 16:21                           ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-11 16:21 UTC (permalink / raw)
  To: Jerin Jacob Kollanukkaran, Vamsi Krishna Attunuru, dev
  Cc: olivier.matz, arybchenko, Burakov, Anatoly

On 7/4/2019 10:48 AM, Jerin Jacob Kollanukkaran wrote:
>> From: Vamsi Krishna Attunuru 
>> Sent: Thursday, July 4, 2019 12:13 PM
>> To: dev@dpdk.org
>> Cc: ferruh.yigit@intel.com; olivier.matz@6wind.com; arybchenko@solarflare.com; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Burakov, Anatoly <anatoly.burakov@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>>
>> Hi All,
>>
>> Just to summarize, below items have arisen from the initial review.
>> 1) Can the new mempool flag be made default to all the pools and will there be case that new flag functionality would fail  for some page sizes.?
> 
> If the minimum huge page size is 2MB and normal huge page size is 512MB or 1G. So I think, new flags can be default as skipping the page boundaries for 
> Mempool objects has nearly zero overhead. But I leave decision to maintainers.
> 
>> 2) Adding HW device info(pci dev info) to KNI device structure, will it break KNI on virtual devices in VA or PA mode.?
> 
> Iommu_domain will be created only for PCI devices and the system runs in IOVA_VA mode. Virtual devices(IOVA_DC(don't care) or
> IOVA_PA devices still it works without PCI device structure)
> 
> It is  a useful feature where KNI can run without root privilege and it is pending for long time. Request to review and close this

I support the idea of removing the 'kni' check that forces IOVA=PA mode, but I am also not
sure about forcing all KNI users to update their code to allocate the mempool in a
very specific way.

What about giving more control to the user on this?

Any user who wants to use IOVA=VA and KNI together can update the application to adjust
the memory allocation for KNI and give an explicit "kni iova_mode=1" config.
Whoever wants to use the existing KNI implementation can continue to use it with IOVA=PA
mode, which is the current case; for that case the user may need to force the DPDK
application into IOVA=PA mode, but at least there is a workaround.
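
For example, the application side could look roughly like the sketch below (the 'iova_mode' field of struct rte_kni_conf is hypothetical, it is only being proposed in this thread; RTE_KNI_NAMESIZE and MAX_PACKET_SZ are the usual example-app names):

	struct rte_kni_conf conf;

	memset(&conf, 0, sizeof(conf));
	snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%u", port_id);
	conf.mbuf_size = MAX_PACKET_SZ;
	/* pktmbuf_pool must have been created so mbufs do not cross page boundaries */
	conf.iova_mode = 1;	/* hypothetical field: explicit opt-in to IOVA=VA handling */

	kni = rte_kni_alloc(pktmbuf_pool, &conf, &ops);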

And the kni sample application should have a sample for both cases; although this
increases the testing and maintenance cost, I hope we can get support from you
on the iova_mode=1 use case.

What do you think?



> 
>>
>> Can someone suggest if any changes required to address above issues. 
> ________________________________________
> From: dev <mailto:dev-bounces@dpdk.org> on behalf of Vamsi Krishna Attunuru <mailto:vattunuru@marvell.com>
> Sent: Monday, July 1, 2019 7:21:22 PM
> To: Jerin Jacob Kollanukkaran; Burakov, Anatoly; mailto:dev@dpdk.org
> Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com; mailto:arybchenko@solarflare.com
> Subject: [EXT] Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI 
>  
> External Email
> 
> ----------------------------------------------------------------------
> ping..
> 
> ________________________________
> From: Jerin Jacob Kollanukkaran
> Sent: Thursday, June 27, 2019 3:04:58 PM
> To: Burakov, Anatoly; Vamsi Krishna Attunuru; mailto:dev@dpdk.org
> Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com; mailto:arybchenko@solarflare.com
> Subject: RE: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
> 
>> -----Original Message-----
>> From: Burakov, Anatoly <mailto:anatoly.burakov@intel.com>
>> Sent: Tuesday, June 25, 2019 7:09 PM
>> To: Jerin Jacob Kollanukkaran <mailto:jerinj@marvell.com>; Vamsi Krishna Attunuru
>> <mailto:vattunuru@marvell.com>; mailto:dev@dpdk.org
>> Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com;
>> mailto:arybchenko@solarflare.com
>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>>
>> On 25-Jun-19 12:30 PM, Burakov, Anatoly wrote:
>>> On 25-Jun-19 12:15 PM, Jerin Jacob Kollanukkaran wrote:
>>>>> -----Original Message-----
>>>>> From: dev <mailto:dev-bounces@dpdk.org> On Behalf Of Burakov, Anatoly
>>>>> Sent: Tuesday, June 25, 2019 3:30 PM
>>>>> To: Vamsi Krishna Attunuru <mailto:vattunuru@marvell.com>; mailto:dev@dpdk.org
>>>>> Cc: mailto:ferruh.yigit@intel.com; mailto:olivier.matz@6wind.com;
>>>>> mailto:arybchenko@solarflare.com
>>>>> Subject: Re: [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI
>>>>>
>>>>> On 25-Jun-19 4:56 AM, mailto:vattunuru@marvell.com wrote:
>>>>>> From: Vamsi Attunuru <mailto:vattunuru@marvell.com>
>>>>>>
>>>>>> ----
>>>>>> V6 Changes:
>>>>>> * Added new mempool flag to ensure mbuf memory is not scattered
>>>>>> across page boundaries.
>>>>>> * Added KNI kernel module required PCI device information.
>>>>>> * Modified KNI example application to create mempool with new
>>>>>> mempool flag.
>>>>>>
>>>>> Others can chime in, but my 2 cents: this reduces the usefulness of
>>>>> KNI because it limits the kinds of mempools one can use them with,
>>>>> and makes it so that the code that works with every other PMD
>>>>> requires changes to work with KNI.
>>>>
>>>> # One option to make this flag as default only for packet mempool(not
>>>> allow allocate on page boundary).
>>>> In real world the overhead will be very minimal considering Huge page
>>>> size is 1G or 512M # Enable this flag explicitly only IOVA = VA mode
>>>> in library. Not need to expose to application # I don't think, there
>>>> needs to be any PMD specific change to make KNI with IOVA = VA mode #
>>>> No preference on flags to be passed by application vs in library.
>>>> But IMO this change would be
>>>> needed in mempool support KNI in IOVA = VA mode.
>>>>
>>>
>>> I would be OK to just make it default behavior to not cross page
>>> boundaries when allocating buffers. This would solve the problem for
>>> KNI and for any other use case that would rely on PA-contiguous
>>> buffers in face of IOVA as VA mode.
>>>
>>> We could also add a flag to explicitly allow page crossing without
>>> also making mbufs IOVA-non-contiguous, but i'm not sure if there are
>>> use cases that would benefit from this.
>>
>> On another thought, such a default would break 4K pages in case for packets
>> bigger than page size (i.e. jumbo frames). Should we care?
> 
> The hugepage size will not be 4K. Right?
> 
> Olivier,
> 
> As a maintainer any thoughts of exposing/not exposing the new mepool flag to
> Skip the page boundaries?
> 
> All,
> Either option is fine, Asking for feedback to processed further?
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information vattunuru
  2019-06-25 17:41             ` Stephen Hemminger
@ 2019-07-11 16:22             ` Ferruh Yigit
  2019-07-12 11:02               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  1 sibling, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-11 16:22 UTC (permalink / raw)
  To: vattunuru, dev; +Cc: olivier.matz, arybchenko

On 6/25/2019 4:56 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> PCI related information is needed in KNI kernel module,
> since it requires iommu domain info for address
> translations(using iommu_iova_to_phys() call) when
> KNI runs in IOVA = VA mode.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> ---
>  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
>  lib/librte_kni/rte_kni.c                          | 5 +++++
>  2 files changed, 12 insertions(+)
> 
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 91a1c14..5db5a13 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>  	void * mbuf_va;
>  	phys_addr_t mbuf_phys;
>  
> +	/* PCI info */
> +	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> +	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> +	uint8_t bus;                  /**< Device bus */
> +	uint8_t devid;                /**< Device ID */
> +	uint8_t function;             /**< Device function. */
> +
>  	uint16_t group_id;            /**< Group ID */
>  	uint32_t core_id;             /**< core ID to bind for kernel thread */
>  
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index e29d0cc..99c4bf5 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  		kni->ops.port_id = UINT16_MAX;
>  
>  	memset(&dev_info, 0, sizeof(dev_info));
> +	dev_info.bus = conf->addr.bus;
> +	dev_info.devid = conf->addr.devid;
> +	dev_info.function = conf->addr.function;
> +	dev_info.vendor_id = conf->id.vendor_id;
> +	dev_info.device_id = conf->id.device_id;

The sample application part that sets the 'conf' values was also removed; it needs to be added
back too.

The device part was removed from KNI but I can see why this patch requires it back.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/4] example/kni: add IOVA support for kni application
  2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 3/4] example/kni: add IOVA support for kni application vattunuru
@ 2019-07-11 16:23             ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-11 16:23 UTC (permalink / raw)
  To: vattunuru, dev; +Cc: olivier.matz, arybchenko, Kiran Kumar K

On 6/25/2019 4:56 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> Current KNI implementation operates in IOVA = PA mode,
> Patch adds support for IOVA = VA mode by addressing
> the issues with page address translations(IOVA <==> KVA).
> 
> In this patch KNI application creates mempool with
> "MEMPOOL_F_NO_PAGE_BOUND" flag to ensure all mbuf memory
> is with in the page boundaries and subsequently kernel KNI
> module uses iommu_iova_to_phys() and phys_to_virt() APIs
> to get the kernel virtual addresses.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
>  examples/kni/main.c                               | 53 ++++++++++++++++++++++-
>  lib/librte_eal/linux/eal/eal.c                    |  8 ----
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
>  lib/librte_kni/rte_kni.c                          |  2 +

Need to document this new iova_mode and related behavior in KNI documentation.

<...>

> @@ -975,7 +1026,7 @@ main(int argc, char** argv)
>  		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
>  
>  	/* Create the mbuf pool */
> -	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
> +	pktmbuf_pool = kni_packet_pool_create("mbuf_pool", NB_MBUF,

What about keeping the default mempool creation and using
"kni_packet_pool_create()" only when "iova_mode" is set? (Please read the below
comment first. :) )
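
i.e. something along these lines (a sketch only, assuming kni_packet_pool_create() keeps the rte_pktmbuf_pool_create() argument list as the diff above suggests, and that 'iova_mode' is an application-level flag):

	/* Create the mbuf pool; only the iova_mode case needs the special pool */
	if (iova_mode)
		pktmbuf_pool = kni_packet_pool_create("mbuf_pool", NB_MBUF,
			MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
	else
		pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
			MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());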

<...>

> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index 99c4bf5..4263f21 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -300,6 +300,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  	kni->group_id = conf->group_id;
>  	kni->mbuf_size = conf->mbuf_size;
>  
> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> +

The application using "iova_mode" has specific mempool requirements,
and just updating the sample application won't solve the problem here.

What do you think about making "iova_mode" user configurable instead of setting it
automatically in the library? Then an application that can satisfy the
requirement can set "iova_mode",
and "iova_mode" stays disabled by default.

To be able to remove the 'kni' check in the EAL, we still need to check
'rte_eal_iova_mode()'; what about the following:

a) iova_mode=pa && "rte_eal_iova_mode() == RTE_IOVA_PA" ==> Run KNI
b) iova_mode=pa && "rte_eal_iova_mode() == RTE_IOVA_VA" ==> Return error
c) iova_mode=va && "rte_eal_iova_mode() == RTE_IOVA_PA" ==> Run KNI
d) iova_mode=va && "rte_eal_iova_mode() == RTE_IOVA_VA" ==> Run KNI

For b), the workaround for a user who wants to run KNI is to give the EAL
parameter that forces DPDK into IOVA=PA mode.
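
A minimal sketch of that check inside rte_kni_alloc() could be (my illustration; 'conf->iova_mode' is the proposed, not yet existing, field):

	/* conf->iova_mode: 0 = pa (default), 1 = va */
	if (conf->iova_mode == 0 && rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* case b): cannot run, user must force the app to --iova-mode=pa */
		RTE_LOG(ERR, KNI,
			"KNI needs the iova_mode=va config or IOVA=PA EAL mode\n");
		return NULL;
	}
	/* cases a), c), d): run KNI; kernel uses VA handling only when both agree */
	dev_info.iova_mode =
		(conf->iova_mode && rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;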

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-06-25  3:57           ` [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module vattunuru
@ 2019-07-11 16:30             ` Ferruh Yigit
  2019-07-12 10:38               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-07-11 16:43             ` [dpdk-dev] " Stephen Hemminger
  1 sibling, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-11 16:30 UTC (permalink / raw)
  To: vattunuru, dev; +Cc: olivier.matz, arybchenko, Kiran Kumar K

On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> Patch adds support for kernel module to work in IOVA = VA mode,
> the idea is to get physical address from iova address using
> iommu_iova_to_phys API and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
> 
> When compared with IOVA = PA mode, there is no performance
> drop with this approach.
> 
> This approach does not work with the kernel versions less
> than 4.4.0 because of API compatibility issues.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

<...>

> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>  
>  	/* Translate user space info into kernel space info */
> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -	kni->sync_va = dev_info.sync_va;
> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +	if (dev_info.iova_mode) {
> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE

We have "kni/compat.h" to put the version checks, please use abstracted feature
checks only in the code.
From experience this goes ugly quickly with the addition to distro kernels and
their specific versioning, so better to hide these all from the source code.

And this version requirement needs to be documented in kni doc.

> +		(void)pci;
> +		pr_err("Kernel version is not supported\n");

Can you please include the 'iova_mode' condition in the log message, because this
kernel version is supported if the user wants to use KNI via the 'iova_mode == 0' path.

> +		return -EINVAL;
> +#else
> +		pci = pci_get_device(dev_info.vendor_id,
> +				     dev_info.device_id, NULL);
> +		while (pci) {
> +			if ((pci->bus->number == dev_info.bus) &&
> +			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> +			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
> +				domain = iommu_get_domain_for_dev(&pci->dev);
> +				break;
> +			}
> +			pci = pci_get_device(dev_info.vendor_id,
> +					     dev_info.device_id, pci);
> +		}

What if 'pci' is NULL here?
In kni it is not required to provide a device at all.

<...>

> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>  			return;
>  
>  		for (i = 0; i < num_rx; i++) {
> -			kva = pa2kva(kni->pa[i]);
> +			if (likely(kni->iova_mode == 1))
> +				kva = iova2kva(kni, kni->pa[i]);
> +			else
> +				kva = pa2kva(kni->pa[i]);

To reduce the churn, what about updating 'pa2kva()' and putting the
"(kni->iova_mode == 1)" check there? Does it help? (Not only 'pa2kva()' but its
friends too, and if it makes more sense I agree to rename the functions.)

And btw, why is the 'likely' case "kni->iova_mode == 1"?

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-06-25  3:57           ` [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module vattunuru
  2019-07-11 16:30             ` Ferruh Yigit
@ 2019-07-11 16:43             ` Stephen Hemminger
  1 sibling, 0 replies; 251+ messages in thread
From: Stephen Hemminger @ 2019-07-11 16:43 UTC (permalink / raw)
  To: vattunuru; +Cc: dev, ferruh.yigit, olivier.matz, arybchenko, Kiran Kumar K

On Tue, 25 Jun 2019 09:27:00 +0530
<vattunuru@marvell.com> wrote:

> diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
> index 1fc5eeb..b70c827 100644
> --- a/kernel/linux/kni/kni_misc.c
> +++ b/kernel/linux/kni/kni_misc.c
> @@ -294,6 +294,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	struct rte_kni_device_info dev_info;
>  	struct net_device *net_dev = NULL;
>  	struct kni_dev *kni, *dev, *n;
> +	struct pci_dev *pci = NULL;
> +	struct iommu_domain *domain = NULL;

Please don't do unnecessary initialization. It defeats the purpose
of the compiler and static checkers.

> +	phys_addr_t phys_addr;
>  
>  	pr_info("Creating kni...\n");
>  	/* Check the buffer size, to avoid warning */
> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>  
>  	/* Translate user space info into kernel space info */
> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -	kni->sync_va = dev_info.sync_va;
> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +	if (dev_info.iova_mode) {
> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> +		(void)pci;
> +		pr_err("Kernel version is not supported\n");
> +		return -EINVAL;
> +#else
> +		pci = pci_get_device(dev_info.vendor_id,
> +				     dev_info.device_id, NULL);
> +		while (pci) {
> +			if ((pci->bus->number == dev_info.bus) &&
> +			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> +			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
> +				domain = iommu_get_domain_for_dev(&pci->dev);
> +				break;
> +			}
> +			pci = pci_get_device(dev_info.vendor_id,
> +					     dev_info.device_id, pci);
> +		}
> +#endif

Why not move the variable pci inside the if() statement? Then (void)pci
is unnecessary.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-07-11 16:30             ` Ferruh Yigit
@ 2019-07-12 10:38               ` Vamsi Krishna Attunuru
  2019-07-12 11:10                 ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-12 10:38 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: olivier.matz, arybchenko, Kiran Kumar Kokkilagadda




________________________________
From: Ferruh Yigit <ferruh.yigit@intel.com>
Sent: Thursday, July 11, 2019 10:00 PM
To: Vamsi Krishna Attunuru; dev@dpdk.org
Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
Subject: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module

External Email

----------------------------------------------------------------------
On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
>
> Patch adds support for kernel module to work in IOVA = VA mode,
> the idea is to get physical address from iova address using
> iommu_iova_to_phys API and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
>
> When compared with IOVA = PA mode, there is no performance
> drop with this approach.
>
> This approach does not work with the kernel versions less
> than 4.4.0 because of API compatibility issues.
>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

<...>

> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>
>        /* Translate user space info into kernel space info */
> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -     kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -     kni->req_q = phys_to_virt(dev_info.req_phys);
> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -     kni->sync_va = dev_info.sync_va;
> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +     if (dev_info.iova_mode) {
> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE

We have "kni/compat.h" to put the version checks, please use abstracted feature
checks only in the code.
From experience this goes ugly quickly with the addition to distro kernels and
their specific versioning, so better to hide these all from the source code.

And this version requirement needs to be documented in kni doc.

ack

> +             (void)pci;
> +             pr_err("Kernel version is not supported\n");

Can you please include 'iova_mode' condition into the message log, because this
kernel version is supported if user wants to use via 'iova_mode == 0' condition.

ack

> +             return -EINVAL;
> +#else
> +             pci = pci_get_device(dev_info.vendor_id,
> +                                  dev_info.device_id, NULL);
> +             while (pci) {
> +                     if ((pci->bus->number == dev_info.bus) &&
> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> +                         (PCI_FUNC(pci->devfn) == dev_info.function)) {
> +                             domain = iommu_get_domain_for_dev(&pci->dev);
> +                             break;
> +                     }
> +                     pci = pci_get_device(dev_info.vendor_id,
> +                                          dev_info.device_id, pci);
> +             }

What if 'pci' is NULL here?
In kni it is not required to provide a device at all.

Ack, will add a NULL check.
The other point is not clear to me: device info is absolutely required, at least
for IOVA=VA mode, since it is needed to procure the iommu domain details.
Any thoughts or ways to address this without a device?

<...>

> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>                        return;
>
>                for (i = 0; i < num_rx; i++) {
> -                     kva = pa2kva(kni->pa[i]);
> +                     if (likely(kni->iova_mode == 1))
> +                             kva = iova2kva(kni, kni->pa[i]);
> +                     else
> +                             kva = pa2kva(kni->pa[i]);

To reduce the churn, what about updating the 'pa2kva()' and put the
"(kni->iova_mode == 1)" check there? Does it help? (not only 'pa2kva()' but its
friends also, and if it makes more sense agree to rename the functions)

No, in VA mode kni->pa[i] holds an IOVA address, and calling pa2kva() on an IOVA address might
crash; hence the if..else check is added.
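
For reference, a rough sketch of the two helpers being compared (reconstructed from the patch description; it assumes the kni_dev stores the iommu_domain found at create time, and the exact signatures may differ):

/* IOVA=VA mode: resolve the IOVA through the IOMMU first */
static inline void *
iova2kva(struct kni_dev *kni, void *iova)
{
	return phys_to_virt(iommu_iova_to_phys(kni->domain, (uintptr_t)iova));
}

/* IOVA=PA mode: the value handed over already is a physical address */
static inline void *
pa2kva(void *pa)
{
	return phys_to_virt((unsigned long)pa);
}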

And btw, why 'likely' case is "kni->iova_mode == 1"?

No specific reason other than branch prediction; I will remove it if it is really harmful to PA mode.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 2/4] lib/kni: add PCI related information
  2019-07-11 16:22             ` [dpdk-dev] " Ferruh Yigit
@ 2019-07-12 11:02               ` Vamsi Krishna Attunuru
  2019-07-12 11:11                 ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-12 11:02 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: olivier.matz, arybchenko




________________________________
From: dev <dev-bounces@dpdk.org> on behalf of Ferruh Yigit <ferruh.yigit@intel.com>
Sent: Thursday, July 11, 2019 9:52 PM
To: Vamsi Krishna Attunuru; dev@dpdk.org
Cc: olivier.matz@6wind.com; arybchenko@solarflare.com
Subject: [EXT] Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information

External Email

----------------------------------------------------------------------
On 6/25/2019 4:56 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> PCI related information is needed in KNI kernel module,
> since it requires iommu domain info for address
> translations(using iommu_iova_to_phys() call) when
> KNI runs in IOVA = VA mode.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> ---
>  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
>  lib/librte_kni/rte_kni.c                          | 5 +++++
>  2 files changed, 12 insertions(+)
>
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 91a1c14..5db5a13 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>        void * mbuf_va;
>        phys_addr_t mbuf_phys;
>
> +     /* PCI info */
> +     uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> +     uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> +     uint8_t bus;                  /**< Device bus */
> +     uint8_t devid;                /**< Device ID */
> +     uint8_t function;             /**< Device function. */
> +
>        uint16_t group_id;            /**< Group ID */
>        uint32_t core_id;             /**< core ID to bind for kernel thread */
>
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index e29d0cc..99c4bf5 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>                kni->ops.port_id = UINT16_MAX;
>
>        memset(&dev_info, 0, sizeof(dev_info));
> +     dev_info.bus = conf->addr.bus;
> +     dev_info.devid = conf->addr.devid;
> +     dev_info.function = conf->addr.function;
> +     dev_info.vendor_id = conf->id.vendor_id;
> +     dev_info.device_id = conf->id.device_id;

Sample application part to set 'conf' values were also removed, need to add them
back too.

Ack, will add them back in the next version of the patches.

The device part was removed from KNI but I can see why this patch requires it back.

Yes, rte_kni_device_info is the only struct that carries all the required info to the kernel module;
all this device info can be protected under the config check, though.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-07-12 10:38               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-07-12 11:10                 ` Ferruh Yigit
  2019-07-12 12:27                   ` Vamsi Krishna Attunuru
  2019-07-12 16:29                   ` Vamsi Krishna Attunuru
  0 siblings, 2 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-12 11:10 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: olivier.matz, arybchenko, Kiran Kumar Kokkilagadda

On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
> 
> 
> 
> --------------------------------------------------------------------------------
> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> *Sent:* Thursday, July 11, 2019 10:00 PM
> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni
> module
>  
> External Email
> 
> ----------------------------------------------------------------------
> On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
>> From: Kiran Kumar K <kirankumark@marvell.com>
>> 
>> Patch adds support for kernel module to work in IOVA = VA mode,
>> the idea is to get physical address from iova address using
>> iommu_iova_to_phys API and later use phys_to_virt API to
>> convert the physical address to kernel virtual address.
>> 
>> When compared with IOVA = PA mode, there is no performance
>> drop with this approach.
>> 
>> This approach does not work with the kernel versions less
>> than 4.4.0 because of API compatibility issues.
>> 
>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> 
> <...>
> 
>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>>        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>  
>>        /* Translate user space info into kernel space info */
>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
>> -
>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
>> -     kni->sync_va = dev_info.sync_va;
>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>> +     if (dev_info.iova_mode) {
>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> 
> We have "kni/compat.h" to put the version checks, please use abstracted feature
> checks only in the code.
> From experience this goes ugly quickly with the addition to distro kernels and
> their specific versioning, so better to hide these all from the source code.
> 
> And this version requirement needs to be documented in kni doc.
> 
> ack
> 
>> +             (void)pci;
>> +             pr_err("Kernel version is not supported\n");
> 
> Can you please include 'iova_mode' condition into the message log, because this
> kernel version is supported if user wants to use via 'iova_mode == 0' condition.
> 
> ack
> 
>> +             return -EINVAL;
>> +#else
>> +             pci = pci_get_device(dev_info.vendor_id,
>> +                                  dev_info.device_id, NULL);
>> +             while (pci) {
>> +                     if ((pci->bus->number == dev_info.bus) &&
>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
>> +                         (PCI_FUNC(pci->devfn) == dev_info.function)) {
>> +                             domain = iommu_get_domain_for_dev(&pci->dev);
>> +                             break;
>> +                     }
>> +                     pci = pci_get_device(dev_info.vendor_id,
>> +                                          dev_info.device_id, pci);
>> +             }
> 
> What if 'pci' is NULL here?
> In kni it is not required to provide a device at all.
> 
> Ack, will add a NULL check.
> The other point is not clear to me; device info is absolutely required at least
> for IOVA=VA mode, since it is needed to procure the iommu domain details.

"device info is absolutely required" *only* for IOVA=VA mode, so user may skip
providing it.

> Any thoughts or ways to address this without device.?

Return an error if 'iova_mode' is requested but device info is not provided?

But you didn't reply about passing 'iova_mode' from the application, I would like
to hear what you are thinking about it.

> 
> <...>
> 
>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>>                        return;
>>  
>>                for (i = 0; i < num_rx; i++) {
>> -                     kva = pa2kva(kni->pa[i]);
>> +                     if (likely(kni->iova_mode == 1))
>> +                             kva = iova2kva(kni, kni->pa[i]);
>> +                     else
>> +                             kva = pa2kva(kni->pa[i]);
> 
> To reduce the churn, what about updating the 'pa2kva()' and put the
> "(kni->iova_mode == 1)" check there? Does it help? (not only 'pa2kva()' but its
> friends also, and if it makes more sense agree to rename the functions)
> 
> No, in VA mode kni->pa[i] points to an iova address, and pa2kva() of an iova address
> might crash, hence the if..else check is added.

I understand that part.
What I am suggesting is something like this:

kva = common_function(kni, kni->pa[i]);

---

common_function() {
	if (unlikely(kni->iova_mode == 1))
		return iova2kva(kni, kni->pa[i]);
	return pa2kva(kni->pa[i]);
}

To hide the check in the function and make code more readable

> 
> And btw, why is the 'likely' case "kni->iova_mode == 1"?
> 
> No specific reason other than branch prediction; will remove this if it's really
> harmful to PA mode.
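
A slightly fleshed-out sketch of the wrapper suggested above, assuming the
'iova_mode'/'domain' fields proposed for struct kni_dev and the existing
pa2kva()/iova2kva() helpers in kni_net.c (this is essentially the get_kva()
helper that the later v7 revision adds; illustrative only, not committed code):

static inline void *
get_kva(struct kni_dev *kni, void *pa)
{
	/* In IOVA=VA mode the fifo entries carry IOVAs, so translate them
	 * through the device's iommu domain; otherwise they are physical
	 * addresses and the phys_to_virt() based pa2kva() is enough.
	 */
	if (kni->iova_mode == 1)
		return iova2kva(kni, pa);

	return pa2kva(pa);
}

/* call site, e.g. in kni_fifo_trans_pa2va(): */
kva = get_kva(kni, kni->pa[i]);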


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 2/4] lib/kni: add PCI related information
  2019-07-12 11:02               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-07-12 11:11                 ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-12 11:11 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev; +Cc: olivier.matz, arybchenko

On 7/12/2019 12:02 PM, Vamsi Krishna Attunuru wrote:
> 
> 
> 
> --------------------------------------------------------------------------------
> *From:* dev <dev-bounces@dpdk.org> on behalf of Ferruh Yigit
> <ferruh.yigit@intel.com>
> *Sent:* Thursday, July 11, 2019 9:52 PM
> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com
> *Subject:* [EXT] Re: [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information
>  
> External Email
> 
> ----------------------------------------------------------------------
> On 6/25/2019 4:56 AM, vattunuru@marvell.com wrote:
>> From: Vamsi Attunuru <vattunuru@marvell.com>
>> 
>> PCI related information is needed in KNI kernel module,
>> since it requires iommu domain info for address
>> translations(using iommu_iova_to_phys() call) when
>> KNI runs in IOVA = VA mode.
>> 
>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>> ---
>>  lib/librte_eal/linux/eal/include/rte_kni_common.h | 7 +++++++
>>  lib/librte_kni/rte_kni.c                          | 5 +++++
>>  2 files changed, 12 insertions(+)
>> 
>> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
>> index 91a1c14..5db5a13 100644
>> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
>> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
>> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>>        void * mbuf_va;
>>        phys_addr_t mbuf_phys;
>>  
>> +     /* PCI info */
>> +     uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
>> +     uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
>> +     uint8_t bus;                  /**< Device bus */
>> +     uint8_t devid;                /**< Device ID */
>> +     uint8_t function;             /**< Device function. */
>> +
>>        uint16_t group_id;            /**< Group ID */
>>        uint32_t core_id;             /**< core ID to bind for kernel thread */
>>  
>> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
>> index e29d0cc..99c4bf5 100644
>> --- a/lib/librte_kni/rte_kni.c
>> +++ b/lib/librte_kni/rte_kni.c
>> @@ -242,6 +242,11 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>>                kni->ops.port_id = UINT16_MAX;
>>  
>>        memset(&dev_info, 0, sizeof(dev_info));
>> +     dev_info.bus = conf->addr.bus;
>> +     dev_info.devid = conf->addr.devid;
>> +     dev_info.function = conf->addr.function;
>> +     dev_info.vendor_id = conf->id.vendor_id;
>> +     dev_info.device_id = conf->id.device_id;
> 
> The sample application part that sets the 'conf' values was also removed; it needs
> to be added back too.
> 
> Ack, adding in next version of patches.
> 
> The device part was removed from KNI but I can see why this patch requires it back.
> 
> Yes, rte_kni_device_info is the only struct that carries all the required info to the
> kernel module,
> though all this device info can be protected under the config check.

But still, let's make the device info a requirement only for 'iova_mode', not for
all of KNI.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-07-12 11:10                 ` Ferruh Yigit
@ 2019-07-12 12:27                   ` Vamsi Krishna Attunuru
  2019-07-12 16:29                   ` Vamsi Krishna Attunuru
  1 sibling, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-12 12:27 UTC (permalink / raw)
  To: Ferruh Yigit, dev, Jerin Jacob Kollanukkaran
  Cc: olivier.matz, arybchenko, Kiran Kumar Kokkilagadda




________________________________
From: Ferruh Yigit <ferruh.yigit@intel.com>
Sent: Friday, July 12, 2019 4:40 PM
To: Vamsi Krishna Attunuru; dev@dpdk.org
Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module

On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
>
>
>
> --------------------------------------------------------------------------------
> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> *Sent:* Thursday, July 11, 2019 10:00 PM
> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni
> module
>
> External Email
>
> ----------------------------------------------------------------------
> On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
>> From: Kiran Kumar K <kirankumark@marvell.com>
>>
>> Patch adds support for kernel module to work in IOVA = VA mode,
>> the idea is to get physical address from iova address using
>> iommu_iova_to_phys API and later use phys_to_virt API to
>> convert the physical address to kernel virtual address.
>>
>> When compared with IOVA = PA mode, there is no performance
>> drop with this approach.
>>
>> This approach does not work with the kernel versions less
>> than 4.4.0 because of API compatibility issues.
>>
>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>
> <...>
>
>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>>        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>
>>        /* Translate user space info into kernel space info */
>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
>> -
>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
>> -     kni->sync_va = dev_info.sync_va;
>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>> +     if (dev_info.iova_mode) {
>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
>
> We have "kni/compat.h" to put the version checks, please use abstracted feature
> checks only in the code.
> From experience this goes ugly quickly with the addition to distro kernels and
> their specific versioning, so better to hide these all from the source code.
>
> And this version requirement needs to be documented in kni doc.
>
> ack
>
>> +             (void)pci;
>> +             pr_err("Kernel version is not supported\n");
>
> Can you please include 'iova_mode' condition into the message log, because this
> kernel version is supported if user wants to use via 'iova_mode == 0' condition.
>
> ack
>
>> +             return -EINVAL;
>> +#else
>> +             pci = pci_get_device(dev_info.vendor_id,
>> +                                  dev_info.device_id, NULL);
>> +             while (pci) {
>> +                     if ((pci->bus->number == dev_info.bus) &&
>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
>> +                         (PCI_FUNC(pci->devfn) == dev_info.function)) {
>> +                             domain = iommu_get_domain_for_dev(&pci->dev);
>> +                             break;
>> +                     }
>> +                     pci = pci_get_device(dev_info.vendor_id,
>> +                                          dev_info.device_id, pci);
>> +             }
>
> What if 'pci' is NULL here?
> In kni it is not required to provide a device at all.
>
> Ack, will add a NULL check.
> other point is not clear to me, device info is absolutely required at least
> for  IOVA=VA mode, since it requires to procure iommu domain details.

"device info is absolutely required" *only* for IOVA=VA mode, so user may skip
to provide it.

> Any thoughts or ways to address this without device.?

Return error if 'iova_mode' requested but device info not?

But you didn't replied to passing 'iova_mode' from application, I would like
hear what you are thinking about it..

There is a query in that regard in the cover letter mail chain. Anyway, here it is:

"""
> I support the idea of removing 'kni' forcing the IOVA=PA mode, but I am also not
> sure about forcing all KNI users to update their code to allocate the mempool in a
> very specific way.
>
> What about giving more control to the user on this?
>
> Any user who wants to use IOVA=VA and KNI together can update the application to
> justify memory allocation of the KNI and give an explicit "kni iova_mode=1"
> config.

Jerin > Where does this config come from: eal, the kni sample app, or the KNI public API?
"""


>
> <...>
>
>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>>                        return;
>>
>>                for (i = 0; i < num_rx; i++) {
>> -                     kva = pa2kva(kni->pa[i]);
>> +                     if (likely(kni->iova_mode == 1))
>> +                             kva = iova2kva(kni, kni->pa[i]);
>> +                     else
>> +                             kva = pa2kva(kni->pa[i]);
>
> To reduce the churn, what about updating the 'pa2kva()' and put the
> "(kni->iova_mode == 1)" check there? Does it help? (not only 'pa2kva()' but its
> friends also, and if it makes more sense agree to rename the functions)
>
> No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova address might
> crash, hence the if..else check is added.

I understand that part.
What I am suggestion is something like this:

kva = common_function(kni, kni->pa[i]);

---

common_function() {
        if (unlikely(kni->iova_mode == 1))
                return iova2kva(kni, kni->pa[i]);
        return pa2kva(kni->pa[i]);
}

To hide the check in the function and make code more readable

>
> And btw, why 'likely' case is "kni->iova_mode == 1"?
>
> no specific case other than branch predict, will remove this if it's really
> harmful to PA mode.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-07-12 11:10                 ` Ferruh Yigit
  2019-07-12 12:27                   ` Vamsi Krishna Attunuru
@ 2019-07-12 16:29                   ` Vamsi Krishna Attunuru
  2019-07-15 11:26                     ` Ferruh Yigit
  1 sibling, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-12 16:29 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: olivier.matz, arybchenko, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Friday, July 12, 2019 4:40 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in
> kni module
> 
> On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
> >
> >
> >
> > ----------------------------------------------------------------------
> > ----------
> > *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> > *Sent:* Thursday, July 11, 2019 10:00 PM
> > *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> > *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> > Kokkilagadda
> > *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support
> > in kni module
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> >> From: Kiran Kumar K <kirankumark@marvell.com>
> >>
> >> Patch adds support for kernel module to work in IOVA = VA mode, the
> >> idea is to get physical address from iova address using
> >> iommu_iova_to_phys API and later use phys_to_virt API to convert the
> >> physical address to kernel virtual address.
> >>
> >> When compared with IOVA = PA mode, there is no performance drop
> with
> >> this approach.
> >>
> >> This approach does not work with the kernel versions less than 4.4.0
> >> because of API compatibility issues.
> >>
> >> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > <...>
> >
> >> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t
> >>ioctl_num,
> >>        strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> >>
> >>        /* Translate user space info into kernel space info */
> >> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
> >> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
> >> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> >> -     kni->free_q = phys_to_virt(dev_info.free_phys);
> >> -
> >> -     kni->req_q = phys_to_virt(dev_info.req_phys);
> >> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
> >> -     kni->sync_va = dev_info.sync_va;
> >> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> >> +     if (dev_info.iova_mode) {
> >> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> >
> > We have "kni/compat.h" to put the version checks, please use
> > abstracted feature checks only in the code.
> > From experience this goes ugly quickly with the addition to distro
> > kernels and their specific versioning, so better to hide these all from the
> source code.
> >
> > And this version requirement needs to be documented in kni doc.
> >
> > ack
> >
> >> +             (void)pci;
> >> +             pr_err("Kernel version is not supported\n");
> >
> > Can you please include 'iova_mode' condition into the message log,
> > because this kernel version is supported if user wants to use via
> 'iova_mode == 0' condition.
> >
> > ack
> >
> >> +             return -EINVAL;
> >> +#else
> >> +             pci = pci_get_device(dev_info.vendor_id,
> >> +                                  dev_info.device_id, NULL);
> >> +             while (pci) {
> >> +                     if ((pci->bus->number == dev_info.bus) &&
> >> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
> >> +                         (PCI_FUNC(pci->devfn) ==
> >> +dev_info.function)) {
> >> +                             domain =
> >> +iommu_get_domain_for_dev(&pci->dev);
> >> +                             break;
> >> +                     }
> >> +                     pci = pci_get_device(dev_info.vendor_id,
> >> +                                          dev_info.device_id, pci);
> >> +             }
> >
> > What if 'pci' is NULL here?
> > In kni it is not required to provide a device at all.
> >
> > Ack, will add a NULL check.
> > other point is not clear to me, device info is absolutely required at
> > least for  IOVA=VA mode, since it requires to procure iommu domain
> details.
> 
> "device info is absolutely required" *only* for IOVA=VA mode, so user may
> skip to provide it.
> 
> > Any thoughts or ways to address this without device.?
> 
> Return error if 'iova_mode' requested but device info not?
> 
> But you didn't replied to passing 'iova_mode' from application, I would like
> hear what you are thinking about it..

One query regarding defining the config for kni:
Where does this config come from: eal, the kni sample app, or the KNI public API?

> 
> >
> > <...>
> >
> >> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
> >>                        return;
> >>
> >>                for (i = 0; i < num_rx; i++) {
> >> -                     kva = pa2kva(kni->pa[i]);
> >> +                     if (likely(kni->iova_mode == 1))
> >> +                             kva = iova2kva(kni, kni->pa[i]);
> >> +                     else
> >> +                             kva = pa2kva(kni->pa[i]);
> >
> > To reduce the churn, what about updating the 'pa2kva()' and put the
> > "(kni->iova_mode == 1)" check there? Does it help? (not only
> > 'pa2kva()' but its friends also, and if it makes more sense agree to
> > rename the functions)
> >
> > No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova
> > address might crash, hence the if..else check is added.
> 
> I understand that part.
> What I am suggestion is something like this:
> 
> kva = common_function(kni, kni->pa[i]);
> 
> ---
> 
> common_function() {
> 	if (unlikely(kni->iova_mode == 1))
> 		return iova2kva(kni, kni->pa[i]);
> 	return pa2kva(kni->pa[i]);
> }
> 
> To hide the check in the function and make code more readable
> 
> >
> > And btw, why 'likely' case is "kni->iova_mode == 1"?
> >
> > no specific case other than branch predict, will remove this if it's
> > really harmful to PA mode.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-07-12 16:29                   ` Vamsi Krishna Attunuru
@ 2019-07-15 11:26                     ` Ferruh Yigit
  2019-07-15 13:06                       ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-07-15 11:26 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: olivier.matz, arybchenko, Kiran Kumar Kokkilagadda

On 7/12/2019 5:29 PM, Vamsi Krishna Attunuru wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>> Sent: Friday, July 12, 2019 4:40 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
>> Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in
>> kni module
>>
>> On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
>>>
>>>
>>>
>>> ----------------------------------------------------------------------
>>> ----------
>>> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
>>> *Sent:* Thursday, July 11, 2019 10:00 PM
>>> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
>>> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
>>> Kokkilagadda
>>> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support
>>> in kni module
>>>
>>> External Email
>>>
>>> ----------------------------------------------------------------------
>>> On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
>>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>>
>>>> Patch adds support for kernel module to work in IOVA = VA mode, the
>>>> idea is to get physical address from iova address using
>>>> iommu_iova_to_phys API and later use phys_to_virt API to convert the
>>>> physical address to kernel virtual address.
>>>>
>>>> When compared with IOVA = PA mode, there is no performance drop
>> with
>>>> this approach.
>>>>
>>>> This approach does not work with the kernel versions less than 4.4.0
>>>> because of API compatibility issues.
>>>>
>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> <...>
>>>
>>>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t
>>>> ioctl_num,
>>>>         strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>>>
>>>>         /* Translate user space info into kernel space info */
>>>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
>>>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
>>>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>>>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
>>>> -
>>>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
>>>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
>>>> -     kni->sync_va = dev_info.sync_va;
>>>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>>>> +     if (dev_info.iova_mode) {
>>>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
>>>
>>> We have "kni/compat.h" to put the version checks, please use
>>> abstracted feature checks only in the code.
>>> From experience this goes ugly quickly with the addition to distro
>>> kernels and their specific versioning, so better to hide these all from the
>> source code.
>>>
>>> And this version requirement needs to be documented in kni doc.
>>>
>>> ack
>>>
>>>> +             (void)pci;
>>>> +             pr_err("Kernel version is not supported\n");
>>>
>>> Can you please include 'iova_mode' condition into the message log,
>>> because this kernel version is supported if user wants to use via
>> 'iova_mode == 0' condition.
>>>
>>> ack
>>>
>>>> +             return -EINVAL;
>>>> +#else
>>>> +             pci = pci_get_device(dev_info.vendor_id,
>>>> +                                  dev_info.device_id, NULL);
>>>> +             while (pci) {
>>>> +                     if ((pci->bus->number == dev_info.bus) &&
>>>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid) &&
>>>> +                         (PCI_FUNC(pci->devfn) ==
>>>> +dev_info.function)) {
>>>> +                             domain =
>>>> +iommu_get_domain_for_dev(&pci->dev);
>>>> +                             break;
>>>> +                     }
>>>> +                     pci = pci_get_device(dev_info.vendor_id,
>>>> +                                          dev_info.device_id, pci);
>>>> +             }
>>>
>>> What if 'pci' is NULL here?
>>> In kni it is not required to provide a device at all.
>>>
>>> Ack, will add a NULL check.
>>> other point is not clear to me, device info is absolutely required at
>>> least for  IOVA=VA mode, since it requires to procure iommu domain
>> details.
>>
>> "device info is absolutely required" *only* for IOVA=VA mode, so user may
>> skip to provide it.
>>
>>> Any thoughts or ways to address this without device.?
>>
>> Return error if 'iova_mode' requested but device info not?
>>
>> But you didn't replied to passing 'iova_mode' from application, I would like
>> hear what you are thinking about it..
> 
> One query regarding defining config for kni
> Where this config comes, eal or kni sample app or KNI public API?

Config comes from the application, but the KNI public API has to validate whether the
request can be justified and return an error if it can't be.
I think the KNI API check is required to be able to remove the check in the eal.
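
A sketch of what such a validation in the KNI lib could look like (assumptions:
'buf_sz' and 'page_sz' are derived from the pktmbuf pool as in the later v7 patch;
illustrative only, not the committed code):

	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
		/* Refuse requests the kernel module cannot honour, e.g. an
		 * mbuf element that does not fit within one hugepage.
		 */
		if (buf_sz > page_sz) {
			RTE_LOG(ERR, KNI,
				"IOVA=VA not possible with this mempool\n");
			return NULL;
		}
		dev_info.iova_mode = 1;
	}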

> 
>>
>>>
>>> <...>
>>>
>>>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
>>>>                         return;
>>>>
>>>>                 for (i = 0; i < num_rx; i++) {
>>>> -                     kva = pa2kva(kni->pa[i]);
>>>> +                     if (likely(kni->iova_mode == 1))
>>>> +                             kva = iova2kva(kni, kni->pa[i]);
>>>> +                     else
>>>> +                             kva = pa2kva(kni->pa[i]);
>>>
>>> To reduce the churn, what about updating the 'pa2kva()' and put the
>>> "(kni->iova_mode == 1)" check there? Does it help? (not only
>>> 'pa2kva()' but its friends also, and if it makes more sense agree to
>>> rename the functions)
>>>
>>> No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova
>>> address might crash, hence the if..else check is added.
>>
>> I understand that part.
>> What I am suggestion is something like this:
>>
>> kva = common_function(kni, kni->pa[i]);
>>
>> ---
>>
>> common_function() {
>> 	if (unlikely(kni->iova_mode == 1))
>> 		return iova2kva(kni, kni->pa[i]);
>> 	return pa2kva(kni->pa[i]);
>> }
>>
>> To hide the check in the function and make code more readable
>>
>>>
>>> And btw, why 'likely' case is "kni->iova_mode == 1"?
>>>
>>> no specific case other than branch predict, will remove this if it's
>>> really harmful to PA mode.
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module
  2019-07-15 11:26                     ` Ferruh Yigit
@ 2019-07-15 13:06                       ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-15 13:06 UTC (permalink / raw)
  To: Ferruh Yigit, dev; +Cc: olivier.matz, arybchenko, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Monday, July 15, 2019 4:57 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni
> module
> 
> On 7/12/2019 5:29 PM, Vamsi Krishna Attunuru wrote:
> >
> >
> >> -----Original Message-----
> >> From: Ferruh Yigit <ferruh.yigit@intel.com>
> >> Sent: Friday, July 12, 2019 4:40 PM
> >> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> >> Kokkilagadda <kirankumark@marvell.com>
> >> Subject: Re: [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA
> >> support in kni module
> >>
> >> On 7/12/2019 11:38 AM, Vamsi Krishna Attunuru wrote:
> >>>
> >>>
> >>>
> >>> --------------------------------------------------------------------
> >>> --
> >>> ----------
> >>> *From:* Ferruh Yigit <ferruh.yigit@intel.com>
> >>> *Sent:* Thursday, July 11, 2019 10:00 PM
> >>> *To:* Vamsi Krishna Attunuru; dev@dpdk.org
> >>> *Cc:* olivier.matz@6wind.com; arybchenko@solarflare.com; Kiran Kumar
> >>> Kokkilagadda
> >>> *Subject:* [EXT] Re: [PATCH v6 4/4] kernel/linux/kni: add IOVA
> >>> support in kni module
> >>>
> >>> External Email
> >>>
> >>> --------------------------------------------------------------------
> >>> -- On 6/25/2019 4:57 AM, vattunuru@marvell.com wrote:
> >>>> From: Kiran Kumar K <kirankumark@marvell.com>
> >>>>
> >>>> Patch adds support for kernel module to work in IOVA = VA mode, the
> >>>> idea is to get physical address from iova address using
> >>>> iommu_iova_to_phys API and later use phys_to_virt API to convert
> >>>> the physical address to kernel virtual address.
> >>>>
> >>>> When compared with IOVA = PA mode, there is no performance drop
> >> with
> >>>> this approach.
> >>>>
> >>>> This approach does not work with the kernel versions less than
> >>>> 4.4.0 because of API compatibility issues.
> >>>>
> >>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >>>
> >>> <...>
> >>>
> >>>> @@ -351,15 +354,56 @@ kni_ioctl_create(struct net *net, uint32_t
> >>>> ioctl_num,
> >>>>         strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> >>>>
> >>>>         /* Translate user space info into kernel space info */
> >>>> -     kni->tx_q = phys_to_virt(dev_info.tx_phys);
> >>>> -     kni->rx_q = phys_to_virt(dev_info.rx_phys);
> >>>> -     kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> >>>> -     kni->free_q = phys_to_virt(dev_info.free_phys);
> >>>> -
> >>>> -     kni->req_q = phys_to_virt(dev_info.req_phys);
> >>>> -     kni->resp_q = phys_to_virt(dev_info.resp_phys);
> >>>> -     kni->sync_va = dev_info.sync_va;
> >>>> -     kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> >>>> +     if (dev_info.iova_mode) {
> >>>> +#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
> >>>
> >>> We have "kni/compat.h" to put the version checks, please use
> >>> abstracted feature checks only in the code.
> >>> From experience this goes ugly quickly with the addition to distro
> >>> kernels and their specific versioning, so better to hide these all
> >>> from the
> >> source code.
> >>>
> >>> And this version requirement needs to be documented in kni doc.
> >>>
> >>> ack
> >>>
> >>>> +             (void)pci;
> >>>> +             pr_err("Kernel version is not supported\n");
> >>>
> >>> Can you please include 'iova_mode' condition into the message log,
> >>> because this kernel version is supported if user wants to use via
> >> 'iova_mode == 0' condition.
> >>>
> >>> ack
> >>>
> >>>> +             return -EINVAL;
> >>>> +#else
> >>>> +             pci = pci_get_device(dev_info.vendor_id,
> >>>> +                                  dev_info.device_id, NULL);
> >>>> +             while (pci) {
> >>>> +                     if ((pci->bus->number == dev_info.bus) &&
> >>>> +                         (PCI_SLOT(pci->devfn) == dev_info.devid)
> >>>> +&&
> >>>> +                         (PCI_FUNC(pci->devfn) ==
> >>>> +dev_info.function)) {
> >>>> +                             domain =
> >>>> +iommu_get_domain_for_dev(&pci->dev);
> >>>> +                             break;
> >>>> +                     }
> >>>> +                     pci = pci_get_device(dev_info.vendor_id,
> >>>> +                                          dev_info.device_id,
> >>>> +pci);
> >>>> +             }
> >>>
> >>> What if 'pci' is NULL here?
> >>> In kni it is not required to provide a device at all.
> >>>
> >>> Ack, will add a NULL check.
> >>> other point is not clear to me, device info is absolutely required
> >>> at least for  IOVA=VA mode, since it requires to procure iommu
> >>> domain
> >> details.
> >>
> >> "device info is absolutely required" *only* for IOVA=VA mode, so user
> >> may skip to provide it.
> >>
> >>> Any thoughts or ways to address this without device.?
> >>
> >> Return error if 'iova_mode' requested but device info not?
> >>
> >> But you didn't replied to passing 'iova_mode' from application, I
> >> would like hear what you are thinking about it..
> >
> > One query regarding defining config for kni Where this config comes,
> > eal or kni sample app or KNI public API?
> 
> Config comes from the application, but the KNI public API has to validate if the
> request can be justified and return error if can't be.
> I think the KNI API check is required to be able to remove the check in the eal.
> 

If eal is enabled in iova=VA mode, the kni application can operate in that VA mode, right?
I did not understand the application's use case or requirement to configure/enforce PA mode
when eal is enabled with iova=VA mode.

How about using an rte_eal_iova_mode() == VA check in the kni application and the kni lib
to pass the device info (pci) and also for pool creation (with the no-page-split flag),
meaning all these patch changes will go under that check.

The current eal check (is the kni module inserted when the eal mode is VA) can also be
addressed by checking the linux version: if rte_eal_iova_mode() is VA but the kernel
version is < 4.4.0, PA mode can be enforced and everything will be configured in PA mode.

With the above approach, I think PA mode is still intact and there would not be any issues.
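
A sketch of the fallback described above, as it could look in eal.c (this is what the
v7 series below ends up doing with a compile-time kernel version check; illustrative
only):

	if (iova_mode == RTE_IOVA_VA &&
			rte_eal_check_module("rte_kni") == 1) {
#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
		/* KNI module cannot do IOVA=VA on this kernel, fall back */
		iova_mode = RTE_IOVA_PA;
		RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
#endif
	}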

> >
> >>
> >>>
> >>> <...>
> >>>
> >>>> @@ -186,7 +202,10 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
> >>>>                         return;
> >>>>
> >>>>                 for (i = 0; i < num_rx; i++) {
> >>>> -                     kva = pa2kva(kni->pa[i]);
> >>>> +                     if (likely(kni->iova_mode == 1))
> >>>> +                             kva = iova2kva(kni, kni->pa[i]);
> >>>> +                     else
> >>>> +                             kva = pa2kva(kni->pa[i]);
> >>>
> >>> To reduce the churn, what about updating the 'pa2kva()' and put the
> >>> "(kni->iova_mode == 1)" check there? Does it help? (not only
> >>> 'pa2kva()' but its friends also, and if it makes more sense agree to
> >>> rename the functions)
> >>>
> >>> No, in VA mode, kni->pa[i] points to iova address, pa2kva() of iova
> >>> address might crash, hence the if..else check is added.
> >>
> >> I understand that part.
> >> What I am suggestion is something like this:
> >>
> >> kva = common_function(kni, kni->pa[i]);
> >>
> >> ---
> >>
> >> common_function() {
> >> 	if (unlikely(kni->iova_mode == 1))
> >> 		return iova2kva(kni, kni->pa[i]);
> >> 	return pa2kva(kni->pa[i]);
> >> }
> >>
> >> To hide the check in the function and make code more readable
> >>
> >>>
> >>> And btw, why 'likely' case is "kni->iova_mode == 1"?
> >>>
> >>> no specific case other than branch predict, will remove this if it's
> >>> really harmful to PA mode.
> >


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support
  2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
                             ` (4 preceding siblings ...)
  2019-06-25 10:00           ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI Burakov, Anatoly
@ 2019-07-17  9:04           ` vattunuru
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries vattunuru
                               ` (4 more replies)
  5 siblings, 5 replies; 251+ messages in thread
From: vattunuru @ 2019-07-17  9:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

----
V7 Changes:
* Removed the previously proposed mempool flag and made the page boundary
checks the default in mempool populate(), except for objects bigger
than the page size.
* Removed the KNI example application related changes since the pool related
requirement is taken care of in the mempool lib.
* All PCI dev related info is moved under the rte_eal_iova_mode() == VA check.
* Added wrapper functions in the KNI module to hide IOVA checks and make
the address translation routines more readable.
* Updated the IOVA mode checks that were enforcing IOVA=PA mode when IOVA=VA
mode is enabled.

V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered
across page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new
mempool flag.

V5 changes:
* Fixed build issue with 32b build

V4 changes:
* Fixed build issues with older kernel versions
* This approach works only with kernel versions 4.4.0 and above

V3 Changes:
* Added a new approach to make kni work in IOVA=VA mode using the
iommu_iova_to_phys API.

Kiran Kumar K (1):
  kni: add IOVA=VA support in KNI module

Vamsi Attunuru (3):
  mempool: modify mempool populate() to skip objects from page
    boundaries
  kni: add IOVA = VA support in KNI lib
  kni: modify IOVA mode checks to support VA

 doc/guides/prog_guide/kernel_nic_interface.rst    |  8 +++
 kernel/linux/kni/compat.h                         |  4 ++
 kernel/linux/kni/kni_dev.h                        |  4 ++
 kernel/linux/kni/kni_misc.c                       | 71 ++++++++++++++++++++---
 kernel/linux/kni/kni_net.c                        | 59 ++++++++++++++-----
 lib/librte_eal/linux/eal/eal.c                    |  4 +-
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 +++
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 59 +++++++++++++++++--
 lib/librte_mempool/rte_mempool.c                  |  2 +-
 lib/librte_mempool/rte_mempool_ops_default.c      | 33 ++++++++++-
 12 files changed, 223 insertions(+), 31 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
@ 2019-07-17  9:04             ` vattunuru
  2019-07-17 13:36               ` Andrew Rybchenko
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 2/4] kni: add IOVA = VA support in KNI lib vattunuru
                               ` (3 subsequent siblings)
  4 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-07-17  9:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Currently the phys address of a mempool object populated by the default
mempool populate() routine may not be contiguous within that mbuf range.

The patch ensures that each object's phys address is contiguous by modifying
the default behaviour of mempool populate() to prevent objects from spanning
two pages, except if the size of the object is bigger than the size of a page.

Since the overhead after this modification is very minimal considering
hugepage sizes of 512M & 1G, the default behaviour is modified except for
object sizes bigger than the page size.
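
As a worked example of the behaviour added below (numbers are illustrative):
with 2M hugepages (hugepage_sz = 0x200000) and total_elt_sz = 0x980 (2432 bytes),
an object that would start at offset 0x1FF800 of a page would end at 0x200180 and
cross the page boundary; populate() now skips that slot, advances 'off' by
total_elt_sz and places the object at 0x200180, entirely inside the next page.
At most one element-sized hole is lost per page boundary.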

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_mempool/rte_mempool.c             |  2 +-
 lib/librte_mempool/rte_mempool_ops_default.c | 33 ++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7260ce0..1c48325 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -339,7 +339,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
 		(char *)vaddr + off,
 		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
-		len - off, mempool_add_elem, NULL);
+		len - off, mempool_add_elem, opaque);
 
 	/* not enough room to store one object */
 	if (i == 0) {
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc8..85da264 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -45,19 +45,48 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object falls on a page boundary, else returns 0 */
+static inline int
+mempool_check_obj_bounds(void *obj, uint64_t hugepage_sz, size_t elt_sz)
+{
+	uintptr_t page_end, elt_addr = (uintptr_t)obj;
+	uint32_t pg_shift = rte_bsf32(hugepage_sz);
+	uint64_t page_mask;
+
+	page_mask =  ~((1ull << pg_shift) - 1);
+	page_end = (elt_addr & page_mask) + hugepage_sz;
+
+	if (elt_addr + elt_sz > page_end)
+		return -1;
+
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
-	size_t off;
+	struct rte_memzone *mz = obj_cb_arg;
+	size_t total_elt_sz, off;
 	unsigned int i;
 	void *obj;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+
+		/* Skip page boundary check if element is bigger than page */
+		if (mz->hugepage_sz >= total_elt_sz) {
+			if (mempool_check_obj_bounds((char *)vaddr + off,
+						    mz->hugepage_sz,
+						    total_elt_sz) < 0) {
+				i--; /* Decrement count & skip this obj */
+				off += total_elt_sz;
+				continue;
+			}
+		}
+
 		off += mp->header_size;
 		obj = (char *)vaddr + off;
 		obj_cb(mp, obj_cb_arg, obj,
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v7 2/4] kni: add IOVA = VA support in KNI lib
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries vattunuru
@ 2019-07-17  9:04             ` vattunuru
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 3/4] kni: add IOVA=VA support in KNI module vattunuru
                               ` (2 subsequent siblings)
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-17  9:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

The current KNI implementation only operates in IOVA=PA mode; the patch adds
the required functionality in the KNI lib to support IOVA=VA mode.

The KNI kernel module requires device info to get the iommu domain related
information for IOVA address translations. The patch defines the device
related info in the rte_kni_device_info struct and passes the device info to
the kernel KNI module when IOVA=VA mode is enabled.
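
A minimal usage sketch from the application side (illustrative only; it assumes
the v7 lib below, which looks up the PCI address of the ethdev whose port id is
passed in conf.group_id, and MAX_PACKET_SZ is a placeholder for the application's
mbuf data size):

struct rte_kni_conf conf;
struct rte_kni *kni;

memset(&conf, 0, sizeof(conf));
snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%u", port_id);
conf.group_id = port_id;        /* lets the lib locate the PCI device */
conf.mbuf_size = MAX_PACKET_SZ;

kni = rte_kni_alloc(pktmbuf_pool, &conf, &ops);
if (kni == NULL)
	rte_exit(EXIT_FAILURE, "Failed to create kni interface\n");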

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++++
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 54 +++++++++++++++++++++++
 4 files changed, 64 insertions(+)

diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 37d9ee8..4fd8a90 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -111,6 +111,13 @@ struct rte_kni_device_info {
 	void * mbuf_va;
 	phys_addr_t mbuf_phys;
 
+	/* PCI info */
+	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+	uint8_t bus;                  /**< Device bus */
+	uint8_t devid;                /**< Device ID */
+	uint8_t function;             /**< Device function. */
+
 	uint16_t group_id;            /**< Group ID */
 	uint32_t core_id;             /**< core ID to bind for kernel thread */
 
@@ -121,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index cbd6599..ab15d10 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_kni.a
 
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
+CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
 
 EXPORT_MAP := rte_kni_version.map
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index 41fa2e3..fd46f87 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -9,3 +9,4 @@ version = 2
 sources = files('rte_kni.c')
 headers = files('rte_kni.h')
 deps += ['ethdev', 'pci']
+includes += include_directories('../../drivers/bus/pci')
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 4b51fb4..2cb653e 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -14,6 +14,7 @@
 #include <rte_spinlock.h>
 #include <rte_string_fns.h>
 #include <rte_ethdev.h>
+#include <rte_bus_pci.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_kni.h>
@@ -199,6 +200,26 @@ kni_release_mz(struct rte_kni *kni)
 	rte_memzone_free(kni->m_sync_addr);
 }
 
+static void
+kni_dev_pci_addr_get(struct rte_pci_addr *addr,
+		    struct rte_pci_id *id, uint16_t port_id)
+{
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+		*addr = pci_dev->addr;
+		*id = pci_dev->id;
+	}
+}
+
 struct rte_kni *
 rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	      const struct rte_kni_conf *conf,
@@ -247,6 +268,37 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 		kni->ops.port_id = UINT16_MAX;
 
 	memset(&dev_info, 0, sizeof(dev_info));
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		uint64_t page_sz = pktmbuf_pool->mz->hugepage_sz;
+		uint16_t port_id = conf->group_id;
+		struct rte_pci_addr addr = { 0 };
+		struct rte_pci_id id = { 0 };
+		size_t buf_sz;
+
+		kni_dev_pci_addr_get(&addr, &id, port_id);
+		dev_info.bus = addr.bus;
+		dev_info.devid = addr.devid;
+		dev_info.function = addr.function;
+		dev_info.vendor_id = id.vendor_id;
+		dev_info.device_id = id.device_id;
+
+		buf_sz = pktmbuf_pool->header_size + pktmbuf_pool->elt_size +
+			 pktmbuf_pool->trailer_size;
+
+		/* Return failure when mbuf size is bigger than page size,
+		 * because phys address of those mbuf might not be physically
+		 * contiguous and the KNI kernel module cannot translate those
+		 * mbuf's IOVA addresses.
+		 */
+		if (buf_sz > page_sz) {
+			RTE_LOG(ERR, KNI,
+				"KNI does not work in IOVA=VA mode when mbuf_sz > page_sz\n");
+			RTE_LOG(ERR, KNI, "buf_sz:0x%" PRIx64 " > ", buf_sz);
+			RTE_LOG(ERR, KNI, "page_sz:0x%" PRIx64 "\n", page_sz);
+			goto kni_fail;
+		}
+	}
 	dev_info.core_id = conf->core_id;
 	dev_info.force_bind = conf->force_bind;
 	dev_info.group_id = conf->group_id;
@@ -300,6 +352,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v7 3/4] kni: add IOVA=VA support in KNI module
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries vattunuru
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 2/4] kni: add IOVA = VA support in KNI lib vattunuru
@ 2019-07-17  9:04             ` vattunuru
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 4/4] kni: modify IOVA mode checks to support VA vattunuru
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-17  9:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	kirankumark, Vamsi Attunuru

From: Kiran Kumar K <kirankumark@marvell.com>

The patch adds support for the kernel module to work in IOVA = VA mode;
the idea is to get the physical address from the IOVA address using the
iommu_iova_to_phys API and later use the phys_to_virt API to
convert the physical address to a kernel virtual address.

When compared with IOVA = PA mode, there is no performance
drop with this approach.

This approach does not work with kernel versions lower
than 4.4.0 because of API compatibility issues.

The patch also updates these support details in the KNI documentation.
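
In essence, the translation the module performs in VA mode (see iova2kva() in the
diff below) is the composition of the two kernel APIs named above:

	/* IOVA -> physical address via the device's iommu domain, then PA -> KVA */
	kva = phys_to_virt(iommu_iova_to_phys(kni->domain, iova));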

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst |  8 +++
 kernel/linux/kni/compat.h                      |  4 ++
 kernel/linux/kni/kni_dev.h                     |  4 ++
 kernel/linux/kni/kni_misc.c                    | 71 ++++++++++++++++++++++----
 kernel/linux/kni/kni_net.c                     | 59 ++++++++++++++++-----
 5 files changed, 124 insertions(+), 22 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 38369b3..fd2ce63 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -291,6 +291,14 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI can be operated in IOVA as VA scheme when the following criteria are fulfilled
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
+- eal param `--iova-mode va` is passed or bus IOVA scheme is set to RTE_IOVA_VA
+
 Ethtool
 -------
 
diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..ee997a6 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,7 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 4, 0) <= LINUX_VERSION_CODE
+#define HAVE_IOVA_AS_VA_SUPPORT
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..d5898f3 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -25,6 +25,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 
 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -41,6 +42,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 2b75502..8660205 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -295,6 +295,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 
 	pr_info("Creating kni...\n");
 	/* Check the buffer size, to avoid warning */
@@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_AS_VA_SUPPORT
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		if (pci == NULL) {
+			pr_err("pci dev does not exist\n");
+			return -ENODEV;
+		}
+
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+
+		if (domain == NULL) {
+			pr_err("Failed to get pci dev domain info\n");
+			return -ENODEV;
+		}
+#else
+		pr_err("Kernel version does not support IOVA as VA\n");
+		return -EINVAL;
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index a736407..9387e4c 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,21 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+			   (uintptr_t)m->buf_physaddr) + m->data_off);
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +77,24 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +211,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +299,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +371,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +470,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->pkt_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +482,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +550,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +583,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v7 4/4] kni: modify IOVA mode checks to support VA
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
                               ` (2 preceding siblings ...)
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 3/4] kni: add IOVA=VA support in KNI module vattunuru
@ 2019-07-17  9:04             ` vattunuru
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-17  9:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch relaxes the checks in KNI and EAL that force IOVA=PA mode,
since the KNI kernel module supports IOVA=VA mode for kernel
versions >= 4.4.0.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_eal/linux/eal/eal.c | 4 +++-
 lib/librte_kni/rte_kni.c       | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 2e5499f..9705243 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1070,12 +1070,14 @@ rte_eal_init(int argc, char **argv)
 		/* Workaround for KNI which requires physical address to work */
 		if (iova_mode == RTE_IOVA_VA &&
 				rte_eal_check_module("rte_kni") == 1) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
 			if (phys_addrs) {
 				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
+				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
 			} else {
 				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
 			}
+#endif
 		}
 #endif
 		rte_eal_get_configuration()->iova_mode = iova_mode;
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 2cb653e..959088e 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -98,11 +98,6 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
-	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
-		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
-		return -1;
-	}
-
 	/* Check FD and open */
 	if (kni_fd < 0) {
 		kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries vattunuru
@ 2019-07-17 13:36               ` Andrew Rybchenko
  2019-07-17 13:47                 ` Olivier Matz
  2019-07-17 17:31                 ` Vamsi Krishna Attunuru
  0 siblings, 2 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-17 13:36 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov, kirankumark

On 7/17/19 12:04 PM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Currently the phys address of a mempool object populated by the mempool
> populate default() routine may not be contiguous within that mbuf range.
>
> Patch ensures that each object's phys address is contiguous by modifying
> the default behaviour of mempool populate() to prevent objects from being
> spread across 2 pages, except if the object size is bigger than the page size.
>
> Since the overhead after this modification will be very minimal considering
> the hugepage sizes of 512M & 1G, default behaviour is modified except for
> the object sizes bigger than the page size.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>

NACK

Looking at the MEMPOOL_F_NO_IOVA_CONTIG description I don't
understand why the patch is necessary at all. So, I'd like to
know more: the exact conditions, IOVA mode, hugepage sizes,
mempool flags and how the mempool is populated.

> ---
>   lib/librte_mempool/rte_mempool.c             |  2 +-
>   lib/librte_mempool/rte_mempool_ops_default.c | 33 ++++++++++++++++++++++++++--
>   2 files changed, 32 insertions(+), 3 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 7260ce0..1c48325 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -339,7 +339,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
>   	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
>   		(char *)vaddr + off,
>   		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
> -		len - off, mempool_add_elem, NULL);
> +		len - off, mempool_add_elem, opaque);

The last argument is the callback opaque value. mempool_add_elem()
does not use the opaque. But it is incorrect to use it for other purposes
and to require it to be a memzone.

>   	/* not enough room to store one object */
>   	if (i == 0) {
> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index 4e2bfc8..85da264 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -45,19 +45,48 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   	return mem_size;
>   }
>   
> +/* Returns -1 if object falls on a page boundary, else returns 0 */
> +static inline int
> +mempool_check_obj_bounds(void *obj, uint64_t hugepage_sz, size_t elt_sz)
> +{
> +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> +	uint32_t pg_shift = rte_bsf32(hugepage_sz);
> +	uint64_t page_mask;
> +
> +	page_mask =  ~((1ull << pg_shift) - 1);
> +	page_end = (elt_addr & page_mask) + hugepage_sz;
> +
> +	if (elt_addr + elt_sz > page_end)
> +		return -1;
> +
> +	return 0;
> +}
> +
>   int
>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
>   		void *vaddr, rte_iova_t iova, size_t len,
>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>   {
> -	size_t total_elt_sz;
> -	size_t off;
> +	struct rte_memzone *mz = obj_cb_arg;
> +	size_t total_elt_sz, off;

Why are the two variable declarations combined into one here? It is an
unrelated change.

>   	unsigned int i;
>   	void *obj;
>   
>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   
>   	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +
> +		/* Skip page boundary check if element is bigger than page */
> +		if (mz->hugepage_sz >= total_elt_sz) {
> +			if (mempool_check_obj_bounds((char *)vaddr + off,
> +						    mz->hugepage_sz,
> +						    total_elt_sz) < 0) {
> +				i--; /* Decrement count & skip this obj */
> +				off += total_elt_sz;
> +				continue;
> +			}
> +		}
> +

What I don't like here is that it makes one memory chunk insufficient
to populate all objects. I.e. we calculated the required memory chunk
size, but then skip some objects. Maybe it is not a problem, but it
breaks the logic existing in the code.

>   		off += mp->header_size;
>   		obj = (char *)vaddr + off;
>   		obj_cb(mp, obj_cb_arg, obj,



^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries
  2019-07-17 13:36               ` Andrew Rybchenko
@ 2019-07-17 13:47                 ` Olivier Matz
  2019-07-17 17:31                 ` Vamsi Krishna Attunuru
  1 sibling, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-07-17 13:47 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: vattunuru, dev, thomas, jerinj, ferruh.yigit, anatoly.burakov,
	kirankumark

On Wed, Jul 17, 2019 at 04:36:37PM +0300, Andrew Rybchenko wrote:
> On 7/17/19 12:04 PM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> > 
> > Currently the phys address of a mempool object populated by the mempool
> > populate default() routine may not be contiguous with in that mbuf range.
> > 
> > Patch ensures that each object's phys address is contiguous by modifying
> > default behaviour of mempool populate() to prevent objects from being
> > across 2 pages, expect if the size of object is bigger than size of page.
> > 
> > Since the overhead after this modification will be very minimal considering
> > the hugepage sizes of 512M & 1G, default behaviour is modified except for
> > the object sizes bigger than the page size.
> > 
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> 
> NACK
> 
> Looking at MEMPOOL_F_NO_IOVA_CONTIG description I don't
> understand why the patch is necessary at all. So, I'd like to
> know more. Exact conditions, IOVA mode, hugepage sizes,
> mempool flags and how it is populated.
> 
> > ---
> >   lib/librte_mempool/rte_mempool.c             |  2 +-
> >   lib/librte_mempool/rte_mempool_ops_default.c | 33 ++++++++++++++++++++++++++--
> >   2 files changed, 32 insertions(+), 3 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 7260ce0..1c48325 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -339,7 +339,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
> >   	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
> >   		(char *)vaddr + off,
> >   		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
> > -		len - off, mempool_add_elem, NULL);
> > +		len - off, mempool_add_elem, opaque);
> 
> The last argument is the callback opaque value. mempool_add_elem()
> does not use the opaque. But it is incorrect to use it for other purposes
> and require it to be memzone.
> 
> >   	/* not enough room to store one object */
> >   	if (i == 0) {
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > index 4e2bfc8..85da264 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -45,19 +45,48 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   	return mem_size;
> >   }
> > +/* Returns -1 if object falls on a page boundary, else returns 0 */
> > +static inline int
> > +mempool_check_obj_bounds(void *obj, uint64_t hugepage_sz, size_t elt_sz)
> > +{
> > +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> > +	uint32_t pg_shift = rte_bsf32(hugepage_sz);
> > +	uint64_t page_mask;
> > +
> > +	page_mask =  ~((1ull << pg_shift) - 1);
> > +	page_end = (elt_addr & page_mask) + hugepage_sz;
> > +
> > +	if (elt_addr + elt_sz > page_end)
> > +		return -1;
> > +
> > +	return 0;
> > +}
> > +
> >   int
> >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
> >   		void *vaddr, rte_iova_t iova, size_t len,
> >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >   {
> > -	size_t total_elt_sz;
> > -	size_t off;
> > +	struct rte_memzone *mz = obj_cb_arg;
> > +	size_t total_elt_sz, off;
> 
> Why two variables are combined into one here? It is unrelated change.
> 
> >   	unsigned int i;
> >   	void *obj;
> >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >   	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +
> > +		/* Skip page boundary check if element is bigger than page */
> > +		if (mz->hugepage_sz >= total_elt_sz) {
> > +			if (mempool_check_obj_bounds((char *)vaddr + off,
> > +						    mz->hugepage_sz,
> > +						    total_elt_sz) < 0) {
> > +				i--; /* Decrement count & skip this obj */
> > +				off += total_elt_sz;
> > +				continue;
> > +			}
> > +		}
> > +
> 
> What I don't like here is that it makes one memory chunk insufficient
> to populate all objects. I.e. we calculated memory chunk size required,
> but skipped some object. May be it is not a problem, but breaks the
> logic existing in the code.

+1

We should not skip the object, but realign it to the next page. This
way it won't break the logic, because this is what is expected when
required space is calculated in rte_mempool_op_calc_mem_size_default().
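
A minimal sketch of that realignment inside rte_mempool_op_populate_default()
(illustrative only; it reuses the mempool_check_obj_bounds() helper from the
patch under review, with pg_sz standing for the page size, and is essentially
what the RFC posted later in this thread ends up doing):

	/* inside the populate loop, before placing the object at "off" */
	if (mempool_check_obj_bounds((char *)vaddr + off,
				     pg_sz, total_elt_sz) < 0) {
		/* move to the start of the next page instead of skipping
		 * the object and decrementing the count
		 */
		off += RTE_PTR_ALIGN_CEIL((char *)vaddr + off, pg_sz) -
			((char *)vaddr + off);
	}
	if (off + total_elt_sz > len)
		break;	/* this memory chunk is exhausted */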

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries
  2019-07-17 13:36               ` Andrew Rybchenko
  2019-07-17 13:47                 ` Olivier Matz
@ 2019-07-17 17:31                 ` Vamsi Krishna Attunuru
  2019-07-18  9:28                   ` Andrew Rybchenko
  1 sibling, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-17 17:31 UTC (permalink / raw)
  To: Andrew Rybchenko, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Wednesday, July 17, 2019 7:07 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool
> populate() to skip objects from page boundaries
> 
> On 7/17/19 12:04 PM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Currently the phys address of a mempool object populated by the
> > mempool populate default() routine may not be contiguous with in that
> mbuf range.
> >
> > Patch ensures that each object's phys address is contiguous by
> > modifying default behaviour of mempool populate() to prevent objects
> > from being across 2 pages, expect if the size of object is bigger than size of
> page.
> >
> > Since the overhead after this modification will be very minimal
> > considering the hugepage sizes of 512M & 1G, default behaviour is
> > modified except for the object sizes bigger than the page size.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> 
> NACK
> 
> Looking at MEMPOOL_F_NO_IOVA_CONTIG description I don't understand
> why the patch is necessary at all. So, I'd like to know more. Exact conditions,
> IOVA mode, hugepage sizes, mempool flags and how it is populated.
> 

I presume the commit log clarifies the changes in the patch; please correct me if it's not clear. The requirement is to create a mempool in which each object's phys address is contiguous within its own range. Having the flexibility to create such mempools helps applications like KNI operate in IOVA=VA mode, where the KNI kernel module can safely translate IOVA addresses to PA.

Regarding the exact conditions that drove this approach: when IOVA mode is set to VA and a hugepage size of 2MB/512MB is used, the KNI application creates its mempool without any flags set, so the mempool populate routine reserves IOVA-contiguous memzones and ends up placing some objects across two pages (over a page boundary). Those mbufs' phys addresses may not be contiguous, and such mempools are not suitable for operating KNI in IOVA=VA mode.
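
To illustrate the constraint (a sketch based on the kva2data_kva() helper from
the KNI kernel module earlier in this thread, not code from this patch): the
kernel side computes a single virtual address for the mbuf data and then walks
it linearly, which is only safe if the whole object is physically contiguous.

	/* simplified view of the KNI kernel receive path */
	data_kva = phys_to_virt(m->buf_physaddr + m->data_off);
	memcpy(skb->data, data_kva, len);	/* len = kva->pkt_len; copies
						 * len bytes linearly: if the
						 * buffer straddles two pages
						 * whose physical addresses are
						 * not adjacent, the bytes past
						 * the boundary come from
						 * unrelated memory
						 */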

> > ---
> >   lib/librte_mempool/rte_mempool.c             |  2 +-
> >   lib/librte_mempool/rte_mempool_ops_default.c | 33
> ++++++++++++++++++++++++++--
> >   2 files changed, 32 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index 7260ce0..1c48325 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -339,7 +339,7 @@ rte_mempool_populate_iova(struct rte_mempool
> *mp, char *vaddr,
> >   	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
> >   		(char *)vaddr + off,
> >   		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
> > -		len - off, mempool_add_elem, NULL);
> > +		len - off, mempool_add_elem, opaque);
> 
> The last argument is the callback opaque value. mempool_add_elem() does
> not use the opaque. But it is incorrect to use it for other purposes and
> require it to be memzone.

To avoid changing multiple mempool APIs just to add a new variable, the opaque argument has been leveraged here. I think there is no harm in this approach, since it carries the relevant info and is quite suitable for passing between the (*mempool_populate_t) calls.

> 
> >   	/* not enough room to store one object */
> >   	if (i == 0) {
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c
> > b/lib/librte_mempool/rte_mempool_ops_default.c
> > index 4e2bfc8..85da264 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -45,19 +45,48 @@ rte_mempool_op_calc_mem_size_default(const
> struct rte_mempool *mp,
> >   	return mem_size;
> >   }
> >
> > +/* Returns -1 if object falls on a page boundary, else returns 0 */
> > +static inline int mempool_check_obj_bounds(void *obj, uint64_t
> > +hugepage_sz, size_t elt_sz) {
> > +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> > +	uint32_t pg_shift = rte_bsf32(hugepage_sz);
> > +	uint64_t page_mask;
> > +
> > +	page_mask =  ~((1ull << pg_shift) - 1);
> > +	page_end = (elt_addr & page_mask) + hugepage_sz;
> > +
> > +	if (elt_addr + elt_sz > page_end)
> > +		return -1;
> > +
> > +	return 0;
> > +}
> > +
> >   int
> >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned
> int max_objs,
> >   		void *vaddr, rte_iova_t iova, size_t len,
> >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >   {
> > -	size_t total_elt_sz;
> > -	size_t off;
> > +	struct rte_memzone *mz = obj_cb_arg;
> > +	size_t total_elt_sz, off;
> 
> Why two variables are combined into one here? It is unrelated change.
> 
> >   	unsigned int i;
> >   	void *obj;
> >
> >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >
> >   	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs;
> > i++) {
> > +
> > +		/* Skip page boundary check if element is bigger than page */
> > +		if (mz->hugepage_sz >= total_elt_sz) {
> > +			if (mempool_check_obj_bounds((char *)vaddr + off,
> > +						    mz->hugepage_sz,
> > +						    total_elt_sz) < 0) {
> > +				i--; /* Decrement count & skip this obj */
> > +				off += total_elt_sz;
> > +				continue;
> > +			}
> > +		}
> > +
> 
> What I don't like here is that it makes one memory chunk insufficient to
> populate all objects. I.e. we calculated memory chunk size required, but
> skipped some object. May be it is not a problem, but breaks the logic existing
> in the code.
> 
> >   		off += mp->header_size;
> >   		obj = (char *)vaddr + off;
> >   		obj_cb(mp, obj_cb_arg, obj,
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries
  2019-07-17 17:31                 ` Vamsi Krishna Attunuru
@ 2019-07-18  9:28                   ` Andrew Rybchenko
  2019-07-18 14:16                     ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-18  9:28 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda

On 7/17/19 8:31 PM, Vamsi Krishna Attunuru wrote:
>> -----Original Message-----
>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>> Sent: Wednesday, July 17, 2019 7:07 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>> olivier.matz@6wind.com; ferruh.yigit@intel.com;
>> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
>> <kirankumark@marvell.com>
>> Subject: Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool
>> populate() to skip objects from page boundaries
>>
>> On 7/17/19 12:04 PM, vattunuru@marvell.com wrote:
>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> Currently the phys address of a mempool object populated by the
>>> mempool populate default() routine may not be contiguous with in that
>> mbuf range.
>>> Patch ensures that each object's phys address is contiguous by
>>> modifying default behaviour of mempool populate() to prevent objects
>>> from being across 2 pages, expect if the size of object is bigger than size of
>> page.
>>> Since the overhead after this modification will be very minimal
>>> considering the hugepage sizes of 512M & 1G, default behaviour is
>>> modified except for the object sizes bigger than the page size.
>>>
>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>> NACK
>>
>> Looking at MEMPOOL_F_NO_IOVA_CONTIG description I don't understand
>> why the patch is necessary at all. So, I'd like to know more. Exact conditions,
>> IOVA mode, hugepage sizes, mempool flags and how it is populated.
>>
> I presume the commit log clarifies the  changes in the patch, pls correct me if it's not clear. The requirement is to create mempool of objects that each object's phys address in contiguous with in it's range,  having flexibility to create such mempools helpful to the applications like KNI to operate in IOVA=VA mode where KNI kernel mode can safely translate IOVA addresses to PA.

As I understand it, this breaks the rte_mempool_populate_default()
logic, which assumes that page boundaries may be ignored in IOVA=VA
mode (see no_pageshift =  ... and the description above it in the function).

Sorry, right now I can't come up with the right fix, but as
explained below the suggested one is wrong.

> Regarding the exact conditions driven this approach are, when IOVA mode is set to VA and huge page size of 2MB/512MB used, since KNI application creates mempool without any flags set, mempool populate routine tends to reserve iova contiguous memzones and finally it ends up populating mempool with some objects that might being across two pages(over the page boundary), those mbuf's phys address might not be contiguous and these mempool are not suitable for operating KNI in IOVA=VA mode.
>
>>> ---
>>>    lib/librte_mempool/rte_mempool.c             |  2 +-
>>>    lib/librte_mempool/rte_mempool_ops_default.c | 33
>> ++++++++++++++++++++++++++--
>>>    2 files changed, 32 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/lib/librte_mempool/rte_mempool.c
>>> b/lib/librte_mempool/rte_mempool.c
>>> index 7260ce0..1c48325 100644
>>> --- a/lib/librte_mempool/rte_mempool.c
>>> +++ b/lib/librte_mempool/rte_mempool.c
>>> @@ -339,7 +339,7 @@ rte_mempool_populate_iova(struct rte_mempool
>> *mp, char *vaddr,
>>>    	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
>>>    		(char *)vaddr + off,
>>>    		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
>>> -		len - off, mempool_add_elem, NULL);
>>> +		len - off, mempool_add_elem, opaque);
>> The last argument is the callback opaque value. mempool_add_elem() does
>> not use the opaque. But it is incorrect to use it for other purposes and
>> require it to be memzone.
> To avoid multiple changes in the mempool APIs for adding new variable, opaque has been leveraged here. I think there is no harm in having this approach since it carries the relevant info and quite suitable between the (*mempool_populate_t) calls.

Sorry, but it forces any caller of rte_mempool_populate_iova()
(and rte_mempool_populate_virt()) to pass a memzone as the opaque,
while by API definition it is the free_cb argument. So, if a different
free_cb is used, a different opaque may be required.
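
To illustrate the pairing (the second call is purely hypothetical; my_free_cb
and my_ctx are made-up names, not DPDK APIs):

	/* default populate path: the chunk comes from a memzone, so the
	 * matching free callback receives that memzone as its opaque
	 */
	rte_mempool_populate_iova(mp, mz->addr, mz->iova, mz->len,
				  rte_mempool_memchunk_mz_free,
				  (void *)(uintptr_t)mz);

	/* a caller populating from externally allocated memory may pass a
	 * completely different pair, e.g. its own free routine and context
	 */
	rte_mempool_populate_iova(mp, ext_addr, ext_iova, ext_len,
				  my_free_cb, my_ctx);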

>>>    	/* not enough room to store one object */
>>>    	if (i == 0) {
>>> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c
>>> b/lib/librte_mempool/rte_mempool_ops_default.c
>>> index 4e2bfc8..85da264 100644
>>> --- a/lib/librte_mempool/rte_mempool_ops_default.c
>>> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
>>> @@ -45,19 +45,48 @@ rte_mempool_op_calc_mem_size_default(const
>> struct rte_mempool *mp,
>>>    	return mem_size;
>>>    }
>>>
>>> +/* Returns -1 if object falls on a page boundary, else returns 0 */
>>> +static inline int mempool_check_obj_bounds(void *obj, uint64_t
>>> +hugepage_sz, size_t elt_sz) {
>>> +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
>>> +	uint32_t pg_shift = rte_bsf32(hugepage_sz);
>>> +	uint64_t page_mask;
>>> +
>>> +	page_mask =  ~((1ull << pg_shift) - 1);
>>> +	page_end = (elt_addr & page_mask) + hugepage_sz;
>>> +
>>> +	if (elt_addr + elt_sz > page_end)
>>> +		return -1;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>    int
>>>    rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned
>> int max_objs,
>>>    		void *vaddr, rte_iova_t iova, size_t len,
>>>    		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>>>    {
>>> -	size_t total_elt_sz;
>>> -	size_t off;
>>> +	struct rte_memzone *mz = obj_cb_arg;

Moreover, the change forces obj_cb_arg to be a memzone, but it is
the obj_cb argument, and obj_cb defines what should be in
obj_cb_arg.

>>> +	size_t total_elt_sz, off;
>> Why two variables are combined into one here? It is unrelated change.
>>
>>>    	unsigned int i;
>>>    	void *obj;
>>>
>>>    	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>>>
>>>    	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs;
>>> i++) {
>>> +
>>> +		/* Skip page boundary check if element is bigger than page */
>>> +		if (mz->hugepage_sz >= total_elt_sz) {
>>> +			if (mempool_check_obj_bounds((char *)vaddr + off,
>>> +						    mz->hugepage_sz,
>>> +						    total_elt_sz) < 0) {
>>> +				i--; /* Decrement count & skip this obj */
>>> +				off += total_elt_sz;
>>> +				continue;
>>> +			}
>>> +		}
>>> +
>> What I don't like here is that it makes one memory chunk insufficient to
>> populate all objects. I.e. we calculated memory chunk size required, but
>> skipped some object. May be it is not a problem, but breaks the logic existing
>> in the code.
>>
>>>    		off += mp->header_size;
>>>    		obj = (char *)vaddr + off;
>>>    		obj_cb(mp, obj_cb_arg, obj,

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries
  2019-07-18  9:28                   ` Andrew Rybchenko
@ 2019-07-18 14:16                     ` Vamsi Krishna Attunuru
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-18 14:16 UTC (permalink / raw)
  To: olivier.matz, Andrew Rybchenko, dev, olivier.matz
  Cc: thomas, Jerin Jacob Kollanukkaran, ferruh.yigit, anatoly.burakov,
	Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Thursday, July 18, 2019 2:58 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool
> populate() to skip objects from page boundaries
> 
> On 7/17/19 8:31 PM, Vamsi Krishna Attunuru wrote:
> >> -----Original Message-----
> >> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >> Sent: Wednesday, July 17, 2019 7:07 PM
> >> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> >> <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> >> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> >> <kirankumark@marvell.com>
> >> Subject: Re: [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool
> >> populate() to skip objects from page boundaries
> >>
> >> On 7/17/19 12:04 PM, vattunuru@marvell.com wrote:
> >>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>
> >>> Currently the phys address of a mempool object populated by the
> >>> mempool populate default() routine may not be contiguous with in
> >>> that
> >> mbuf range.
> >>> Patch ensures that each object's phys address is contiguous by
> >>> modifying default behaviour of mempool populate() to prevent objects
> >>> from being across 2 pages, expect if the size of object is bigger
> >>> than size of
> >> page.
> >>> Since the overhead after this modification will be very minimal
> >>> considering the hugepage sizes of 512M & 1G, default behaviour is
> >>> modified except for the object sizes bigger than the page size.
> >>>
> >>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >> NACK
> >>
> >> Looking at MEMPOOL_F_NO_IOVA_CONTIG description I don't
> understand
> >> why the patch is necessary at all. So, I'd like to know more. Exact
> >> conditions, IOVA mode, hugepage sizes, mempool flags and how it is
> populated.
> >>
> > I presume the commit log clarifies the  changes in the patch, pls correct me
> if it's not clear. The requirement is to create mempool of objects that each
> object's phys address in contiguous with in it's range,  having flexibility to
> create such mempools helpful to the applications like KNI to operate in
> IOVA=VA mode where KNI kernel mode can safely translate IOVA addresses
> to PA.
> 
> As I understand it breaks rte_mempool_populate_default() logic which
> assumes that page boundaries may be ignored in IOVA=VA mode (see
> no_pageshift =  ... and above description in the function).
> 
> Sorry, right now I can't come up with the right fix, but as explained below
> suggested is wrong.

It would be nice if it was also explained how this approach goes wrong. Earlier we had a flag-based approach and later made it the default, as Olivier suggested. If there is no other way to fix it, can we stick with this approach or roll back to the earlier flag-based one?

@Olivier,
Any suggestions..?
 
> 
> > Regarding the exact conditions driven this approach are, when IOVA mode
> is set to VA and huge page size of 2MB/512MB used, since KNI application
> creates mempool without any flags set, mempool populate routine tends to
> reserve iova contiguous memzones and finally it ends up populating
> mempool with some objects that might being across two pages(over the page
> boundary), those mbuf's phys address might not be contiguous and these
> mempool are not suitable for operating KNI in IOVA=VA mode.
> >
> >>> ---
> >>>    lib/librte_mempool/rte_mempool.c             |  2 +-
> >>>    lib/librte_mempool/rte_mempool_ops_default.c | 33
> >> ++++++++++++++++++++++++++--
> >>>    2 files changed, 32 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/lib/librte_mempool/rte_mempool.c
> >>> b/lib/librte_mempool/rte_mempool.c
> >>> index 7260ce0..1c48325 100644
> >>> --- a/lib/librte_mempool/rte_mempool.c
> >>> +++ b/lib/librte_mempool/rte_mempool.c
> >>> @@ -339,7 +339,7 @@ rte_mempool_populate_iova(struct
> rte_mempool
> >> *mp, char *vaddr,
> >>>    	i = rte_mempool_ops_populate(mp, mp->size - mp->populated_size,
> >>>    		(char *)vaddr + off,
> >>>    		(iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off),
> >>> -		len - off, mempool_add_elem, NULL);
> >>> +		len - off, mempool_add_elem, opaque);
> >> The last argument is the callback opaque value. mempool_add_elem()
> >> does not use the opaque. But it is incorrect to use it for other
> >> purposes and require it to be memzone.
> > To avoid multiple changes in the mempool APIs for adding new variable,
> opaque has been leveraged here. I think there is no harm in having this
> approach since it carries the relevant info and quite suitable between the
> (*mempool_populate_t) calls.
> 
> Sorry, but it enforces any caller of rte_mempool_populate_iova() (and
> rte_mempool_populate_virt()) pass memzone as opaque, but by API
> definition it is free_cb argument. So, if different free_cb is used, different
> opaque may be required.

This approach keeps the changes simple and avoids touching multiple places; can this parameter usage be documented to avoid confusion?
Otherwise there will be lots of changes in multiple APIs for passing the mz info; if everyone is fine with those changes, I will make the required changes and send the next version.

> 
> >>>    	/* not enough room to store one object */
> >>>    	if (i == 0) {
> >>> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c
> >>> b/lib/librte_mempool/rte_mempool_ops_default.c
> >>> index 4e2bfc8..85da264 100644
> >>> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> >>> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> >>> @@ -45,19 +45,48 @@ rte_mempool_op_calc_mem_size_default(const
> >> struct rte_mempool *mp,
> >>>    	return mem_size;
> >>>    }
> >>>
> >>> +/* Returns -1 if object falls on a page boundary, else returns 0 */
> >>> +static inline int mempool_check_obj_bounds(void *obj, uint64_t
> >>> +hugepage_sz, size_t elt_sz) {
> >>> +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> >>> +	uint32_t pg_shift = rte_bsf32(hugepage_sz);
> >>> +	uint64_t page_mask;
> >>> +
> >>> +	page_mask =  ~((1ull << pg_shift) - 1);
> >>> +	page_end = (elt_addr & page_mask) + hugepage_sz;
> >>> +
> >>> +	if (elt_addr + elt_sz > page_end)
> >>> +		return -1;
> >>> +
> >>> +	return 0;
> >>> +}
> >>> +
> >>>    int
> >>>    rte_mempool_op_populate_default(struct rte_mempool *mp,
> unsigned
> >> int max_objs,
> >>>    		void *vaddr, rte_iova_t iova, size_t len,
> >>>    		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >>>    {
> >>> -	size_t total_elt_sz;
> >>> -	size_t off;
> >>> +	struct rte_memzone *mz = obj_cb_arg;
> 
> Moreover, the change enforces obj_cb_arg to be memzone, but it is obj_cb
> argument and obj_cb defines what should be in the obj_cb_arg.

Will revert this while sending next version.

> 
> >>> +	size_t total_elt_sz, off;
> >> Why two variables are combined into one here? It is unrelated change.
> >>
> >>>    	unsigned int i;
> >>>    	void *obj;
> >>>
> >>>    	total_elt_sz = mp->header_size + mp->elt_size +
> >>> mp->trailer_size;
> >>>
> >>>    	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs;
> >>> i++) {
> >>> +
> >>> +		/* Skip page boundary check if element is bigger than page */
> >>> +		if (mz->hugepage_sz >= total_elt_sz) {
> >>> +			if (mempool_check_obj_bounds((char *)vaddr + off,
> >>> +						    mz->hugepage_sz,
> >>> +						    total_elt_sz) < 0) {
> >>> +				i--; /* Decrement count & skip this obj */
> >>> +				off += total_elt_sz;
> >>> +				continue;
> >>> +			}
> >>> +		}
> >>> +
> >> What I don't like here is that it makes one memory chunk insufficient
> >> to populate all objects. I.e. we calculated memory chunk size
> >> required, but skipped some object. May be it is not a problem, but
> >> breaks the logic existing in the code.
> >>
> >>>    		off += mp->header_size;
> >>>    		obj = (char *)vaddr + off;
> >>>    		obj_cb(mp, obj_cb_arg, obj,

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages
  2019-07-18 14:16                     ` Vamsi Krishna Attunuru
@ 2019-07-19 13:38                       ` Olivier Matz
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 1/4] mempool: clarify default populate function Olivier Matz
                                           ` (9 more replies)
  0 siblings, 10 replies; 251+ messages in thread
From: Olivier Matz @ 2019-07-19 13:38 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

When IOVA mode is VA, a mempool can be created with objects that
are not physically contiguous, which breaks KNI.

To solve this, this patchset changes the default behavior of the
mempool populate function, to prevent objects from being located
across pages.

Olivier Matz (4):
  mempool: clarify default populate function
  mempool: unalign size when calculating required mem amount
  mempool: introduce function to get mempool page size
  mempool: prevent objects from being across pages

 lib/librte_mempool/rte_mempool.c             | 106 +++++++++++----------------
 lib/librte_mempool/rte_mempool.h             |   8 +-
 lib/librte_mempool/rte_mempool_ops.c         |   4 +-
 lib/librte_mempool/rte_mempool_ops_default.c |  39 +++++++++-
 4 files changed, 90 insertions(+), 67 deletions(-)

---

Hi,

> @Olivier,
> Any suggestions..?

I took some time to go a bit deeper. I still think we can change the
default behavior to avoid objects being located across pages. But
it is more complex than I expected.

I made a draft patchset, that, in short:
- cleans/renames variables
- removes the optimistic full iova contiguous allocation
- changes return value of calc_mem_size to return the unaligned size,
  therefore the allocation is smaller in case of big hugepages
- changes rte_mempool_op_populate_default() to prevent allocation
  of objects across multiple pages

Andrew, Anatoly, did I miss something?
Vamsi, can you check if it solves your issue?

Anyway, even if we validate the patchset and make it work, I'm afraid
this is not something that could go into 19.08.

The only alternative I see is a specific mempool allocation function
when used in iova=va mode + kni, as you proposed previously.

It can probably be implemented without adding a flag, starting from
rte_mempool_create(), and replacing rte_mempool_populate_default(mp) by
something else: allocate pages one by one, and call
rte_mempool_populate_iova() for each of them.
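
A rough sketch of what that could look like (purely illustrative: the function
and callback names are made up, the page count handling and error paths are
simplified, and pg_sz would come from something like get_min_page_size()):

	static void
	kni_memzone_free_cb(struct rte_mempool_memhdr *memhdr, void *opaque)
	{
		RTE_SET_USED(memhdr);
		/* the opaque registered below is the backing memzone */
		rte_memzone_free(opaque);
	}

	static int
	kni_pool_populate_by_page(struct rte_mempool *mp, size_t pg_sz)
	{
		char mz_name[RTE_MEMZONE_NAMESIZE];
		const struct rte_memzone *mz;
		unsigned int mz_id = 0;
		int ret;

		while (mp->populated_size < mp->size) {
			snprintf(mz_name, sizeof(mz_name), "%s_%u",
				 mp->name, mz_id++);
			/* reserve exactly one page, aligned on a page, so no
			 * object in it can straddle a page boundary
			 */
			mz = rte_memzone_reserve_aligned(mz_name, pg_sz,
					mp->socket_id,
					RTE_MEMZONE_IOVA_CONTIG, pg_sz);
			if (mz == NULL)
				return -rte_errno;

			ret = rte_mempool_populate_iova(mp, mz->addr, mz->iova,
					mz->len, kni_memzone_free_cb,
					(void *)(uintptr_t)mz);
			if (ret < 0) {
				rte_memzone_free(mz);
				return ret;
			}
		}
		return 0;
	}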

Hope it helps. Unfortunately, I may not have too much time to spend on
it in the coming days.

Regards,
Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [RFC 1/4] mempool: clarify default populate function
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
@ 2019-07-19 13:38                         ` Olivier Matz
  2019-07-19 15:42                           ` Andrew Rybchenko
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 2/4] mempool: unalign size when calculating required mem amount Olivier Matz
                                           ` (8 subsequent siblings)
  9 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-07-19 13:38 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

No functional change. Clarify the populate function to make
the next commit easier to understand.

Rename the variables:
- to avoid negation in the name
- to have more understandable names

Remove useless variable (no_pageshift is equivalent to pg_sz == 0).

Remove duplicate assignment of the "external" variable.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c | 50 +++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7260ce0be..0f29e8712 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -429,24 +429,18 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	rte_iova_t iova;
 	unsigned mz_id, n;
 	int ret;
-	bool no_contig, try_contig, no_pageshift, external;
+	bool need_iova_contig_obj;
+	bool try_iova_contig_mempool;
+	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
 		return ret;
 
-	/* check if we can retrieve a valid socket ID */
-	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
-	if (ret < 0)
-		return -EINVAL;
-	external = ret;
-
 	/* mempool must not be populated */
 	if (mp->nb_mem_chunks != 0)
 		return -EEXIST;
 
-	no_contig = mp->flags & MEMPOOL_F_NO_IOVA_CONTIG;
-
 	/*
 	 * the following section calculates page shift and page size values.
 	 *
@@ -496,16 +490,23 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * to go for contiguous memory even if we're in no-huge mode, because
 	 * external memory may in fact be IOVA-contiguous.
 	 */
-	external = rte_malloc_heap_socket_is_external(mp->socket_id) == 1;
-	no_pageshift = no_contig ||
-			(!external && rte_eal_iova_mode() == RTE_IOVA_VA);
-	try_contig = !no_contig && !no_pageshift &&
-			(rte_eal_has_hugepages() || external);
 
-	if (no_pageshift) {
+	/* check if we can retrieve a valid socket ID */
+	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+	if (ret < 0)
+		return -EINVAL;
+	alloc_in_ext_mem = (ret == 1);
+	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+	try_iova_contig_mempool = false;
+
+	if (!need_iova_contig_obj) {
+		pg_sz = 0;
+		pg_shift = 0;
+	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
 		pg_sz = 0;
 		pg_shift = 0;
-	} else if (try_contig) {
+	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
+		try_iova_contig_mempool = true;
 		pg_sz = get_min_page_size(mp->socket_id);
 		pg_shift = rte_bsf32(pg_sz);
 	} else {
@@ -517,7 +518,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		size_t min_chunk_size;
 		unsigned int flags;
 
-		if (try_contig || no_pageshift)
+		if (try_iova_contig_mempool || pg_sz == 0)
 			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
 					0, &min_chunk_size, &align);
 		else
@@ -541,7 +542,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		/* if we're trying to reserve contiguous memory, add appropriate
 		 * memzone flag.
 		 */
-		if (try_contig)
+		if (try_iova_contig_mempool)
 			flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
@@ -551,8 +552,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		 * minimum required contiguous chunk fits minimum page, adjust
 		 * memzone size to the page size, and try again.
 		 */
-		if (mz == NULL && try_contig && min_chunk_size <= pg_sz) {
-			try_contig = false;
+		if (mz == NULL && try_iova_contig_mempool &&
+				min_chunk_size <= pg_sz) {
+			try_iova_contig_mempool = false;
 			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
 
 			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
@@ -587,12 +589,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
-		if (no_contig)
-			iova = RTE_BAD_IOVA;
-		else
+		if (need_iova_contig_obj)
 			iova = mz->iova;
+		else
+			iova = RTE_BAD_IOVA;
 
-		if (no_pageshift || try_contig)
+		if (try_iova_contig_mempool || pg_sz == 0)
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [RFC 2/4] mempool: unalign size when calculating required mem amount
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 1/4] mempool: clarify default populate function Olivier Matz
@ 2019-07-19 13:38                         ` Olivier Matz
  2019-08-07 15:21                           ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size Olivier Matz
                                           ` (7 subsequent siblings)
  9 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-07-19 13:38 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

The size returned by rte_mempool_op_calc_mem_size_default() is aligned
to the specified page size. This means that with big pages, the returned
amount is more than what we really need to populate the mempool.
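For example (illustrative numbers): with 1GB hugepages, a pool whose objects
need only about 10MB ends up with a calculated size rounded up to a full 1GB,
which is the "wasting a 1G page on a 10MB memzone" case mentioned in the
comment removed below.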

This problem is tempered by the allocation method of
rte_mempool_populate_default(): in some conditions (when
try_iova_contig_mempool=true), it first tries to allocate all the
objects' memory in one iova-contiguous area, without the alignment
constraint. If that fails, it falls back to the big aligned allocation,
which can itself fall back to several smaller allocations.

This commit changes rte_mempool_op_calc_mem_size_default() to return the
unaligned amount of memory (the alignment constraint is still returned
via the *align argument), and removes the optimistic contiguous
allocation done when try_iova_contig_mempool=true.

This will make the amount of allocated memory more predictible: it will
be more than the optimistic contiguous allocation, but less than the big
aligned allocation.

This opens the door for the next commits that will try to prevent objets
from being located across pages.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c     | 44 ++++--------------------------------
 lib/librte_mempool/rte_mempool.h     |  2 +-
 lib/librte_mempool/rte_mempool_ops.c |  4 +++-
 3 files changed, 9 insertions(+), 41 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 0f29e8712..335032dc8 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -430,7 +430,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool try_iova_contig_mempool;
 	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
@@ -477,18 +476,10 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * wasting some space this way, but it's much nicer than looping around
 	 * trying to reserve each and every page size.
 	 *
-	 * However, since size calculation will produce page-aligned sizes, it
-	 * makes sense to first try and see if we can reserve the entire memzone
-	 * in one contiguous chunk as well (otherwise we might end up wasting a
-	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
-	 * memory, then we'll go and reserve space page-by-page.
-	 *
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
 	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold. We should also try
-	 * to go for contiguous memory even if we're in no-huge mode, because
-	 * external memory may in fact be IOVA-contiguous.
+	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	/* check if we can retrieve a valid socket ID */
@@ -497,7 +488,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		return -EINVAL;
 	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
-	try_iova_contig_mempool = false;
 
 	if (!need_iova_contig_obj) {
 		pg_sz = 0;
@@ -506,7 +496,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		pg_sz = 0;
 		pg_shift = 0;
 	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		try_iova_contig_mempool = true;
 		pg_sz = get_min_page_size(mp->socket_id);
 		pg_shift = rte_bsf32(pg_sz);
 	} else {
@@ -518,12 +507,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		size_t min_chunk_size;
 		unsigned int flags;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					0, &min_chunk_size, &align);
-		else
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
+		mem_size = rte_mempool_ops_calc_mem_size(
+			mp, n, pg_shift, &min_chunk_size, &align);
 
 		if (mem_size < 0) {
 			ret = mem_size;
@@ -542,31 +527,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		/* if we're trying to reserve contiguous memory, add appropriate
 		 * memzone flag.
 		 */
-		if (try_iova_contig_mempool)
+		if (min_chunk_size == (size_t)mem_size)
 			flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
 				mp->socket_id, flags, align);
 
-		/* if we were trying to allocate contiguous memory, failed and
-		 * minimum required contiguous chunk fits minimum page, adjust
-		 * memzone size to the page size, and try again.
-		 */
-		if (mz == NULL && try_iova_contig_mempool &&
-				min_chunk_size <= pg_sz) {
-			try_iova_contig_mempool = false;
-			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
-			if (mem_size < 0) {
-				ret = mem_size;
-				goto fail;
-			}
-
-			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
@@ -594,7 +560,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		else
 			iova = RTE_BAD_IOVA;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
+		if (pg_sz == 0)
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a04..7bc10e699 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
  * @param[out] align
  *   Location for required memory chunk alignment.
  * @return
- *   Required memory size aligned at page boundary.
+ *   Required memory size.
  */
 typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		uint32_t obj_num,  uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index e02eb702c..22c5251eb 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to notify new memory area to external mempool */
+/* wrapper to calculate the memory size required to store given number
+ * of objects
+ */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 1/4] mempool: clarify default populate function Olivier Matz
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 2/4] mempool: unalign size when calculating required mem amount Olivier Matz
@ 2019-07-19 13:38                         ` Olivier Matz
  2019-08-07 15:21                           ` Andrew Rybchenko
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages Olivier Matz
                                           ` (6 subsequent siblings)
  9 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-07-19 13:38 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

In rte_mempool_populate_default(), we determine the page size,
which is needed for calc_size and allocation of memory.

Move this into a function and export it; it will be used in the next
commit.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c | 50 +++++++++++++++++++++++++---------------
 lib/librte_mempool/rte_mempool.h |  6 +++++
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 335032dc8..7def0ba68 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,32 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
+{
+	bool need_iova_contig_obj;
+	bool alloc_in_ext_mem;
+	int ret;
+
+	/* check if we can retrieve a valid socket ID */
+	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+	if (ret < 0)
+		return -EINVAL;
+	alloc_in_ext_mem = (ret == 1);
+	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+
+	if (!need_iova_contig_obj)
+		*pg_sz = 0;
+	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
+		*pg_sz = get_min_page_size(mp->socket_id);
+	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
+		*pg_sz = get_min_page_size(mp->socket_id);
+	else
+		*pg_sz = getpagesize();
+
+	return 0;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
@@ -425,12 +451,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
 	ssize_t mem_size;
-	size_t align, pg_sz, pg_shift;
+	size_t align, pg_sz, pg_shift = 0;
 	rte_iova_t iova;
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -482,26 +507,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
-	/* check if we can retrieve a valid socket ID */
-	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
-	if (ret < 0)
-		return -EINVAL;
-	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
-	if (!need_iova_contig_obj) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		pg_sz = get_min_page_size(mp->socket_id);
-		pg_shift = rte_bsf32(pg_sz);
-	} else {
-		pg_sz = getpagesize();
+	if (pg_sz != 0)
 		pg_shift = rte_bsf32(pg_sz);
-	}
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 7bc10e699..00b927989 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1692,6 +1692,12 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
 		      void *arg);
 
+/**
+ * @internal Get page size used for mempool object allocation.
+ */
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
+
 #ifdef __cplusplus
 }
 #endif
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (2 preceding siblings ...)
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-07-19 13:38                         ` Olivier Matz
  2019-07-19 14:03                           ` Burakov, Anatoly
                                             ` (2 more replies)
  2019-07-23  5:37                         ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations " Vamsi Krishna Attunuru
                                           ` (5 subsequent siblings)
  9 siblings, 3 replies; 251+ messages in thread
From: Olivier Matz @ 2019-07-19 13:38 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

When using IOVA-contiguous memory and objects smaller than the page
size, ensure that objects are not located across several pages.

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool_ops_default.c | 39 ++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc82d..2bbd67367 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -45,19 +45,54 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object falls on a page boundary, else returns 0 */
+static inline int
+mempool_check_obj_bounds(void *obj, uint64_t pg_sz, size_t elt_sz)
+{
+	uintptr_t page_end, elt_addr = (uintptr_t)obj;
+	uint32_t pg_shift;
+	uint64_t page_mask;
+
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+
+	pg_shift = rte_bsf32(pg_sz);
+	page_mask =  ~((1ull << pg_shift) - 1);
+	page_end = (elt_addr & page_mask) + pg_sz;
+
+	if (elt_addr + elt_sz > page_end)
+		return -1;
+
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
 
+	rte_mempool_get_page_size(mp, &pg_sz);
+
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	for (off = 0, i = 0; i < max_objs; i++) {
+		/* align offset to next page start if required */
+		if (mempool_check_obj_bounds((char *)vaddr + off,
+						pg_sz, total_elt_sz) < 0) {
+			off += RTE_PTR_ALIGN_CEIL((char *)vaddr + off, pg_sz) -
+				((char *)vaddr + off);
+		}
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
 		obj = (char *)vaddr + off;
 		obj_cb(mp, obj_cb_arg, obj,
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages Olivier Matz
@ 2019-07-19 14:03                           ` Burakov, Anatoly
  2019-10-28 14:07                             ` Olivier Matz
  2019-07-19 14:11                           ` Burakov, Anatoly
  2019-08-07 15:21                           ` Andrew Rybchenko
  2 siblings, 1 reply; 251+ messages in thread
From: Burakov, Anatoly @ 2019-07-19 14:03 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 19-Jul-19 2:38 PM, Olivier Matz wrote:
> When using iova contiguous memory and objects smaller than the page size,
> ensure that objects are not located across several pages.
> 
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   lib/librte_mempool/rte_mempool_ops_default.c | 39 ++++++++++++++++++++++++++--
>   1 file changed, 37 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index 4e2bfc82d..2bbd67367 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -45,19 +45,54 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   	return mem_size;
>   }
>   
> +/* Returns -1 if object falls on a page boundary, else returns 0 */
> +static inline int
> +mempool_check_obj_bounds(void *obj, uint64_t pg_sz, size_t elt_sz)
> +{
> +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> +	uint32_t pg_shift;
> +	uint64_t page_mask;
> +
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +
> +	pg_shift = rte_bsf32(pg_sz);
> +	page_mask =  ~((1ull << pg_shift) - 1);
> +	page_end = (elt_addr & page_mask) + pg_sz;

This looks like RTE_PTR_ALIGN should do this without the magic? E.g.

page_end = RTE_PTR_ALIGN(elt_addr, pg_sz)

would that not be equivalent?
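
For reference, a minimal sketch of that macro-based variant (illustrative
only, not part of the posted patch; it assumes pg_sz is a power of two):

	/* RTE_PTR_ALIGN() is RTE_PTR_ALIGN_CEIL(), so this matches the
	 * shift/mask code when elt_addr is not page-aligned; when elt_addr
	 * is already page-aligned it yields elt_addr itself instead of
	 * elt_addr + pg_sz, so that corner case would need a closer look.
	 */
	page_end = (uintptr_t)RTE_PTR_ALIGN((void *)elt_addr, pg_sz);

	if (elt_addr + elt_sz > page_end)
		return -1;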

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages Olivier Matz
  2019-07-19 14:03                           ` Burakov, Anatoly
@ 2019-07-19 14:11                           ` Burakov, Anatoly
  2019-08-07 15:21                           ` Andrew Rybchenko
  2 siblings, 0 replies; 251+ messages in thread
From: Burakov, Anatoly @ 2019-07-19 14:11 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 19-Jul-19 2:38 PM, Olivier Matz wrote:
> When using iova contiguous memory and objects smaller than the page size,
> ensure that objects are not located across several pages.
> 
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---

<snip>

>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* align offset to next page start if required */
> +		if (mempool_check_obj_bounds((char *)vaddr + off,
> +						pg_sz, total_elt_sz) < 0) {
> +			off += RTE_PTR_ALIGN_CEIL((char *)vaddr + off, pg_sz) -
> +				((char *)vaddr + off);

Same here, PTR_ALIGN plus PTR_SUB. It's perhaps not as cool as a 
one-liner, but this is not a hot path :)
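
An illustrative sketch of that rewrite, using RTE_PTR_DIFF for the pointer
subtraction (not part of the posted patch; assumes pg_sz is a power of two):

		void *obj = RTE_PTR_ADD(vaddr, off);

		/* advance 'off' to the start of the next page */
		if (mempool_check_obj_bounds(obj, pg_sz, total_elt_sz) < 0)
			off += RTE_PTR_DIFF(RTE_PTR_ALIGN_CEIL(obj, pg_sz), obj);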

-- 
Thanks,
Anatoly

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 1/4] mempool: clarify default populate function
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 1/4] mempool: clarify default populate function Olivier Matz
@ 2019-07-19 15:42                           ` Andrew Rybchenko
  2019-10-08  9:36                             ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-19 15:42 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Thomas Monjalon, Anatoly Burakov, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 7/19/19 4:38 PM, Olivier Matz wrote:
> No functional change. Clarify the populate function to make
> the next commit easier to understand.
>
> Rename the variables:
> - to avoid negation in the name
> - to have more understandable names
>
> Remove useless variable (no_pageshift is equivalent to pg_sz == 0).
>
> Remove duplicate assignment of the "external" variable.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

LGTM

Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (3 preceding siblings ...)
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages Olivier Matz
@ 2019-07-23  5:37                         ` Vamsi Krishna Attunuru
  2019-08-07 15:21                         ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
                                           ` (4 subsequent siblings)
  9 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-23  5:37 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Andrew Rybchenko, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Ferruh Yigit



> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Friday, July 19, 2019 7:09 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: Andrew Rybchenko <arybchenko@solarflare.com>; Thomas Monjalon
> <thomas@monjalon.net>; Anatoly Burakov <anatoly.burakov@intel.com>; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; Ferruh Yigit <ferruh.yigit@intel.com>
> Subject: [RFC 0/4] mempool: avoid objects allocations across pages
> 
> When IOVA mode is VA, a mempool can be created with objects that are not
> physically contiguous, which breaks KNI.
> 
> To solve this, this patchset changes the default behavior of mempool populate
> function, to prevent objects from being located across pages.
> 
> Olivier Matz (4):
>   mempool: clarify default populate function
>   mempool: unalign size when calculating required mem amount
>   mempool: introduce function to get mempool page size
>   mempool: prevent objects from being across pages
> 
>  lib/librte_mempool/rte_mempool.c             | 106 +++++++++++----------------
>  lib/librte_mempool/rte_mempool.h             |   8 +-
>  lib/librte_mempool/rte_mempool_ops.c         |   4 +-
>  lib/librte_mempool/rte_mempool_ops_default.c |  39 +++++++++-
>  4 files changed, 90 insertions(+), 67 deletions(-)
> 
> ---
> 
> Hi,
> 
> > @Olivier,
> > Any suggestions..?
> 
> I took some time to go a bit deeper. I still think we can change the default
> behavior to avoid objects being located across pages. But it is more complex
> than I expected.
> 
> I made a draft patchset, that, in short:
> - cleans/renames variables
> - removes the optimistic full iova contiguous allocation
> - changes return value of calc_mem_size to return the unaligned size,
>   therefore the allocation is smaller in case of big hugepages
> - changes rte_mempool_op_populate_default() to prevent allocation
>   of objects across multiple pages
> 
> Andrew, Anatoly, did I miss something?
> Vamsi, can you check if it solves your issue?
> 
Thanks Olivier for your time; it solves our problem with KNI in VA mode.
Since the change set could not go into 19.08, and as you suggested, we
implemented the mempool populate routine without a flag and used it for the
KNI application. Sending v8 with these changes.

> Anyway, even if I validate the patchset and make it work, I'm afraid this is not
> something that could go in 19.08.
> 
> The only alternative I see is a specific mempool allocation function when used in
> iova=va mode + kni, as you proposed previously.
> 
> It can probably be implemented without adding a flag, starting from
> rte_mempool_create(), and replacing rte_mempool_populate_default(mp) by
> something else: allocate pages one by one, and call
> rte_mempool_populate_iova() for each of them.
> 
> Hope it helps. Unfortunately, I may not have too much time to spend on it in the
> coming days.
> 
> Regards,
> Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v8 0/5] kni: add IOVA=VA support
  2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
                               ` (3 preceding siblings ...)
  2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 4/4] kni: modify IOVA mode checks to support VA vattunuru
@ 2019-07-23  5:38             ` vattunuru
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory vattunuru
                                 ` (6 more replies)
  4 siblings, 7 replies; 251+ messages in thread
From: vattunuru @ 2019-07-23  5:38 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add a new mempool populate routine that allocates page-aligned,
page-sized memzones to make sure all mempool objects reside on a page.
* Update release notes and map files.

V7 Changes:
* Removed the previously proposed mempool flag and made those page boundary
checks the default in mempool populate(), except for objects bigger than
the page size.
* Removed KNI example application related changes since the pool related
requirement is taken care of in the mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated the IOVA mode checks that were enforcing IOVA=PA mode when
IOVA=VA mode is enabled.

V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered
across page boundaries.
* Added the PCI device information required by the KNI kernel module.
* Modified KNI example application to create mempool with new
mempool flag.

V5 changes:
* Fixed build issue with 32b build

V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernels above 4.4.0

V3 Changes:
* Add a new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

Kiran Kumar K (1):
  kni: add IOVA=VA support in KNI module

Vamsi Attunuru (4):
  mempool: populate mempool with page sized chunks of memory
  add IOVA -VA support in KNI lib
  kni: add app specific mempool create & free routine
  kni: modify IOVA mode checks to support VA

 doc/guides/prog_guide/kernel_nic_interface.rst    |   8 ++
 doc/guides/rel_notes/release_19_08.rst            |   6 ++
 examples/kni/main.c                               |   6 +-
 kernel/linux/kni/compat.h                         |   4 +
 kernel/linux/kni/kni_dev.h                        |   4 +
 kernel/linux/kni/kni_misc.c                       |  71 +++++++++++++--
 kernel/linux/kni/kni_net.c                        |  59 ++++++++++---
 lib/librte_eal/linux/eal/eal.c                    |   4 +-
 lib/librte_eal/linux/eal/include/rte_kni_common.h |   8 ++
 lib/librte_kni/Makefile                           |   2 +
 lib/librte_kni/meson.build                        |   2 +
 lib/librte_kni/rte_kni.c                          | 100 ++++++++++++++++++++--
 lib/librte_kni/rte_kni.h                          |  49 +++++++++++
 lib/librte_kni/rte_kni_version.map                |   2 +
 lib/librte_mempool/rte_mempool.c                  |  59 +++++++++++++
 lib/librte_mempool/rte_mempool.h                  |  17 ++++
 lib/librte_mempool/rte_mempool_version.map        |   1 +
 17 files changed, 373 insertions(+), 29 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
@ 2019-07-23  5:38               ` vattunuru
  2019-07-23 11:08                 ` Andrew Rybchenko
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 2/5] add IOVA -VA support in KNI lib vattunuru
                                 ` (5 subsequent siblings)
  6 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-07-23  5:38 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch adds a routine to populate the mempool from page-aligned and
page-sized chunks of memory, to ensure memory objects do not fall
across page boundaries. It's useful for applications that
require physically contiguous mbuf memory while running in
IOVA=VA mode.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_mempool/rte_mempool.c           | 59 ++++++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool.h           | 17 +++++++++
 lib/librte_mempool/rte_mempool_version.map |  1 +
 3 files changed, 77 insertions(+)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7260ce0..5312c8f 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,65 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Function to populate mempool from page sized mem chunks, allocate page size
+ * of memory in memzone and populate them. Return the number of objects added,
+ * or a negative value on error.
+ */
+int
+rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp)
+{
+	char mz_name[RTE_MEMZONE_NAMESIZE];
+	size_t align, pg_sz, pg_shift;
+	const struct rte_memzone *mz;
+	unsigned int mz_id, n;
+	size_t chunk_size;
+	int ret;
+
+	ret = mempool_ops_alloc_once(mp);
+	if (ret != 0)
+		return ret;
+
+	if (mp->nb_mem_chunks != 0)
+		return -EEXIST;
+
+	pg_sz = get_min_page_size(mp->socket_id);
+	pg_shift = rte_bsf32(pg_sz);
+
+	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
+
+		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
+			     &chunk_size, &align);
+
+		if (chunk_size > pg_sz)
+			goto fail;
+
+		ret = snprintf(mz_name, sizeof(mz_name),
+			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
+		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
+			ret = -ENAMETOOLONG;
+			goto fail;
+		}
+
+		mz = rte_memzone_reserve_aligned(mz_name, chunk_size,
+				mp->socket_id, 0, align);
+
+		ret = rte_mempool_populate_iova(mp, mz->addr,
+				mz->iova, mz->len,
+				rte_mempool_memchunk_mz_free,
+				(void *)(uintptr_t)mz);
+		if (ret < 0) {
+			rte_memzone_free(mz);
+			goto fail;
+		}
+	}
+
+	return mp->size;
+
+fail:
+	rte_mempool_free_memchunks(mp);
+	return ret;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a..73d6ada 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1064,6 +1064,23 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 /**
  * Add memory for objects in the pool at init
  *
+ * This is the function used to populate the mempool with page-aligned memzone
+ * memory. It ensures that all mempool objects reside within a page by
+ * allocating memzones of page size.
+ *
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @return
+ *   The number of objects added on success.
+ *   On error, the chunk is not added in the memory list of the
+ *   mempool and a negative errno is returned.
+ */
+__rte_experimental
+int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp);
+
+/**
+ * Add memory for objects in the pool at init
+ *
  * This is the default function used by rte_mempool_create() to populate
  * the mempool. It adds memory allocated using rte_memzone_reserve().
  *
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca4..9a6fe65 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -57,4 +57,5 @@ EXPERIMENTAL {
 	global:
 
 	rte_mempool_ops_get_info;
+	rte_mempool_populate_from_pg_sz_chunks;
 };
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v8 2/5] add IOVA -VA support in KNI lib
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory vattunuru
@ 2019-07-23  5:38               ` vattunuru
  2019-07-23 10:54                 ` Andrew Rybchenko
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine vattunuru
                                 ` (4 subsequent siblings)
  6 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-07-23  5:38 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Current KNI implementation only operates in IOVA=PA mode; this patch adds
the required functionality in the KNI lib to support IOVA=VA mode.

KNI kernel module requires device info to get iommu domain related
information for IOVA addr related translations. Patch defines device
related info in rte_kni_device_info struct and passes device info to
kernel KNI module when IOVA=VA mode is enabled.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 +++++
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 36 +++++++++++++++++++++++
 4 files changed, 46 insertions(+)

diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 37d9ee8..4fd8a90 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -111,6 +111,13 @@ struct rte_kni_device_info {
 	void * mbuf_va;
 	phys_addr_t mbuf_phys;
 
+	/* PCI info */
+	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+	uint8_t bus;                  /**< Device bus */
+	uint8_t devid;                /**< Device ID */
+	uint8_t function;             /**< Device function. */
+
 	uint16_t group_id;            /**< Group ID */
 	uint32_t core_id;             /**< core ID to bind for kernel thread */
 
@@ -121,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index cbd6599..ab15d10 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_kni.a
 
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
+CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
 
 EXPORT_MAP := rte_kni_version.map
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index 41fa2e3..fd46f87 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -9,3 +9,4 @@ version = 2
 sources = files('rte_kni.c')
 headers = files('rte_kni.h')
 deps += ['ethdev', 'pci']
+includes += include_directories('../../drivers/bus/pci')
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 4b51fb4..ae17075 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -14,6 +14,7 @@
 #include <rte_spinlock.h>
 #include <rte_string_fns.h>
 #include <rte_ethdev.h>
+#include <rte_bus_pci.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_kni.h>
@@ -199,6 +200,26 @@ kni_release_mz(struct rte_kni *kni)
 	rte_memzone_free(kni->m_sync_addr);
 }
 
+static void
+kni_dev_pci_addr_get(struct rte_pci_addr *addr,
+		    struct rte_pci_id *id, uint16_t port_id)
+{
+	const struct rte_pci_device *pci_dev;
+	const struct rte_bus *bus = NULL;
+	struct rte_eth_dev_info dev_info;
+
+	memset(&dev_info, 0, sizeof(dev_info));
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+		*addr = pci_dev->addr;
+		*id = pci_dev->id;
+	}
+}
+
 struct rte_kni *
 rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	      const struct rte_kni_conf *conf,
@@ -247,6 +268,19 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 		kni->ops.port_id = UINT16_MAX;
 
 	memset(&dev_info, 0, sizeof(dev_info));
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		uint16_t port_id = conf->group_id;
+		struct rte_pci_addr addr = { 0 };
+		struct rte_pci_id id = { 0 };
+
+		kni_dev_pci_addr_get(&addr, &id, port_id);
+		dev_info.bus = addr.bus;
+		dev_info.devid = addr.devid;
+		dev_info.function = addr.function;
+		dev_info.vendor_id = id.vendor_id;
+		dev_info.device_id = id.device_id;
+	}
 	dev_info.core_id = conf->core_id;
 	dev_info.force_bind = conf->force_bind;
 	dev_info.group_id = conf->group_id;
@@ -300,6 +334,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory vattunuru
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 2/5] add IOVA -VA support in KNI lib vattunuru
@ 2019-07-23  5:38               ` vattunuru
  2019-07-23 10:50                 ` Andrew Rybchenko
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 4/5] kni: add IOVA=VA support in KNI module vattunuru
                                 ` (3 subsequent siblings)
  6 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-07-23  5:38 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

When KNI operates in IOVA = VA mode, it requires mbuf memory
to be physically contiguous to ensure the KNI kernel module can
translate IOVA addresses properly. Patch adds a KNI specific
mempool create routine to populate the KNI packet mbuf pool
with memory objects that reside on a page.

KNI applications need to use these mempool create & free routines
so that mbuf related requirements in IOVA = VA mode are handled
inside those routines based on the enabled mode.

Updated the release notes with these new routine details.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 doc/guides/rel_notes/release_19_08.rst |  6 ++++
 examples/kni/main.c                    |  6 +++-
 lib/librte_kni/Makefile                |  1 +
 lib/librte_kni/meson.build             |  1 +
 lib/librte_kni/rte_kni.c               | 59 ++++++++++++++++++++++++++++++++++
 lib/librte_kni/rte_kni.h               | 49 ++++++++++++++++++++++++++++
 lib/librte_kni/rte_kni_version.map     |  2 ++
 7 files changed, 123 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index 0a3f840..bd01e99 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -281,6 +281,12 @@ API Changes
   offload flag from the library. The application must set this flag if it is
   supported by the platform and application wishes to use it.
 
+* kni: ``rte_kni_pktmbuf_pool_create`` ``rte_kni_pktmbuf_pool_free`` functions
+  were introduced for KNI applications for creating & freeing packet pool.
+  Since IOVA=VA mode was added in KNI, packet pool's mbuf memory should be
+  physically contiguous for the KNI kernel module to work in IOVA=VA mode,
+  this requirement was taken care of in the kni packet pool creation functions.
+
 
 ABI Changes
 -----------
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 4710d71..3b9c067 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -975,7 +975,7 @@ main(int argc, char** argv)
 		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
 
 	/* Create the mbuf pool */
-	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
+	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
 		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
 	if (pktmbuf_pool == NULL) {
 		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
@@ -1043,6 +1043,10 @@ main(int argc, char** argv)
 			continue;
 		kni_free_kni(port);
 	}
+
+	if (pktmbuf_pool)
+		rte_kni_pktmbuf_pool_free(pktmbuf_pool);
+
 	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
 		if (kni_port_params_array[i]) {
 			rte_free(kni_port_params_array[i]);
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index ab15d10..5e3dd01 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -6,6 +6,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 # library name
 LIB = librte_kni.a
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
 CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index fd46f87..e357445 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+allow_experimental_apis = true
 if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
 	build = false
 	reason = 'only supported on 64-bit linux'
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index ae17075..897a5bb 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -687,6 +687,65 @@ kni_allocate_mbufs(struct rte_kni *kni)
 	}
 }
 
+struct rte_mempool *
+rte_kni_pktmbuf_pool_create(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
+	int socket_id)
+{
+	struct rte_pktmbuf_pool_private mbp_priv;
+	const char *mp_ops_name;
+	struct rte_mempool *mp;
+	unsigned int elt_size;
+	int ret;
+
+	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
+		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
+			priv_size);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size +
+		(unsigned int)data_room_size;
+	mbp_priv.mbuf_data_room_size = data_room_size;
+	mbp_priv.mbuf_priv_size = priv_size;
+
+	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
+		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
+	if (mp == NULL)
+		return NULL;
+
+	mp_ops_name = rte_mbuf_best_mempool_ops();
+	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+	rte_pktmbuf_pool_init(mp, &mbp_priv);
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		ret = rte_mempool_populate_from_pg_sz_chunks(mp);
+	else
+		ret = rte_mempool_populate_default(mp);
+
+	if (ret < 0) {
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+
+	rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
+
+	return mp;
+}
+
+void
+rte_kni_pktmbuf_pool_free(struct rte_mempool *mp)
+{
+	rte_mempool_free(mp);
+}
+
 struct rte_kni *
 rte_kni_get(const char *name)
 {
diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h
index 5699a64..7f11927 100644
--- a/lib/librte_kni/rte_kni.h
+++ b/lib/librte_kni/rte_kni.h
@@ -20,6 +20,7 @@
 #include <rte_pci.h>
 #include <rte_memory.h>
 #include <rte_mempool.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_ether.h>
 
 #include <rte_kni_common.h>
@@ -184,6 +185,54 @@ unsigned rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs,
 		unsigned num);
 
 /**
+ * Create a kni packet mbuf pool.
+ *
+ * This function creates and initializes a packet mbuf pool for KNI applications.
+ * It calls the required mempool populate routine based on the IOVA mode.
+ *
+ * @param name
+ *   The name of the mbuf pool.
+ * @param n
+ *   The number of elements in the mbuf pool. The optimum size (in terms
+ *   of memory usage) for a mempool is when n is a power of two minus one:
+ *   n = (2^q - 1).
+ * @param cache_size
+ *   Size of the per-core object cache. See rte_mempool_create() for
+ *   details.
+ * @param priv_size
+ *   Size of application private area between the rte_mbuf structure
+ *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
+ * @param data_room_size
+ *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
+ * @param socket_id
+ *   The socket identifier where the memory should be allocated. The
+ *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
+ *   reserved zone.
+ * @return
+ *   The pointer to the new allocated mempool, on success. NULL on error
+ *   with rte_errno set appropriately. Possible rte_errno values include:
+ *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ *    - E_RTE_SECONDARY - function was called from a secondary process instance
+ *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
+ *    - ENOSPC - the maximum number of memzones has already been allocated
+ *    - EEXIST - a memzone with the same name already exists
+ *    - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+__rte_experimental
+struct rte_mempool *rte_kni_pktmbuf_pool_create(const char *name,
+		unsigned int n, unsigned int cache_size, uint16_t priv_size,
+		uint16_t data_room_size, int socket_id);
+
+/**
+ * Free the given packet mempool.
+ *
+ * @param mp
+ *  The mempool pointer.
+ */
+__rte_experimental
+void rte_kni_pktmbuf_pool_free(struct rte_mempool *mp);
+
+/**
  * Get the KNI context of its name.
  *
  * @param name
diff --git a/lib/librte_kni/rte_kni_version.map b/lib/librte_kni/rte_kni_version.map
index c877dc6..aba9728 100644
--- a/lib/librte_kni/rte_kni_version.map
+++ b/lib/librte_kni/rte_kni_version.map
@@ -20,4 +20,6 @@ EXPERIMENTAL {
 	global:
 
 	rte_kni_update_link;
+	rte_kni_pktmbuf_pool_create;
+	rte_kni_pktmbuf_pool_free;
 };
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v8 4/5] kni: add IOVA=VA support in KNI module
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
                                 ` (2 preceding siblings ...)
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine vattunuru
@ 2019-07-23  5:38               ` vattunuru
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 5/5] kni: modify IOVA mode checks to support VA vattunuru
                                 ` (2 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-23  5:38 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Kiran Kumar K <kirankumark@marvell.com>

Patch adds support for the kernel module to work in IOVA = VA mode.
The idea is to get the physical address from the IOVA address using
the iommu_iova_to_phys API and later use the phys_to_virt API to
convert the physical address to a kernel virtual address.

When compared with IOVA = PA mode, there is no performance
drop with this approach.

This approach does not work with the kernel versions less
than 4.4.0 because of API compatibility issues.

Patch also updates these support details in KNI documentation.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 kernel/linux/kni/compat.h   |  4 +++
 kernel/linux/kni/kni_dev.h  |  4 +++
 kernel/linux/kni/kni_misc.c | 71 +++++++++++++++++++++++++++++++++++++++------
 kernel/linux/kni/kni_net.c  | 59 ++++++++++++++++++++++++++++---------
 4 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..ee997a6 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,7 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 4, 0) <= LINUX_VERSION_CODE
+#define HAVE_IOVA_AS_VA_SUPPORT
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..d5898f3 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -25,6 +25,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 
 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -41,6 +42,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 2b75502..8660205 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -295,6 +295,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 
 	pr_info("Creating kni...\n");
 	/* Check the buffer size, to avoid warning */
@@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_AS_VA_SUPPORT
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		if (pci == NULL) {
+			pr_err("pci dev does not exist\n");
+			return -ENODEV;
+		}
+
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+
+		if (domain == NULL) {
+			pr_err("Failed to get pci dev domain info\n");
+			return -ENODEV;
+		}
+#else
+		pr_err("Kernel version does not support IOVA as VA\n");
+		return -EINVAL;
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index 7bd3a9f..8382859 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,21 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+			   (uintptr_t)m->buf_physaddr) + m->data_off);
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +77,24 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +211,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +299,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +371,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +470,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +482,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +550,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +583,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v8 5/5] kni: modify IOVA mode checks to support VA
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
                                 ` (3 preceding siblings ...)
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 4/5] kni: add IOVA=VA support in KNI module vattunuru
@ 2019-07-23  5:38               ` vattunuru
  2019-07-24  7:14               ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
  6 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-23  5:38 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch updates the checks in KNI and EAL that were enforcing IOVA=PA
mode, since the KNI kernel module now supports VA mode for kernel
versions >= 4.4.0.

Updated KNI documentation with above details.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst | 8 ++++++++
 lib/librte_eal/linux/eal/eal.c                 | 4 +++-
 lib/librte_kni/rte_kni.c                       | 5 -----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 38369b3..fd2ce63 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -291,6 +291,14 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI can be operated in the IOVA as VA scheme when the following criteria are fulfilled:
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
+- eal param `--iova-mode va` is passed or bus IOVA scheme is set to RTE_IOVA_VA
+
 Ethtool
 -------
 
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 2e5499f..9705243 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1070,12 +1070,14 @@ rte_eal_init(int argc, char **argv)
 		/* Workaround for KNI which requires physical address to work */
 		if (iova_mode == RTE_IOVA_VA &&
 				rte_eal_check_module("rte_kni") == 1) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
 			if (phys_addrs) {
 				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
+				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
 			} else {
 				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
 			}
+#endif
 		}
 #endif
 		rte_eal_get_configuration()->iova_mode = iova_mode;
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 897a5bb..6a95230 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -98,11 +98,6 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
-	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
-		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
-		return -1;
-	}
-
 	/* Check FD and open */
 	if (kni_fd < 0) {
 		kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine vattunuru
@ 2019-07-23 10:50                 ` Andrew Rybchenko
  2019-07-23 11:01                   ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-23 10:50 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov, kirankumark

On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> When KNI operates in IOVA = VA mode, it requires mbuf memory
> to be physically contiguous to ensure the KNI kernel module can
> translate IOVA addresses properly. Patch adds a KNI specific
> mempool create routine to populate the KNI packet mbuf pool
> with memory objects that reside on a page.
>
> KNI applications need to use these mempool create & free routines
> so that mbuf related requirements in IOVA = VA mode are handled
> inside those routines based on the enabled mode.
>
> Updated the release notes with these new routine details.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
>   doc/guides/rel_notes/release_19_08.rst |  6 ++++
>   examples/kni/main.c                    |  6 +++-
>   lib/librte_kni/Makefile                |  1 +
>   lib/librte_kni/meson.build             |  1 +
>   lib/librte_kni/rte_kni.c               | 59 ++++++++++++++++++++++++++++++++++
>   lib/librte_kni/rte_kni.h               | 49 ++++++++++++++++++++++++++++
>   lib/librte_kni/rte_kni_version.map     |  2 ++
>   7 files changed, 123 insertions(+), 1 deletion(-)
>
> diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
> index 0a3f840..bd01e99 100644
> --- a/doc/guides/rel_notes/release_19_08.rst
> +++ b/doc/guides/rel_notes/release_19_08.rst
> @@ -281,6 +281,12 @@ API Changes
>     offload flag from the library. The application must set this flag if it is
>     supported by the platform and application wishes to use it.
>   
> +* kni: ``rte_kni_pktmbuf_pool_create`` ``rte_kni_pktmbuf_pool_free`` functions
> +  were introduced for KNI applications for creating & freeing packet pool.
> +  Since IOVA=VA mode was added in KNI, packet pool's mbuf memory should be
> +  physically contiguous for the KNI kernel module to work in IOVA=VA mode,
> +  this requirement was taken care of in the kni packet pool creation functions.
> +
>   
>   ABI Changes
>   -----------
> diff --git a/examples/kni/main.c b/examples/kni/main.c
> index 4710d71..3b9c067 100644
> --- a/examples/kni/main.c
> +++ b/examples/kni/main.c
> @@ -975,7 +975,7 @@ main(int argc, char** argv)
>   		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
>   
>   	/* Create the mbuf pool */
> -	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
> +	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
>   		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
>   	if (pktmbuf_pool == NULL) {
>   		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
> @@ -1043,6 +1043,10 @@ main(int argc, char** argv)
>   			continue;
>   		kni_free_kni(port);
>   	}
> +
> +	if (pktmbuf_pool)

Typically the pointer is compared to NULL, but it is not required here
anyway, since rte_mempool_free() handles NULL perfectly well itself.

> +		rte_kni_pktmbuf_pool_free(pktmbuf_pool);
> +
>   	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
>   		if (kni_port_params_array[i]) {
>   			rte_free(kni_port_params_array[i]);

<...>

> diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h
> index 5699a64..7f11927 100644
> --- a/lib/librte_kni/rte_kni.h
> +++ b/lib/librte_kni/rte_kni.h
> @@ -20,6 +20,7 @@
>   #include <rte_pci.h>
>   #include <rte_memory.h>
>   #include <rte_mempool.h>
> +#include <rte_mbuf_pool_ops.h>

I don't understand why it is included here.

>   #include <rte_ether.h>
>   
>   #include <rte_kni_common.h>
>

<...>

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v8 2/5] add IOVA -VA support in KNI lib
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 2/5] add IOVA -VA support in KNI lib vattunuru
@ 2019-07-23 10:54                 ` Andrew Rybchenko
  0 siblings, 0 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-23 10:54 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	kirankumark, Bruce Richardson

On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Current KNI implementation only operates in IOVA=PA mode; this patch adds
> the required functionality in the KNI lib to support IOVA=VA mode.
>
> KNI kernel module requires device info to get iommu domain related
> information for IOVA addr related translations. Patch defines device
> related info in rte_kni_device_info struct and passes device info to
> kernel KNI module when IOVA=VA mode is enabled.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>

<...>

> diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
> index cbd6599..ab15d10 100644
> --- a/lib/librte_kni/Makefile
> +++ b/lib/librte_kni/Makefile
> @@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
>   LIB = librte_kni.a
>   
>   CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
> +CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
>   LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
>   
>   EXPORT_MAP := rte_kni_version.map
> diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
> index 41fa2e3..fd46f87 100644
> --- a/lib/librte_kni/meson.build
> +++ b/lib/librte_kni/meson.build
> @@ -9,3 +9,4 @@ version = 2
>   sources = files('rte_kni.c')
>   headers = files('rte_kni.h')
>   deps += ['ethdev', 'pci']

Not directly related to the patch, but are the mempool and mbuf
dependencies lost here? Cc'ing Bruce to comment.
The library uses rte_mempool_obj_iter() and rte_pktmbuf_free()
(which uses rte_mbuf_sanity_check()).

> +includes += include_directories('../../drivers/bus/pci')
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index 4b51fb4..ae17075 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -14,6 +14,7 @@
>   #include <rte_spinlock.h>
>   #include <rte_string_fns.h>
>   #include <rte_ethdev.h>
> +#include <rte_bus_pci.h>
>   #include <rte_malloc.h>
>   #include <rte_log.h>
>   #include <rte_kni.h>
> @@ -199,6 +200,26 @@ kni_release_mz(struct rte_kni *kni)
>   	rte_memzone_free(kni->m_sync_addr);
>   }
>   
> +static void
> +kni_dev_pci_addr_get(struct rte_pci_addr *addr,
> +		    struct rte_pci_id *id, uint16_t port_id)

Please consider making port_id the first argument.
It is a mandatory input argument which is used as a key
to fill in the other two output arguments.
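
Something along these lines (sketch only, not from the posted patch):

	static void
	kni_dev_pci_addr_get(uint16_t port_id, struct rte_pci_addr *addr,
			     struct rte_pci_id *id);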

> +{
> +	const struct rte_pci_device *pci_dev;
> +	const struct rte_bus *bus = NULL;
> +	struct rte_eth_dev_info dev_info;
> +
> +	memset(&dev_info, 0, sizeof(dev_info));

Most likely not required even now, but for sure not required
if the patch [1] is applied.

[1] http://patches.dpdk.org/patch/56959/

> +	rte_eth_dev_info_get(port_id, &dev_info);
> +
> +	if (dev_info.device)
> +		bus = rte_bus_find_by_device(dev_info.device);
> +	if (bus && !strcmp(bus->name, "pci")) {
> +		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
> +		*addr = pci_dev->addr;
> +		*id = pci_dev->id;

I think it would be better to always init addr and id in
the function. Otherwise the caller does not know whether they
are initialized or not.
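
For instance (sketch only, not from the posted patch), zeroing the outputs
at the top of the function keeps them defined even when the device is not
on the PCI bus:

	memset(addr, 0, sizeof(*addr));
	memset(id, 0, sizeof(*id));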

> +	}
> +}
> +
>   struct rte_kni *
>   rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>   	      const struct rte_kni_conf *conf,
> @@ -247,6 +268,19 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>   		kni->ops.port_id = UINT16_MAX;
>   
>   	memset(&dev_info, 0, sizeof(dev_info));
> +
> +	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +		uint16_t port_id = conf->group_id;
> +		struct rte_pci_addr addr = { 0 };
> +		struct rte_pci_id id = { 0 };

If addr and id are always initialized in kni_dev_pci_addr_get()
init here would be not required.

> +
> +		kni_dev_pci_addr_get(&addr, &id, port_id);
> +		dev_info.bus = addr.bus;
> +		dev_info.devid = addr.devid;
> +		dev_info.function = addr.function;
> +		dev_info.vendor_id = id.vendor_id;
> +		dev_info.device_id = id.device_id;
> +	}
>   	dev_info.core_id = conf->core_id;
>   	dev_info.force_bind = conf->force_bind;
>   	dev_info.group_id = conf->group_id;
> @@ -300,6 +334,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>   	kni->group_id = conf->group_id;
>   	kni->mbuf_size = conf->mbuf_size;
>   
> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> +
>   	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
>   	if (ret < 0)
>   		goto ioctl_fail;


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine
  2019-07-23 10:50                 ` Andrew Rybchenko
@ 2019-07-23 11:01                   ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-23 11:01 UTC (permalink / raw)
  To: Andrew Rybchenko, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Tuesday, July 23, 2019 4:20 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create &
> free routine
> 
> On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > When KNI operates in IOVA = VA mode, it requires mbuf memory to be
> > physically contiguous to ensure the KNI kernel module can translate IOVA
> > addresses properly. Patch adds a KNI specific mempool create routine
> > to populate the KNI packet mbuf pool with memory objects that reside
> > on a page.
> >
> > KNI applications need to use these mempool create & free routines so
> > that mbuf related requirements in IOVA = VA mode are handled inside
> > those routines based on the enabled mode.
> >
> > Updated the release notes with these new routine details.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > ---
> >   doc/guides/rel_notes/release_19_08.rst |  6 ++++
> >   examples/kni/main.c                    |  6 +++-
> >   lib/librte_kni/Makefile                |  1 +
> >   lib/librte_kni/meson.build             |  1 +
> >   lib/librte_kni/rte_kni.c               | 59 ++++++++++++++++++++++++++++++++++
> >   lib/librte_kni/rte_kni.h               | 49 ++++++++++++++++++++++++++++
> >   lib/librte_kni/rte_kni_version.map     |  2 ++
> >   7 files changed, 123 insertions(+), 1 deletion(-)
> >
> > diff --git a/doc/guides/rel_notes/release_19_08.rst
> > b/doc/guides/rel_notes/release_19_08.rst
> > index 0a3f840..bd01e99 100644
> > --- a/doc/guides/rel_notes/release_19_08.rst
> > +++ b/doc/guides/rel_notes/release_19_08.rst
> > @@ -281,6 +281,12 @@ API Changes
> >     offload flag from the library. The application must set this flag if it is
> >     supported by the platform and application wishes to use it.
> >
> > +* kni: ``rte_kni_pktmbuf_pool_create`` ``rte_kni_pktmbuf_pool_free``
> > +functions
> > +  were introduced for KNI applications for creating & freeing packet pool.
> > +  Since IOVA=VA mode was added in KNI, packet pool's mbuf memory
> > +should be
> > +  physically contiguous for the KNI kernel module to work in IOVA=VA
> > +mode,
> > +  this requirment was taken care in the kni packet pool creation fucntions.
> > +
> >
> >   ABI Changes
> >   -----------
> > diff --git a/examples/kni/main.c b/examples/kni/main.c index
> > 4710d71..3b9c067 100644
> > --- a/examples/kni/main.c
> > +++ b/examples/kni/main.c
> > @@ -975,7 +975,7 @@ main(int argc, char** argv)
> >   		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
> >
> >   	/* Create the mbuf pool */
> > -	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
> > +	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
> >   		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
> >   	if (pktmbuf_pool == NULL) {
> >   		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n"); @@
> > -1043,6 +1043,10 @@ main(int argc, char** argv)
> >   			continue;
> >   		kni_free_kni(port);
> >   	}
> > +
> > +	if (pktmbuf_pool)
> 
> Typically pointer is compared to NULL, but it is not required here anyway, since
> rte_mempool_free() handles NULL perfectly itself.

Ack

> 
> > +		rte_kni_pktmbuf_pool_free(pktmbuf_pool);
> > +
> >   	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
> >   		if (kni_port_params_array[i]) {
> >   			rte_free(kni_port_params_array[i]);
> 
> <...>
> 
> > diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h index
> > 5699a64..7f11927 100644
> > --- a/lib/librte_kni/rte_kni.h
> > +++ b/lib/librte_kni/rte_kni.h
> > @@ -20,6 +20,7 @@
> >   #include <rte_pci.h>
> >   #include <rte_memory.h>
> >   #include <rte_mempool.h>
> > +#include <rte_mbuf_pool_ops.h>
> 
> I don't understand why it is included here.
> 
It is included to fix compilation of rte_kni.c after this patch's changes; it could be included in rte_kni.c instead, though.

> >   #include <rte_ether.h>
> >
> >   #include <rte_kni_common.h>
> >
> 
> <...>


* Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory vattunuru
@ 2019-07-23 11:08                 ` Andrew Rybchenko
  2019-07-23 12:28                   ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-23 11:08 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov, kirankumark

On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Patch adds a routine to populate mempool from page aligned and
> page sized chunks of memory to ensures memory objs do not fall
> across the page boundaries. It's useful for applications that
> require physically contiguous mbuf memory while running in
> IOVA=VA mode.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
>   lib/librte_mempool/rte_mempool.c           | 59 ++++++++++++++++++++++++++++++
>   lib/librte_mempool/rte_mempool.h           | 17 +++++++++
>   lib/librte_mempool/rte_mempool_version.map |  1 +
>   3 files changed, 77 insertions(+)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 7260ce0..5312c8f 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -414,6 +414,65 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   	return ret;
>   }
>   
> +/* Function to populate mempool from page sized mem chunks, allocate page size
> + * of memory in memzone and populate them. Return the number of objects added,
> + * or a negative value on error.
> + */
> +int
> +rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp)
> +{
> +	char mz_name[RTE_MEMZONE_NAMESIZE];
> +	size_t align, pg_sz, pg_shift;
> +	const struct rte_memzone *mz;
> +	unsigned int mz_id, n;
> +	size_t chunk_size;

I think it would be better to keep the min_chunk_size name here.
It would make the code easier to read and understand.

> +	int ret;
> +
> +	ret = mempool_ops_alloc_once(mp);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (mp->nb_mem_chunks != 0)
> +		return -EEXIST;
> +
> +	pg_sz = get_min_page_size(mp->socket_id);
> +	pg_shift = rte_bsf32(pg_sz);
> +
> +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> +
> +		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
> +			     &chunk_size, &align);

It is incorrect to ignore the mempool ops and enforce the default
handler. Use rte_mempool_ops_calc_mem_size().
Also, it is better to treat a negative return value as an error,
as the default function does.
(Maybe it is my mistake that this is not mentioned in the return
value description.)

> +
> +		if (chunk_size > pg_sz)
> +			goto fail;
> +
> +		ret = snprintf(mz_name, sizeof(mz_name),
> +			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
> +		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
> +			ret = -ENAMETOOLONG;
> +			goto fail;
> +		}
> +
> +		mz = rte_memzone_reserve_aligned(mz_name, chunk_size,
> +				mp->socket_id, 0, align);

NULL return value must be handled.
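
For example, a minimal sketch of the missing check (returning -rte_errno
here is just one option):

		mz = rte_memzone_reserve_aligned(mz_name, chunk_size,
				mp->socket_id, 0, align);
		if (mz == NULL) {
			ret = -rte_errno;
			goto fail;
		}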

> +
> +		ret = rte_mempool_populate_iova(mp, mz->addr,
> +				mz->iova, mz->len,
> +				rte_mempool_memchunk_mz_free,
> +				(void *)(uintptr_t)mz);
> +		if (ret < 0) {
> +			rte_memzone_free(mz);
> +			goto fail;
> +		}
> +	}
> +
> +	return mp->size;
> +
> +fail:
> +	rte_mempool_free_memchunks(mp);
> +	return ret;
> +}
> +
>   /* Default function to populate the mempool: allocate memory in memzones,
>    * and populate them. Return the number of objects added, or a negative
>    * value on error.
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 8053f7a..73d6ada 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -1064,6 +1064,23 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   /**
>    * Add memory for objects in the pool at init

The difference from the default must be highlighted in the summary.

>    *
> + * This is the function used to populate the mempool with page aligned memzone
> + * memory. It ensures all mempool objects being on the page by allocating
> + * memzones with page size.
> + *
> + * @param mp
> + *   A pointer to the mempool structure.
> + * @return
> + *   The number of objects added on success.
> + *   On error, the chunk is not added in the memory list of the
> + *   mempool and a negative errno is returned.
> + */
> +__rte_experimental
> +int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp);
> +
> +/**
> + * Add memory for objects in the pool at init
> + *
>    * This is the default function used by rte_mempool_create() to populate
>    * the mempool. It adds memory allocated using rte_memzone_reserve().
>    *
> diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
> index 17cbca4..9a6fe65 100644
> --- a/lib/librte_mempool/rte_mempool_version.map
> +++ b/lib/librte_mempool/rte_mempool_version.map
> @@ -57,4 +57,5 @@ EXPERIMENTAL {
>   	global:
>   
>   	rte_mempool_ops_get_info;
> +	rte_mempool_populate_from_pg_sz_chunks;
>   };



* Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-23 11:08                 ` Andrew Rybchenko
@ 2019-07-23 12:28                   ` Vamsi Krishna Attunuru
  2019-07-23 19:33                     ` Andrew Rybchenko
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-23 12:28 UTC (permalink / raw)
  To: Andrew Rybchenko, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Tuesday, July 23, 2019 4:38 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with
> page sized chunks of memory
> 
> On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Patch adds a routine to populate mempool from page aligned and page
> > sized chunks of memory to ensures memory objs do not fall across the
> > page boundaries. It's useful for applications that require physically
> > contiguous mbuf memory while running in IOVA=VA mode.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > ---
> >   lib/librte_mempool/rte_mempool.c           | 59
> ++++++++++++++++++++++++++++++
> >   lib/librte_mempool/rte_mempool.h           | 17 +++++++++
> >   lib/librte_mempool/rte_mempool_version.map |  1 +
> >   3 files changed, 77 insertions(+)
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index 7260ce0..5312c8f 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -414,6 +414,65 @@ rte_mempool_populate_virt(struct rte_mempool
> *mp, char *addr,
> >   	return ret;
> >   }
> >
> > +/* Function to populate mempool from page sized mem chunks, allocate
> > +page size
> > + * of memory in memzone and populate them. Return the number of
> > +objects added,
> > + * or a negative value on error.
> > + */
> > +int
> > +rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp) {
> > +	char mz_name[RTE_MEMZONE_NAMESIZE];
> > +	size_t align, pg_sz, pg_shift;
> > +	const struct rte_memzone *mz;
> > +	unsigned int mz_id, n;
> > +	size_t chunk_size;
> 
> I think it would be better to keep min_chunk_size name here.
> It would make it easier to read and understand the code.

ack
> 
> > +	int ret;
> > +
> > +	ret = mempool_ops_alloc_once(mp);
> > +	if (ret != 0)
> > +		return ret;
> > +
> > +	if (mp->nb_mem_chunks != 0)
> > +		return -EEXIST;
> > +
> > +	pg_sz = get_min_page_size(mp->socket_id);
> > +	pg_shift = rte_bsf32(pg_sz);
> > +
> > +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> > +
> > +		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
> > +			     &chunk_size, &align);
> 
> It is incorrect to ignore mempool pool ops and enforce default handler. Use
> rte_mempool_ops_calc_mem_size().
> Also it is better to treat negative return value as an error as default function
> does.
> (May be it my mistake in return value description that it is not mentioned).
> 

Yes, I thought so, but ops_calc_mem_size() would in turn call the mempool PMD's calc_mem_size() op, which
may or may not return the required chunk_size and align values in this case. Otherwise it could be skipped completely
and pg_sz used for both the memzone length and alignment; anyway, this page-sized alignment will suit the PMD's specific alignment requirements.
  
> > +
> > +		if (chunk_size > pg_sz)
> > +			goto fail;
> > +
> > +		ret = snprintf(mz_name, sizeof(mz_name),
> > +			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name,
> mz_id);
> > +		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
> > +			ret = -ENAMETOOLONG;
> > +			goto fail;
> > +		}
> > +
> > +		mz = rte_memzone_reserve_aligned(mz_name, chunk_size,
> > +				mp->socket_id, 0, align);
> 
> NULL return value must be handled.
> 
Ack.
> > +
> > +		ret = rte_mempool_populate_iova(mp, mz->addr,
> > +				mz->iova, mz->len,
> > +				rte_mempool_memchunk_mz_free,
> > +				(void *)(uintptr_t)mz);
> > +		if (ret < 0) {
> > +			rte_memzone_free(mz);
> > +			goto fail;
> > +		}
> > +	}
> > +
> > +	return mp->size;
> > +
> > +fail:
> > +	rte_mempool_free_memchunks(mp);
> > +	return ret;
> > +}
> > +
> >   /* Default function to populate the mempool: allocate memory in
> memzones,
> >    * and populate them. Return the number of objects added, or a negative
> >    * value on error.
> > diff --git a/lib/librte_mempool/rte_mempool.h
> > b/lib/librte_mempool/rte_mempool.h
> > index 8053f7a..73d6ada 100644
> > --- a/lib/librte_mempool/rte_mempool.h
> > +++ b/lib/librte_mempool/rte_mempool.h
> > @@ -1064,6 +1064,23 @@ rte_mempool_populate_virt(struct
> rte_mempool *mp, char *addr,
> >   /**
> >    * Add memory for objects in the pool at init
> 
> The different from default must be highlighted in the summary.

Sure, will add more details.
> 
> >    *
> > + * This is the function used to populate the mempool with page
> > +aligned memzone
> > + * memory. It ensures all mempool objects being on the page by
> > +allocating
> > + * memzones with page size.
> > + *
> > + * @param mp
> > + *   A pointer to the mempool structure.
> > + * @return
> > + *   The number of objects added on success.
> > + *   On error, the chunk is not added in the memory list of the
> > + *   mempool and a negative errno is returned.
> > + */
> > +__rte_experimental
> > +int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool
> *mp);
> > +
> > +/**
> > + * Add memory for objects in the pool at init
> > + *
> >    * This is the default function used by rte_mempool_create() to populate
> >    * the mempool. It adds memory allocated using rte_memzone_reserve().
> >    *
> > diff --git a/lib/librte_mempool/rte_mempool_version.map
> > b/lib/librte_mempool/rte_mempool_version.map
> > index 17cbca4..9a6fe65 100644
> > --- a/lib/librte_mempool/rte_mempool_version.map
> > +++ b/lib/librte_mempool/rte_mempool_version.map
> > @@ -57,4 +57,5 @@ EXPERIMENTAL {
> >   	global:
> >
> >   	rte_mempool_ops_get_info;
> > +	rte_mempool_populate_from_pg_sz_chunks;
> >   };



* Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-23 12:28                   ` Vamsi Krishna Attunuru
@ 2019-07-23 19:33                     ` Andrew Rybchenko
  2019-07-24  7:09                       ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-23 19:33 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda

On 7/23/19 3:28 PM, Vamsi Krishna Attunuru wrote:
>> -----Original Message-----
>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>> Sent: Tuesday, July 23, 2019 4:38 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>> olivier.matz@6wind.com; ferruh.yigit@intel.com;
>> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
>> <kirankumark@marvell.com>
>> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with
>> page sized chunks of memory
>>
>> On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> Patch adds a routine to populate mempool from page aligned and page
>>> sized chunks of memory to ensures memory objs do not fall across the
>>> page boundaries. It's useful for applications that require physically
>>> contiguous mbuf memory while running in IOVA=VA mode.
>>>
>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>> ---

<...>

>>> +	int ret;
>>> +
>>> +	ret = mempool_ops_alloc_once(mp);
>>> +	if (ret != 0)
>>> +		return ret;
>>> +
>>> +	if (mp->nb_mem_chunks != 0)
>>> +		return -EEXIST;
>>> +
>>> +	pg_sz = get_min_page_size(mp->socket_id);
>>> +	pg_shift = rte_bsf32(pg_sz);
>>> +
>>> +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
>>> +
>>> +		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
>>> +			     &chunk_size, &align);
>> It is incorrect to ignore mempool pool ops and enforce default handler. Use
>> rte_mempool_ops_calc_mem_size().
>> Also it is better to treat negative return value as an error as default function
>> does.
>> (May be it my mistake in return value description that it is not mentioned).
>>
> Yes, I thought so, but ops_calc_mem_size() would in turn call mempool pmd's calc_mem_size() op which
> may/may not return required chunk_size and align values in this case. Or else it would be skipped completely
> and use pg_sz for both memzone len and align, anyways this  page sized alignment will suits the pmd's specific align requirements.

Anyway, it is incorrect to violate the driver ops; the default is
definitely wrong for bucket.
min_chunk_size and align are mempool driver requirements. You can harden
them, but you should not violate them.



* Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-23 19:33                     ` Andrew Rybchenko
@ 2019-07-24  7:09                       ` Vamsi Krishna Attunuru
  2019-07-24  7:27                         ` Andrew Rybchenko
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-24  7:09 UTC (permalink / raw)
  To: Andrew Rybchenko, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Wednesday, July 24, 2019 1:04 AM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page
> sized chunks of memory
> 
> On 7/23/19 3:28 PM, Vamsi Krishna Attunuru wrote:
> >> -----Original Message-----
> >> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >> Sent: Tuesday, July 23, 2019 4:38 PM
> >> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> >> <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> >> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> >> <kirankumark@marvell.com>
> >> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with
> >> page sized chunks of memory
> >>
> >> On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> >>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>
> >>> Patch adds a routine to populate mempool from page aligned and page
> >>> sized chunks of memory to ensures memory objs do not fall across the
> >>> page boundaries. It's useful for applications that require
> >>> physically contiguous mbuf memory while running in IOVA=VA mode.
> >>>
> >>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >>> ---
> 
> <...>
> 
> >>> +	int ret;
> >>> +
> >>> +	ret = mempool_ops_alloc_once(mp);
> >>> +	if (ret != 0)
> >>> +		return ret;
> >>> +
> >>> +	if (mp->nb_mem_chunks != 0)
> >>> +		return -EEXIST;
> >>> +
> >>> +	pg_sz = get_min_page_size(mp->socket_id);
> >>> +	pg_shift = rte_bsf32(pg_sz);
> >>> +
> >>> +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> >>> +
> >>> +		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
> >>> +			     &chunk_size, &align);
> >> It is incorrect to ignore mempool pool ops and enforce default
> >> handler. Use rte_mempool_ops_calc_mem_size().
> >> Also it is better to treat negative return value as an error as
> >> default function does.
> >> (May be it my mistake in return value description that it is not mentioned).
> >>
> > Yes, I thought so, but ops_calc_mem_size() would in turn call mempool
> > pmd's calc_mem_size() op which may/may not return required chunk_size
> > and align values in this case. Or else it would be skipped completely and use
> pg_sz for both memzone len and align, anyways this  page sized alignment will
> suits the pmd's specific align requirements.
> 
> Anyway it is incorrect to violate driver ops. default is definitely wrong for
> bucket.
> min_chunk_size and align is mempool driver requirements. You can harden it,
> but should not violate it.

Fine, I will modify the routine as below: call the PMD's calc_mem_size() op and overwrite min_chunk_size if it does not suit this function's purpose.

+       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+       if (total_elt_sz > pg_sz)
+               return ret;

+       for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {

-               rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
-                            &chunk_size, &align);
+               ret = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
+                               &min_chunk_size, &align);
+
+               if (ret < 0)
                        goto fail;

+               if (min_chunk_size > pg_sz)
+                       min_chunk_size = pg_sz;

Do the changes look fine?



* Re: [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
                                 ` (4 preceding siblings ...)
  2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 5/5] kni: modify IOVA mode checks to support VA vattunuru
@ 2019-07-24  7:14               ` Vamsi Krishna Attunuru
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
  6 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-24  7:14 UTC (permalink / raw)
  To: ferruh.yigit, olivier.matz, anatoly.burakov
  Cc: thomas, Jerin Jacob Kollanukkaran, arybchenko,
	Kiran Kumar Kokkilagadda, dev, Vamsi Krishna Attunuru

Ping,

Requesting review comments on v8.

> -----Original Message-----
> From: vattunuru@marvell.com <vattunuru@marvell.com>
> Sent: Tuesday, July 23, 2019 11:08 AM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>
> Subject: [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support
> 
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> ---
> V8 Changes:
> * Remove default mempool populate() routine changes.
> * Add kni app specific mempool create & free routines.
> * Add new mempool populate routine to allocate page-aligned memzones with
> page size to make sure all mempool objects reside on a page.
> * Update release notes and map files.
> 
> V7 Changes:
> * Removed previously proposed mempool flag and made those page boundary
> checks default in mempool populate() except for the objects size bigger than the
> size of page.
> * Removed KNI example application related changes since pool related
> requirement is taken care in mempool lib.
> * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> * Added wrapper functions in KNI module to hide IOVA checks and make address
> translation routines more readable.
> * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
> mode is enabled.
> 
> V6 Changes:
> * Added new mempool flag to ensure mbuf memory is not scattered across page
> boundaries.
> * Added KNI kernel module required PCI device information.
> * Modified KNI example application to create mempool with new mempool flag.
> 
> V5 changes:
> * Fixed build issue with 32b build
> 
> V4 changes:
> * Fixed build issues with older kernel versions
> * This approach will only work with kernel above 4.4.0
> 
> V3 Changes:
> * Add new approach to work kni with IOVA=VA mode using
> iommu_iova_to_phys API.
> 
> Kiran Kumar K (1):
>   kni: add IOVA=VA support in KNI module
> 
> Vamsi Attunuru (4):
>   mempool: populate mempool with page sized chunks of memory
>   add IOVA -VA support in KNI lib
>   kni: add app specific mempool create & free routine
>   kni: modify IOVA mode checks to support VA
> 
>  doc/guides/prog_guide/kernel_nic_interface.rst    |   8 ++
>  doc/guides/rel_notes/release_19_08.rst            |   6 ++
>  examples/kni/main.c                               |   6 +-
>  kernel/linux/kni/compat.h                         |   4 +
>  kernel/linux/kni/kni_dev.h                        |   4 +
>  kernel/linux/kni/kni_misc.c                       |  71 +++++++++++++--
>  kernel/linux/kni/kni_net.c                        |  59 ++++++++++---
>  lib/librte_eal/linux/eal/eal.c                    |   4 +-
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |   8 ++
>  lib/librte_kni/Makefile                           |   2 +
>  lib/librte_kni/meson.build                        |   2 +
>  lib/librte_kni/rte_kni.c                          | 100 ++++++++++++++++++++--
>  lib/librte_kni/rte_kni.h                          |  49 +++++++++++
>  lib/librte_kni/rte_kni_version.map                |   2 +
>  lib/librte_mempool/rte_mempool.c                  |  59 +++++++++++++
>  lib/librte_mempool/rte_mempool.h                  |  17 ++++
>  lib/librte_mempool/rte_mempool_version.map        |   1 +
>  17 files changed, 373 insertions(+), 29 deletions(-)
> 
> --
> 2.8.4



* Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-24  7:09                       ` Vamsi Krishna Attunuru
@ 2019-07-24  7:27                         ` Andrew Rybchenko
  2019-07-29  6:25                           ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-24  7:27 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda

On 7/24/19 10:09 AM, Vamsi Krishna Attunuru wrote:
>
>> -----Original Message-----
>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>> Sent: Wednesday, July 24, 2019 1:04 AM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
>> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page
>> sized chunks of memory
>>
>> On 7/23/19 3:28 PM, Vamsi Krishna Attunuru wrote:
>>>> -----Original Message-----
>>>> From: Andrew Rybchenko <arybchenko@solarflare.com>
>>>> Sent: Tuesday, July 23, 2019 4:38 PM
>>>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>>>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
>>>> <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
>>>> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
>>>> <kirankumark@marvell.com>
>>>> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with
>>>> page sized chunks of memory
>>>>
>>>> On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
>>>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>>>
>>>>> Patch adds a routine to populate mempool from page aligned and page
>>>>> sized chunks of memory to ensures memory objs do not fall across the
>>>>> page boundaries. It's useful for applications that require
>>>>> physically contiguous mbuf memory while running in IOVA=VA mode.
>>>>>
>>>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>> ---
>> <...>
>>
>>>>> +	int ret;
>>>>> +
>>>>> +	ret = mempool_ops_alloc_once(mp);
>>>>> +	if (ret != 0)
>>>>> +		return ret;
>>>>> +
>>>>> +	if (mp->nb_mem_chunks != 0)
>>>>> +		return -EEXIST;
>>>>> +
>>>>> +	pg_sz = get_min_page_size(mp->socket_id);
>>>>> +	pg_shift = rte_bsf32(pg_sz);
>>>>> +
>>>>> +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
>>>>> +
>>>>> +		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
>>>>> +			     &chunk_size, &align);
>>>> It is incorrect to ignore mempool pool ops and enforce default
>>>> handler. Use rte_mempool_ops_calc_mem_size().
>>>> Also it is better to treat negative return value as an error as
>>>> default function does.
>>>> (May be it my mistake in return value description that it is not mentioned).
>>>>
>>> Yes, I thought so, but ops_calc_mem_size() would in turn call mempool
>>> pmd's calc_mem_size() op which may/may not return required chunk_size
>>> and align values in this case. Or else it would be skipped completely and use
>> pg_sz for both memzone len and align, anyways this  page sized alignment will
>> suits the pmd's specific align requirements.
>>
>> Anyway it is incorrect to violate driver ops. default is definitely wrong for
>> bucket.
>> min_chunk_size and align is mempool driver requirements. You can harden it,
>> but should not violate it.
> fine, I will modify the routine as below,  call pmd's calc_mem_size() op and over write min_chunk_size if it does not suit for this function's purpose.
>
> +       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> +       if (total_elt_sz > pg_sz)
> +               return ret;
>
> +       for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
>
> -               rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
> -                            &chunk_size, &align);
> +               ret = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
> +                               &min_chunk_size, &align);
> +
> +               if (ret < 0)
>                          goto fail;
>
> +               if (min_chunk_size > pg_sz)
> +                       min_chunk_size = pg_sz;
>
> Changes look fine.?

No, you can't violate the min_chunk_size requirement. If it is unacceptable,
an error should be returned. So you should not check total_elt_sz above
separately.
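
I.e. keep the driver's requirement and reject it when it cannot be met,
roughly like this (sketch only):

		ret = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
				&min_chunk_size, &align);
		/* A chunk bigger than one page cannot be satisfied by this
		 * page-sized populate method, so fail instead of clamping
		 * min_chunk_size.
		 */
		if (ret < 0 || min_chunk_size > pg_sz)
			goto fail;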



* Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory
  2019-07-24  7:27                         ` Andrew Rybchenko
@ 2019-07-29  6:25                           ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-29  6:25 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda, dev

Hi,

> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Wednesday, July 24, 2019 12:58 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page
> sized chunks of memory
> 
> On 7/24/19 10:09 AM, Vamsi Krishna Attunuru wrote:
> >
> >> -----Original Message-----
> >> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >> Sent: Wednesday, July 24, 2019 1:04 AM
> >> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> >> <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> >> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> >> <kirankumark@marvell.com>
> >> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with
> >> page sized chunks of memory
> >>
> >> On 7/23/19 3:28 PM, Vamsi Krishna Attunuru wrote:
> >>>> -----Original Message-----
> >>>> From: Andrew Rybchenko <arybchenko@solarflare.com>
> >>>> Sent: Tuesday, July 23, 2019 4:38 PM
> >>>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >>>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> >>>> <jerinj@marvell.com>; olivier.matz@6wind.com;
> >>>> ferruh.yigit@intel.com; anatoly.burakov@intel.com; Kiran Kumar
> >>>> Kokkilagadda <kirankumark@marvell.com>
> >>>> Subject: Re: [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool
> >>>> with page sized chunks of memory
> >>>>
> >>>> On 7/23/19 8:38 AM, vattunuru@marvell.com wrote:
> >>>>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>>>
> >>>>> Patch adds a routine to populate mempool from page aligned and
> >>>>> page sized chunks of memory to ensures memory objs do not fall
> >>>>> across the page boundaries. It's useful for applications that
> >>>>> require physically contiguous mbuf memory while running in IOVA=VA
> mode.
> >>>>>
> >>>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >>>>> ---
> >> <...>
> >>
> >>>>> +	int ret;
> >>>>> +
> >>>>> +	ret = mempool_ops_alloc_once(mp);
> >>>>> +	if (ret != 0)
> >>>>> +		return ret;
> >>>>> +
> >>>>> +	if (mp->nb_mem_chunks != 0)
> >>>>> +		return -EEXIST;
> >>>>> +
> >>>>> +	pg_sz = get_min_page_size(mp->socket_id);
> >>>>> +	pg_shift = rte_bsf32(pg_sz);
> >>>>> +
> >>>>> +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> >>>>> +
> >>>>> +		rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
> >>>>> +			     &chunk_size, &align);
> >>>> It is incorrect to ignore mempool pool ops and enforce default
> >>>> handler. Use rte_mempool_ops_calc_mem_size().
> >>>> Also it is better to treat negative return value as an error as
> >>>> default function does.
> >>>> (May be it my mistake in return value description that it is not mentioned).
> >>>>
> >>> Yes, I thought so, but ops_calc_mem_size() would in turn call
> >>> mempool pmd's calc_mem_size() op which may/may not return required
> >>> chunk_size and align values in this case. Or else it would be
> >>> skipped completely and use
> >> pg_sz for both memzone len and align, anyways this  page sized
> >> alignment will suits the pmd's specific align requirements.
> >>
> >> Anyway it is incorrect to violate driver ops. default is definitely
> >> wrong for bucket.
> >> min_chunk_size and align is mempool driver requirements. You can
> >> harden it, but should not violate it.
> > fine, I will modify the routine as below,  call pmd's calc_mem_size() op and
> over write min_chunk_size if it does not suit for this function's purpose.
> >
> > +       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > +       if (total_elt_sz > pg_sz)
> > +               return ret;
> >
> > +       for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> >
> > -               rte_mempool_op_calc_mem_size_default(mp, n, pg_shift,
> > -                            &chunk_size, &align);
> > +               ret = rte_mempool_ops_calc_mem_size(mp, n, pg_shift,
> > +                               &min_chunk_size, &align);
> > +
> > +               if (ret < 0)
> >                          goto fail;
> >
> > +               if (min_chunk_size > pg_sz)
> > +                       min_chunk_size = pg_sz;
> >
> > Changes look fine.?
> 
> No, you can't violate min_chunk_size requirement. If it is unacceptable, error
> should be returned. So, you should not check total_elt_sz above separately.

Fine, I will add the min_chunk_size requirement check and return an error if it is not satisfied.
I will send v9 addressing all other comments.
Btw, sorry for the delayed response.




* [dpdk-dev]  [PATCH v9 0/5] kni: add IOVA=VA support
  2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
                                 ` (5 preceding siblings ...)
  2019-07-24  7:14               ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
@ 2019-07-29 12:13               ` vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory vattunuru
                                   ` (4 more replies)
  6 siblings, 5 replies; 251+ messages in thread
From: vattunuru @ 2019-07-29 12:13 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in
the new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.

V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones
with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.

V7 Changes:
* Removed the previously proposed mempool flag and made those page boundary
checks the default in mempool populate(), except when the object size is
bigger than the page size.
* Removed KNI example application related changes since pool related
requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated the IOVA mode checks that were enforcing IOVA=PA mode when IOVA=VA
mode is enabled.

V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered
across page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new
mempool flag.

V5 changes:
* Fixed build issue with 32b build

V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernel above 4.4.0

V3 Changes:
* Add new approach to work kni with IOVA=VA mode using
iommu_iova_to_phys API.

Kiran Kumar K (1):
  kni: add IOVA=VA support in KNI module

Vamsi Attunuru (4):
  mempool: populate mempool with the page sized chunks of memory
  kni: add IOVA=VA support in KNI lib
  kni: add app specific mempool create & free routine
  kni: modify IOVA mode checks to support VA

 doc/guides/prog_guide/kernel_nic_interface.rst    |  8 ++
 doc/guides/rel_notes/release_19_08.rst            |  6 ++
 examples/kni/main.c                               |  5 +-
 kernel/linux/kni/compat.h                         |  4 +
 kernel/linux/kni/kni_dev.h                        |  4 +
 kernel/linux/kni/kni_misc.c                       | 71 ++++++++++++++---
 kernel/linux/kni/kni_net.c                        | 59 ++++++++++----
 lib/librte_eal/linux/eal/eal.c                    |  4 +-
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++
 lib/librte_kni/Makefile                           |  2 +
 lib/librte_kni/meson.build                        |  2 +
 lib/librte_kni/rte_kni.c                          | 95 +++++++++++++++++++++--
 lib/librte_kni/rte_kni.h                          | 48 ++++++++++++
 lib/librte_kni/rte_kni_version.map                |  2 +
 lib/librte_mempool/rte_mempool.c                  | 64 +++++++++++++++
 lib/librte_mempool/rte_mempool.h                  | 17 ++++
 lib/librte_mempool/rte_mempool_version.map        |  1 +
 17 files changed, 371 insertions(+), 29 deletions(-)

-- 
2.8.4



* [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
@ 2019-07-29 12:13                 ` vattunuru
  2019-07-29 12:41                   ` Andrew Rybchenko
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib vattunuru
                                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 251+ messages in thread
From: vattunuru @ 2019-07-29 12:13 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch adds a routine to populate a mempool from page aligned and
page sized chunks of memory to ensure memory objects do not fall
across page boundaries. It's useful for applications that
require physically contiguous mbuf memory while running in
IOVA=VA mode.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_mempool/rte_mempool.c           | 64 ++++++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool.h           | 17 ++++++++
 lib/librte_mempool/rte_mempool_version.map |  1 +
 3 files changed, 82 insertions(+)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7260ce0..00619bd 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,70 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Function to populate mempool from page sized mem chunks, allocate page size
+ * of memory in memzone and populate them. Return the number of objects added,
+ * or a negative value on error.
+ */
+int
+rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp)
+{
+	char mz_name[RTE_MEMZONE_NAMESIZE];
+	size_t align, pg_sz, pg_shift;
+	const struct rte_memzone *mz;
+	unsigned int mz_id, n;
+	size_t min_chunk_size;
+	int ret;
+
+	ret = mempool_ops_alloc_once(mp);
+	if (ret != 0)
+		return ret;
+
+	if (mp->nb_mem_chunks != 0)
+		return -EEXIST;
+
+	pg_sz = get_min_page_size(mp->socket_id);
+	pg_shift = rte_bsf32(pg_sz);
+
+	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
+
+		ret = rte_mempool_ops_calc_mem_size(mp, n,
+				pg_shift, &min_chunk_size, &align);
+
+		if (ret < 0 || min_chunk_size > pg_sz)
+			goto fail;
+
+		ret = snprintf(mz_name, sizeof(mz_name),
+			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
+		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
+			ret = -ENAMETOOLONG;
+			goto fail;
+		}
+
+		mz = rte_memzone_reserve_aligned(mz_name, min_chunk_size,
+				mp->socket_id, 0, align);
+
+		if (mz == NULL) {
+			ret = -rte_errno;
+			goto fail;
+		}
+
+		ret = rte_mempool_populate_iova(mp, mz->addr,
+				mz->iova, mz->len,
+				rte_mempool_memchunk_mz_free,
+				(void *)(uintptr_t)mz);
+		if (ret < 0) {
+			rte_memzone_free(mz);
+			goto fail;
+		}
+	}
+
+	return mp->size;
+
+fail:
+	rte_mempool_free_memchunks(mp);
+	return ret;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a..3046e4f 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1062,6 +1062,23 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	void *opaque);
 
 /**
+ * Add memory from page sized memzones for objects in the pool at init
+ *
+ * This is the function used to populate the mempool with page aligned and
+ * page sized memzone memory to avoid spreading object memory across two pages
+ * and to ensure all mempool objects reside on the page memory.
+ *
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @return
+ *   The number of objects added on success.
+ *   On error, the chunk is not added in the memory list of the
+ *   mempool and a negative errno is returned.
+ */
+__rte_experimental
+int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp);
+
+/**
  * Add memory for objects in the pool at init
  *
  * This is the default function used by rte_mempool_create() to populate
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca4..9a6fe65 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -57,4 +57,5 @@ EXPERIMENTAL {
 	global:
 
 	rte_mempool_ops_get_info;
+	rte_mempool_populate_from_pg_sz_chunks;
 };
-- 
2.8.4



* [dpdk-dev]  [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory vattunuru
@ 2019-07-29 12:13                 ` vattunuru
  2019-07-29 12:24                   ` Igor Ryzhov
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 3/5] kni: add app specific mempool create & free routine vattunuru
                                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-07-29 12:13 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

The current KNI implementation only operates in IOVA=PA mode; this patch
adds the required functionality in the KNI lib to support IOVA=VA mode.

The KNI kernel module requires device info to get the iommu domain related
information for IOVA address translations. The patch defines the device
related info in the rte_kni_device_info structure and passes it
to the kernel KNI module when IOVA=VA mode is enabled.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++++++
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 30 +++++++++++++++++++++++
 4 files changed, 40 insertions(+)

diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 37d9ee8..4fd8a90 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -111,6 +111,13 @@ struct rte_kni_device_info {
 	void * mbuf_va;
 	phys_addr_t mbuf_phys;
 
+	/* PCI info */
+	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+	uint8_t bus;                  /**< Device bus */
+	uint8_t devid;                /**< Device ID */
+	uint8_t function;             /**< Device function. */
+
 	uint16_t group_id;            /**< Group ID */
 	uint32_t core_id;             /**< core ID to bind for kernel thread */
 
@@ -121,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index cbd6599..ab15d10 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_kni.a
 
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
+CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
 
 EXPORT_MAP := rte_kni_version.map
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index 41fa2e3..fd46f87 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -9,3 +9,4 @@ version = 2
 sources = files('rte_kni.c')
 headers = files('rte_kni.h')
 deps += ['ethdev', 'pci']
+includes += include_directories('../../drivers/bus/pci')
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 4b51fb4..2aaaeaa 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -14,6 +14,7 @@
 #include <rte_spinlock.h>
 #include <rte_string_fns.h>
 #include <rte_ethdev.h>
+#include <rte_bus_pci.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_kni.h>
@@ -199,6 +200,27 @@ kni_release_mz(struct rte_kni *kni)
 	rte_memzone_free(kni->m_sync_addr);
 }
 
+static void
+kni_dev_pci_addr_get(uint16_t port_id, struct rte_kni_device_info *kni_dev_info)
+{
+	const struct rte_pci_device *pci_dev;
+	struct rte_eth_dev_info dev_info;
+	const struct rte_bus *bus = NULL;
+
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+		kni_dev_info->bus = pci_dev->addr.bus;
+		kni_dev_info->devid = pci_dev->addr.devid;
+		kni_dev_info->function = pci_dev->addr.function;
+		kni_dev_info->vendor_id = pci_dev->id.vendor_id;
+		kni_dev_info->device_id = pci_dev->id.device_id;
+	}
+}
+
 struct rte_kni *
 rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	      const struct rte_kni_conf *conf,
@@ -247,6 +269,12 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 		kni->ops.port_id = UINT16_MAX;
 
 	memset(&dev_info, 0, sizeof(dev_info));
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		uint16_t port_id = conf->group_id;
+
+		kni_dev_pci_addr_get(port_id, &dev_info);
+	}
 	dev_info.core_id = conf->core_id;
 	dev_info.force_bind = conf->force_bind;
 	dev_info.group_id = conf->group_id;
@@ -300,6 +328,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4



* [dpdk-dev] [PATCH v9 3/5] kni: add app specific mempool create & free routine
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib vattunuru
@ 2019-07-29 12:13                 ` vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 4/5] kni: add IOVA=VA support in KNI module vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 5/5] kni: modify IOVA mode checks to support VA vattunuru
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-29 12:13 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

When KNI operates in IOVA = VA mode, it requires mbuf memory
to be physically contiguous so that the KNI kernel module can
translate IOVA addresses properly. Patch adds a KNI specific
mempool create routine to populate the KNI packet mbuf pool
with memory objects that each reside within a single page.

KNI applications need to use these mempool create & free routines
so that the mbuf related requirements in IOVA = VA mode are handled
inside those routines based on the enabled mode.

Updated the release notes with details of these new routines.
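
For reference, a minimal usage sketch (mirroring the examples/kni/main.c
change below; NB_MBUF, MEMPOOL_CACHE_SZ and MBUF_DATA_SZ are that
example's macros):

	struct rte_mempool *pktmbuf_pool;

	/* Create the packet mbuf pool; it picks the right populate
	 * routine for the enabled IOVA mode.
	 */
	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
			MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
	if (pktmbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");

	/* ... create KNI interfaces and run the datapath ... */

	rte_kni_pktmbuf_pool_free(pktmbuf_pool);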

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 doc/guides/rel_notes/release_19_08.rst |  6 ++++
 examples/kni/main.c                    |  5 ++-
 lib/librte_kni/Makefile                |  1 +
 lib/librte_kni/meson.build             |  1 +
 lib/librte_kni/rte_kni.c               | 60 ++++++++++++++++++++++++++++++++++
 lib/librte_kni/rte_kni.h               | 48 +++++++++++++++++++++++++++
 lib/librte_kni/rte_kni_version.map     |  2 ++
 7 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_19_08.rst b/doc/guides/rel_notes/release_19_08.rst
index c9bd3ce..b200aae 100644
--- a/doc/guides/rel_notes/release_19_08.rst
+++ b/doc/guides/rel_notes/release_19_08.rst
@@ -301,6 +301,12 @@ API Changes
   best-effort tc, and qsize field of struct ``rte_sched_port_params`` is
   changed to allow different size of the each queue.
 
+* kni: ``rte_kni_pktmbuf_pool_create`` ``rte_kni_pktmbuf_pool_free`` functions
+  were introduced for KNI applications for creating & freeing packet pool.
+  Since IOVA=VA mode was added in KNI, packet pool's mbuf memory should be
+  physically contiguous for the KNI kernel module to work in IOVA=VA mode,
+  this requirement is taken care of in the kni packet pool creation functions.
+
 
 ABI Changes
 -----------
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 4710d71..fdfeed2 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -975,7 +975,7 @@ main(int argc, char** argv)
 		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
 
 	/* Create the mbuf pool */
-	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
+	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
 		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
 	if (pktmbuf_pool == NULL) {
 		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
@@ -1043,6 +1043,9 @@ main(int argc, char** argv)
 			continue;
 		kni_free_kni(port);
 	}
+
+	rte_kni_pktmbuf_pool_free(pktmbuf_pool);
+
 	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
 		if (kni_port_params_array[i]) {
 			rte_free(kni_port_params_array[i]);
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index ab15d10..5e3dd01 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -6,6 +6,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 # library name
 LIB = librte_kni.a
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
 CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index fd46f87..e357445 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+allow_experimental_apis = true
 if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
 	build = false
 	reason = 'only supported on 64-bit linux'
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 2aaaeaa..15dda45 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -22,6 +22,7 @@
 #include <rte_tailq.h>
 #include <rte_rwlock.h>
 #include <rte_eal_memconfig.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_kni_common.h>
 #include "rte_kni_fifo.h"
 
@@ -681,6 +682,65 @@ kni_allocate_mbufs(struct rte_kni *kni)
 	}
 }
 
+struct rte_mempool *
+rte_kni_pktmbuf_pool_create(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
+	int socket_id)
+{
+	struct rte_pktmbuf_pool_private mbp_priv;
+	const char *mp_ops_name;
+	struct rte_mempool *mp;
+	unsigned int elt_size;
+	int ret;
+
+	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
+		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
+			priv_size);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size +
+		(unsigned int)data_room_size;
+	mbp_priv.mbuf_data_room_size = data_room_size;
+	mbp_priv.mbuf_priv_size = priv_size;
+
+	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
+		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
+	if (mp == NULL)
+		return NULL;
+
+	mp_ops_name = rte_mbuf_best_mempool_ops();
+	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+	rte_pktmbuf_pool_init(mp, &mbp_priv);
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		ret = rte_mempool_populate_from_pg_sz_chunks(mp);
+	else
+		ret = rte_mempool_populate_default(mp);
+
+	if (ret < 0) {
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+
+	rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
+
+	return mp;
+}
+
+void
+rte_kni_pktmbuf_pool_free(struct rte_mempool *mp)
+{
+	rte_mempool_free(mp);
+}
+
 struct rte_kni *
 rte_kni_get(const char *name)
 {
diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h
index 5699a64..99d263d 100644
--- a/lib/librte_kni/rte_kni.h
+++ b/lib/librte_kni/rte_kni.h
@@ -184,6 +184,54 @@ unsigned rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs,
 		unsigned num);
 
 /**
+ * Create a kni packet mbuf pool.
+ *
+ * This function creates and initializes a packet mbuf pool for KNI applications.
+ * It calls the required mempool populate routine based on the IOVA mode.
+ *
+ * @param name
+ *   The name of the mbuf pool.
+ * @param n
+ *   The number of elements in the mbuf pool. The optimum size (in terms
+ *   of memory usage) for a mempool is when n is a power of two minus one:
+ *   n = (2^q - 1).
+ * @param cache_size
+ *   Size of the per-core object cache. See rte_mempool_create() for
+ *   details.
+ * @param priv_size
+ *   Size of application private area between the rte_mbuf structure
+ *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
+ * @param data_room_size
+ *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
+ * @param socket_id
+ *   The socket identifier where the memory should be allocated. The
+ *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
+ *   reserved zone.
+ * @return
+ *   The pointer to the new allocated mempool, on success. NULL on error
+ *   with rte_errno set appropriately. Possible rte_errno values include:
+ *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ *    - E_RTE_SECONDARY - function was called from a secondary process instance
+ *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
+ *    - ENOSPC - the maximum number of memzones has already been allocated
+ *    - EEXIST - a memzone with the same name already exists
+ *    - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+__rte_experimental
+struct rte_mempool *rte_kni_pktmbuf_pool_create(const char *name,
+		unsigned int n, unsigned int cache_size, uint16_t priv_size,
+		uint16_t data_room_size, int socket_id);
+
+/**
+ * Free the given packet mempool.
+ *
+ * @param mp
+ *  The mempool pointer.
+ */
+__rte_experimental
+void rte_kni_pktmbuf_pool_free(struct rte_mempool *mp);
+
+/**
  * Get the KNI context of its name.
  *
  * @param name
diff --git a/lib/librte_kni/rte_kni_version.map b/lib/librte_kni/rte_kni_version.map
index c877dc6..aba9728 100644
--- a/lib/librte_kni/rte_kni_version.map
+++ b/lib/librte_kni/rte_kni_version.map
@@ -20,4 +20,6 @@ EXPERIMENTAL {
 	global:
 
 	rte_kni_update_link;
+	rte_kni_pktmbuf_pool_create;
+	rte_kni_pktmbuf_pool_free;
 };
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v9 4/5] kni: add IOVA=VA support in KNI module
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
                                   ` (2 preceding siblings ...)
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 3/5] kni: add app specific mempool create & free routine vattunuru
@ 2019-07-29 12:13                 ` vattunuru
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 5/5] kni: modify IOVA mode checks to support VA vattunuru
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-29 12:13 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Kiran Kumar K <kirankumark@marvell.com>

Patch adds support for the kernel module to work in IOVA = VA mode.
The idea is to get the physical address from the IOVA address using
the iommu_iova_to_phys API and then use the phys_to_virt API to
convert the physical address to a kernel virtual address.
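
In short, the fast-path translation looks like the following sketch (the
helper name here is illustrative; the actual helpers added by this patch
are iova2kva()/iova2data_kva() in kni_net.c):

	/* Assumes "domain" was obtained once per device via
	 * iommu_get_domain_for_dev(), as done in kni_ioctl_create().
	 */
	static inline void *
	kni_iova_to_kva(struct iommu_domain *domain, dma_addr_t iova)
	{
		phys_addr_t pa = iommu_iova_to_phys(domain, iova);

		return phys_to_virt(pa);
	}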

When compared with IOVA = PA mode, there is no performance
drop with this approach.

This approach does not work with kernel versions older
than 4.4.0 because of API compatibility issues.

Patch also updates the KNI documentation with these support details.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 kernel/linux/kni/compat.h   |  4 +++
 kernel/linux/kni/kni_dev.h  |  4 +++
 kernel/linux/kni/kni_misc.c | 71 +++++++++++++++++++++++++++++++++++++++------
 kernel/linux/kni/kni_net.c  | 59 ++++++++++++++++++++++++++++---------
 4 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..ee997a6 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,7 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 4, 0) <= LINUX_VERSION_CODE
+#define HAVE_IOVA_AS_VA_SUPPORT
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..d5898f3 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -25,6 +25,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 
 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -41,6 +42,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 2b75502..8660205 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -295,6 +295,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 
 	pr_info("Creating kni...\n");
 	/* Check the buffer size, to avoid warning */
@@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_AS_VA_SUPPORT
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		if (pci == NULL) {
+			pr_err("pci dev does not exist\n");
+			return -ENODEV;
+		}
+
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+
+		if (domain == NULL) {
+			pr_err("Failed to get pci dev domain info\n");
+			return -ENODEV;
+		}
+#else
+		pr_err("Kernel version does not support IOVA as VA\n");
+		return -EINVAL;
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index 7bd3a9f..8382859 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,21 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+			   (uintptr_t)m->buf_physaddr) + m->data_off);
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +77,24 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +211,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +299,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +371,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +470,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +482,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +550,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +583,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v9 5/5] kni: modify IOVA mode checks to support VA
  2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
                                   ` (3 preceding siblings ...)
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 4/5] kni: add IOVA=VA support in KNI module vattunuru
@ 2019-07-29 12:13                 ` vattunuru
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-07-29 12:13 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch updates the checks in KNI and EAL that force IOVA=PA mode when
IOVA=VA is requested, since the KNI kernel module supports VA mode
for kernel versions >= 4.4.0.

The KNI documentation is updated with the above details.
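
As an illustration (not part of this patch), an application can check which
IOVA mode was finally selected once rte_eal_init() has returned:

	#include <stdio.h>
	#include <rte_eal.h>

	/* Call after a successful rte_eal_init(argc, argv). */
	static void
	report_iova_mode(void)
	{
		if (rte_eal_iova_mode() == RTE_IOVA_VA)
			printf("KNI will use IOVA=VA translations\n");
		else
			printf("KNI will use IOVA=PA translations\n");
	}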

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst | 8 ++++++++
 lib/librte_eal/linux/eal/eal.c                 | 4 +++-
 lib/librte_kni/rte_kni.c                       | 5 -----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 38369b3..fd2ce63 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -291,6 +291,14 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI can be operated in IOVA as VA scheme when the following criteria are fulfilled:
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
+- eal param `--iova-mode va` is passed or bus IOVA scheme is set to RTE_IOVA_VA
+
 Ethtool
 -------
 
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 34db787..428425d 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1068,12 +1068,14 @@ rte_eal_init(int argc, char **argv)
 		/* Workaround for KNI which requires physical address to work */
 		if (iova_mode == RTE_IOVA_VA &&
 				rte_eal_check_module("rte_kni") == 1) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
 			if (phys_addrs) {
 				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
+				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
 			} else {
 				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
 			}
+#endif
 		}
 #endif
 		rte_eal_get_configuration()->iova_mode = iova_mode;
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 15dda45..c77d76f 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -99,11 +99,6 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
-	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
-		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
-		return -1;
-	}
-
 	/* Check FD and open */
 	if (kni_fd < 0) {
 		kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib vattunuru
@ 2019-07-29 12:24                   ` Igor Ryzhov
  2019-07-29 13:22                     ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Igor Ryzhov @ 2019-07-29 12:24 UTC (permalink / raw)
  To: vattunuru
  Cc: dev, Thomas Monjalon, jerinj, Olivier Matz, Ferruh Yigit,
	anatoly.burakov, Andrew Rybchenko, kirankumark

Hi,

I believe iova_mode check should not be automatic and should be a part
of rte_kni_conf.
What if I want to use KNI just as a pure virtual device without any
connection to a real PCI device in an application that works in VA
mode?

Best regards,
Igor

On Mon, Jul 29, 2019 at 3:14 PM <vattunuru@marvell.com> wrote:
>
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Current KNI implementation only operates in IOVA=PA mode, patch adds
> required functionality in KNI lib to support IOVA=VA mode.
>
> KNI kernel module requires device info to get iommu domain related
> information for IOVA addr related translations. Patch defines device
> related info in rte_kni_device_info structure and passes device info
> to the kernel KNI module when IOVA=VA mode is enabled.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++++++
>  lib/librte_kni/Makefile                           |  1 +
>  lib/librte_kni/meson.build                        |  1 +
>  lib/librte_kni/rte_kni.c                          | 30 +++++++++++++++++++++++
>  4 files changed, 40 insertions(+)
>
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 37d9ee8..4fd8a90 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>         void * mbuf_va;
>         phys_addr_t mbuf_phys;
>
> +       /* PCI info */
> +       uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> +       uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> +       uint8_t bus;                  /**< Device bus */
> +       uint8_t devid;                /**< Device ID */
> +       uint8_t function;             /**< Device function. */
> +
>         uint16_t group_id;            /**< Group ID */
>         uint32_t core_id;             /**< core ID to bind for kernel thread */
>
> @@ -121,6 +128,7 @@ struct rte_kni_device_info {
>         unsigned mbuf_size;
>         unsigned int mtu;
>         uint8_t mac_addr[6];
> +       uint8_t iova_mode;
>  };
>
>  #define KNI_DEVICE "kni"
> diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
> index cbd6599..ab15d10 100644
> --- a/lib/librte_kni/Makefile
> +++ b/lib/librte_kni/Makefile
> @@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  LIB = librte_kni.a
>
>  CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
> +CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
>  LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
>
>  EXPORT_MAP := rte_kni_version.map
> diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
> index 41fa2e3..fd46f87 100644
> --- a/lib/librte_kni/meson.build
> +++ b/lib/librte_kni/meson.build
> @@ -9,3 +9,4 @@ version = 2
>  sources = files('rte_kni.c')
>  headers = files('rte_kni.h')
>  deps += ['ethdev', 'pci']
> +includes += include_directories('../../drivers/bus/pci')
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index 4b51fb4..2aaaeaa 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -14,6 +14,7 @@
>  #include <rte_spinlock.h>
>  #include <rte_string_fns.h>
>  #include <rte_ethdev.h>
> +#include <rte_bus_pci.h>
>  #include <rte_malloc.h>
>  #include <rte_log.h>
>  #include <rte_kni.h>
> @@ -199,6 +200,27 @@ kni_release_mz(struct rte_kni *kni)
>         rte_memzone_free(kni->m_sync_addr);
>  }
>
> +static void
> +kni_dev_pci_addr_get(uint16_t port_id, struct rte_kni_device_info *kni_dev_info)
> +{
> +       const struct rte_pci_device *pci_dev;
> +       struct rte_eth_dev_info dev_info;
> +       const struct rte_bus *bus = NULL;
> +
> +       rte_eth_dev_info_get(port_id, &dev_info);
> +
> +       if (dev_info.device)
> +               bus = rte_bus_find_by_device(dev_info.device);
> +       if (bus && !strcmp(bus->name, "pci")) {
> +               pci_dev = RTE_DEV_TO_PCI(dev_info.device);
> +               kni_dev_info->bus = pci_dev->addr.bus;
> +               kni_dev_info->devid = pci_dev->addr.devid;
> +               kni_dev_info->function = pci_dev->addr.function;
> +               kni_dev_info->vendor_id = pci_dev->id.vendor_id;
> +               kni_dev_info->device_id = pci_dev->id.device_id;
> +       }
> +}
> +
>  struct rte_kni *
>  rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>               const struct rte_kni_conf *conf,
> @@ -247,6 +269,12 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>                 kni->ops.port_id = UINT16_MAX;
>
>         memset(&dev_info, 0, sizeof(dev_info));
> +
> +       if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +               uint16_t port_id = conf->group_id;
> +
> +               kni_dev_pci_addr_get(port_id, &dev_info);
> +       }
>         dev_info.core_id = conf->core_id;
>         dev_info.force_bind = conf->force_bind;
>         dev_info.group_id = conf->group_id;
> @@ -300,6 +328,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>         kni->group_id = conf->group_id;
>         kni->mbuf_size = conf->mbuf_size;
>
> +       dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> +
>         ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
>         if (ret < 0)
>                 goto ioctl_fail;
> --
> 2.8.4
>

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory vattunuru
@ 2019-07-29 12:41                   ` Andrew Rybchenko
  2019-07-29 13:33                     ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
  1 sibling, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-07-29 12:41 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov, kirankumark

On 7/29/19 3:13 PM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Patch adds a routine to populate mempool from page aligned and
> page sized chunks of memory to ensure memory objs do not fall
> across the page boundaries. It's useful for applications that
> require physically contiguous mbuf memory while running in
> IOVA=VA mode.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>

When the two issues below are fixed:
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>

As I understand, this is likely to be a temporary solution until the problem
is fixed in a generic way.

> ---
>   lib/librte_mempool/rte_mempool.c           | 64 ++++++++++++++++++++++++++++++
>   lib/librte_mempool/rte_mempool.h           | 17 ++++++++
>   lib/librte_mempool/rte_mempool_version.map |  1 +
>   3 files changed, 82 insertions(+)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 7260ce0..00619bd 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -414,6 +414,70 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   	return ret;
>   }
>   
> +/* Function to populate mempool from page sized mem chunks, allocate page size
> + * of memory in memzone and populate them. Return the number of objects added,
> + * or a negative value on error.
> + */
> +int
> +rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp)
> +{
> +	char mz_name[RTE_MEMZONE_NAMESIZE];
> +	size_t align, pg_sz, pg_shift;
> +	const struct rte_memzone *mz;
> +	unsigned int mz_id, n;
> +	size_t min_chunk_size;
> +	int ret;
> +
> +	ret = mempool_ops_alloc_once(mp);
> +	if (ret != 0)
> +		return ret;
> +
> +	if (mp->nb_mem_chunks != 0)
> +		return -EEXIST;
> +
> +	pg_sz = get_min_page_size(mp->socket_id);
> +	pg_shift = rte_bsf32(pg_sz);
> +
> +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> +
> +		ret = rte_mempool_ops_calc_mem_size(mp, n,
> +				pg_shift, &min_chunk_size, &align);
> +
> +		if (ret < 0 || min_chunk_size > pg_sz)

If min_chunk_size is greater than pg_sz, ret is not negative and the
function returns success.
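
One way to address this (a sketch of a possible fix, not necessarily the
change that was eventually submitted) is to force an error code in that
branch:

	if (ret < 0)
		goto fail;

	if (min_chunk_size > pg_sz) {
		ret = -EINVAL;
		goto fail;
	}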

> +			goto fail;
> +
> +		ret = snprintf(mz_name, sizeof(mz_name),
> +			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
> +		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
> +			ret = -ENAMETOOLONG;
> +			goto fail;
> +		}
> +
> +		mz = rte_memzone_reserve_aligned(mz_name, min_chunk_size,
> +				mp->socket_id, 0, align);
> +
> +		if (mz == NULL) {
> +			ret = -rte_errno;
> +			goto fail;
> +		}
> +
> +		ret = rte_mempool_populate_iova(mp, mz->addr,
> +				mz->iova, mz->len,
> +				rte_mempool_memchunk_mz_free,
> +				(void *)(uintptr_t)mz);
> +		if (ret < 0) {
> +			rte_memzone_free(mz);
> +			goto fail;
> +		}
> +	}
> +
> +	return mp->size;
> +
> +fail:
> +	rte_mempool_free_memchunks(mp);
> +	return ret;
> +}
> +
>   /* Default function to populate the mempool: allocate memory in memzones,
>    * and populate them. Return the number of objects added, or a negative
>    * value on error.
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 8053f7a..3046e4f 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -1062,6 +1062,23 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   	void *opaque);
>   
>   /**

  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice.

is missing

> + * Add memory from page sized memzones for objects in the pool at init
> + *
> + * This is the function used to populate the mempool with page aligned and
> + * page sized memzone memory to avoid spreading object memory across two pages
> + * and to ensure all mempool objects reside on the page memory.
> + *
> + * @param mp
> + *   A pointer to the mempool structure.
> + * @return
> + *   The number of objects added on success.
> + *   On error, the chunk is not added in the memory list of the
> + *   mempool and a negative errno is returned.
> + */
> +__rte_experimental
> +int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp);
> +
> +/**
>    * Add memory for objects in the pool at init
>    *
>    * This is the default function used by rte_mempool_create() to populate
> diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
> index 17cbca4..9a6fe65 100644
> --- a/lib/librte_mempool/rte_mempool_version.map
> +++ b/lib/librte_mempool/rte_mempool_version.map
> @@ -57,4 +57,5 @@ EXPERIMENTAL {
>   	global:
>   
>   	rte_mempool_ops_get_info;
> +	rte_mempool_populate_from_pg_sz_chunks;
>   };


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib
  2019-07-29 12:24                   ` Igor Ryzhov
@ 2019-07-29 13:22                     ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-29 13:22 UTC (permalink / raw)
  To: Igor Ryzhov, ferruh.yigit
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran, Olivier Matz,
	Ferruh Yigit, anatoly.burakov, Andrew Rybchenko,
	Kiran Kumar Kokkilagadda, David Marchand



> -----Original Message-----
> From: Igor Ryzhov <iryzhov@nfware.com>
> Sent: Monday, July 29, 2019 5:55 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; Olivier Matz
> <olivier.matz@6wind.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> anatoly.burakov@intel.com; Andrew Rybchenko
> <arybchenko@solarflare.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: [EXT] Re: [dpdk-dev] [PATCH v9 2/5] kni: add IOVA=VA support in
> KNI lib
> 
> External Email
> 
> ----------------------------------------------------------------------
> Hi,
> 
> I believe iova_mode check should not be automatic and should be a part of
> rte_kni_conf.

Changing the mode through rte_kni_conf would be too late, since the environment is already set to VA/PA mode during eal_init itself and cannot be switched back based on the KNI config.

> What if I want to use KNI just as a pure virtual device without any connection
> to a real PCI device in an application that works in VA mode?

For the above use case, it will fail because there is no corresponding iommu domain present; without one, the kernel KNI module cannot translate the memory that the application (working in VA mode) provides to it.

One possible way is to select iova=pa mode explicitly through the EAL command line option when a vdev (not backed by a PCI device) is present.
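
For example (illustrative command line; the application and vdev names are
arbitrary):

	./kni-app -l 0-3 -n 4 --iova-mode pa --vdev net_tap0 -- <application args>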

Another option would be to add a mechanism to detect the presence of a vdev (not backed by a PCI device) before setting the environment's IOVA mode in eal_init(), and accordingly change the mode to PA if both a vdev and the KNI module are present.

@Ferruh, any other thoughts?

BTW, what is the plan for these KNI patches?

> 
> Best regards,
> Igor
> 
> On Mon, Jul 29, 2019 at 3:14 PM <vattunuru@marvell.com> wrote:
> >
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Current KNI implementation only operates in IOVA=PA mode, patch adds
> > required functionality in KNI lib to support IOVA=VA mode.
> >
> > KNI kernel module requires device info to get iommu domain related
> > information for IOVA addr related translations. Patch defines device
> > related info in rte_kni_device_info structure and passes device info
> > to the kernel KNI module when IOVA=VA mode is enabled.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > ---
> >  lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++++++
> >  lib/librte_kni/Makefile                           |  1 +
> >  lib/librte_kni/meson.build                        |  1 +
> >  lib/librte_kni/rte_kni.c                          | 30 +++++++++++++++++++++++
> >  4 files changed, 40 insertions(+)
> >
> > diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > index 37d9ee8..4fd8a90 100644
> > --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> > @@ -111,6 +111,13 @@ struct rte_kni_device_info {
> >         void * mbuf_va;
> >         phys_addr_t mbuf_phys;
> >
> > +       /* PCI info */
> > +       uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> > +       uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> > +       uint8_t bus;                  /**< Device bus */
> > +       uint8_t devid;                /**< Device ID */
> > +       uint8_t function;             /**< Device function. */
> > +
> >         uint16_t group_id;            /**< Group ID */
> >         uint32_t core_id;             /**< core ID to bind for kernel thread */
> >
> > @@ -121,6 +128,7 @@ struct rte_kni_device_info {
> >         unsigned mbuf_size;
> >         unsigned int mtu;
> >         uint8_t mac_addr[6];
> > +       uint8_t iova_mode;
> >  };
> >
> >  #define KNI_DEVICE "kni"
> > diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile index
> > cbd6599..ab15d10 100644
> > --- a/lib/librte_kni/Makefile
> > +++ b/lib/librte_kni/Makefile
> > @@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk  LIB = librte_kni.a
> >
> >  CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
> > +CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
> >  LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
> >
> >  EXPORT_MAP := rte_kni_version.map
> > diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
> > index 41fa2e3..fd46f87 100644
> > --- a/lib/librte_kni/meson.build
> > +++ b/lib/librte_kni/meson.build
> > @@ -9,3 +9,4 @@ version = 2
> >  sources = files('rte_kni.c')
> >  headers = files('rte_kni.h')
> >  deps += ['ethdev', 'pci']
> > +includes += include_directories('../../drivers/bus/pci')
> > diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c index
> > 4b51fb4..2aaaeaa 100644
> > --- a/lib/librte_kni/rte_kni.c
> > +++ b/lib/librte_kni/rte_kni.c
> > @@ -14,6 +14,7 @@
> >  #include <rte_spinlock.h>
> >  #include <rte_string_fns.h>
> >  #include <rte_ethdev.h>
> > +#include <rte_bus_pci.h>
> >  #include <rte_malloc.h>
> >  #include <rte_log.h>
> >  #include <rte_kni.h>
> > @@ -199,6 +200,27 @@ kni_release_mz(struct rte_kni *kni)
> >         rte_memzone_free(kni->m_sync_addr);
> >  }
> >
> > +static void
> > +kni_dev_pci_addr_get(uint16_t port_id, struct rte_kni_device_info
> > +*kni_dev_info) {
> > +       const struct rte_pci_device *pci_dev;
> > +       struct rte_eth_dev_info dev_info;
> > +       const struct rte_bus *bus = NULL;
> > +
> > +       rte_eth_dev_info_get(port_id, &dev_info);
> > +
> > +       if (dev_info.device)
> > +               bus = rte_bus_find_by_device(dev_info.device);
> > +       if (bus && !strcmp(bus->name, "pci")) {
> > +               pci_dev = RTE_DEV_TO_PCI(dev_info.device);
> > +               kni_dev_info->bus = pci_dev->addr.bus;
> > +               kni_dev_info->devid = pci_dev->addr.devid;
> > +               kni_dev_info->function = pci_dev->addr.function;
> > +               kni_dev_info->vendor_id = pci_dev->id.vendor_id;
> > +               kni_dev_info->device_id = pci_dev->id.device_id;
> > +       }
> > +}
> > +
> >  struct rte_kni *
> >  rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
> >               const struct rte_kni_conf *conf, @@ -247,6 +269,12 @@
> > rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
> >                 kni->ops.port_id = UINT16_MAX;
> >
> >         memset(&dev_info, 0, sizeof(dev_info));
> > +
> > +       if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> > +               uint16_t port_id = conf->group_id;
> > +
> > +               kni_dev_pci_addr_get(port_id, &dev_info);
> > +       }
> >         dev_info.core_id = conf->core_id;
> >         dev_info.force_bind = conf->force_bind;
> >         dev_info.group_id = conf->group_id; @@ -300,6 +328,8 @@
> > rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
> >         kni->group_id = conf->group_id;
> >         kni->mbuf_size = conf->mbuf_size;
> >
> > +       dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1
> > + : 0;
> > +
> >         ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
> >         if (ret < 0)
> >                 goto ioctl_fail;
> > --
> > 2.8.4
> >

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory
  2019-07-29 12:41                   ` Andrew Rybchenko
@ 2019-07-29 13:33                     ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-07-29 13:33 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, Kiran Kumar Kokkilagadda, dev



> -----Original Message-----
> From: Andrew Rybchenko <arybchenko@solarflare.com>
> Sent: Monday, July 29, 2019 6:12 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: [EXT] Re: [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool
> with the page sized chunks of memory
> 
> External Email
> 
> ----------------------------------------------------------------------
> On 7/29/19 3:13 PM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Patch adds a routine to populate mempool from page aligned and page
> > sized chunks of memory to ensure memory objs do not fall across the
> > page boundaries. It's useful for applications that require physically
> > contiguous mbuf memory while running in IOVA=VA mode.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> 
> When two below issues fixed:
> Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
> 
> As I understand it is likely to be a temporary solution until the problem is
> fixed in a generic way.
> 
> > ---
> >   lib/librte_mempool/rte_mempool.c           | 64
> ++++++++++++++++++++++++++++++
> >   lib/librte_mempool/rte_mempool.h           | 17 ++++++++
> >   lib/librte_mempool/rte_mempool_version.map |  1 +
> >   3 files changed, 82 insertions(+)
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index 7260ce0..00619bd 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -414,6 +414,70 @@ rte_mempool_populate_virt(struct rte_mempool
> *mp, char *addr,
> >   	return ret;
> >   }
> >
> > +/* Function to populate mempool from page sized mem chunks, allocate
> > +page size
> > + * of memory in memzone and populate them. Return the number of
> > +objects added,
> > + * or a negative value on error.
> > + */
> > +int
> > +rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp) {
> > +	char mz_name[RTE_MEMZONE_NAMESIZE];
> > +	size_t align, pg_sz, pg_shift;
> > +	const struct rte_memzone *mz;
> > +	unsigned int mz_id, n;
> > +	size_t min_chunk_size;
> > +	int ret;
> > +
> > +	ret = mempool_ops_alloc_once(mp);
> > +	if (ret != 0)
> > +		return ret;
> > +
> > +	if (mp->nb_mem_chunks != 0)
> > +		return -EEXIST;
> > +
> > +	pg_sz = get_min_page_size(mp->socket_id);
> > +	pg_shift = rte_bsf32(pg_sz);
> > +
> > +	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> > +
> > +		ret = rte_mempool_ops_calc_mem_size(mp, n,
> > +				pg_shift, &min_chunk_size, &align);
> > +
> > +		if (ret < 0 || min_chunk_size > pg_sz)
> 
> If min_chunk_size is greater than pg_sz, ret is 0 and function returns success.

Ack, will fix it in next version.

> 
> > +			goto fail;
> > +
> > +		ret = snprintf(mz_name, sizeof(mz_name),
> > +			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name,
> mz_id);
> > +		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
> > +			ret = -ENAMETOOLONG;
> > +			goto fail;
> > +		}
> > +
> > +		mz = rte_memzone_reserve_aligned(mz_name,
> min_chunk_size,
> > +				mp->socket_id, 0, align);
> > +
> > +		if (mz == NULL) {
> > +			ret = -rte_errno;
> > +			goto fail;
> > +		}
> > +
> > +		ret = rte_mempool_populate_iova(mp, mz->addr,
> > +				mz->iova, mz->len,
> > +				rte_mempool_memchunk_mz_free,
> > +				(void *)(uintptr_t)mz);
> > +		if (ret < 0) {
> > +			rte_memzone_free(mz);
> > +			goto fail;
> > +		}
> > +	}
> > +
> > +	return mp->size;
> > +
> > +fail:
> > +	rte_mempool_free_memchunks(mp);
> > +	return ret;
> > +}
> > +
> >   /* Default function to populate the mempool: allocate memory in
> memzones,
> >    * and populate them. Return the number of objects added, or a negative
> >    * value on error.
> > diff --git a/lib/librte_mempool/rte_mempool.h
> > b/lib/librte_mempool/rte_mempool.h
> > index 8053f7a..3046e4f 100644
> > --- a/lib/librte_mempool/rte_mempool.h
> > +++ b/lib/librte_mempool/rte_mempool.h
> > @@ -1062,6 +1062,23 @@ rte_mempool_populate_virt(struct
> rte_mempool *mp, char *addr,
> >   	void *opaque);
> >
> >   /**
> 
>   * @warning
>   * @b EXPERIMENTAL: this API may change without prior notice.
> 
> is missing

Ack

> 
> > + * Add memory from page sized memzones for objects in the pool at
> > +init
> > + *
> > + * This is the function used to populate the mempool with page
> > +aligned and
> > + * page sized memzone memory to avoid spreading object memory across
> > +two pages
> > + * and to ensure all mempool objects reside on the page memory.
> > + *
> > + * @param mp
> > + *   A pointer to the mempool structure.
> > + * @return
> > + *   The number of objects added on success.
> > + *   On error, the chunk is not added in the memory list of the
> > + *   mempool and a negative errno is returned.
> > + */
> > +__rte_experimental
> > +int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool
> *mp);
> > +
> > +/**
> >    * Add memory for objects in the pool at init
> >    *
> >    * This is the default function used by rte_mempool_create() to
> > populate diff --git a/lib/librte_mempool/rte_mempool_version.map
> > b/lib/librte_mempool/rte_mempool_version.map
> > index 17cbca4..9a6fe65 100644
> > --- a/lib/librte_mempool/rte_mempool_version.map
> > +++ b/lib/librte_mempool/rte_mempool_version.map
> > @@ -57,4 +57,5 @@ EXPERIMENTAL {
> >   	global:
> >
> >   	rte_mempool_ops_get_info;
> > +	rte_mempool_populate_from_pg_sz_chunks;
> >   };


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] ***Spam*** [RFC 0/4] mempool: avoid objects allocations across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (4 preceding siblings ...)
  2019-07-23  5:37                         ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations " Vamsi Krishna Attunuru
@ 2019-08-07 15:21                         ` Andrew Rybchenko
  2019-10-28 14:06                           ` Olivier Matz
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
                                           ` (3 subsequent siblings)
  9 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-08-07 15:21 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Thomas Monjalon, Anatoly Burakov, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 7/19/19 4:38 PM, Olivier Matz wrote:
> When IOVA mode is VA, a mempool can be created with objects that
> are not physically contiguous, which breaks KNI.
>
> To solve this, this patchset changes the default behavior of mempool
> populate function, to prevent objects from being located across pages.

I'll provide top-level review notes on individual patches, but what
I don't understand in general is why we add a rule to respect
page boundaries in every case, even when it is not absolutely required.
It may add holes. Can it have a negative impact on performance?

I think that KNI VA-mode requirements are very specific.
It is VA-mode, but page boundaries should be respected even
if VA is contiguous.

> Olivier Matz (4):
>    mempool: clarify default populate function
>    mempool: unalign size when calculating required mem amount
>    mempool: introduce function to get mempool page size
>    mempool: prevent objects from being across pages
>
>   lib/librte_mempool/rte_mempool.c             | 106 +++++++++++----------------
>   lib/librte_mempool/rte_mempool.h             |   8 +-
>   lib/librte_mempool/rte_mempool_ops.c         |   4 +-
>   lib/librte_mempool/rte_mempool_ops_default.c |  39 +++++++++-
>   4 files changed, 90 insertions(+), 67 deletions(-)
>
> ---
>
> Hi,
>
>> @Olivier,
>> Any suggestions..?
> I took some time to go a bit deeper. I still think we can change the
> default behavior to avoid objects to be located accross pages. But
> it is more complex that I expected.
>
> I made a draft patchset, that, in short:
> - cleans/renames variables
> - removes the optimistic full iova contiguous allocation
> - changes return value of calc_mem_size to return the unaligned size,
>    therefore the allocation is smaller in case of big hugepages
> - changes rte_mempool_op_populate_default() to prevent allocation
>    of objects accross multiple pages
>
> Andrew, Anatoly, did I miss something?
> Vamsi, can you check if it solves your issue?
>
> Anyway, even if validate the patchset it and make it work, I'm afraid
> this is not something that could go in 19.08.
>
> The only alternative I see is a specific mempool allocation function
> when used in iova=va mode + kni, as you proposed previously.
>
> It can probably be implemented without adding a flag, starting from
> rte_mempool_create(), and replacing rte_mempool_populate_default(mp) by
> something else: allocate pages one by one, and call
> rte_mempool_populate_iova() for each of them.
>
> Hope it helps. Unfortunately, I may not have too much time to spend on
> it in the coming days.
>
> Regards,
> Olivier


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] ***Spam*** [RFC 2/4] mempool: unalign size when calculating required mem amount
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 2/4] mempool: unalign size when calculating required mem amount Olivier Matz
@ 2019-08-07 15:21                           ` Andrew Rybchenko
  2019-10-28 14:06                             ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-08-07 15:21 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Thomas Monjalon, Anatoly Burakov, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 7/19/19 4:38 PM, Olivier Matz wrote:
> The size returned by rte_mempool_op_calc_mem_size_default() is aligned
> to the specified page size. This means that with big pages, the returned
> amount is more that what we really need to populate the mempool.
>
> This problem is tempered by the allocation method of
> rte_mempool_populate_default(): in some conditions (when
> try_iova_contig_mempool=true), it first tries to allocate all objs
> memory in an iova contiguous area, without the alignment constraint. If
> it fails, it fallbacks to the big aligned allocation, that can also
> fallback into several smaller allocations.
>
> This commit changes rte_mempool_op_calc_mem_size_default() to return the
> unaligned amount of memory (the alignment constraint is still returned
> via the *align argument),

It does not, as far as I can see. There are no changes in
lib/librte_mempool/rte_mempool_ops_default.c.

> and removes the optimistic contiguous
> allocation done when try_iova_contig_mempool=true.

Unfortunately I don't understand why these 2 changes are combined
into one patch. The first looks good to me, the second is
questionable. It can impact performance.

> This will make the amount of allocated memory more predictible: it will
> be more than the optimistic contiguous allocation, but less than the big
> aligned allocation.
>
> This opens the door for the next commits that will try to prevent objets
> from being located across pages.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   lib/librte_mempool/rte_mempool.c     | 44 ++++--------------------------------
>   lib/librte_mempool/rte_mempool.h     |  2 +-
>   lib/librte_mempool/rte_mempool_ops.c |  4 +++-
>   3 files changed, 9 insertions(+), 41 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 0f29e8712..335032dc8 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -430,7 +430,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	unsigned mz_id, n;
>   	int ret;
>   	bool need_iova_contig_obj;
> -	bool try_iova_contig_mempool;
>   	bool alloc_in_ext_mem;
>   
>   	ret = mempool_ops_alloc_once(mp);
> @@ -477,18 +476,10 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	 * wasting some space this way, but it's much nicer than looping around
>   	 * trying to reserve each and every page size.
>   	 *
> -	 * However, since size calculation will produce page-aligned sizes, it
> -	 * makes sense to first try and see if we can reserve the entire memzone
> -	 * in one contiguous chunk as well (otherwise we might end up wasting a
> -	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
> -	 * memory, then we'll go and reserve space page-by-page.
> -	 *
>   	 * We also have to take into account the fact that memory that we're
>   	 * going to allocate from can belong to an externally allocated memory
>   	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold. We should also try
> -	 * to go for contiguous memory even if we're in no-huge mode, because
> -	 * external memory may in fact be IOVA-contiguous.
> +	 * synonymous with IOVA contiguousness will not hold.
>   	 */
>   
>   	/* check if we can retrieve a valid socket ID */
> @@ -497,7 +488,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		return -EINVAL;
>   	alloc_in_ext_mem = (ret == 1);
>   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> -	try_iova_contig_mempool = false;
>   
>   	if (!need_iova_contig_obj) {
>   		pg_sz = 0;
> @@ -506,7 +496,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		pg_sz = 0;
>   		pg_shift = 0;
>   	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
> -		try_iova_contig_mempool = true;
>   		pg_sz = get_min_page_size(mp->socket_id);
>   		pg_shift = rte_bsf32(pg_sz);
>   	} else {
> @@ -518,12 +507,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		size_t min_chunk_size;
>   		unsigned int flags;
>   
> -		if (try_iova_contig_mempool || pg_sz == 0)
> -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> -					0, &min_chunk_size, &align);
> -		else
> -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> -					pg_shift, &min_chunk_size, &align);
> +		mem_size = rte_mempool_ops_calc_mem_size(
> +			mp, n, pg_shift, &min_chunk_size, &align);
>   
>   		if (mem_size < 0) {
>   			ret = mem_size;
> @@ -542,31 +527,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		/* if we're trying to reserve contiguous memory, add appropriate
>   		 * memzone flag.
>   		 */
> -		if (try_iova_contig_mempool)
> +		if (min_chunk_size == (size_t)mem_size)
>   			flags |= RTE_MEMZONE_IOVA_CONTIG;
>   
>   		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>   				mp->socket_id, flags, align);
>   
> -		/* if we were trying to allocate contiguous memory, failed and
> -		 * minimum required contiguous chunk fits minimum page, adjust
> -		 * memzone size to the page size, and try again.
> -		 */
> -		if (mz == NULL && try_iova_contig_mempool &&
> -				min_chunk_size <= pg_sz) {
> -			try_iova_contig_mempool = false;
> -			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
> -
> -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> -					pg_shift, &min_chunk_size, &align);
> -			if (mem_size < 0) {
> -				ret = mem_size;
> -				goto fail;
> -			}
> -
> -			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> -				mp->socket_id, flags, align);
> -		}
>   		/* don't try reserving with 0 size if we were asked to reserve
>   		 * IOVA-contiguous memory.
>   		 */
> @@ -594,7 +560,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		else
>   			iova = RTE_BAD_IOVA;
>   
> -		if (try_iova_contig_mempool || pg_sz == 0)
> +		if (pg_sz == 0)
>   			ret = rte_mempool_populate_iova(mp, mz->addr,
>   				iova, mz->len,
>   				rte_mempool_memchunk_mz_free,
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 8053f7a04..7bc10e699 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
>    * @param[out] align
>    *   Location for required memory chunk alignment.
>    * @return
> - *   Required memory size aligned at page boundary.
> + *   Required memory size.
>    */
>   typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
>   		uint32_t obj_num,  uint32_t pg_shift,
> diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
> index e02eb702c..22c5251eb 100644
> --- a/lib/librte_mempool/rte_mempool_ops.c
> +++ b/lib/librte_mempool/rte_mempool_ops.c
> @@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
>   	return ops->get_count(mp);
>   }
>   
> -/* wrapper to notify new memory area to external mempool */
> +/* wrapper to calculate the memory size required to store given number
> + * of objects
> + */
>   ssize_t
>   rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
>   				uint32_t obj_num, uint32_t pg_shift,


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-08-07 15:21                           ` Andrew Rybchenko
  2019-10-28 14:06                             ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-08-07 15:21 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Thomas Monjalon, Anatoly Burakov, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 7/19/19 4:38 PM, Olivier Matz wrote:
> In rte_mempool_populate_default(), we determine the page size,
> which is needed for calc_size and allocation of memory.
> 
> Move this in a function and export it, it will be used in next
> commit.

The major change here is taking page sizes into account even in the
case of RTE_IOVA_VA. As I understand it, the main problem initially
was that we should respect page boundaries even in the RTE_IOVA_VA case.
It looks like we can have that even without removing the contiguous
allocation attempt.

> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   lib/librte_mempool/rte_mempool.c | 50 +++++++++++++++++++++++++---------------
>   lib/librte_mempool/rte_mempool.h |  6 +++++
>   2 files changed, 37 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 335032dc8..7def0ba68 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -414,6 +414,32 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   	return ret;
>   }
>   
> +int
> +rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
> +{
> +	bool need_iova_contig_obj;
> +	bool alloc_in_ext_mem;
> +	int ret;
> +
> +	/* check if we can retrieve a valid socket ID */
> +	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
> +	if (ret < 0)
> +		return -EINVAL;
> +	alloc_in_ext_mem = (ret == 1);
> +	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> +
> +	if (!need_iova_contig_obj)
> +		*pg_sz = 0;
> +	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> +		*pg_sz = get_min_page_size(mp->socket_id);
> +	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> +		*pg_sz = get_min_page_size(mp->socket_id);
> +	else
> +		*pg_sz = getpagesize();
> +
> +	return 0;
> +}
> +
>   /* Default function to populate the mempool: allocate memory in memzones,
>    * and populate them. Return the number of objects added, or a negative
>    * value on error.
> @@ -425,12 +451,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	char mz_name[RTE_MEMZONE_NAMESIZE];
>   	const struct rte_memzone *mz;
>   	ssize_t mem_size;
> -	size_t align, pg_sz, pg_shift;
> +	size_t align, pg_sz, pg_shift = 0;
>   	rte_iova_t iova;
>   	unsigned mz_id, n;
>   	int ret;
>   	bool need_iova_contig_obj;
> -	bool alloc_in_ext_mem;
>   
>   	ret = mempool_ops_alloc_once(mp);
>   	if (ret != 0)
> @@ -482,26 +507,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	 * synonymous with IOVA contiguousness will not hold.
>   	 */
>   
> -	/* check if we can retrieve a valid socket ID */
> -	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
> -	if (ret < 0)
> -		return -EINVAL;
> -	alloc_in_ext_mem = (ret == 1);
>   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> +	if (ret < 0)
> +		return ret;
>   
> -	if (!need_iova_contig_obj) {
> -		pg_sz = 0;
> -		pg_shift = 0;
> -	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
> -		pg_sz = 0;
> -		pg_shift = 0;
> -	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
> -		pg_sz = get_min_page_size(mp->socket_id);
> -		pg_shift = rte_bsf32(pg_sz);
> -	} else {
> -		pg_sz = getpagesize();
> +	if (pg_sz != 0)
>   		pg_shift = rte_bsf32(pg_sz);
> -	}
>   
>   	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
>   		size_t min_chunk_size;
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 7bc10e699..00b927989 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -1692,6 +1692,12 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
>   void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
>   		      void *arg);
>   
> +/**
> + * @internal Get page size used for mempool object allocation.
> + */
> +int
> +rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
> +
>   #ifdef __cplusplus
>   }
>   #endif
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-07-19 13:38                         ` [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages Olivier Matz
  2019-07-19 14:03                           ` Burakov, Anatoly
  2019-07-19 14:11                           ` Burakov, Anatoly
@ 2019-08-07 15:21                           ` Andrew Rybchenko
  2019-10-28 14:07                             ` Olivier Matz
  2 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-08-07 15:21 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru, dev
  Cc: Thomas Monjalon, Anatoly Burakov, Jerin Jacob Kollanukkaran,
	Kokkilagadda, Ferruh Yigit

On 7/19/19 4:38 PM, Olivier Matz wrote:
> When using IOVA-contiguous memory and objects smaller than the page size,
> ensure that objects are not located across several pages.

It looks like an attempt to turn an exception into a generic rule, and
I think it is not a good idea.

mempool has a notion of IOVA contiguous/non-contiguous objects
depending on whether PA or VA is used. rte_mempool_op_populate_default()
gets a memory chunk which is contiguous in VA and, if the IOVA is valid,
IOVA-contiguous as well. The patch always enforces page boundaries even
when it is not required. For example, if the memory chunk is IOVA_PA
contiguous, the patch could result in holes and extra memory usage.
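
As a rough illustration of the extra memory usage mentioned above
(hypothetical page and object sizes, not taken from the patch):

#include <stdio.h>

int main(void)
{
	const unsigned long pg_sz = 4096;  /* assumed page size */
	const unsigned long elt_sz = 2560; /* assumed object size */
	unsigned long per_page = pg_sz / elt_sz;          /* 1 object */
	unsigned long wasted = pg_sz - per_page * elt_sz; /* 1536 bytes */

	/* With page boundaries always enforced, 1536 of every 4096 bytes
	 * (37.5%) stay unused even when the chunk is already
	 * IOVA-contiguous and the hole is unnecessary.
	 */
	printf("%lu objs/page, %lu bytes wasted per page\n",
	       per_page, wasted);
	return 0;
}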

> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   lib/librte_mempool/rte_mempool_ops_default.c | 39 ++++++++++++++++++++++++++--
>   1 file changed, 37 insertions(+), 2 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index 4e2bfc82d..2bbd67367 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -45,19 +45,54 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   	return mem_size;
>   }
>   
> +/* Returns -1 if object falls on a page boundary, else returns 0 */
> +static inline int
> +mempool_check_obj_bounds(void *obj, uint64_t pg_sz, size_t elt_sz)
> +{
> +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> +	uint32_t pg_shift;
> +	uint64_t page_mask;
> +
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +
> +	pg_shift = rte_bsf32(pg_sz);
> +	page_mask =  ~((1ull << pg_shift) - 1);
> +	page_end = (elt_addr & page_mask) + pg_sz;
> +
> +	if (elt_addr + elt_sz > page_end)
> +		return -1;
> +
> +	return 0;
> +}
> +
>   int
>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
>   		void *vaddr, rte_iova_t iova, size_t len,
>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>   {
> -	size_t total_elt_sz;
> +	size_t total_elt_sz, pg_sz;
>   	size_t off;
>   	unsigned int i;
>   	void *obj;
>   
> +	rte_mempool_get_page_size(mp, &pg_sz);
> +
>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* align offset to next page start if required */
> +		if (mempool_check_obj_bounds((char *)vaddr + off,
> +						pg_sz, total_elt_sz) < 0) {
> +			off += RTE_PTR_ALIGN_CEIL((char *)vaddr + off, pg_sz) -
> +				((char *)vaddr + off);
> +		}
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>   		off += mp->header_size;
>   		obj = (char *)vaddr + off;
>   		obj_cb(mp, obj_cb_arg, obj,


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v10 0/5] kni: add IOVA=VA support
  2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory vattunuru
  2019-07-29 12:41                   ` Andrew Rybchenko
@ 2019-08-16  6:12                   ` vattunuru
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks vattunuru
                                       ` (7 more replies)
  1 sibling, 8 replies; 251+ messages in thread
From: vattunuru @ 2019-08-16  6:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V10 Changes:
* Fixed function return code on failure when min_chunk_size > pg_sz.
* Marked new mempool populate routine as EXPERIMENTAL.

V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in
the new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.

V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones
with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.

V7 Changes:
* Removed previously proposed mempool flag and made those page boundary
checks default in mempool populate() except for the objects size bigger
than the size of page.
* Removed KNI example application related changes since pool related
requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA mode
is enabled.

V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered
across page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new
mempool flag.

V5 changes:
* Fixed build issue with 32b build

V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernel above 4.4.0

V3 Changes:
* Add new approach to work kni with IOVA=VA mode using
iommu_iova_to_phys API.

Kiran Kumar K (1):
  kni: add IOVA=VA support in KNI module

Vamsi Attunuru (4):
  mempool: populate mempool with the page sized chunks
  kni: add IOVA=VA support in KNI lib
  kni: add app specific mempool create and free routines
  kni: modify IOVA mode checks to support VA

 doc/guides/prog_guide/kernel_nic_interface.rst    |  8 ++
 doc/guides/rel_notes/release_19_11.rst            |  5 ++
 examples/kni/main.c                               |  5 +-
 kernel/linux/kni/compat.h                         |  4 +
 kernel/linux/kni/kni_dev.h                        |  4 +
 kernel/linux/kni/kni_misc.c                       | 71 ++++++++++++++---
 kernel/linux/kni/kni_net.c                        | 59 ++++++++++----
 lib/librte_eal/linux/eal/eal.c                    |  4 +-
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++
 lib/librte_kni/Makefile                           |  2 +
 lib/librte_kni/meson.build                        |  2 +
 lib/librte_kni/rte_kni.c                          | 95 +++++++++++++++++++++--
 lib/librte_kni/rte_kni.h                          | 48 ++++++++++++
 lib/librte_kni/rte_kni_version.map                |  2 +
 lib/librte_mempool/rte_mempool.c                  | 69 ++++++++++++++++
 lib/librte_mempool/rte_mempool.h                  | 20 +++++
 lib/librte_mempool/rte_mempool_version.map        |  1 +
 17 files changed, 378 insertions(+), 29 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
@ 2019-08-16  6:12                     ` vattunuru
  2019-10-08  9:26                       ` Olivier Matz
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 2/5] kni: add IOVA=VA support in KNI lib vattunuru
                                       ` (6 subsequent siblings)
  7 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-08-16  6:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

This patch adds a routine to populate the mempool from page-aligned,
page-sized chunks of memory, to ensure memory objects do not fall
across page boundaries. It is useful for applications that require
physically contiguous mbuf memory while running in IOVA=VA mode.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_mempool/rte_mempool.c           | 69 ++++++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool.h           | 20 +++++++++
 lib/librte_mempool/rte_mempool_version.map |  1 +
 3 files changed, 90 insertions(+)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7260ce0..0683750 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,75 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Function to populate mempool from page sized mem chunks, allocate page size
+ * of memory in memzone and populate them. Return the number of objects added,
+ * or a negative value on error.
+ */
+int
+rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp)
+{
+	char mz_name[RTE_MEMZONE_NAMESIZE];
+	size_t align, pg_sz, pg_shift;
+	const struct rte_memzone *mz;
+	unsigned int mz_id, n;
+	size_t min_chunk_size;
+	int ret;
+
+	ret = mempool_ops_alloc_once(mp);
+	if (ret != 0)
+		return ret;
+
+	if (mp->nb_mem_chunks != 0)
+		return -EEXIST;
+
+	pg_sz = get_min_page_size(mp->socket_id);
+	pg_shift = rte_bsf32(pg_sz);
+
+	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
+
+		ret = rte_mempool_ops_calc_mem_size(mp, n,
+				pg_shift, &min_chunk_size, &align);
+
+		if (ret < 0)
+			goto fail;
+
+		if (min_chunk_size > pg_sz) {
+			ret = -EINVAL;
+			goto fail;
+		}
+
+		ret = snprintf(mz_name, sizeof(mz_name),
+			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
+		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
+			ret = -ENAMETOOLONG;
+			goto fail;
+		}
+
+		mz = rte_memzone_reserve_aligned(mz_name, min_chunk_size,
+				mp->socket_id, 0, align);
+
+		if (mz == NULL) {
+			ret = -rte_errno;
+			goto fail;
+		}
+
+		ret = rte_mempool_populate_iova(mp, mz->addr,
+				mz->iova, mz->len,
+				rte_mempool_memchunk_mz_free,
+				(void *)(uintptr_t)mz);
+		if (ret < 0) {
+			rte_memzone_free(mz);
+			goto fail;
+		}
+	}
+
+	return mp->size;
+
+fail:
+	rte_mempool_free_memchunks(mp);
+	return ret;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a..2f5126e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1062,6 +1062,26 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	void *opaque);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Add memory from page-sized memzones for objects in the pool at init
+ *
+ * This is the function used to populate the mempool with page-aligned and
+ * page-sized memzone memory, to avoid spreading object memory across two
+ * pages and to ensure that each mempool object resides within a single page.
+ *
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @return
+ *   The number of objects added on success.
+ *   On error, the chunk is not added in the memory list of the
+ *   mempool and a negative errno is returned.
+ */
+__rte_experimental
+int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp);
+
+/**
  * Add memory for objects in the pool at init
  *
  * This is the default function used by rte_mempool_create() to populate
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca4..9a6fe65 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -57,4 +57,5 @@ EXPERIMENTAL {
 	global:
 
 	rte_mempool_ops_get_info;
+	rte_mempool_populate_from_pg_sz_chunks;
 };
-- 
2.8.4
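
For context, a minimal usage sketch of the new (experimental) populate
routine; the pool name, object count and cache size below are
placeholders, and error handling is reduced to returning NULL:

#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>
#include <rte_mbuf_pool_ops.h>
#include <rte_mempool.h>

static struct rte_mempool *
create_page_backed_pktmbuf_pool(void)
{
	struct rte_mempool *mp;
	int ret;

	mp = rte_mempool_create_empty("page_pool", 8192,
			RTE_MBUF_DEFAULT_BUF_SIZE + sizeof(struct rte_mbuf),
			256, sizeof(struct rte_pktmbuf_pool_private),
			rte_socket_id(), 0);
	if (mp == NULL)
		return NULL;

	ret = rte_mempool_set_ops_byname(mp, rte_mbuf_best_mempool_ops(), NULL);
	if (ret != 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	/* NULL lets rte_pktmbuf_pool_init() derive the data room size
	 * from the element size; no mbuf private area is used.
	 */
	rte_pktmbuf_pool_init(mp, NULL);

	/* In IOVA=VA mode use page-sized chunks so that no mbuf crosses
	 * a page boundary; otherwise the default populate is sufficient.
	 */
	if (rte_eal_iova_mode() == RTE_IOVA_VA)
		ret = rte_mempool_populate_from_pg_sz_chunks(mp);
	else
		ret = rte_mempool_populate_default(mp);
	if (ret < 0) {
		rte_mempool_free(mp);
		return NULL;
	}

	rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
	return mp;
}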


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v10 2/5] kni: add IOVA=VA support in KNI lib
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks vattunuru
@ 2019-08-16  6:12                     ` vattunuru
  2019-10-15 15:36                       ` Yigit, Ferruh
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 3/5] kni: add app specific mempool create and free routines vattunuru
                                       ` (5 subsequent siblings)
  7 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-08-16  6:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

The current KNI implementation only operates in IOVA=PA mode. This patch
adds the required functionality in the KNI lib to support IOVA=VA mode.

The KNI kernel module requires device info to obtain the iommu domain
information needed for IOVA address translations. The patch adds the
device-related fields to the rte_kni_device_info structure and passes
the device info to the kernel KNI module when IOVA=VA mode is enabled.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++++++
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 30 +++++++++++++++++++++++
 4 files changed, 40 insertions(+)

diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 37d9ee8..4fd8a90 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -111,6 +111,13 @@ struct rte_kni_device_info {
 	void * mbuf_va;
 	phys_addr_t mbuf_phys;
 
+	/* PCI info */
+	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+	uint8_t bus;                  /**< Device bus */
+	uint8_t devid;                /**< Device ID */
+	uint8_t function;             /**< Device function. */
+
 	uint16_t group_id;            /**< Group ID */
 	uint32_t core_id;             /**< core ID to bind for kernel thread */
 
@@ -121,6 +128,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index cbd6599..ab15d10 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 LIB = librte_kni.a
 
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
+CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
 
 EXPORT_MAP := rte_kni_version.map
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index 41fa2e3..fd46f87 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -9,3 +9,4 @@ version = 2
 sources = files('rte_kni.c')
 headers = files('rte_kni.h')
 deps += ['ethdev', 'pci']
+includes += include_directories('../../drivers/bus/pci')
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 4b51fb4..2aaaeaa 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -14,6 +14,7 @@
 #include <rte_spinlock.h>
 #include <rte_string_fns.h>
 #include <rte_ethdev.h>
+#include <rte_bus_pci.h>
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_kni.h>
@@ -199,6 +200,27 @@ kni_release_mz(struct rte_kni *kni)
 	rte_memzone_free(kni->m_sync_addr);
 }
 
+static void
+kni_dev_pci_addr_get(uint16_t port_id, struct rte_kni_device_info *kni_dev_info)
+{
+	const struct rte_pci_device *pci_dev;
+	struct rte_eth_dev_info dev_info;
+	const struct rte_bus *bus = NULL;
+
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	if (dev_info.device)
+		bus = rte_bus_find_by_device(dev_info.device);
+	if (bus && !strcmp(bus->name, "pci")) {
+		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
+		kni_dev_info->bus = pci_dev->addr.bus;
+		kni_dev_info->devid = pci_dev->addr.devid;
+		kni_dev_info->function = pci_dev->addr.function;
+		kni_dev_info->vendor_id = pci_dev->id.vendor_id;
+		kni_dev_info->device_id = pci_dev->id.device_id;
+	}
+}
+
 struct rte_kni *
 rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	      const struct rte_kni_conf *conf,
@@ -247,6 +269,12 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 		kni->ops.port_id = UINT16_MAX;
 
 	memset(&dev_info, 0, sizeof(dev_info));
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+		uint16_t port_id = conf->group_id;
+
+		kni_dev_pci_addr_get(port_id, &dev_info);
+	}
 	dev_info.core_id = conf->core_id;
 	dev_info.force_bind = conf->force_bind;
 	dev_info.group_id = conf->group_id;
@@ -300,6 +328,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v10 3/5] kni: add app specific mempool create and free routines
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks vattunuru
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 2/5] kni: add IOVA=VA support in KNI lib vattunuru
@ 2019-08-16  6:12                     ` vattunuru
  2019-10-15 15:40                       ` Yigit, Ferruh
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module vattunuru
                                       ` (4 subsequent siblings)
  7 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-08-16  6:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

When KNI operates in IOVA = VA mode, it requires mbuf memory
to be physically contiguous so that the KNI kernel module can
translate IOVA addresses properly. This patch adds a KNI-specific
mempool create routine that populates the KNI packet mbuf pool
with memory objects that each reside on a single page.

KNI applications need to use these mempool create & free routines
so that the mbuf-related requirements of IOVA = VA mode are handled
inside those routines based on the enabled mode.

Updated the release notes with details of these new routines.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 doc/guides/rel_notes/release_19_11.rst |  5 +++
 examples/kni/main.c                    |  5 ++-
 lib/librte_kni/Makefile                |  1 +
 lib/librte_kni/meson.build             |  1 +
 lib/librte_kni/rte_kni.c               | 60 ++++++++++++++++++++++++++++++++++
 lib/librte_kni/rte_kni.h               | 48 +++++++++++++++++++++++++++
 lib/librte_kni/rte_kni_version.map     |  2 ++
 7 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 8490d89..8813a10 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -85,6 +85,11 @@ API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* kni: ``rte_kni_pktmbuf_pool_create`` and ``rte_kni_pktmbuf_pool_free`` functions
+  were introduced for KNI applications for creating & freeing packet pools.
+  Since IOVA=VA mode was added in KNI, the packet pool's mbuf memory should be
+  physically contiguous for the KNI kernel module to work in IOVA=VA mode;
+  this requirement is handled in the KNI packet pool creation functions.
 
 ABI Changes
 -----------
diff --git a/examples/kni/main.c b/examples/kni/main.c
index 4710d71..fdfeed2 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -975,7 +975,7 @@ main(int argc, char** argv)
 		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
 
 	/* Create the mbuf pool */
-	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
+	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
 		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
 	if (pktmbuf_pool == NULL) {
 		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
@@ -1043,6 +1043,9 @@ main(int argc, char** argv)
 			continue;
 		kni_free_kni(port);
 	}
+
+	rte_kni_pktmbuf_pool_free(pktmbuf_pool);
+
 	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
 		if (kni_port_params_array[i]) {
 			rte_free(kni_port_params_array[i]);
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index ab15d10..5e3dd01 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -6,6 +6,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 # library name
 LIB = librte_kni.a
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
 CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index fd46f87..e357445 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+allow_experimental_apis = true
 if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
 	build = false
 	reason = 'only supported on 64-bit linux'
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 2aaaeaa..15dda45 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -22,6 +22,7 @@
 #include <rte_tailq.h>
 #include <rte_rwlock.h>
 #include <rte_eal_memconfig.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_kni_common.h>
 #include "rte_kni_fifo.h"
 
@@ -681,6 +682,65 @@ kni_allocate_mbufs(struct rte_kni *kni)
 	}
 }
 
+struct rte_mempool *
+rte_kni_pktmbuf_pool_create(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
+	int socket_id)
+{
+	struct rte_pktmbuf_pool_private mbp_priv;
+	const char *mp_ops_name;
+	struct rte_mempool *mp;
+	unsigned int elt_size;
+	int ret;
+
+	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
+		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
+			priv_size);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size +
+		(unsigned int)data_room_size;
+	mbp_priv.mbuf_data_room_size = data_room_size;
+	mbp_priv.mbuf_priv_size = priv_size;
+
+	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
+		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
+	if (mp == NULL)
+		return NULL;
+
+	mp_ops_name = rte_mbuf_best_mempool_ops();
+	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+	rte_pktmbuf_pool_init(mp, &mbp_priv);
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		ret = rte_mempool_populate_from_pg_sz_chunks(mp);
+	else
+		ret = rte_mempool_populate_default(mp);
+
+	if (ret < 0) {
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+
+	rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
+
+	return mp;
+}
+
+void
+rte_kni_pktmbuf_pool_free(struct rte_mempool *mp)
+{
+	rte_mempool_free(mp);
+}
+
 struct rte_kni *
 rte_kni_get(const char *name)
 {
diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h
index 5699a64..99d263d 100644
--- a/lib/librte_kni/rte_kni.h
+++ b/lib/librte_kni/rte_kni.h
@@ -184,6 +184,54 @@ unsigned rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs,
 		unsigned num);
 
 /**
+ * Create a kni packet mbuf pool.
+ *
+ * This function creates and initializes a packet mbuf pool for KNI
+ * applications. It calls the required mempool populate routine based on
+ * the IOVA mode.
+ *
+ * @param name
+ *   The name of the mbuf pool.
+ * @param n
+ *   The number of elements in the mbuf pool. The optimum size (in terms
+ *   of memory usage) for a mempool is when n is a power of two minus one:
+ *   n = (2^q - 1).
+ * @param cache_size
+ *   Size of the per-core object cache. See rte_mempool_create() for
+ *   details.
+ * @param priv_size
+ *   Size of application private area between the rte_mbuf structure
+ *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
+ * @param data_room_size
+ *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
+ * @param socket_id
+ *   The socket identifier where the memory should be allocated. The
+ *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
+ *   reserved zone.
+ * @return
+ *   The pointer to the new allocated mempool, on success. NULL on error
+ *   with rte_errno set appropriately. Possible rte_errno values include:
+ *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ *    - E_RTE_SECONDARY - function was called from a secondary process instance
+ *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
+ *    - ENOSPC - the maximum number of memzones has already been allocated
+ *    - EEXIST - a memzone with the same name already exists
+ *    - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+__rte_experimental
+struct rte_mempool *rte_kni_pktmbuf_pool_create(const char *name,
+		unsigned int n, unsigned int cache_size, uint16_t priv_size,
+		uint16_t data_room_size, int socket_id);
+
+/**
+ * Free the given packet mempool.
+ *
+ * @param mp
+ *  The mempool pointer.
+ */
+__rte_experimental
+void rte_kni_pktmbuf_pool_free(struct rte_mempool *mp);
+
+/**
  * Get the KNI context of its name.
  *
  * @param name
diff --git a/lib/librte_kni/rte_kni_version.map b/lib/librte_kni/rte_kni_version.map
index c877dc6..aba9728 100644
--- a/lib/librte_kni/rte_kni_version.map
+++ b/lib/librte_kni/rte_kni_version.map
@@ -20,4 +20,6 @@ EXPERIMENTAL {
 	global:
 
 	rte_kni_update_link;
+	rte_kni_pktmbuf_pool_create;
+	rte_kni_pktmbuf_pool_free;
 };
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
                                       ` (2 preceding siblings ...)
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 3/5] kni: add app specific mempool create and free routines vattunuru
@ 2019-08-16  6:12                     ` vattunuru
  2019-10-15 15:43                       ` Yigit, Ferruh
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 5/5] kni: modify IOVA mode checks to support VA vattunuru
                                       ` (3 subsequent siblings)
  7 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-08-16  6:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Kiran Kumar K <kirankumark@marvell.com>

This patch adds support for the kernel module to work in IOVA = VA mode.
The idea is to get the physical address from the IOVA address using the
iommu_iova_to_phys API and then use the phys_to_virt API to
convert the physical address to a kernel virtual address.

When compared with IOVA = PA mode, there is no performance
drop with this approach.

This approach does not work with kernel versions older
than 4.4.0 because of API compatibility issues.

The patch also updates the KNI documentation with these support details.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
---
 kernel/linux/kni/compat.h   |  4 +++
 kernel/linux/kni/kni_dev.h  |  4 +++
 kernel/linux/kni/kni_misc.c | 71 +++++++++++++++++++++++++++++++++++++++------
 kernel/linux/kni/kni_net.c  | 59 ++++++++++++++++++++++++++++---------
 4 files changed, 116 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..ee997a6 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,7 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 4, 0) <= LINUX_VERSION_CODE
+#define HAVE_IOVA_AS_VA_SUPPORT
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..d5898f3 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -25,6 +25,7 @@
 #include <linux/netdevice.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/iommu.h>
 
 #include <rte_kni_common.h>
 #define KNI_KTHREAD_RESCHEDULE_INTERVAL 5 /* us */
@@ -41,6 +42,9 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+	struct iommu_domain *domain;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 2b75502..8660205 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -295,6 +295,9 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	struct rte_kni_device_info dev_info;
 	struct net_device *net_dev = NULL;
 	struct kni_dev *kni, *dev, *n;
+	struct pci_dev *pci = NULL;
+	struct iommu_domain *domain = NULL;
+	phys_addr_t phys_addr;
 
 	pr_info("Creating kni...\n");
 	/* Check the buffer size, to avoid warning */
@@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_AS_VA_SUPPORT
+		pci = pci_get_device(dev_info.vendor_id,
+				     dev_info.device_id, NULL);
+		if (pci == NULL) {
+			pr_err("pci dev does not exist\n");
+			return -ENODEV;
+		}
+
+		while (pci) {
+			if ((pci->bus->number == dev_info.bus) &&
+			    (PCI_SLOT(pci->devfn) == dev_info.devid) &&
+			    (PCI_FUNC(pci->devfn) == dev_info.function)) {
+				domain = iommu_get_domain_for_dev(&pci->dev);
+				break;
+			}
+			pci = pci_get_device(dev_info.vendor_id,
+					     dev_info.device_id, pci);
+		}
+
+		if (domain == NULL) {
+			pr_err("Failed to get pci dev domain info\n");
+			return -ENODEV;
+		}
+#else
+		pr_err("Kernel version does not support IOVA as VA\n");
+		return -EINVAL;
+#endif
+		kni->domain = domain;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.tx_phys);
+		kni->tx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.rx_phys);
+		kni->rx_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.alloc_phys);
+		kni->alloc_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.free_phys);
+		kni->free_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.req_phys);
+		kni->req_q = phys_to_virt(phys_addr);
+		phys_addr = iommu_iova_to_phys(domain, dev_info.resp_phys);
+		kni->resp_q = phys_to_virt(phys_addr);
+		kni->sync_va = dev_info.sync_va;
+		phys_addr = iommu_iova_to_phys(domain, dev_info.sync_phys);
+		kni->sync_kva = phys_to_virt(phys_addr);
+		kni->iova_mode = 1;
+
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index 7bd3a9f..8382859 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,21 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *pa)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+				(uintptr_t)pa));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iommu_iova_to_phys(kni->domain,
+			   (uintptr_t)m->buf_physaddr) + m->data_off);
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +77,24 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +211,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +299,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +371,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +470,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +482,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +550,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +583,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
-- 
2.8.4
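
A condensed restatement of the VA-mode translation chain implemented by
the helpers above (sketch only; it assumes the kni_dev and rte_kni_mbuf
fields used in this patch):

/* IOVA handed over by user space -> physical address via the device's
 * IOMMU domain -> kernel virtual address via the kernel linear mapping.
 */
static inline void *
kni_iova_to_kva(struct kni_dev *kni, void *iova)
{
	phys_addr_t pa = iommu_iova_to_phys(kni->domain, (uintptr_t)iova);

	return phys_to_virt(pa);
}

/* Data area of an mbuf: buf_physaddr holds an IOVA in VA mode, so it is
 * translated the same way and the data offset is added afterwards.
 */
static inline void *
kni_iova_to_data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
{
	return phys_to_virt(iommu_iova_to_phys(kni->domain,
			(uintptr_t)m->buf_physaddr) + m->data_off);
}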


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v10 5/5] kni: modify IOVA mode checks to support VA
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
                                       ` (3 preceding siblings ...)
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module vattunuru
@ 2019-08-16  6:12                     ` vattunuru
  2019-09-25  4:00                     ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
                                       ` (2 subsequent siblings)
  7 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-08-16  6:12 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

This patch relaxes the checks in KNI and EAL that enforce IOVA=PA when
IOVA=VA mode is enabled, since the KNI kernel module supports VA
mode for kernel versions >= 4.4.0.

The KNI documentation is updated with the above details.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst | 8 ++++++++
 lib/librte_eal/linux/eal/eal.c                 | 4 +++-
 lib/librte_kni/rte_kni.c                       | 5 -----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 38369b3..fd2ce63 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -291,6 +291,14 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI can be operated in the IOVA as VA scheme when the following criteria are fulfilled:
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)
+- eal param `--iova-mode va` is passed or bus IOVA scheme is set to RTE_IOVA_VA
+
 Ethtool
 -------
 
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 946222c..73d64c8 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1114,12 +1114,14 @@ rte_eal_init(int argc, char **argv)
 		/* Workaround for KNI which requires physical address to work */
 		if (iova_mode == RTE_IOVA_VA &&
 				rte_eal_check_module("rte_kni") == 1) {
+#if KERNEL_VERSION(4, 4, 0) > LINUX_VERSION_CODE
 			if (phys_addrs) {
 				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
+				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
 			} else {
 				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
 			}
+#endif
 		}
 #endif
 		rte_eal_get_configuration()->iova_mode = iova_mode;
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 15dda45..c77d76f 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -99,11 +99,6 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
-	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
-		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
-		return -1;
-	}
-
 	/* Check FD and open */
 	if (kni_fd < 0) {
 		kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
                                       ` (4 preceding siblings ...)
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 5/5] kni: modify IOVA mode checks to support VA vattunuru
@ 2019-09-25  4:00                     ` Vamsi Krishna Attunuru
  2019-10-08  5:08                       ` Vamsi Krishna Attunuru
  2019-10-15 15:34                     ` Yigit, Ferruh
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
  7 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-09-25  4:00 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda

PING.

-----Original Message-----
From: vattunuru@marvell.com <vattunuru@marvell.com> 
Sent: Friday, August 16, 2019 11:43 AM
To: dev@dpdk.org
Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
Subject: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support 

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V10 Changes:
* Fixed function return code on failure when min_chunk_size > pg_sz.
* Marked new mempool populate routine as EXPERIMENTAL.

V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in the new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.

V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.

V7 Changes:
* Removed previously proposed mempool flag and made those page boundary checks default in mempool populate() except for the objects size bigger than the size of page.
* Removed KNI example application related changes since pool related requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make address translation routines more readable.
* Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA mode is enabled.

V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered across page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new mempool flag.

V5 changes:
* Fixed build issue with 32b build

V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernel above 4.4.0

V3 Changes:
* Add new approach to work kni with IOVA=VA mode using iommu_iova_to_phys API.

Kiran Kumar K (1):
  kni: add IOVA=VA support in KNI module

Vamsi Attunuru (4):
  mempool: populate mempool with the page sized chunks
  kni: add IOVA=VA support in KNI lib
  kni: add app specific mempool create and free routines
  kni: modify IOVA mode checks to support VA

 doc/guides/prog_guide/kernel_nic_interface.rst    |  8 ++
 doc/guides/rel_notes/release_19_11.rst            |  5 ++
 examples/kni/main.c                               |  5 +-
 kernel/linux/kni/compat.h                         |  4 +
 kernel/linux/kni/kni_dev.h                        |  4 +
 kernel/linux/kni/kni_misc.c                       | 71 ++++++++++++++---
 kernel/linux/kni/kni_net.c                        | 59 ++++++++++----
 lib/librte_eal/linux/eal/eal.c                    |  4 +-
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++
 lib/librte_kni/Makefile                           |  2 +
 lib/librte_kni/meson.build                        |  2 +
 lib/librte_kni/rte_kni.c                          | 95 +++++++++++++++++++++--
 lib/librte_kni/rte_kni.h                          | 48 ++++++++++++
 lib/librte_kni/rte_kni_version.map                |  2 +
 lib/librte_mempool/rte_mempool.c                  | 69 ++++++++++++++++
 lib/librte_mempool/rte_mempool.h                  | 20 +++++
 lib/librte_mempool/rte_mempool_version.map        |  1 +
 17 files changed, 378 insertions(+), 29 deletions(-)

--
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
  2019-09-25  4:00                     ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
@ 2019-10-08  5:08                       ` Vamsi Krishna Attunuru
  2019-10-14  4:05                         ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-08  5:08 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda

@All, we are expecting to merge this in the 19.11 release; if anyone has comments, please respond.

> -----Original Message-----
> From: Vamsi Krishna Attunuru
> Sent: Wednesday, September 25, 2019 9:30 AM
> To: vattunuru@marvell.com; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: RE: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> 
> PING.
> 
> -----Original Message-----
> From: vattunuru@marvell.com <vattunuru@marvell.com>
> Sent: Friday, August 16, 2019 11:43 AM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>; Vamsi Krishna Attunuru
> <vattunuru@marvell.com>
> Subject: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> 
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> ---
> V10 Changes:
> * Fixed function return code on failure when min_chunk_size > pg_sz.
> * Marked new mempool populate routine as EXPERIMENTAL.
> 
> V9 Changes:
> * Used rte_mempool_ops_calc_mem_size() instead of default handler in the
> new mempool populate routine.
> * Check min_chunk_size and return values.
> * Removed ethdev_info memset to '0' and moved pci dev_info populate into
> kni_dev_pci_addr_get() routine.
> * Addressed misc. review comments.
> 
> V8 Changes:
> * Remove default mempool populate() routine changes.
> * Add kni app specific mempool create & free routines.
> * Add new mempool populate routine to allocate page-aligned memzones
> with page size to make sure all mempool objects reside on a page.
> * Update release notes and map files.
> 
> V7 Changes:
> * Removed previously proposed mempool flag and made those page
> boundary checks default in mempool populate() except for the objects size
> bigger than the size of page.
> * Removed KNI example application related changes since pool related
> requirement is taken care in mempool lib.
> * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> * Added wrapper functions in KNI module to hide IOVA checks and make
> address translation routines more readable.
> * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
> mode is enabled.
> 
> V6 Changes:
> * Added new mempool flag to ensure mbuf memory is not scattered across
> page boundaries.
> * Added KNI kernel module required PCI device information.
> * Modified KNI example application to create mempool with new mempool
> flag.
> 
> V5 changes:
> * Fixed build issue with 32b build
> 
> V4 changes:
> * Fixed build issues with older kernel versions
> * This approach will only work with kernel above 4.4.0
> 
> V3 Changes:
> * Add new approach to work kni with IOVA=VA mode using
> iommu_iova_to_phys API.
> 
> Kiran Kumar K (1):
>   kni: add IOVA=VA support in KNI module
> 
> Vamsi Attunuru (4):
>   mempool: populate mempool with the page sized chunks
>   kni: add IOVA=VA support in KNI lib
>   kni: add app specific mempool create and free routines
>   kni: modify IOVA mode checks to support VA
> 
>  doc/guides/prog_guide/kernel_nic_interface.rst    |  8 ++
>  doc/guides/rel_notes/release_19_11.rst            |  5 ++
>  examples/kni/main.c                               |  5 +-
>  kernel/linux/kni/compat.h                         |  4 +
>  kernel/linux/kni/kni_dev.h                        |  4 +
>  kernel/linux/kni/kni_misc.c                       | 71 ++++++++++++++---
>  kernel/linux/kni/kni_net.c                        | 59 ++++++++++----
>  lib/librte_eal/linux/eal/eal.c                    |  4 +-
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++
>  lib/librte_kni/Makefile                           |  2 +
>  lib/librte_kni/meson.build                        |  2 +
>  lib/librte_kni/rte_kni.c                          | 95 +++++++++++++++++++++--
>  lib/librte_kni/rte_kni.h                          | 48 ++++++++++++
>  lib/librte_kni/rte_kni_version.map                |  2 +
>  lib/librte_mempool/rte_mempool.c                  | 69 ++++++++++++++++
>  lib/librte_mempool/rte_mempool.h                  | 20 +++++
>  lib/librte_mempool/rte_mempool_version.map        |  1 +
>  17 files changed, 378 insertions(+), 29 deletions(-)
> 
> --
> 2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks vattunuru
@ 2019-10-08  9:26                       ` Olivier Matz
  2019-10-09  5:29                         ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-08  9:26 UTC (permalink / raw)
  To: vattunuru
  Cc: dev, thomas, jerinj, ferruh.yigit, anatoly.burakov, arybchenko,
	kirankumark

On Fri, Aug 16, 2019 at 11:42:48AM +0530, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> Patch adds a routine to populate mempool from page aligned and
> page sized chunks of memory to ensure memory objs do not fall
> across the page boundaries. It's useful for applications that
> require physically contiguous mbuf memory while running in
> IOVA=VA mode.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>

Acked-by: Olivier Matz <olivier.matz@6wind.com>

Do you confirm that this patch could be removed if this patchset is
applied?
http://patchwork.dpdk.org/project/dpdk/list/?series=5624

I'll try to address Andrew's comments.

Thanks,
Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 1/4] mempool: clarify default populate function
  2019-07-19 15:42                           ` Andrew Rybchenko
@ 2019-10-08  9:36                             ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-08  9:36 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

On Fri, Jul 19, 2019 at 06:42:06PM +0300, Andrew Rybchenko wrote:
> On 7/19/19 4:38 PM, Olivier Matz wrote:
> > No functional change. Clarify the populate function to make
> > the next commit easier to understand.
> > 
> > Rename the variables:
> > - to avoid negation in the name
> > - to have more understandable names
> > 
> > Remove useless variable (no_pageshift is equivalent to pg_sz == 0).
> > 
> > Remove duplicate affectation of "external" variable.
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> 
> LGTM
> 
> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> 

I submitted this one in a separate patch:
http://patchwork.dpdk.org/patch/60687/


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks
  2019-10-08  9:26                       ` Olivier Matz
@ 2019-10-09  5:29                         ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-09  5:29 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, ferruh.yigit,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Tuesday, October 8, 2019 2:57 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: dev@dpdk.org; thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the
> page sized chunks
> 
> On Fri, Aug 16, 2019 at 11:42:48AM +0530, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Patch adds a routine to populate mempool from page aligned and page
> > sized chunks of memory to ensure memory objs do not fall across the
> > page boundaries. It's useful for applications that require physically
> > contiguous mbuf memory while running in IOVA=VA mode.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> 
> Acked-by: Olivier Matz <olivier.matz@6wind.com>
> 
> Do you confirm that this patch could be removed if this patchset is applied?
> http://patchwork.dpdk.org/project/dpdk/list/?series=5624
> 
> I'll try to address Andrew's comments.

Yes, this patch will be redundant and can be removed once the above patchset is applied.

Thanks
Vamsi

 > 
> Thanks,
> Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
  2019-10-08  5:08                       ` Vamsi Krishna Attunuru
@ 2019-10-14  4:05                         ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-14  4:05 UTC (permalink / raw)
  To: ferruh.yigit
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, anatoly.burakov,
	arybchenko, Kiran Kumar Kokkilagadda, dev

Hi Ferruh,

The mempool-related patch has been acked by Olivier. Could you please review the KNI-related patches and make sure they are merged in the 19.11 release?

Regards,
Vamsi

> -----Original Message-----
> From: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Sent: Tuesday, October 8, 2019 10:39 AM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: RE: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> 
> @All, we are expecting to merge this in 19.11 release and if any one have
> comments please respond.
> 
> > -----Original Message-----
> > From: Vamsi Krishna Attunuru
> > Sent: Wednesday, September 25, 2019 9:30 AM
> > To: vattunuru@marvell.com; dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> > anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> > Kokkilagadda <kirankumark@marvell.com>
> > Subject: RE: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> >
> > PING.
> >
> > -----Original Message-----
> > From: vattunuru@marvell.com <vattunuru@marvell.com>
> > Sent: Friday, August 16, 2019 11:43 AM
> > To: dev@dpdk.org
> > Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> > anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> > Kokkilagadda <kirankumark@marvell.com>; Vamsi Krishna Attunuru
> > <vattunuru@marvell.com>
> > Subject: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> >
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > ---
> > V10 Changes:
> > * Fixed function return code on failure when min_chunk_size > pg_sz.
> > * Marked new mempool populate routine as EXPERIMENTAL.
> >
> > V9 Changes:
> > * Used rte_mempool_ops_calc_mem_size() instead of default handler in
> > the new mempool populate routine.
> > * Check min_chunk_size and return values.
> > * Removed ethdev_info memset to '0' and moved pci dev_info populate
> > into
> > kni_dev_pci_addr_get() routine.
> > * Addressed misc. review comments.
> >
> > V8 Changes:
> > * Remove default mempool populate() routine changes.
> > * Add kni app specific mempool create & free routines.
> > * Add new mempool populate routine to allocate page-aligned memzones
> > with page size to make sure all mempool objects reside on a page.
> > * Update release notes and map files.
> >
> > V7 Changes:
> > * Removed previously proposed mempool flag and made those page
> > boundary checks default in mempool populate() except for the objects
> > size bigger than the size of page.
> > * Removed KNI example application related changes since pool related
> > requirement is taken care in mempool lib.
> > * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> > * Added wrapper functions in KNI module to hide IOVA checks and make
> > address translation routines more readable.
> > * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
> > mode is enabled.
> >
> > V6 Changes:
> > * Added new mempool flag to ensure mbuf memory is not scattered across
> > page boundaries.
> > * Added KNI kernel module required PCI device information.
> > * Modified KNI example application to create mempool with new mempool
> > flag.
> >
> > V5 changes:
> > * Fixed build issue with 32b build
> >
> > V4 changes:
> > * Fixed build issues with older kernel versions
> > * This approach will only work with kernel above 4.4.0
> >
> > V3 Changes:
> > * Add new approach to work kni with IOVA=VA mode using
> > iommu_iova_to_phys API.
> >
> > Kiran Kumar K (1):
> >   kni: add IOVA=VA support in KNI module
> >
> > Vamsi Attunuru (4):
> >   mempool: populate mempool with the page sized chunks
> >   kni: add IOVA=VA support in KNI lib
> >   kni: add app specific mempool create and free routines
> >   kni: modify IOVA mode checks to support VA
> >
> >  doc/guides/prog_guide/kernel_nic_interface.rst    |  8 ++
> >  doc/guides/rel_notes/release_19_11.rst            |  5 ++
> >  examples/kni/main.c                               |  5 +-
> >  kernel/linux/kni/compat.h                         |  4 +
> >  kernel/linux/kni/kni_dev.h                        |  4 +
> >  kernel/linux/kni/kni_misc.c                       | 71 ++++++++++++++---
> >  kernel/linux/kni/kni_net.c                        | 59 ++++++++++----
> >  lib/librte_eal/linux/eal/eal.c                    |  4 +-
> >  lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++
> >  lib/librte_kni/Makefile                           |  2 +
> >  lib/librte_kni/meson.build                        |  2 +
> >  lib/librte_kni/rte_kni.c                          | 95 +++++++++++++++++++++--
> >  lib/librte_kni/rte_kni.h                          | 48 ++++++++++++
> >  lib/librte_kni/rte_kni_version.map                |  2 +
> >  lib/librte_mempool/rte_mempool.c                  | 69 ++++++++++++++++
> >  lib/librte_mempool/rte_mempool.h                  | 20 +++++
> >  lib/librte_mempool/rte_mempool_version.map        |  1 +
> >  17 files changed, 378 insertions(+), 29 deletions(-)
> >
> > --
> > 2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
                                       ` (5 preceding siblings ...)
  2019-09-25  4:00                     ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
@ 2019-10-15 15:34                     ` Yigit, Ferruh
  2019-10-16 12:17                       ` Vamsi Krishna Attunuru
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
  7 siblings, 1 reply; 251+ messages in thread
From: Yigit, Ferruh @ 2019-10-15 15:34 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark

On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> ---
> V10 Changes:
> * Fixed function return code on failure when min_chunk_size > pg_sz.
> * Marked new mempool populate routine as EXPERIMENTAL.
> 
> V9 Changes:
> * Used rte_mempool_ops_calc_mem_size() instead of default handler in
> the new mempool populate routine.
> * Check min_chunk_size and return values.
> * Removed ethdev_info memset to '0' and moved pci dev_info populate into
> kni_dev_pci_addr_get() routine.
> * Addressed misc. review comments.
> 
> V8 Changes:
> * Remove default mempool populate() routine changes.
> * Add kni app specific mempool create & free routines.
> * Add new mempool populate routine to allocate page-aligned memzones
> with page size to make sure all mempool objects reside on a page.
> * Update release notes and map files.
> 
> V7 Changes:
> * Removed previously proposed mempool flag and made those page boundary
> checks default in mempool populate() except for the objects size bigger
> than the size of page.
> * Removed KNI example application related changes since pool related
> requirement is taken care in mempool lib.
> * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> * Added wrapper functions in KNI module to hide IOVA checks and make
> address translation routines more readable.
> * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA mode
> is enabled.
> 
> V6 Changes:
> * Added new mempool flag to ensure mbuf memory is not scattered
> across page boundaries.
> * Added KNI kernel module required PCI device information.
> * Modified KNI example application to create mempool with new
> mempool flag.
> 
> V5 changes:
> * Fixed build issue with 32b build
> 
> V4 changes:
> * Fixed build issues with older kernel versions
> * This approach will only work with kernel above 4.4.0
> 
> V3 Changes:
> * Add new approach to work kni with IOVA=VA mode using
> iommu_iova_to_phys API.
> 
> Kiran Kumar K (1):
>   kni: add IOVA=VA support in KNI module
> 
> Vamsi Attunuru (4):
>   mempool: populate mempool with the page sized chunks
>   kni: add IOVA=VA support in KNI lib
>   kni: add app specific mempool create and free routines
>   kni: modify IOVA mode checks to support VA

Hi Vamsi,

I am aware that this patchset has been around for a long time, and I have seen
your request to merge it in 19.11, but as you can understand, my concern is
breaking KNI or existing KNI applications while trying to add this new feature.

At a high level, there are two issues:

1) the kernel module updates expect there to be a backing device for the KNI,
which is not always true:

         if (dev_info.iova_mode) {
 #ifdef HAVE_IOVA_AS_VA_SUPPORT
                 pci = pci_get_device(dev_info.vendor_id,
                                      dev_info.device_id, NULL);
                 if (pci == NULL) {
                         pr_err("pci dev does not exist\n");
                         return -ENODEV;
                 }

For example this breaks:
./build/app/testpmd -w0:0.0 --vdev net_kni0 --vdev net_kni1  -- -i


2) Applications will have to change the API they use to allocate the mempool.
If the user upgrades to a new version of DPDK, iova=va mode is now possible and
the application should use the new KNI API 'rte_kni_pktmbuf_pool_create()' to
allocate the mempool. Most probably the application will have a datapath and
will use KNI only for the exception path; will there be any effect from using
the KNI version of the mempool alloc?
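
For reference, the application-side change in question is essentially the swap
below, taken from the example app diff in patch 3/5, plus the matching free on
shutdown (a sketch, not a full recipe):

	/* before: generic mbuf pool */
	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());

	/* after: KNI-specific pool; same arguments, but the KNI routine
	 * ensures no mbuf crosses a page boundary when iova=va is in use */
	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());

	/* matching cleanup on shutdown */
	rte_kni_pktmbuf_pool_free(pktmbuf_pool);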


I would like to see KNI enabled in iova=va mode, but can we have it limited to
a specific command line argument or config option? This increases the test
surface, but at least old applications can continue to work by default; what do
you think?

And I will put a few minor comments to the patches...



^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 2/5] kni: add IOVA=VA support in KNI lib
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 2/5] kni: add IOVA=VA support in KNI lib vattunuru
@ 2019-10-15 15:36                       ` Yigit, Ferruh
  0 siblings, 0 replies; 251+ messages in thread
From: Yigit, Ferruh @ 2019-10-15 15:36 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark

On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> Current KNI implementation only operates in IOVA=PA mode, patch adds
> required functionality in KNI lib to support IOVA=VA mode.
> 
> KNI kernel module requires device info to get iommu domain related
> information for IOVA addr related translations. Patch defines device
> related info in rte_kni_device_info structure and passes device info
> to the kernel KNI module when IOVA=VA mode is enabled.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> ---
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |  8 ++++++
>  lib/librte_kni/Makefile                           |  1 +
>  lib/librte_kni/meson.build                        |  1 +
>  lib/librte_kni/rte_kni.c                          | 30 +++++++++++++++++++++++
>  4 files changed, 40 insertions(+)
> 
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 37d9ee8..4fd8a90 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -111,6 +111,13 @@ struct rte_kni_device_info {
>  	void * mbuf_va;
>  	phys_addr_t mbuf_phys;
>  
> +	/* PCI info */
> +	uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
> +	uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
> +	uint8_t bus;                  /**< Device bus */
> +	uint8_t devid;                /**< Device ID */
> +	uint8_t function;             /**< Device function. */
> +
>  	uint16_t group_id;            /**< Group ID */
>  	uint32_t core_id;             /**< core ID to bind for kernel thread */
>  
> @@ -121,6 +128,7 @@ struct rte_kni_device_info {
>  	unsigned mbuf_size;
>  	unsigned int mtu;
>  	uint8_t mac_addr[6];
> +	uint8_t iova_mode;
>  };
>  
>  #define KNI_DEVICE "kni"
> diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
> index cbd6599..ab15d10 100644
> --- a/lib/librte_kni/Makefile
> +++ b/lib/librte_kni/Makefile
> @@ -7,6 +7,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  LIB = librte_kni.a
>  
>  CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
> +CFLAGS += -I$(RTE_SDK)/drivers/bus/pci
>  LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
>  
>  EXPORT_MAP := rte_kni_version.map
> diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
> index 41fa2e3..fd46f87 100644
> --- a/lib/librte_kni/meson.build
> +++ b/lib/librte_kni/meson.build
> @@ -9,3 +9,4 @@ version = 2
>  sources = files('rte_kni.c')
>  headers = files('rte_kni.h')
>  deps += ['ethdev', 'pci']
> +includes += include_directories('../../drivers/bus/pci')
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index 4b51fb4..2aaaeaa 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -14,6 +14,7 @@
>  #include <rte_spinlock.h>
>  #include <rte_string_fns.h>
>  #include <rte_ethdev.h>
> +#include <rte_bus_pci.h>
>  #include <rte_malloc.h>
>  #include <rte_log.h>
>  #include <rte_kni.h>
> @@ -199,6 +200,27 @@ kni_release_mz(struct rte_kni *kni)
>  	rte_memzone_free(kni->m_sync_addr);
>  }
>  
> +static void
> +kni_dev_pci_addr_get(uint16_t port_id, struct rte_kni_device_info *kni_dev_info)
> +{
> +	const struct rte_pci_device *pci_dev;
> +	struct rte_eth_dev_info dev_info;
> +	const struct rte_bus *bus = NULL;
> +
> +	rte_eth_dev_info_get(port_id, &dev_info);
> +
> +	if (dev_info.device)
> +		bus = rte_bus_find_by_device(dev_info.device);
> +	if (bus && !strcmp(bus->name, "pci")) {
> +		pci_dev = RTE_DEV_TO_PCI(dev_info.device);
> +		kni_dev_info->bus = pci_dev->addr.bus;
> +		kni_dev_info->devid = pci_dev->addr.devid;
> +		kni_dev_info->function = pci_dev->addr.function;
> +		kni_dev_info->vendor_id = pci_dev->id.vendor_id;
> +		kni_dev_info->device_id = pci_dev->id.device_id;
> +	}
> +}
> +
>  struct rte_kni *
>  rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  	      const struct rte_kni_conf *conf,
> @@ -247,6 +269,12 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  		kni->ops.port_id = UINT16_MAX;
>  
>  	memset(&dev_info, 0, sizeof(dev_info));
> +
> +	if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +		uint16_t port_id = conf->group_id;
> +
> +		kni_dev_pci_addr_get(port_id, &dev_info);
> +	}
>  	dev_info.core_id = conf->core_id;
>  	dev_info.force_bind = conf->force_bind;
>  	dev_info.group_id = conf->group_id;
> @@ -300,6 +328,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>  	kni->group_id = conf->group_id;
>  	kni->mbuf_size = conf->mbuf_size;
>  
> +	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> +

"dev_info.iova_mode = 1;" can be moved into above "RTE_IOVA_VA" check block.

>  	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
>  	if (ret < 0)
>  		goto ioctl_fail;
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 3/5] kni: add app specific mempool create and free routines
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 3/5] kni: add app specific mempool create and free routines vattunuru
@ 2019-10-15 15:40                       ` Yigit, Ferruh
  0 siblings, 0 replies; 251+ messages in thread
From: Yigit, Ferruh @ 2019-10-15 15:40 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark

On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> When KNI operates in IOVA = VA mode, it requires mbuf memory
> to be physically contiguous to ensure KNI kernel module could
> translate IOVA addresses properly. Patch adds a KNI specific
> mempool create routine to populate the KNI packet mbuf pool
> with memory objects that are being on a page.
> 
> KNI applications need to use this mempool create & free routines
> so that mbuf related requirements in IOVA = VA mode are handled
> inside those routines based on the enabled mode.
> 
> Updated the release notes with these new routine details.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

<...>

> @@ -975,7 +975,7 @@ main(int argc, char** argv)
>  		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
>  
>  	/* Create the mbuf pool */
> -	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
> +	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,

This sample application is under our control, but what about user applications?

If switching to this API is a must, I think it should at least be documented in
the KNI documentation.

>  		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
>  	if (pktmbuf_pool == NULL) {
>  		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
> @@ -1043,6 +1043,9 @@ main(int argc, char** argv)
>  			continue;
>  		kni_free_kni(port);
>  	}
> +
> +	rte_kni_pktmbuf_pool_free(pktmbuf_pool);
> +
>  	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
>  		if (kni_port_params_array[i]) {
>  			rte_free(kni_port_params_array[i]);
> diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
> index ab15d10..5e3dd01 100644
> --- a/lib/librte_kni/Makefile
> +++ b/lib/librte_kni/Makefile
> @@ -6,6 +6,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  # library name
>  LIB = librte_kni.a
>  
> +CFLAGS += -DALLOW_EXPERIMENTAL_API

Can you please list, in a comment, the experimental APIs that require this flag,
and do the same for meson? This helps to know later why the flag is needed.

<...>

> @@ -20,4 +20,6 @@ EXPERIMENTAL {
>  	global:
>  
>  	rte_kni_update_link;
> +	rte_kni_pktmbuf_pool_create;
> +	rte_kni_pktmbuf_pool_free;

Can you please add a comment noting in which DPDK version these APIs were
added, like it is done in "lib/librte_eal/rte_eal_version.map"?

>  };
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module vattunuru
@ 2019-10-15 15:43                       ` Yigit, Ferruh
  2019-10-15 15:46                         ` Stephen Hemminger
  0 siblings, 1 reply; 251+ messages in thread
From: Yigit, Ferruh @ 2019-10-15 15:43 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, olivier.matz, ferruh.yigit, anatoly.burakov,
	arybchenko, kirankumark

On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> From: Kiran Kumar K <kirankumark@marvell.com>
> 
> Patch adds support for kernel module to work in IOVA = VA mode,
> the idea is to get physical address from IOVA address using
> iommu_iova_to_phys API and later use phys_to_virt API to
> convert the physical address to kernel virtual address.
> 
> When compared with IOVA = PA mode, there is no performance
> drop with this approach.
> 
> This approach does not work with the kernel versions less
> than 4.4.0 because of API compatibility issues.
> 
> Patch also updates these support details in KNI documentation.
> 
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>

<...>

> @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>  
>  	/* Translate user space info into kernel space info */
> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -	kni->sync_va = dev_info.sync_va;
> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +	if (dev_info.iova_mode) {
> +#ifdef HAVE_IOVA_AS_VA_SUPPORT
> +		pci = pci_get_device(dev_info.vendor_id,
> +				     dev_info.device_id, NULL);
> +		if (pci == NULL) {
> +			pr_err("pci dev does not exist\n");
> +			return -ENODEV;
> +		}

If there is no PCI device, KNI should still work.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-10-15 15:43                       ` Yigit, Ferruh
@ 2019-10-15 15:46                         ` Stephen Hemminger
  2019-10-16 11:26                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Stephen Hemminger @ 2019-10-15 15:46 UTC (permalink / raw)
  To: Yigit, Ferruh
  Cc: vattunuru, dev, thomas, jerinj, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, kirankumark

On Tue, 15 Oct 2019 16:43:08 +0100
"Yigit, Ferruh" <ferruh.yigit@linux.intel.com> wrote:

> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> > From: Kiran Kumar K <kirankumark@marvell.com>
> > 
> > Patch adds support for kernel module to work in IOVA = VA mode,
> > the idea is to get physical address from IOVA address using
> > iommu_iova_to_phys API and later use phys_to_virt API to
> > convert the physical address to kernel virtual address.
> > 
> > When compared with IOVA = PA mode, there is no performance
> > drop with this approach.
> > 
> > This approach does not work with the kernel versions less
> > than 4.4.0 because of API compatibility issues.
> > 
> > Patch also updates these support details in KNI documentation.
> > 
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>  
> 
> <...>
> 
> > @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
> >  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> >  
> >  	/* Translate user space info into kernel space info */
> > -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> > -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> > -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> > -	kni->free_q = phys_to_virt(dev_info.free_phys);
> > -
> > -	kni->req_q = phys_to_virt(dev_info.req_phys);
> > -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> > -	kni->sync_va = dev_info.sync_va;
> > -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> > +	if (dev_info.iova_mode) {
> > +#ifdef HAVE_IOVA_AS_VA_SUPPORT
> > +		pci = pci_get_device(dev_info.vendor_id,
> > +				     dev_info.device_id, NULL);
> > +		if (pci == NULL) {
> > +			pr_err("pci dev does not exist\n");
> > +			return -ENODEV;
> > +		}  
> 
> If there is no PCI device KNI should still work.

Right now it is possible to use KNI with netvsc PMD on Hyper-V/Azure.
With this patch that won't be possible.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-10-15 15:46                         ` Stephen Hemminger
@ 2019-10-16 11:26                           ` Vamsi Krishna Attunuru
  2019-10-16 14:37                             ` Vamsi Krishna Attunuru
  2019-10-16 16:14                             ` Ferruh Yigit
  0 siblings, 2 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-16 11:26 UTC (permalink / raw)
  To: Stephen Hemminger, Yigit, Ferruh
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, olivier.matz,
	ferruh.yigit, anatoly.burakov, arybchenko,
	Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Tuesday, October 15, 2019 9:16 PM
> To: Yigit, Ferruh <ferruh.yigit@linux.intel.com>
> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: [EXT] Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI
> module
> 
> External Email
> 
> ----------------------------------------------------------------------
> On Tue, 15 Oct 2019 16:43:08 +0100
> "Yigit, Ferruh" <ferruh.yigit@linux.intel.com> wrote:
> 
> > On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> > > From: Kiran Kumar K <kirankumark@marvell.com>
> > >
> > > Patch adds support for kernel module to work in IOVA = VA mode, the
> > > idea is to get physical address from IOVA address using
> > > iommu_iova_to_phys API and later use phys_to_virt API to convert the
> > > physical address to kernel virtual address.
> > >
> > > When compared with IOVA = PA mode, there is no performance drop with
> > > this approach.
> > >
> > > This approach does not work with the kernel versions less than 4.4.0
> > > because of API compatibility issues.
> > >
> > > Patch also updates these support details in KNI documentation.
> > >
> > > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > <...>
> >
> > > @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t
> ioctl_num,
> > >  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> > >
> > >  	/* Translate user space info into kernel space info */
> > > -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> > > -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> > > -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> > > -	kni->free_q = phys_to_virt(dev_info.free_phys);
> > > -
> > > -	kni->req_q = phys_to_virt(dev_info.req_phys);
> > > -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> > > -	kni->sync_va = dev_info.sync_va;
> > > -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> > > +	if (dev_info.iova_mode) {
> > > +#ifdef HAVE_IOVA_AS_VA_SUPPORT
> > > +		pci = pci_get_device(dev_info.vendor_id,
> > > +				     dev_info.device_id, NULL);
> > > +		if (pci == NULL) {
> > > +			pr_err("pci dev does not exist\n");
> > > +			return -ENODEV;
> > > +		}
> >
> > If there is no PCI device KNI should still work.
> 
> Right now it is possible to use KNI with netvsc PMD on Hyper-V/Azure.
> With this patch that won't be possible.

Hi Ferruh, Stephen,

These can be fixed by forcing the IOVA mode to PA when vdevs are used for the
KNI use case:

rte_bus_get_iommu_class(void)
 {
        enum rte_iova_mode mode = RTE_IOVA_DC;
+       struct rte_devargs *devargs = NULL;
        bool buses_want_va = false;
        bool buses_want_pa = false;
        struct rte_bus *bus;

+       if (rte_eal_check_module("rte_kni") == 1) {
+               RTE_EAL_DEVARGS_FOREACH("vdev", devargs) {
+                       return RTE_IOVA_PA;
+               }
+       }
+
        TAILQ_FOREACH(bus, &rte_bus_list, next) {
                enum rte_iova_mode bus_iova_mode;

I think this will cover the various use cases/combinations (PA or VA mode, pdev or vdev used for KNI).
With the above fix, existing use cases would not be affected by this patch series.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
  2019-10-15 15:34                     ` Yigit, Ferruh
@ 2019-10-16 12:17                       ` Vamsi Krishna Attunuru
  2019-10-16 16:21                         ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-16 12:17 UTC (permalink / raw)
  To: Yigit, Ferruh
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda, dev



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Yigit, Ferruh
> Sent: Tuesday, October 15, 2019 9:05 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> 
> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > ---
> > V10 Changes:
> > * Fixed function return code on failure when min_chunk_size > pg_sz.
> > * Marked new mempool populate routine as EXPERIMENTAL.
> >
> > V9 Changes:
> > * Used rte_mempool_ops_calc_mem_size() instead of default handler in
> > the new mempool populate routine.
> > * Check min_chunk_size and return values.
> > * Removed ethdev_info memset to '0' and moved pci dev_info populate
> > into
> > kni_dev_pci_addr_get() routine.
> > * Addressed misc. review comments.
> >
> > V8 Changes:
> > * Remove default mempool populate() routine changes.
> > * Add kni app specific mempool create & free routines.
> > * Add new mempool populate routine to allocate page-aligned memzones
> > with page size to make sure all mempool objects reside on a page.
> > * Update release notes and map files.
> >
> > V7 Changes:
> > * Removed previously proposed mempool flag and made those page
> > boundary checks default in mempool populate() except for the objects
> > size bigger than the size of page.
> > * Removed KNI example application related changes since pool related
> > requirement is taken care in mempool lib.
> > * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> > * Added wrapper functions in KNI module to hide IOVA checks and make
> > address translation routines more readable.
> > * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
> > mode is enabled.
> >
> > V6 Changes:
> > * Added new mempool flag to ensure mbuf memory is not scattered across
> > page boundaries.
> > * Added KNI kernel module required PCI device information.
> > * Modified KNI example application to create mempool with new mempool
> > flag.
> >
> > V5 changes:
> > * Fixed build issue with 32b build
> >
> > V4 changes:
> > * Fixed build issues with older kernel versions
> > * This approach will only work with kernel above 4.4.0
> >
> > V3 Changes:
> > * Add new approach to work kni with IOVA=VA mode using
> > iommu_iova_to_phys API.
> >
> > Kiran Kumar K (1):
> >   kni: add IOVA=VA support in KNI module
> >
> > Vamsi Attunuru (4):
> >   mempool: populate mempool with the page sized chunks
> >   kni: add IOVA=VA support in KNI lib
> >   kni: add app specific mempool create and free routines
> >   kni: modify IOVA mode checks to support VA
> 
> Hi Vamsi,
> 
> I am aware that this patchset is around for a long time, and I have seen your
> request to merge in 19.11, but as you can understand the concern I have is to
> break KNI or existing KNI applications while trying to add this new feature.
> 
> In high level, there are two issues,
> 
> 1) kernel modules updates expect there will be a backed device of the KNI which
> is not always true:
> 
>          if (dev_info.iova_mode) {
>  #ifdef HAVE_IOVA_AS_VA_SUPPORT
>                  pci = pci_get_device(dev_info.vendor_id,
>                                       dev_info.device_id, NULL);
>                  if (pci == NULL) {
>                          pr_err("pci dev does not exist\n");
>                          return -ENODEV;
>                  }
> 
> For example this breaks:
> ./build/app/testpmd -w0:0.0 --vdev net_kni0 --vdev net_kni1  -- -i

Vamsi> Yes, these can be fixed by forcing the IOVA mode to PA for vdev-based
or mixed vdev & pdev KNI use cases.

> 
> 
> 2) Applications will have to change the API to allocate the mempool.
> If the user upgraded to new version of DPDK, now it is possible to have iova=va
> mode and application should use new KNI API 'rte_kni_pktmbuf_pool_create()'
> to allocate mempool. And most probably application will have datapath and will
> use the KNI only for exception path, will there be any affect using KNI version of
> mempool alloc?

Vamsi> There would not be any effect from using the KNI version of the mempool create routine.

> 
> 
> I would like to see KNI is enabled via iova=va mode, but can we have it limited to
> a specific command line argument or config option? This increases the test
> surface but at least old application can continue to work by default, what do you
> think?

Vamsi> Yes, it is appropriate to control the mode so that old apps work by default.
We are fine with having a command line arg or config option to enable KNI in iova=va mode.
Earlier we thought of a similar approach that also controls mempool allocation using
a new mempool flag. After multiple reviews, the flag was discarded and a separate
mempool populate routine was added for this use case.

Once the command line arg/config option is introduced, the functionality will be as below.
Please correct me if any cases are missed or not considered.
Without the option:
Existing KNI is intact; the IOMMU mode will be PA.
With the option:
The pdev/vdev's IOMMU mode is considered and iova=va/pa is enabled accordingly. The application
is supposed to use the KNI version of the mempool alloc. I think this mempool quirk will go away
when Olivier's mempool patchset (RFC) is merged.

> 
> And I will put a few minor comments to the patches...
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-10-16 11:26                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-10-16 14:37                             ` Vamsi Krishna Attunuru
  2019-10-16 16:14                             ` Ferruh Yigit
  1 sibling, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-16 14:37 UTC (permalink / raw)
  To: Stephen Hemminger, Yigit, Ferruh
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, olivier.matz,
	ferruh.yigit, anatoly.burakov, arybchenko,
	Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: Vamsi Krishna Attunuru
> Sent: Wednesday, October 16, 2019 4:56 PM
> To: Stephen Hemminger <stephen@networkplumber.org>; Yigit, Ferruh
> <ferruh.yigit@linux.intel.com>
> Cc: dev@dpdk.org; thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: RE: [EXT] Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in
> KNI module
> 
> 
> 
> > -----Original Message-----
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Sent: Tuesday, October 15, 2019 9:16 PM
> > To: Yigit, Ferruh <ferruh.yigit@linux.intel.com>
> > Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
> > thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> > olivier.matz@6wind.com; ferruh.yigit@intel.com;
> > anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> > Kokkilagadda <kirankumark@marvell.com>
> > Subject: [EXT] Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support
> > in KNI module
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > On Tue, 15 Oct 2019 16:43:08 +0100
> > "Yigit, Ferruh" <ferruh.yigit@linux.intel.com> wrote:
> >
> > > On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> > > > From: Kiran Kumar K <kirankumark@marvell.com>
> > > >
> > > > Patch adds support for kernel module to work in IOVA = VA mode,
> > > > the idea is to get physical address from IOVA address using
> > > > iommu_iova_to_phys API and later use phys_to_virt API to convert
> > > > the physical address to kernel virtual address.
> > > >
> > > > When compared with IOVA = PA mode, there is no performance drop
> > > > with this approach.
> > > >
> > > > This approach does not work with the kernel versions less than
> > > > 4.4.0 because of API compatibility issues.
> > > >
> > > > Patch also updates these support details in KNI documentation.
> > > >
> > > > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > > > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > >
> > > <...>
> > >
> > > > @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t
> > ioctl_num,
> > > >  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> > > >
> > > >  	/* Translate user space info into kernel space info */
> > > > -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> > > > -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> > > > -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> > > > -	kni->free_q = phys_to_virt(dev_info.free_phys);
> > > > -
> > > > -	kni->req_q = phys_to_virt(dev_info.req_phys);
> > > > -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> > > > -	kni->sync_va = dev_info.sync_va;
> > > > -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> > > > +	if (dev_info.iova_mode) {
> > > > +#ifdef HAVE_IOVA_AS_VA_SUPPORT
> > > > +		pci = pci_get_device(dev_info.vendor_id,
> > > > +				     dev_info.device_id, NULL);
> > > > +		if (pci == NULL) {
> > > > +			pr_err("pci dev does not exist\n");
> > > > +			return -ENODEV;
> > > > +		}
> > >
> > > If there is no PCI device KNI should still work.
> >
> > Right now it is possible to use KNI with netvsc PMD on Hyper-V/Azure.
> > With this patch that won't be possible.
> 
> Hi Ferruh, Stephen,
> 
> These can be fixed by forcing iommu_mode as PA when vdevs are used for KNI
> usecase.
> 
> rte_bus_get_iommu_class(void)
>  {
>         enum rte_iova_mode mode = RTE_IOVA_DC;
> +       struct rte_devargs *devargs = NULL;
>         bool buses_want_va = false;
>         bool buses_want_pa = false;
>         struct rte_bus *bus;
> 
> +       if (rte_eal_check_module("rte_kni") == 1) {
> +               RTE_EAL_DEVARGS_FOREACH("vdev", devargs) {
> +                       return RTE_IOVA_PA;
> +               }
> +       }
> +
>         TAILQ_FOREACH(bus, &rte_bus_list, next) {
>                 enum rte_iova_mode bus_iova_mode;
> 
> I think this will solve various use cases/combinations like PA or VA mode, pdev or
> vdev used for KNI.
> Existing use cases would not be affected by these patch series with above fix.

Hi Ferruh, Stephen,

Pushed a patch for the above-mentioned fix; the fix has moved into the vdev bus driver.
The patch adds a get_iommu_class callback to the vdev bus driver and returns IOVA=PA for the KNI use case.
http://patches.dpdk.org/patch/61312/
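
In essence the callback does something like the sketch below (the callback name
and prototype are assumptions here, following the rte_bus get_iommu_class
convention; see the patch link above for the actual code):

	static enum rte_iova_mode
	vdev_get_iommu_class(void)
	{
		/* force PA so that vdev-based KNI use cases keep working */
		if (rte_eal_check_module("rte_kni") == 1)
			return RTE_IOVA_PA;

		return RTE_IOVA_DC;
	}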

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-10-16 11:26                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-10-16 14:37                             ` Vamsi Krishna Attunuru
@ 2019-10-16 16:14                             ` Ferruh Yigit
  2019-10-18 17:15                               ` Vamsi Krishna Attunuru
  1 sibling, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-10-16 16:14 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, Stephen Hemminger, Yigit, Ferruh
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, olivier.matz,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda

On 10/16/2019 12:26 PM, Vamsi Krishna Attunuru wrote:
> 
> 
>> -----Original Message-----
>> From: Stephen Hemminger <stephen@networkplumber.org>
>> Sent: Tuesday, October 15, 2019 9:16 PM
>> To: Yigit, Ferruh <ferruh.yigit@linux.intel.com>
>> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
>> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
>> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
>> <kirankumark@marvell.com>
>> Subject: [EXT] Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI
>> module
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> On Tue, 15 Oct 2019 16:43:08 +0100
>> "Yigit, Ferruh" <ferruh.yigit@linux.intel.com> wrote:
>>
>>> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
>>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>>
>>>> Patch adds support for kernel module to work in IOVA = VA mode, the
>>>> idea is to get physical address from IOVA address using
>>>> iommu_iova_to_phys API and later use phys_to_virt API to convert the
>>>> physical address to kernel virtual address.
>>>>
>>>> When compared with IOVA = PA mode, there is no performance drop with
>>>> this approach.
>>>>
>>>> This approach does not work with the kernel versions less than 4.4.0
>>>> because of API compatibility issues.
>>>>
>>>> Patch also updates these support details in KNI documentation.
>>>>
>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> <...>
>>>
>>>> @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t
>> ioctl_num,
>>>>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>>>
>>>>  	/* Translate user space info into kernel space info */
>>>> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
>>>> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
>>>> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>>>> -	kni->free_q = phys_to_virt(dev_info.free_phys);
>>>> -
>>>> -	kni->req_q = phys_to_virt(dev_info.req_phys);
>>>> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
>>>> -	kni->sync_va = dev_info.sync_va;
>>>> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>>>> +	if (dev_info.iova_mode) {
>>>> +#ifdef HAVE_IOVA_AS_VA_SUPPORT
>>>> +		pci = pci_get_device(dev_info.vendor_id,
>>>> +				     dev_info.device_id, NULL);
>>>> +		if (pci == NULL) {
>>>> +			pr_err("pci dev does not exist\n");
>>>> +			return -ENODEV;
>>>> +		}
>>>
>>> If there is no PCI device KNI should still work.
>>
>> Right now it is possible to use KNI with netvsc PMD on Hyper-V/Azure.
>> With this patch that won't be possible.
> 
> Hi Ferruh, Stephen,
> 
> These can be fixed by forcing iommu_mode as PA when vdevs are used
> for KNI usecase.
> 
> rte_bus_get_iommu_class(void)
>  {
>         enum rte_iova_mode mode = RTE_IOVA_DC;
> +       struct rte_devargs *devargs = NULL;
>         bool buses_want_va = false;
>         bool buses_want_pa = false;
>         struct rte_bus *bus;
> 
> +       if (rte_eal_check_module("rte_kni") == 1) {
> +               RTE_EAL_DEVARGS_FOREACH("vdev", devargs) {
> +                       return RTE_IOVA_PA;
> +               }
> +       }
> +
>         TAILQ_FOREACH(bus, &rte_bus_list, next) {
>                 enum rte_iova_mode bus_iova_mode;
> 
> I think this will solve various use cases/combinations like PA or VA mode, pdev or vdev used for KNI.
> Existing use cases would not be affected by these patch series with above fix.
> 

Hi Vamsi,

I think this is not a problem specific to using vdev, so I don't think we can
solve it via a vdev check only.

The sample I gave (KNI PMD) is using a vdev, but an application can use the KNI
library APIs directly to create a kni interface and have kernel/user space
communication without any device involved. The KNI PMD (vdev) is just a wrapper
to make this easy.

Just thinking aloud:
KNI shares userspace buffers with the kernel, so basically it needs to translate
userspace virtual addresses to kernel virtual addresses in a reasonably fast manner.

iova=va breaks KNI because:
1) physical memory is not contiguous anymore, which breaks our address
translation logic
2) we were using the physical address of the buffer for the address translation,
but we no longer have it; we now have an iova address.

I assume 1) is resolved with 'rte_kni_pktmbuf_pool_create()'; it would be
helpful though if you can explain how this works.

For the second, a simple question: do we need PCIe device information to be
able to convert an iova to a kernel virtual address? Can't I get this
information from the iommu somehow?
Think about a case where "--iova-mode=va" is provided but there is no physical
device bound to vfio-pci: can I still allocate memory? And how can I use KNI in
that case?
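
(For reference, my understanding of the translation path the kernel module
takes in iova=va mode, per the 4/5 commit message, boils down to the sketch
below; how the iommu domain is obtained is an assumption, e.g. via
iommu_get_domain_for_dev() on the backing device.)

	#include <linux/iommu.h>	/* iommu_iova_to_phys() */
	#include <linux/io.h>		/* phys_to_virt() */

	/* sketch only: translate an iova to a kernel virtual address */
	static inline void *
	iova_to_kva(struct iommu_domain *domain, dma_addr_t iova)
	{
		phys_addr_t pa = iommu_iova_to_phys(domain, iova);

		return phys_to_virt(pa);
	}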

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
  2019-10-16 12:17                       ` Vamsi Krishna Attunuru
@ 2019-10-16 16:21                         ` Ferruh Yigit
  2019-10-18 16:42                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-10-16 16:21 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, Yigit, Ferruh
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, anatoly.burakov,
	arybchenko, Kiran Kumar Kokkilagadda, dev

On 10/16/2019 1:17 PM, Vamsi Krishna Attunuru wrote:
> 
> 
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Yigit, Ferruh
>> Sent: Tuesday, October 15, 2019 9:05 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>> olivier.matz@6wind.com; ferruh.yigit@intel.com; anatoly.burakov@intel.com;
>> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
>> <kirankumark@marvell.com>
>> Subject: Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
>>
>> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> ---
>>> V10 Changes:
>>> * Fixed function return code on failure when min_chunk_size > pg_sz.
>>> * Marked new mempool populate routine as EXPERIMENTAL.
>>>
>>> V9 Changes:
>>> * Used rte_mempool_ops_calc_mem_size() instead of default handler in
>>> the new mempool populate routine.
>>> * Check min_chunk_size and return values.
>>> * Removed ethdev_info memset to '0' and moved pci dev_info populate
>>> into
>>> kni_dev_pci_addr_get() routine.
>>> * Addressed misc. review comments.
>>>
>>> V8 Changes:
>>> * Remove default mempool populate() routine changes.
>>> * Add kni app specific mempool create & free routines.
>>> * Add new mempool populate routine to allocate page-aligned memzones
>>> with page size to make sure all mempool objects reside on a page.
>>> * Update release notes and map files.
>>>
>>> V7 Changes:
>>> * Removed previously proposed mempool flag and made those page
>>> boundary checks default in mempool populate() except for the objects
>>> size bigger than the size of page.
>>> * Removed KNI example application related changes since pool related
>>> requirement is taken care in mempool lib.
>>> * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
>>> * Added wrapper functions in KNI module to hide IOVA checks and make
>>> address translation routines more readable.
>>> * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
>>> mode is enabled.
>>>
>>> V6 Changes:
>>> * Added new mempool flag to ensure mbuf memory is not scattered across
>>> page boundaries.
>>> * Added KNI kernel module required PCI device information.
>>> * Modified KNI example application to create mempool with new mempool
>>> flag.
>>>
>>> V5 changes:
>>> * Fixed build issue with 32b build
>>>
>>> V4 changes:
>>> * Fixed build issues with older kernel versions
>>> * This approach will only work with kernel above 4.4.0
>>>
>>> V3 Changes:
>>> * Add new approach to work kni with IOVA=VA mode using
>>> iommu_iova_to_phys API.
>>>
>>> Kiran Kumar K (1):
>>>   kni: add IOVA=VA support in KNI module
>>>
>>> Vamsi Attunuru (4):
>>>   mempool: populate mempool with the page sized chunks
>>>   kni: add IOVA=VA support in KNI lib
>>>   kni: add app specific mempool create and free routines
>>>   kni: modify IOVA mode checks to support VA
>>
>> Hi Vamsi,
>>
>> I am aware that this patchset is around for a long time, and I have seen your
>> request to merge in 19.11, but as you can understand the concern I have is to
>> break KNI or existing KNI applications while trying to add this new feature.
>>
>> In high level, there are two issues,
>>
>> 1) kernel modules updates expect there will be a backed device of the KNI which
>> is not always true:
>>
>>          if (dev_info.iova_mode) {
>>  #ifdef HAVE_IOVA_AS_VA_SUPPORT
>>                  pci = pci_get_device(dev_info.vendor_id,
>>                                       dev_info.device_id, NULL);
>>                  if (pci == NULL) {
>>                          pr_err("pci dev does not exist\n");
>>                          return -ENODEV;
>>                  }
>>
>> For example this breaks:
>> ./build/app/testpmd -w0:0.0 --vdev net_kni0 --vdev net_kni1  -- -i
> 
> Vamsi> Yes, these can be fixed by forcing iommu_mode to PA for
> vdev or vdev&pdev based KNI usecases. 
> 
>>
>>
>> 2) Applications will have to change the API to allocate the mempool.
>> If the user upgraded to new version of DPDK, now it is possible to have iova=va
>> mode and application should use new KNI API 'rte_kni_pktmbuf_pool_create()'
>> to allocate mempool. And most probably application will have datapath and will
>> use the KNI only for exception path, will there be any affect using KNI version of
>> mempool alloc?
> 
> Vamsi> There would not be any affect in using KNI version of mempool.

If you were developing a product, would you rely on a KNI API for pkt_mbuf
allocation? If there is a problem, will it be as easy to figure out and fix as
with the mempool/mbuf API? I am not sure about pushing users in this direction.

> 
>>
>>
>> I would like to see KNI is enabled via iova=va mode, but can we have it limited to
>> a specific command line argument or config option? This increases the test
>> surface but at least old application can continue to work by default, what do you
>> think?
> 
> Vamsi> Yes, it's appropriate to control the mode to ensure old apps work by default.
> We are fine with having a command line arg or config option to enable KNI in iova=va mode.
> Earlier we thought of having similar approach that also controls mempool allocation using
> a newer mempool flag. After multiple reviews, flag has been discard and added a separate
> mempool populate routine for these usecase. 

I didn't like the idea of having a flag in the mempool library, but perhaps we
can have it in the KNI scope.

> 
> When command line arg/config option is introduced, functionality will be as below. 
> Please correct me if any cases are missed or not considered.
> Without command:
> Existing KNI is intact, iommu mode will be PA.

+1

> With  command:
> Pdev/vdev's iommu mode is considered and accordingly iova=va/pa is enabled. Application is
> supposed to use KNI version of mempool alloc. 

+1

> I think these mempool quirk will go away when
> Olivier's mempool patchset(RFC) is merged.
> 
>>
>> And I will put a few minor comments to the patches...
>>
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re:  [PATCH v10 0/5] kni: add IOVA=VA support
  2019-10-16 16:21                         ` Ferruh Yigit
@ 2019-10-18 16:42                           ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-18 16:42 UTC (permalink / raw)
  To: Ferruh Yigit, Yigit, Ferruh, olivier.matz
  Cc: thomas, Jerin Jacob Kollanukkaran, olivier.matz, anatoly.burakov,
	arybchenko, Kiran Kumar Kokkilagadda, dev



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Wednesday, October 16, 2019 9:52 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Yigit, Ferruh
> <ferruh.yigit@linux.intel.com>
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> olivier.matz@6wind.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; dev@dpdk.org
> Subject: [EXT] Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> 
> External Email
> 
> ----------------------------------------------------------------------
> On 10/16/2019 1:17 PM, Vamsi Krishna Attunuru wrote:
> >
> >
> >> -----Original Message-----
> >> From: dev <dev-bounces@dpdk.org> On Behalf Of Yigit, Ferruh
> >> Sent: Tuesday, October 15, 2019 9:05 PM
> >> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> >> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran
> >> <jerinj@marvell.com>; olivier.matz@6wind.com; ferruh.yigit@intel.com;
> >> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> >> Kokkilagadda <kirankumark@marvell.com>
> >> Subject: Re: [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support
> >>
> >> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> >>> From: Vamsi Attunuru <vattunuru@marvell.com>
> >>>
> >>> ---
> >>> V10 Changes:
> >>> * Fixed function return code on failure when min_chunk_size > pg_sz.
> >>> * Marked new mempool populate routine as EXPERIMENTAL.
> >>>
> >>> V9 Changes:
> >>> * Used rte_mempool_ops_calc_mem_size() instead of default handler in
> >>> the new mempool populate routine.
> >>> * Check min_chunk_size and return values.
> >>> * Removed ethdev_info memset to '0' and moved pci dev_info populate
> >>> into
> >>> kni_dev_pci_addr_get() routine.
> >>> * Addressed misc. review comments.
> >>>
> >>> V8 Changes:
> >>> * Remove default mempool populate() routine changes.
> >>> * Add kni app specific mempool create & free routines.
> >>> * Add new mempool populate routine to allocate page-aligned
> memzones
> >>> with page size to make sure all mempool objects reside on a page.
> >>> * Update release notes and map files.
> >>>
> >>> V7 Changes:
> >>> * Removed previously proposed mempool flag and made those page
> >>> boundary checks default in mempool populate() except for the objects
> >>> size bigger than the size of page.
> >>> * Removed KNI example application related changes since pool related
> >>> requirement is taken care in mempool lib.
> >>> * All PCI dev related info is moved under rte_eal_iova_mode() == VA
> check.
> >>> * Added wrapper functions in KNI module to hide IOVA checks and make
> >>> address translation routines more readable.
> >>> * Updated IOVA mode checks that enforcing IOVA=PA mode when
> IOVA=VA
> >>> mode is enabled.
> >>>
> >>> V6 Changes:
> >>> * Added new mempool flag to ensure mbuf memory is not scattered
> >>> across page boundaries.
> >>> * Added KNI kernel module required PCI device information.
> >>> * Modified KNI example application to create mempool with new
> >>> mempool flag.
> >>>
> >>> V5 changes:
> >>> * Fixed build issue with 32b build
> >>>
> >>> V4 changes:
> >>> * Fixed build issues with older kernel versions
> >>> * This approach will only work with kernel above 4.4.0
> >>>
> >>> V3 Changes:
> >>> * Add new approach to work kni with IOVA=VA mode using
> >>> iommu_iova_to_phys API.
> >>>
> >>> Kiran Kumar K (1):
> >>>   kni: add IOVA=VA support in KNI module
> >>>
> >>> Vamsi Attunuru (4):
> >>>   mempool: populate mempool with the page sized chunks
> >>>   kni: add IOVA=VA support in KNI lib
> >>>   kni: add app specific mempool create and free routines
> >>>   kni: modify IOVA mode checks to support VA
> >>
> >> Hi Vamsi,
> >>
> >> I am aware that this patchset is around for a long time, and I have
> >> seen your request to merge in 19.11, but as you can understand the
> >> concern I have is to break KNI or existing KNI applications while trying to
> add this new feature.
> >>
> >> In high level, there are two issues,
> >>
> >> 1) kernel modules updates expect there will be a backed device of the
> >> KNI which is not always true:
> >>
> >>          if (dev_info.iova_mode) {
> >>  #ifdef HAVE_IOVA_AS_VA_SUPPORT
> >>                  pci = pci_get_device(dev_info.vendor_id,
> >>                                       dev_info.device_id, NULL);
> >>                  if (pci == NULL) {
> >>                          pr_err("pci dev does not exist\n");
> >>                          return -ENODEV;
> >>                  }
> >>
> >> For example this breaks:
> >> ./build/app/testpmd -w0:0.0 --vdev net_kni0 --vdev net_kni1  -- -i
> >
> > Vamsi> Yes, these can be fixed by forcing iommu_mode to PA for
> > vdev or vdev&pdev based KNI usecases.
> >
> >>
> >>
> >> 2) Applications will have to change the API to allocate the mempool.
> >> If the user upgraded to new version of DPDK, now it is possible to
> >> have iova=va mode and application should use new KNI API
> 'rte_kni_pktmbuf_pool_create()'
> >> to allocate mempool. And most probably application will have datapath
> >> and will use the KNI only for exception path, will there be any
> >> affect using KNI version of mempool alloc?
> >
> > Vamsi> There would not be any affect in using KNI version of mempool.
> 
> If you were developing a product, would you rely on a KNI API for pkt_mbuf
> allocation? What if there is a problem, will it be as easy as mempool/mbuf
> API to figure out and fix? I am not sure about pushing users to this direction?

Vamsi > If the user wants to run a KNI app in iova=va mode, mempool populate needs to ensure that no mbuf is allocated across a page boundary. IMO, after adding the config option/cmd line parameter to enable KNI iova=va mode, existing KNI stays intact and these mempool APIs are only called upon request. I will document these details in the KNI documentation. Based on earlier comments and discussions with Olivier and Andrew on the mempool populate patch, we arrived at this solution.
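(A rough sketch of the constraint the populate routine has to enforce, not the actual patch code; addr/off/obj_sz/pg_sz are placeholder names and pg_sz is assumed to be a power of two.)

	/* an object of obj_sz bytes placed at offset 'off' inside a chunk
	 * starting at 'addr' must not span two pages of size pg_sz */
	uintptr_t start = (uintptr_t)addr + off;
	uintptr_t end = start + obj_sz - 1;

	if ((start & ~(uintptr_t)(pg_sz - 1)) != (end & ~(uintptr_t)(pg_sz - 1)))
		off += pg_sz - (start & (pg_sz - 1)); /* skip to the next page */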

Hi Olivier,
Please let us know your thoughts on this patch. I am open to any suggestions/solutions to fix the mempool populate issue for the KNI iova=va use case.
 
> 
> >
> >>
> >>
> >> I would like to see KNI is enabled via iova=va mode, but can we have
> >> it limited to a specific command line argument or config option? This
> >> increases the test surface but at least old application can continue
> >> to work by default, what do you think?
> >
> > Vamsi> Yes, it's appropriate to control the mode to ensure old apps work
> by default.
> > We are fine with having a command line arg or config option to enable KNI
> in iova=va mode.
> > Earlier we thought of having similar approach that also controls
> > mempool allocation using a newer mempool flag. After multiple reviews,
> > flag has been discard and added a separate mempool populate routine for
> these usecase.
> 
> I didn't like the idea of having a flag in mempool library, but perhaps we can
> have it in KNI scope.

Vamsi> I did not quite follow what KNI scope means here. IMO, even if the flag lives in KNI scope, it ultimately has to take effect in the mempool lib, right?

> 
> >
> > When command line arg/config option is introduced, functionality will be
> as below.
> > Please correct me if any cases are missed or not considered.
> > Without command:
> > Existing KNI is intact, iommu mode will be PA.
> 
> +1
> 
> > With  command:
> > Pdev/vdev's iommu mode is considered and accordingly iova=va/pa is
> > enabled. Application is supposed to use KNI version of mempool alloc.
> 
> +1
> 
> > I think these mempool quirk will go away when Olivier's mempool
> > patchset(RFC) is merged.
> >
> >>
> >> And I will put a few minor comments to the patches...
> >>
> >


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-10-16 16:14                             ` Ferruh Yigit
@ 2019-10-18 17:15                               ` Vamsi Krishna Attunuru
  2019-10-21 11:45                                 ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-18 17:15 UTC (permalink / raw)
  To: Ferruh Yigit, Stephen Hemminger, Yigit, Ferruh
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, olivier.matz,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda



> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Ferruh Yigit
> Sent: Wednesday, October 16, 2019 9:44 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Stephen Hemminger
> <stephen@networkplumber.org>; Yigit, Ferruh
> <ferruh.yigit@linux.intel.com>
> Cc: dev@dpdk.org; thomas@monjalon.net; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; olivier.matz@6wind.com;
> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> Kokkilagadda <kirankumark@marvell.com>
> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support
> in KNI module
> 
> On 10/16/2019 12:26 PM, Vamsi Krishna Attunuru wrote:
> >
> >
> >> -----Original Message-----
> >> From: Stephen Hemminger <stephen@networkplumber.org>
> >> Sent: Tuesday, October 15, 2019 9:16 PM
> >> To: Yigit, Ferruh <ferruh.yigit@linux.intel.com>
> >> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
> >> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> >> olivier.matz@6wind.com; ferruh.yigit@intel.com;
> >> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
> >> Kokkilagadda <kirankumark@marvell.com>
> >> Subject: [EXT] Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA
> >> support in KNI module
> >>
> >> External Email
> >>
> >> ---------------------------------------------------------------------
> >> -
> >> On Tue, 15 Oct 2019 16:43:08 +0100
> >> "Yigit, Ferruh" <ferruh.yigit@linux.intel.com> wrote:
> >>
> >>> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
> >>>> From: Kiran Kumar K <kirankumark@marvell.com>
> >>>>
> >>>> Patch adds support for kernel module to work in IOVA = VA mode, the
> >>>> idea is to get physical address from IOVA address using
> >>>> iommu_iova_to_phys API and later use phys_to_virt API to convert
> >>>> the physical address to kernel virtual address.
> >>>>
> >>>> When compared with IOVA = PA mode, there is no performance drop
> >>>> with this approach.
> >>>>
> >>>> This approach does not work with the kernel versions less than
> >>>> 4.4.0 because of API compatibility issues.
> >>>>
> >>>> Patch also updates these support details in KNI documentation.
> >>>>
> >>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> >>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> >>>
> >>> <...>
> >>>
> >>>> @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t
> >> ioctl_num,
> >>>>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
> >>>>
> >>>>  	/* Translate user space info into kernel space info */
> >>>> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> >>>> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> >>>> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> >>>> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> >>>> -
> >>>> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> >>>> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> >>>> -	kni->sync_va = dev_info.sync_va;
> >>>> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> >>>> +	if (dev_info.iova_mode) {
> >>>> +#ifdef HAVE_IOVA_AS_VA_SUPPORT
> >>>> +		pci = pci_get_device(dev_info.vendor_id,
> >>>> +				     dev_info.device_id, NULL);
> >>>> +		if (pci == NULL) {
> >>>> +			pr_err("pci dev does not exist\n");
> >>>> +			return -ENODEV;
> >>>> +		}
> >>>
> >>> If there is no PCI device KNI should still work.
> >>
> >> Right now it is possible to use KNI with netvsc PMD on Hyper-V/Azure.
> >> With this patch that won't be possible.
> >
> > Hi Ferruh, Stephen,
> >
> > These can be fixed by forcing iommu_mode as PA when vdevs are used for
> > KNI usecase.
> >
> > rte_bus_get_iommu_class(void)
> >  {
> >         enum rte_iova_mode mode = RTE_IOVA_DC;
> > +       struct rte_devargs *devargs = NULL;
> >         bool buses_want_va = false;
> >         bool buses_want_pa = false;
> >         struct rte_bus *bus;
> >
> > +       if (rte_eal_check_module("rte_kni") == 1) {
> > +               RTE_EAL_DEVARGS_FOREACH("vdev", devargs) {
> > +                       return RTE_IOVA_PA;
> > +               }
> > +       }
> > +
> >         TAILQ_FOREACH(bus, &rte_bus_list, next) {
> >                 enum rte_iova_mode bus_iova_mode;
> >
> > I think this will solve various use cases/combinations like PA or VA mode,
> pdev or vdev used for KNI.
> > Existing use cases would not be affected by these patch series with above
> fix.
> >
> 
> Hi Vamsi,
> 
> I think this is not a problem of using vdev so I think we can't solve this via
> vdev check only.
> 
> The sample I give (KNI PMD) is using vdev, but application can use KNI library
> APIs directly to create kni interface and have kernel/user space
> communication without any device involved. KNI PMD (vdev) is just a
> wrapper to make this easy.
> 
> Just thinking aloud,
> KNI is sharing in userspace buffer with kernel, so basically it needs to do
> virtual address to kernel virtual address translation, in a reasonably fast
> manner.
> 
> iova=va breaks KNI because:
> 1) physical memory is not continuous anymore which break our address
> translation logic
> 2) we were using physical address of the buffer for the address translation,
> but we have no more have it, we now have iova address.
> 
> I assume 1) is resolved with 'rte_kni_pktmbuf_pool_create()' it would be
> helpful though if you can explain how this works?

The KNI kernel module uses pa2va and va2pa calls to translate buffer pointers; this only works when pa and va are 1-to-1 contiguous. For that, DPDK mbuf memory must not cross a physical page boundary: if it does, its iova/va is still contiguous but its pa may or may not be. The KNI pktmbuf pool API ensures that the mempool is not populated with mbufs that cross a page boundary, so once the pool is created, every mbuf in it resides within the page limits.
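
As a rough sketch of the offset-based translation being relied on here (simplified; the field names assume the rte_kni_mbuf layout shared with the kernel module, this is not the literal kni_net.c code):

/* Assumes the rte_kni_mbuf definition from rte_kni_common.h. The module
 * recovers the user VA for a given PA by reusing the per-mbuf VA/PA
 * offset. A single offset is only valid for the whole object if the
 * object does not cross a page boundary, hence the pool restriction.
 */
static void *
pa2va_sketch(void *pa, struct rte_kni_mbuf *m)
{
	return (void *)((unsigned long)pa +
			(unsigned long)m->buf_addr -
			(unsigned long)m->buf_physaddr);
}
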
> 
> For second, a simple question, do we need to get a PCIe device information
> to be able to convert iova to kernel virtual address? Can't I get this
> information from iommu somehow?
> Think about a case "--iova-mode=va" provided but there is no physical device
> bind to vfio-pci, can I still allocated memor? And how can I use KNI in that
> case?

We found a way to translate iova/uva to kva using kernel mm subsystem calls: with get_user_pages_remote(), the KNI module is able to get the iova-to-kva mapping. This solution worked for pdevs (without sharing any dev info) and also for wrapper vdev PMDs.

I will push the next version of the patches with this solution and the other command-line arg changes. Since the major architectural issue with enabling iova=va for KNI is fixed, we are planning to merge this patch series in the 19.11 release.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v11 0/4] kni: add IOVA=VA mode support
  2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
                                       ` (6 preceding siblings ...)
  2019-10-15 15:34                     ` Yigit, Ferruh
@ 2019-10-21  8:03                     ` vattunuru
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 1/4] mempool: populate mempool with the page sized chunks vattunuru
                                         ` (4 more replies)
  7 siblings, 5 replies; 251+ messages in thread
From: vattunuru @ 2019-10-21  8:03 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V11 Changes:
* Added iova-to-kva address translation routines in the kernel module to
make it work in iova=va mode, which enables DPDK to create KNI devices
on any kind of backing device/memory.
* Added ``--legacy-kni`` eal option to make existing KNI applications
work with DPDK 19.11 and later versions.
* Removed previously added pci device info from kni device info struct.
 
V10 Changes:
* Fixed function return code on failure when min_chunk_size > pg_sz.
* Marked new mempool populate routine as EXPERIMENTAL.
 
V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in the
new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.
 
V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones
with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.
 
V7 Changes:
* Removed the previously proposed mempool flag and made the page
boundary checks the default in mempool populate(), except for objects
bigger than the page size.
* Removed KNI example application related changes since pool related
requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
mode is enabled.
 
V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered across
page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new mempool
flag.
 
V5 changes:
* Fixed build issue with 32b build
 
V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernel above 4.4.0
 
V3 Changes:
* Added a new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

Vamsi Attunuru (4):
  mempool: populate mempool with the page sized chunks
  eal: add legacy kni option
  kni: add IOVA=VA support
  kni: add IOVA=VA support in kernel module

 doc/guides/prog_guide/kernel_nic_interface.rst    | 26 +++++++++
 doc/guides/rel_notes/release_19_11.rst            | 17 ++++++
 examples/kni/main.c                               |  6 +-
 kernel/linux/kni/compat.h                         |  4 ++
 kernel/linux/kni/kni_dev.h                        | 31 ++++++++++
 kernel/linux/kni/kni_misc.c                       | 39 ++++++++++---
 kernel/linux/kni/kni_net.c                        | 58 ++++++++++++++-----
 lib/librte_eal/common/eal_common_options.c        |  5 ++
 lib/librte_eal/common/eal_internal_cfg.h          |  2 +
 lib/librte_eal/common/eal_options.h               |  2 +
 lib/librte_eal/linux/eal/eal.c                    | 40 +++++++++----
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 67 ++++++++++++++++++++--
 lib/librte_kni/rte_kni.h                          | 48 ++++++++++++++++
 lib/librte_kni/rte_kni_version.map                |  3 +
 lib/librte_mempool/rte_mempool.c                  | 69 +++++++++++++++++++++++
 lib/librte_mempool/rte_mempool.h                  | 20 +++++++
 lib/librte_mempool/rte_mempool_version.map        |  2 +
 20 files changed, 402 insertions(+), 40 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v11 1/4] mempool: populate mempool with the page sized chunks
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
@ 2019-10-21  8:03                       ` vattunuru
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option vattunuru
                                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-10-21  8:03 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch adds a routine to populate a mempool from page-aligned and
page-sized chunks of memory, to ensure that memory objects do not
span page boundaries. It is useful for applications that require
physically contiguous mbuf memory while running in IOVA=VA mode.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Acked-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c           | 69 ++++++++++++++++++++++++++++++
 lib/librte_mempool/rte_mempool.h           | 20 +++++++++
 lib/librte_mempool/rte_mempool_version.map |  2 +
 3 files changed, 91 insertions(+)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 0f29e87..ef1298d 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,75 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Function to populate a mempool from page-sized memory chunks: reserve
+ * page-sized memzones, populate the mempool from them, and return the number
+ * of objects added, or a negative value on error.
+ */
+int
+rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp)
+{
+	char mz_name[RTE_MEMZONE_NAMESIZE];
+	size_t align, pg_sz, pg_shift;
+	const struct rte_memzone *mz;
+	unsigned int mz_id, n;
+	size_t min_chunk_size;
+	int ret;
+
+	ret = mempool_ops_alloc_once(mp);
+	if (ret != 0)
+		return ret;
+
+	if (mp->nb_mem_chunks != 0)
+		return -EEXIST;
+
+	pg_sz = get_min_page_size(mp->socket_id);
+	pg_shift = rte_bsf32(pg_sz);
+
+	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
+
+		ret = rte_mempool_ops_calc_mem_size(mp, n,
+				pg_shift, &min_chunk_size, &align);
+
+		if (ret < 0)
+			goto fail;
+
+		if (min_chunk_size > pg_sz) {
+			ret = -EINVAL;
+			goto fail;
+		}
+
+		ret = snprintf(mz_name, sizeof(mz_name),
+			RTE_MEMPOOL_MZ_FORMAT "_%d", mp->name, mz_id);
+		if (ret < 0 || ret >= (int)sizeof(mz_name)) {
+			ret = -ENAMETOOLONG;
+			goto fail;
+		}
+
+		mz = rte_memzone_reserve_aligned(mz_name, min_chunk_size,
+				mp->socket_id, 0, align);
+
+		if (mz == NULL) {
+			ret = -rte_errno;
+			goto fail;
+		}
+
+		ret = rte_mempool_populate_iova(mp, mz->addr,
+				mz->iova, mz->len,
+				rte_mempool_memchunk_mz_free,
+				(void *)(uintptr_t)mz);
+		if (ret < 0) {
+			rte_memzone_free(mz);
+			goto fail;
+		}
+	}
+
+	return mp->size;
+
+fail:
+	rte_mempool_free_memchunks(mp);
+	return ret;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a..2f5126e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1062,6 +1062,26 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	void *opaque);
 
 /**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Add memory from page-sized memzones for objects in the pool at init
+ *
+ * This function populates the mempool with page-aligned, page-sized memzone
+ * memory to avoid spreading an object across two pages and to ensure that
+ * every mempool object resides within a single page.
+ *
+ * @param mp
+ *   A pointer to the mempool structure.
+ * @return
+ *   The number of objects added on success.
+ *   On error, the chunk is not added in the memory list of the
+ *   mempool and a negative errno is returned.
+ */
+__rte_experimental
+int rte_mempool_populate_from_pg_sz_chunks(struct rte_mempool *mp);
+
+/**
  * Add memory for objects in the pool at init
  *
  * This is the default function used by rte_mempool_create() to populate
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca4..d6fe5a5 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -57,4 +57,6 @@ EXPERIMENTAL {
 	global:
 
 	rte_mempool_ops_get_info;
+	# added in 19.11
+	rte_mempool_populate_from_pg_sz_chunks;
 };
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 1/4] mempool: populate mempool with the page sized chunks vattunuru
@ 2019-10-21  8:03                       ` vattunuru
  2019-10-21 11:55                         ` Ferruh Yigit
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 3/4] kni: add IOVA=VA support vattunuru
                                         ` (2 subsequent siblings)
  4 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-10-21  8:03 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

This adds a "--legacy-kni" command-line option. It will
be used to run existing KNI applications with DPDK 19.11
and later.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 doc/guides/rel_notes/release_19_11.rst     | 4 ++++
 lib/librte_eal/common/eal_common_options.c | 5 +++++
 lib/librte_eal/common/eal_internal_cfg.h   | 2 ++
 lib/librte_eal/common/eal_options.h        | 2 ++
 4 files changed, 13 insertions(+)

diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 85953b9..ab2c381 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -115,6 +115,10 @@ New Features
   Added eBPF JIT support for arm64 architecture to improve the eBPF program
   performance.
 
+* **Added EAL option to operate KNI in legacy mode.**
+
+  Added EAL option ``--legacy-kni`` to make existing KNI applications work
+  with DPDK 19.11 and later.
 
 Removed Items
 -------------
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 05cae5f..8f5174e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -81,6 +81,7 @@ eal_long_options[] = {
 	{OPT_LEGACY_MEM,        0, NULL, OPT_LEGACY_MEM_NUM       },
 	{OPT_SINGLE_FILE_SEGMENTS, 0, NULL, OPT_SINGLE_FILE_SEGMENTS_NUM},
 	{OPT_MATCH_ALLOCATIONS, 0, NULL, OPT_MATCH_ALLOCATIONS_NUM},
+	{OPT_LEGACY_KNI,        0, NULL, OPT_LEGACY_KNI_NUM       },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -1408,6 +1409,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_LEGACY_KNI_NUM:
+		conf->legacy_kni = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
@@ -1636,6 +1640,7 @@ eal_common_usage(void)
 	       "                      (ex: --vdev=net_pcap0,iface=eth2).\n"
 	       "  --"OPT_IOVA_MODE"   Set IOVA mode. 'pa' for IOVA_PA\n"
 	       "                      'va' for IOVA_VA\n"
+	       "  --"OPT_LEGACY_KNI"  Run KNI in IOVA_PA mode (legacy mode)\n"
 	       "  -d LIB.so|DIR       Add a driver or driver directory\n"
 	       "                      (can be used multiple times)\n"
 	       "  --"OPT_VMWARE_TSC_MAP"    Use VMware TSC map instead of native RDTSC\n"
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index a42f349..eee71ec 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -82,6 +82,8 @@ struct internal_config {
 	rte_cpuset_t ctrl_cpuset;         /**< cpuset for ctrl threads */
 	volatile unsigned int init_complete;
 	/**< indicates whether EAL has completed initialization */
+	volatile unsigned legacy_kni;
+	/**< true to enable legacy kni behavior */
 };
 extern struct internal_config internal_config; /**< Global EAL configuration. */
 
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index 9855429..1010ed3 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -69,6 +69,8 @@ enum {
 	OPT_IOVA_MODE_NUM,
 #define OPT_MATCH_ALLOCATIONS  "match-allocations"
 	OPT_MATCH_ALLOCATIONS_NUM,
+#define OPT_LEGACY_KNI      "legacy-kni"
+	OPT_LEGACY_KNI_NUM,
 	OPT_LONG_MAX_NUM
 };
 
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v11 3/4] kni: add IOVA=VA support
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 1/4] mempool: populate mempool with the page sized chunks vattunuru
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option vattunuru
@ 2019-10-21  8:03                       ` vattunuru
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 4/4] kni: add IOVA=VA support in kernel module vattunuru
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-10-21  8:03 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

The current KNI implementation only operates in IOVA_PA mode;
this patch adds the required functionality to enable KNI in
IOVA_VA mode.

The packet pool's mbuf memory must be physically contiguous
for the KNI kernel module to work in IOVA=VA mode, so new KNI
packet pool create APIs are introduced to take care of this
memory requirement.

examples/kni/ is updated to use this API to enable IOVA as VA
or IOVA as PA mode.

Existing KNI applications can use the ``--legacy-kni`` eal option
to work with DPDK 19.11 and later versions. When this option
is selected, the IOVA mode is forced to PA so that old
KNI applications work with the latest DPDK without code changes.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst    | 26 +++++++++
 doc/guides/rel_notes/release_19_11.rst            | 13 +++++
 examples/kni/main.c                               |  6 +-
 lib/librte_eal/linux/eal/eal.c                    | 39 +++++++++----
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/Makefile                           |  1 +
 lib/librte_kni/meson.build                        |  1 +
 lib/librte_kni/rte_kni.c                          | 67 +++++++++++++++++++++--
 lib/librte_kni/rte_kni.h                          | 48 ++++++++++++++++
 lib/librte_kni/rte_kni_version.map                |  3 +
 10 files changed, 187 insertions(+), 18 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 2fd58e1..80f731c 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -300,6 +300,32 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI can be operated in IOVA_VA scheme when
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) and
+- eal option `iova-mode=va` is passed or bus IOVA scheme in the DPDK is selected
+  as RTE_IOVA_VA.
+
+Packet Pool APIs for IOVA=VA mode
+---------------------------------
+
+The ``rte_kni_pktmbuf_pool_create`` and ``rte_kni_pktmbuf_pool_free`` APIs must
+be used to create packet pools for KNI applications running in IOVA=VA mode.
+The packet pool's mbuf memory must be physically contiguous for the KNI kernel
+module to work in IOVA=VA mode; this memory requirement is taken care of inside
+these KNI packet pool create APIs.
+
+Command-line option for legacy KNI
+----------------------------------
+
+Existing KNI applications can use the ``--legacy-kni`` eal command-line option
+to work with DPDK 19.11 and later versions. When this option is selected, the
+IOVA mode is forced to PA so that old KNI applications work with the latest
+DPDK without code changes.
+
 Ethtool
 -------
 
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index ab2c381..e4296a0 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -120,6 +120,19 @@ New Features
   Added EAL option ``--legacy-kni`` to make existing KNI applications work
   with DPDK 19.11 and later.
 
+* **Added IOVA as VA support for KNI.**
+
+  Added IOVA as VA support for KNI. When KNI needs to operate in IOVA = VA
+  mode, the packet pool's mbuf memory must be physically contiguous. This
+  memory requirement is taken care of by the new ``rte_kni_pktmbuf_pool_create``
+  and ``rte_kni_pktmbuf_pool_free`` routines.
+
+  The ``examples/kni/`` application is updated to use these APIs to enable
+  IOVA as VA or IOVA as PA mode.
+
+  When ``--legacy-kni`` is selected, the IOVA mode is forced to PA to let
+  old KNI applications work with the latest DPDK without code changes.
+
 Removed Items
 -------------
 
diff --git a/examples/kni/main.c b/examples/kni/main.c
index c576fc7..d2f3b46 100644
--- a/examples/kni/main.c
+++ b/examples/kni/main.c
@@ -1017,8 +1017,9 @@ main(int argc, char** argv)
 		rte_exit(EXIT_FAILURE, "Could not parse input parameters\n");
 
 	/* Create the mbuf pool */
-	pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
+	pktmbuf_pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
 		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
+
 	if (pktmbuf_pool == NULL) {
 		rte_exit(EXIT_FAILURE, "Could not initialise mbuf pool\n");
 		return -1;
@@ -1085,6 +1086,9 @@ main(int argc, char** argv)
 			continue;
 		kni_free_kni(port);
 	}
+
+	rte_kni_pktmbuf_pool_free(pktmbuf_pool);
+
 	for (i = 0; i < RTE_MAX_ETHPORTS; i++)
 		if (kni_port_params_array[i]) {
 			rte_free(kni_port_params_array[i]);
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index f397206..f807044 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -947,6 +947,29 @@ static int rte_eal_vfio_setup(void)
 }
 #endif
 
+static enum rte_iova_mode
+rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode)
+{
+	if (iova_mode == RTE_IOVA_PA)
+		goto exit;
+
+	if (internal_config.legacy_kni) {
+		iova_mode = RTE_IOVA_PA;
+		RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because legacy KNI is enabled\n");
+		goto exit;
+	}
+
+	if (iova_mode == RTE_IOVA_VA) {
+#if KERNEL_VERSION(4, 8, 0) > LINUX_VERSION_CODE
+		iova_mode = RTE_IOVA_PA;
+		RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
+#endif
+	}
+
+exit:
+	return iova_mode;
+}
+
 static void rte_eal_init_alert(const char *msg)
 {
 	fprintf(stderr, "EAL: FATAL: %s\n", msg);
@@ -1110,24 +1133,16 @@ rte_eal_init(int argc, char **argv)
 				RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
 			}
 		}
-#ifdef RTE_LIBRTE_KNI
-		/* Workaround for KNI which requires physical address to work */
-		if (iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			if (phys_addrs) {
-				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
-			} else {
-				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
-			}
-		}
-#endif
 		rte_eal_get_configuration()->iova_mode = iova_mode;
 	} else {
 		rte_eal_get_configuration()->iova_mode =
 			internal_config.iova_mode;
 	}
 
+	if (rte_eal_check_module("rte_kni") == 1)
+		rte_eal_get_configuration()->iova_mode =
+				rte_eal_kni_get_iova_mode(rte_eal_iova_mode());
+
 	if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
 		rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
 		rte_errno = EINVAL;
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index b51fe27..1b96cf6 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -123,6 +123,7 @@ struct rte_kni_device_info {
 	unsigned mbuf_size;
 	unsigned int mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/Makefile b/lib/librte_kni/Makefile
index cbd6599..6405524 100644
--- a/lib/librte_kni/Makefile
+++ b/lib/librte_kni/Makefile
@@ -6,6 +6,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
 # library name
 LIB = librte_kni.a
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3 -fno-strict-aliasing
 LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev
 
diff --git a/lib/librte_kni/meson.build b/lib/librte_kni/meson.build
index 41fa2e3..dd4c8da 100644
--- a/lib/librte_kni/meson.build
+++ b/lib/librte_kni/meson.build
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Intel Corporation
 
+allow_experimental_apis = true
 if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
 	build = false
 	reason = 'only supported on 64-bit linux'
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 0f36485..1e53f05 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -21,6 +21,7 @@
 #include <rte_tailq.h>
 #include <rte_rwlock.h>
 #include <rte_eal_memconfig.h>
+#include <rte_mbuf_pool_ops.h>
 #include <rte_kni_common.h>
 #include "rte_kni_fifo.h"
 
@@ -97,11 +98,6 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
-	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
-		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
-		return -1;
-	}
-
 	/* Check FD and open */
 	if (kni_fd < 0) {
 		kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
@@ -300,6 +296,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
@@ -687,6 +685,65 @@ kni_allocate_mbufs(struct rte_kni *kni)
 	}
 }
 
+struct rte_mempool *
+rte_kni_pktmbuf_pool_create(const char *name, unsigned int n,
+	unsigned int cache_size, uint16_t priv_size, uint16_t data_room_size,
+	int socket_id)
+{
+	struct rte_pktmbuf_pool_private mbp_priv;
+	const char *mp_ops_name;
+	struct rte_mempool *mp;
+	unsigned int elt_size;
+	int ret;
+
+	if (RTE_ALIGN(priv_size, RTE_MBUF_PRIV_ALIGN) != priv_size) {
+		RTE_LOG(ERR, MBUF, "mbuf priv_size=%u is not aligned\n",
+			priv_size);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	elt_size = sizeof(struct rte_mbuf) + (unsigned int)priv_size +
+		(unsigned int)data_room_size;
+	mbp_priv.mbuf_data_room_size = data_room_size;
+	mbp_priv.mbuf_priv_size = priv_size;
+
+	mp = rte_mempool_create_empty(name, n, elt_size, cache_size,
+		 sizeof(struct rte_pktmbuf_pool_private), socket_id, 0);
+	if (mp == NULL)
+		return NULL;
+
+	mp_ops_name = rte_mbuf_best_mempool_ops();
+	ret = rte_mempool_set_ops_byname(mp, mp_ops_name, NULL);
+	if (ret != 0) {
+		RTE_LOG(ERR, MBUF, "error setting mempool handler\n");
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+	rte_pktmbuf_pool_init(mp, &mbp_priv);
+
+	if (rte_eal_iova_mode() == RTE_IOVA_VA)
+		ret = rte_mempool_populate_from_pg_sz_chunks(mp);
+	else
+		ret = rte_mempool_populate_default(mp);
+
+	if (ret < 0) {
+		rte_mempool_free(mp);
+		rte_errno = -ret;
+		return NULL;
+	}
+
+	rte_mempool_obj_iter(mp, rte_pktmbuf_init, NULL);
+
+	return mp;
+}
+
+void
+rte_kni_pktmbuf_pool_free(struct rte_mempool *mp)
+{
+	rte_mempool_free(mp);
+}
+
 struct rte_kni *
 rte_kni_get(const char *name)
 {
diff --git a/lib/librte_kni/rte_kni.h b/lib/librte_kni/rte_kni.h
index f6b66c3..2cfdc38 100644
--- a/lib/librte_kni/rte_kni.h
+++ b/lib/librte_kni/rte_kni.h
@@ -187,6 +187,54 @@ unsigned rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs,
 		unsigned num);
 
 /**
+ * Create a kni packet mbuf pool.
+ *
+ * This function creates and initializes a packet mbuf pool for KNI applications.
+ * It calls the required mempool populate routine based on the IOVA mode.
+ *
+ * @param name
+ *   The name of the mbuf pool.
+ * @param n
+ *   The number of elements in the mbuf pool. The optimum size (in terms
+ *   of memory usage) for a mempool is when n is a power of two minus one:
+ *   n = (2^q - 1).
+ * @param cache_size
+ *   Size of the per-core object cache. See rte_mempool_create() for
+ *   details.
+ * @param priv_size
+ *   Size of the application private area between the rte_mbuf structure
+ *   and the data buffer. This value must be aligned to RTE_MBUF_PRIV_ALIGN.
+ * @param data_room_size
+ *   Size of data buffer in each mbuf, including RTE_PKTMBUF_HEADROOM.
+ * @param socket_id
+ *   The socket identifier where the memory should be allocated. The
+ *   value can be *SOCKET_ID_ANY* if there is no NUMA constraint for the
+ *   reserved zone.
+ * @return
+ *   The pointer to the new allocated mempool, on success. NULL on error
+ *   with rte_errno set appropriately. Possible rte_errno values include:
+ *    - E_RTE_NO_CONFIG - function could not get pointer to rte_config structure
+ *    - E_RTE_SECONDARY - function was called from a secondary process instance
+ *    - EINVAL - cache size provided is too large, or priv_size is not aligned.
+ *    - ENOSPC - the maximum number of memzones has already been allocated
+ *    - EEXIST - a memzone with the same name already exists
+ *    - ENOMEM - no appropriate memory area found in which to create memzone
+ */
+__rte_experimental
+struct rte_mempool *rte_kni_pktmbuf_pool_create(const char *name,
+		unsigned int n, unsigned int cache_size, uint16_t priv_size,
+		uint16_t data_room_size, int socket_id);
+
+/**
+ * Free the given packet mempool.
+ *
+ * @param mp
+ *  The mempool pointer.
+ */
+__rte_experimental
+void rte_kni_pktmbuf_pool_free(struct rte_mempool *mp);
+
+/**
  * Get the KNI context of its name.
  *
  * @param name
diff --git a/lib/librte_kni/rte_kni_version.map b/lib/librte_kni/rte_kni_version.map
index c877dc6..5937bff 100644
--- a/lib/librte_kni/rte_kni_version.map
+++ b/lib/librte_kni/rte_kni_version.map
@@ -19,5 +19,8 @@ DPDK_2.0 {
 EXPERIMENTAL {
 	global:
 
+	# added in 19.11
 	rte_kni_update_link;
+	rte_kni_pktmbuf_pool_create;
+	rte_kni_pktmbuf_pool_free;
 };
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v11 4/4] kni: add IOVA=VA support in kernel module
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
                                         ` (2 preceding siblings ...)
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 3/4] kni: add IOVA=VA support vattunuru
@ 2019-10-21  8:03                       ` vattunuru
  2019-10-21 12:02                         ` Ferruh Yigit
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
  4 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-10-21  8:03 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch adds support for the kernel module to work in
IOVA = VA mode by providing address translation
routines to convert IOVA (i.e. user space VA) to
kernel virtual addresses.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 kernel/linux/kni/compat.h   |  4 ++++
 kernel/linux/kni/kni_dev.h  | 31 ++++++++++++++++++++++++
 kernel/linux/kni/kni_misc.c | 39 +++++++++++++++++++++++-------
 kernel/linux/kni/kni_net.c  | 58 +++++++++++++++++++++++++++++++++++----------
 4 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..b5e8914 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,7 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 8, 0) <= LINUX_VERSION_CODE
+#define HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..abe9b14 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -41,6 +41,8 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
@@ -84,8 +86,37 @@ struct kni_dev {
 	void *va[MBUF_BURST_SZ];
 	void *alloc_pa[MBUF_BURST_SZ];
 	void *alloc_va[MBUF_BURST_SZ];
+
+	struct task_struct *usr_tsk;
 };
 
+static inline phys_addr_t iova_to_phys(struct task_struct *tsk,
+				       unsigned long iova)
+{
+	unsigned int flags = FOLL_TOUCH;
+	phys_addr_t offset, phys_addr;
+	struct page *page = NULL;
+	int ret;
+
+	offset = iova & (PAGE_SIZE - 1);
+
+	/* Read one page struct info */
+	ret =  get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				     flags, &page, 0, 0);
+	if (ret < 0)
+		return 0;
+
+	phys_addr = page_to_phys(page) | offset;
+	put_page(page);
+
+	return phys_addr;
+}
+
+static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova)
+{
+	return phys_to_virt(iova_to_phys(tsk, iova));
+}
+
 void kni_net_release_fifo_phy(struct kni_dev *kni);
 void kni_net_rx(struct kni_dev *kni);
 void kni_net_init(struct net_device *dev);
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 2b75502..7af7ab4 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -348,15 +348,36 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+		kni->tx_q = iova_to_kva(current, dev_info.tx_phys);
+		kni->rx_q = iova_to_kva(current, dev_info.rx_phys);
+		kni->alloc_q = iova_to_kva(current, dev_info.alloc_phys);
+		kni->free_q = iova_to_kva(current, dev_info.free_phys);
+
+		kni->req_q = iova_to_kva(current, dev_info.req_phys);
+		kni->resp_q = iova_to_kva(current, dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = iova_to_kva(current, dev_info.sync_phys);
+		kni->usr_tsk = current;
+		kni->iova_mode = 1;
+#else
+		pr_err("KNI module does not support IOVA to VA translation\n");
+		return -EINVAL;
+#endif
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index f25b127..e95207b 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,20 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *iova)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, (unsigned long)iova));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, m->buf_physaddr) +
+			    m->data_off);
+}
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +76,24 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +210,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +298,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +370,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +469,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +481,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +549,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +582,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support in KNI module
  2019-10-18 17:15                               ` Vamsi Krishna Attunuru
@ 2019-10-21 11:45                                 ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-10-21 11:45 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, Stephen Hemminger, Yigit, Ferruh
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, olivier.matz,
	anatoly.burakov, arybchenko, Kiran Kumar Kokkilagadda

On 10/18/2019 6:15 PM, Vamsi Krishna Attunuru wrote:
> 
> 
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Ferruh Yigit
>> Sent: Wednesday, October 16, 2019 9:44 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Stephen Hemminger
>> <stephen@networkplumber.org>; Yigit, Ferruh
>> <ferruh.yigit@linux.intel.com>
>> Cc: dev@dpdk.org; thomas@monjalon.net; Jerin Jacob Kollanukkaran
>> <jerinj@marvell.com>; olivier.matz@6wind.com;
>> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
>> Kokkilagadda <kirankumark@marvell.com>
>> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v10 4/5] kni: add IOVA=VA support
>> in KNI module
>>
>> On 10/16/2019 12:26 PM, Vamsi Krishna Attunuru wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Stephen Hemminger <stephen@networkplumber.org>
>>>> Sent: Tuesday, October 15, 2019 9:16 PM
>>>> To: Yigit, Ferruh <ferruh.yigit@linux.intel.com>
>>>> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
>>>> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>>>> olivier.matz@6wind.com; ferruh.yigit@intel.com;
>>>> anatoly.burakov@intel.com; arybchenko@solarflare.com; Kiran Kumar
>>>> Kokkilagadda <kirankumark@marvell.com>
>>>> Subject: [EXT] Re: [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA
>>>> support in KNI module
>>>>
>>>> External Email
>>>>
>>>> ---------------------------------------------------------------------
>>>> -
>>>> On Tue, 15 Oct 2019 16:43:08 +0100
>>>> "Yigit, Ferruh" <ferruh.yigit@linux.intel.com> wrote:
>>>>
>>>>> On 8/16/2019 7:12 AM, vattunuru@marvell.com wrote:
>>>>>> From: Kiran Kumar K <kirankumark@marvell.com>
>>>>>>
>>>>>> Patch adds support for kernel module to work in IOVA = VA mode, the
>>>>>> idea is to get physical address from IOVA address using
>>>>>> iommu_iova_to_phys API and later use phys_to_virt API to convert
>>>>>> the physical address to kernel virtual address.
>>>>>>
>>>>>> When compared with IOVA = PA mode, there is no performance drop
>>>>>> with this approach.
>>>>>>
>>>>>> This approach does not work with the kernel versions less than
>>>>>> 4.4.0 because of API compatibility issues.
>>>>>>
>>>>>> Patch also updates these support details in KNI documentation.
>>>>>>
>>>>>> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
>>>>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>>>>
>>>>> <...>
>>>>>
>>>>>> @@ -348,15 +351,65 @@ kni_ioctl_create(struct net *net, uint32_t
>>>> ioctl_num,
>>>>>>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>>>>>>
>>>>>>  	/* Translate user space info into kernel space info */
>>>>>> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
>>>>>> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
>>>>>> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
>>>>>> -	kni->free_q = phys_to_virt(dev_info.free_phys);
>>>>>> -
>>>>>> -	kni->req_q = phys_to_virt(dev_info.req_phys);
>>>>>> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
>>>>>> -	kni->sync_va = dev_info.sync_va;
>>>>>> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
>>>>>> +	if (dev_info.iova_mode) {
>>>>>> +#ifdef HAVE_IOVA_AS_VA_SUPPORT
>>>>>> +		pci = pci_get_device(dev_info.vendor_id,
>>>>>> +				     dev_info.device_id, NULL);
>>>>>> +		if (pci == NULL) {
>>>>>> +			pr_err("pci dev does not exist\n");
>>>>>> +			return -ENODEV;
>>>>>> +		}
>>>>>
>>>>> If there is no PCI device KNI should still work.
>>>>
>>>> Right now it is possible to use KNI with netvsc PMD on Hyper-V/Azure.
>>>> With this patch that won't be possible.
>>>
>>> Hi Ferruh, Stephen,
>>>
>>> These can be fixed by forcing iommu_mode as PA when vdevs are used for
>>> KNI usecase.
>>>
>>> rte_bus_get_iommu_class(void)
>>>  {
>>>         enum rte_iova_mode mode = RTE_IOVA_DC;
>>> +       struct rte_devargs *devargs = NULL;
>>>         bool buses_want_va = false;
>>>         bool buses_want_pa = false;
>>>         struct rte_bus *bus;
>>>
>>> +       if (rte_eal_check_module("rte_kni") == 1) {
>>> +               RTE_EAL_DEVARGS_FOREACH("vdev", devargs) {
>>> +                       return RTE_IOVA_PA;
>>> +               }
>>> +       }
>>> +
>>>         TAILQ_FOREACH(bus, &rte_bus_list, next) {
>>>                 enum rte_iova_mode bus_iova_mode;
>>>
>>> I think this will solve various use cases/combinations like PA or VA mode,
>> pdev or vdev used for KNI.
>>> Existing use cases would not be affected by these patch series with above
>> fix.
>>>
>>
>> Hi Vamsi,
>>
>> I think this is not a problem of using vdev so I think we can't solve this via
>> vdev check only.
>>
>> The sample I give (KNI PMD) is using vdev, but application can use KNI library
>> APIs directly to create kni interface and have kernel/user space
>> communication without any device involved. KNI PMD (vdev) is just a
>> wrapper to make this easy.
>>
>> Just thinking aloud,
>> KNI is sharing in userspace buffer with kernel, so basically it needs to do
>> virtual address to kernel virtual address translation, in a reasonably fast
>> manner.
>>
>> iova=va breaks KNI because:
>> 1) physical memory is not continuous anymore which break our address
>> translation logic
>> 2) we were using physical address of the buffer for the address translation,
>> but we have no more have it, we now have iova address.
>>
>> I assume 1) is resolved with 'rte_kni_pktmbuf_pool_create()' it would be
>> helpful though if you can explain how this works?
> 
> KNI kernel module uses pa2va, va2pa calls to translate buffer pointers, this will only work when both pa & va are 1-to-1 contiguous. In such cases, dpdk mbuf memory should not cross physical page boundary,  If it crosses, it's iova/va will be contiguous, but it's pa may or may not be contiguous. Kni pktmbuf pool api ensures that mempool is not populated with those type of mbufs(that cross page boundary). Once the pool is created, all mbufs in that pool resides with in the page limits.

ack.

>>
>> For second, a simple question, do we need to get a PCIe device information
>> to be able to convert iova to kernel virtual address? Can't I get this
>> information from iommu somehow?
>> Think about a case "--iova-mode=va" provided but there is no physical device
>> bind to vfio-pci, can I still allocated memor? And how can I use KNI in that
>> case?
> 
> We found a way to translate iova/uva to kva using kernel mm subsystem calls, using get_user_pages_remote() call KNI module is able to get mapping for iova to kva. This solution worked for pdevs(without sharing any dev info) and also for wrapper vdev pmds.

Good to hear this.

> 
> I will push next version of patches with these solution and other command line arg changes. Since the major architectural issue is fixed in enabling iova=va for KNI, we are planning to merge these patch series in 19.11 release.
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option vattunuru
@ 2019-10-21 11:55                         ` Ferruh Yigit
  2019-10-21 13:13                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-10-21 11:55 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, anatoly.burakov,
	arybchenko, stephen

On 10/21/2019 9:03 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> This adds a "--legacy-kni" command-line option. It will
> be used to run existing KNI applications with DPDK 19.11
> and later.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>

<...>

> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index 9855429..1010ed3 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -69,6 +69,8 @@ enum {
>  	OPT_IOVA_MODE_NUM,
>  #define OPT_MATCH_ALLOCATIONS  "match-allocations"
>  	OPT_MATCH_ALLOCATIONS_NUM,
> +#define OPT_LEGACY_KNI      "legacy-kni"
> +	OPT_LEGACY_KNI_NUM,
>  	OPT_LONG_MAX_NUM
>  };

Two concerns,

1- "legacy-kni" doesn't have enough context

2- I prefer to keep the existing behavior as the default, at least for a while
(something like until the next LTS); meanwhile this patch can be around for a
good amount of time, and later it can be a good point to switch.

Based on the above two points, what do you think about renaming the option to
'kni-iova-va'? If it is not set, the default will be "IOVA=PA"; when set, it
will enable "IOVA=VA" mode.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v11 4/4] kni: add IOVA=VA support in kernel module
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 4/4] kni: add IOVA=VA support in kernel module vattunuru
@ 2019-10-21 12:02                         ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-10-21 12:02 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, anatoly.burakov,
	arybchenko, stephen

On 10/21/2019 9:03 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> Patch adds support for kernel module to work in
> IOVA = VA mode by providing address translation
> routines to convert IOVA aka user space VA to
> kernel virtual addresses.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>

<...>

> +static inline phys_addr_t iova_to_phys(struct task_struct *tsk,
> +				       unsigned long iova)
> +{
> +	unsigned int flags = FOLL_TOUCH;
> +	phys_addr_t offset, phys_addr;
> +	struct page *page = NULL;
> +	int ret;
> +
> +	offset = iova & (PAGE_SIZE - 1);
> +
> +	/* Read one page struct info */
> +	ret =  get_user_pages_remote(tsk, tsk->mm, iova, 1,
> +				     flags, &page, 0, 0);
> +	if (ret < 0)
> +		return 0;
> +
> +	phys_addr = page_to_phys(page) | offset;
> +	put_page(page);
> +
> +	return phys_addr;
> +}
> +
> +static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova)
> +{
> +	return phys_to_virt(iova_to_phys(tsk, iova));
> +}

Do you have any measurements of the performance effect of this change?

> +
>  void kni_net_release_fifo_phy(struct kni_dev *kni);
>  void kni_net_rx(struct kni_dev *kni);
>  void kni_net_init(struct net_device *dev);
> diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
> index 2b75502..7af7ab4 100644
> --- a/kernel/linux/kni/kni_misc.c
> +++ b/kernel/linux/kni/kni_misc.c
> @@ -348,15 +348,36 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
>  	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
>  
>  	/* Translate user space info into kernel space info */
> -	kni->tx_q = phys_to_virt(dev_info.tx_phys);
> -	kni->rx_q = phys_to_virt(dev_info.rx_phys);
> -	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
> -	kni->free_q = phys_to_virt(dev_info.free_phys);
> -
> -	kni->req_q = phys_to_virt(dev_info.req_phys);
> -	kni->resp_q = phys_to_virt(dev_info.resp_phys);
> -	kni->sync_va = dev_info.sync_va;
> -	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
> +	if (dev_info.iova_mode) {
> +#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT

Do you think a runtime check is required, for the case where the code has been
compiled on a box with a newer kernel but is run on a box with an older kernel?
I am not sure about it myself either...
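One hedged sketch of a create-time guard (illustrative only, not necessarily what
this patch does in its #else branch): a module built without the mapping support
could at least reject an IOVA=VA request instead of silently misbehaving.

	if (dev_info.iova_mode) {
#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
		kni->iova_mode = 1;
#else
		pr_err("KNI: IOVA=VA mode not supported by this module build\n");
		return -EINVAL;
#endif
	}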

<...>

> @@ -62,6 +76,24 @@ kva2data_kva(struct rte_kni_mbuf *m)
>  	return phys_to_virt(m->buf_physaddr + m->data_off);
>  }
>  
> +static inline void *
> +get_kva(struct kni_dev *kni, void *pa)
> +{
> +	if (kni->iova_mode == 1)

This check is done multiple times per packet; I wonder if this can be prevented.
Again, even compared with the original mode, do you have any numbers on the
performance effect of this check?
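One hedged way to hoist that branch out of the per-packet path is to resolve the
translation routine once at device-create time; the sketch below is standalone C
with illustrative names, not code from this series.

#include <stddef.h>

struct dev_ctx {
	/* chosen once at create time, called in the fast path */
	void *(*pa2kva)(struct dev_ctx *ctx, void *pa);
};

/* stand-ins for the phys_to_virt()-based and iova-to-kva-based translations */
static void *pa2kva_pa_mode(struct dev_ctx *ctx, void *pa) { (void)ctx; return pa; }
static void *pa2kva_va_mode(struct dev_ctx *ctx, void *pa) { (void)ctx; return pa; }

static void dev_init(struct dev_ctx *ctx, int iova_mode)
{
	ctx->pa2kva = iova_mode ? pa2kva_va_mode : pa2kva_pa_mode;
}

/* fast path: a single indirect call, no per-packet "if (iova_mode)" branch */
static inline void *get_kva(struct dev_ctx *ctx, void *pa)
{
	return ctx->pa2kva(ctx, pa);
}

Whether an indirect call actually beats a well-predicted branch is exactly the
kind of thing the measurements asked for above would show.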

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21 11:55                         ` Ferruh Yigit
@ 2019-10-21 13:13                           ` Vamsi Krishna Attunuru
  2019-10-21 13:32                             ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-21 13:13 UTC (permalink / raw)
  To: Ferruh Yigit, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Monday, October 21, 2019 5:25 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> olivier.matz@6wind.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; stephen@networkplumber.org
> Subject: [EXT] Re: [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option
> 
> External Email
> 
> ----------------------------------------------------------------------
> On 10/21/2019 9:03 AM, vattunuru@marvell.com wrote:
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > This adds a "--legacy-kni" command-line option. It will be used to run
> > existing KNI applications with DPDK 19.11 and later.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> 
> <...>
> 
> > diff --git a/lib/librte_eal/common/eal_options.h
> > b/lib/librte_eal/common/eal_options.h
> > index 9855429..1010ed3 100644
> > --- a/lib/librte_eal/common/eal_options.h
> > +++ b/lib/librte_eal/common/eal_options.h
> > @@ -69,6 +69,8 @@ enum {
> >  	OPT_IOVA_MODE_NUM,
> >  #define OPT_MATCH_ALLOCATIONS  "match-allocations"
> >  	OPT_MATCH_ALLOCATIONS_NUM,
> > +#define OPT_LEGACY_KNI      "legacy-kni"
> > +	OPT_LEGACY_KNI_NUM,
> >  	OPT_LONG_MAX_NUM
> >  };
> 
> Two concerns,
> 
> 1- "legacy-kni" doesn't have enough context
> 
> 2- I prefer to keep existing behavior default, at least for a while, something
> like next LTS etc, meanwhile this patch can be around for a good time and
> can be good to switch.
> 
> Based on above to, what do you think to rename the option to 'kni-iova-va',
> if not set by default it will be "IOVA=PA", when set it will enable "IOVA=VA"
> mode?

Hi Ferruh,

I think the new EAL flag (legacy-kni) is quite intuitive, since the release notes will have the required details about its purpose and how it enables users to run existing applications on the latest DPDK.

The current EAL sets IOVA as VA if the bus IOMMU returns DC, meaning IOVA=VA is effectively the default mode (in most use cases); so for running KNI, we would have to explicitly set the flag to run KNI in IOVA=VA mode all the time. I think having a flag for the legacy usage (PA mode) is more appropriate than having a kni-iova-va kind of flag.

The otx2 net PMD that runs on OcteonTx2 platforms only supports IOVA=VA mode; we would like to have KNI running by default without any flags passed.
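For readers following the thread, a hedged paraphrase of the EAL selection
described above; pick_iova_mode() is illustrative only, not the eal.c code.

#include <stdbool.h>
#include <rte_eal.h> /* enum rte_iova_mode, RTE_IOVA_DC/PA/VA */

static enum rte_iova_mode
pick_iova_mode(enum rte_iova_mode bus_req, bool kni_module_loaded)
{
	enum rte_iova_mode mode = bus_req;

	if (mode == RTE_IOVA_DC)	/* buses don't care */
		mode = RTE_IOVA_VA;	/* preferred default */
	if (mode == RTE_IOVA_VA && kni_module_loaded)
		mode = RTE_IOVA_PA;	/* KNI currently forces PA */
	return mode;
}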

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21 13:13                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-10-21 13:32                             ` Ferruh Yigit
  2019-10-21 14:38                               ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-10-21 13:32 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen

On 10/21/2019 2:13 PM, Vamsi Krishna Attunuru wrote:
> 
> 
>> -----Original Message-----
>> From: Ferruh Yigit <ferruh.yigit@intel.com>
>> Sent: Monday, October 21, 2019 5:25 PM
>> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
>> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
>> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
>> olivier.matz@6wind.com; anatoly.burakov@intel.com;
>> arybchenko@solarflare.com; stephen@networkplumber.org
>> Subject: [EXT] Re: [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> On 10/21/2019 9:03 AM, vattunuru@marvell.com wrote:
>>> From: Vamsi Attunuru <vattunuru@marvell.com>
>>>
>>> This adds a "--legacy-kni" command-line option. It will be used to run
>>> existing KNI applications with DPDK 19.11 and later.
>>>
>>> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
>>> Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
>>
>> <...>
>>
>>> diff --git a/lib/librte_eal/common/eal_options.h
>>> b/lib/librte_eal/common/eal_options.h
>>> index 9855429..1010ed3 100644
>>> --- a/lib/librte_eal/common/eal_options.h
>>> +++ b/lib/librte_eal/common/eal_options.h
>>> @@ -69,6 +69,8 @@ enum {
>>>  	OPT_IOVA_MODE_NUM,
>>>  #define OPT_MATCH_ALLOCATIONS  "match-allocations"
>>>  	OPT_MATCH_ALLOCATIONS_NUM,
>>> +#define OPT_LEGACY_KNI      "legacy-kni"
>>> +	OPT_LEGACY_KNI_NUM,
>>>  	OPT_LONG_MAX_NUM
>>>  };
>>
>> Two concerns,
>>
>> 1- "legacy-kni" doesn't have enough context
>>
>> 2- I prefer to keep existing behavior default, at least for a while, something
>> like next LTS etc, meanwhile this patch can be around for a good time and
>> can be good to switch.
>>
>> Based on above to, what do you think to rename the option to 'kni-iova-va',
>> if not set by default it will be "IOVA=PA", when set it will enable "IOVA=VA"
>> mode?
> 
> Hi Ferruh,
> 
> I think the new eal flag(legacy-kni) is quite intuitive. Since release notes will be having the required details about it's purpose and how it enables users to use existing applications on latest dpdk.

What exactly does 'legacy' mean? What has been changed, has the old one been
completely replaced/re-written? Whoever is not following what is happening won't
understand what is old in the KNI; digging through release notes and commits
will give this information, but it will be hard to get it from the binary, and it
gets harder as a few releases pass.

> 
> Current EAL does set iova as va if bus iommu returns DC, meaning iova=va is the kind of default mode(in most of use cases) and for running kni, we have to explicitly set the flag to run kni in iova=va mode all the time. I think having a flag for legacy usage(PA mode) is more appropriate than having kni-iova-va kind of flag.

It is about keeping the existing behavior the same: right now, if the kni module is
inserted, it will force PA mode. With your update it will be possible to run
iova=va with the kni module inserted when the flag is set. I suggest giving this
new behavior some time before making it the default.

> 
> Otx2 net pmd that runs on Octeontx2 platforms only supports iova=va mode, we would like to have KNI running by default without any flags passed.   
> 

I see, but the other way around will affect all existing KNI users; they will either
need to add this flag or update their application.

This is a new feature; having those who want to use it add a specific flag makes more
sense to me than making all old users add the flag.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21 13:32                             ` Ferruh Yigit
@ 2019-10-21 14:38                               ` Vamsi Krishna Attunuru
  2019-10-22  9:29                                 ` Vamsi Krishna Attunuru
  2019-10-22 12:28                                 ` Andrew Rybchenko
  0 siblings, 2 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-21 14:38 UTC (permalink / raw)
  To: olivier.matz, Andrew Rybchenko
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen, Ferruh Yigit,
	dev

Hi Olivier, Andrew,

> >>> +#define OPT_LEGACY_KNI      "legacy-kni"
> >>> +	OPT_LEGACY_KNI_NUM,
> >>>  	OPT_LONG_MAX_NUM
> >>>  };
> >>
> >> Two concerns,
> >>
> >> 1- "legacy-kni" doesn't have enough context
> >>
> >> 2- I prefer to keep existing behavior default, at least for a while,
> >> something like next LTS etc, meanwhile this patch can be around for a
> >> good time and can be good to switch.
> >>
> >> Based on above to, what do you think to rename the option to
> >> 'kni-iova-va', if not set by default it will be "IOVA=PA", when set it will
> enable "IOVA=VA"
> >> mode?
> >
> > Hi Ferruh,
> >
> > I think the new eal flag(legacy-kni) is quite intuitive. Since release notes will
> be having the required details about it's purpose and how it enables users to
> use existing applications on latest dpdk.
> 
> what exactly 'legacy' means, what has been changed, is the old one
> completely replaced/re-written ????, but whoever not following what is
> > happening won't understand what is old in the KNI, digging through
> release notes and commits will give this information but it will be hard to get
> it from binary and get harder by a few releases passed.
> 
> >
> > Current EAL does set iova as va if bus iommu returns DC, meaning iova=va
> is the kind of default mode(in most of use cases) and for running kni, we
> have to explicitly set the flag to run kni in iova=va mode all the time. I think
> having a flag for legacy usage(PA mode) is more appropriate than having kni-
> iova-va kind of flag.
> 
> It is about keeping the existing behavior same, right now if the kni module is
> inserted it will force the PA mode. With your update it will be possible to run
> iova=va with kni module inserted when flag is set. I suggest giving some time
> to this new behavior before making it default.
> 
> >
> > Otx2 net pmd that runs on Octeontx2 platforms only supports iova=va
> mode, we would like to have KNI running by default without any flags
> passed.
> >
> 
> I see, but other way around will affect all existing KNI users, they will either
> need to add this flag or update their application.
> 
> This is new feature, who want to use it adding a specific flag makes more
> sense to me than all old users have to add the flag.


Ferruh suggested having a flag for enabling this new feature, and he is also not interested in having newer mempool alloc APIs for KNI (see the v10 review comments). Before reworking the flag changes, I would like to check with you whether the same flag can be used in the mempool lib for checking and fulfilling the mempool page boundary requirement (mempool patch v11 1/4); by doing so, we can avoid newer exported APIs in both the mempool and KNI libs. Anyway, this mempool requirement can also be addressed with Olivier's patches below.

http://patchwork.dpdk.org/project/dpdk/list/?series=5624

When those patches are merged, the flag check can be removed.

Regards
A Vamsi

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21 14:38                               ` Vamsi Krishna Attunuru
@ 2019-10-22  9:29                                 ` Vamsi Krishna Attunuru
  2019-10-22 12:28                                 ` Andrew Rybchenko
  1 sibling, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-22  9:29 UTC (permalink / raw)
  To: olivier.matz, Andrew Rybchenko
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen, Ferruh Yigit,
	dev

Hi Olivier, Andrew,

Please share your thoughts/comments on below email.

> -----Original Message-----
> From: Vamsi Krishna Attunuru
> Sent: Monday, October 21, 2019 8:08 PM
> To: olivier.matz@6wind.com; Andrew Rybchenko
> <arybchenko@solarflare.com>
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> olivier.matz@6wind.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; stephen@networkplumber.org; Ferruh Yigit
> <ferruh.yigit@intel.com>; dev@dpdk.org
> Subject: RE: [EXT] Re: [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option
> 
> Hi Olivier, Andrew,
> 
> > >>> +#define OPT_LEGACY_KNI      "legacy-kni"
> > >>> +	OPT_LEGACY_KNI_NUM,
> > >>>  	OPT_LONG_MAX_NUM
> > >>>  };
> > >>
> > >> Two concerns,
> > >>
> > >> 1- "legacy-kni" doesn't have enough context
> > >>
> > >> 2- I prefer to keep existing behavior default, at least for a
> > >> while, something like next LTS etc, meanwhile this patch can be
> > >> around for a good time and can be good to switch.
> > >>
> > >> Based on above to, what do you think to rename the option to
> > >> 'kni-iova-va', if not set by default it will be "IOVA=PA", when set
> > >> it will
> > enable "IOVA=VA"
> > >> mode?
> > >
> > > Hi Ferruh,
> > >
> > > I think the new eal flag(legacy-kni) is quite intuitive. Since
> > > release notes will
> > be having the required details about it's purpose and how it enables
> > users to use existing applications on latest dpdk.
> >
> > what exactly 'legacy' means, what has been changed, is the old one
> > completely replaced/re-written ????, but whoever not following what is
> > happening won't understand what is old in the KNI, digging through
> > release notes and commits will give this information but it will be
> > hard to get it from binary and get harder by a few releases passed.
> >
> > >
> > > Current EAL does set iova as va if bus iommu returns DC, meaning
> > > iova=va
> > is the kind of default mode(in most of use cases) and for running kni,
> > we have to explicitly set the flag to run kni in iova=va mode all the
> > time. I think having a flag for legacy usage(PA mode) is more
> > appropriate than having kni- iova-va kind of flag.
> >
> > It is about keeping the existing behavior same, right now if the kni
> > module is inserted it will force the PA mode. With your update it will
> > be possible to run iova=va with kni module inserted when flag is set.
> > I suggest giving some time to this new behavior before making it default.
> >
> > >
> > > Otx2 net pmd that runs on Octeontx2 platforms only supports iova=va
> > mode, we would like to have KNI running by default without any flags
> > passed.
> > >
> >
> > I see, but other way around will affect all existing KNI users, they
> > will either need to add this flag or update their application.
> >
> > This is new feature, who want to use it adding a specific flag makes
> > more sense to me than all old users have to add the flag.
> 
> 
> Ferruh suggested to have a flag for enabling these new feature and also not
> interested in having  newer mempool alloc APIs for KNI(see V10 review
> comments). Before reworking on the flag changes, I would like check with you
> whether the same flag can be used in mempool lib for checking and fulfilling the
> mempool  page boundary requirement (mempool patch v11 1/4), by doing so, it
> can avoid newer exported APIs both in mempool and KNI lib. Anyways, these
> mempool requirement can be addressed with Olivier's below patches.
> 
> http://patchwork.dpdk.org/project/dpdk/list/?series=5624
> 
> When those patches are merged,  flag check can be removed.
> 
> Regards
> A Vamsi

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-21 14:38                               ` Vamsi Krishna Attunuru
  2019-10-22  9:29                                 ` Vamsi Krishna Attunuru
@ 2019-10-22 12:28                                 ` Andrew Rybchenko
  2019-10-22 13:31                                   ` Vamsi Krishna Attunuru
  1 sibling, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-22 12:28 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, olivier.matz
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, Ferruh Yigit, dev

Hi Vamsi,

On 10/21/19 5:38 PM, Vamsi Krishna Attunuru wrote:
> Hi Olivier, Andrew,
>
>>>>> +#define OPT_LEGACY_KNI      "legacy-kni"
>>>>> +	OPT_LEGACY_KNI_NUM,
>>>>>   	OPT_LONG_MAX_NUM
>>>>>   };
>>>> Two concerns,
>>>>
>>>> 1- "legacy-kni" doesn't have enough context
>>>>
>>>> 2- I prefer to keep existing behavior default, at least for a while,
>>>> something like next LTS etc, meanwhile this patch can be around for a
>>>> good time and can be good to switch.
>>>>
>>>> Based on above to, what do you think to rename the option to
>>>> 'kni-iova-va', if not set by default it will be "IOVA=PA", when set it will
>> enable "IOVA=VA"
>>>> mode?
>>> Hi Ferruh,
>>>
>>> I think the new eal flag(legacy-kni) is quite intuitive. Since release notes will
>> be having the required details about it's purpose and how it enables users to
>> use existing applications on latest dpdk.
>>
>> what exactly 'legacy' means, what has been changed, is the old one
>> completely replaced/re-written ????, but whoever not following what is
>> happening won't understand what is old in the KNI, digging through
>> release notes and commits will give this information but it will be hard to get
>> it from binary and get harder by a few releases passed.
>>
>>> Current EAL does set iova as va if bus iommu returns DC, meaning iova=va
>> is the kind of default mode(in most of use cases) and for running kni, we
>> have to explicitly set the flag to run kni in iova=va mode all the time. I think
>> having a flag for legacy usage(PA mode) is more appropriate than having kni-
>> iova-va kind of flag.
>>
>> It is about keeping the existing behavior same, right now if the kni module is
>> inserted it will force the PA mode. With your update it will be possible to run
>> iova=va with kni module inserted when flag is set. I suggest giving some time
>> to this new behavior before making it default.
>>
>>> Otx2 net pmd that runs on Octeontx2 platforms only supports iova=va
>> mode, we would like to have KNI running by default without any flags
>> passed.
>> I see, but other way around will affect all existing KNI users, they will either
>> need to add this flag or update their application.
>>
>> This is new feature, who want to use it adding a specific flag makes more
>> sense to me than all old users have to add the flag.
>
> Ferruh suggested to have a flag for enabling these new feature and also not interested in having  newer mempool alloc APIs for KNI(see V10 review comments). Before reworking on the flag changes, I would like check with you whether the same flag can be used in mempool lib for checking and fulfilling the mempool  page boundary requirement (mempool patch v11 1/4), by doing so, it can avoid newer exported APIs both in mempool and KNI lib. Anyways, these mempool requirement can be addressed with Olivier's below patches.
>
> http://patchwork.dpdk.org/project/dpdk/list/?series=5624
>
> When those patches are merged,  flag check can be removed.

It is really hard to follow, sorry.
Is it really a problem to have a KNI switch to enable the new
behaviour and to document that a dedicated function
should be used to allocate mbufs?

Andrew.

> Regards
> A Vamsi


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-22 12:28                                 ` Andrew Rybchenko
@ 2019-10-22 13:31                                   ` Vamsi Krishna Attunuru
  2019-10-23 10:12                                     ` Jerin Jacob
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-22 13:31 UTC (permalink / raw)
  To: Andrew Rybchenko, olivier.matz, Ferruh Yigit
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, Ferruh Yigit, dev

Hi Ferruh,

Can you please explain the problems with using KNI-dedicated mbuf alloc routines while enabling KNI IOVA=VA mode? Please see the discussion below with Andrew; he wanted to know the problems with having newer APIs.

I wanted to clarify for ourselves the need for the mempool patch (v11 1/4) and where the fix should go (newer APIs or a fix in the mempool lib) before I start reworking the patches.

Regards
A Vamsi

> snipped
> >>
> >> This is new feature, who want to use it adding a specific flag makes
> >> more sense to me than all old users have to add the flag.
> >
> > Ferruh suggested to have a flag for enabling these new feature and also not
> interested in having  newer mempool alloc APIs for KNI(see V10 review
> comments). Before reworking on the flag changes, I would like check with you
> whether the same flag can be used in mempool lib for checking and fulfilling the
> mempool  page boundary requirement (mempool patch v11 1/4), by doing so, it
> can avoid newer exported APIs both in mempool and KNI lib. Anyways, these
> mempool requirement can be addressed with Olivier's below patches.
> >
> > http://patchwork.dpdk.org/project/dpdk/list/?series=5624
> >
> > When those patches are merged,  flag check can be removed.
> 
> It is really hard to follow, sorry.
> Is it really a problem to have KNI switcher to enable new behaviour and
> document it that dedicated function should be used to allocate mbufs?
> 
> Andrew.
> 
> > Regards
> > A Vamsi

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-22 13:31                                   ` Vamsi Krishna Attunuru
@ 2019-10-23 10:12                                     ` Jerin Jacob
  2019-10-23 14:47                                       ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-10-23 10:12 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: Andrew Rybchenko, olivier.matz, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
<vattunuru@marvell.com> wrote:
>
> Hi Ferruh,
>
> Can you please explain the problems in using kni dedicated mbuf alloc routines while enabling kni iova=va mode. Please see the below discussion with Andrew. He wanted to know the problems in having newer APIs.


While waiting for Ferruh's reply, I would like to summarise the
current status.

# In order to make KNI work with IOVA as VA, we need to make sure a
mempool _object_ does not span across two huge pages.

# This problem can be fixed by either of:

a) Introduce a flag in mempool to define this constraint, so that
this constraint is enforced only when needed; this is in line with
the existing semantics of addressing such problems in mempool.

b) Instead of creating a flag, make this the default behavior in
mempool for the IOVA-as-VA case.

Upside:
b1) There is no need for a specific mempool_create for KNI.

Downside:
b2) Not aligned with existing mempool API semantics.
b3) There will be a trivial amount of memory waste, as we cannot
allocate from the page edges. Considering that the normal huge
page size is 1G or 512MB, this is not a real issue.

c) Make IOVA as PA when the KNI kernel module is loaded

Upside:
c1) Option (a) would call for a new KNI-specific mempool create
API, i.e. existing KNI applications would need a one-line change
to work with release 19.11 or later; option (c) avoids that.

Downside:
c2) Drivers which need RTE_PCI_DRV_NEED_IOVA_AS_VA cannot work with KNI.
c3) Root privilege is needed to run KNI, as IOVA as PA needs root privilege.

For the next year, we expect applications to work with 19.11 without any
code change. My personal opinion is to go with option (a) and update the
release notes to document the change, as it is only a simple one-line change.

The selection of (a) vs (b) is between the KNI and mempool maintainers.
Could we please reach a consensus? Or can we discuss this at the TB meeting?

We have been going back and forth on this feature for the last 3
releases. Now that we have solved all the technical problems, please help
us decide (a) vs (b) to make forward progress.
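To make the page-boundary constraint in the summary concrete, the check mempool
would have to enforce is essentially the one below (a minimal sketch, not mempool
code; assumes elt_size <= pg_sz and pg_sz a power of two).

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static inline bool
obj_within_one_page(uintptr_t addr, size_t elt_size, size_t pg_sz)
{
	uintptr_t mask = ~((uintptr_t)pg_sz - 1);

	/* same page start for the first and last byte of the object */
	return (addr & mask) == ((addr + elt_size - 1) & mask);
}

Option (a) would make the populate code skip to the next page start when this
check fails, only when the new flag is set; option (b) would do the same
unconditionally in the IOVA-as-VA case.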







>
> I wanted to clarify ourselves about the need of mem pool patch(V11 1/4) and where to fit the fix(newer APIs or fix in mempool lib) before I start reworking the patches.
>
> Regards
> A Vamsi
>
> > snipped
> > >>
> > >> This is new feature, who want to use it adding a specific flag makes
> > >> more sense to me than all old users have to add the flag.
> > >
> > > Ferruh suggested to have a flag for enabling these new feature and also not
> > interested in having  newer mempool alloc APIs for KNI(see V10 review
> > comments). Before reworking on the flag changes, I would like check with you
> > whether the same flag can be used in mempool lib for checking and fulfilling the
> > mempool  page boundary requirement (mempool patch v11 1/4), by doing so, it
> > can avoid newer exported APIs both in mempool and KNI lib. Anyways, these
> > mempool requirement can be addressed with Olivier's below patches.
> > >
> > > http://patchwork.dpdk.org/project/dpdk/list/?series=5624
> > >
> > > When those patches are merged,  flag check can be removed.
> >
> > It is really hard to follow, sorry.
> > Is it really a problem to have KNI switcher to enable new behaviour and
> > document it that dedicated function should be used to allocate mbufs?
> >
> > Andrew.
> >
> > > Regards
> > > A Vamsi

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-23 10:12                                     ` Jerin Jacob
@ 2019-10-23 14:47                                       ` Olivier Matz
  2019-10-23 15:02                                         ` Jerin Jacob
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-23 14:47 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Vamsi Krishna Attunuru, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

Hi,

On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> <vattunuru@marvell.com> wrote:
> >
> > Hi Ferruh,
> >
> > Can you please explain the problems in using kni dedicated mbuf alloc routines while enabling kni iova=va mode. Please see the below discussion with Andrew. He wanted to know the problems in having newer APIs.
> 
> 
> While waiting  for the Ferruh reply, I would like to summarise the
> current status
> 
> # In order to make KNI work with IOVA as VA, We need to make sure
> mempool pool _object_ should not span across two huge pages
> 
> # This problem can be fixed by, either of:
> 
> a) Introduce a flag in mempool to define this constraint, so that,
> when only needed, this constraint enforced and this is in line
> with existing semantics of addressing such problems in mempool
> 
> b) Instead of creating a flag, Make this behavior by default in
> mempool for IOVA as VA case
> 
> Upside:
> b1) There is no need for specific mempool_create for KNI.
> 
> Downside:
> b2) Not align with existing mempool API semantics
> b3) There will be a trivial amount of memory waste as we can not
> allocate from the edge. Considering the normal huge
> page memory size is 1G or 512MB this not a real issue.
> 
> c) Make IOVA as PA when KNI kernel module is loaded
> 
> Upside:
> c1) Doing option (a) would call for new KNI specific mempool create
> API i.e existing KNI applications need a one-line change in
> application to make it work with release 19.11 or later.
> 
> Downslide:
> c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not work with KNI
> c3) Need root privilege to run KNI as IOVA as PA need root privilege
> 
> For the next year, we expect applications to work 19.11 without any
> code change. My personal opinion to make go with option (a)
> and update the release notes to document the change any it simple
> one-line change.
> 
> The selection of (a) vs (b) is between KNI and Mempool maintainers.
> Could we please reach a consensus? Or can we discuss this TB meeting?
> 
> We are going back and forth on this feature on for the last 3
> releases. Now that, we solved all the technical problems, please help
> us
> to decide (a) vs (b) to make forward progress.

Thank you for the summary.
What is not clear to me is if (a) or (b) may break an existing
application, and if yes, in which case.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-23 14:47                                       ` Olivier Matz
@ 2019-10-23 15:02                                         ` Jerin Jacob
  2019-10-24 17:35                                           ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-10-23 15:02 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Vamsi Krishna Attunuru, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz <olivier.matz@6wind.com> wrote:
>
> Hi,
>
> On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > <vattunuru@marvell.com> wrote:
> > >
> > > Hi Ferruh,
> > >
> > > Can you please explain the problems in using kni dedicated mbuf alloc routines while enabling kni iova=va mode. Please see the below discussion with Andrew. He wanted to know the problems in having newer APIs.
> >
> >
> > While waiting  for the Ferruh reply, I would like to summarise the
> > current status
> >
> > # In order to make KNI work with IOVA as VA, We need to make sure
> > mempool pool _object_ should not span across two huge pages
> >
> > # This problem can be fixed by, either of:
> >
> > a) Introduce a flag in mempool to define this constraint, so that,
> > when only needed, this constraint enforced and this is in line
> > with existing semantics of addressing such problems in mempool
> >
> > b) Instead of creating a flag, Make this behavior by default in
> > mempool for IOVA as VA case
> >
> > Upside:
> > b1) There is no need for specific mempool_create for KNI.
> >
> > Downside:
> > b2) Not align with existing mempool API semantics
> > b3) There will be a trivial amount of memory waste as we can not
> > allocate from the edge. Considering the normal huge
> > page memory size is 1G or 512MB this not a real issue.
> >
> > c) Make IOVA as PA when KNI kernel module is loaded
> >
> > Upside:
> > c1) Doing option (a) would call for new KNI specific mempool create
> > API i.e existing KNI applications need a one-line change in
> > application to make it work with release 19.11 or later.
> >
> > Downslide:
> > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not work with KNI
> > c3) Need root privilege to run KNI as IOVA as PA need root privilege
> >
> > For the next year, we expect applications to work 19.11 without any
> > code change. My personal opinion to make go with option (a)
> > and update the release notes to document the change any it simple
> > one-line change.
> >
> > The selection of (a) vs (b) is between KNI and Mempool maintainers.
> > Could we please reach a consensus? Or can we discuss this TB meeting?
> >
> > We are going back and forth on this feature on for the last 3
> > releases. Now that, we solved all the technical problems, please help
> > us
> > to decide (a) vs (b) to make forward progress.
>
> Thank you for the summary.
> What is not clear to me is if (a) or (b) may break an existing
> application, and if yes, in which case.

Thanks for the reply.

To be clear, we are talking about out-of-tree KNI applications,
which do not want to change rte_pktmbuf_pool_create() to
rte_kni_pktmbuf_pool_create() and rebuild for v19.11.

So in case (b) there is no issue, as they will keep using rte_pktmbuf_pool_create().
But in case (a) it will create an issue if an out-of-tree KNI
application is using rte_pktmbuf_pool_create(), which does not use the
new flag.
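For reference, the "one-line change" being discussed is essentially the following
in the application; rte_kni_pktmbuf_pool_create() is the KNI-specific allocator
proposed in earlier versions of this series and is assumed here to mirror
rte_pktmbuf_pool_create()'s signature, with NB_MBUF etc. being illustrative
application constants.

-	pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
-		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());
+	pool = rte_kni_pktmbuf_pool_create("mbuf_pool", NB_MBUF,
+		MEMPOOL_CACHE_SZ, 0, MBUF_DATA_SZ, rte_socket_id());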

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-23 15:02                                         ` Jerin Jacob
@ 2019-10-24 17:35                                           ` Olivier Matz
  2019-10-24 19:30                                             ` Jerin Jacob
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-24 17:35 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Vamsi Krishna Attunuru, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

Hi,

On Wed, Oct 23, 2019 at 08:32:08PM +0530, Jerin Jacob wrote:
> On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz <olivier.matz@6wind.com> wrote:
> >
> > Hi,
> >
> > On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > > <vattunuru@marvell.com> wrote:
> > > >
> > > > Hi Ferruh,
> > > >
> > > > Can you please explain the problems in using kni dedicated mbuf alloc routines while enabling kni iova=va mode. Please see the below discussion with Andrew. He wanted to know the problems in having newer APIs.
> > >
> > >
> > > While waiting  for the Ferruh reply, I would like to summarise the
> > > current status
> > >
> > > # In order to make KNI work with IOVA as VA, We need to make sure
> > > mempool pool _object_ should not span across two huge pages
> > >
> > > # This problem can be fixed by, either of:
> > >
> > > a) Introduce a flag in mempool to define this constraint, so that,
> > > when only needed, this constraint enforced and this is in line
> > > with existing semantics of addressing such problems in mempool
> > >
> > > b) Instead of creating a flag, Make this behavior by default in
> > > mempool for IOVA as VA case
> > >
> > > Upside:
> > > b1) There is no need for specific mempool_create for KNI.
> > >
> > > Downside:
> > > b2) Not align with existing mempool API semantics
> > > b3) There will be a trivial amount of memory waste as we can not
> > > allocate from the edge. Considering the normal huge
> > > page memory size is 1G or 512MB this not a real issue.
> > >
> > > c) Make IOVA as PA when KNI kernel module is loaded
> > >
> > > Upside:
> > > c1) Doing option (a) would call for new KNI specific mempool create
> > > API i.e existing KNI applications need a one-line change in
> > > application to make it work with release 19.11 or later.
> > >
> > > Downslide:
> > > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not work with KNI
> > > c3) Need root privilege to run KNI as IOVA as PA need root privilege
> > >
> > > For the next year, we expect applications to work 19.11 without any
> > > code change. My personal opinion to make go with option (a)
> > > and update the release notes to document the change any it simple
> > > one-line change.
> > >
> > > The selection of (a) vs (b) is between KNI and Mempool maintainers.
> > > Could we please reach a consensus? Or can we discuss this TB meeting?
> > >
> > > We are going back and forth on this feature on for the last 3
> > > releases. Now that, we solved all the technical problems, please help
> > > us
> > > to decide (a) vs (b) to make forward progress.
> >
> > Thank you for the summary.
> > What is not clear to me is if (a) or (b) may break an existing
> > application, and if yes, in which case.
> 
> Thanks for the reply.
> 
> To be clear we are talking about out of tree KNI tree application.
> Which they don't want to
> change rte_pktmbuf_pool_create() to rte_kni_pktmbuf_pool_create() and
> build for v19.11
> 
> So in case (b) there is no issue as It will be using rte_pktmbuf_pool_create ().
> But in case of (a) it will create an issue if out of tree KNI
> application is using rte_pktmbuf_pool_create() which is not using the
> NEW flag.

Following yesterday's discussion at the techboard, I looked at the mempool
code and at my previous RFC patch. It took some time to remind me what
my worries were.

Currently, in rte_mempool_populate_default(), when the mempool is
populated, we first try to allocate one iova-contiguous block of (n *
elt_size). On success, we use this memory to fully populate the mempool
without taking care of crossing page boundaries.

If we change the behavior to prevent objects from crossing pages, the
assumption that allocating (n * elt_size) is always enough becomes
wrong.  By luck, there is no real impact, because if the mempool is not
fully populated after this first iteration, it will allocate a new
chunk.

To be rigorous, we need to better calculate the amount of memory to
allocate, according to page size.
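A minimal sketch of that sizing concern (illustrative helper, not mempool code;
assumes elt_size <= pg_sz):

#include <stddef.h>

/* memory needed when objects may not cross page boundaries; this is
 * >= n * elt_size because the tail of each page may stay unused */
static size_t
mem_needed_no_page_crossing(size_t n, size_t elt_size, size_t pg_sz)
{
	size_t objs_per_page = pg_sz / elt_size;
	size_t pages = (n + objs_per_page - 1) / objs_per_page;

	return pages * pg_sz;
}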

Looking at the code, I found another problem in the same area: let's say
we populate a mempool that requires 1.1GB (and we use 1G huge pages):

1/ the mempool code first tries to allocate an iova-contiguous zone
   of 1.1G -> fail
2/ it then tries to allocate a page-aligned, non iova-contiguous
   zone of 1.1G, which with 1G pages means 2G. On success, a lot of
   memory is wasted.
3/ on error, we try to allocate the biggest available zone; it can still
   return a zone between 1.1G and 2G, which can also waste memory.

I will rework my mempool patchset to properly address these issues,
hopefully tomorrow.


Also, I thought about another idea to solve your issue. I am not sure it is
better, but it would not imply changing the mempool behavior. If I
understood the problem, when an mbuf is across 2 pages, the copy of the
data can fail in kni because the mbuf is not virtually contiguous in the
kernel. So why not, in this case, split the memcpy() into several copies,
each of them staying within a single page (and calling phys_to_virt() for
each page)? The same would have to be done when accessing the fields of the
mbuf structure if it crosses a page boundary. Would that work? This
could be a plan B.

Olivier
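A minimal kernel-side sketch of this plan B; copy_from_split_pages() is
illustrative and not from any posted patch, and it assumes the per-page
translation is done with phys_to_virt().

/* copy 'len' bytes starting at physical address 'pa' into 'dst',
 * translating each page separately so that a buffer crossing a page
 * boundary is still copied correctly */
static void copy_from_split_pages(void *dst, phys_addr_t pa, size_t len)
{
	while (len > 0) {
		size_t in_page = PAGE_SIZE - (pa & (PAGE_SIZE - 1));
		size_t chunk = len < in_page ? len : in_page;

		memcpy(dst, phys_to_virt(pa), chunk);
		dst = (char *)dst + chunk;
		pa += chunk;
		len -= chunk;
	}
}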

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-24 17:35                                           ` Olivier Matz
@ 2019-10-24 19:30                                             ` Jerin Jacob
  2019-10-25  9:20                                               ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-10-24 19:30 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Vamsi Krishna Attunuru, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

On Thu, Oct 24, 2019 at 11:05 PM Olivier Matz <olivier.matz@6wind.com> wrote:
>
> Hi,
>
> On Wed, Oct 23, 2019 at 08:32:08PM +0530, Jerin Jacob wrote:
> > On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz <olivier.matz@6wind.com> wrote:
> > >
> > > Hi,
> > >
> > > On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > > > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > > > <vattunuru@marvell.com> wrote:
> > > > >
> > > > > Hi Ferruh,
> > > > >
> > > > > Can you please explain the problems in using kni dedicated mbuf alloc routines while enabling kni iova=va mode. Please see the below discussion with Andrew. He wanted to know the problems in having newer APIs.
> > > >
> > > >
> > > > While waiting  for the Ferruh reply, I would like to summarise the
> > > > current status
> > > >
> > > > # In order to make KNI work with IOVA as VA, We need to make sure
> > > > mempool pool _object_ should not span across two huge pages
> > > >
> > > > # This problem can be fixed by, either of:
> > > >
> > > > a) Introduce a flag in mempool to define this constraint, so that,
> > > > when only needed, this constraint enforced and this is in line
> > > > with existing semantics of addressing such problems in mempool
> > > >
> > > > b) Instead of creating a flag, Make this behavior by default in
> > > > mempool for IOVA as VA case
> > > >
> > > > Upside:
> > > > b1) There is no need for specific mempool_create for KNI.
> > > >
> > > > Downside:
> > > > b2) Not align with existing mempool API semantics
> > > > b3) There will be a trivial amount of memory waste as we can not
> > > > allocate from the edge. Considering the normal huge
> > > > page memory size is 1G or 512MB this not a real issue.
> > > >
> > > > c) Make IOVA as PA when KNI kernel module is loaded
> > > >
> > > > Upside:
> > > > c1) Doing option (a) would call for new KNI specific mempool create
> > > > API i.e existing KNI applications need a one-line change in
> > > > application to make it work with release 19.11 or later.
> > > >
> > > > Downslide:
> > > > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not work with KNI
> > > > c3) Need root privilege to run KNI as IOVA as PA need root privilege
> > > >
> > > > For the next year, we expect applications to work 19.11 without any
> > > > code change. My personal opinion to make go with option (a)
> > > > and update the release notes to document the change any it simple
> > > > one-line change.
> > > >
> > > > The selection of (a) vs (b) is between KNI and Mempool maintainers.
> > > > Could we please reach a consensus? Or can we discuss this TB meeting?
> > > >
> > > > We are going back and forth on this feature on for the last 3
> > > > releases. Now that, we solved all the technical problems, please help
> > > > us
> > > > to decide (a) vs (b) to make forward progress.
> > >
> > > Thank you for the summary.
> > > What is not clear to me is if (a) or (b) may break an existing
> > > application, and if yes, in which case.
> >
> > Thanks for the reply.
> >
> > To be clear we are talking about out of tree KNI tree application.
> > Which they don't want to
> > change rte_pktmbuf_pool_create() to rte_kni_pktmbuf_pool_create() and
> > build for v19.11
> >
> > So in case (b) there is no issue as It will be using rte_pktmbuf_pool_create ().
> > But in case of (a) it will create an issue if out of tree KNI
> > application is using rte_pktmbuf_pool_create() which is not using the
> > NEW flag.
>
> Following yesterday's discussion at techboard, I looked at the mempool
> code and at my previous RFC patch. It took some time to remind me what
> was my worries.

Thanks for the review Olivier.

Just to make sure the correct one is reviewed.

1) v7 had similar issue mentioned
http://patches.dpdk.org/patch/56585/

2) v11 addressed the review comments and you have given the Acked-by
for mempool change
http://patches.dpdk.org/patch/61559/

My thought process in the TB meeting was: since
rte_mempool_populate_from_pg_sz_chunks() is already reviewed, replace
rte_pktmbuf_pool_create's rte_mempool_populate_default() with
rte_mempool_populate_from_pg_sz_chunks() in the IOVA == VA case, so that
a new KNI-specific mempool_create API is not needed.
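
For illustration only, a rough sketch (not the actual patch) of the kind of
selection this would mean inside rte_pktmbuf_pool_create();
rte_mempool_populate_from_pg_sz_chunks() is the helper proposed in v11 and is
assumed here to take the mempool pointer like the default populate function:

#include <rte_eal.h>
#include <rte_mempool.h>

/* Sketch: choose the populate routine based on the IOVA mode; the
 * page-size-chunk variant keeps objects within page boundaries so the
 * KNI kernel module can map them.
 */
static int
pktmbuf_pool_populate(struct rte_mempool *mp)
{
	if (rte_eal_iova_mode() == RTE_IOVA_VA)
		return rte_mempool_populate_from_pg_sz_chunks(mp);
	return rte_mempool_populate_default(mp);
}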

>
> Currently, in rte_mempool_populate_default(), when the mempool is
> populated, we first try to allocate one iova-contiguous block of (n *
> elt_size). On success, we use this memory to fully populate the mempool
> without taking care of crossing page boundaries.
>
> If we change the behavior to prevent objects from crossing pages, the
> assumption that allocating (n * elt_size) is always enough becomes
> wrong.  By luck, there is no real impact, because if the mempool is not
> fully populated after this first iteration, it will allocate a new
> chunk.
>
> To be rigorous, we need to better calculate the amount of memory to
> allocate, according to page size.
>
> Looking at the code, I found another problem in the same area: let's say
> we populate a mempool that requires 1.1GB (and we use 1G huge pages):
>
> 1/ mempool code will first tries to allocate an iova-contiguous zone
>    of 1.1G -> fail
> 2/ it then tries to allocate a page-aligned non iova-contiguous
>    zone of 1.1G, which is 2G. On success, a lot of memory is wasted.
> 3/ on error, we try to allocate the biggest zone, it can still return
>    a zone between 1.1G and 2G, which can also waste memory.
>
> I will rework my mempool patchset to properly address these issues,
> hopefully tomorrow.

OK.


>
> Also, I thought about another idea to solve your issue, not sure it is
> better but it would not imply to change the mempool behavior. If I
> understood the problem, when a mbuf is accross 2 pages, the copy of the
> data can fail in kni because the mbuf is not virtually contiguous in the

For the KNI use case, we need the mbuf to be _physically_ contiguous, so
that the mapping obtained through get_user_pages_remote() is physically
contiguous as well, and the KNI kernel thread, the KNI kernel context and
the DPDK userspace can all use the same memory in their different contexts.
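
A minimal illustration of the check this requirement implies on the kernel
side, assuming the pages were already pinned (e.g. via
get_user_pages_remote()) into a pgs/npgs pair; this is a sketch, not code
from the series:

#include <linux/mm.h>
#include <linux/io.h>

/* Sketch: a single linear kernel mapping is only valid for the whole
 * pinned user region if the pages are physically contiguous.
 */
static bool kni_pages_phys_contig(struct page **pgs, u32 npgs)
{
	u32 i;

	for (i = 1; i < npgs; i++) {
		if (page_to_phys(pgs[i]) !=
		    page_to_phys(pgs[i - 1]) + PAGE_SIZE)
			return false;
	}
	return true;
}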



> kernel. So why not in this case splitting the memcpy() into several,
> each of them being on a single page (and calling phys2virt() for each
> page)? The same would have to be done when accessing the fields of the
> mbuf structure if it crosses a page boundary. Would that work? This

If the above is the requirement, does this logic need to be in the slow
path or in the fast path?

> could be a B plan.

OK.

>
> Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-24 19:30                                             ` Jerin Jacob
@ 2019-10-25  9:20                                               ` Vamsi Krishna Attunuru
  2019-10-26 12:25                                                 ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-25  9:20 UTC (permalink / raw)
  To: Jerin Jacob, Olivier Matz
  Cc: Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev



> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Friday, October 25, 2019 1:01 AM
> To: Olivier Matz <olivier.matz@6wind.com>
> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
> Kumar Kokkilagadda <kirankumark@marvell.com>; anatoly.burakov@intel.com;
> stephen@networkplumber.org; dev@dpdk.org
> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
> 
> On Thu, Oct 24, 2019 at 11:05 PM Olivier Matz <olivier.matz@6wind.com>
> wrote:
> >
> > Hi,
> >
> > On Wed, Oct 23, 2019 at 08:32:08PM +0530, Jerin Jacob wrote:
> > > On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz <olivier.matz@6wind.com>
> wrote:
> > > >
> > > > Hi,
> > > >
> > > > On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > > > > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > > > > <vattunuru@marvell.com> wrote:
> > > > > >
> > > > > > Hi Ferruh,
> > > > > >
> > > > > > Can you please explain the problems in using kni dedicated mbuf alloc
> routines while enabling kni iova=va mode. Please see the below discussion with
> Andrew. He wanted to know the problems in having newer APIs.
> > > > >
> > > > >
> > > > > While waiting  for the Ferruh reply, I would like to summarise
> > > > > the current status
> > > > >
> > > > > # In order to make KNI work with IOVA as VA, We need to make
> > > > > sure mempool pool _object_ should not span across two huge pages
> > > > >
> > > > > # This problem can be fixed by, either of:
> > > > >
> > > > > a) Introduce a flag in mempool to define this constraint, so
> > > > > that, when only needed, this constraint enforced and this is in
> > > > > line with existing semantics of addressing such problems in
> > > > > mempool
> > > > >
> > > > > b) Instead of creating a flag, Make this behavior by default in
> > > > > mempool for IOVA as VA case
> > > > >
> > > > > Upside:
> > > > > b1) There is no need for specific mempool_create for KNI.
> > > > >
> > > > > Downside:
> > > > > b2) Not align with existing mempool API semantics
> > > > > b3) There will be a trivial amount of memory waste as we can not
> > > > > allocate from the edge. Considering the normal huge page memory
> > > > > size is 1G or 512MB this not a real issue.
> > > > >
> > > > > c) Make IOVA as PA when KNI kernel module is loaded
> > > > >
> > > > > Upside:
> > > > > c1) Doing option (a) would call for new KNI specific mempool
> > > > > create API i.e existing KNI applications need a one-line change
> > > > > in application to make it work with release 19.11 or later.
> > > > >
> > > > > Downslide:
> > > > > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not work
> > > > > with KNI
> > > > > c3) Need root privilege to run KNI as IOVA as PA need root
> > > > > privilege
> > > > >
> > > > > For the next year, we expect applications to work 19.11 without
> > > > > any code change. My personal opinion to make go with option (a)
> > > > > and update the release notes to document the change any it
> > > > > simple one-line change.
> > > > >
> > > > > The selection of (a) vs (b) is between KNI and Mempool maintainers.
> > > > > Could we please reach a consensus? Or can we discuss this TB meeting?
> > > > >
> > > > > We are going back and forth on this feature on for the last 3
> > > > > releases. Now that, we solved all the technical problems, please
> > > > > help us to decide (a) vs (b) to make forward progress.
> > > >
> > > > Thank you for the summary.
> > > > What is not clear to me is if (a) or (b) may break an existing
> > > > application, and if yes, in which case.
> > >
> > > Thanks for the reply.
> > >
> > > To be clear we are talking about out of tree KNI tree application.
> > > Which they don't want to
> > > change rte_pktmbuf_pool_create() to rte_kni_pktmbuf_pool_create()
> > > and build for v19.11
> > >
> > > So in case (b) there is no issue as It will be using rte_pktmbuf_pool_create ().
> > > But in case of (a) it will create an issue if out of tree KNI
> > > application is using rte_pktmbuf_pool_create() which is not using
> > > the NEW flag.
> >
> > Following yesterday's discussion at techboard, I looked at the mempool
> > code and at my previous RFC patch. It took some time to remind me what
> > was my worries.
> 
> Thanks for the review Olivier.
> 
> Just to make sure the correct one is reviewed.
> 
> 1) v7 had similar issue mentioned
> http://patches.dpdk.org/patch/56585/
> 
> 2) v11 addressed the review comments and you have given the Acked-by for
> mempool change http://patches.dpdk.org/patch/61559/
> 
> My thought process in the TB meeting was, since
> rte_mempool_populate_from_pg_sz_chunks() reviwed replace
> rte_pktmbuf_pool_create's  rte_mempool_populate_default() with
> rte_mempool_populate_from_pg_sz_chunks()
> in IOVA == VA case to avoid a new KNI mempool_create API.
> 
> >
> > Currently, in rte_mempool_populate_default(), when the mempool is
> > populated, we first try to allocate one iova-contiguous block of (n *
> > elt_size). On success, we use this memory to fully populate the
> > mempool without taking care of crossing page boundaries.
> >
> > If we change the behavior to prevent objects from crossing pages, the
> > assumption that allocating (n * elt_size) is always enough becomes
> > wrong.  By luck, there is no real impact, because if the mempool is
> > not fully populated after this first iteration, it will allocate a new
> > chunk.
> >
> > To be rigorous, we need to better calculate the amount of memory to
> > allocate, according to page size.

Hi Olivier,

Thanks for the review. I think the problems mentioned below exist with the
current mempool_populate_default() API. Will there be a high chance of hitting
them once we precalculate the memory size (after preventing objects from
crossing page boundaries and fitting the complete mempool memory in a single
mem chunk) and the mempool size goes beyond the page size, as in the example
below?

Regards,
Vamsi

> >
> > Looking at the code, I found another problem in the same area: let's
> > say we populate a mempool that requires 1.1GB (and we use 1G huge pages):
> >	
> > 1/ mempool code will first tries to allocate an iova-contiguous zone
> >    of 1.1G -> fail
> > 2/ it then tries to allocate a page-aligned non iova-contiguous
> >    zone of 1.1G, which is 2G. On success, a lot of memory is wasted.
> > 3/ on error, we try to allocate the biggest zone, it can still return
> >    a zone between 1.1G and 2G, which can also waste memory.
> >
> > I will rework my mempool patchset to properly address these issues,
> > hopefully tomorrow.
> 
> OK.
> 
> 
> >
> > Also, I thought about another idea to solve your issue, not sure it is
> > better but it would not imply to change the mempool behavior. If I
> > understood the problem, when a mbuf is accross 2 pages, the copy of
> > the data can fail in kni because the mbuf is not virtually contiguous
> > in the
> 
> For KNI use case, we would need _physically_ contiguous to make sure that
> using, get_user_pages_remote() we get  physically contiguous memory map, so
> that both KNI kernel thread and KNI kernel context and DPDK userspace can use
> the same memory in different contexts.
> 
> 
> 
> > kernel. So why not in this case splitting the memcpy() into several,
> > each of them being on a single page (and calling phys2virt() for each
> > page)? The same would have to be done when accessing the fields of the
> > mbuf structure if it crosses a page boundary. Would that work? This
> 
> If the above is the requirement, Does this logic need to be in slow path or fast
> path?
> 
> > could be a B plan.
> 
> OK.
> 
> >
> > Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-25  9:20                                               ` Vamsi Krishna Attunuru
@ 2019-10-26 12:25                                                 ` Olivier Matz
  2019-10-26 14:09                                                   ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-26 12:25 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: Jerin Jacob, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

Hi Jerin, Hi Vamsi,

On Fri, Oct 25, 2019 at 09:20:20AM +0000, Vamsi Krishna Attunuru wrote:
> 
> 
> > -----Original Message-----
> > From: Jerin Jacob <jerinjacobk@gmail.com>
> > Sent: Friday, October 25, 2019 1:01 AM
> > To: Olivier Matz <olivier.matz@6wind.com>
> > Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> > thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
> > Kumar Kokkilagadda <kirankumark@marvell.com>; anatoly.burakov@intel.com;
> > stephen@networkplumber.org; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
> > 
> > On Thu, Oct 24, 2019 at 11:05 PM Olivier Matz <olivier.matz@6wind.com>
> > wrote:
> > >
> > > Hi,
> > >
> > > On Wed, Oct 23, 2019 at 08:32:08PM +0530, Jerin Jacob wrote:
> > > > On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz <olivier.matz@6wind.com>
> > wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > > > > > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > > > > > <vattunuru@marvell.com> wrote:
> > > > > > >
> > > > > > > Hi Ferruh,
> > > > > > >
> > > > > > > Can you please explain the problems in using kni dedicated mbuf alloc
> > routines while enabling kni iova=va mode. Please see the below discussion with
> > Andrew. He wanted to know the problems in having newer APIs.
> > > > > >
> > > > > >
> > > > > > While waiting  for the Ferruh reply, I would like to summarise
> > > > > > the current status
> > > > > >
> > > > > > # In order to make KNI work with IOVA as VA, We need to make
> > > > > > sure mempool pool _object_ should not span across two huge pages
> > > > > >
> > > > > > # This problem can be fixed by, either of:
> > > > > >
> > > > > > a) Introduce a flag in mempool to define this constraint, so
> > > > > > that, when only needed, this constraint enforced and this is in
> > > > > > line with existing semantics of addressing such problems in
> > > > > > mempool
> > > > > >
> > > > > > b) Instead of creating a flag, Make this behavior by default in
> > > > > > mempool for IOVA as VA case
> > > > > >
> > > > > > Upside:
> > > > > > b1) There is no need for specific mempool_create for KNI.
> > > > > >
> > > > > > Downside:
> > > > > > b2) Not align with existing mempool API semantics
> > > > > > b3) There will be a trivial amount of memory waste as we can not
> > > > > > allocate from the edge. Considering the normal huge page memory
> > > > > > size is 1G or 512MB this not a real issue.
> > > > > >
> > > > > > c) Make IOVA as PA when KNI kernel module is loaded
> > > > > >
> > > > > > Upside:
> > > > > > c1) Doing option (a) would call for new KNI specific mempool
> > > > > > create API i.e existing KNI applications need a one-line change
> > > > > > in application to make it work with release 19.11 or later.
> > > > > >
> > > > > > Downslide:
> > > > > > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not work
> > > > > > with KNI
> > > > > > c3) Need root privilege to run KNI as IOVA as PA need root
> > > > > > privilege
> > > > > >
> > > > > > For the next year, we expect applications to work 19.11 without
> > > > > > any code change. My personal opinion to make go with option (a)
> > > > > > and update the release notes to document the change any it
> > > > > > simple one-line change.
> > > > > >
> > > > > > The selection of (a) vs (b) is between KNI and Mempool maintainers.
> > > > > > Could we please reach a consensus? Or can we discuss this TB meeting?
> > > > > >
> > > > > > We are going back and forth on this feature on for the last 3
> > > > > > releases. Now that, we solved all the technical problems, please
> > > > > > help us to decide (a) vs (b) to make forward progress.
> > > > >
> > > > > Thank you for the summary.
> > > > > What is not clear to me is if (a) or (b) may break an existing
> > > > > application, and if yes, in which case.
> > > >
> > > > Thanks for the reply.
> > > >
> > > > To be clear we are talking about out of tree KNI tree application.
> > > > Which they don't want to
> > > > change rte_pktmbuf_pool_create() to rte_kni_pktmbuf_pool_create()
> > > > and build for v19.11
> > > >
> > > > So in case (b) there is no issue as It will be using rte_pktmbuf_pool_create ().
> > > > But in case of (a) it will create an issue if out of tree KNI
> > > > application is using rte_pktmbuf_pool_create() which is not using
> > > > the NEW flag.
> > >
> > > Following yesterday's discussion at techboard, I looked at the mempool
> > > code and at my previous RFC patch. It took some time to remind me what
> > > was my worries.
> > 
> > Thanks for the review Olivier.
> > 
> > Just to make sure the correct one is reviewed.
> > 
> > 1) v7 had similar issue mentioned
> > http://patches.dpdk.org/patch/56585/

The v7 has the problem I described below: the iova-contiguous allocation
may fail because the calculated size is too small, and remaining objects
will be added in another chunk. This can happen if a fully
iova-contiguous mempool is requested (it happens for octeontx).

Also, the way page size is retrieved in
rte_mempool_op_populate_default() assumes that memzones are used, which
is not correct.

> > 2) v11 addressed the review comments and you have given the Acked-by for
> > mempool change http://patches.dpdk.org/patch/61559/

The v11 looked fine to me, because it does not impact any default
behavior, and the plan was to remove this new function when my mempool
patchset was in.


> > 
> > My thought process in the TB meeting was, since
> > rte_mempool_populate_from_pg_sz_chunks() reviwed replace
> > rte_pktmbuf_pool_create's  rte_mempool_populate_default() with
> > rte_mempool_populate_from_pg_sz_chunks()
> > in IOVA == VA case to avoid a new KNI mempool_create API.
> > 
> > >
> > > Currently, in rte_mempool_populate_default(), when the mempool is
> > > populated, we first try to allocate one iova-contiguous block of (n *
> > > elt_size). On success, we use this memory to fully populate the
> > > mempool without taking care of crossing page boundaries.
> > >
> > > If we change the behavior to prevent objects from crossing pages, the
> > > assumption that allocating (n * elt_size) is always enough becomes
> > > wrong.  By luck, there is no real impact, because if the mempool is
> > > not fully populated after this first iteration, it will allocate a new
> > > chunk.
> > >
> > > To be rigorous, we need to better calculate the amount of memory to
> > > allocate, according to page size.
> 
> Hi Olivier,
> 
> Thanks for the review, I think the below mentioned problems exist with 
> current mempool_populate_default() api and will there be high chances of hitting 
> those problems when we precalculate the memory size(after preventing objs from 
> pg boundary and fit complete mempool memory in single mem chunk) and if
> mempool size goes beyond page size as below example. ?,  

Yes, the problem described below (alloc a mempool of 1.1GB resulting in
2GB reserved) exists in the current version. It will be fixed in the new
version of my "mempool: avoid objects allocations across pages"
patchset.

FYI, a reworked patchset is almost ready, I'll send it on Monday.

> 
> Regards,
> Vamsi
> 
> > >
> > > Looking at the code, I found another problem in the same area: let's
> > > say we populate a mempool that requires 1.1GB (and we use 1G huge pages):
> > >	
> > > 1/ mempool code will first tries to allocate an iova-contiguous zone
> > >    of 1.1G -> fail
> > > 2/ it then tries to allocate a page-aligned non iova-contiguous
> > >    zone of 1.1G, which is 2G. On success, a lot of memory is wasted.
> > > 3/ on error, we try to allocate the biggest zone, it can still return
> > >    a zone between 1.1G and 2G, which can also waste memory.
> > >
> > > I will rework my mempool patchset to properly address these issues,
> > > hopefully tomorrow.
> > 
> > OK.
> > 
> > 
> > >
> > > Also, I thought about another idea to solve your issue, not sure it is
> > > better but it would not imply to change the mempool behavior. If I
> > > understood the problem, when a mbuf is accross 2 pages, the copy of
> > > the data can fail in kni because the mbuf is not virtually contiguous
> > > in the
> > 
> > For KNI use case, we would need _physically_ contiguous to make sure that
> > using, get_user_pages_remote() we get  physically contiguous memory map, so
> > that both KNI kernel thread and KNI kernel context and DPDK userspace can use
> > the same memory in different contexts.
> > 
> > 
> > 
> > > kernel. So why not in this case splitting the memcpy() into several,
> > > each of them being on a single page (and calling phys2virt() for each
> > > page)? The same would have to be done when accessing the fields of the
> > > mbuf structure if it crosses a page boundary. Would that work? This
> > 
> > If the above is the requirement, Does this logic need to be in slow path or fast
> > path?

In the fast path. But I don't think the performance impact would be
significant.

Vamsi, do you confirm this approach could also solve the issue without
changing the mempool?

> > 
> > > could be a B plan.
> > 
> > OK.
> > 
> > >
> > > Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-26 12:25                                                 ` Olivier Matz
@ 2019-10-26 14:09                                                   ` Vamsi Krishna Attunuru
  2019-10-28 14:05                                                     ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-26 14:09 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Jerin Jacob, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

Hi Olivier,

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Saturday, October 26, 2019 5:55 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: Jerin Jacob <jerinjacobk@gmail.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> anatoly.burakov@intel.com; stephen@networkplumber.org; dev@dpdk.org
> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
> 
> Hi Jerin, Hi Vamsi,
> 
> On Fri, Oct 25, 2019 at 09:20:20AM +0000, Vamsi Krishna Attunuru wrote:
> >
> >
> > > -----Original Message-----
> > > From: Jerin Jacob <jerinjacobk@gmail.com>
> > > Sent: Friday, October 25, 2019 1:01 AM
> > > To: Olivier Matz <olivier.matz@6wind.com>
> > > Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Andrew
> Rybchenko
> > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> > > thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> > > Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > > anatoly.burakov@intel.com; stephen@networkplumber.org;
> dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy
> > > kni option
> > >
> > > On Thu, Oct 24, 2019 at 11:05 PM Olivier Matz
> > > <olivier.matz@6wind.com>
> > > wrote:
> > > >
> > > > Hi,
> > > >
> > > > On Wed, Oct 23, 2019 at 08:32:08PM +0530, Jerin Jacob wrote:
> > > > > On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz
> > > > > <olivier.matz@6wind.com>
> > > wrote:
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > > > > > > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > > > > > > <vattunuru@marvell.com> wrote:
> > > > > > > >
> > > > > > > > Hi Ferruh,
> > > > > > > >
> > > > > > > > Can you please explain the problems in using kni dedicated
> > > > > > > > mbuf alloc
> > > routines while enabling kni iova=va mode. Please see the below
> > > discussion with Andrew. He wanted to know the problems in having
> newer APIs.
> > > > > > >
> > > > > > >
> > > > > > > While waiting  for the Ferruh reply, I would like to
> > > > > > > summarise the current status
> > > > > > >
> > > > > > > # In order to make KNI work with IOVA as VA, We need to make
> > > > > > > sure mempool pool _object_ should not span across two huge
> > > > > > > pages
> > > > > > >
> > > > > > > # This problem can be fixed by, either of:
> > > > > > >
> > > > > > > a) Introduce a flag in mempool to define this constraint, so
> > > > > > > that, when only needed, this constraint enforced and this is
> > > > > > > in line with existing semantics of addressing such problems
> > > > > > > in mempool
> > > > > > >
> > > > > > > b) Instead of creating a flag, Make this behavior by default
> > > > > > > in mempool for IOVA as VA case
> > > > > > >
> > > > > > > Upside:
> > > > > > > b1) There is no need for specific mempool_create for KNI.
> > > > > > >
> > > > > > > Downside:
> > > > > > > b2) Not align with existing mempool API semantics
> > > > > > > b3) There will be a trivial amount of memory waste as we can
> > > > > > > not allocate from the edge. Considering the normal huge page
> > > > > > > memory size is 1G or 512MB this not a real issue.
> > > > > > >
> > > > > > > c) Make IOVA as PA when KNI kernel module is loaded
> > > > > > >
> > > > > > > Upside:
> > > > > > > c1) Doing option (a) would call for new KNI specific mempool
> > > > > > > create API i.e existing KNI applications need a one-line
> > > > > > > change in application to make it work with release 19.11 or later.
> > > > > > >
> > > > > > > Downslide:
> > > > > > > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not
> > > > > > > work with KNI
> > > > > > > c3) Need root privilege to run KNI as IOVA as PA need root
> > > > > > > privilege
> > > > > > >
> > > > > > > For the next year, we expect applications to work 19.11
> > > > > > > without any code change. My personal opinion to make go with
> > > > > > > option (a) and update the release notes to document the
> > > > > > > change any it simple one-line change.
> > > > > > >
> > > > > > > The selection of (a) vs (b) is between KNI and Mempool
> maintainers.
> > > > > > > Could we please reach a consensus? Or can we discuss this TB
> meeting?
> > > > > > >
> > > > > > > We are going back and forth on this feature on for the last
> > > > > > > 3 releases. Now that, we solved all the technical problems,
> > > > > > > please help us to decide (a) vs (b) to make forward progress.
> > > > > >
> > > > > > Thank you for the summary.
> > > > > > What is not clear to me is if (a) or (b) may break an existing
> > > > > > application, and if yes, in which case.
> > > > >
> > > > > Thanks for the reply.
> > > > >
> > > > > To be clear we are talking about out of tree KNI tree application.
> > > > > Which they don't want to
> > > > > change rte_pktmbuf_pool_create() to
> > > > > rte_kni_pktmbuf_pool_create() and build for v19.11
> > > > >
> > > > > So in case (b) there is no issue as It will be using
> rte_pktmbuf_pool_create ().
> > > > > But in case of (a) it will create an issue if out of tree KNI
> > > > > application is using rte_pktmbuf_pool_create() which is not
> > > > > using the NEW flag.
> > > >
> > > > Following yesterday's discussion at techboard, I looked at the
> > > > mempool code and at my previous RFC patch. It took some time to
> > > > remind me what was my worries.
> > >
> > > Thanks for the review Olivier.
> > >
> > > Just to make sure the correct one is reviewed.
> > >
> > > 1) v7 had similar issue mentioned
> > > http://patches.dpdk.org/patch/56585/
> 
> The v7 has the problem I described below: the iova-contiguous allocation
> may fail because the calculated size is too small, and remaining objects will
> be added in another chunk. This can happen if a fully iova-contiguous
> mempool is requested (it happens for octeontx).
> 
> Also, the way page size is retrieved in
> rte_mempool_op_populate_default() assume that memzones are used,
> which is not correct.
> 
> > > 2) v11 addressed the review comments and you have given the Acked-by
> > > for mempool change http://patches.dpdk.org/patch/61559/
> 
> The v11 looked fine to me, because it does not impact any default behavior,
> and the plan was to remove this new function when my mempool patchset
> was in.
> 
> 
> > >
> > > My thought process in the TB meeting was, since
> > > rte_mempool_populate_from_pg_sz_chunks() reviwed replace
> > > rte_pktmbuf_pool_create's  rte_mempool_populate_default() with
> > > rte_mempool_populate_from_pg_sz_chunks()
> > > in IOVA == VA case to avoid a new KNI mempool_create API.
> > >
> > > >
> > > > Currently, in rte_mempool_populate_default(), when the mempool is
> > > > populated, we first try to allocate one iova-contiguous block of
> > > > (n * elt_size). On success, we use this memory to fully populate
> > > > the mempool without taking care of crossing page boundaries.
> > > >
> > > > If we change the behavior to prevent objects from crossing pages,
> > > > the assumption that allocating (n * elt_size) is always enough
> > > > becomes wrong.  By luck, there is no real impact, because if the
> > > > mempool is not fully populated after this first iteration, it will
> > > > allocate a new chunk.
> > > >
> > > > To be rigorous, we need to better calculate the amount of memory
> > > > to allocate, according to page size.
> >
> > Hi Olivier,
> >
> > Thanks for the review, I think the below mentioned problems exist with
> > current mempool_populate_default() api and will there be high chances
> > of hitting those problems when we precalculate the memory size(after
> > preventing objs from pg boundary and fit complete mempool memory in
> > single mem chunk) and if mempool size goes beyond page size as below
> > example. ?,
> 
> Yes, the problem described below (alloc a mempool of 1.1GB resulting in 2GB
> reserved) exists in the current version. It will be fixed in the new version of
> my "mempool: avoid objects allocations across pages"
> patchset.
> 
> FYI, a reworked patchset is alsmost ready, I'll send it monday.

Thanks a lot Olivier.

> 
> >
> > Regards,
> > Vamsi
> >
> > > >
> > > > Looking at the code, I found another problem in the same area:
> > > > let's say we populate a mempool that requires 1.1GB (and we use 1G
> huge pages):
> > > >
> > > > 1/ mempool code will first tries to allocate an iova-contiguous zone
> > > >    of 1.1G -> fail
> > > > 2/ it then tries to allocate a page-aligned non iova-contiguous
> > > >    zone of 1.1G, which is 2G. On success, a lot of memory is wasted.
> > > > 3/ on error, we try to allocate the biggest zone, it can still return
> > > >    a zone between 1.1G and 2G, which can also waste memory.
> > > >
> > > > I will rework my mempool patchset to properly address these
> > > > issues, hopefully tomorrow.
> > >
> > > OK.
> > >
> > >
> > > >
> > > > Also, I thought about another idea to solve your issue, not sure
> > > > it is better but it would not imply to change the mempool
> > > > behavior. If I understood the problem, when a mbuf is accross 2
> > > > pages, the copy of the data can fail in kni because the mbuf is
> > > > not virtually contiguous in the
> > >
> > > For KNI use case, we would need _physically_ contiguous to make sure
> > > that using, get_user_pages_remote() we get  physically contiguous
> > > memory map, so that both KNI kernel thread and KNI kernel context
> > > and DPDK userspace can use the same memory in different contexts.
> > >
> > >
> > >
> > > > kernel. So why not in this case splitting the memcpy() into
> > > > several, each of them being on a single page (and calling
> > > > phys2virt() for each page)? The same would have to be done when
> > > > accessing the fields of the mbuf structure if it crosses a page
> > > > boundary. Would that work? This
> > >
> > > If the above is the requirement, Does this logic need to be in slow
> > > path or fast path?
> 
> In fast path. But I don't think the performance impact would be significative.
> 
> Vamsi, do you confirm this approach could also solve the issue without
> changing the mempool?

So far no performance impact has been observed when this feature (KNI IOVA as VA) is enabled with this patch set. I am not sure of the impact once the memcpy split and the extra address translations are introduced; that approach might solve the issue, but I feel it is not a cleaner way of solving it.
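
For reference, a rough sketch of what that "plan B" copy could look like on
the kernel side: split the copy at page boundaries and translate each source
page separately. phys_to_virt() stands in for whatever per-page translation
KNI would actually use; this is an illustration, not code from the series.

#include <linux/mm.h>
#include <linux/io.h>
#include <linux/string.h>

/* Sketch: never let a single memcpy() cross a page boundary of the
 * source buffer; each source page is translated on its own.
 */
static void kni_copy_by_page(void *dst, phys_addr_t src_pa, size_t len)
{
	size_t chunk;

	while (len > 0) {
		/* bytes remaining in the current source page */
		chunk = PAGE_SIZE - (src_pa & (PAGE_SIZE - 1));
		if (chunk > len)
			chunk = len;
		memcpy(dst, phys_to_virt(src_pa), chunk);
		dst = (char *)dst + chunk;
		src_pa += chunk;
		len -= chunk;
	}
}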
> 
> > >
> > > > could be a B plan.
> > >
> > > OK.
> > >
> > > >
> > > > Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH 0/5] mempool: avoid objects allocations across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (5 preceding siblings ...)
  2019-08-07 15:21                         ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
@ 2019-10-28 14:01                         ` Olivier Matz
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt Olivier Matz
                                             ` (4 more replies)
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
                                           ` (2 subsequent siblings)
  9 siblings, 5 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:01 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

KNI assumes that mbufs are contiguous in kernel virtual memory. This
may not be true when using the IOVA=VA mode. To fix this, a possibility
is to ensure that objects do not cross page boundaries in the mempool. This
patchset implements this in the last patch (5/5).

The previous patches prepare the job:
- allow to populate with an unaligned virtual area (1/5).
- reduce space wasted in the mempool size calculation when not using
  the iova-contiguous allocation (2/5).
- remove the iova-contiguous allocation when populating mempool (3/5):
  a va-contiguous alloc does the job as well if we want to populate
  without crossing page boundaries, so simplify the mempool populate
  function.
- export a function to get the minimum page used in a mempool (4/5)

Memory consumption impact when using hugepages:
- worst case: + ~0.1% for a mbuf pool (objsize ~= 2368)
- best case: -50% if the pool size is just above the page size

With 4K pages in IOVA=VA mode, however, a mbuf pool could consume up to
75% more memory, because there will be only one mbuf per page (with an
object size of ~2368 bytes, two objects no longer fit in a 4K page, so
the per-object footprint grows from ~2368 to 4096 bytes). Not sure how
common this use case is.

Caveat: this changes the behavior of the mempool (calc_mem_size and
populate), and there is a small risk to break things, especially with
alternate mempool drivers.
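
To make the constraint added by the last patch concrete, the property it
enforces can be sketched as the check below (illustration only, not the patch
code): the populate step avoids placing an object where it would straddle a
page boundary, except for objects larger than a page, which cannot avoid it.

#include <stddef.h>
#include <stdint.h>

/* Sketch: would an object of size total_elt_sz placed at addr cross a
 * pg_sz boundary? Objects larger than a page cannot avoid crossing, so
 * they are not treated as a violation (pg_sz == 0 means no constraint).
 */
static int
obj_crosses_page(uintptr_t addr, size_t total_elt_sz, size_t pg_sz)
{
	if (pg_sz == 0 || total_elt_sz > pg_sz)
		return 0;
	return (addr / pg_sz) != ((addr + total_elt_sz - 1) / pg_sz);
}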

rfc -> v1

* remove first cleanup patch, it was pushed separately
  a2b5a8722f20 ("mempool: clarify default populate function")
* add missing change in rte_mempool_op_calc_mem_size_default()
* allow unaligned addr/len in populate virt
* better split patches
* try to better explain the change
* use DPDK align macros when relevant

Olivier Matz (5):
  mempool: allow unaligned addr/len in populate virt
  mempool: reduce wasted space on mempool populate
  mempool: remove optimistic IOVA-contiguous allocation
  mempool: introduce function to get mempool page size
  mempool: prevent objects from being across pages

 lib/librte_mempool/rte_mempool.c             | 140 +++++++------------
 lib/librte_mempool/rte_mempool.h             |  12 +-
 lib/librte_mempool/rte_mempool_ops.c         |   4 +-
 lib/librte_mempool/rte_mempool_ops_default.c |  57 ++++++--
 lib/librte_mempool/rte_mempool_version.map   |   1 +
 5 files changed, 114 insertions(+), 100 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
@ 2019-10-28 14:01                           ` Olivier Matz
  2019-10-29  9:02                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-10-29  9:21                             ` [dpdk-dev] " Andrew Rybchenko
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate Olivier Matz
                                             ` (3 subsequent siblings)
  4 siblings, 2 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:01 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

rte_mempool_populate_virt() currently requires that both addr
and length are page-aligned.

Remove this unneeded constraint, which can be annoying with big
hugepages (e.g. 1GB).

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c | 18 +++++++-----------
 lib/librte_mempool/rte_mempool.h |  3 +--
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 0f29e8712..76cbacdf3 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	size_t off, phys_len;
 	int ret, cnt = 0;
 
-	/* address and len must be page-aligned */
-	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
-		return -EINVAL;
-	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
-		return -EINVAL;
-
 	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
 		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
 			len, free_cb, opaque);
 
-	for (off = 0; off + pg_sz <= len &&
+	for (off = 0; off < len &&
 		     mp->populated_size < mp->size; off += phys_len) {
 
 		iova = rte_mem_virt2iova(addr + off);
@@ -389,7 +383,10 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 		}
 
 		/* populate with the largest group of contiguous pages */
-		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
+		for (phys_len = RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
+			     (addr + off);
+		     off + phys_len < len;
+		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
 			rte_iova_t iova_tmp;
 
 			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
@@ -575,8 +572,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags,
-					RTE_MAX(pg_sz, align));
+					mp->socket_id, flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -601,7 +597,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 				(void *)(uintptr_t)mz);
 		else
 			ret = rte_mempool_populate_virt(mp, mz->addr,
-				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
+				mz->len, pg_sz,
 				rte_mempool_memchunk_mz_free,
 				(void *)(uintptr_t)mz);
 		if (ret < 0) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a04..0fe8aa7b8 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
  *   A pointer to the mempool structure.
  * @param addr
  *   The virtual address of memory that should be used to store objects.
- *   Must be page-aligned.
  * @param len
- *   The length of memory in bytes. Must be page-aligned.
+ *   The length of memory in bytes.
  * @param pg_sz
  *   The size of memory pages in this virtual area.
  * @param free_cb
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt Olivier Matz
@ 2019-10-28 14:01                           ` Olivier Matz
  2019-10-29 10:09                             ` Andrew Rybchenko
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
                                             ` (2 subsequent siblings)
  4 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:01 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

The size returned by rte_mempool_op_calc_mem_size_default() is aligned
to the specified page size. Therefore, with big pages, the returned size
can be much more than what we really need to populate the mempool.

For instance, populating a mempool that requires 1.1GB of memory with
1GB hugepages can result in allocating 2GB of memory.

This problem is hidden most of the time due to the allocation method of
rte_mempool_populate_default(): when try_iova_contig_mempool=true, it
first tries to allocate an iova contiguous area, without the alignment
constraint. If it fails, it falls back to an aligned allocation that does
not need to be iova-contiguous. This can also fall back to several
smaller aligned allocations.

This commit changes rte_mempool_op_calc_mem_size_default() to relax the
alignment constraint to a cache line and to return a smaller size.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c             |  7 ++---
 lib/librte_mempool/rte_mempool.h             |  2 +-
 lib/librte_mempool/rte_mempool_ops.c         |  4 ++-
 lib/librte_mempool/rte_mempool_ops_default.c | 28 +++++++++++++++-----
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 76cbacdf3..3275e48c9 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -474,11 +474,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * wasting some space this way, but it's much nicer than looping around
 	 * trying to reserve each and every page size.
 	 *
-	 * However, since size calculation will produce page-aligned sizes, it
-	 * makes sense to first try and see if we can reserve the entire memzone
-	 * in one contiguous chunk as well (otherwise we might end up wasting a
-	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
-	 * memory, then we'll go and reserve space page-by-page.
+	 * If we fail to get enough contiguous memory, then we'll go and
+	 * reserve space in smaller chunks.
 	 *
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 0fe8aa7b8..8fa3c04e5 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
  * @param[out] align
  *   Location for required memory chunk alignment.
  * @return
- *   Required memory size aligned at page boundary.
+ *   Required memory size.
  */
 typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		uint32_t obj_num,  uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index e02eb702c..22c5251eb 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to notify new memory area to external mempool */
+/* wrapper to calculate the memory size required to store given number
+ * of objects
+ */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc82d..f6aea7662 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -12,7 +12,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				     size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
-	size_t obj_per_page, pg_num, pg_sz;
+	size_t obj_per_page, pg_sz, objs_in_last_page;
 	size_t mem_size;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
@@ -33,14 +33,30 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 			mem_size =
 				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
 		} else {
-			pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
-			mem_size = pg_num << pg_shift;
+			/* In the best case, the allocator will return a
+			 * page-aligned address. For example, with 5 objs,
+			 * the required space is as below:
+			 *  |     page0     |     page1     |  page2 (last) |
+			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
+			 *  <------------- mem_size ------------->
+			 */
+			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
+			/* room required for the last page */
+			mem_size = objs_in_last_page * total_elt_sz;
+			/* room required for other pages */
+			mem_size += ((obj_num - objs_in_last_page) /
+				obj_per_page) << pg_shift;
+
+			/* In the worst case, the allocator returns a
+			 * non-aligned pointer, wasting up to
+			 * total_elt_sz. Add a margin for that.
+			 */
+			 mem_size += total_elt_sz - 1;
 		}
 	}
 
-	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
-
-	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
+	*min_chunk_size = total_elt_sz;
+	*align = RTE_CACHE_LINE_SIZE;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt Olivier Matz
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate Olivier Matz
@ 2019-10-28 14:01                           ` Olivier Matz
  2019-10-29 10:25                             ` Andrew Rybchenko
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size Olivier Matz
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages Olivier Matz
  4 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:01 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

The previous commit reduced the amount of required memory when
populating the mempool with non iova-contiguous memory.

Since there is no big advantage in having a fully iova-contiguous mempool
if it is not explicitly requested, remove this code; it simplifies the
populate function.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 3275e48c9..5e1f202dc 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -427,7 +427,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool try_iova_contig_mempool;
 	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
@@ -480,9 +479,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
 	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold. We should also try
-	 * to go for contiguous memory even if we're in no-huge mode, because
-	 * external memory may in fact be IOVA-contiguous.
+	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	/* check if we can retrieve a valid socket ID */
@@ -491,7 +488,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		return -EINVAL;
 	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
-	try_iova_contig_mempool = false;
 
 	if (!need_iova_contig_obj) {
 		pg_sz = 0;
@@ -500,7 +496,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		pg_sz = 0;
 		pg_shift = 0;
 	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		try_iova_contig_mempool = true;
 		pg_sz = get_min_page_size(mp->socket_id);
 		pg_shift = rte_bsf32(pg_sz);
 	} else {
@@ -510,14 +505,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
-		unsigned int flags;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					0, &min_chunk_size, &align);
-		else
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
+		mem_size = rte_mempool_ops_calc_mem_size(
+			mp, n, pg_shift, &min_chunk_size, &align);
 
 		if (mem_size < 0) {
 			ret = mem_size;
@@ -531,36 +521,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
-		flags = mz_flags;
-
 		/* if we're trying to reserve contiguous memory, add appropriate
 		 * memzone flag.
 		 */
-		if (try_iova_contig_mempool)
-			flags |= RTE_MEMZONE_IOVA_CONTIG;
+		if (min_chunk_size == (size_t)mem_size)
+			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-
-		/* if we were trying to allocate contiguous memory, failed and
-		 * minimum required contiguous chunk fits minimum page, adjust
-		 * memzone size to the page size, and try again.
-		 */
-		if (mz == NULL && try_iova_contig_mempool &&
-				min_chunk_size <= pg_sz) {
-			try_iova_contig_mempool = false;
-			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
-			if (mem_size < 0) {
-				ret = mem_size;
-				goto fail;
-			}
+				mp->socket_id, mz_flags, align);
 
-			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
@@ -569,7 +538,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags, align);
+					mp->socket_id, mz_flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -587,7 +556,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		else
 			iova = RTE_BAD_IOVA;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
+		if (pg_sz == 0)
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
                                             ` (2 preceding siblings ...)
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
@ 2019-10-28 14:01                           ` Olivier Matz
  2019-10-29 10:31                             ` Andrew Rybchenko
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages Olivier Matz
  4 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:01 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

In rte_mempool_populate_default(), we determine the page size,
which is needed for calc_size and allocation of memory.

Move this into a function and export it; it will be used in the next
commit.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c           | 51 ++++++++++++++--------
 lib/librte_mempool/rte_mempool.h           |  7 +++
 lib/librte_mempool/rte_mempool_version.map |  1 +
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 5e1f202dc..7664764e5 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -411,6 +411,33 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Get the minimal page size used in a mempool before populating it. */
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
+{
+	bool need_iova_contig_obj;
+	bool alloc_in_ext_mem;
+	int ret;
+
+	/* check if we can retrieve a valid socket ID */
+	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+	if (ret < 0)
+		return -EINVAL;
+	alloc_in_ext_mem = (ret == 1);
+	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+
+	if (!need_iova_contig_obj)
+		*pg_sz = 0;
+	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
+		*pg_sz = 0;
+	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
+		*pg_sz = get_min_page_size(mp->socket_id);
+	else
+		*pg_sz = getpagesize();
+
+	return 0;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
@@ -422,12 +449,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
 	ssize_t mem_size;
-	size_t align, pg_sz, pg_shift;
+	size_t align, pg_sz, pg_shift = 0;
 	rte_iova_t iova;
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -482,26 +508,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
-	/* check if we can retrieve a valid socket ID */
-	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
-	if (ret < 0)
-		return -EINVAL;
-	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
-	if (!need_iova_contig_obj) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		pg_sz = get_min_page_size(mp->socket_id);
-		pg_shift = rte_bsf32(pg_sz);
-	} else {
-		pg_sz = getpagesize();
+	if (pg_sz != 0)
 		pg_shift = rte_bsf32(pg_sz);
-	}
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8fa3c04e5..6d98b3743 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1691,6 +1691,13 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
 		      void *arg);
 
+/**
+ * @internal Get page size used for mempool object allocation.
+ */
+__rte_experimental
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca460..4eff2767d 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -56,5 +56,6 @@ DPDK_18.05 {
 EXPERIMENTAL {
 	global:
 
+	rte_mempool_get_page_size;
 	rte_mempool_ops_get_info;
 };
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
                                             ` (3 preceding siblings ...)
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-10-28 14:01                           ` Olivier Matz
  2019-10-29 10:59                             ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
  2019-10-29 17:25                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  4 siblings, 2 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:01 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

When populating a mempool, ensure that objects are not located across
several pages, except if the user did not request IOVA-contiguous objects.
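
As a concrete illustration (editor's note; the numbers are assumed, not
taken from the patch): with 2 MB pages and total_elt_sz = 2304 bytes, an
object that would start 2048 bytes before the end of a page would end 256
bytes into the next page, so the new check makes the populate loop skip
ahead to the next page start before placing it:

	/* assumed values, for illustration only */
	size_t pg_sz = 2 * 1024 * 1024;   /* 2 MB page */
	size_t total_elt_sz = 2304;       /* header + elt + trailer */
	char *obj = va + 0x1ff800;        /* 2048 B before the page end */

	/* RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + total_elt_sz - 1, pg_sz),
	 * so check_obj_bounds() returns -1 and off is rounded up to the
	 * start of the next page. */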

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
 lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++--
 2 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 7664764e5..b23fd1b06 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
 
 	if (!need_iova_contig_obj)
 		*pg_sz = 0;
-	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
-		*pg_sz = 0;
 	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
 		*pg_sz = get_min_page_size(mp->socket_id);
 	else
@@ -478,17 +476,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * then just set page shift and page size to 0, because the user has
 	 * indicated that there's no need to care about anything.
 	 *
-	 * if we do need contiguous objects, there is also an option to reserve
-	 * the entire mempool memory as one contiguous block of memory, in
-	 * which case the page shift and alignment wouldn't matter as well.
+	 * if we do need contiguous objects (if a mempool driver has its
+	 * own calc_size() method returning min_chunk_size = mem_size),
+	 * there is also an option to reserve the entire mempool memory
+	 * as one contiguous block of memory.
 	 *
 	 * if we require contiguous objects, but not necessarily the entire
-	 * mempool reserved space to be contiguous, then there are two options.
-	 *
-	 * if our IO addresses are virtual, not actual physical (IOVA as VA
-	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous IO memory as far as the hardware is concerned, so
-	 * act as if we're getting contiguous memory.
+	 * mempool reserved space to be contiguous, pg_sz will be != 0,
+	 * and the default ops->populate() will take care of not placing
+	 * objects across pages.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
 	 * pages, or we might get memory from smaller pages, and how much of it
@@ -501,11 +497,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * If we fail to get enough contiguous memory, then we'll go and
 	 * reserve space in smaller chunks.
-	 *
-	 * We also have to take into account the fact that memory that we're
-	 * going to allocate from can belong to an externally allocated memory
-	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index f6aea7662..dd09a0a32 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
 
+	rte_mempool_get_page_size(mp, &pg_sz);
+
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	for (off = 0, i = 0; i < max_objs; i++) {
+		/* align offset to next page start if required */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
-		obj = (char *)vaddr + off;
+		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
  2019-10-26 14:09                                                   ` Vamsi Krishna Attunuru
@ 2019-10-28 14:05                                                     ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:05 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: Jerin Jacob, Andrew Rybchenko, Ferruh Yigit, thomas,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	anatoly.burakov, stephen, dev

On Sat, Oct 26, 2019 at 02:09:51PM +0000, Vamsi Krishna Attunuru wrote:
> Hi Olivier,
> 
> > -----Original Message-----
> > From: Olivier Matz <olivier.matz@6wind.com>
> > Sent: Saturday, October 26, 2019 5:55 PM
> > To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Cc: Jerin Jacob <jerinjacobk@gmail.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> > thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> > Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > anatoly.burakov@intel.com; stephen@networkplumber.org; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy kni option
> > 
> > Hi Jerin, Hi Vamsi,
> > 
> > On Fri, Oct 25, 2019 at 09:20:20AM +0000, Vamsi Krishna Attunuru wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Jerin Jacob <jerinjacobk@gmail.com>
> > > > Sent: Friday, October 25, 2019 1:01 AM
> > > > To: Olivier Matz <olivier.matz@6wind.com>
> > > > Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Andrew
> > Rybchenko
> > > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@intel.com>;
> > > > thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> > > > Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > > > anatoly.burakov@intel.com; stephen@networkplumber.org;
> > dev@dpdk.org
> > > > Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v11 2/4] eal: add legacy
> > > > kni option
> > > >
> > > > On Thu, Oct 24, 2019 at 11:05 PM Olivier Matz
> > > > <olivier.matz@6wind.com>
> > > > wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > On Wed, Oct 23, 2019 at 08:32:08PM +0530, Jerin Jacob wrote:
> > > > > > On Wed, Oct 23, 2019 at 8:17 PM Olivier Matz
> > > > > > <olivier.matz@6wind.com>
> > > > wrote:
> > > > > > >
> > > > > > > Hi,
> > > > > > >
> > > > > > > On Wed, Oct 23, 2019 at 03:42:39PM +0530, Jerin Jacob wrote:
> > > > > > > > On Tue, Oct 22, 2019 at 7:01 PM Vamsi Krishna Attunuru
> > > > > > > > <vattunuru@marvell.com> wrote:
> > > > > > > > >
> > > > > > > > > Hi Ferruh,
> > > > > > > > >
> > > > > > > > > Can you please explain the problems in using kni dedicated
> > > > > > > > > mbuf alloc
> > > > routines while enabling kni iova=va mode. Please see the below
> > > > discussion with Andrew. He wanted to know the problems in having
> > newer APIs.
> > > > > > > >
> > > > > > > >
> > > > > > > > While waiting for Ferruh's reply, I would like to
> > > > > > > > summarise the current status:
> > > > > > > >
> > > > > > > > # In order to make KNI work with IOVA as VA, we need to make
> > > > > > > > sure a mempool _object_ does not span across two huge
> > > > > > > > pages
> > > > > > > >
> > > > > > > > # This problem can be fixed by, either of:
> > > > > > > >
> > > > > > > > a) Introduce a flag in mempool to define this constraint, so
> > > > > > > > that this constraint is enforced only when needed; this is
> > > > > > > > in line with the existing semantics of addressing such problems
> > > > > > > > in mempool
> > > > > > > >
> > > > > > > > b) Instead of creating a flag, Make this behavior by default
> > > > > > > > in mempool for IOVA as VA case
> > > > > > > >
> > > > > > > > Upside:
> > > > > > > > b1) There is no need for specific mempool_create for KNI.
> > > > > > > >
> > > > > > > > Downside:
> > > > > > > > b2) Not align with existing mempool API semantics
> > > > > > > > b3) There will be a trivial amount of memory waste as we
> > > > > > > > cannot allocate from the edge. Considering the normal huge page
> > > > > > > > size is 1G or 512MB, this is not a real issue.
> > > > > > > >
> > > > > > > > c) Make IOVA as PA when KNI kernel module is loaded
> > > > > > > >
> > > > > > > > Upside:
> > > > > > > > c1) Doing option (a) would call for a new KNI-specific mempool
> > > > > > > > create API, i.e. existing KNI applications need a one-line
> > > > > > > > change to make it work with release 19.11 or later.
> > > > > > > >
> > > > > > > > Downside:
> > > > > > > > c2) Driver which needs RTE_PCI_DRV_NEED_IOVA_AS_VA can not
> > > > > > > > work with KNI
> > > > > > > > c3) Need root privilege to run KNI, as IOVA as PA needs root
> > > > > > > > privilege
> > > > > > > >
> > > > > > > > For the next year, we expect applications to work with 19.11
> > > > > > > > without any code change. My personal opinion is to go with
> > > > > > > > option (a) and update the release notes to document the
> > > > > > > > change, as it is a simple one-line change.
> > > > > > > >
> > > > > > > > The selection of (a) vs (b) is between the KNI and Mempool
> > maintainers.
> > > > > > > > Could we please reach a consensus? Or can we discuss this at the TB
> > meeting?
> > > > > > > >
> > > > > > > > We have been going back and forth on this feature for the last
> > > > > > > > 3 releases. Now that we have solved all the technical problems,
> > > > > > > > please help us decide (a) vs (b) to make forward progress.
> > > > > > >
> > > > > > > Thank you for the summary.
> > > > > > > What is not clear to me is if (a) or (b) may break an existing
> > > > > > > application, and if yes, in which case.
> > > > > >
> > > > > > Thanks for the reply.
> > > > > >
> > > > > > To be clear, we are talking about out-of-tree KNI applications,
> > > > > > which don't want to
> > > > > > change rte_pktmbuf_pool_create() to
> > > > > > rte_kni_pktmbuf_pool_create() and rebuild for v19.11.
> > > > > >
> > > > > > So in case (b) there is no issue, as it will be using
> > rte_pktmbuf_pool_create().
> > > > > > But in case (a) it will create an issue if an out-of-tree KNI
> > > > > > application is using rte_pktmbuf_pool_create(), which does not
> > > > > > set the NEW flag.
> > > > >
> > > > > Following yesterday's discussion at techboard, I looked at the
> > > > > mempool code and at my previous RFC patch. It took some time to
> > > > > remind me what my worries were.
> > > >
> > > > Thanks for the review Olivier.
> > > >
> > > > Just to make sure the correct one is reviewed.
> > > >
> > > > 1) v7 had a similar issue, mentioned at
> > > > http://patches.dpdk.org/patch/56585/
> > 
> > The v7 has the problem I described below: the iova-contiguous allocation
> > may fail because the calculated size is too small, and remaining objects will
> > be added in another chunk. This can happen if a fully iova-contiguous
> > mempool is requested (it happens for octeontx).
> > 
> > Also, the way page size is retrieved in
> > rte_mempool_op_populate_default() assumes that memzones are used,
> > which is not correct.
> > 
> > > > 2) v11 addressed the review comments and you have given the Acked-by
> > > > for the mempool change: http://patches.dpdk.org/patch/61559/
> > 
> > The v11 looked fine to me, because it does not impact any default behavior,
> > and the plan was to remove this new function when my mempool patchset
> > was in.
> > 
> > 
> > > >
> > > > My thought process in the TB meeting was: since
> > > > rte_mempool_populate_from_pg_sz_chunks() was reviewed, replace
> > > > rte_pktmbuf_pool_create's rte_mempool_populate_default() with
> > > > rte_mempool_populate_from_pg_sz_chunks()
> > > > in the IOVA == VA case to avoid a new KNI mempool_create API.
> > > >
> > > > >
> > > > > Currently, in rte_mempool_populate_default(), when the mempool is
> > > > > populated, we first try to allocate one iova-contiguous block of
> > > > > (n * elt_size). On success, we use this memory to fully populate
> > > > > the mempool without taking care of crossing page boundaries.
> > > > >
> > > > > If we change the behavior to prevent objects from crossing pages,
> > > > > the assumption that allocating (n * elt_size) is always enough
> > > > > becomes wrong.  By luck, there is no real impact, because if the
> > > > > mempool is not fully populated after this first iteration, it will
> > > > > allocate a new chunk.
> > > > >
> > > > > To be rigorous, we need to better calculate the amount of memory
> > > > > to allocate, according to page size.
> > >
> > > Hi Olivier,
> > >
> > > Thanks for the review. I think the below-mentioned problems exist with
> > > the current mempool_populate_default() API. Will there be a high chance
> > > of hitting those problems when we precalculate the memory size (after
> > > keeping objects off page boundaries and fitting the complete mempool
> > > memory in a single mem chunk) and the mempool size goes beyond the page
> > > size, as in the example below?
> > 
> > Yes, the problem described below (alloc a mempool of 1.1GB resulting in 2GB
> > reserved) exists in the current version. It will be fixed in the new version of
> > my "mempool: avoid objects allocations across pages"
> > patchset.
> > 
> > FYI, a reworked patchset is almost ready, I'll send it on Monday.
> 
> Thanks a lot Olivier.
> 
> > 
> > >
> > > Regards,
> > > Vamsi
> > >
> > > > >
> > > > > Looking at the code, I found another problem in the same area:
> > > > > let's say we populate a mempool that requires 1.1GB (and we use 1G
> > huge pages):
> > > > >
> > > > > 1/ mempool code will first try to allocate an iova-contiguous zone
> > > > >    of 1.1G -> fail
> > > > > 2/ it then tries to allocate a page-aligned non iova-contiguous
> > > > >    zone of 1.1G, which is 2G. On success, a lot of memory is wasted.
> > > > > 3/ on error, we try to allocate the biggest zone, it can still return
> > > > >    a zone between 1.1G and 2G, which can also waste memory.
> > > > >
> > > > > I will rework my mempool patchset to properly address these
> > > > > issues, hopefully tomorrow.
> > > >
> > > > OK.
> > > >
> > > >
> > > > >
> > > > > Also, I thought about another idea to solve your issue; not sure
> > > > > it is better, but it would not imply changing the mempool
> > > > > behavior. If I understood the problem, when an mbuf is across 2
> > > > > pages, the copy of the data can fail in kni because the mbuf is
> > > > > not virtually contiguous in the
> > > >
> > > > For the KNI use case, we would need _physically_ contiguous memory,
> > > > to make sure that, using get_user_pages_remote(), we get a physically
> > > > contiguous memory map, so that the KNI kernel thread, the KNI kernel
> > > > context and DPDK userspace can use the same memory in different contexts.
> > > >
> > > >
> > > >
> > > > > kernel. So why not, in this case, split the memcpy() into
> > > > > several copies, each of them staying within a single page (and calling
> > > > > phys2virt() for each page)? The same would have to be done when
> > > > > accessing the fields of the mbuf structure if it crosses a page
> > > > > boundary. Would that work? This
> > > >
> > > > If the above is the requirement, Does this logic need to be in slow
> > > > path or fast path?
> > 
> > In the fast path. But I don't think the performance impact would be significant.
> > 
> > Vamsi, do you confirm this approach could also solve the issue without
> > changing the mempool?
> 
> So far no performance impact has been observed when this feature (kni iova as va) is enabled with this patch set. I am not sure of the impact once the memcpy split and extra address translations are introduced; this approach might solve the issue, but I feel it's not a cleaner way of solving it.

Agreed, it's not the cleaner way, but maybe less risky.

As you can see, the reworked patchset is not that short:
http://inbox.dpdk.org/dev/20190719133845.32432-1-olivier.matz@6wind.com/T/#m6249311baae8469f5727f07c32e4e6e6844eae6a
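
For context, a minimal sketch (editor's illustration) of the copy-splitting
"plan B" discussed above: copy the mbuf data in pieces that never cross a
page boundary, translating each page separately. The helper name phys2virt()
is the one used in the discussion, and the signature below is an assumption,
not existing KNI code:

	/* sketch only; assumes pg_sz is a power of two */
	static void
	copy_split_by_page(void *dst, phys_addr_t src_pa, size_t len, size_t pg_sz)
	{
		size_t chunk;

		while (len > 0) {
			/* bytes remaining until the end of the current page */
			chunk = pg_sz - (src_pa & (pg_sz - 1));
			if (chunk > len)
				chunk = len;
			memcpy(dst, phys2virt(src_pa), chunk);
			dst = (char *)dst + chunk;
			src_pa += chunk;
			len -= chunk;
		}
	}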


> > 
> > > >
> > > > > could be a B plan.
> > > >
> > > > OK.
> > > >
> > > > >
> > > > > Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] ***Spam*** [RFC 0/4] mempool: avoid objects allocations across pages
  2019-08-07 15:21                         ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
@ 2019-10-28 14:06                           ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:06 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

Hi Andrew,

Better late than never, here are answers to your comments.
I'm sending a new version of this patchset that addresses
them.

On Wed, Aug 07, 2019 at 06:21:01PM +0300, Andrew Rybchenko wrote:
> On 7/19/19 4:38 PM, Olivier Matz wrote:
> > When IOVA mode is VA, a mempool can be created with objects that
> > are not physically contiguous, which breaks KNI.
> > 
> > To solve this, this patchset changes the default behavior of mempool
> > populate function, to prevent objects from being located across pages.
> 
> I'll provide top-level review notes on individual patches, but what
> I don't understand in general is why we add a rule to respect
> page boundaries in any case, even when it is not absolutely required.
> It may add holes. Can it have a negative impact on performance?

In terms of memory consumption, the amount of wasted space is not
significant as soon as hugepages are used. In terms of performance, I
don't foresee any change.

> I think that KNI VA-mode requirements are very specific.
> It is VA-mode, but page boundaries should be respected even
> if VA is contiguous.

Yes, but on the other hand, changing the behavior does not hurt the
other use-cases in my opinion. I mean, having the ability to allocate an
IOVA-contiguous area at once does not bring a big added value,
especially given that there is no guarantee that it will succeed, and we
can fall back to a chunk-based allocation.


Olivier


> 
> > Olivier Matz (4):
> >    mempool: clarify default populate function
> >    mempool: unalign size when calculating required mem amount
> >    mempool: introduce function to get mempool page size
> >    mempool: prevent objects from being across pages
> > 
> >   lib/librte_mempool/rte_mempool.c             | 106 +++++++++++----------------
> >   lib/librte_mempool/rte_mempool.h             |   8 +-
> >   lib/librte_mempool/rte_mempool_ops.c         |   4 +-
> >   lib/librte_mempool/rte_mempool_ops_default.c |  39 +++++++++-
> >   4 files changed, 90 insertions(+), 67 deletions(-)
> > 
> > ---
> > 
> > Hi,
> > 
> > > @Olivier,
> > > Any suggestions..?
> > I took some time to go a bit deeper. I still think we can change the
> > default behavior to avoid objects being located across pages. But
> > it is more complex than I expected.
> > 
> > I made a draft patchset, that, in short:
> > - cleans/renames variables
> > - removes the optimistic full iova contiguous allocation
> > - changes return value of calc_mem_size to return the unaligned size,
> >    therefore the allocation is smaller in case of big hugepages
> > - changes rte_mempool_op_populate_default() to prevent allocation
> >    of objects across multiple pages
> > 
> > Andrew, Anatoly, did I miss something?
> > Vamsi, can you check if it solves your issue?
> > 
> > Anyway, even if we validate the patchset and make it work, I'm afraid
> > this is not something that could go in 19.08.
> > 
> > The only alternative I see is a specific mempool allocation function
> > when used in iova=va mode + kni, as you proposed previously.
> > 
> > It can probably be implemented without adding a flag, starting from
> > rte_mempool_create(), and replacing rte_mempool_populate_default(mp) by
> > something else: allocate pages one by one, and call
> > rte_mempool_populate_iova() for each of them.
> > 
> > Hope it helps. Unfortunately, I may not have too much time to spend on
> > it in the coming days.
> > 
> > Regards,
> > Olivier
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size
  2019-08-07 15:21                           ` Andrew Rybchenko
@ 2019-10-28 14:06                             ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:06 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

On Wed, Aug 07, 2019 at 06:21:41PM +0300, Andrew Rybchenko wrote:
> On 7/19/19 4:38 PM, Olivier Matz wrote:
> > In rte_mempool_populate_default(), we determine the page size,
> > which is needed for calc_size and allocation of memory.
> > 
> > Move this in a function and export it, it will be used in next
> > commit.
> 
> The major change here is taking page sizes into account even in the
> case of RTE_IOVA_VA.

The patch should not change any functional thing. I've fixed it in new
version.

> As I understand, the main problem initially is
> that we should respect page boundaries even in the RTE_IOVA_VA case.
> It looks like we can have that even without removing the contiguous
> allocation attempt.

That's true, but what would be the advantage? Removing it makes
the default populate function simpler, and helps to shorten the
~50 lines of explanations in the comments :)


> 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >   lib/librte_mempool/rte_mempool.c | 50 +++++++++++++++++++++++++---------------
> >   lib/librte_mempool/rte_mempool.h |  6 +++++
> >   2 files changed, 37 insertions(+), 19 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 335032dc8..7def0ba68 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -414,6 +414,32 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
> >   	return ret;
> >   }
> > +int
> > +rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
> > +{
> > +	bool need_iova_contig_obj;
> > +	bool alloc_in_ext_mem;
> > +	int ret;
> > +
> > +	/* check if we can retrieve a valid socket ID */
> > +	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
> > +	if (ret < 0)
> > +		return -EINVAL;
> > +	alloc_in_ext_mem = (ret == 1);
> > +	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> > +
> > +	if (!need_iova_contig_obj)
> > +		*pg_sz = 0;
> > +	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > +		*pg_sz = get_min_page_size(mp->socket_id);
> > +	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> > +		*pg_sz = get_min_page_size(mp->socket_id);
> > +	else
> > +		*pg_sz = getpagesize();
> > +
> > +	return 0;
> > +}
> > +
> >   /* Default function to populate the mempool: allocate memory in memzones,
> >    * and populate them. Return the number of objects added, or a negative
> >    * value on error.
> > @@ -425,12 +451,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	char mz_name[RTE_MEMZONE_NAMESIZE];
> >   	const struct rte_memzone *mz;
> >   	ssize_t mem_size;
> > -	size_t align, pg_sz, pg_shift;
> > +	size_t align, pg_sz, pg_shift = 0;
> >   	rte_iova_t iova;
> >   	unsigned mz_id, n;
> >   	int ret;
> >   	bool need_iova_contig_obj;
> > -	bool alloc_in_ext_mem;
> >   	ret = mempool_ops_alloc_once(mp);
> >   	if (ret != 0)
> > @@ -482,26 +507,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	 * synonymous with IOVA contiguousness will not hold.
> >   	 */
> > -	/* check if we can retrieve a valid socket ID */
> > -	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
> > -	if (ret < 0)
> > -		return -EINVAL;
> > -	alloc_in_ext_mem = (ret == 1);
> >   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> > +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> > +	if (ret < 0)
> > +		return ret;
> > -	if (!need_iova_contig_obj) {
> > -		pg_sz = 0;
> > -		pg_shift = 0;
> > -	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
> > -		pg_sz = 0;
> > -		pg_shift = 0;
> > -	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
> > -		pg_sz = get_min_page_size(mp->socket_id);
> > -		pg_shift = rte_bsf32(pg_sz);
> > -	} else {
> > -		pg_sz = getpagesize();
> > +	if (pg_sz != 0)
> >   		pg_shift = rte_bsf32(pg_sz);
> > -	}
> >   	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
> >   		size_t min_chunk_size;
> > diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> > index 7bc10e699..00b927989 100644
> > --- a/lib/librte_mempool/rte_mempool.h
> > +++ b/lib/librte_mempool/rte_mempool.h
> > @@ -1692,6 +1692,12 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
> >   void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
> >   		      void *arg);
> > +/**
> > + * @internal Get page size used for mempool object allocation.
> > + */
> > +int
> > +rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
> > +
> >   #ifdef __cplusplus
> >   }
> >   #endif
> > 
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] ***Spam*** [RFC 2/4] mempool: unalign size when calculating required mem amount
  2019-08-07 15:21                           ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
@ 2019-10-28 14:06                             ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:06 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

On Wed, Aug 07, 2019 at 06:21:20PM +0300, Andrew Rybchenko wrote:
> On 7/19/19 4:38 PM, Olivier Matz wrote:
> > The size returned by rte_mempool_op_calc_mem_size_default() is aligned
> > to the specified page size. This means that with big pages, the returned
> > amount is more than what we really need to populate the mempool.
> > 
> > This problem is tempered by the allocation method of
> > rte_mempool_populate_default(): in some conditions (when
> > try_iova_contig_mempool=true), it first tries to allocate all objs
> > memory in an iova contiguous area, without the alignment constraint. If
> > it fails, it fallbacks to the big aligned allocation, that can also
> > fallback into several smaller allocations.
> > 
> > This commit changes rte_mempool_op_calc_mem_size_default() to return the
> > unaligned amount of memory (the alignment constraint is still returned
> > via the *align argument),
> 
> It does not, as far as I can see. There are no changes in
> lib/librte_mempool/rte_mempool_ops_default.c

True, sorry. This is fixed in the new version.

> > and removes the optimistic contiguous
> > allocation done when try_iova_contig_mempool=true.
> 
> Unfortunately I don't understand why these 2 changes are combined
> into one patch. The first looks good to me, the second is
> questionable. It can impact performance.

I split this change into 2 patches; that's indeed clearer.

However, I didn't get what kind of performance impact you expect.

> > This will make the amount of allocated memory more predictable: it will
> > be more than the optimistic contiguous allocation, but less than the big
> > aligned allocation.
> > 
> > This opens the door for the next commits that will try to prevent objects
> > from being located across pages.
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >   lib/librte_mempool/rte_mempool.c     | 44 ++++--------------------------------
> >   lib/librte_mempool/rte_mempool.h     |  2 +-
> >   lib/librte_mempool/rte_mempool_ops.c |  4 +++-
> >   3 files changed, 9 insertions(+), 41 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 0f29e8712..335032dc8 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -430,7 +430,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	unsigned mz_id, n;
> >   	int ret;
> >   	bool need_iova_contig_obj;
> > -	bool try_iova_contig_mempool;
> >   	bool alloc_in_ext_mem;
> >   	ret = mempool_ops_alloc_once(mp);
> > @@ -477,18 +476,10 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	 * wasting some space this way, but it's much nicer than looping around
> >   	 * trying to reserve each and every page size.
> >   	 *
> > -	 * However, since size calculation will produce page-aligned sizes, it
> > -	 * makes sense to first try and see if we can reserve the entire memzone
> > -	 * in one contiguous chunk as well (otherwise we might end up wasting a
> > -	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
> > -	 * memory, then we'll go and reserve space page-by-page.
> > -	 *
> >   	 * We also have to take into account the fact that memory that we're
> >   	 * going to allocate from can belong to an externally allocated memory
> >   	 * area, in which case the assumption of IOVA as VA mode being
> > -	 * synonymous with IOVA contiguousness will not hold. We should also try
> > -	 * to go for contiguous memory even if we're in no-huge mode, because
> > -	 * external memory may in fact be IOVA-contiguous.
> > +	 * synonymous with IOVA contiguousness will not hold.
> >   	 */
> >   	/* check if we can retrieve a valid socket ID */
> > @@ -497,7 +488,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   		return -EINVAL;
> >   	alloc_in_ext_mem = (ret == 1);
> >   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> > -	try_iova_contig_mempool = false;
> >   	if (!need_iova_contig_obj) {
> >   		pg_sz = 0;
> > @@ -506,7 +496,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   		pg_sz = 0;
> >   		pg_shift = 0;
> >   	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
> > -		try_iova_contig_mempool = true;
> >   		pg_sz = get_min_page_size(mp->socket_id);
> >   		pg_shift = rte_bsf32(pg_sz);
> >   	} else {
> > @@ -518,12 +507,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   		size_t min_chunk_size;
> >   		unsigned int flags;
> > -		if (try_iova_contig_mempool || pg_sz == 0)
> > -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> > -					0, &min_chunk_size, &align);
> > -		else
> > -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> > -					pg_shift, &min_chunk_size, &align);
> > +		mem_size = rte_mempool_ops_calc_mem_size(
> > +			mp, n, pg_shift, &min_chunk_size, &align);
> >   		if (mem_size < 0) {
> >   			ret = mem_size;
> > @@ -542,31 +527,12 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   		/* if we're trying to reserve contiguous memory, add appropriate
> >   		 * memzone flag.
> >   		 */
> > -		if (try_iova_contig_mempool)
> > +		if (min_chunk_size == (size_t)mem_size)
> >   			flags |= RTE_MEMZONE_IOVA_CONTIG;
> >   		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> >   				mp->socket_id, flags, align);
> > -		/* if we were trying to allocate contiguous memory, failed and
> > -		 * minimum required contiguous chunk fits minimum page, adjust
> > -		 * memzone size to the page size, and try again.
> > -		 */
> > -		if (mz == NULL && try_iova_contig_mempool &&
> > -				min_chunk_size <= pg_sz) {
> > -			try_iova_contig_mempool = false;
> > -			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
> > -
> > -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> > -					pg_shift, &min_chunk_size, &align);
> > -			if (mem_size < 0) {
> > -				ret = mem_size;
> > -				goto fail;
> > -			}
> > -
> > -			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> > -				mp->socket_id, flags, align);
> > -		}
> >   		/* don't try reserving with 0 size if we were asked to reserve
> >   		 * IOVA-contiguous memory.
> >   		 */
> > @@ -594,7 +560,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   		else
> >   			iova = RTE_BAD_IOVA;
> > -		if (try_iova_contig_mempool || pg_sz == 0)
> > +		if (pg_sz == 0)
> >   			ret = rte_mempool_populate_iova(mp, mz->addr,
> >   				iova, mz->len,
> >   				rte_mempool_memchunk_mz_free,
> > diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> > index 8053f7a04..7bc10e699 100644
> > --- a/lib/librte_mempool/rte_mempool.h
> > +++ b/lib/librte_mempool/rte_mempool.h
> > @@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
> >    * @param[out] align
> >    *   Location for required memory chunk alignment.
> >    * @return
> > - *   Required memory size aligned at page boundary.
> > + *   Required memory size.
> >    */
> >   typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
> >   		uint32_t obj_num,  uint32_t pg_shift,
> > diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
> > index e02eb702c..22c5251eb 100644
> > --- a/lib/librte_mempool/rte_mempool_ops.c
> > +++ b/lib/librte_mempool/rte_mempool_ops.c
> > @@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
> >   	return ops->get_count(mp);
> >   }
> > -/* wrapper to notify new memory area to external mempool */
> > +/* wrapper to calculate the memory size required to store given number
> > + * of objects
> > + */
> >   ssize_t
> >   rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
> >   				uint32_t obj_num, uint32_t pg_shift,
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-08-07 15:21                           ` Andrew Rybchenko
@ 2019-10-28 14:07                             ` Olivier Matz
  2019-10-29 11:03                               ` Andrew Rybchenko
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:07 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

On Wed, Aug 07, 2019 at 06:21:58PM +0300, Andrew Rybchenko wrote:
> On 7/19/19 4:38 PM, Olivier Matz wrote:
> > When using iova contiguous memory and objects smaller than page size,
> > ensure that objects are not located across several pages.
> 
> It looks like an attempt to make an exception a generic rule, and
> I think it is not a good idea.
> 
> mempool has a notion of IOVA contiguous/non-contiguous objects
> depending on whether PA or VA is used. rte_mempool_op_populate_default() gets
> a memory chunk which is contiguous in VA and, if the iova is not bad,
> IOVA-contiguous. The patch always enforces page boundaries even
> if it is not required. For example, if the memory chunk is IOVA_PA
> contiguous, the patch could result in holes and extra memory usage

Yes, it may increase memory usage, but the amount should be limited.
On the other hand, the new patchset provides enhancements that will
reduce the memory consumption.

More importantly, it will fix the KNI + IOVA=VA issue. I also wonder if
this problem couldn't happen in the IOVA=PA case. Are there any guarantees
that on all architectures a PA-contiguous area is always VA-contiguous in the
kernel?



> 
> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >   lib/librte_mempool/rte_mempool_ops_default.c | 39 ++++++++++++++++++++++++++--
> >   1 file changed, 37 insertions(+), 2 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > index 4e2bfc82d..2bbd67367 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -45,19 +45,54 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   	return mem_size;
> >   }
> > +/* Returns -1 if object falls on a page boundary, else returns 0 */
> > +static inline int
> > +mempool_check_obj_bounds(void *obj, uint64_t pg_sz, size_t elt_sz)
> > +{
> > +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> > +	uint32_t pg_shift;
> > +	uint64_t page_mask;
> > +
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +
> > +	pg_shift = rte_bsf32(pg_sz);
> > +	page_mask =  ~((1ull << pg_shift) - 1);
> > +	page_end = (elt_addr & page_mask) + pg_sz;
> > +
> > +	if (elt_addr + elt_sz > page_end)
> > +		return -1;
> > +
> > +	return 0;
> > +}
> > +
> >   int
> >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
> >   		void *vaddr, rte_iova_t iova, size_t len,
> >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >   {
> > -	size_t total_elt_sz;
> > +	size_t total_elt_sz, pg_sz;
> >   	size_t off;
> >   	unsigned int i;
> >   	void *obj;
> > +	rte_mempool_get_page_size(mp, &pg_sz);
> > +
> >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +	for (off = 0, i = 0; i < max_objs; i++) {
> > +		/* align offset to next page start if required */
> > +		if (mempool_check_obj_bounds((char *)vaddr + off,
> > +						pg_sz, total_elt_sz) < 0) {
> > +			off += RTE_PTR_ALIGN_CEIL((char *)vaddr + off, pg_sz) -
> > +				((char *)vaddr + off);
> > +		}
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> >   		off += mp->header_size;
> >   		obj = (char *)vaddr + off;
> >   		obj_cb(mp, obj_cb_arg, obj,
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-07-19 14:03                           ` Burakov, Anatoly
@ 2019-10-28 14:07                             ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-28 14:07 UTC (permalink / raw)
  To: Burakov, Anatoly
  Cc: Vamsi Krishna Attunuru, dev, Andrew Rybchenko, Thomas Monjalon,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

Hi Anatoly,

On Fri, Jul 19, 2019 at 03:03:29PM +0100, Burakov, Anatoly wrote:
> On 19-Jul-19 2:38 PM, Olivier Matz wrote:
> > When using iova contiguous memory and objects smaller than page size,
> > ensure that objects are not located across several pages.
> > 
> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >   lib/librte_mempool/rte_mempool_ops_default.c | 39 ++++++++++++++++++++++++++--
> >   1 file changed, 37 insertions(+), 2 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > index 4e2bfc82d..2bbd67367 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -45,19 +45,54 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   	return mem_size;
> >   }
> > +/* Returns -1 if object falls on a page boundary, else returns 0 */
> > +static inline int
> > +mempool_check_obj_bounds(void *obj, uint64_t pg_sz, size_t elt_sz)
> > +{
> > +	uintptr_t page_end, elt_addr = (uintptr_t)obj;
> > +	uint32_t pg_shift;
> > +	uint64_t page_mask;
> > +
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +
> > +	pg_shift = rte_bsf32(pg_sz);
> > +	page_mask =  ~((1ull << pg_shift) - 1);
> > +	page_end = (elt_addr & page_mask) + pg_sz;
> 
> This looks like RTE_PTR_ALIGN should do this without the magic? E.g.
> 
> page_end = RTE_PTR_ALIGN(elt_addr, pg_sz)
> 
> would that not be equivalent?

Yes, I simplified this part in the new version, thanks.

Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt Olivier Matz
@ 2019-10-29  9:02                             ` Vamsi Krishna Attunuru
  2019-10-29  9:13                               ` Olivier Matz
  2019-10-29  9:21                             ` [dpdk-dev] " Andrew Rybchenko
  1 sibling, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-29  9:02 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, dev

Hi Olivier,

Thanks for the patch set. I am able to run the tests with 512MB page size with this patch set on the Octeontx2 platform, but somehow the mbuf holds null fields when the pool is created with 2MB page size, and tests like l3fwd and kni fail due to the malformed mbufs. Can you confirm whether the patch set was verified on any platform with different page sizes? Meanwhile I will also debug this issue.

Regards
A Vamsi

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Monday, October 28, 2019 7:31 PM
> To: dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Subject: [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
> 
> External Email
> 
> ----------------------------------------------------------------------
> rte_mempool_populate_virt() currently requires that both addr and length are
> page-aligned.
> 
> Remove this unneeded constraint, which can be annoying with big hugepages (ex:
> 1GB).
> 
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  lib/librte_mempool/rte_mempool.c | 18 +++++++-----------
> lib/librte_mempool/rte_mempool.h |  3 +--
>  2 files changed, 8 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_mempool/rte_mempool.c
> b/lib/librte_mempool/rte_mempool.c
> index 0f29e8712..76cbacdf3 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool
> *mp, char *addr,
>  	size_t off, phys_len;
>  	int ret, cnt = 0;
> 
> -	/* address and len must be page-aligned */
> -	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
> -		return -EINVAL;
> -	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
> -		return -EINVAL;
> -
>  	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
>  		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
>  			len, free_cb, opaque);
> 
> -	for (off = 0; off + pg_sz <= len &&
> +	for (off = 0; off < len &&
>  		     mp->populated_size < mp->size; off += phys_len) {
> 
>  		iova = rte_mem_virt2iova(addr + off); @@ -389,7 +383,10 @@
> rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>  		}
> 
>  		/* populate with the largest group of contiguous pages */
> -		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
> +		for (phys_len = RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
> +			     (addr + off);
> +		     off + phys_len < len;
> +		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
>  			rte_iova_t iova_tmp;
> 
>  			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
> @@ -575,8 +572,7 @@ rte_mempool_populate_default(struct rte_mempool
> *mp)
>  			 * have
>  			 */
>  			mz = rte_memzone_reserve_aligned(mz_name, 0,
> -					mp->socket_id, flags,
> -					RTE_MAX(pg_sz, align));
> +					mp->socket_id, flags, align);
>  		}
>  		if (mz == NULL) {
>  			ret = -rte_errno;
> @@ -601,7 +597,7 @@ rte_mempool_populate_default(struct rte_mempool
> *mp)
>  				(void *)(uintptr_t)mz);
>  		else
>  			ret = rte_mempool_populate_virt(mp, mz->addr,
> -				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
> +				mz->len, pg_sz,
>  				rte_mempool_memchunk_mz_free,
>  				(void *)(uintptr_t)mz);
>  		if (ret < 0) {
> diff --git a/lib/librte_mempool/rte_mempool.h
> b/lib/librte_mempool/rte_mempool.h
> index 8053f7a04..0fe8aa7b8 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool
> *mp, char *vaddr,
>   *   A pointer to the mempool structure.
>   * @param addr
>   *   The virtual address of memory that should be used to store objects.
> - *   Must be page-aligned.
>   * @param len
> - *   The length of memory in bytes. Must be page-aligned.
> + *   The length of memory in bytes.
>   * @param pg_sz
>   *   The size of memory pages in this virtual area.
>   * @param free_cb
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
  2019-10-29  9:02                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-10-29  9:13                               ` Olivier Matz
  2019-10-29  9:18                                 ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-29  9:13 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, dev

Hi Vamsi,

On Tue, Oct 29, 2019 at 09:02:26AM +0000, Vamsi Krishna Attunuru wrote:
> Hi Olivier,
> 
> Thanks for patch set, able run the tests with 512MB page size with this patch
> set on Octeontx2 platform, somehow mbuf is holding null fields when pool is
> created with 2MB page size, tests like l3fwd, kni are failing due to the
> malformed mbufs. Can you confirm if the patch set was verified on any platform
> with different page sizes. Meanwhile I will also debug this issue.

Thank you for testing.
I tested the patch locally on x86 with 2MB huge pages, and using
travis-ci.

Maybe it is related to the octeontx2 mempool driver? There is a specific
condition to align the object start address to a multiple of total_elt_sz.
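
To illustrate the kind of condition meant here (editor's sketch only, not
the actual octeontx2 mempool code): a driver-specific populate callback may
round the starting offset up so that each object begins at an address that
is a multiple of total_elt_sz, for example:

	/* illustrative only: make the first object start at an address
	 * that is a multiple of total_elt_sz */
	off = RTE_ALIGN_CEIL((uintptr_t)vaddr, total_elt_sz) - (uintptr_t)vaddr;
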

Is it this specific patch that breaks your test? Or is it the full patchset?

Thanks,
Olivier



> 
> Regards
> A Vamsi
> 
> > -----Original Message-----
> > From: Olivier Matz <olivier.matz@6wind.com>
> > Sent: Monday, October 28, 2019 7:31 PM
> > To: dev@dpdk.org
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> > <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Subject: [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
> > 
> > External Email
> > 
> > ----------------------------------------------------------------------
> > rte_mempool_populate_virt() currently requires that both addr and length are
> > page-aligned.
> > 
> > Remove this uneeded constraint which can be annoying with big hugepages (ex:
> > 1GB).
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >  lib/librte_mempool/rte_mempool.c | 18 +++++++-----------
> > lib/librte_mempool/rte_mempool.h |  3 +--
> >  2 files changed, 8 insertions(+), 13 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index 0f29e8712..76cbacdf3 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool
> > *mp, char *addr,
> >  	size_t off, phys_len;
> >  	int ret, cnt = 0;
> > 
> > -	/* address and len must be page-aligned */
> > -	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
> > -		return -EINVAL;
> > -	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
> > -		return -EINVAL;
> > -
> >  	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
> >  		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
> >  			len, free_cb, opaque);
> > 
> > -	for (off = 0; off + pg_sz <= len &&
> > +	for (off = 0; off < len &&
> >  		     mp->populated_size < mp->size; off += phys_len) {
> > 
> >  		iova = rte_mem_virt2iova(addr + off); @@ -389,7 +383,10 @@
> > rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
> >  		}
> > 
> >  		/* populate with the largest group of contiguous pages */
> > -		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
> > +		for (phys_len = RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
> > +			     (addr + off);
> > +		     off + phys_len < len;
> > +		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
> >  			rte_iova_t iova_tmp;
> > 
> >  			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
> > @@ -575,8 +572,7 @@ rte_mempool_populate_default(struct rte_mempool
> > *mp)
> >  			 * have
> >  			 */
> >  			mz = rte_memzone_reserve_aligned(mz_name, 0,
> > -					mp->socket_id, flags,
> > -					RTE_MAX(pg_sz, align));
> > +					mp->socket_id, flags, align);
> >  		}
> >  		if (mz == NULL) {
> >  			ret = -rte_errno;
> > @@ -601,7 +597,7 @@ rte_mempool_populate_default(struct rte_mempool
> > *mp)
> >  				(void *)(uintptr_t)mz);
> >  		else
> >  			ret = rte_mempool_populate_virt(mp, mz->addr,
> > -				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
> > +				mz->len, pg_sz,
> >  				rte_mempool_memchunk_mz_free,
> >  				(void *)(uintptr_t)mz);
> >  		if (ret < 0) {
> > diff --git a/lib/librte_mempool/rte_mempool.h
> > b/lib/librte_mempool/rte_mempool.h
> > index 8053f7a04..0fe8aa7b8 100644
> > --- a/lib/librte_mempool/rte_mempool.h
> > +++ b/lib/librte_mempool/rte_mempool.h
> > @@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool
> > *mp, char *vaddr,
> >   *   A pointer to the mempool structure.
> >   * @param addr
> >   *   The virtual address of memory that should be used to store objects.
> > - *   Must be page-aligned.
> >   * @param len
> > - *   The length of memory in bytes. Must be page-aligned.
> > + *   The length of memory in bytes.
> >   * @param pg_sz
> >   *   The size of memory pages in this virtual area.
> >   * @param free_cb
> > --
> > 2.20.1
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
  2019-10-29  9:13                               ` Olivier Matz
@ 2019-10-29  9:18                                 ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-29  9:18 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, dev

Hi Olivier,

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Tuesday, October 29, 2019 2:43 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>; dev@dpdk.org
> Subject: Re: [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in populate
> virt
> 
> Hi Vamsi,
> 
> On Tue, Oct 29, 2019 at 09:02:26AM +0000, Vamsi Krishna Attunuru wrote:
> > Hi Olivier,
> >
> > Thanks for patch set, able run the tests with 512MB page size with
> > this patch set on Octeontx2 platform, somehow mbuf is holding null
> > fields when pool is created with 2MB page size, tests like l3fwd, kni
> > are failing due to the malformed mbufs. Can you confirm if the patch
> > set was verified on any platform with different page sizes. Meanwhile I will
> also debug this issue.
> 
> Thank you for testing.
> I tested the patch locally on x86 with 2MB huge pages, and using travis-ci.
> 
> Maybe it is related to the octeontx2 mempool driver? There is a specific
> condition to align the object start address to a multiple of total_elt_sz.
> 
> Is it this specific patch that breaks your test? Or is it the full patchset?

No, I am testing the full patchset. I will check with specific patches to narrow down the non-working case.

> 
> Thanks,
> Olivier
> 
> 
> 
> >
> > Regards
> > A Vamsi
> >
> > > -----Original Message-----
> > > From: Olivier Matz <olivier.matz@6wind.com>
> > > Sent: Monday, October 28, 2019 7:31 PM
> > > To: dev@dpdk.org
> > > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > > <arybchenko@solarflare.com>; Ferruh Yigit
> > > <ferruh.yigit@linux.intel.com>; Giridharan, Ganesan
> > > <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > > <kirankumark@marvell.com>; Stephen Hemminger
> > > <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
> > > Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Subject: [EXT] [PATCH 1/5] mempool: allow unaligned addr/len in
> > > populate virt
> > >
> > > External Email
> > >
> > > --------------------------------------------------------------------
> > > --
> > > rte_mempool_populate_virt() currently requires that both addr and
> > > length are page-aligned.
> > >
> > > Remove this uneeded constraint which can be annoying with big hugepages
> (ex:
> > > 1GB).
> > >
> > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > ---
> > >  lib/librte_mempool/rte_mempool.c | 18 +++++++-----------
> > > lib/librte_mempool/rte_mempool.h |  3 +--
> > >  2 files changed, 8 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/lib/librte_mempool/rte_mempool.c
> > > b/lib/librte_mempool/rte_mempool.c
> > > index 0f29e8712..76cbacdf3 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool
> > > *mp, char *addr,
> > >  	size_t off, phys_len;
> > >  	int ret, cnt = 0;
> > >
> > > -	/* address and len must be page-aligned */
> > > -	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
> > > -		return -EINVAL;
> > > -	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
> > > -		return -EINVAL;
> > > -
> > >  	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
> > >  		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
> > >  			len, free_cb, opaque);
> > >
> > > -	for (off = 0; off + pg_sz <= len &&
> > > +	for (off = 0; off < len &&
> > >  		     mp->populated_size < mp->size; off += phys_len) {
> > >
> > >  		iova = rte_mem_virt2iova(addr + off); @@ -389,7 +383,10 @@
> > > rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
> > >  		}
> > >
> > >  		/* populate with the largest group of contiguous pages */
> > > -		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
> > > +		for (phys_len = RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
> > > +			     (addr + off);
> > > +		     off + phys_len < len;
> > > +		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
> > >  			rte_iova_t iova_tmp;
> > >
> > >  			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
> @@ -575,8
> > > +572,7 @@ rte_mempool_populate_default(struct rte_mempool
> > > *mp)
> > >  			 * have
> > >  			 */
> > >  			mz = rte_memzone_reserve_aligned(mz_name, 0,
> > > -					mp->socket_id, flags,
> > > -					RTE_MAX(pg_sz, align));
> > > +					mp->socket_id, flags, align);
> > >  		}
> > >  		if (mz == NULL) {
> > >  			ret = -rte_errno;
> > > @@ -601,7 +597,7 @@ rte_mempool_populate_default(struct
> rte_mempool
> > > *mp)
> > >  				(void *)(uintptr_t)mz);
> > >  		else
> > >  			ret = rte_mempool_populate_virt(mp, mz->addr,
> > > -				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
> > > +				mz->len, pg_sz,
> > >  				rte_mempool_memchunk_mz_free,
> > >  				(void *)(uintptr_t)mz);
> > >  		if (ret < 0) {
> > > diff --git a/lib/librte_mempool/rte_mempool.h
> > > b/lib/librte_mempool/rte_mempool.h
> > > index 8053f7a04..0fe8aa7b8 100644
> > > --- a/lib/librte_mempool/rte_mempool.h
> > > +++ b/lib/librte_mempool/rte_mempool.h
> > > @@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct
> > > rte_mempool *mp, char *vaddr,
> > >   *   A pointer to the mempool structure.
> > >   * @param addr
> > >   *   The virtual address of memory that should be used to store objects.
> > > - *   Must be page-aligned.
> > >   * @param len
> > > - *   The length of memory in bytes. Must be page-aligned.
> > > + *   The length of memory in bytes.
> > >   * @param pg_sz
> > >   *   The size of memory pages in this virtual area.
> > >   * @param free_cb
> > > --
> > > 2.20.1
> >

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt Olivier Matz
  2019-10-29  9:02                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-10-29  9:21                             ` Andrew Rybchenko
  2019-10-29 17:02                               ` Olivier Matz
  1 sibling, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-29  9:21 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/28/19 5:01 PM, Olivier Matz wrote:
> rte_mempool_populate_virt() currently requires that both addr
> and length are page-aligned.
>
> Remove this uneeded constraint which can be annoying with big
> hugepages (ex: 1GB).
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

One note below, other than that
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>

> ---
>   lib/librte_mempool/rte_mempool.c | 18 +++++++-----------
>   lib/librte_mempool/rte_mempool.h |  3 +--
>   2 files changed, 8 insertions(+), 13 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 0f29e8712..76cbacdf3 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   	size_t off, phys_len;
>   	int ret, cnt = 0;
>   
> -	/* address and len must be page-aligned */
> -	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
> -		return -EINVAL;
> -	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
> -		return -EINVAL;
> -
>   	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
>   		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
>   			len, free_cb, opaque);
>   
> -	for (off = 0; off + pg_sz <= len &&
> +	for (off = 0; off < len &&
>   		     mp->populated_size < mp->size; off += phys_len) {
>   
>   		iova = rte_mem_virt2iova(addr + off);
> @@ -389,7 +383,10 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
>   		}
>   
>   		/* populate with the largest group of contiguous pages */
> -		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
> +		for (phys_len = RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
> +			     (addr + off);
> +		     off + phys_len < len;

If the condition is false on the first check, below we'll populate memory
outside of the specified length. So we should either apply MIN above when
phys_len is initialized, or drop MIN in the next line and instead apply
MIN when rte_mempool_populate_iova() is called.
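
For example (hypothetical numbers, just to illustrate the concern): with
pg_sz = 4096, a page-aligned addr and len = 100, the initial phys_len is
4096, the loop condition is false right away, and we fall through with
phys_len still equal to 4096:

	pg_sz = 4096, addr page-aligned, off = 0, len = 100
	initial phys_len = RTE_PTR_ALIGN_CEIL(addr + 1, pg_sz) - addr = 4096
	loop check: off (0) + phys_len (4096) < len (100)  ->  false
	-> rte_mempool_populate_iova() below would be called with
	   phys_len = 4096, i.e. 3996 bytes past the requested length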

Bonus question not directly related to the patch is iova == RTE_BAD_IOVA
case when !rte_eal_has_hugepages():
Is it expected that we still use arithmetic iova + phys_len in this case?
I guess comparison will always be false and pages never merged, but it looks
suspicious anyway.

> +		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
>   			rte_iova_t iova_tmp;
>   
>   			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
> @@ -575,8 +572,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   			 * have
>   			 */
>   			mz = rte_memzone_reserve_aligned(mz_name, 0,
> -					mp->socket_id, flags,
> -					RTE_MAX(pg_sz, align));
> +					mp->socket_id, flags, align);
>   		}
>   		if (mz == NULL) {
>   			ret = -rte_errno;
> @@ -601,7 +597,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   				(void *)(uintptr_t)mz);
>   		else
>   			ret = rte_mempool_populate_virt(mp, mz->addr,
> -				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
> +				mz->len, pg_sz,
>   				rte_mempool_memchunk_mz_free,
>   				(void *)(uintptr_t)mz);
>   		if (ret < 0) {
> diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> index 8053f7a04..0fe8aa7b8 100644
> --- a/lib/librte_mempool/rte_mempool.h
> +++ b/lib/librte_mempool/rte_mempool.h
> @@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
>    *   A pointer to the mempool structure.
>    * @param addr
>    *   The virtual address of memory that should be used to store objects.
> - *   Must be page-aligned.
>    * @param len
> - *   The length of memory in bytes. Must be page-aligned.
> + *   The length of memory in bytes.
>    * @param pg_sz
>    *   The size of memory pages in this virtual area.
>    * @param free_cb


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate Olivier Matz
@ 2019-10-29 10:09                             ` Andrew Rybchenko
  2019-10-29 17:09                               ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-29 10:09 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/28/19 5:01 PM, Olivier Matz wrote:
> The size returned by rte_mempool_op_calc_mem_size_default() is aligned
> to the specified page size. Therefore, with big pages, the returned size
> can be much more that what we really need to populate the mempool.
>
> For instance, populating a mempool that requires 1.1GB of memory with
> 1GB hugepages can result in allocating 2GB of memory.
>
> This problem is hidden most of the time due to the allocation method of
> rte_mempool_populate_default(): when try_iova_contig_mempool=true, it
> first tries to allocate an iova contiguous area, without the alignment
> constraint. If it fails, it fallbacks to an aligned allocation that does
> not require to be iova-contiguous. This can also fallback into several
> smaller aligned allocations.
>
> This commit changes rte_mempool_op_calc_mem_size_default() to relax the
> alignment constraint to a cache line and to return a smaller size.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

One maybe-unrelated question below

Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>

[snip]

> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index 4e2bfc82d..f6aea7662 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -12,7 +12,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   				     size_t *min_chunk_size, size_t *align)
>   {
>   	size_t total_elt_sz;
> -	size_t obj_per_page, pg_num, pg_sz;
> +	size_t obj_per_page, pg_sz, objs_in_last_page;
>   	size_t mem_size;
>   
>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> @@ -33,14 +33,30 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   			mem_size =
>   				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
>   		} else {
> -			pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
> -			mem_size = pg_num << pg_shift;
> +			/* In the best case, the allocator will return a
> +			 * page-aligned address. For example, with 5 objs,
> +			 * the required space is as below:
> +			 *  |     page0     |     page1     |  page2 (last) |
> +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> +			 *  <------------- mem_size ------------->
> +			 */
> +			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
> +			/* room required for the last page */
> +			mem_size = objs_in_last_page * total_elt_sz;
> +			/* room required for other pages */
> +			mem_size += ((obj_num - objs_in_last_page) /
> +				obj_per_page) << pg_shift;
> +
> +			/* In the worst case, the allocator returns a
> +			 * non-aligned pointer, wasting up to
> +			 * total_elt_sz. Add a margin for that.
> +			 */
> +			 mem_size += total_elt_sz - 1;
>   		}
>   	}
>   
> -	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
> -
> -	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
> +	*min_chunk_size = total_elt_sz;
> +	*align = RTE_CACHE_LINE_SIZE;

Not directly related to the patch, but maybe RTE_MEMPOOL_ALIGN should be used?

>   
>   	return mem_size;
>   }


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
@ 2019-10-29 10:25                             ` Andrew Rybchenko
  2019-10-29 17:20                               ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-29 10:25 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/28/19 5:01 PM, Olivier Matz wrote:
> The previous commit reduced the amount of required memory when
> populating the mempool with non iova-contiguous memory.
>
> Since there is no big advantage to have a fully iova-contiguous mempool
> if it is not explicitly asked, remove this code, it simplifies the
> populate function.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

One comment below, other than that
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>

> ---
>   lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
>   1 file changed, 8 insertions(+), 39 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 3275e48c9..5e1f202dc 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c

[snip]

> @@ -531,36 +521,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   			goto fail;
>   		}
>   
> -		flags = mz_flags;
> -
>   		/* if we're trying to reserve contiguous memory, add appropriate
>   		 * memzone flag.
>   		 */
> -		if (try_iova_contig_mempool)
> -			flags |= RTE_MEMZONE_IOVA_CONTIG;
> +		if (min_chunk_size == (size_t)mem_size)
> +			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
>   
>   		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> -				mp->socket_id, flags, align);
> -
> -		/* if we were trying to allocate contiguous memory, failed and
> -		 * minimum required contiguous chunk fits minimum page, adjust
> -		 * memzone size to the page size, and try again.
> -		 */
> -		if (mz == NULL && try_iova_contig_mempool &&
> -				min_chunk_size <= pg_sz) {
> -			try_iova_contig_mempool = false;
> -			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
> -
> -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> -					pg_shift, &min_chunk_size, &align);
> -			if (mem_size < 0) {
> -				ret = mem_size;
> -				goto fail;
> -			}
> +				mp->socket_id, mz_flags, align);
>   
> -			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> -				mp->socket_id, flags, align);
> -		}
>   		/* don't try reserving with 0 size if we were asked to reserve
>   		 * IOVA-contiguous memory.
>   		 */

[snip]

> @@ -587,7 +556,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   		else
>   			iova = RTE_BAD_IOVA;
>   
> -		if (try_iova_contig_mempool || pg_sz == 0)
> +		if (pg_sz == 0)

I think (mz_flags & RTE_MEMZONE_IOVA_CONTIG) is lost here.

>   			ret = rte_mempool_populate_iova(mp, mz->addr,
>   				iova, mz->len,
>   				rte_mempool_memchunk_mz_free,


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-10-29 10:31                             ` Andrew Rybchenko
  2019-10-29 17:20                               ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-29 10:31 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/28/19 5:01 PM, Olivier Matz wrote:
> In rte_mempool_populate_default(), we determine the page size,
> which is needed for calc_size and allocation of memory.
>
> Move this in a function and export it, it will be used in next
> commit.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

One question below:
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>

[snip]

> diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
> index 17cbca460..4eff2767d 100644
> --- a/lib/librte_mempool/rte_mempool_version.map
> +++ b/lib/librte_mempool/rte_mempool_version.map
> @@ -56,5 +56,6 @@ DPDK_18.05 {
>   EXPERIMENTAL {
>   	global:
>   
> +	rte_mempool_get_page_size;
>   	rte_mempool_ops_get_info;
>   };

Should internal function be here?


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] ***Spam*** [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages Olivier Matz
@ 2019-10-29 10:59                             ` Andrew Rybchenko
  2019-10-29 17:34                               ` Olivier Matz
  2019-10-29 17:25                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  1 sibling, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-29 10:59 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/28/19 5:01 PM, Olivier Matz wrote:
> When populating a mempool, ensure that objects are not located across
> several pages, except if user did not request iova contiguous objects.

I think it breaks distribution across memory channels which could
affect performance significantly.

> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
>   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++--
>   2 files changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 7664764e5..b23fd1b06 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
>   
>   	if (!need_iova_contig_obj)
>   		*pg_sz = 0;
> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> -		*pg_sz = 0;
>   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>   		*pg_sz = get_min_page_size(mp->socket_id);
>   	else
> @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	 * then just set page shift and page size to 0, because the user has
>   	 * indicated that there's no need to care about anything.
>   	 *
> -	 * if we do need contiguous objects, there is also an option to reserve
> -	 * the entire mempool memory as one contiguous block of memory, in
> -	 * which case the page shift and alignment wouldn't matter as well.
> +	 * if we do need contiguous objects (if a mempool driver has its
> +	 * own calc_size() method returning min_chunk_size = mem_size),
> +	 * there is also an option to reserve the entire mempool memory
> +	 * as one contiguous block of memory.
>   	 *
>   	 * if we require contiguous objects, but not necessarily the entire
> -	 * mempool reserved space to be contiguous, then there are two options.
> -	 *
> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> -	 * case), then no page shift needed - our memory allocation will give us
> -	 * contiguous IO memory as far as the hardware is concerned, so
> -	 * act as if we're getting contiguous memory.
> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> +	 * and the default ops->populate() will take care of not placing
> +	 * objects across pages.
>   	 *
>   	 * if our IO addresses are physical, we may get memory from bigger
>   	 * pages, or we might get memory from smaller pages, and how much of it
> @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>   	 *
>   	 * If we fail to get enough contiguous memory, then we'll go and
>   	 * reserve space in smaller chunks.
> -	 *
> -	 * We also have to take into account the fact that memory that we're
> -	 * going to allocate from can belong to an externally allocated memory
> -	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold.
>   	 */
>   
>   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> index f6aea7662..dd09a0a32 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
>   	return mem_size;
>   }
>   
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int
> +check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
> +{
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> +		return -1;
> +	return 0;
> +}
> +
>   int
>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
>   		void *vaddr, rte_iova_t iova, size_t len,
>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>   {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>   	size_t off;
>   	unsigned int i;
>   	void *obj;
>   
> +	rte_mempool_get_page_size(mp, &pg_sz);
> +

The function may return an error which should be taken into account here.

>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>   
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* align offset to next page start if required */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>   		off += mp->header_size;
> -		obj = (char *)vaddr + off;
> +		obj = va + off;
>   		obj_cb(mp, obj_cb_arg, obj,
>   		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>   		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages
  2019-10-28 14:07                             ` Olivier Matz
@ 2019-10-29 11:03                               ` Andrew Rybchenko
  0 siblings, 0 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-29 11:03 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Vamsi Krishna Attunuru, dev, Thomas Monjalon, Anatoly Burakov,
	Jerin Jacob Kollanukkaran, Kokkilagadda, Ferruh Yigit

On 10/28/19 5:07 PM, Olivier Matz wrote:
> On Wed, Aug 07, 2019 at 06:21:58PM +0300, Andrew Rybchenko wrote:
>> On 7/19/19 4:38 PM, Olivier Matz wrote:
>>> When using iova contiguous memory and objets smaller than page size,
>>> ensure that objects are not located across several pages.
>> It looks like as an attempt to make exception a generic rule and
>> I think it is not a good idea.
>>
>> mempool has a notion of IOVA contiguous/non-contiguous objects
>> depending if PA or VA. rte_mempool_op_populate_default() gets
>> a memory chunk which is contiguous in VA and, if iova is not bad,
>> IOVA-contiguous. The patch always enforces page boundaries even
>> if it is not required. For example, if memory chunk is IOVA_PA
>> contiguous, the patch could result in holes and extra memory usage
> Yes, it may increase memory usage, but the amount should be limited.
> On the other hand, the new patchset provides enhancements that will
> reduce the memory consumption.
>
> More importantly, it will fix the KNI + IOVA=VA issue. I also wonder if
> this problem couldn't happen in case IOVA=PA. Are there any guarantees
> that on all architectures a PA-contiguous in always VA-contiguous in the
> kernel?

I have no idea. The question is really interesting.
I hope it is true, otherwise we're in trouble since it is
assumed in the code as far as I know.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt
  2019-10-29  9:21                             ` [dpdk-dev] " Andrew Rybchenko
@ 2019-10-29 17:02                               ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-29 17:02 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

Hi Andrew,

On Tue, Oct 29, 2019 at 12:21:27PM +0300, Andrew Rybchenko wrote:
> On 10/28/19 5:01 PM, Olivier Matz wrote:
> > rte_mempool_populate_virt() currently requires that both addr
> > and length are page-aligned.
> > 
> > Remove this uneeded constraint which can be annoying with big
> > hugepages (ex: 1GB).
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> 
> One note below, other than that
> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> 
> > ---
> >   lib/librte_mempool/rte_mempool.c | 18 +++++++-----------
> >   lib/librte_mempool/rte_mempool.h |  3 +--
> >   2 files changed, 8 insertions(+), 13 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 0f29e8712..76cbacdf3 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
> >   	size_t off, phys_len;
> >   	int ret, cnt = 0;
> > -	/* address and len must be page-aligned */
> > -	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
> > -		return -EINVAL;
> > -	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
> > -		return -EINVAL;
> > -
> >   	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
> >   		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
> >   			len, free_cb, opaque);
> > -	for (off = 0; off + pg_sz <= len &&
> > +	for (off = 0; off < len &&
> >   		     mp->populated_size < mp->size; off += phys_len) {
> >   		iova = rte_mem_virt2iova(addr + off);
> > @@ -389,7 +383,10 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
> >   		}
> >   		/* populate with the largest group of contiguous pages */
> > -		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
> > +		for (phys_len = RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
> > +			     (addr + off);
> > +		     off + phys_len < len;
> 
> If the condition is false on the first check, below we'll populate memory
> outside of specified length. So, we should either apply MIN above when
> phys_len
> is initialized or drop MIN in the next line, but apply MIN when
> rte_mempool_populate_iova() is called.

Correct, I'll fix it by adding a RTE_MIN(). Maybe I'll just move it
outside of the for loop.
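
Something along these lines (just a sketch of what I have in mind, not
the final patch; the loop body is unchanged):

	/* populate with the largest group of contiguous pages */
	for (phys_len = RTE_MIN(
			(size_t)(RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
				(addr + off)),
			len - off);
	     off + phys_len < len;
	     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
		rte_iova_t iova_tmp;

		iova_tmp = rte_mem_virt2iova(addr + off + phys_len);

		if (iova_tmp != iova + phys_len)
			break;
	}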

> Bonus question not directly related to the patch is iova == RTE_BAD_IOVA
> case when !rte_eal_has_hugepages():
> Is it expected that we still use arithmetic iova + phys_len in this case?
> I guess comparison will always be false and pages never merged, but it looks
> suspicious anyway.

Yes, this is not very elegant.
I can change the test to
	if (iova_tmp == RTE_BAD_IOVA || iova_tmp != iova + phys_len)

> 
> > +		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
> >   			rte_iova_t iova_tmp;
> >   			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
> > @@ -575,8 +572,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   			 * have
> >   			 */
> >   			mz = rte_memzone_reserve_aligned(mz_name, 0,
> > -					mp->socket_id, flags,
> > -					RTE_MAX(pg_sz, align));
> > +					mp->socket_id, flags, align);
> >   		}
> >   		if (mz == NULL) {
> >   			ret = -rte_errno;
> > @@ -601,7 +597,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   				(void *)(uintptr_t)mz);
> >   		else
> >   			ret = rte_mempool_populate_virt(mp, mz->addr,
> > -				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
> > +				mz->len, pg_sz,
> >   				rte_mempool_memchunk_mz_free,
> >   				(void *)(uintptr_t)mz);
> >   		if (ret < 0) {
> > diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
> > index 8053f7a04..0fe8aa7b8 100644
> > --- a/lib/librte_mempool/rte_mempool.h
> > +++ b/lib/librte_mempool/rte_mempool.h
> > @@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
> >    *   A pointer to the mempool structure.
> >    * @param addr
> >    *   The virtual address of memory that should be used to store objects.
> > - *   Must be page-aligned.
> >    * @param len
> > - *   The length of memory in bytes. Must be page-aligned.
> > + *   The length of memory in bytes.
> >    * @param pg_sz
> >    *   The size of memory pages in this virtual area.
> >    * @param free_cb
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate
  2019-10-29 10:09                             ` Andrew Rybchenko
@ 2019-10-29 17:09                               ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-29 17:09 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Tue, Oct 29, 2019 at 01:09:01PM +0300, Andrew Rybchenko wrote:
> On 10/28/19 5:01 PM, Olivier Matz wrote:
> > The size returned by rte_mempool_op_calc_mem_size_default() is aligned
> > to the specified page size. Therefore, with big pages, the returned size
> > can be much more that what we really need to populate the mempool.
> > 
> > For instance, populating a mempool that requires 1.1GB of memory with
> > 1GB hugepages can result in allocating 2GB of memory.
> > 
> > This problem is hidden most of the time due to the allocation method of
> > rte_mempool_populate_default(): when try_iova_contig_mempool=true, it
> > first tries to allocate an iova contiguous area, without the alignment
> > constraint. If it fails, it fallbacks to an aligned allocation that does
> > not require to be iova-contiguous. This can also fallback into several
> > smaller aligned allocations.
> > 
> > This commit changes rte_mempool_op_calc_mem_size_default() to relax the
> > alignment constraint to a cache line and to return a smaller size.
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> 
> One may be unrelated questions below
> 
> Reviewed-by: Andrew Rybdhenko <arybchenko@solarflare.com>
> 
> [snip]
> 
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > index 4e2bfc82d..f6aea7662 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -12,7 +12,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   				     size_t *min_chunk_size, size_t *align)
> >   {
> >   	size_t total_elt_sz;
> > -	size_t obj_per_page, pg_num, pg_sz;
> > +	size_t obj_per_page, pg_sz, objs_in_last_page;
> >   	size_t mem_size;
> >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > @@ -33,14 +33,30 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   			mem_size =
> >   				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
> >   		} else {
> > -			pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
> > -			mem_size = pg_num << pg_shift;
> > +			/* In the best case, the allocator will return a
> > +			 * page-aligned address. For example, with 5 objs,
> > +			 * the required space is as below:
> > +			 *  |     page0     |     page1     |  page2 (last) |
> > +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> > +			 *  <------------- mem_size ------------->
> > +			 */
> > +			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
> > +			/* room required for the last page */
> > +			mem_size = objs_in_last_page * total_elt_sz;
> > +			/* room required for other pages */
> > +			mem_size += ((obj_num - objs_in_last_page) /
> > +				obj_per_page) << pg_shift;
> > +
> > +			/* In the worst case, the allocator returns a
> > +			 * non-aligned pointer, wasting up to
> > +			 * total_elt_sz. Add a margin for that.
> > +			 */
> > +			 mem_size += total_elt_sz - 1;
> >   		}
> >   	}
> > -	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
> > -
> > -	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
> > +	*min_chunk_size = total_elt_sz;
> > +	*align = RTE_CACHE_LINE_SIZE;
> 
> Not directly related to the patch, but may be RTE_MEMPOOL_ALIGN should be
> used?

Yes, and there is another one that could be changed in
rte_mempool_populate_iova().

I can add a patch for that.
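
i.e. something like this in rte_mempool_op_calc_mem_size_default()
(sketch only; RTE_MEMPOOL_ALIGN is the existing define in rte_mempool.h,
today an alias for RTE_CACHE_LINE_SIZE):

	*min_chunk_size = total_elt_sz;
	*align = RTE_MEMPOOL_ALIGN;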

> 
> >   	return mem_size;
> >   }
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-29 10:25                             ` Andrew Rybchenko
@ 2019-10-29 17:20                               ` Olivier Matz
  2019-10-30  7:36                                 ` Andrew Rybchenko
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-29 17:20 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Tue, Oct 29, 2019 at 01:25:10PM +0300, Andrew Rybchenko wrote:
> On 10/28/19 5:01 PM, Olivier Matz wrote:
> > The previous commit reduced the amount of required memory when
> > populating the mempool with non iova-contiguous memory.
> > 
> > Since there is no big advantage to have a fully iova-contiguous mempool
> > if it is not explicitly asked, remove this code, it simplifies the
> > populate function.
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> 
> One comment below, other than that
> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> 
> > ---
> >   lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
> >   1 file changed, 8 insertions(+), 39 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 3275e48c9..5e1f202dc 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> 
> [snip]
> 
> > @@ -531,36 +521,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   			goto fail;
> >   		}
> > -		flags = mz_flags;
> > -
> >   		/* if we're trying to reserve contiguous memory, add appropriate
> >   		 * memzone flag.
> >   		 */
> > -		if (try_iova_contig_mempool)
> > -			flags |= RTE_MEMZONE_IOVA_CONTIG;
> > +		if (min_chunk_size == (size_t)mem_size)
> > +			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
> >   		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> > -				mp->socket_id, flags, align);
> > -
> > -		/* if we were trying to allocate contiguous memory, failed and
> > -		 * minimum required contiguous chunk fits minimum page, adjust
> > -		 * memzone size to the page size, and try again.
> > -		 */
> > -		if (mz == NULL && try_iova_contig_mempool &&
> > -				min_chunk_size <= pg_sz) {
> > -			try_iova_contig_mempool = false;
> > -			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
> > -
> > -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> > -					pg_shift, &min_chunk_size, &align);
> > -			if (mem_size < 0) {
> > -				ret = mem_size;
> > -				goto fail;
> > -			}
> > +				mp->socket_id, mz_flags, align);
> > -			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> > -				mp->socket_id, flags, align);
> > -		}
> >   		/* don't try reserving with 0 size if we were asked to reserve
> >   		 * IOVA-contiguous memory.
> >   		 */
> 
> [snip]
> 
> > @@ -587,7 +556,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   		else
> >   			iova = RTE_BAD_IOVA;
> > -		if (try_iova_contig_mempool || pg_sz == 0)
> > +		if (pg_sz == 0)
> 
> I think (mz_flags & RTE_MEMZONE_IOVA_CONTIG) is lost here.

Do you mean we should do instead
	if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
		populate_iova()
	else
		populate_virt()
?

I would say yes, but it should be removed in patch 5/5.
In the end, we don't want objects to be across pages, even
with RTE_MEMZONE_IOVA_CONTIG.
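
Spelled out, the dispatch I have in mind looks roughly like this (sketch
only, with the flag check kept until patch 5/5 drops it):

	if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
		ret = rte_mempool_populate_iova(mp, mz->addr,
			iova, mz->len,
			rte_mempool_memchunk_mz_free,
			(void *)(uintptr_t)mz);
	else
		ret = rte_mempool_populate_virt(mp, mz->addr,
			mz->len, pg_sz,
			rte_mempool_memchunk_mz_free,
			(void *)(uintptr_t)mz);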

> 
> >   			ret = rte_mempool_populate_iova(mp, mz->addr,
> >   				iova, mz->len,
> >   				rte_mempool_memchunk_mz_free,
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size
  2019-10-29 10:31                             ` Andrew Rybchenko
@ 2019-10-29 17:20                               ` Olivier Matz
  2019-10-30  8:32                                 ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-29 17:20 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Tue, Oct 29, 2019 at 01:31:22PM +0300, Andrew Rybchenko wrote:
> On 10/28/19 5:01 PM, Olivier Matz wrote:
> > In rte_mempool_populate_default(), we determine the page size,
> > which is needed for calc_size and allocation of memory.
> > 
> > Move this in a function and export it, it will be used in next
> > commit.
> > 
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> 
> One question below:
> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> 
> [snip]
> 
> > diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
> > index 17cbca460..4eff2767d 100644
> > --- a/lib/librte_mempool/rte_mempool_version.map
> > +++ b/lib/librte_mempool/rte_mempool_version.map
> > @@ -56,5 +56,6 @@ DPDK_18.05 {
> >   EXPERIMENTAL {
> >   	global:
> > +	rte_mempool_get_page_size;
> >   	rte_mempool_ops_get_info;
> >   };
> 
> Should internal function be here?
> 

Good question. Let me ask a friend ;)

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-28 14:01                           ` [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages Olivier Matz
  2019-10-29 10:59                             ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
@ 2019-10-29 17:25                             ` Vamsi Krishna Attunuru
  2019-10-30  3:55                               ` Vamsi Krishna Attunuru
  2019-10-30  7:46                               ` Andrew Rybchenko
  1 sibling, 2 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-29 17:25 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon

Hi Olivier,

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Monday, October 28, 2019 7:31 PM
> To: dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; Stephen Hemminger
> <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
> Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> pages
> 
> External Email
> 
> ----------------------------------------------------------------------
> When populating a mempool, ensure that objects are not located across
> several pages, except if user did not request iova contiguous objects.
> 
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
>  lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
> -
>  2 files changed, 33 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/librte_mempool/rte_mempool.c
> b/lib/librte_mempool/rte_mempool.c
> index 7664764e5..b23fd1b06 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
> *mp, size_t *pg_sz)
> 
>  	if (!need_iova_contig_obj)
>  		*pg_sz = 0;
> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
> RTE_IOVA_VA)
> -		*pg_sz = 0;
>  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>  		*pg_sz = get_min_page_size(mp->socket_id);
>  	else
> @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
>  	 * then just set page shift and page size to 0, because the user has
>  	 * indicated that there's no need to care about anything.
>  	 *
> -	 * if we do need contiguous objects, there is also an option to reserve
> -	 * the entire mempool memory as one contiguous block of memory,
> in
> -	 * which case the page shift and alignment wouldn't matter as well.
> +	 * if we do need contiguous objects (if a mempool driver has its
> +	 * own calc_size() method returning min_chunk_size = mem_size),
> +	 * there is also an option to reserve the entire mempool memory
> +	 * as one contiguous block of memory.
>  	 *
>  	 * if we require contiguous objects, but not necessarily the entire
> -	 * mempool reserved space to be contiguous, then there are two
> options.
> -	 *
> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> -	 * case), then no page shift needed - our memory allocation will give
> us
> -	 * contiguous IO memory as far as the hardware is concerned, so
> -	 * act as if we're getting contiguous memory.
> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> +	 * and the default ops->populate() will take care of not placing
> +	 * objects across pages.
>  	 *
>  	 * if our IO addresses are physical, we may get memory from bigger
>  	 * pages, or we might get memory from smaller pages, and how
> much of it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
>  	 *
>  	 * If we fail to get enough contiguous memory, then we'll go and
>  	 * reserve space in smaller chunks.
> -	 *
> -	 * We also have to take into account the fact that memory that we're
> -	 * going to allocate from can belong to an externally allocated
> memory
> -	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold.
>  	 */
> 
>  	need_iova_contig_obj = !(mp->flags &
> MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> a/lib/librte_mempool/rte_mempool_ops_default.c
> b/lib/librte_mempool/rte_mempool_ops_default.c
> index f6aea7662..dd09a0a32 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
> struct rte_mempool *mp,
>  	return mem_size;
>  }
> 
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> pg_sz))
> +		return -1;
> +	return 0;
> +}
> +
>  int
>  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> max_objs,
>  		void *vaddr, rte_iova_t iova, size_t len,
>  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>  	size_t off;
>  	unsigned int i;
>  	void *obj;
> 
> +	rte_mempool_get_page_size(mp, &pg_sz);
> +
>  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> 
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* align offset to next page start if required */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> off);

Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool does not align with the octeontx2 mempool's buffer alignment requirement (the buffer address needs to be a multiple of the buffer size).

> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>  		off += mp->header_size;
> -		obj = (char *)vaddr + off;
> +		obj = va + off;
>  		obj_cb(mp, obj_cb_arg, obj,
>  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] ***Spam*** [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-29 10:59                             ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
@ 2019-10-29 17:34                               ` Olivier Matz
  2019-10-30  7:56                                 ` [dpdk-dev] " Andrew Rybchenko
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-29 17:34 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Tue, Oct 29, 2019 at 01:59:00PM +0300, Andrew Rybchenko wrote:
> On 10/28/19 5:01 PM, Olivier Matz wrote:
> > When populating a mempool, ensure that objects are not located across
> > several pages, except if user did not request iova contiguous objects.
> 
> I think it breaks distribution across memory channels which could
> affect performance significantly.

With 2M hugepages, there are ~900 mbufs per page, and all of them will
be distributed across memory channels. For larger objects, I don't think
the distribution is that important.
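
(Rough arithmetic behind the ~900 figure, assuming the default mbuf
sizing: a cache-line sized mempool object header, no private area, and
the default data room + headroom; exact numbers depend on the build:

	total_elt_sz ~= 64 + 128 (struct rte_mbuf) + 128 (headroom) + 2048
	             ~= 2368 bytes
	2 MiB / 2368 bytes ~= 885 objects per 2M page)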

With small pages, that may be true. I think the problem was already
there except in IOVA=VA mode.

This should be fixable, but I'm not sure a use-case where we can see a
regression really exists.


> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
> >   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++--
> >   2 files changed, 33 insertions(+), 19 deletions(-)
> > 
> > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 7664764e5..b23fd1b06 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
> >   	if (!need_iova_contig_obj)
> >   		*pg_sz = 0;
> > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > -		*pg_sz = 0;
> >   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> >   		*pg_sz = get_min_page_size(mp->socket_id);
> >   	else
> > @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	 * then just set page shift and page size to 0, because the user has
> >   	 * indicated that there's no need to care about anything.
> >   	 *
> > -	 * if we do need contiguous objects, there is also an option to reserve
> > -	 * the entire mempool memory as one contiguous block of memory, in
> > -	 * which case the page shift and alignment wouldn't matter as well.
> > +	 * if we do need contiguous objects (if a mempool driver has its
> > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > +	 * there is also an option to reserve the entire mempool memory
> > +	 * as one contiguous block of memory.
> >   	 *
> >   	 * if we require contiguous objects, but not necessarily the entire
> > -	 * mempool reserved space to be contiguous, then there are two options.
> > -	 *
> > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > -	 * case), then no page shift needed - our memory allocation will give us
> > -	 * contiguous IO memory as far as the hardware is concerned, so
> > -	 * act as if we're getting contiguous memory.
> > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > +	 * and the default ops->populate() will take care of not placing
> > +	 * objects across pages.
> >   	 *
> >   	 * if our IO addresses are physical, we may get memory from bigger
> >   	 * pages, or we might get memory from smaller pages, and how much of it
> > @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> >   	 *
> >   	 * If we fail to get enough contiguous memory, then we'll go and
> >   	 * reserve space in smaller chunks.
> > -	 *
> > -	 * We also have to take into account the fact that memory that we're
> > -	 * going to allocate from can belong to an externally allocated memory
> > -	 * area, in which case the assumption of IOVA as VA mode being
> > -	 * synonymous with IOVA contiguousness will not hold.
> >   	 */
> >   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > index f6aea7662..dd09a0a32 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> >   	return mem_size;
> >   }
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int
> > +check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
> > +{
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> > +		return -1;
> > +	return 0;
> > +}
> > +
> >   int
> >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
> >   		void *vaddr, rte_iova_t iova, size_t len,
> >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >   {
> > -	size_t total_elt_sz;
> > +	char *va = vaddr;
> > +	size_t total_elt_sz, pg_sz;
> >   	size_t off;
> >   	unsigned int i;
> >   	void *obj;
> > +	rte_mempool_get_page_size(mp, &pg_sz);
> > +
> 
> The function may return an error which should be taken into account here.

That would be better, indeed.
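
Something like this at the top of rte_mempool_op_populate_default()
(sketch, assuming the helper keeps returning a negative value on error):

	int ret;

	ret = rte_mempool_get_page_size(mp, &pg_sz);
	if (ret < 0)
		return ret;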


> >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +	for (off = 0, i = 0; i < max_objs; i++) {
> > +		/* align offset to next page start if required */
> > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> >   		off += mp->header_size;
> > -		obj = (char *)vaddr + off;
> > +		obj = va + off;
> >   		obj_cb(mp, obj_cb_arg, obj,
> >   		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> >   		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> 

Thanks for all the comments!
I will send a v2 tomorrow.

Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-29 17:25                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-10-30  3:55                               ` Vamsi Krishna Attunuru
  2019-10-30  7:46                               ` Andrew Rybchenko
  1 sibling, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-30  3:55 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon

Hi Olivier,

> -----Original Message-----
> From: Vamsi Krishna Attunuru
> Sent: Tuesday, October 29, 2019 10:55 PM
> To: Olivier Matz <olivier.matz@6wind.com>; dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: RE: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> pages
> 
> Hi Olivier,
> 
> > -----Original Message-----
> > From: Olivier Matz <olivier.matz@6wind.com>
> > Sent: Monday, October 28, 2019 7:31 PM
> > To: dev@dpdk.org
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit
> > <ferruh.yigit@linux.intel.com>; Giridharan, Ganesan
> > <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > <kirankumark@marvell.com>; Stephen Hemminger
> <sthemmin@microsoft.com>;
> > Thomas Monjalon <thomas@monjalon.net>; Vamsi Krishna Attunuru
> > <vattunuru@marvell.com>
> > Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> > pages
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > When populating a mempool, ensure that objects are not located across
> > several pages, except if user did not request iova contiguous objects.
> >
> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >  lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
> >  lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
> > -
> >  2 files changed, 33 insertions(+), 19 deletions(-)
> >
> > diff --git a/lib/librte_mempool/rte_mempool.c
> > b/lib/librte_mempool/rte_mempool.c
> > index 7664764e5..b23fd1b06 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
> *mp,
> > size_t *pg_sz)
> >
> >  	if (!need_iova_contig_obj)
> >  		*pg_sz = 0;
> > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
> > RTE_IOVA_VA)
> > -		*pg_sz = 0;
> >  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> >  		*pg_sz = get_min_page_size(mp->socket_id);
> >  	else
> > @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> >  	 * then just set page shift and page size to 0, because the user has
> >  	 * indicated that there's no need to care about anything.
> >  	 *
> > -	 * if we do need contiguous objects, there is also an option to reserve
> > -	 * the entire mempool memory as one contiguous block of memory,
> > in
> > -	 * which case the page shift and alignment wouldn't matter as well.
> > +	 * if we do need contiguous objects (if a mempool driver has its
> > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > +	 * there is also an option to reserve the entire mempool memory
> > +	 * as one contiguous block of memory.
> >  	 *
> >  	 * if we require contiguous objects, but not necessarily the entire
> > -	 * mempool reserved space to be contiguous, then there are two
> > options.
> > -	 *
> > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > -	 * case), then no page shift needed - our memory allocation will give
> > us
> > -	 * contiguous IO memory as far as the hardware is concerned, so
> > -	 * act as if we're getting contiguous memory.
> > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > +	 * and the default ops->populate() will take care of not placing
> > +	 * objects across pages.
> >  	 *
> >  	 * if our IO addresses are physical, we may get memory from bigger
> >  	 * pages, or we might get memory from smaller pages, and how much
> of
> > it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> >  	 *
> >  	 * If we fail to get enough contiguous memory, then we'll go and
> >  	 * reserve space in smaller chunks.
> > -	 *
> > -	 * We also have to take into account the fact that memory that we're
> > -	 * going to allocate from can belong to an externally allocated
> > memory
> > -	 * area, in which case the assumption of IOVA as VA mode being
> > -	 * synonymous with IOVA contiguousness will not hold.
> >  	 */
> >
> >  	need_iova_contig_obj = !(mp->flags &
> MEMPOOL_F_NO_IOVA_CONTIG); diff
> > --git a/lib/librte_mempool/rte_mempool_ops_default.c
> > b/lib/librte_mempool/rte_mempool_ops_default.c
> > index f6aea7662..dd09a0a32 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
> > struct rte_mempool *mp,
> >  	return mem_size;
> >  }
> >
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > pg_sz))
> > +		return -1;
> > +	return 0;
> > +}
> > +
> >  int
> >  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > max_objs,
> >  		void *vaddr, rte_iova_t iova, size_t len,
> >  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg) {
> > -	size_t total_elt_sz;
> > +	char *va = vaddr;
> > +	size_t total_elt_sz, pg_sz;
> >  	size_t off;
> >  	unsigned int i;
> >  	void *obj;
> >
> > +	rte_mempool_get_page_size(mp, &pg_sz);
> > +
> >  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >
> > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +	for (off = 0, i = 0; i < max_objs; i++) {
> > +		/* align offset to next page start if required */
> > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > off);
> 
> Moving the offset to the start of the next page and then freeing (vaddr + off +
> header_size) to the pool, this scheme does not align with the octeontx2 mempool's
> buffer alignment requirement (buffer address needs to be a multiple of buffer size).

Earlier there was a flag to align the object to the total object size; it was later deprecated (see the link below) after the calc min chunk size callback was added. In this new patch (5/5), while moving to the next page, the object address is aligned to pg_sz. But the octeontx2 mempool hardware requires that object addresses freed to it be aligned to the total object size. We need this alignment requirement fulfilled to make it work with the HW mempool.

https://git.dpdk.org/dpdk/commit/lib/librte_mempool/rte_mempool.c?id=ce1f2c61ed135e4133d0429e86e554bfd4d58cb0
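
To make the constraint concrete, here is a rough sketch (a hypothetical
helper, not the actual octeontx2 driver code) of how a populate loop could
pick object offsets so that each object both stays within one page and
starts at a multiple of the total object size. It assumes a power-of-two
page size and ignores object sizes that do not pack evenly into a page:

#include <stddef.h>
#include <stdint.h>

static size_t
next_obj_offset(uintptr_t va, size_t off, size_t pg_sz, size_t total_elt_sz)
{
        uintptr_t obj = va + off;

        /* the object start address must be a multiple of the object size */
        if (obj % total_elt_sz != 0)
                obj += total_elt_sz - obj % total_elt_sz;

        /* if the object would now cross a page boundary, move to the next
         * page start and re-apply the size alignment
         */
        if (pg_sz != 0) {
                uintptr_t pg_mask = ~((uintptr_t)pg_sz - 1);

                if ((obj & pg_mask) != ((obj + total_elt_sz - 1) & pg_mask)) {
                        obj = (obj + pg_sz - 1) & pg_mask;
                        if (obj % total_elt_sz != 0)
                                obj += total_elt_sz - obj % total_elt_sz;
                }
        }

        return (size_t)(obj - va);
}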



> 
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> >  		off += mp->header_size;
> > -		obj = (char *)vaddr + off;
> > +		obj = va + off;
> >  		obj_cb(mp, obj_cb_arg, obj,
> >  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> >  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > --
> > 2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-29 17:20                               ` Olivier Matz
@ 2019-10-30  7:36                                 ` Andrew Rybchenko
  2019-10-30  7:44                                   ` Andrew Rybchenko
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-30  7:36 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/29/19 8:20 PM, Olivier Matz wrote:
> On Tue, Oct 29, 2019 at 01:25:10PM +0300, Andrew Rybchenko wrote:
>> On 10/28/19 5:01 PM, Olivier Matz wrote:
>>> The previous commit reduced the amount of required memory when
>>> populating the mempool with non iova-contiguous memory.
>>>
>>> Since there is no big advantage to have a fully iova-contiguous mempool
>>> if it is not explicitly asked, remove this code, it simplifies the
>>> populate function.
>>>
>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>> One comment below, other than that
>> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
>>
>>> ---
>>>    lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
>>>    1 file changed, 8 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
>>> index 3275e48c9..5e1f202dc 100644
>>> --- a/lib/librte_mempool/rte_mempool.c
>>> +++ b/lib/librte_mempool/rte_mempool.c
>> [snip]
>>
>>> @@ -531,36 +521,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>>>    			goto fail;
>>>    		}
>>> -		flags = mz_flags;
>>> -
>>>    		/* if we're trying to reserve contiguous memory, add appropriate
>>>    		 * memzone flag.
>>>    		 */
>>> -		if (try_iova_contig_mempool)
>>> -			flags |= RTE_MEMZONE_IOVA_CONTIG;
>>> +		if (min_chunk_size == (size_t)mem_size)
>>> +			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
>>>    		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>>> -				mp->socket_id, flags, align);
>>> -
>>> -		/* if we were trying to allocate contiguous memory, failed and
>>> -		 * minimum required contiguous chunk fits minimum page, adjust
>>> -		 * memzone size to the page size, and try again.
>>> -		 */
>>> -		if (mz == NULL && try_iova_contig_mempool &&
>>> -				min_chunk_size <= pg_sz) {
>>> -			try_iova_contig_mempool = false;
>>> -			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
>>> -
>>> -			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
>>> -					pg_shift, &min_chunk_size, &align);
>>> -			if (mem_size < 0) {
>>> -				ret = mem_size;
>>> -				goto fail;
>>> -			}
>>> +				mp->socket_id, mz_flags, align);
>>> -			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>>> -				mp->socket_id, flags, align);
>>> -		}
>>>    		/* don't try reserving with 0 size if we were asked to reserve
>>>    		 * IOVA-contiguous memory.
>>>    		 */
>> [snip]
>>
>>> @@ -587,7 +556,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
>>>    		else
>>>    			iova = RTE_BAD_IOVA;
>>> -		if (try_iova_contig_mempool || pg_sz == 0)
>>> +		if (pg_sz == 0)
>> I think (mz_flags & RTE_MEMZONE_IOVA_CONTIG) is lost here.
> Do you mean we should do instead
> 	if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
> 		populate_iova()
> 	else
> 		populate_virt()
> ?

Yes.

> I would say yes, but it should be removed in patch 5/5.
> At the end, we don't want objects to be across pages, even
> with RTE_MEMZONE_IOVA_CONTIG.
>
>>>    			ret = rte_mempool_populate_iova(mp, mz->addr,
>>>    				iova, mz->len,
>>>    				rte_mempool_memchunk_mz_free,


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-30  7:36                                 ` Andrew Rybchenko
@ 2019-10-30  7:44                                   ` Andrew Rybchenko
  2019-10-30 10:38                                     ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-30  7:44 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/30/19 10:36 AM, Andrew Rybchenko wrote:
> On 10/29/19 8:20 PM, Olivier Matz wrote:
>> On Tue, Oct 29, 2019 at 01:25:10PM +0300, Andrew Rybchenko wrote:
>>> On 10/28/19 5:01 PM, Olivier Matz wrote:
>>>> The previous commit reduced the amount of required memory when
>>>> populating the mempool with non iova-contiguous memory.
>>>>
>>>> Since there is no big advantage to have a fully iova-contiguous 
>>>> mempool
>>>> if it is not explicitly asked, remove this code, it simplifies the
>>>> populate function.
>>>>
>>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>>> One comment below, other than that
>>> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
>>>
>>>> ---
>>>>    lib/librte_mempool/rte_mempool.c | 47 
>>>> ++++++--------------------------
>>>>    1 file changed, 8 insertions(+), 39 deletions(-)
>>>>
>>>> diff --git a/lib/librte_mempool/rte_mempool.c 
>>>> b/lib/librte_mempool/rte_mempool.c
>>>> index 3275e48c9..5e1f202dc 100644
>>>> --- a/lib/librte_mempool/rte_mempool.c
>>>> +++ b/lib/librte_mempool/rte_mempool.c
>>> [snip]
>>>
>>>> @@ -531,36 +521,15 @@ rte_mempool_populate_default(struct 
>>>> rte_mempool *mp)
>>>>                goto fail;
>>>>            }
>>>> -        flags = mz_flags;
>>>> -
>>>>            /* if we're trying to reserve contiguous memory, add 
>>>> appropriate
>>>>             * memzone flag.
>>>>             */
>>>> -        if (try_iova_contig_mempool)
>>>> -            flags |= RTE_MEMZONE_IOVA_CONTIG;
>>>> +        if (min_chunk_size == (size_t)mem_size)
>>>> +            mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
>>>>            mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>>>> -                mp->socket_id, flags, align);
>>>> -
>>>> -        /* if we were trying to allocate contiguous memory, failed 
>>>> and
>>>> -         * minimum required contiguous chunk fits minimum page, 
>>>> adjust
>>>> -         * memzone size to the page size, and try again.
>>>> -         */
>>>> -        if (mz == NULL && try_iova_contig_mempool &&
>>>> -                min_chunk_size <= pg_sz) {
>>>> -            try_iova_contig_mempool = false;
>>>> -            flags &= ~RTE_MEMZONE_IOVA_CONTIG;
>>>> -
>>>> -            mem_size = rte_mempool_ops_calc_mem_size(mp, n,
>>>> -                    pg_shift, &min_chunk_size, &align);
>>>> -            if (mem_size < 0) {
>>>> -                ret = mem_size;
>>>> -                goto fail;
>>>> -            }
>>>> +                mp->socket_id, mz_flags, align);
>>>> -            mz = rte_memzone_reserve_aligned(mz_name, mem_size,
>>>> -                mp->socket_id, flags, align);
>>>> -        }
>>>>            /* don't try reserving with 0 size if we were asked to 
>>>> reserve
>>>>             * IOVA-contiguous memory.
>>>>             */
>>> [snip]
>>>
>>>> @@ -587,7 +556,7 @@ rte_mempool_populate_default(struct rte_mempool 
>>>> *mp)
>>>>            else
>>>>                iova = RTE_BAD_IOVA;
>>>> -        if (try_iova_contig_mempool || pg_sz == 0)
>>>> +        if (pg_sz == 0)
>>> I think (mz_flags & RTE_MEMZONE_IOVA_CONTIG) is lost here.
>> Do you mean we should do instead
>>     if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
>>         populate_iova()
>>     else
>>         populate_virt()
>> ?
>
> Yes.
>
>> I would say yes, but it should be removed in patch 5/5.
>> At the end, we don't want objects to be across pages, even
>> with RTE_MEMZONE_IOVA_CONTIG.

Thinking more about it, I don't understand why. If we know that
the memzone is IOVA contiguous, why should we not populate
it this way? It does not prevent patch 5/5 from doing the job.

>>>>                ret = rte_mempool_populate_iova(mp, mz->addr,
>>>>                    iova, mz->len,
>>>>                    rte_mempool_memchunk_mz_free,


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-29 17:25                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-10-30  3:55                               ` Vamsi Krishna Attunuru
@ 2019-10-30  7:46                               ` Andrew Rybchenko
  2019-10-30  8:38                                 ` Jerin Jacob
  2019-10-30  8:42                                 ` Olivier Matz
  1 sibling, 2 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-30  7:46 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon

On 10/29/19 8:25 PM, Vamsi Krishna Attunuru wrote:
> Hi Olivier,
>
>> -----Original Message-----
>> From: Olivier Matz <olivier.matz@6wind.com>
>> Sent: Monday, October 28, 2019 7:31 PM
>> To: dev@dpdk.org
>> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
>> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
>> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
>> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
>> <kirankumark@marvell.com>; Stephen Hemminger
>> <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
>> Vamsi Krishna Attunuru <vattunuru@marvell.com>
>> Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
>> pages
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> When populating a mempool, ensure that objects are not located across
>> several pages, except if user did not request iova contiguous objects.
>>
>> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>> ---
>>   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
>>   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
>> -
>>   2 files changed, 33 insertions(+), 19 deletions(-)
>>
>> diff --git a/lib/librte_mempool/rte_mempool.c
>> b/lib/librte_mempool/rte_mempool.c
>> index 7664764e5..b23fd1b06 100644
>> --- a/lib/librte_mempool/rte_mempool.c
>> +++ b/lib/librte_mempool/rte_mempool.c
>> @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
>> *mp, size_t *pg_sz)
>>
>>   	if (!need_iova_contig_obj)
>>   		*pg_sz = 0;
>> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
>> RTE_IOVA_VA)
>> -		*pg_sz = 0;
>>   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>>   		*pg_sz = get_min_page_size(mp->socket_id);
>>   	else
>> @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
>> rte_mempool *mp)
>>   	 * then just set page shift and page size to 0, because the user has
>>   	 * indicated that there's no need to care about anything.
>>   	 *
>> -	 * if we do need contiguous objects, there is also an option to reserve
>> -	 * the entire mempool memory as one contiguous block of memory,
>> in
>> -	 * which case the page shift and alignment wouldn't matter as well.
>> +	 * if we do need contiguous objects (if a mempool driver has its
>> +	 * own calc_size() method returning min_chunk_size = mem_size),
>> +	 * there is also an option to reserve the entire mempool memory
>> +	 * as one contiguous block of memory.
>>   	 *
>>   	 * if we require contiguous objects, but not necessarily the entire
>> -	 * mempool reserved space to be contiguous, then there are two
>> options.
>> -	 *
>> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
>> -	 * case), then no page shift needed - our memory allocation will give
>> us
>> -	 * contiguous IO memory as far as the hardware is concerned, so
>> -	 * act as if we're getting contiguous memory.
>> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
>> +	 * and the default ops->populate() will take care of not placing
>> +	 * objects across pages.
>>   	 *
>>   	 * if our IO addresses are physical, we may get memory from bigger
>>   	 * pages, or we might get memory from smaller pages, and how
>> much of it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
>> rte_mempool *mp)
>>   	 *
>>   	 * If we fail to get enough contiguous memory, then we'll go and
>>   	 * reserve space in smaller chunks.
>> -	 *
>> -	 * We also have to take into account the fact that memory that we're
>> -	 * going to allocate from can belong to an externally allocated
>> memory
>> -	 * area, in which case the assumption of IOVA as VA mode being
>> -	 * synonymous with IOVA contiguousness will not hold.
>>   	 */
>>
>>   	need_iova_contig_obj = !(mp->flags &
>> MEMPOOL_F_NO_IOVA_CONTIG); diff --git
>> a/lib/librte_mempool/rte_mempool_ops_default.c
>> b/lib/librte_mempool/rte_mempool_ops_default.c
>> index f6aea7662..dd09a0a32 100644
>> --- a/lib/librte_mempool/rte_mempool_ops_default.c
>> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
>> @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
>> struct rte_mempool *mp,
>>   	return mem_size;
>>   }
>>
>> +/* Returns -1 if object crosses a page boundary, else returns 0 */
>> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
>> +	if (pg_sz == 0)
>> +		return 0;
>> +	if (elt_sz > pg_sz)
>> +		return 0;
>> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
>> pg_sz))
>> +		return -1;
>> +	return 0;
>> +}
>> +
>>   int
>>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
>> max_objs,
>>   		void *vaddr, rte_iova_t iova, size_t len,
>>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>> {
>> -	size_t total_elt_sz;
>> +	char *va = vaddr;
>> +	size_t total_elt_sz, pg_sz;
>>   	size_t off;
>>   	unsigned int i;
>>   	void *obj;
>>
>> +	rte_mempool_get_page_size(mp, &pg_sz);
>> +
>>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>>
>> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
>> +	for (off = 0, i = 0; i < max_objs; i++) {
>> +		/* align offset to next page start if required */
>> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
>> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
>> off);
> Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool, this scheme does not align with the octeontx2 mempool's buffer alignment requirement (buffer address needs to be a multiple of buffer size).

It sounds like the octeontx2 mempool should have its own populate callback
which takes care of it.
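
For reference, a minimal sketch of that mechanism (illustrative names, not
the actual octeontx2 code): the driver supplies its own populate callback
in its rte_mempool_ops and can still reuse the default helper for the
common part:

#include <rte_mempool.h>

static int
otx2_like_populate(struct rte_mempool *mp, unsigned int max_objs,
                void *vaddr, rte_iova_t iova, size_t len,
                rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
{
        /* a real driver would lay out objects according to its own
         * alignment rules here; this sketch simply defers to the
         * default helper
         */
        return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
                        len, obj_cb, obj_cb_arg);
}

/* in the driver's ops table, only this field would change:
 *
 *      static struct rte_mempool_ops otx2_like_ops = {
 *              ...
 *              .populate = otx2_like_populate,
 *      };
 *      MEMPOOL_REGISTER_OPS(otx2_like_ops);
 */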


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-29 17:34                               ` Olivier Matz
@ 2019-10-30  7:56                                 ` Andrew Rybchenko
  0 siblings, 0 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-30  7:56 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/29/19 8:34 PM, Olivier Matz wrote:
> On Tue, Oct 29, 2019 at 01:59:00PM +0300, Andrew Rybchenko wrote:
>> On 10/28/19 5:01 PM, Olivier Matz wrote:
>>> When populating a mempool, ensure that objects are not located across
>>> several pages, except if user did not request iova contiguous objects.
>> I think it breaks distribution across memory channels which could
>> affect performance significantly.
> With 2M hugepages, there are ~900 mbufs per page, and all of them will
> be distributed across memory channels. For larger objects, I don't think
> the distribution is that important.

Yes, that's true. I was thinking about the case when the page is smaller.

> With small pages, that may be true. I think the problem was already
> there except in IOVA=VA mode.

I don't know any good real examples. In theory, before this change, even
small pages could be merged together and populated as one big
chunk. Right now merging does not make sense since page boundaries
will be respected by default anyway.

> This should be fixable, but I'm not sure a use-case where we can see a
> regression really exists.

Agreed. If the total object size allows it, it is possible to spread
the offset from the page boundary.

I don't know either whether it is critical or not.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size
  2019-10-29 17:20                               ` Olivier Matz
@ 2019-10-30  8:32                                 ` Olivier Matz
  2019-10-30 14:29                                   ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-30  8:32 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Tue, Oct 29, 2019 at 06:20:47PM +0100, Olivier Matz wrote:
> On Tue, Oct 29, 2019 at 01:31:22PM +0300, Andrew Rybchenko wrote:
> > On 10/28/19 5:01 PM, Olivier Matz wrote:
> > > In rte_mempool_populate_default(), we determine the page size,
> > > which is needed for calc_size and allocation of memory.
> > > 
> > > Move this in a function and export it, it will be used in next
> > > commit.
> > > 
> > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > 
> > One question below:
> > Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> > 
> > [snip]
> > 
> > > diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
> > > index 17cbca460..4eff2767d 100644
> > > --- a/lib/librte_mempool/rte_mempool_version.map
> > > +++ b/lib/librte_mempool/rte_mempool_version.map
> > > @@ -56,5 +56,6 @@ DPDK_18.05 {
> > >   EXPERIMENTAL {
> > >   	global:
> > > +	rte_mempool_get_page_size;
> > >   	rte_mempool_ops_get_info;
> > >   };
> > 
> > Should internal function be here?
> > 
> 
> Good question. Let me ask a friend ;)

I was influenced by a warning saying "rte_mempool_get_page_size is
flagged as experimental but is not listed in version map", but actually
it should not be flagged as experimental. I'll remove both.

My friend also suggested adding it to a private header, which is a
good idea, but I think it should be done in another patch because there
are already several functions in this situation.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-30  7:46                               ` Andrew Rybchenko
@ 2019-10-30  8:38                                 ` Jerin Jacob
  2019-10-30 14:33                                   ` Olivier Matz
  2019-10-30  8:42                                 ` Olivier Matz
  1 sibling, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-10-30  8:38 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, Olivier Matz, dev, Anatoly Burakov,
	Ferruh Yigit, Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

On Wed, Oct 30, 2019 at 1:16 PM Andrew Rybchenko
<arybchenko@solarflare.com> wrote:
>

> >>   int
> >>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> >> max_objs,
> >>              void *vaddr, rte_iova_t iova, size_t len,
> >>              rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> >> {
> >> -    size_t total_elt_sz;
> >> +    char *va = vaddr;
> >> +    size_t total_elt_sz, pg_sz;
> >>      size_t off;
> >>      unsigned int i;
> >>      void *obj;
> >>
> >> +    rte_mempool_get_page_size(mp, &pg_sz);
> >> +
> >>      total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >>
> >> -    for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> >> +    for (off = 0, i = 0; i < max_objs; i++) {
> >> +            /* align offset to next page start if required */
> >> +            if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> >> +                    off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> >> off);
> > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool, this scheme does not align with the octeontx2 mempool's buffer alignment requirement (buffer address needs to be a multiple of buffer size).
>
> It sounds like the octeontx2 mempool should have its own populate callback
> which takes care of it.

A driver-specific populate function is not a bad idea. The only concerns
would be:

# We need to duplicate rte_mempool_op_populate_default() and
rte_mempool_op_calc_mem_size_default()
# We need to make sure that if someone changes
rte_mempool_op_populate_default() or
rte_mempool_op_calc_mem_size_default(), then he/she needs to update the
drivers too

# I would like to add one more point here:
- The calculation of object pad requirements for MEMPOOL_F_NO_SPREAD, i.e.
optimize_object_size(), is NOT GENERIC, i.e. the get_gcd() based logic is
not generic. The DDR controller defines the address-to-DDR-channel
"spread", and it will be based on the SoC or micro-architecture.
So we need to consider that as well.
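
To illustrate the last point for readers not familiar with that code path,
a simplified sketch of the gcd-based padding (the concept only, not a copy
of optimize_object_size()): the per-object size is padded in cache-line
units until it shares no common factor with nb_channels * nb_ranks, so
that consecutive objects start on different channels. This only helps on
targets whose DDR interleaving actually follows that model:

#include <stddef.h>

static size_t
gcd_sz(size_t a, size_t b)
{
        while (b != 0) {
                size_t t = a % b;
                a = b;
                b = t;
        }
        return a;
}

/* pad obj_size (in bytes) so that consecutive objects are spread over
 * nchan * nrank memory channels/ranks; cache_line is the padding unit
 */
static size_t
spread_obj_size(size_t obj_size, size_t nchan, size_t nrank, size_t cache_line)
{
        size_t lines = (obj_size + cache_line - 1) / cache_line;

        while (gcd_sz(lines, nchan * nrank) != 1)
                lines++;

        return lines * cache_line;
}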

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-30  7:46                               ` Andrew Rybchenko
  2019-10-30  8:38                                 ` Jerin Jacob
@ 2019-10-30  8:42                                 ` Olivier Matz
  1 sibling, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30  8:42 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Anatoly Burakov, Ferruh Yigit,
	Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

On Wed, Oct 30, 2019 at 10:46:31AM +0300, Andrew Rybchenko wrote:
> On 10/29/19 8:25 PM, Vamsi Krishna Attunuru wrote:
> > Hi Olivier,
> > 
> > > -----Original Message-----
> > > From: Olivier Matz <olivier.matz@6wind.com>
> > > Sent: Monday, October 28, 2019 7:31 PM
> > > To: dev@dpdk.org
> > > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > > <kirankumark@marvell.com>; Stephen Hemminger
> > > <sthemmin@microsoft.com>; Thomas Monjalon <thomas@monjalon.net>;
> > > Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Subject: [EXT] [PATCH 5/5] mempool: prevent objects from being across
> > > pages
> > > 
> > > External Email
> > > 
> > > ----------------------------------------------------------------------
> > > When populating a mempool, ensure that objects are not located across
> > > several pages, except if user did not request iova contiguous objects.
> > > 
> > > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > ---
> > >   lib/librte_mempool/rte_mempool.c             | 23 +++++-----------
> > >   lib/librte_mempool/rte_mempool_ops_default.c | 29 ++++++++++++++++++-
> > > -
> > >   2 files changed, 33 insertions(+), 19 deletions(-)
> > > 
> > > diff --git a/lib/librte_mempool/rte_mempool.c
> > > b/lib/librte_mempool/rte_mempool.c
> > > index 7664764e5..b23fd1b06 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -428,8 +428,6 @@ rte_mempool_get_page_size(struct rte_mempool
> > > *mp, size_t *pg_sz)
> > > 
> > >   	if (!need_iova_contig_obj)
> > >   		*pg_sz = 0;
> > > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() ==
> > > RTE_IOVA_VA)
> > > -		*pg_sz = 0;
> > >   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> > >   		*pg_sz = get_min_page_size(mp->socket_id);
> > >   	else
> > > @@ -478,17 +476,15 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > >   	 * then just set page shift and page size to 0, because the user has
> > >   	 * indicated that there's no need to care about anything.
> > >   	 *
> > > -	 * if we do need contiguous objects, there is also an option to reserve
> > > -	 * the entire mempool memory as one contiguous block of memory,
> > > in
> > > -	 * which case the page shift and alignment wouldn't matter as well.
> > > +	 * if we do need contiguous objects (if a mempool driver has its
> > > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > > +	 * there is also an option to reserve the entire mempool memory
> > > +	 * as one contiguous block of memory.
> > >   	 *
> > >   	 * if we require contiguous objects, but not necessarily the entire
> > > -	 * mempool reserved space to be contiguous, then there are two
> > > options.
> > > -	 *
> > > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > > -	 * case), then no page shift needed - our memory allocation will give
> > > us
> > > -	 * contiguous IO memory as far as the hardware is concerned, so
> > > -	 * act as if we're getting contiguous memory.
> > > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > > +	 * and the default ops->populate() will take care of not placing
> > > +	 * objects across pages.
> > >   	 *
> > >   	 * if our IO addresses are physical, we may get memory from bigger
> > >   	 * pages, or we might get memory from smaller pages, and how
> > > much of it @@ -501,11 +497,6 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > >   	 *
> > >   	 * If we fail to get enough contiguous memory, then we'll go and
> > >   	 * reserve space in smaller chunks.
> > > -	 *
> > > -	 * We also have to take into account the fact that memory that we're
> > > -	 * going to allocate from can belong to an externally allocated
> > > memory
> > > -	 * area, in which case the assumption of IOVA as VA mode being
> > > -	 * synonymous with IOVA contiguousness will not hold.
> > >   	 */
> > > 
> > >   	need_iova_contig_obj = !(mp->flags &
> > > MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> > > a/lib/librte_mempool/rte_mempool_ops_default.c
> > > b/lib/librte_mempool/rte_mempool_ops_default.c
> > > index f6aea7662..dd09a0a32 100644
> > > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > > @@ -61,21 +61,44 @@ rte_mempool_op_calc_mem_size_default(const
> > > struct rte_mempool *mp,
> > >   	return mem_size;
> > >   }
> > > 
> > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > > +	if (pg_sz == 0)
> > > +		return 0;
> > > +	if (elt_sz > pg_sz)
> > > +		return 0;
> > > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > > pg_sz))
> > > +		return -1;
> > > +	return 0;
> > > +}
> > > +
> > >   int
> > >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > > max_objs,
> > >   		void *vaddr, rte_iova_t iova, size_t len,
> > >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > {
> > > -	size_t total_elt_sz;
> > > +	char *va = vaddr;
> > > +	size_t total_elt_sz, pg_sz;
> > >   	size_t off;
> > >   	unsigned int i;
> > >   	void *obj;
> > > 
> > > +	rte_mempool_get_page_size(mp, &pg_sz);
> > > +
> > >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > 
> > > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > +	for (off = 0, i = 0; i < max_objs; i++) {
> > > +		/* align offset to next page start if required */
> > > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > > off);
> > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool, this scheme does not align with the octeontx2 mempool's buffer alignment requirement (buffer address needs to be a multiple of buffer size).
> 
> It sounds like the octeontx2 mempool should have its own populate callback
> which takes care of it.
> 

Agreed, even calc_mem_size() should be modified to be rigorous.
Let me draft something in the next version, so you can test it.

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-30  7:44                                   ` Andrew Rybchenko
@ 2019-10-30 10:38                                     ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 10:38 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Wed, Oct 30, 2019 at 10:44:04AM +0300, Andrew Rybchenko wrote:
> On 10/30/19 10:36 AM, Andrew Rybchenko wrote:
> > On 10/29/19 8:20 PM, Olivier Matz wrote:
> > > On Tue, Oct 29, 2019 at 01:25:10PM +0300, Andrew Rybchenko wrote:
> > > > On 10/28/19 5:01 PM, Olivier Matz wrote:
> > > > > The previous commit reduced the amount of required memory when
> > > > > populating the mempool with non iova-contiguous memory.
> > > > > 
> > > > > Since there is no big advantage to have a fully
> > > > > iova-contiguous mempool
> > > > > if it is not explicitly asked, remove this code, it simplifies the
> > > > > populate function.
> > > > > 
> > > > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > > One comment below, other than that
> > > > Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> > > > 
> > > > > ---
> > > > >    lib/librte_mempool/rte_mempool.c | 47
> > > > > ++++++--------------------------
> > > > >    1 file changed, 8 insertions(+), 39 deletions(-)
> > > > > 
> > > > > diff --git a/lib/librte_mempool/rte_mempool.c
> > > > > b/lib/librte_mempool/rte_mempool.c
> > > > > index 3275e48c9..5e1f202dc 100644
> > > > > --- a/lib/librte_mempool/rte_mempool.c
> > > > > +++ b/lib/librte_mempool/rte_mempool.c
> > > > [snip]
> > > > 
> > > > > @@ -531,36 +521,15 @@ rte_mempool_populate_default(struct
> > > > > rte_mempool *mp)
> > > > >                goto fail;
> > > > >            }
> > > > > -        flags = mz_flags;
> > > > > -
> > > > >            /* if we're trying to reserve contiguous memory,
> > > > > add appropriate
> > > > >             * memzone flag.
> > > > >             */
> > > > > -        if (try_iova_contig_mempool)
> > > > > -            flags |= RTE_MEMZONE_IOVA_CONTIG;
> > > > > +        if (min_chunk_size == (size_t)mem_size)
> > > > > +            mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
> > > > >            mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> > > > > -                mp->socket_id, flags, align);
> > > > > -
> > > > > -        /* if we were trying to allocate contiguous memory,
> > > > > failed and
> > > > > -         * minimum required contiguous chunk fits minimum
> > > > > page, adjust
> > > > > -         * memzone size to the page size, and try again.
> > > > > -         */
> > > > > -        if (mz == NULL && try_iova_contig_mempool &&
> > > > > -                min_chunk_size <= pg_sz) {
> > > > > -            try_iova_contig_mempool = false;
> > > > > -            flags &= ~RTE_MEMZONE_IOVA_CONTIG;
> > > > > -
> > > > > -            mem_size = rte_mempool_ops_calc_mem_size(mp, n,
> > > > > -                    pg_shift, &min_chunk_size, &align);
> > > > > -            if (mem_size < 0) {
> > > > > -                ret = mem_size;
> > > > > -                goto fail;
> > > > > -            }
> > > > > +                mp->socket_id, mz_flags, align);
> > > > > -            mz = rte_memzone_reserve_aligned(mz_name, mem_size,
> > > > > -                mp->socket_id, flags, align);
> > > > > -        }
> > > > >            /* don't try reserving with 0 size if we were
> > > > > asked to reserve
> > > > >             * IOVA-contiguous memory.
> > > > >             */
> > > > [snip]
> > > > 
> > > > > @@ -587,7 +556,7 @@ rte_mempool_populate_default(struct
> > > > > rte_mempool *mp)
> > > > >            else
> > > > >                iova = RTE_BAD_IOVA;
> > > > > -        if (try_iova_contig_mempool || pg_sz == 0)
> > > > > +        if (pg_sz == 0)
> > > > I think (mz_flags & RTE_MEMZONE_IOVA_CONTIG) is lost here.
> > > Do you mean we should do instead
> > >     if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
> > >         populate_iova()
> > >     else
> > >         populate_virt()
> > > ?
> > 
> > Yes.
> > 
> > > I would say yes, but it should be removed in patch 5/5.
> > > At the end, we don't want objects to be across pages, even
> > > with RTE_MEMZONE_IOVA_CONTIG.
> 
> Thinking more about it, I don't understand why. If we know that
> the memzone is IOVA contiguous, why should we not populate
> it this way? It does not prevent patch 5/5 from doing the job.

You are right. The page-crossing check is done in the ops->populate,
so it is also done by populate_iova().
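
So, with that taken into account, the dispatch in
rte_mempool_populate_default() should end up looking roughly like this
(sketch of the intended v2 change, reusing the callback and arguments
already present in that function):

        /* objects may be IOVA-contiguous either because no page constraint
         * applies (pg_sz == 0) or because the memzone was reserved with
         * RTE_MEMZONE_IOVA_CONTIG; in both cases populate by IOVA
         */
        if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
                ret = rte_mempool_populate_iova(mp, mz->addr,
                        iova, mz->len,
                        rte_mempool_memchunk_mz_free,
                        (void *)(uintptr_t)mz);
        else
                ret = rte_mempool_populate_virt(mp, mz->addr,
                        mz->len, pg_sz,
                        rte_mempool_memchunk_mz_free,
                        (void *)(uintptr_t)mz);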

> > > > >                ret = rte_mempool_populate_iova(mp, mz->addr,
> > > > >                    iova, mz->len,
> > > > >                    rte_mempool_memchunk_mz_free,
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size
  2019-10-30  8:32                                 ` Olivier Matz
@ 2019-10-30 14:29                                   ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:29 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On Wed, Oct 30, 2019 at 09:32:08AM +0100, Olivier Matz wrote:
> On Tue, Oct 29, 2019 at 06:20:47PM +0100, Olivier Matz wrote:
> > On Tue, Oct 29, 2019 at 01:31:22PM +0300, Andrew Rybchenko wrote:
> > > On 10/28/19 5:01 PM, Olivier Matz wrote:
> > > > In rte_mempool_populate_default(), we determine the page size,
> > > > which is needed for calc_size and allocation of memory.
> > > > 
> > > > Move this in a function and export it, it will be used in next
> > > > commit.
> > > > 
> > > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > 
> > > One question below:
> > > Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> > > 
> > > [snip]
> > > 
> > > > diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
> > > > index 17cbca460..4eff2767d 100644
> > > > --- a/lib/librte_mempool/rte_mempool_version.map
> > > > +++ b/lib/librte_mempool/rte_mempool_version.map
> > > > @@ -56,5 +56,6 @@ DPDK_18.05 {
> > > >   EXPERIMENTAL {
> > > >   	global:
> > > > +	rte_mempool_get_page_size;
> > > >   	rte_mempool_ops_get_info;
> > > >   };
> > > 
> > > Should internal function be here?
> > > 
> > 
> > Good question. Let me ask a friend ;)
> 
> I was influenced by a warning saying "rte_mempool_get_page_size is
> flagged as experimental but is not listed in version map", but actually
> it should not be flagged as experimental. I'll remove both.
> 
> My friend also suggested adding it to a private header, which is a
> good idea, but I think it should be done in another patch because there
> are already several functions in this situation.

Finally, I had to keep it in the API, because the octeontx2 driver will
need it. I also kept the @internal tag in the comment, because I think
this function should only be used in mempool drivers.
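
For clarity, the expected usage from a mempool driver would be something
like this (hypothetical driver snippet, with the return value check added
in v2):

        size_t pg_sz;
        int ret;

        ret = rte_mempool_get_page_size(mp, &pg_sz);
        if (ret < 0)
                return ret;

        /* pg_sz == 0 means there is no page-boundary constraint to respect
         * (IOVA-contiguous objects were not requested); otherwise each
         * object must be kept within a pg_sz page
         */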

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-30  8:38                                 ` Jerin Jacob
@ 2019-10-30 14:33                                   ` Olivier Matz
  2019-10-30 14:54                                     ` Jerin Jacob
  0 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:33 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Andrew Rybchenko, Vamsi Krishna Attunuru, dev, Anatoly Burakov,
	Ferruh Yigit, Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

Hi Jerin,

On Wed, Oct 30, 2019 at 02:08:40PM +0530, Jerin Jacob wrote:
> On Wed, Oct 30, 2019 at 1:16 PM Andrew Rybchenko
> <arybchenko@solarflare.com> wrote:
> >
> 
> > >>   int
> > >>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > >> max_objs,
> > >>              void *vaddr, rte_iova_t iova, size_t len,
> > >>              rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > >> {
> > >> -    size_t total_elt_sz;
> > >> +    char *va = vaddr;
> > >> +    size_t total_elt_sz, pg_sz;
> > >>      size_t off;
> > >>      unsigned int i;
> > >>      void *obj;
> > >>
> > >> +    rte_mempool_get_page_size(mp, &pg_sz);
> > >> +
> > >>      total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > >>
> > >> -    for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > >> +    for (off = 0, i = 0; i < max_objs; i++) {
> > >> +            /* align offset to next page start if required */
> > >> +            if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > >> +                    off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > >> off);
> > > Moving the offset to the start of the next page and then freeing (vaddr + off + header_size) to the pool, this scheme does not align with the octeontx2 mempool's buffer alignment requirement (buffer address needs to be a multiple of buffer size).
> >
> > It sounds like the octeontx2 mempool should have its own populate callback
> > which takes care of it.
> 
> A driver-specific populate function is not a bad idea. The only concerns
> would be:
> 
> # We need to duplicate rte_mempool_op_populate_default() and
> rte_mempool_op_calc_mem_size_default()
> # We need to make sure that if someone changes
> rte_mempool_op_populate_default() or
> rte_mempool_op_calc_mem_size_default(), then he/she needs to update the
> drivers too

Agree, we need to be careful. Hopefully we shouldn't change this code
very often.

I'm sending a v2 with a patch to the octeontx2 driver which --I hope--
should solve the issue.

> # I would like to add one more point here:
> - The calculation of object pad requirements for MEMPOOL_F_NO_SPREAD, i.e.
> optimize_object_size(), is NOT GENERIC, i.e. the get_gcd() based logic is
> not generic. The DDR controller defines the address-to-DDR-channel
> "spread", and it will be based on the SoC or micro-architecture.
> So we need to consider that as well.

Could you give some details about how it should be optimized on your
platforms, and what is the behavior today (advertised nb_rank and
nb_channels)?

Thanks,
Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (6 preceding siblings ...)
  2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
@ 2019-10-30 14:36                         ` Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 1/6] mempool: allow unaligned addr/len in populate virt Olivier Matz
                                             ` (6 more replies)
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
  9 siblings, 7 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

KNI assumes that mbufs are contiguous in kernel virtual memory. This
may not be true when using the IOVA=VA mode. To fix this, a possibility
is to ensure that objects do not cross page boundaries in the mempool.
This patchset implements this in patch 5/6.

The preceding patches prepare the job:
- allow populating with an unaligned virtual area (1/6).
- reduce space wasted in the mempool size calculation when not using
  the iova-contiguous allocation (2/6).
- remove the optimistic iova-contiguous allocation when populating the
  mempool (3/6): a va-contiguous alloc does the job as well if we want
  to populate without crossing page boundaries, so simplify the mempool
  populate function.
- export a function to get the minimum page size used in a mempool (4/6)

Memory consumption impact when using hugepages:
- worst case: + ~0.1% for a mbuf pool (objsize ~= 2368)
- best case: -50% if the pool size is just above the page size

With 4K pages in IOVA=VA mode, however, memory consumption could be up to
75% higher for a mbuf pool, because there will be only 1 mbuf per page.
Not sure how common this use case is.
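
For the record, a rough back-of-the-envelope check of that figure, using
the ~2368 byte object size quoted above:

  objects per 4K page:  4096 / 2368 = 1 (rounded down)
  memory per mbuf:      4096 bytes instead of ~2368 bytes
  overhead:             4096 / 2368 - 1 ~= 0.73, i.e. roughly 75% more memory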

Caveat: this changes the behavior of the mempool (calc_mem_size and
populate), and there is a small risk of breaking things, especially with
alternate mempool drivers.

v2

* update octeontx2 driver to keep alignment constraint (issue seen by
  Vamsi)
* add a new patch to use RTE_MEMPOOL_ALIGN (Andrew)
* fix initialization of for loop in rte_mempool_populate_virt() (Andrew)
* use rte_mempool_populate_iova() if mz_flags has
  RTE_MEMZONE_IOVA_CONTIG (Andrew)
* check rte_mempool_get_page_size() return value (Andrew)
* some other minor style improvements

rfc -> v1

* remove first cleanup patch, it was pushed separately
  a2b5a8722f20 ("mempool: clarify default populate function")
* add missing change in rte_mempool_op_calc_mem_size_default()
* allow unaligned addr/len in populate virt
* better split patches
* try to better explain the change
* use DPDK align macros when relevant

Olivier Matz (6):
  mempool: allow unaligned addr/len in populate virt
  mempool: reduce wasted space on mempool populate
  mempool: remove optimistic IOVA-contiguous allocation
  mempool: introduce function to get mempool page size
  mempool: prevent objects from being across pages
  mempool: use the specific macro for object alignment

 drivers/mempool/octeontx2/Makefile           |   3 +
 drivers/mempool/octeontx2/meson.build        |   3 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 +++++++++++++--
 lib/librte_mempool/rte_mempool.c             | 147 ++++++++-----------
 lib/librte_mempool/rte_mempool.h             |  15 +-
 lib/librte_mempool/rte_mempool_ops.c         |   4 +-
 lib/librte_mempool/rte_mempool_ops_default.c |  60 ++++++--
 lib/librte_mempool/rte_mempool_version.map   |   1 +
 8 files changed, 236 insertions(+), 116 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 1/6] mempool: allow unaligned addr/len in populate virt
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
@ 2019-10-30 14:36                           ` Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 2/6] mempool: reduce wasted space on mempool populate Olivier Matz
                                             ` (5 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

rte_mempool_populate_virt() currently requires that both addr
and length are page-aligned.

Remove this unneeded constraint, which can be annoying with big
hugepages (e.g. 1 GB).

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 lib/librte_mempool/rte_mempool.c | 23 +++++++++++------------
 lib/librte_mempool/rte_mempool.h |  3 +--
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 0f29e8712..88e49c751 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	size_t off, phys_len;
 	int ret, cnt = 0;
 
-	/* address and len must be page-aligned */
-	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
-		return -EINVAL;
-	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
-		return -EINVAL;
-
 	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
 		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
 			len, free_cb, opaque);
 
-	for (off = 0; off + pg_sz <= len &&
+	for (off = 0; off < len &&
 		     mp->populated_size < mp->size; off += phys_len) {
 
 		iova = rte_mem_virt2iova(addr + off);
@@ -389,12 +383,18 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 		}
 
 		/* populate with the largest group of contiguous pages */
-		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
+		for (phys_len = RTE_MIN(
+			(size_t)(RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
+				(addr + off)),
+			len - off);
+		     off + phys_len < len;
+		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
 			rte_iova_t iova_tmp;
 
 			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
 
-			if (iova_tmp != iova + phys_len)
+			if (iova_tmp == RTE_BAD_IOVA ||
+					iova_tmp != iova + phys_len)
 				break;
 		}
 
@@ -575,8 +575,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags,
-					RTE_MAX(pg_sz, align));
+					mp->socket_id, flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -601,7 +600,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 				(void *)(uintptr_t)mz);
 		else
 			ret = rte_mempool_populate_virt(mp, mz->addr,
-				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
+				mz->len, pg_sz,
 				rte_mempool_memchunk_mz_free,
 				(void *)(uintptr_t)mz);
 		if (ret < 0) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a04..0fe8aa7b8 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
  *   A pointer to the mempool structure.
  * @param addr
  *   The virtual address of memory that should be used to store objects.
- *   Must be page-aligned.
  * @param len
- *   The length of memory in bytes. Must be page-aligned.
+ *   The length of memory in bytes.
  * @param pg_sz
  *   The size of memory pages in this virtual area.
  * @param free_cb
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 2/6] mempool: reduce wasted space on mempool populate
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 1/6] mempool: allow unaligned addr/len in populate virt Olivier Matz
@ 2019-10-30 14:36                           ` Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 3/6] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
                                             ` (4 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

The size returned by rte_mempool_op_calc_mem_size_default() is aligned
to the specified page size. Therefore, with big pages, the returned size
can be much more than what we really need to populate the mempool.

For instance, populating a mempool that requires 1.1GB of memory with
1GB hugepages can result in allocating 2GB of memory.

This problem is hidden most of the time due to the allocation method of
rte_mempool_populate_default(): when try_iova_contig_mempool=true, it
first tries to allocate an iova contiguous area, without the alignment
constraint. If it fails, it falls back to an aligned allocation that does
not need to be iova-contiguous. This can also fall back to several
smaller aligned allocations.

This commit changes rte_mempool_op_calc_mem_size_default() to relax the
alignment constraint to a cache line and to return a smaller size.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 lib/librte_mempool/rte_mempool.c             |  7 ++---
 lib/librte_mempool/rte_mempool.h             |  2 +-
 lib/librte_mempool/rte_mempool_ops.c         |  4 ++-
 lib/librte_mempool/rte_mempool_ops_default.c | 28 +++++++++++++++-----
 4 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 88e49c751..4e0d576f5 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -477,11 +477,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * wasting some space this way, but it's much nicer than looping around
 	 * trying to reserve each and every page size.
 	 *
-	 * However, since size calculation will produce page-aligned sizes, it
-	 * makes sense to first try and see if we can reserve the entire memzone
-	 * in one contiguous chunk as well (otherwise we might end up wasting a
-	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
-	 * memory, then we'll go and reserve space page-by-page.
+	 * If we fail to get enough contiguous memory, then we'll go and
+	 * reserve space in smaller chunks.
 	 *
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 0fe8aa7b8..8fa3c04e5 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
  * @param[out] align
  *   Location for required memory chunk alignment.
  * @return
- *   Required memory size aligned at page boundary.
+ *   Required memory size.
  */
 typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		uint32_t obj_num,  uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index e02eb702c..22c5251eb 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to notify new memory area to external mempool */
+/* wrapper to calculate the memory size required to store given number
+ * of objects
+ */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc82d..f6aea7662 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -12,7 +12,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				     size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
-	size_t obj_per_page, pg_num, pg_sz;
+	size_t obj_per_page, pg_sz, objs_in_last_page;
 	size_t mem_size;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
@@ -33,14 +33,30 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 			mem_size =
 				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
 		} else {
-			pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
-			mem_size = pg_num << pg_shift;
+			/* In the best case, the allocator will return a
+			 * page-aligned address. For example, with 5 objs,
+			 * the required space is as below:
+			 *  |     page0     |     page1     |  page2 (last) |
+			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
+			 *  <------------- mem_size ------------->
+			 */
+			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
+			/* room required for the last page */
+			mem_size = objs_in_last_page * total_elt_sz;
+			/* room required for other pages */
+			mem_size += ((obj_num - objs_in_last_page) /
+				obj_per_page) << pg_shift;
+
+			/* In the worst case, the allocator returns a
+			 * non-aligned pointer, wasting up to
+			 * total_elt_sz. Add a margin for that.
+			 */
+			 mem_size += total_elt_sz - 1;
 		}
 	}
 
-	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
-
-	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
+	*min_chunk_size = total_elt_sz;
+	*align = RTE_CACHE_LINE_SIZE;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 3/6] mempool: remove optimistic IOVA-contiguous allocation
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 1/6] mempool: allow unaligned addr/len in populate virt Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 2/6] mempool: reduce wasted space on mempool populate Olivier Matz
@ 2019-10-30 14:36                           ` Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 4/6] mempool: introduce function to get mempool page size Olivier Matz
                                             ` (3 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

The previous commit reduced the amount of required memory when
populating the mempool with non iova-contiguous memory.

Since there is no big advantage to having a fully iova-contiguous mempool
if it is not explicitly requested, remove this code; it simplifies the
populate function.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 4e0d576f5..213e574fc 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -430,7 +430,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool try_iova_contig_mempool;
 	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
@@ -483,9 +482,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
 	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold. We should also try
-	 * to go for contiguous memory even if we're in no-huge mode, because
-	 * external memory may in fact be IOVA-contiguous.
+	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	/* check if we can retrieve a valid socket ID */
@@ -494,7 +491,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		return -EINVAL;
 	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
-	try_iova_contig_mempool = false;
 
 	if (!need_iova_contig_obj) {
 		pg_sz = 0;
@@ -503,7 +499,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		pg_sz = 0;
 		pg_shift = 0;
 	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		try_iova_contig_mempool = true;
 		pg_sz = get_min_page_size(mp->socket_id);
 		pg_shift = rte_bsf32(pg_sz);
 	} else {
@@ -513,14 +508,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
-		unsigned int flags;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					0, &min_chunk_size, &align);
-		else
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
+		mem_size = rte_mempool_ops_calc_mem_size(
+			mp, n, pg_shift, &min_chunk_size, &align);
 
 		if (mem_size < 0) {
 			ret = mem_size;
@@ -534,36 +524,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
-		flags = mz_flags;
-
 		/* if we're trying to reserve contiguous memory, add appropriate
 		 * memzone flag.
 		 */
-		if (try_iova_contig_mempool)
-			flags |= RTE_MEMZONE_IOVA_CONTIG;
+		if (min_chunk_size == (size_t)mem_size)
+			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-
-		/* if we were trying to allocate contiguous memory, failed and
-		 * minimum required contiguous chunk fits minimum page, adjust
-		 * memzone size to the page size, and try again.
-		 */
-		if (mz == NULL && try_iova_contig_mempool &&
-				min_chunk_size <= pg_sz) {
-			try_iova_contig_mempool = false;
-			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
-			if (mem_size < 0) {
-				ret = mem_size;
-				goto fail;
-			}
+				mp->socket_id, mz_flags, align);
 
-			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
@@ -572,7 +541,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags, align);
+					mp->socket_id, mz_flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -590,7 +559,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		else
 			iova = RTE_BAD_IOVA;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
+		if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 4/6] mempool: introduce function to get mempool page size
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
                                             ` (2 preceding siblings ...)
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 3/6] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
@ 2019-10-30 14:36                           ` Olivier Matz
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 5/6] mempool: prevent objects from being across pages Olivier Matz
                                             ` (2 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

In rte_mempool_populate_default(), we determine the page size,
which is needed for calc_size and for memory allocation.

Move this logic into a function and export it; it will be used in the
next commit.
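
For reference, a caller is expected to use the helper as below; this
mirrors what rte_mempool_populate_default() does in the diff that
follows (pg_sz == 0 means no page-size constraint applies):

	size_t pg_sz;
	size_t pg_shift = 0;
	int ret;

	ret = rte_mempool_get_page_size(mp, &pg_sz);
	if (ret < 0)
		return ret;

	/* pg_sz == 0: no page-size constraint for this mempool */
	if (pg_sz != 0)
		pg_shift = rte_bsf32(pg_sz);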

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 lib/librte_mempool/rte_mempool.c           | 51 ++++++++++++++--------
 lib/librte_mempool/rte_mempool.h           |  7 +++
 lib/librte_mempool/rte_mempool_version.map |  1 +
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 213e574fc..758c5410b 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,33 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Get the minimal page size used in a mempool before populating it. */
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
+{
+	bool need_iova_contig_obj;
+	bool alloc_in_ext_mem;
+	int ret;
+
+	/* check if we can retrieve a valid socket ID */
+	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+	if (ret < 0)
+		return -EINVAL;
+	alloc_in_ext_mem = (ret == 1);
+	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+
+	if (!need_iova_contig_obj)
+		*pg_sz = 0;
+	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
+		*pg_sz = 0;
+	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
+		*pg_sz = get_min_page_size(mp->socket_id);
+	else
+		*pg_sz = getpagesize();
+
+	return 0;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
@@ -425,12 +452,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
 	ssize_t mem_size;
-	size_t align, pg_sz, pg_shift;
+	size_t align, pg_sz, pg_shift = 0;
 	rte_iova_t iova;
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -485,26 +511,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
-	/* check if we can retrieve a valid socket ID */
-	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
-	if (ret < 0)
-		return -EINVAL;
-	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
-	if (!need_iova_contig_obj) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		pg_sz = get_min_page_size(mp->socket_id);
-		pg_shift = rte_bsf32(pg_sz);
-	} else {
-		pg_sz = getpagesize();
+	if (pg_sz != 0)
 		pg_shift = rte_bsf32(pg_sz);
-	}
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8fa3c04e5..6d98b3743 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1691,6 +1691,13 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
 		      void *arg);
 
+/**
+ * @internal Get page size used for mempool object allocation.
+ */
+__rte_experimental
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca460..4eff2767d 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -56,5 +56,6 @@ DPDK_18.05 {
 EXPERIMENTAL {
 	global:
 
+	rte_mempool_get_page_size;
 	rte_mempool_ops_get_info;
 };
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
                                             ` (3 preceding siblings ...)
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 4/6] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-10-30 14:36                           ` Olivier Matz
  2019-10-31  6:54                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 6/6] mempool: use the specific macro for object alignment Olivier Matz
  2019-11-01  3:56                           ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations across pages Nipun Gupta
  6 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

When populating a mempool, ensure that objects do not span several
pages, except when the user did not request IOVA-contiguous objects.
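
As an illustration of the check added below, here is the same
page-crossing condition written with plain arithmetic instead of the
RTE_PTR_ALIGN() macros (a standalone sketch; the sizes are chosen only
for the example):

#include <stdint.h>
#include <stdio.h>

/* nonzero when the object would span two pages; check_obj_bounds()
 * below expresses the same condition and returns -1 in that case
 */
static int crosses_page(uintptr_t obj, size_t pg_sz, size_t elt_sz)
{
	if (pg_sz == 0 || elt_sz > pg_sz)
		return 0; /* no constraint, or object larger than a page */
	return (obj / pg_sz) != ((obj + elt_sz - 1) / pg_sz);
}

int main(void)
{
	/* 4 KB pages, 2176-byte objects (header + elt + trailer) */
	printf("%d\n", crosses_page(0x100000 + 3072, 4096, 2176)); /* 1: skip to next page */
	printf("%d\n", crosses_page(0x100000 + 1024, 4096, 2176)); /* 0: object fits */
	return 0;
}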

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 drivers/mempool/octeontx2/Makefile           |   3 +
 drivers/mempool/octeontx2/meson.build        |   3 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
 lib/librte_mempool/rte_mempool.c             |  23 ++--
 lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
 5 files changed, 147 insertions(+), 33 deletions(-)

diff --git a/drivers/mempool/octeontx2/Makefile b/drivers/mempool/octeontx2/Makefile
index 87cce22c6..d781cbfc6 100644
--- a/drivers/mempool/octeontx2/Makefile
+++ b/drivers/mempool/octeontx2/Makefile
@@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
 
 LIBABIVER := 1
 
+# for rte_mempool_get_page_size
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 #
 # all source are stored in SRCS-y
 #
diff --git a/drivers/mempool/octeontx2/meson.build b/drivers/mempool/octeontx2/meson.build
index 9fde40f0e..28f9634da 100644
--- a/drivers/mempool/octeontx2/meson.build
+++ b/drivers/mempool/octeontx2/meson.build
@@ -21,3 +21,6 @@ foreach flag: extra_flags
 endforeach
 
 deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
+
+# for rte_mempool_get_page_size
+allow_experimental_apis = true
diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index d769575f4..47117aec6 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -713,12 +713,76 @@ static ssize_t
 otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
 		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
 {
-	/*
-	 * Simply need space for one more object to be able to
-	 * fulfill alignment requirements.
-	 */
-	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1, pg_shift,
-						    min_chunk_size, align);
+	size_t total_elt_sz;
+	size_t obj_per_page, pg_sz, objs_in_last_page;
+	size_t mem_size;
+
+	/* derived from rte_mempool_op_calc_mem_size_default() */
+
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+
+	if (total_elt_sz == 0) {
+		mem_size = 0;
+	} else if (pg_shift == 0) {
+		/* one object margin to fix alignment */
+		mem_size = total_elt_sz * (obj_num + 1);
+	} else {
+		pg_sz = (size_t)1 << pg_shift;
+		obj_per_page = pg_sz / total_elt_sz;
+
+		/* we need to keep one object to fix alignment */
+		if (obj_per_page > 0)
+			obj_per_page--;
+
+		if (obj_per_page == 0) {
+			/*
+			 * Note that if object size is bigger than page size,
+			 * then it is assumed that pages are grouped in subsets
+			 * of physically continuous pages big enough to store
+			 * at least one object.
+			 */
+			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
+						pg_sz) * obj_num;
+		} else {
+			/* In the best case, the allocator will return a
+			 * page-aligned address. For example, with 5 objs,
+			 * the required space is as below:
+			 *  |     page0     |     page1     |  page2 (last) |
+			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
+			 *  <------------- mem_size ------------->
+			 */
+			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
+			/* room required for the last page */
+			mem_size = objs_in_last_page * total_elt_sz;
+			/* room required for other pages */
+			mem_size += ((obj_num - objs_in_last_page) /
+				obj_per_page) << pg_shift;
+
+			/* In the worst case, the allocator returns a
+			 * non-aligned pointer, wasting up to
+			 * total_elt_sz. Add a margin for that.
+			 */
+			 mem_size += total_elt_sz - 1;
+		}
+	}
+
+	*min_chunk_size = total_elt_sz * 2;
+	*align = RTE_CACHE_LINE_SIZE;
+
+	return mem_size;
+}
+
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
 }
 
 static int
@@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 		  rte_iova_t iova, size_t len,
 		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
+	unsigned int i;
+	void *obj;
+	int ret;
 
 	if (iova == RTE_BAD_IOVA)
 		return -EINVAL;
@@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	/* Align object start address to a multiple of total_elt_sz */
-	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
 
 	if (len < off)
 		return -EINVAL;
 
-	vaddr = (char *)vaddr + off;
-	iova += off;
-	len -= off;
 
-	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
+	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
 
 	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
 		return -EBUSY;
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
-					       obj_cb, obj_cb_arg);
+	/* the following is derived from rte_mempool_op_populate_default() */
+
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < max_objs; i++) {
+		/* avoid objects to cross page boundaries, and align
+		 * offset to a multiple of total_elt_sz.
+		 */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
+						total_elt_sz) + 1);
+		}
+
+		if (off + total_elt_sz > len)
+			break;
+
+		off += mp->header_size;
+		obj = va + off;
+		obj_cb(mp, obj_cb_arg, obj,
+		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
+		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
+		off += mp->elt_size + mp->trailer_size;
+	}
+
+	return i;
 }
 
 static struct rte_mempool_ops otx2_npa_ops = {
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 758c5410b..d3db9273d 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
 
 	if (!need_iova_contig_obj)
 		*pg_sz = 0;
-	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
-		*pg_sz = 0;
 	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
 		*pg_sz = get_min_page_size(mp->socket_id);
 	else
@@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * then just set page shift and page size to 0, because the user has
 	 * indicated that there's no need to care about anything.
 	 *
-	 * if we do need contiguous objects, there is also an option to reserve
-	 * the entire mempool memory as one contiguous block of memory, in
-	 * which case the page shift and alignment wouldn't matter as well.
+	 * if we do need contiguous objects (if a mempool driver has its
+	 * own calc_size() method returning min_chunk_size = mem_size),
+	 * there is also an option to reserve the entire mempool memory
+	 * as one contiguous block of memory.
 	 *
 	 * if we require contiguous objects, but not necessarily the entire
-	 * mempool reserved space to be contiguous, then there are two options.
-	 *
-	 * if our IO addresses are virtual, not actual physical (IOVA as VA
-	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous IO memory as far as the hardware is concerned, so
-	 * act as if we're getting contiguous memory.
+	 * mempool reserved space to be contiguous, pg_sz will be != 0,
+	 * and the default ops->populate() will take care of not placing
+	 * objects across pages.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
 	 * pages, or we might get memory from smaller pages, and how much of it
@@ -504,11 +500,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * If we fail to get enough contiguous memory, then we'll go and
 	 * reserve space in smaller chunks.
-	 *
-	 * We also have to take into account the fact that memory that we're
-	 * going to allocate from can belong to an externally allocated memory
-	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index f6aea7662..e5cd4600f 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
+}
+
 int
 rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
+	int ret;
+
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	for (off = 0, i = 0; i < max_objs; i++) {
+		/* avoid objects to cross page boundaries */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
-		obj = (char *)vaddr + off;
+		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v2 6/6] mempool: use the specific macro for object alignment
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
                                             ` (4 preceding siblings ...)
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 5/6] mempool: prevent objects from being across pages Olivier Matz
@ 2019-10-30 14:36                           ` Olivier Matz
  2019-10-30 14:55                             ` Andrew Rybchenko
  2019-11-01  3:56                           ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations across pages Nipun Gupta
  6 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-30 14:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

For consistency, RTE_MEMPOOL_ALIGN should be used in place of
RTE_CACHE_LINE_SIZE. They have the same value, because the only arch
that defined a specific value for it has been removed from DPDK.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 lib/librte_mempool/rte_mempool.c             | 2 +-
 lib/librte_mempool/rte_mempool.h             | 3 +++
 lib/librte_mempool/rte_mempool_ops_default.c | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index d3db9273d..40cae3eb6 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -329,7 +329,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
 		off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr;
 	else
-		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
+		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_MEMPOOL_ALIGN) - vaddr;
 
 	if (off > len) {
 		ret = -EINVAL;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 6d98b3743..cb7c423e6 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -116,6 +116,9 @@ struct rte_mempool_objsz {
 #define	MEMPOOL_PG_NUM_DEFAULT	1
 
 #ifndef RTE_MEMPOOL_ALIGN
+/**
+ * Alignment of elements inside mempool.
+ */
 #define RTE_MEMPOOL_ALIGN	RTE_CACHE_LINE_SIZE
 #endif
 
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index e5cd4600f..390c490fd 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -56,7 +56,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	}
 
 	*min_chunk_size = total_elt_sz;
-	*align = RTE_CACHE_LINE_SIZE;
+	*align = RTE_MEMPOOL_ALIGN;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH 5/5] mempool: prevent objects from being across pages
  2019-10-30 14:33                                   ` Olivier Matz
@ 2019-10-30 14:54                                     ` Jerin Jacob
  0 siblings, 0 replies; 251+ messages in thread
From: Jerin Jacob @ 2019-10-30 14:54 UTC (permalink / raw)
  To: Olivier Matz
  Cc: Andrew Rybchenko, Vamsi Krishna Attunuru, dev, Anatoly Burakov,
	Ferruh Yigit, Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

On Wed, Oct 30, 2019 at 8:03 PM Olivier Matz <olivier.matz@6wind.com> wrote:
>
> Hi Jerin,
>
> On Wed, Oct 30, 2019 at 02:08:40PM +0530, Jerin Jacob wrote:
> > On Wed, Oct 30, 2019 at 1:16 PM Andrew Rybchenko
> > <arybchenko@solarflare.com> wrote:
> > >
> >
> > > >>   int
> > > >>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > > >> max_objs,
> > > >>              void *vaddr, rte_iova_t iova, size_t len,
> > > >>              rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > >> {
> > > >> -    size_t total_elt_sz;
> > > >> +    char *va = vaddr;
> > > >> +    size_t total_elt_sz, pg_sz;
> > > >>      size_t off;
> > > >>      unsigned int i;
> > > >>      void *obj;
> > > >>
> > > >> +    rte_mempool_get_page_size(mp, &pg_sz);
> > > >> +
> > > >>      total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > >>
> > > >> -    for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > >> +    for (off = 0, i = 0; i < max_objs; i++) {
> > > >> +            /* align offset to next page start if required */
> > > >> +            if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > >> +                    off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > > >> off);
> > > > Moving offset to the start of next page and than freeing (vaddr + off + header_size) to pool, this scheme is not aligning with octeontx2 mempool's buf alignment requirement(buffer address needs to be multiple of buffer size).
> > >
> > > It sounds line octeontx2 mempool should have its own populate callback
> > > which cares about it.
> >
> > Driver specific populate function is not a bad idea. The only concern
> > would be to
> >
> > # We need to duplicate rte_mempool_op_populate_default() and
> > rte_mempool_op_calc_mem_size_default()
> > # We need to make sure if some one changes the
> > rte_mempool_op_populate_default() and
> > rte_mempool_op_calc_mem_size_default() then he/she needs to update the
> > drivers too
>
> Agree, we need to be careful. Hopefully we shouldn't change this code
> very often.
>
> I'm sending a v2 with a patch to the octeontx2 driver which --I hope--
> should solve the issue.

Thanks, Olivier. We will test it.

>
> > # I would like to add one more point here is that:
> > - Calculation of  object pad requirements for MEMPOOL_F_NO_SPREAD i.e
> > optimize_object_size()
> > is NOT GENERIC. i.e get_gcd() based logic is not generic. DDR
> > controller defines the address to DDR channel "spread" and it will be
> > based on SoC or Mirco architecture.
> > So we need to consider that as well.
>
> Could you give some details about how it should be optimized on your
> platforms, and what is the behavior today (advertised nb_rank and
> nb_channels)?

I will start a new thread on this, CCing all arch maintainers.


>
> Thanks,
> Olivier

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v2 6/6] mempool: use the specific macro for object alignment
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 6/6] mempool: use the specific macro for object alignment Olivier Matz
@ 2019-10-30 14:55                             ` Andrew Rybchenko
  0 siblings, 0 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-30 14:55 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru

On 10/30/19 5:36 PM, Olivier Matz wrote:
> For consistency, RTE_MEMPOOL_ALIGN should be used in place of
> RTE_CACHE_LINE_SIZE. They have the same value, because the only arch
> that was defining a specific value for it has been removed from dpdk.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>

Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 5/6] mempool: prevent objects from being across pages Olivier Matz
@ 2019-10-31  6:54                             ` Vamsi Krishna Attunuru
  2019-10-31  8:19                               ` Jerin Jacob
  2019-10-31  8:24                               ` Olivier Matz
  0 siblings, 2 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-10-31  6:54 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon

Hi Olivier,

Thanks for the reworked patches.
With v2, tests with 512MB & 2M page sizes work fine with the octeontx2 mempool PMD. One remaining concern: the octeontx fpa mempool driver has similar requirements. How do we address that? Can you suggest the best way to avoid code duplication in PMDs?

Regards
A Vamsi 

> -----Original Message-----
> From: Olivier Matz <olivier.matz@6wind.com>
> Sent: Wednesday, October 30, 2019 8:06 PM
> To: dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> pages
> 
> External Email
> 
> ----------------------------------------------------------------------
> When populating a mempool, ensure that objects are not located across several
> pages, except if user did not request iova contiguous objects.
> 
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  drivers/mempool/octeontx2/Makefile           |   3 +
>  drivers/mempool/octeontx2/meson.build        |   3 +
>  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
>  lib/librte_mempool/rte_mempool.c             |  23 ++--
>  lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
>  5 files changed, 147 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/mempool/octeontx2/Makefile
> b/drivers/mempool/octeontx2/Makefile
> index 87cce22c6..d781cbfc6 100644
> --- a/drivers/mempool/octeontx2/Makefile
> +++ b/drivers/mempool/octeontx2/Makefile
> @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> 
>  LIBABIVER := 1
> 
> +# for rte_mempool_get_page_size
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +
>  #
>  # all source are stored in SRCS-y
>  #
> diff --git a/drivers/mempool/octeontx2/meson.build
> b/drivers/mempool/octeontx2/meson.build
> index 9fde40f0e..28f9634da 100644
> --- a/drivers/mempool/octeontx2/meson.build
> +++ b/drivers/mempool/octeontx2/meson.build
> @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
> 
>  deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> +
> +# for rte_mempool_get_page_size
> +allow_experimental_apis = true
> diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> index d769575f4..47117aec6 100644
> --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> @@ -713,12 +713,76 @@ static ssize_t
>  otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
>  		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> -	/*
> -	 * Simply need space for one more object to be able to
> -	 * fulfill alignment requirements.
> -	 */
> -	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
> pg_shift,
> -						    min_chunk_size, align);
> +	size_t total_elt_sz;
> +	size_t obj_per_page, pg_sz, objs_in_last_page;
> +	size_t mem_size;
> +
> +	/* derived from rte_mempool_op_calc_mem_size_default() */
> +
> +	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> +
> +	if (total_elt_sz == 0) {
> +		mem_size = 0;
> +	} else if (pg_shift == 0) {
> +		/* one object margin to fix alignment */
> +		mem_size = total_elt_sz * (obj_num + 1);
> +	} else {
> +		pg_sz = (size_t)1 << pg_shift;
> +		obj_per_page = pg_sz / total_elt_sz;
> +
> +		/* we need to keep one object to fix alignment */
> +		if (obj_per_page > 0)
> +			obj_per_page--;
> +
> +		if (obj_per_page == 0) {
> +			/*
> +			 * Note that if object size is bigger than page size,
> +			 * then it is assumed that pages are grouped in subsets
> +			 * of physically continuous pages big enough to store
> +			 * at least one object.
> +			 */
> +			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> +						pg_sz) * obj_num;
> +		} else {
> +			/* In the best case, the allocator will return a
> +			 * page-aligned address. For example, with 5 objs,
> +			 * the required space is as below:
> +			 *  |     page0     |     page1     |  page2 (last) |
> +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> +			 *  <------------- mem_size ------------->
> +			 */
> +			objs_in_last_page = ((obj_num - 1) % obj_per_page) +
> 1;
> +			/* room required for the last page */
> +			mem_size = objs_in_last_page * total_elt_sz;
> +			/* room required for other pages */
> +			mem_size += ((obj_num - objs_in_last_page) /
> +				obj_per_page) << pg_shift;
> +
> +			/* In the worst case, the allocator returns a
> +			 * non-aligned pointer, wasting up to
> +			 * total_elt_sz. Add a margin for that.
> +			 */
> +			 mem_size += total_elt_sz - 1;
> +		}
> +	}
> +
> +	*min_chunk_size = total_elt_sz * 2;
> +	*align = RTE_CACHE_LINE_SIZE;
> +
> +	return mem_size;
> +}
> +
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> pg_sz))
> +		return -1;
> +	return 0;
>  }
> 
>  static int
> @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
> unsigned int max_objs, void *vaddr,
>  		  rte_iova_t iova, size_t len,
>  		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>  	size_t off;
> +	unsigned int i;
> +	void *obj;
> +	int ret;
> 
>  	if (iova == RTE_BAD_IOVA)
>  		return -EINVAL;
> @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
> unsigned int max_objs, void *vaddr,
>  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> 
>  	/* Align object start address to a multiple of total_elt_sz */
> -	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> +	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> 
>  	if (len < off)
>  		return -EINVAL;
> 
> -	vaddr = (char *)vaddr + off;
> -	iova += off;
> -	len -= off;
> 
> -	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> +	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> 
>  	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
>  		return -EBUSY;
> 
> -	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
> len,
> -					       obj_cb, obj_cb_arg);
> +	/* the following is derived from rte_mempool_op_populate_default() */
> +
> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> +	if (ret < 0)
> +		return ret;
> +
> +	for (i = 0; i < max_objs; i++) {
> +		/* avoid objects to cross page boundaries, and align
> +		 * offset to a multiple of total_elt_sz.
> +		 */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> off);
> +			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> +						total_elt_sz) + 1);
> +		}
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
> +		off += mp->header_size;
> +		obj = va + off;
> +		obj_cb(mp, obj_cb_arg, obj,
> +		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> +		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> +		off += mp->elt_size + mp->trailer_size;
> +	}
> +
> +	return i;
>  }
> 
>  static struct rte_mempool_ops otx2_npa_ops = { diff --git
> a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> index 758c5410b..d3db9273d 100644
> --- a/lib/librte_mempool/rte_mempool.c
> +++ b/lib/librte_mempool/rte_mempool.c
> @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp,
> size_t *pg_sz)
> 
>  	if (!need_iova_contig_obj)
>  		*pg_sz = 0;
> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> -		*pg_sz = 0;
>  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>  		*pg_sz = get_min_page_size(mp->socket_id);
>  	else
> @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool
> *mp)
>  	 * then just set page shift and page size to 0, because the user has
>  	 * indicated that there's no need to care about anything.
>  	 *
> -	 * if we do need contiguous objects, there is also an option to reserve
> -	 * the entire mempool memory as one contiguous block of memory, in
> -	 * which case the page shift and alignment wouldn't matter as well.
> +	 * if we do need contiguous objects (if a mempool driver has its
> +	 * own calc_size() method returning min_chunk_size = mem_size),
> +	 * there is also an option to reserve the entire mempool memory
> +	 * as one contiguous block of memory.
>  	 *
>  	 * if we require contiguous objects, but not necessarily the entire
> -	 * mempool reserved space to be contiguous, then there are two
> options.
> -	 *
> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> -	 * case), then no page shift needed - our memory allocation will give us
> -	 * contiguous IO memory as far as the hardware is concerned, so
> -	 * act as if we're getting contiguous memory.
> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> +	 * and the default ops->populate() will take care of not placing
> +	 * objects across pages.
>  	 *
>  	 * if our IO addresses are physical, we may get memory from bigger
>  	 * pages, or we might get memory from smaller pages, and how much
> of it @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct
> rte_mempool *mp)
>  	 *
>  	 * If we fail to get enough contiguous memory, then we'll go and
>  	 * reserve space in smaller chunks.
> -	 *
> -	 * We also have to take into account the fact that memory that we're
> -	 * going to allocate from can belong to an externally allocated memory
> -	 * area, in which case the assumption of IOVA as VA mode being
> -	 * synonymous with IOVA contiguousness will not hold.
>  	 */
> 
>  	need_iova_contig_obj = !(mp->flags &
> MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> a/lib/librte_mempool/rte_mempool_ops_default.c
> b/lib/librte_mempool/rte_mempool_ops_default.c
> index f6aea7662..e5cd4600f 100644
> --- a/lib/librte_mempool/rte_mempool_ops_default.c
> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct
> rte_mempool *mp,
>  	return mem_size;
>  }
> 
> +/* Returns -1 if object crosses a page boundary, else returns 0 */
> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> +	if (pg_sz == 0)
> +		return 0;
> +	if (elt_sz > pg_sz)
> +		return 0;
> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> pg_sz))
> +		return -1;
> +	return 0;
> +}
> +
>  int
>  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> max_objs,
>  		void *vaddr, rte_iova_t iova, size_t len,
>  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)  {
> -	size_t total_elt_sz;
> +	char *va = vaddr;
> +	size_t total_elt_sz, pg_sz;
>  	size_t off;
>  	unsigned int i;
>  	void *obj;
> +	int ret;
> +
> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> +	if (ret < 0)
> +		return ret;
> 
>  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> 
> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> +	for (off = 0, i = 0; i < max_objs; i++) {
> +		/* avoid objects to cross page boundaries */
> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> off);
> +
> +		if (off + total_elt_sz > len)
> +			break;
> +
>  		off += mp->header_size;
> -		obj = (char *)vaddr + off;
> +		obj = va + off;
>  		obj_cb(mp, obj_cb_arg, obj,
>  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-31  6:54                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-10-31  8:19                               ` Jerin Jacob
  2019-10-31  8:29                                 ` Olivier Matz
  2019-10-31  8:24                               ` Olivier Matz
  1 sibling, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-10-31  8:19 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: Olivier Matz, dev, Anatoly Burakov, Andrew Rybchenko,
	Ferruh Yigit, Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

On Thu, Oct 31, 2019 at 12:25 PM Vamsi Krishna Attunuru
<vattunuru@marvell.com> wrote:
>
> Hi Olivier,
>
> Thanks for reworked patches.
> With V2, Tests with 512MB & 2M page sizes work fine with octeontx2 mempool pmd. One more concern is, octeontx fpa mempool driver also has the similar requirements. How do we address that, Can you suggest the best way to avoid code duplication in PMDs.

# Actually, neither driver calls any HW-specific function in those ops.
# Is it possible to move the code under "/* derived from
rte_mempool_op_calc_mem_size_default() */" into a static function in
the mempool lib? That would keep the generic
rte_mempool_op_calc_mem_size_default() clean, and we could introduce
another variant of rte_mempool_op_calc_mem_size_default() for this
specific requirement which includes the static generic function.

I don't think the requirement to have the object start address aligned
to the object size is a one-off; it is common to HW-based mempool
implementations.

The reason the HW uses such a scheme is the following:
# If the object address is aligned to the object block size, then when
the HW wants to enqueue() a packet back to the pool, it can enqueue the
mbuf address to the mempool irrespective of the free pointer offset.

Example:
X - mbuf start address
Y - the object block size
X + n <- the packet pointer used to send the packet

When submitting the packet to HW for transmission, SW only needs to
provide X + n; when HW frees it, it can derive X (the mbuf pointer
address) with the following arithmetic:
X = (X + n) - ((X + n) MOD Y)
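
A small standalone sketch of that recovery, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* made-up values: Y = 2048 is the object block size */
	uintptr_t X = 0x10080000; /* mbuf start, a multiple of Y by construction */
	uintptr_t n = 128;        /* offset of the packet data inside the mbuf   */
	uintptr_t pkt = X + n;    /* the only pointer handed to the HW           */

	/* X = (X + n) - ((X + n) MOD Y): HW recovers the mbuf without knowing n */
	printf("recovered %#lx, expected %#lx\n",
	       (unsigned long)(pkt - (pkt % 2048)), (unsigned long)X);
	return 0;
}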

Hi Olivier,
It is not worth going back and forth on this code organization. You
can decide on a scheme; we will follow it.


>
> Regards
> A Vamsi
>
> > -----Original Message-----
> > From: Olivier Matz <olivier.matz@6wind.com>
> > Sent: Wednesday, October 30, 2019 8:06 PM
> > To: dev@dpdk.org
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> > <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> > pages
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > When populating a mempool, ensure that objects are not located across several
> > pages, except if user did not request iova contiguous objects.
> >
> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >  drivers/mempool/octeontx2/Makefile           |   3 +
> >  drivers/mempool/octeontx2/meson.build        |   3 +
> >  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
> >  lib/librte_mempool/rte_mempool.c             |  23 ++--
> >  lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
> >  5 files changed, 147 insertions(+), 33 deletions(-)
> >
> > diff --git a/drivers/mempool/octeontx2/Makefile
> > b/drivers/mempool/octeontx2/Makefile
> > index 87cce22c6..d781cbfc6 100644
> > --- a/drivers/mempool/octeontx2/Makefile
> > +++ b/drivers/mempool/octeontx2/Makefile
> > @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> >
> >  LIBABIVER := 1
> >
> > +# for rte_mempool_get_page_size
> > +CFLAGS += -DALLOW_EXPERIMENTAL_API
> > +
> >  #
> >  # all source are stored in SRCS-y
> >  #
> > diff --git a/drivers/mempool/octeontx2/meson.build
> > b/drivers/mempool/octeontx2/meson.build
> > index 9fde40f0e..28f9634da 100644
> > --- a/drivers/mempool/octeontx2/meson.build
> > +++ b/drivers/mempool/octeontx2/meson.build
> > @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
> >
> >  deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> > +
> > +# for rte_mempool_get_page_size
> > +allow_experimental_apis = true
> > diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > index d769575f4..47117aec6 100644
> > --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > @@ -713,12 +713,76 @@ static ssize_t
> >  otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
> >                      uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> > -     /*
> > -      * Simply need space for one more object to be able to
> > -      * fulfill alignment requirements.
> > -      */
> > -     return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
> > pg_shift,
> > -                                                 min_chunk_size, align);
> > +     size_t total_elt_sz;
> > +     size_t obj_per_page, pg_sz, objs_in_last_page;
> > +     size_t mem_size;
> > +
> > +     /* derived from rte_mempool_op_calc_mem_size_default() */
> > +
> > +     total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > +
> > +     if (total_elt_sz == 0) {
> > +             mem_size = 0;
> > +     } else if (pg_shift == 0) {
> > +             /* one object margin to fix alignment */
> > +             mem_size = total_elt_sz * (obj_num + 1);
> > +     } else {
> > +             pg_sz = (size_t)1 << pg_shift;
> > +             obj_per_page = pg_sz / total_elt_sz;
> > +
> > +             /* we need to keep one object to fix alignment */
> > +             if (obj_per_page > 0)
> > +                     obj_per_page--;
> > +
> > +             if (obj_per_page == 0) {
> > +                     /*
> > +                      * Note that if object size is bigger than page size,
> > +                      * then it is assumed that pages are grouped in subsets
> > +                      * of physically continuous pages big enough to store
> > +                      * at least one object.
> > +                      */
> > +                     mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> > +                                             pg_sz) * obj_num;
> > +             } else {
> > +                     /* In the best case, the allocator will return a
> > +                      * page-aligned address. For example, with 5 objs,
> > +                      * the required space is as below:
> > +                      *  |     page0     |     page1     |  page2 (last) |
> > +                      *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> > +                      *  <------------- mem_size ------------->
> > +                      */
> > +                     objs_in_last_page = ((obj_num - 1) % obj_per_page) +
> > 1;
> > +                     /* room required for the last page */
> > +                     mem_size = objs_in_last_page * total_elt_sz;
> > +                     /* room required for other pages */
> > +                     mem_size += ((obj_num - objs_in_last_page) /
> > +                             obj_per_page) << pg_shift;
> > +
> > +                     /* In the worst case, the allocator returns a
> > +                      * non-aligned pointer, wasting up to
> > +                      * total_elt_sz. Add a margin for that.
> > +                      */
> > +                      mem_size += total_elt_sz - 1;
> > +             }
> > +     }
> > +
> > +     *min_chunk_size = total_elt_sz * 2;
> > +     *align = RTE_CACHE_LINE_SIZE;
> > +
> > +     return mem_size;
> > +}
> > +
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > +     if (pg_sz == 0)
> > +             return 0;
> > +     if (elt_sz > pg_sz)
> > +             return 0;
> > +     if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > pg_sz))
> > +             return -1;
> > +     return 0;
> >  }
> >
> >  static int
> > @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
> > unsigned int max_objs, void *vaddr,
> >                 rte_iova_t iova, size_t len,
> >                 rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > {
> > -     size_t total_elt_sz;
> > +     char *va = vaddr;
> > +     size_t total_elt_sz, pg_sz;
> >       size_t off;
> > +     unsigned int i;
> > +     void *obj;
> > +     int ret;
> >
> >       if (iova == RTE_BAD_IOVA)
> >               return -EINVAL;
> > @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
> > unsigned int max_objs, void *vaddr,
> >       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >
> >       /* Align object start address to a multiple of total_elt_sz */
> > -     off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> > +     off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> >
> >       if (len < off)
> >               return -EINVAL;
> >
> > -     vaddr = (char *)vaddr + off;
> > -     iova += off;
> > -     len -= off;
> >
> > -     npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> > +     npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> >
> >       if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
> >               return -EBUSY;
> >
> > -     return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
> > len,
> > -                                            obj_cb, obj_cb_arg);
> > +     /* the following is derived from rte_mempool_op_populate_default() */
> > +
> > +     ret = rte_mempool_get_page_size(mp, &pg_sz);
> > +     if (ret < 0)
> > +             return ret;
> > +
> > +     for (i = 0; i < max_objs; i++) {
> > +             /* avoid objects to cross page boundaries, and align
> > +              * offset to a multiple of total_elt_sz.
> > +              */
> > +             if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> > +                     off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > off);
> > +                     off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> > +                                             total_elt_sz) + 1);
> > +             }
> > +
> > +             if (off + total_elt_sz > len)
> > +                     break;
> > +
> > +             off += mp->header_size;
> > +             obj = va + off;
> > +             obj_cb(mp, obj_cb_arg, obj,
> > +                    (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > +             rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > +             off += mp->elt_size + mp->trailer_size;
> > +     }
> > +
> > +     return i;
> >  }
> >
> >  static struct rte_mempool_ops otx2_npa_ops = { diff --git
> > a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 758c5410b..d3db9273d 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp,
> > size_t *pg_sz)
> >
> >       if (!need_iova_contig_obj)
> >               *pg_sz = 0;
> > -     else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > -             *pg_sz = 0;
> >       else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> >               *pg_sz = get_min_page_size(mp->socket_id);
> >       else
> > @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool
> > *mp)
> >        * then just set page shift and page size to 0, because the user has
> >        * indicated that there's no need to care about anything.
> >        *
> > -      * if we do need contiguous objects, there is also an option to reserve
> > -      * the entire mempool memory as one contiguous block of memory, in
> > -      * which case the page shift and alignment wouldn't matter as well.
> > +      * if we do need contiguous objects (if a mempool driver has its
> > +      * own calc_size() method returning min_chunk_size = mem_size),
> > +      * there is also an option to reserve the entire mempool memory
> > +      * as one contiguous block of memory.
> >        *
> >        * if we require contiguous objects, but not necessarily the entire
> > -      * mempool reserved space to be contiguous, then there are two
> > options.
> > -      *
> > -      * if our IO addresses are virtual, not actual physical (IOVA as VA
> > -      * case), then no page shift needed - our memory allocation will give us
> > -      * contiguous IO memory as far as the hardware is concerned, so
> > -      * act as if we're getting contiguous memory.
> > +      * mempool reserved space to be contiguous, pg_sz will be != 0,
> > +      * and the default ops->populate() will take care of not placing
> > +      * objects across pages.
> >        *
> >        * if our IO addresses are physical, we may get memory from bigger
> >        * pages, or we might get memory from smaller pages, and how much
> > of it @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> >        *
> >        * If we fail to get enough contiguous memory, then we'll go and
> >        * reserve space in smaller chunks.
> > -      *
> > -      * We also have to take into account the fact that memory that we're
> > -      * going to allocate from can belong to an externally allocated memory
> > -      * area, in which case the assumption of IOVA as VA mode being
> > -      * synonymous with IOVA contiguousness will not hold.
> >        */
> >
> >       need_iova_contig_obj = !(mp->flags &
> > MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> > a/lib/librte_mempool/rte_mempool_ops_default.c
> > b/lib/librte_mempool/rte_mempool_ops_default.c
> > index f6aea7662..e5cd4600f 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct
> > rte_mempool *mp,
> >       return mem_size;
> >  }
> >
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > +     if (pg_sz == 0)
> > +             return 0;
> > +     if (elt_sz > pg_sz)
> > +             return 0;
> > +     if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > pg_sz))
> > +             return -1;
> > +     return 0;
> > +}
> > +
> >  int
> >  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > max_objs,
> >               void *vaddr, rte_iova_t iova, size_t len,
> >               rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)  {
> > -     size_t total_elt_sz;
> > +     char *va = vaddr;
> > +     size_t total_elt_sz, pg_sz;
> >       size_t off;
> >       unsigned int i;
> >       void *obj;
> > +     int ret;
> > +
> > +     ret = rte_mempool_get_page_size(mp, &pg_sz);
> > +     if (ret < 0)
> > +             return ret;
> >
> >       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> >
> > -     for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +     for (off = 0, i = 0; i < max_objs; i++) {
> > +             /* avoid objects to cross page boundaries */
> > +             if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > +                     off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > off);
> > +
> > +             if (off + total_elt_sz > len)
> > +                     break;
> > +
> >               off += mp->header_size;
> > -             obj = (char *)vaddr + off;
> > +             obj = va + off;
> >               obj_cb(mp, obj_cb_arg, obj,
> >                      (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> >               rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > --
> > 2.20.1
>

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-31  6:54                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-10-31  8:19                               ` Jerin Jacob
@ 2019-10-31  8:24                               ` Olivier Matz
  2019-10-31  8:33                                 ` Andrew Rybchenko
  1 sibling, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-10-31  8:24 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: dev, Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon

Hi,

On Thu, Oct 31, 2019 at 06:54:50AM +0000, Vamsi Krishna Attunuru wrote:
> Hi Olivier,
> 
> Thanks for the reworked patches.
> With v2, tests with 512MB & 2MB page sizes work fine with the octeontx2
> mempool PMD.

Good to hear.

> One more concern: the octeontx fpa mempool driver also has similar
> requirements. How do we address that? Can you suggest the best way to
> avoid code duplication in PMDs?

Well, we could provide additional helpers in librte_mempool:
rte_mempool_calc_mem_size_helper() and rte_mempool_populate_helper()
that would be internal to the mempool lib + drivers. These helpers could
take an additional parameter to enable the alignment of objects.
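As a very rough sketch of what that could look like (the prototypes below
only illustrate this proposal; the exact parameters and semantics are
assumptions, not an existing API):

    /* Sketch only: internal helpers shared by the mempool lib and drivers.
     * A non-zero obj_align would additionally align each object start to
     * obj_align (e.g. total_elt_sz for the octeontx/octeontx2 requirement).
     */
    ssize_t rte_mempool_calc_mem_size_helper(const struct rte_mempool *mp,
                    uint32_t obj_num, uint32_t pg_shift, size_t obj_align,
                    size_t *min_chunk_size, size_t *align);

    int rte_mempool_populate_helper(struct rte_mempool *mp,
                    unsigned int max_objs, void *vaddr, rte_iova_t iova,
                    size_t len, size_t obj_align,
                    rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);

With obj_align == 0 they would behave like the current default ops.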

But: with this approach, we are moving back driver specificities inside
the common code, and if tomorrow more drivers with different
requirements also ask to factorize them in common code, we may end up
with common functions that must support hardware specificities, making
them harder to maintain.

I agree that duplicating code is not satisfying either, but I think it
is still better than starting to move hw requirements into common code,
given that mempool drivers were introduced for that reason.

@Andrew, any opinion?



> 
> Regards
> A Vamsi 
> 
> > -----Original Message-----
> > From: Olivier Matz <olivier.matz@6wind.com>
> > Sent: Wednesday, October 30, 2019 8:06 PM
> > To: dev@dpdk.org
> > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> > <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> > pages
> > 
> > External Email
> > 
> > ----------------------------------------------------------------------
> > When populating a mempool, ensure that objects are not located across several
> > pages, except if user did not request iova contiguous objects.
> > 
> > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > ---
> >  drivers/mempool/octeontx2/Makefile           |   3 +
> >  drivers/mempool/octeontx2/meson.build        |   3 +
> >  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
> >  lib/librte_mempool/rte_mempool.c             |  23 ++--
> >  lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
> >  5 files changed, 147 insertions(+), 33 deletions(-)
> > 
> > diff --git a/drivers/mempool/octeontx2/Makefile
> > b/drivers/mempool/octeontx2/Makefile
> > index 87cce22c6..d781cbfc6 100644
> > --- a/drivers/mempool/octeontx2/Makefile
> > +++ b/drivers/mempool/octeontx2/Makefile
> > @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> > 
> >  LIBABIVER := 1
> > 
> > +# for rte_mempool_get_page_size
> > +CFLAGS += -DALLOW_EXPERIMENTAL_API
> > +
> >  #
> >  # all source are stored in SRCS-y
> >  #
> > diff --git a/drivers/mempool/octeontx2/meson.build
> > b/drivers/mempool/octeontx2/meson.build
> > index 9fde40f0e..28f9634da 100644
> > --- a/drivers/mempool/octeontx2/meson.build
> > +++ b/drivers/mempool/octeontx2/meson.build
> > @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
> > 
> >  deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> > +
> > +# for rte_mempool_get_page_size
> > +allow_experimental_apis = true
> > diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > index d769575f4..47117aec6 100644
> > --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > @@ -713,12 +713,76 @@ static ssize_t
> >  otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
> >  		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> > -	/*
> > -	 * Simply need space for one more object to be able to
> > -	 * fulfill alignment requirements.
> > -	 */
> > -	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
> > pg_shift,
> > -						    min_chunk_size, align);
> > +	size_t total_elt_sz;
> > +	size_t obj_per_page, pg_sz, objs_in_last_page;
> > +	size_t mem_size;
> > +
> > +	/* derived from rte_mempool_op_calc_mem_size_default() */
> > +
> > +	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > +
> > +	if (total_elt_sz == 0) {
> > +		mem_size = 0;
> > +	} else if (pg_shift == 0) {
> > +		/* one object margin to fix alignment */
> > +		mem_size = total_elt_sz * (obj_num + 1);
> > +	} else {
> > +		pg_sz = (size_t)1 << pg_shift;
> > +		obj_per_page = pg_sz / total_elt_sz;
> > +
> > +		/* we need to keep one object to fix alignment */
> > +		if (obj_per_page > 0)
> > +			obj_per_page--;
> > +
> > +		if (obj_per_page == 0) {
> > +			/*
> > +			 * Note that if object size is bigger than page size,
> > +			 * then it is assumed that pages are grouped in subsets
> > +			 * of physically continuous pages big enough to store
> > +			 * at least one object.
> > +			 */
> > +			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> > +						pg_sz) * obj_num;
> > +		} else {
> > +			/* In the best case, the allocator will return a
> > +			 * page-aligned address. For example, with 5 objs,
> > +			 * the required space is as below:
> > +			 *  |     page0     |     page1     |  page2 (last) |
> > +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> > +			 *  <------------- mem_size ------------->
> > +			 */
> > +			objs_in_last_page = ((obj_num - 1) % obj_per_page) +
> > 1;
> > +			/* room required for the last page */
> > +			mem_size = objs_in_last_page * total_elt_sz;
> > +			/* room required for other pages */
> > +			mem_size += ((obj_num - objs_in_last_page) /
> > +				obj_per_page) << pg_shift;
> > +
> > +			/* In the worst case, the allocator returns a
> > +			 * non-aligned pointer, wasting up to
> > +			 * total_elt_sz. Add a margin for that.
> > +			 */
> > +			 mem_size += total_elt_sz - 1;
> > +		}
> > +	}
> > +
> > +	*min_chunk_size = total_elt_sz * 2;
> > +	*align = RTE_CACHE_LINE_SIZE;
> > +
> > +	return mem_size;
> > +}
> > +
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > pg_sz))
> > +		return -1;
> > +	return 0;
> >  }
> > 
> >  static int
> > @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
> > unsigned int max_objs, void *vaddr,
> >  		  rte_iova_t iova, size_t len,
> >  		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > {
> > -	size_t total_elt_sz;
> > +	char *va = vaddr;
> > +	size_t total_elt_sz, pg_sz;
> >  	size_t off;
> > +	unsigned int i;
> > +	void *obj;
> > +	int ret;
> > 
> >  	if (iova == RTE_BAD_IOVA)
> >  		return -EINVAL;
> > @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
> > unsigned int max_objs, void *vaddr,
> >  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > 
> >  	/* Align object start address to a multiple of total_elt_sz */
> > -	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> > +	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> > 
> >  	if (len < off)
> >  		return -EINVAL;
> > 
> > -	vaddr = (char *)vaddr + off;
> > -	iova += off;
> > -	len -= off;
> > 
> > -	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> > +	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> > 
> >  	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
> >  		return -EBUSY;
> > 
> > -	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
> > len,
> > -					       obj_cb, obj_cb_arg);
> > +	/* the following is derived from rte_mempool_op_populate_default() */
> > +
> > +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> > +	if (ret < 0)
> > +		return ret;
> > +
> > +	for (i = 0; i < max_objs; i++) {
> > +		/* avoid objects to cross page boundaries, and align
> > +		 * offset to a multiple of total_elt_sz.
> > +		 */
> > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > off);
> > +			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> > +						total_elt_sz) + 1);
> > +		}
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> > +		off += mp->header_size;
> > +		obj = va + off;
> > +		obj_cb(mp, obj_cb_arg, obj,
> > +		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > +		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > +		off += mp->elt_size + mp->trailer_size;
> > +	}
> > +
> > +	return i;
> >  }
> > 
> >  static struct rte_mempool_ops otx2_npa_ops = { diff --git
> > a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > index 758c5410b..d3db9273d 100644
> > --- a/lib/librte_mempool/rte_mempool.c
> > +++ b/lib/librte_mempool/rte_mempool.c
> > @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp,
> > size_t *pg_sz)
> > 
> >  	if (!need_iova_contig_obj)
> >  		*pg_sz = 0;
> > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > -		*pg_sz = 0;
> >  	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> >  		*pg_sz = get_min_page_size(mp->socket_id);
> >  	else
> > @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool
> > *mp)
> >  	 * then just set page shift and page size to 0, because the user has
> >  	 * indicated that there's no need to care about anything.
> >  	 *
> > -	 * if we do need contiguous objects, there is also an option to reserve
> > -	 * the entire mempool memory as one contiguous block of memory, in
> > -	 * which case the page shift and alignment wouldn't matter as well.
> > +	 * if we do need contiguous objects (if a mempool driver has its
> > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > +	 * there is also an option to reserve the entire mempool memory
> > +	 * as one contiguous block of memory.
> >  	 *
> >  	 * if we require contiguous objects, but not necessarily the entire
> > -	 * mempool reserved space to be contiguous, then there are two
> > options.
> > -	 *
> > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > -	 * case), then no page shift needed - our memory allocation will give us
> > -	 * contiguous IO memory as far as the hardware is concerned, so
> > -	 * act as if we're getting contiguous memory.
> > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > +	 * and the default ops->populate() will take care of not placing
> > +	 * objects across pages.
> >  	 *
> >  	 * if our IO addresses are physical, we may get memory from bigger
> >  	 * pages, or we might get memory from smaller pages, and how much
> > of it @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct
> > rte_mempool *mp)
> >  	 *
> >  	 * If we fail to get enough contiguous memory, then we'll go and
> >  	 * reserve space in smaller chunks.
> > -	 *
> > -	 * We also have to take into account the fact that memory that we're
> > -	 * going to allocate from can belong to an externally allocated memory
> > -	 * area, in which case the assumption of IOVA as VA mode being
> > -	 * synonymous with IOVA contiguousness will not hold.
> >  	 */
> > 
> >  	need_iova_contig_obj = !(mp->flags &
> > MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> > a/lib/librte_mempool/rte_mempool_ops_default.c
> > b/lib/librte_mempool/rte_mempool_ops_default.c
> > index f6aea7662..e5cd4600f 100644
> > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct
> > rte_mempool *mp,
> >  	return mem_size;
> >  }
> > 
> > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > +	if (pg_sz == 0)
> > +		return 0;
> > +	if (elt_sz > pg_sz)
> > +		return 0;
> > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > pg_sz))
> > +		return -1;
> > +	return 0;
> > +}
> > +
> >  int
> >  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > max_objs,
> >  		void *vaddr, rte_iova_t iova, size_t len,
> >  		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)  {
> > -	size_t total_elt_sz;
> > +	char *va = vaddr;
> > +	size_t total_elt_sz, pg_sz;
> >  	size_t off;
> >  	unsigned int i;
> >  	void *obj;
> > +	int ret;
> > +
> > +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> > +	if (ret < 0)
> > +		return ret;
> > 
> >  	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > 
> > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > +	for (off = 0, i = 0; i < max_objs; i++) {
> > +		/* avoid objects to cross page boundaries */
> > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > off);
> > +
> > +		if (off + total_elt_sz > len)
> > +			break;
> > +
> >  		off += mp->header_size;
> > -		obj = (char *)vaddr + off;
> > +		obj = va + off;
> >  		obj_cb(mp, obj_cb_arg, obj,
> >  		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> >  		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > --
> > 2.20.1
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-31  8:19                               ` Jerin Jacob
@ 2019-10-31  8:29                                 ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-31  8:29 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Vamsi Krishna Attunuru, dev, Anatoly Burakov, Andrew Rybchenko,
	Ferruh Yigit, Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

Hi Jerin,

On Thu, Oct 31, 2019 at 01:49:10PM +0530, Jerin Jacob wrote:
> On Thu, Oct 31, 2019 at 12:25 PM Vamsi Krishna Attunuru
> <vattunuru@marvell.com> wrote:
> >
> > Hi Olivier,
> >
> > Thanks for the reworked patches.
> > With v2, tests with 512MB & 2MB page sizes work fine with the octeontx2 mempool PMD. One more concern: the octeontx fpa mempool driver also has similar requirements. How do we address that? Can you suggest the best way to avoid code duplication in PMDs?
> 
> # Actually, neither driver calls any HW-specific function in those ops.
> # Is it possible to move the code under "/* derived from
> rte_mempool_op_calc_mem_size_default() */" to a static function in the
> mempool lib? That would keep the generic
> rte_mempool_op_calc_mem_size_default() clean, and we could introduce
> another variant of rte_mempool_op_calc_mem_size_default() for this
> specific requirement which includes the static generic function.
> 
> I don't think it is a one-off requirement to have the object size
> aligned to the object start address in all HW-based mempool
> implementations.
> 
> The reason the HW uses such a scheme is the following:
> # If the object size is aligned to the object block size, then when HW
> wants to enqueue() a packet back to the pool, it can enqueue the mbuf
> address to the mempool irrespective of the free pointer offset.
> 
> Example:
> X - mbuf start address
> Y - the object block size
> X + n <- is the packet pointer to send the packet
> 
> When submitting the packet to HW, SW only needs to pass X + n, and
> when HW frees it, it can derive X (the mbuf pointer address) with the
> following arithmetic:
> X = (X + n) - ((X + n) MOD Y)
> 
> Hi Olivier,
> It is not worth going back and forth on this code organization. You
> can decide on a scheme; we will follow it.

Thanks for the explanation.
Our mails crossed each other. Please see my answer to Vamsi.

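A minimal, self-contained sketch of the pointer-recovery arithmetic Jerin
describes above, in plain C (the block size and offsets are made-up example
values, and the function name is only illustrative):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* X = (X + n) - ((X + n) MOD Y): recover the block (mbuf) start address
     * from an interior pointer; valid only when X itself is a multiple of Y,
     * which is exactly the alignment requirement discussed in this thread.
     */
    static uintptr_t
    block_start(uintptr_t ptr, uintptr_t blk_sz)
    {
            return ptr - (ptr % blk_sz);
    }

    int main(void)
    {
            uintptr_t Y = 2176;       /* example block size (total_elt_sz) */
            uintptr_t X = 10 * Y;     /* object start, aligned to Y */
            uintptr_t pkt = X + 128;  /* interior pointer handed to HW */

            printf("recovered 0x%" PRIxPTR ", expected 0x%" PRIxPTR "\n",
                   block_start(pkt, Y), X);
            return 0;
    }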

> 
> 
> >
> > Regards
> > A Vamsi
> >
> > > -----Original Message-----
> > > From: Olivier Matz <olivier.matz@6wind.com>
> > > Sent: Wednesday, October 30, 2019 8:06 PM
> > > To: dev@dpdk.org
> > > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > > Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> > > <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> > > pages
> > >
> > > External Email
> > >
> > > ----------------------------------------------------------------------
> > > When populating a mempool, ensure that objects are not located across several
> > > pages, except if user did not request iova contiguous objects.
> > >
> > > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > ---
> > >  drivers/mempool/octeontx2/Makefile           |   3 +
> > >  drivers/mempool/octeontx2/meson.build        |   3 +
> > >  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
> > >  lib/librte_mempool/rte_mempool.c             |  23 ++--
> > >  lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
> > >  5 files changed, 147 insertions(+), 33 deletions(-)
> > >
> > > diff --git a/drivers/mempool/octeontx2/Makefile
> > > b/drivers/mempool/octeontx2/Makefile
> > > index 87cce22c6..d781cbfc6 100644
> > > --- a/drivers/mempool/octeontx2/Makefile
> > > +++ b/drivers/mempool/octeontx2/Makefile
> > > @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> > >
> > >  LIBABIVER := 1
> > >
> > > +# for rte_mempool_get_page_size
> > > +CFLAGS += -DALLOW_EXPERIMENTAL_API
> > > +
> > >  #
> > >  # all source are stored in SRCS-y
> > >  #
> > > diff --git a/drivers/mempool/octeontx2/meson.build
> > > b/drivers/mempool/octeontx2/meson.build
> > > index 9fde40f0e..28f9634da 100644
> > > --- a/drivers/mempool/octeontx2/meson.build
> > > +++ b/drivers/mempool/octeontx2/meson.build
> > > @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
> > >
> > >  deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> > > +
> > > +# for rte_mempool_get_page_size
> > > +allow_experimental_apis = true
> > > diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > index d769575f4..47117aec6 100644
> > > --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > @@ -713,12 +713,76 @@ static ssize_t
> > >  otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
> > >                      uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> > > -     /*
> > > -      * Simply need space for one more object to be able to
> > > -      * fulfill alignment requirements.
> > > -      */
> > > -     return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
> > > pg_shift,
> > > -                                                 min_chunk_size, align);
> > > +     size_t total_elt_sz;
> > > +     size_t obj_per_page, pg_sz, objs_in_last_page;
> > > +     size_t mem_size;
> > > +
> > > +     /* derived from rte_mempool_op_calc_mem_size_default() */
> > > +
> > > +     total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > +
> > > +     if (total_elt_sz == 0) {
> > > +             mem_size = 0;
> > > +     } else if (pg_shift == 0) {
> > > +             /* one object margin to fix alignment */
> > > +             mem_size = total_elt_sz * (obj_num + 1);
> > > +     } else {
> > > +             pg_sz = (size_t)1 << pg_shift;
> > > +             obj_per_page = pg_sz / total_elt_sz;
> > > +
> > > +             /* we need to keep one object to fix alignment */
> > > +             if (obj_per_page > 0)
> > > +                     obj_per_page--;
> > > +
> > > +             if (obj_per_page == 0) {
> > > +                     /*
> > > +                      * Note that if object size is bigger than page size,
> > > +                      * then it is assumed that pages are grouped in subsets
> > > +                      * of physically continuous pages big enough to store
> > > +                      * at least one object.
> > > +                      */
> > > +                     mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> > > +                                             pg_sz) * obj_num;
> > > +             } else {
> > > +                     /* In the best case, the allocator will return a
> > > +                      * page-aligned address. For example, with 5 objs,
> > > +                      * the required space is as below:
> > > +                      *  |     page0     |     page1     |  page2 (last) |
> > > +                      *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> > > +                      *  <------------- mem_size ------------->
> > > +                      */
> > > +                     objs_in_last_page = ((obj_num - 1) % obj_per_page) +
> > > 1;
> > > +                     /* room required for the last page */
> > > +                     mem_size = objs_in_last_page * total_elt_sz;
> > > +                     /* room required for other pages */
> > > +                     mem_size += ((obj_num - objs_in_last_page) /
> > > +                             obj_per_page) << pg_shift;
> > > +
> > > +                     /* In the worst case, the allocator returns a
> > > +                      * non-aligned pointer, wasting up to
> > > +                      * total_elt_sz. Add a margin for that.
> > > +                      */
> > > +                      mem_size += total_elt_sz - 1;
> > > +             }
> > > +     }
> > > +
> > > +     *min_chunk_size = total_elt_sz * 2;
> > > +     *align = RTE_CACHE_LINE_SIZE;
> > > +
> > > +     return mem_size;
> > > +}
> > > +
> > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > > +     if (pg_sz == 0)
> > > +             return 0;
> > > +     if (elt_sz > pg_sz)
> > > +             return 0;
> > > +     if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > > pg_sz))
> > > +             return -1;
> > > +     return 0;
> > >  }
> > >
> > >  static int
> > > @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
> > > unsigned int max_objs, void *vaddr,
> > >                 rte_iova_t iova, size_t len,
> > >                 rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > {
> > > -     size_t total_elt_sz;
> > > +     char *va = vaddr;
> > > +     size_t total_elt_sz, pg_sz;
> > >       size_t off;
> > > +     unsigned int i;
> > > +     void *obj;
> > > +     int ret;
> > >
> > >       if (iova == RTE_BAD_IOVA)
> > >               return -EINVAL;
> > > @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
> > > unsigned int max_objs, void *vaddr,
> > >       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > >
> > >       /* Align object start address to a multiple of total_elt_sz */
> > > -     off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> > > +     off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> > >
> > >       if (len < off)
> > >               return -EINVAL;
> > >
> > > -     vaddr = (char *)vaddr + off;
> > > -     iova += off;
> > > -     len -= off;
> > >
> > > -     npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> > > +     npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> > >
> > >       if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
> > >               return -EBUSY;
> > >
> > > -     return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
> > > len,
> > > -                                            obj_cb, obj_cb_arg);
> > > +     /* the following is derived from rte_mempool_op_populate_default() */
> > > +
> > > +     ret = rte_mempool_get_page_size(mp, &pg_sz);
> > > +     if (ret < 0)
> > > +             return ret;
> > > +
> > > +     for (i = 0; i < max_objs; i++) {
> > > +             /* avoid objects to cross page boundaries, and align
> > > +              * offset to a multiple of total_elt_sz.
> > > +              */
> > > +             if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> > > +                     off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > > off);
> > > +                     off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> > > +                                             total_elt_sz) + 1);
> > > +             }
> > > +
> > > +             if (off + total_elt_sz > len)
> > > +                     break;
> > > +
> > > +             off += mp->header_size;
> > > +             obj = va + off;
> > > +             obj_cb(mp, obj_cb_arg, obj,
> > > +                    (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > > +             rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > > +             off += mp->elt_size + mp->trailer_size;
> > > +     }
> > > +
> > > +     return i;
> > >  }
> > >
> > >  static struct rte_mempool_ops otx2_npa_ops = { diff --git
> > > a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > > index 758c5410b..d3db9273d 100644
> > > --- a/lib/librte_mempool/rte_mempool.c
> > > +++ b/lib/librte_mempool/rte_mempool.c
> > > @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp,
> > > size_t *pg_sz)
> > >
> > >       if (!need_iova_contig_obj)
> > >               *pg_sz = 0;
> > > -     else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > > -             *pg_sz = 0;
> > >       else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> > >               *pg_sz = get_min_page_size(mp->socket_id);
> > >       else
> > > @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool
> > > *mp)
> > >        * then just set page shift and page size to 0, because the user has
> > >        * indicated that there's no need to care about anything.
> > >        *
> > > -      * if we do need contiguous objects, there is also an option to reserve
> > > -      * the entire mempool memory as one contiguous block of memory, in
> > > -      * which case the page shift and alignment wouldn't matter as well.
> > > +      * if we do need contiguous objects (if a mempool driver has its
> > > +      * own calc_size() method returning min_chunk_size = mem_size),
> > > +      * there is also an option to reserve the entire mempool memory
> > > +      * as one contiguous block of memory.
> > >        *
> > >        * if we require contiguous objects, but not necessarily the entire
> > > -      * mempool reserved space to be contiguous, then there are two
> > > options.
> > > -      *
> > > -      * if our IO addresses are virtual, not actual physical (IOVA as VA
> > > -      * case), then no page shift needed - our memory allocation will give us
> > > -      * contiguous IO memory as far as the hardware is concerned, so
> > > -      * act as if we're getting contiguous memory.
> > > +      * mempool reserved space to be contiguous, pg_sz will be != 0,
> > > +      * and the default ops->populate() will take care of not placing
> > > +      * objects across pages.
> > >        *
> > >        * if our IO addresses are physical, we may get memory from bigger
> > >        * pages, or we might get memory from smaller pages, and how much
> > > of it @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct
> > > rte_mempool *mp)
> > >        *
> > >        * If we fail to get enough contiguous memory, then we'll go and
> > >        * reserve space in smaller chunks.
> > > -      *
> > > -      * We also have to take into account the fact that memory that we're
> > > -      * going to allocate from can belong to an externally allocated memory
> > > -      * area, in which case the assumption of IOVA as VA mode being
> > > -      * synonymous with IOVA contiguousness will not hold.
> > >        */
> > >
> > >       need_iova_contig_obj = !(mp->flags &
> > > MEMPOOL_F_NO_IOVA_CONTIG); diff --git
> > > a/lib/librte_mempool/rte_mempool_ops_default.c
> > > b/lib/librte_mempool/rte_mempool_ops_default.c
> > > index f6aea7662..e5cd4600f 100644
> > > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > > @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct
> > > rte_mempool *mp,
> > >       return mem_size;
> > >  }
> > >
> > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > > +     if (pg_sz == 0)
> > > +             return 0;
> > > +     if (elt_sz > pg_sz)
> > > +             return 0;
> > > +     if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > > pg_sz))
> > > +             return -1;
> > > +     return 0;
> > > +}
> > > +
> > >  int
> > >  rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
> > > max_objs,
> > >               void *vaddr, rte_iova_t iova, size_t len,
> > >               rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)  {
> > > -     size_t total_elt_sz;
> > > +     char *va = vaddr;
> > > +     size_t total_elt_sz, pg_sz;
> > >       size_t off;
> > >       unsigned int i;
> > >       void *obj;
> > > +     int ret;
> > > +
> > > +     ret = rte_mempool_get_page_size(mp, &pg_sz);
> > > +     if (ret < 0)
> > > +             return ret;
> > >
> > >       total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > >
> > > -     for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > +     for (off = 0, i = 0; i < max_objs; i++) {
> > > +             /* avoid objects to cross page boundaries */
> > > +             if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > +                     off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > > off);
> > > +
> > > +             if (off + total_elt_sz > len)
> > > +                     break;
> > > +
> > >               off += mp->header_size;
> > > -             obj = (char *)vaddr + off;
> > > +             obj = va + off;
> > >               obj_cb(mp, obj_cb_arg, obj,
> > >                      (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > >               rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > > --
> > > 2.20.1
> >

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-31  8:24                               ` Olivier Matz
@ 2019-10-31  8:33                                 ` Andrew Rybchenko
  2019-10-31  8:45                                   ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-10-31  8:33 UTC (permalink / raw)
  To: Olivier Matz, Vamsi Krishna Attunuru
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon

On 10/31/19 11:24 AM, Olivier Matz wrote:
> Hi,
>
> On Thu, Oct 31, 2019 at 06:54:50AM +0000, Vamsi Krishna Attunuru wrote:
>> Hi Olivier,
>>
>> Thanks for the reworked patches.
>> With v2, tests with 512MB & 2MB page sizes work fine with the octeontx2
>> mempool PMD.
> Good to hear.
>
>> One more concern: the octeontx fpa mempool driver also has similar
>> requirements. How do we address that? Can you suggest the best way to
>> avoid code duplication in PMDs?
> Well, we could provide additional helpers in librte_mempool:
> rte_mempool_calc_mem_size_helper() and rte_mempool_populate_helper()
> that would be internal to the mempool lib + drivers. These helpers could
> take an additional parameter to enable the alignment of objects.
>
> But: with this approach, we are moving back driver specificities inside
> the common code, and if tomorrow more drivers with different
> requirements also ask to factorize them in common code, we may end up
> with common functions that must support hardware specificities, making
> them harder to maintain.
>
> I agree that duplicating code is not satisfying either, but I think it
> is still better than starting to move hw requirements into common code,
> given that mempool drivers were introduced for that reason.
>
> @Andrew, any opinion?

I think I'd prefer to have a helper function. It avoids HW-specific
flags in the external interface (which was discussed initially), but also
avoids code duplication, which makes it easier to maintain.
Maybe we will reconsider it in the future, but right now I think
it is the best option.
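
For illustration only, a driver populate op built on top of such a helper
could look roughly like the sketch below; rte_mempool_populate_helper() and
its obj_align parameter are the proposal under discussion, not an existing
API, and the HW-specific setup is elided:

    /* Sketch: HW mempool populate op delegating to the proposed helper.
     * Passing total_elt_sz as obj_align asks the helper to start every
     * object on a multiple of the block size (the HW requirement explained
     * earlier in this thread); 0 would keep the default behaviour.
     */
    static int
    hw_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
                    rte_iova_t iova, size_t len,
                    rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
    {
            size_t total_elt_sz;

            total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;

            /* ... HW-specific setup (e.g. aura range registration) ... */

            return rte_mempool_populate_helper(mp, max_objs, vaddr, iova, len,
                                               total_elt_sz /* obj_align */,
                                               obj_cb, obj_cb_arg);
    }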

>> Regards
>> A Vamsi
>>
>>> -----Original Message-----
>>> From: Olivier Matz <olivier.matz@6wind.com>
>>> Sent: Wednesday, October 30, 2019 8:06 PM
>>> To: dev@dpdk.org
>>> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
>>> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
>>> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
>>> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
>>> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
>>> <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
>>> Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
>>> pages
>>>
>>> External Email
>>>
>>> ----------------------------------------------------------------------
>>> When populating a mempool, ensure that objects are not located across several
>>> pages, except if user did not request iova contiguous objects.
>>>
>>> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>>> ---
>>>   drivers/mempool/octeontx2/Makefile           |   3 +
>>>   drivers/mempool/octeontx2/meson.build        |   3 +
>>>   drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
>>>   lib/librte_mempool/rte_mempool.c             |  23 ++--
>>>   lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
>>>   5 files changed, 147 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/drivers/mempool/octeontx2/Makefile
>>> b/drivers/mempool/octeontx2/Makefile
>>> index 87cce22c6..d781cbfc6 100644
>>> --- a/drivers/mempool/octeontx2/Makefile
>>> +++ b/drivers/mempool/octeontx2/Makefile
>>> @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
>>>
>>>   LIBABIVER := 1
>>>
>>> +# for rte_mempool_get_page_size
>>> +CFLAGS += -DALLOW_EXPERIMENTAL_API
>>> +
>>>   #
>>>   # all source are stored in SRCS-y
>>>   #
>>> diff --git a/drivers/mempool/octeontx2/meson.build
>>> b/drivers/mempool/octeontx2/meson.build
>>> index 9fde40f0e..28f9634da 100644
>>> --- a/drivers/mempool/octeontx2/meson.build
>>> +++ b/drivers/mempool/octeontx2/meson.build
>>> @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
>>>
>>>   deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
>>> +
>>> +# for rte_mempool_get_page_size
>>> +allow_experimental_apis = true
>>> diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
>>> b/drivers/mempool/octeontx2/otx2_mempool_ops.c
>>> index d769575f4..47117aec6 100644
>>> --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
>>> +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
>>> @@ -713,12 +713,76 @@ static ssize_t
>>>   otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
>>>   		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
>>> -	/*
>>> -	 * Simply need space for one more object to be able to
>>> -	 * fulfill alignment requirements.
>>> -	 */
>>> -	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
>>> pg_shift,
>>> -						    min_chunk_size, align);
>>> +	size_t total_elt_sz;
>>> +	size_t obj_per_page, pg_sz, objs_in_last_page;
>>> +	size_t mem_size;
>>> +
>>> +	/* derived from rte_mempool_op_calc_mem_size_default() */
>>> +
>>> +	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>>> +
>>> +	if (total_elt_sz == 0) {
>>> +		mem_size = 0;
>>> +	} else if (pg_shift == 0) {
>>> +		/* one object margin to fix alignment */
>>> +		mem_size = total_elt_sz * (obj_num + 1);
>>> +	} else {
>>> +		pg_sz = (size_t)1 << pg_shift;
>>> +		obj_per_page = pg_sz / total_elt_sz;
>>> +
>>> +		/* we need to keep one object to fix alignment */
>>> +		if (obj_per_page > 0)
>>> +			obj_per_page--;
>>> +
>>> +		if (obj_per_page == 0) {
>>> +			/*
>>> +			 * Note that if object size is bigger than page size,
>>> +			 * then it is assumed that pages are grouped in subsets
>>> +			 * of physically continuous pages big enough to store
>>> +			 * at least one object.
>>> +			 */
>>> +			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
>>> +						pg_sz) * obj_num;
>>> +		} else {
>>> +			/* In the best case, the allocator will return a
>>> +			 * page-aligned address. For example, with 5 objs,
>>> +			 * the required space is as below:
>>> +			 *  |     page0     |     page1     |  page2 (last) |
>>> +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
>>> +			 *  <------------- mem_size ------------->
>>> +			 */
>>> +			objs_in_last_page = ((obj_num - 1) % obj_per_page) +
>>> 1;
>>> +			/* room required for the last page */
>>> +			mem_size = objs_in_last_page * total_elt_sz;
>>> +			/* room required for other pages */
>>> +			mem_size += ((obj_num - objs_in_last_page) /
>>> +				obj_per_page) << pg_shift;
>>> +
>>> +			/* In the worst case, the allocator returns a
>>> +			 * non-aligned pointer, wasting up to
>>> +			 * total_elt_sz. Add a margin for that.
>>> +			 */
>>> +			 mem_size += total_elt_sz - 1;
>>> +		}
>>> +	}
>>> +
>>> +	*min_chunk_size = total_elt_sz * 2;
>>> +	*align = RTE_CACHE_LINE_SIZE;
>>> +
>>> +	return mem_size;
>>> +}
>>> +
>>> +/* Returns -1 if object crosses a page boundary, else returns 0 */
>>> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
>>> +	if (pg_sz == 0)
>>> +		return 0;
>>> +	if (elt_sz > pg_sz)
>>> +		return 0;
>>> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
>>> pg_sz))
>>> +		return -1;
>>> +	return 0;
>>>   }
>>>
>>>   static int
>>> @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
>>> unsigned int max_objs, void *vaddr,
>>>   		  rte_iova_t iova, size_t len,
>>>   		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
>>> {
>>> -	size_t total_elt_sz;
>>> +	char *va = vaddr;
>>> +	size_t total_elt_sz, pg_sz;
>>>   	size_t off;
>>> +	unsigned int i;
>>> +	void *obj;
>>> +	int ret;
>>>
>>>   	if (iova == RTE_BAD_IOVA)
>>>   		return -EINVAL;
>>> @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
>>> unsigned int max_objs, void *vaddr,
>>>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>>>
>>>   	/* Align object start address to a multiple of total_elt_sz */
>>> -	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
>>> +	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
>>>
>>>   	if (len < off)
>>>   		return -EINVAL;
>>>
>>> -	vaddr = (char *)vaddr + off;
>>> -	iova += off;
>>> -	len -= off;
>>>
>>> -	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
>>> +	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
>>>
>>>   	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
>>>   		return -EBUSY;
>>>
>>> -	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
>>> len,
>>> -					       obj_cb, obj_cb_arg);
>>> +	/* the following is derived from rte_mempool_op_populate_default() */
>>> +
>>> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
>>> +	if (ret < 0)
>>> +		return ret;
>>> +
>>> +	for (i = 0; i < max_objs; i++) {
>>> +		/* avoid objects to cross page boundaries, and align
>>> +		 * offset to a multiple of total_elt_sz.
>>> +		 */
>>> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
>>> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
>>> off);
>>> +			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
>>> +						total_elt_sz) + 1);
>>> +		}
>>> +
>>> +		if (off + total_elt_sz > len)
>>> +			break;
>>> +
>>> +		off += mp->header_size;
>>> +		obj = va + off;
>>> +		obj_cb(mp, obj_cb_arg, obj,
>>> +		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>>> +		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>>> +		off += mp->elt_size + mp->trailer_size;
>>> +	}
>>> +
>>> +	return i;
>>>   }
>>>
>>>   static struct rte_mempool_ops otx2_npa_ops = { diff --git
>>> a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
>>> index 758c5410b..d3db9273d 100644
>>> --- a/lib/librte_mempool/rte_mempool.c
>>> +++ b/lib/librte_mempool/rte_mempool.c
>>> @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp,
>>> size_t *pg_sz)
>>>
>>>   	if (!need_iova_contig_obj)
>>>   		*pg_sz = 0;
>>> -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
>>> -		*pg_sz = 0;
>>>   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
>>>   		*pg_sz = get_min_page_size(mp->socket_id);
>>>   	else
>>> @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool
>>> *mp)
>>>   	 * then just set page shift and page size to 0, because the user has
>>>   	 * indicated that there's no need to care about anything.
>>>   	 *
>>> -	 * if we do need contiguous objects, there is also an option to reserve
>>> -	 * the entire mempool memory as one contiguous block of memory, in
>>> -	 * which case the page shift and alignment wouldn't matter as well.
>>> +	 * if we do need contiguous objects (if a mempool driver has its
>>> +	 * own calc_size() method returning min_chunk_size = mem_size),
>>> +	 * there is also an option to reserve the entire mempool memory
>>> +	 * as one contiguous block of memory.
>>>   	 *
>>>   	 * if we require contiguous objects, but not necessarily the entire
>>> -	 * mempool reserved space to be contiguous, then there are two
>>> options.
>>> -	 *
>>> -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
>>> -	 * case), then no page shift needed - our memory allocation will give us
>>> -	 * contiguous IO memory as far as the hardware is concerned, so
>>> -	 * act as if we're getting contiguous memory.
>>> +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
>>> +	 * and the default ops->populate() will take care of not placing
>>> +	 * objects across pages.
>>>   	 *
>>>   	 * if our IO addresses are physical, we may get memory from bigger
>>>   	 * pages, or we might get memory from smaller pages, and how much
>>> of it @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct
>>> rte_mempool *mp)
>>>   	 *
>>>   	 * If we fail to get enough contiguous memory, then we'll go and
>>>   	 * reserve space in smaller chunks.
>>> -	 *
>>> -	 * We also have to take into account the fact that memory that we're
>>> -	 * going to allocate from can belong to an externally allocated memory
>>> -	 * area, in which case the assumption of IOVA as VA mode being
>>> -	 * synonymous with IOVA contiguousness will not hold.
>>>   	 */
>>>
>>>   	need_iova_contig_obj = !(mp->flags &
>>> MEMPOOL_F_NO_IOVA_CONTIG); diff --git
>>> a/lib/librte_mempool/rte_mempool_ops_default.c
>>> b/lib/librte_mempool/rte_mempool_ops_default.c
>>> index f6aea7662..e5cd4600f 100644
>>> --- a/lib/librte_mempool/rte_mempool_ops_default.c
>>> +++ b/lib/librte_mempool/rte_mempool_ops_default.c
>>> @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct
>>> rte_mempool *mp,
>>>   	return mem_size;
>>>   }
>>>
>>> +/* Returns -1 if object crosses a page boundary, else returns 0 */
>>> +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
>>> +	if (pg_sz == 0)
>>> +		return 0;
>>> +	if (elt_sz > pg_sz)
>>> +		return 0;
>>> +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
>>> pg_sz))
>>> +		return -1;
>>> +	return 0;
>>> +}
>>> +
>>>   int
>>>   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int
>>> max_objs,
>>>   		void *vaddr, rte_iova_t iova, size_t len,
>>>   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)  {
>>> -	size_t total_elt_sz;
>>> +	char *va = vaddr;
>>> +	size_t total_elt_sz, pg_sz;
>>>   	size_t off;
>>>   	unsigned int i;
>>>   	void *obj;
>>> +	int ret;
>>> +
>>> +	ret = rte_mempool_get_page_size(mp, &pg_sz);
>>> +	if (ret < 0)
>>> +		return ret;
>>>
>>>   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
>>>
>>> -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
>>> +	for (off = 0, i = 0; i < max_objs; i++) {
>>> +		/* avoid objects to cross page boundaries */
>>> +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
>>> +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
>>> off);
>>> +
>>> +		if (off + total_elt_sz > len)
>>> +			break;
>>> +
>>>   		off += mp->header_size;
>>> -		obj = (char *)vaddr + off;
>>> +		obj = va + off;
>>>   		obj_cb(mp, obj_cb_arg, obj,
>>>   		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
>>>   		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
>>> --
>>> 2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 5/6] mempool: prevent objects from being across pages
  2019-10-31  8:33                                 ` Andrew Rybchenko
@ 2019-10-31  8:45                                   ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-10-31  8:45 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: Vamsi Krishna Attunuru, dev, Anatoly Burakov, Ferruh Yigit,
	Giridharan, Ganesan, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Stephen Hemminger, Thomas Monjalon

On Thu, Oct 31, 2019 at 11:33:30AM +0300, Andrew Rybchenko wrote:
> On 10/31/19 11:24 AM, Olivier Matz wrote:
> > Hi,
> > 
> > On Thu, Oct 31, 2019 at 06:54:50AM +0000, Vamsi Krishna Attunuru wrote:
> > > Hi Olivier,
> > > 
> > > Thanks for the reworked patches.
> > > With v2, tests with 512MB & 2MB page sizes work fine with the octeontx2
> > > mempool PMD.
> > Good to hear.
> > 
> > > One more concern: the octeontx fpa mempool driver also has similar
> > > requirements. How do we address that? Can you suggest the best way to
> > > avoid code duplication in PMDs?
> > Well, we could provide additional helpers in librte_mempool:
> > rte_mempool_calc_mem_size_helper() and rte_mempool_populate_helper()
> > that would be internal to the mempool lib + drivers. These helpers could
> > take an additional parameter to enable the alignment of objects.
> > 
> > But: with this approach, we are moving back driver specificities inside
> > the common code, and if tomorrow more drivers with different
> > requirements also ask to factorize them in common code, we may end up
> > with common functions that must support hardware specificities, making
> > them harder to maintain.
> > 
> > I agree that duplicating code is not satisfying either, but I think it
> > is still better than starting to move hw requirements into common code,
> > given that mempool drivers were introduced for that reason.
> > 
> > @Andrew, any opinion?
> 
> I think I'd prefer to have a helper function. It avoids HW-specific
> flags in the external interface (which was discussed initially), but also
> avoids code duplication, which makes it easier to maintain.
> Maybe we will reconsider it in the future, but right now I think
> it is the best option.

OK, let me try to propose a v3 with helpers like this, and we
can discuss on this basis.


> 
> > > Regards
> > > A Vamsi
> > > 
> > > > -----Original Message-----
> > > > From: Olivier Matz <olivier.matz@6wind.com>
> > > > Sent: Wednesday, October 30, 2019 8:06 PM
> > > > To: dev@dpdk.org
> > > > Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> > > > <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> > > > Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> > > > <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > > > Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> > > > <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > > Subject: [EXT] [PATCH v2 5/6] mempool: prevent objects from being across
> > > > pages
> > > > 
> > > > External Email
> > > > 
> > > > ----------------------------------------------------------------------
> > > > When populating a mempool, ensure that objects are not located across several
> > > > pages, except if user did not request iova contiguous objects.
> > > > 
> > > > Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > > > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > > > ---
> > > >   drivers/mempool/octeontx2/Makefile           |   3 +
> > > >   drivers/mempool/octeontx2/meson.build        |   3 +
> > > >   drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 ++++++++++++++++---
> > > >   lib/librte_mempool/rte_mempool.c             |  23 ++--
> > > >   lib/librte_mempool/rte_mempool_ops_default.c |  32 ++++-
> > > >   5 files changed, 147 insertions(+), 33 deletions(-)
> > > > 
> > > > diff --git a/drivers/mempool/octeontx2/Makefile
> > > > b/drivers/mempool/octeontx2/Makefile
> > > > index 87cce22c6..d781cbfc6 100644
> > > > --- a/drivers/mempool/octeontx2/Makefile
> > > > +++ b/drivers/mempool/octeontx2/Makefile
> > > > @@ -27,6 +27,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
> > > > 
> > > >   LIBABIVER := 1
> > > > 
> > > > +# for rte_mempool_get_page_size
> > > > +CFLAGS += -DALLOW_EXPERIMENTAL_API
> > > > +
> > > >   #
> > > >   # all source are stored in SRCS-y
> > > >   #
> > > > diff --git a/drivers/mempool/octeontx2/meson.build
> > > > b/drivers/mempool/octeontx2/meson.build
> > > > index 9fde40f0e..28f9634da 100644
> > > > --- a/drivers/mempool/octeontx2/meson.build
> > > > +++ b/drivers/mempool/octeontx2/meson.build
> > > > @@ -21,3 +21,6 @@ foreach flag: extra_flags  endforeach
> > > > 
> > > >   deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
> > > > +
> > > > +# for rte_mempool_get_page_size
> > > > +allow_experimental_apis = true
> > > > diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > > b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > > index d769575f4..47117aec6 100644
> > > > --- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > > +++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
> > > > @@ -713,12 +713,76 @@ static ssize_t
> > > >   otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
> > > >   		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)  {
> > > > -	/*
> > > > -	 * Simply need space for one more object to be able to
> > > > -	 * fulfill alignment requirements.
> > > > -	 */
> > > > -	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
> > > > pg_shift,
> > > > -						    min_chunk_size, align);
> > > > +	size_t total_elt_sz;
> > > > +	size_t obj_per_page, pg_sz, objs_in_last_page;
> > > > +	size_t mem_size;
> > > > +
> > > > +	/* derived from rte_mempool_op_calc_mem_size_default() */
> > > > +
> > > > +	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > > +
> > > > +	if (total_elt_sz == 0) {
> > > > +		mem_size = 0;
> > > > +	} else if (pg_shift == 0) {
> > > > +		/* one object margin to fix alignment */
> > > > +		mem_size = total_elt_sz * (obj_num + 1);
> > > > +	} else {
> > > > +		pg_sz = (size_t)1 << pg_shift;
> > > > +		obj_per_page = pg_sz / total_elt_sz;
> > > > +
> > > > +		/* we need to keep one object to fix alignment */
> > > > +		if (obj_per_page > 0)
> > > > +			obj_per_page--;
> > > > +
> > > > +		if (obj_per_page == 0) {
> > > > +			/*
> > > > +			 * Note that if object size is bigger than page size,
> > > > +			 * then it is assumed that pages are grouped in subsets
> > > > +			 * of physically continuous pages big enough to store
> > > > +			 * at least one object.
> > > > +			 */
> > > > +			mem_size = RTE_ALIGN_CEIL(2 * total_elt_sz,
> > > > +						pg_sz) * obj_num;
> > > > +		} else {
> > > > +			/* In the best case, the allocator will return a
> > > > +			 * page-aligned address. For example, with 5 objs,
> > > > +			 * the required space is as below:
> > > > +			 *  |     page0     |     page1     |  page2 (last) |
> > > > +			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
> > > > +			 *  <------------- mem_size ------------->
> > > > +			 */
> > > > +			objs_in_last_page = ((obj_num - 1) % obj_per_page) +
> > > > 1;
> > > > +			/* room required for the last page */
> > > > +			mem_size = objs_in_last_page * total_elt_sz;
> > > > +			/* room required for other pages */
> > > > +			mem_size += ((obj_num - objs_in_last_page) /
> > > > +				obj_per_page) << pg_shift;
> > > > +
> > > > +			/* In the worst case, the allocator returns a
> > > > +			 * non-aligned pointer, wasting up to
> > > > +			 * total_elt_sz. Add a margin for that.
> > > > +			 */
> > > > +			 mem_size += total_elt_sz - 1;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	*min_chunk_size = total_elt_sz * 2;
> > > > +	*align = RTE_CACHE_LINE_SIZE;
> > > > +
> > > > +	return mem_size;
> > > > +}
> > > > +
> > > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz) {
> > > > +	if (pg_sz == 0)
> > > > +		return 0;
> > > > +	if (elt_sz > pg_sz)
> > > > +		return 0;
> > > > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1,
> > > > pg_sz))
> > > > +		return -1;
> > > > +	return 0;
> > > >   }
> > > > 
> > > >   static int
> > > > @@ -726,8 +790,12 @@ otx2_npa_populate(struct rte_mempool *mp,
> > > > unsigned int max_objs, void *vaddr,
> > > >   		  rte_iova_t iova, size_t len,
> > > >   		  rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > > {
> > > > -	size_t total_elt_sz;
> > > > +	char *va = vaddr;
> > > > +	size_t total_elt_sz, pg_sz;
> > > >   	size_t off;
> > > > +	unsigned int i;
> > > > +	void *obj;
> > > > +	int ret;
> > > > 
> > > >   	if (iova == RTE_BAD_IOVA)
> > > >   		return -EINVAL;
> > > > @@ -735,22 +803,45 @@ otx2_npa_populate(struct rte_mempool *mp,
> > > > unsigned int max_objs, void *vaddr,
> > > >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > > 
> > > >   	/* Align object start address to a multiple of total_elt_sz */
> > > > -	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
> > > > +	off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
> > > > 
> > > >   	if (len < off)
> > > >   		return -EINVAL;
> > > > 
> > > > -	vaddr = (char *)vaddr + off;
> > > > -	iova += off;
> > > > -	len -= off;
> > > > 
> > > > -	npa_lf_aura_op_range_set(mp->pool_id, iova, iova + len);
> > > > +	npa_lf_aura_op_range_set(mp->pool_id, iova + off, iova + len - off);
> > > > 
> > > >   	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
> > > >   		return -EBUSY;
> > > > 
> > > > -	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova,
> > > > len,
> > > > -					       obj_cb, obj_cb_arg);
> > > > +	/* the following is derived from rte_mempool_op_populate_default() */
> > > > +
> > > > +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> > > > +	if (ret < 0)
> > > > +		return ret;
> > > > +
> > > > +	for (i = 0; i < max_objs; i++) {
> > > > +		/* avoid objects to cross page boundaries, and align
> > > > +		 * offset to a multiple of total_elt_sz.
> > > > +		 */
> > > > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
> > > > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va +
> > > > off);
> > > > +			off += total_elt_sz - (((uintptr_t)(va + off - 1) %
> > > > +						total_elt_sz) + 1);
> > > > +		}
> > > > +
> > > > +		if (off + total_elt_sz > len)
> > > > +			break;
> > > > +
> > > > +		off += mp->header_size;
> > > > +		obj = va + off;
> > > > +		obj_cb(mp, obj_cb_arg, obj,
> > > > +		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > > > +		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > > > +		off += mp->elt_size + mp->trailer_size;
> > > > +	}
> > > > +
> > > > +	return i;
> > > >   }
> > > > 
> > > >   static struct rte_mempool_ops otx2_npa_ops = {
> > > > diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
> > > > index 758c5410b..d3db9273d 100644
> > > > --- a/lib/librte_mempool/rte_mempool.c
> > > > +++ b/lib/librte_mempool/rte_mempool.c
> > > > @@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
> > > > 
> > > >   	if (!need_iova_contig_obj)
> > > >   		*pg_sz = 0;
> > > > -	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
> > > > -		*pg_sz = 0;
> > > >   	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
> > > >   		*pg_sz = get_min_page_size(mp->socket_id);
> > > >   	else
> > > > @@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> > > >   	 * then just set page shift and page size to 0, because the user has
> > > >   	 * indicated that there's no need to care about anything.
> > > >   	 *
> > > > -	 * if we do need contiguous objects, there is also an option to reserve
> > > > -	 * the entire mempool memory as one contiguous block of memory, in
> > > > -	 * which case the page shift and alignment wouldn't matter as well.
> > > > +	 * if we do need contiguous objects (if a mempool driver has its
> > > > +	 * own calc_size() method returning min_chunk_size = mem_size),
> > > > +	 * there is also an option to reserve the entire mempool memory
> > > > +	 * as one contiguous block of memory.
> > > >   	 *
> > > >   	 * if we require contiguous objects, but not necessarily the entire
> > > > -	 * mempool reserved space to be contiguous, then there are two options.
> > > > -	 *
> > > > -	 * if our IO addresses are virtual, not actual physical (IOVA as VA
> > > > -	 * case), then no page shift needed - our memory allocation will give us
> > > > -	 * contiguous IO memory as far as the hardware is concerned, so
> > > > -	 * act as if we're getting contiguous memory.
> > > > +	 * mempool reserved space to be contiguous, pg_sz will be != 0,
> > > > +	 * and the default ops->populate() will take care of not placing
> > > > +	 * objects across pages.
> > > >   	 *
> > > >   	 * if our IO addresses are physical, we may get memory from bigger
> > > >   	 * pages, or we might get memory from smaller pages, and how much of it
> > > > @@ -504,11 +500,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
> > > >   	 *
> > > >   	 * If we fail to get enough contiguous memory, then we'll go and
> > > >   	 * reserve space in smaller chunks.
> > > > -	 *
> > > > -	 * We also have to take into account the fact that memory that we're
> > > > -	 * going to allocate from can belong to an externally allocated memory
> > > > -	 * area, in which case the assumption of IOVA as VA mode being
> > > > -	 * synonymous with IOVA contiguousness will not hold.
> > > >   	 */
> > > > 
> > > >   	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
> > > > diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
> > > > index f6aea7662..e5cd4600f 100644
> > > > --- a/lib/librte_mempool/rte_mempool_ops_default.c
> > > > +++ b/lib/librte_mempool/rte_mempool_ops_default.c
> > > > @@ -61,21 +61,47 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
> > > >   	return mem_size;
> > > >   }
> > > > 
> > > > +/* Returns -1 if object crosses a page boundary, else returns 0 */
> > > > +static int check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
> > > > +{
> > > > +	if (pg_sz == 0)
> > > > +		return 0;
> > > > +	if (elt_sz > pg_sz)
> > > > +		return 0;
> > > > +	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
> > > > +		return -1;
> > > > +	return 0;
> > > > +}
> > > > +
> > > >   int
> > > >   rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
> > > >   		void *vaddr, rte_iova_t iova, size_t len,
> > > >   		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
> > > >   {
> > > > -	size_t total_elt_sz;
> > > > +	char *va = vaddr;
> > > > +	size_t total_elt_sz, pg_sz;
> > > >   	size_t off;
> > > >   	unsigned int i;
> > > >   	void *obj;
> > > > +	int ret;
> > > > +
> > > > +	ret = rte_mempool_get_page_size(mp, &pg_sz);
> > > > +	if (ret < 0)
> > > > +		return ret;
> > > > 
> > > >   	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
> > > > 
> > > > -	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
> > > > +	for (off = 0, i = 0; i < max_objs; i++) {
> > > > +		/* avoid objects to cross page boundaries */
> > > > +		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0)
> > > > +			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
> > > > +
> > > > +		if (off + total_elt_sz > len)
> > > > +			break;
> > > > +
> > > >   		off += mp->header_size;
> > > > -		obj = (char *)vaddr + off;
> > > > +		obj = va + off;
> > > >   		obj_cb(mp, obj_cb_arg, obj,
> > > >   		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
> > > >   		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
> > > > --
> > > > 2.20.1
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations across pages
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
                                             ` (5 preceding siblings ...)
  2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 6/6] mempool: use the specific macro for object alignment Olivier Matz
@ 2019-11-01  3:56                           ` Nipun Gupta
  6 siblings, 0 replies; 251+ messages in thread
From: Nipun Gupta @ 2019-11-01  3:56 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal

Series Acked-by: Nipun Gupta <nipun.gupta@nxp.com>

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Olivier Matz
> Sent: Wednesday, October 30, 2019 8:06 PM
> To: dev@dpdk.org
> Cc: Anatoly Burakov <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Ferruh Yigit <ferruh.yigit@linux.intel.com>;
> Giridharan, Ganesan <ggiridharan@rbbn.com>; Jerin Jacob Kollanukkaran
> <jerinj@marvell.com>; Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> Stephen Hemminger <sthemmin@microsoft.com>; Thomas Monjalon
> <thomas@monjalon.net>; Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Subject: [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations across
> pages
> 
> KNI supposes that mbufs are contiguous in kernel virtual memory. This
> may not be true when using the IOVA=VA mode. One way to fix this is to
> ensure that objects do not cross page boundaries in the mempool. This
> patchset implements that in patch 5/6.
> 
> The previous patches prepare the ground:
> - allow populating with an unaligned virtual area (1/6).
> - reduce space wasted in the mempool size calculation when not using
>   the iova-contiguous allocation (2/6).
> - remove the iova-contiguous allocation when populating the mempool (3/6):
>   a va-contiguous alloc does the job as well if we want to populate
>   without crossing page boundaries, so simplify the mempool populate
>   function.
> - export a function to get the minimum page size used in a mempool (4/6)
> 
> Memory consumption impact when using hugepages:
> - worst case: + ~0.1% for a mbuf pool (objsize ~= 2368)
> - best case: -50% if pool size is just above page size
> 
> With 4K pages in IOVA=VA mode, however, an mbuf pool could consume up
> to 75% more memory, because there will be only 1 mbuf per page. Not
> sure how common this use case is.
> 
> Caveat: this changes the behavior of the mempool (calc_mem_size and
> populate), and there is a small risk of breaking things, especially with
> alternate mempool drivers.
> 
> v2
> 
> * update octeontx2 driver to keep alignment constraint (issue seen by
>   Vamsi)
> * add a new patch to use RTE_MEMPOOL_ALIGN (Andrew)
> * fix initialization of for loop in rte_mempool_populate_virt() (Andrew)
> * use rte_mempool_populate_iova() if mz_flags has
>   RTE_MEMZONE_IOVA_CONTIG (Andrew)
> * check rte_mempool_get_page_size() return value (Andrew)
> * some other minor style improvements
> 
> rfc -> v1
> 
> * remove first cleanup patch, it was pushed separately
>   a2b5a8722f20 ("mempool: clarify default populate function")
> * add missing change in rte_mempool_op_calc_mem_size_default()
> * allow unaligned addr/len in populate virt
> * better split patches
> * try to better explain the change
> * use DPDK align macros when relevant
> 
> Olivier Matz (6):
>   mempool: allow unaligned addr/len in populate virt
>   mempool: reduce wasted space on mempool populate
>   mempool: remove optimistic IOVA-contiguous allocation
>   mempool: introduce function to get mempool page size
>   mempool: prevent objects from being across pages
>   mempool: use the specific macro for object alignment
> 
>  drivers/mempool/octeontx2/Makefile           |   3 +
>  drivers/mempool/octeontx2/meson.build        |   3 +
>  drivers/mempool/octeontx2/otx2_mempool_ops.c | 119 +++++++++++++--
>  lib/librte_mempool/rte_mempool.c             | 147 ++++++++-----------
>  lib/librte_mempool/rte_mempool.h             |  15 +-
>  lib/librte_mempool/rte_mempool_ops.c         |   4 +-
>  lib/librte_mempool/rte_mempool_ops_default.c |  60 ++++++--
>  lib/librte_mempool/rte_mempool_version.map   |   1 +
>  8 files changed, 236 insertions(+), 116 deletions(-)
> 
> --
> 2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 0/7] mempool: avoid objects allocations across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (7 preceding siblings ...)
  2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
@ 2019-11-04 15:12                         ` Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
                                             ` (6 more replies)
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
  9 siblings, 7 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

KNI supposes that mbufs are contiguous in kernel virtual memory. This
may not be true when using the IOVA=VA mode. One way to fix this is to
ensure that objects do not cross page boundaries in the mempool. This
patchset implements that in patch 6/7.

The previous patches prepare the ground:
- allow populating with an unaligned virtual area (1/7).
- reduce space wasted in the mempool size calculation when not using
  the iova-contiguous allocation (2/7).
- remove the iova-contiguous allocation when populating the mempool (3/7):
  a va-contiguous alloc does the job as well if we want to populate
  without crossing page boundaries, so simplify the mempool populate
  function.
- export a function to get the minimum page size used in a mempool (4/7)

Memory consumption impact when using hugepages:
- worst case: + ~0.1% for a mbuf pool (objsize ~= 2368)
- best case: -50% if pool size is just above page size

With 4K pages in IOVA=VA mode, however, an mbuf pool could consume up
to 75% more memory, because there will be only 1 mbuf per page. Not
sure how common this use case is.
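(As a rough check of that figure, assuming the ~2368-byte mbuf element size
quoted above: with 4K pages and one object per page, each mbuf consumes 4096
bytes instead of ~2368, i.e. 4096/2368, about 1.73x or roughly 73% more memory.)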

Caveat: this changes the behavior of the mempool (calc_mem_size and
populate), and there is a small risk of breaking things, especially with
alternate mempool drivers.

v3

* introduce new helpers to calculate required memory size and to
  populate mempool, use them in drivers: the alignment constraint
  of octeontx/octeontx2 is managed in this common code.
* fix octeontx mempool driver by taking the alignment constraint into account
  like in octeontx2
* fix bucket mempool driver with 4K pages: limit bucket size in this
  case to ensure that objects do not cross page boundaries. With larger
  pages, it was already ok, because bucket size (64K) is smaller than
  a page.
* fix some api comments in mempool header file

v2

* update octeontx2 driver to keep alignment constraint (issue seen by
  Vamsi)
* add a new patch to use RTE_MEMPOOL_ALIGN (Andrew)
* fix initialization of for loop in rte_mempool_populate_virt() (Andrew)
* use rte_mempool_populate_iova() if mz_flags has
  RTE_MEMZONE_IOVA_CONTIG (Andrew)
* check rte_mempool_get_page_size() return value (Andrew)
* some other minor style improvements

rfc -> v1

* remove first cleanup patch, it was pushed separately
  a2b5a8722f20 ("mempool: clarify default populate function")
* add missing change in rte_mempool_op_calc_mem_size_default()
* allow unaligned addr/len in populate virt
* better split patches
* try to better explain the change
* use DPDK align macros when relevant

Olivier Matz (7):
  mempool: allow unaligned addr/len in populate virt
  mempool: reduce wasted space on mempool populate
  mempool: remove optimistic IOVA-contiguous allocation
  mempool: introduce function to get mempool page size
  mempool: introduce helpers for populate and calc mem size
  mempool: prevent objects from being across pages
  mempool: use the specific macro for object alignment

 drivers/mempool/bucket/Makefile               |   2 +
 drivers/mempool/bucket/meson.build            |   3 +
 drivers/mempool/bucket/rte_mempool_bucket.c   |  10 +-
 drivers/mempool/dpaa/dpaa_mempool.c           |   4 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |   4 +-
 drivers/mempool/octeontx/Makefile             |   3 +
 drivers/mempool/octeontx/meson.build          |   3 +
 .../mempool/octeontx/rte_mempool_octeontx.c   |  21 +--
 drivers/mempool/octeontx2/Makefile            |   6 +
 drivers/mempool/octeontx2/meson.build         |   6 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c  |  21 ++-
 lib/librte_mempool/rte_mempool.c              | 147 +++++++-----------
 lib/librte_mempool/rte_mempool.h              |  64 ++++++--
 lib/librte_mempool/rte_mempool_ops.c          |   4 +-
 lib/librte_mempool/rte_mempool_ops_default.c  | 113 +++++++++++---
 lib/librte_mempool/rte_mempool_version.map    |   3 +
 16 files changed, 272 insertions(+), 142 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 1/7] mempool: allow unaligned addr/len in populate virt
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 2/7] mempool: reduce wasted space on mempool populate Olivier Matz
                                             ` (5 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

rte_mempool_populate_virt() currently requires that both addr
and length are page-aligned.

Remove this unneeded constraint, which can be annoying with big
hugepages (e.g. 1GB).
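
For illustration, the arithmetic bounding the first chunk in the new populate
loop can be sketched as below (the helper name is made up, and pg_sz is
assumed to be a non-zero power of two):

#include <stdint.h>
#include <stddef.h>

/* Length of the first candidate chunk starting at addr + off: it runs up
 * to the next page boundary (or to the end of the area), so that the
 * IOVA-contiguity check can then proceed page by page.
 */
static size_t first_chunk_len(uintptr_t addr, size_t off, size_t len, size_t pg_sz)
{
	uintptr_t cur = addr + off;
	uintptr_t next_pg = (cur + 1 + pg_sz - 1) & ~(uintptr_t)(pg_sz - 1);
	size_t to_boundary = (size_t)(next_pg - cur);

	return to_boundary < len - off ? to_boundary : len - off;
}

If addr + off is already page-aligned this yields pg_sz, otherwise only the
distance to the next boundary, which is what makes an unaligned addr usable.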

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c | 23 +++++++++++------------
 lib/librte_mempool/rte_mempool.h |  3 +--
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 0f29e8712..88e49c751 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	size_t off, phys_len;
 	int ret, cnt = 0;
 
-	/* address and len must be page-aligned */
-	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
-		return -EINVAL;
-	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
-		return -EINVAL;
-
 	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
 		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
 			len, free_cb, opaque);
 
-	for (off = 0; off + pg_sz <= len &&
+	for (off = 0; off < len &&
 		     mp->populated_size < mp->size; off += phys_len) {
 
 		iova = rte_mem_virt2iova(addr + off);
@@ -389,12 +383,18 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 		}
 
 		/* populate with the largest group of contiguous pages */
-		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
+		for (phys_len = RTE_MIN(
+			(size_t)(RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
+				(addr + off)),
+			len - off);
+		     off + phys_len < len;
+		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
 			rte_iova_t iova_tmp;
 
 			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
 
-			if (iova_tmp != iova + phys_len)
+			if (iova_tmp == RTE_BAD_IOVA ||
+					iova_tmp != iova + phys_len)
 				break;
 		}
 
@@ -575,8 +575,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags,
-					RTE_MAX(pg_sz, align));
+					mp->socket_id, flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -601,7 +600,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 				(void *)(uintptr_t)mz);
 		else
 			ret = rte_mempool_populate_virt(mp, mz->addr,
-				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
+				mz->len, pg_sz,
 				rte_mempool_memchunk_mz_free,
 				(void *)(uintptr_t)mz);
 		if (ret < 0) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a04..0fe8aa7b8 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
  *   A pointer to the mempool structure.
  * @param addr
  *   The virtual address of memory that should be used to store objects.
- *   Must be page-aligned.
  * @param len
- *   The length of memory in bytes. Must be page-aligned.
+ *   The length of memory in bytes.
  * @param pg_sz
  *   The size of memory pages in this virtual area.
  * @param free_cb
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 2/7] mempool: reduce wasted space on mempool populate
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 3/7] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
                                             ` (4 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

The size returned by rte_mempool_op_calc_mem_size_default() is aligned
to the specified page size. Therefore, with big pages, the returned size
can be much more than what we really need to populate the mempool.

For instance, populating a mempool that requires 1.1GB of memory with
1GB hugepages can result in allocating 2GB of memory.

This problem is hidden most of the time due to the allocation method of
rte_mempool_populate_default(): when try_iova_contig_mempool=true, it
first tries to allocate an iova contiguous area, without the alignment
constraint. If that fails, it falls back to an aligned allocation that does
not need to be iova-contiguous. This can also fall back to several
smaller aligned allocations.

This commit changes rte_mempool_op_calc_mem_size_default() to relax the
alignment constraint to a cache line and to return a smaller size.
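
A small self-contained illustration of the relaxed size computation, with
hypothetical numbers (not taken from this commit):

#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t pg_sz = 4096, total_elt_sz = 1000, obj_num = 5;
	size_t obj_per_page = pg_sz / total_elt_sz;                    /* 4 */
	size_t objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1; /* 1 */
	size_t mem_size;

	mem_size = objs_in_last_page * total_elt_sz;           /* last page */
	mem_size += ((obj_num - objs_in_last_page) / obj_per_page) * pg_sz;
	mem_size += total_elt_sz - 1;  /* margin for a non-aligned pointer */

	/* prints 6095; the previous page-aligned computation would have
	 * required 2 full pages, i.e. 8192 bytes
	 */
	printf("%zu\n", mem_size);
	return 0;
}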

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c             |  7 ++---
 lib/librte_mempool/rte_mempool.h             |  9 +++----
 lib/librte_mempool/rte_mempool_ops.c         |  4 ++-
 lib/librte_mempool/rte_mempool_ops_default.c | 28 +++++++++++++++-----
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 88e49c751..4e0d576f5 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -477,11 +477,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * wasting some space this way, but it's much nicer than looping around
 	 * trying to reserve each and every page size.
 	 *
-	 * However, since size calculation will produce page-aligned sizes, it
-	 * makes sense to first try and see if we can reserve the entire memzone
-	 * in one contiguous chunk as well (otherwise we might end up wasting a
-	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
-	 * memory, then we'll go and reserve space page-by-page.
+	 * If we fail to get enough contiguous memory, then we'll go and
+	 * reserve space in smaller chunks.
 	 *
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 0fe8aa7b8..78b687bb6 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
  * @param[out] align
  *   Location for required memory chunk alignment.
  * @return
- *   Required memory size aligned at page boundary.
+ *   Required memory size.
  */
 typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		uint32_t obj_num,  uint32_t pg_shift,
@@ -477,11 +477,8 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
  *
- * Minimum size of memory chunk is a maximum of the page size and total
- * element size.
- *
- * Required memory chunk alignment is a maximum of page size and cache
- * line size.
+ * Minimum size of memory chunk is the total element size.
+ * Required memory chunk alignment is the cache line size.
  */
 ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index e02eb702c..22c5251eb 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to notify new memory area to external mempool */
+/* wrapper to calculate the memory size required to store given number
+ * of objects
+ */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc82d..f6aea7662 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -12,7 +12,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				     size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
-	size_t obj_per_page, pg_num, pg_sz;
+	size_t obj_per_page, pg_sz, objs_in_last_page;
 	size_t mem_size;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
@@ -33,14 +33,30 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 			mem_size =
 				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
 		} else {
-			pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
-			mem_size = pg_num << pg_shift;
+			/* In the best case, the allocator will return a
+			 * page-aligned address. For example, with 5 objs,
+			 * the required space is as below:
+			 *  |     page0     |     page1     |  page2 (last) |
+			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
+			 *  <------------- mem_size ------------->
+			 */
+			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
+			/* room required for the last page */
+			mem_size = objs_in_last_page * total_elt_sz;
+			/* room required for other pages */
+			mem_size += ((obj_num - objs_in_last_page) /
+				obj_per_page) << pg_shift;
+
+			/* In the worst case, the allocator returns a
+			 * non-aligned pointer, wasting up to
+			 * total_elt_sz. Add a margin for that.
+			 */
+			 mem_size += total_elt_sz - 1;
 		}
 	}
 
-	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
-
-	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
+	*min_chunk_size = total_elt_sz;
+	*align = RTE_CACHE_LINE_SIZE;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 3/7] mempool: remove optimistic IOVA-contiguous allocation
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 2/7] mempool: reduce wasted space on mempool populate Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 4/7] mempool: introduce function to get mempool page size Olivier Matz
                                             ` (3 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

The previous commit reduced the amount of required memory when
populating the mempool with non iova-contiguous memory.

Since there is no big advantage to having a fully iova-contiguous mempool
unless it is explicitly requested, remove this code; it simplifies the
populate function.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 4e0d576f5..213e574fc 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -430,7 +430,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool try_iova_contig_mempool;
 	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
@@ -483,9 +482,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
 	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold. We should also try
-	 * to go for contiguous memory even if we're in no-huge mode, because
-	 * external memory may in fact be IOVA-contiguous.
+	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	/* check if we can retrieve a valid socket ID */
@@ -494,7 +491,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		return -EINVAL;
 	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
-	try_iova_contig_mempool = false;
 
 	if (!need_iova_contig_obj) {
 		pg_sz = 0;
@@ -503,7 +499,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		pg_sz = 0;
 		pg_shift = 0;
 	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		try_iova_contig_mempool = true;
 		pg_sz = get_min_page_size(mp->socket_id);
 		pg_shift = rte_bsf32(pg_sz);
 	} else {
@@ -513,14 +508,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
-		unsigned int flags;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					0, &min_chunk_size, &align);
-		else
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
+		mem_size = rte_mempool_ops_calc_mem_size(
+			mp, n, pg_shift, &min_chunk_size, &align);
 
 		if (mem_size < 0) {
 			ret = mem_size;
@@ -534,36 +524,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
-		flags = mz_flags;
-
 		/* if we're trying to reserve contiguous memory, add appropriate
 		 * memzone flag.
 		 */
-		if (try_iova_contig_mempool)
-			flags |= RTE_MEMZONE_IOVA_CONTIG;
+		if (min_chunk_size == (size_t)mem_size)
+			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-
-		/* if we were trying to allocate contiguous memory, failed and
-		 * minimum required contiguous chunk fits minimum page, adjust
-		 * memzone size to the page size, and try again.
-		 */
-		if (mz == NULL && try_iova_contig_mempool &&
-				min_chunk_size <= pg_sz) {
-			try_iova_contig_mempool = false;
-			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
-			if (mem_size < 0) {
-				ret = mem_size;
-				goto fail;
-			}
+				mp->socket_id, mz_flags, align);
 
-			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
@@ -572,7 +541,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags, align);
+					mp->socket_id, mz_flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -590,7 +559,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		else
 			iova = RTE_BAD_IOVA;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
+		if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 4/7] mempool: introduce function to get mempool page size
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
                                             ` (2 preceding siblings ...)
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 3/7] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
                                             ` (2 subsequent siblings)
  6 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

In rte_mempool_populate_default(), we determine the page size,
which is needed for calc_size and allocation of memory.

Move this into a function and export it; it will be used in a later
commit.
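
A hedged usage sketch (the function name is made up, not from this patch),
showing how a caller might use the new accessor; building against it needs
ALLOW_EXPERIMENTAL_API since the symbol is experimental:

#include <rte_common.h>
#include <rte_mempool.h>

static int
example_prepare(struct rte_mempool *mp)
{
	size_t pg_sz;
	uint32_t pg_shift = 0;
	int ret;

	ret = rte_mempool_get_page_size(mp, &pg_sz);
	if (ret < 0)
		return ret;

	/* pg_sz == 0 means object placement does not need to care about
	 * page boundaries
	 */
	if (pg_sz != 0)
		pg_shift = rte_bsf32(pg_sz);

	(void)pg_shift; /* would typically feed ops->calc_mem_size() */
	return 0;
}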

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c           | 51 ++++++++++++++--------
 lib/librte_mempool/rte_mempool.h           |  7 +++
 lib/librte_mempool/rte_mempool_version.map |  1 +
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 213e574fc..758c5410b 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,33 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Get the minimal page size used in a mempool before populating it. */
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
+{
+	bool need_iova_contig_obj;
+	bool alloc_in_ext_mem;
+	int ret;
+
+	/* check if we can retrieve a valid socket ID */
+	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+	if (ret < 0)
+		return -EINVAL;
+	alloc_in_ext_mem = (ret == 1);
+	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+
+	if (!need_iova_contig_obj)
+		*pg_sz = 0;
+	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
+		*pg_sz = 0;
+	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
+		*pg_sz = get_min_page_size(mp->socket_id);
+	else
+		*pg_sz = getpagesize();
+
+	return 0;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
@@ -425,12 +452,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
 	ssize_t mem_size;
-	size_t align, pg_sz, pg_shift;
+	size_t align, pg_sz, pg_shift = 0;
 	rte_iova_t iova;
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -485,26 +511,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
-	/* check if we can retrieve a valid socket ID */
-	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
-	if (ret < 0)
-		return -EINVAL;
-	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
-	if (!need_iova_contig_obj) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		pg_sz = get_min_page_size(mp->socket_id);
-		pg_shift = rte_bsf32(pg_sz);
-	} else {
-		pg_sz = getpagesize();
+	if (pg_sz != 0)
 		pg_shift = rte_bsf32(pg_sz);
-	}
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 78b687bb6..1282414ba 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1688,6 +1688,13 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
 		      void *arg);
 
+/**
+ * @internal Get page size used for mempool object allocation.
+ */
+__rte_experimental
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca460..4eff2767d 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -56,5 +56,6 @@ DPDK_18.05 {
 EXPERIMENTAL {
 	global:
 
+	rte_mempool_get_page_size;
 	rte_mempool_ops_get_info;
 };
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 5/7] mempool: introduce helpers for populate and calc mem size
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
                                             ` (3 preceding siblings ...)
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 4/7] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-05 12:19                             ` Andrew Rybchenko
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 6/7] mempool: prevent objects from being across pages Olivier Matz
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment Olivier Matz
  6 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

Introduce new functions that can be used by mempool drivers to
calculate the required memory size and to populate the mempool.

For now, these helpers just replace the *_default() functions
without change. They will be enhanced in the next commit.
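
As a sketch of the intended use (driver and function names here are invented,
the real conversions are in the diff below; signatures are as introduced in
this patch, before the next commit extends them):

#include <rte_mempool.h>

static ssize_t
example_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
		      uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
{
	/* a driver could adjust obj_num or the outputs here */
	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
						   min_chunk_size, align);
}

static int
example_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
		 rte_iova_t iova, size_t len,
		 rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
{
	/* driver-specific setup (e.g. programming a HW range) would go here */
	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
					      obj_cb, obj_cb_arg);
}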

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 drivers/mempool/bucket/Makefile               |  2 ++
 drivers/mempool/bucket/meson.build            |  3 ++
 drivers/mempool/bucket/rte_mempool_bucket.c   |  2 +-
 drivers/mempool/dpaa/dpaa_mempool.c           |  2 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  2 +-
 drivers/mempool/octeontx/Makefile             |  3 ++
 drivers/mempool/octeontx/meson.build          |  3 ++
 .../mempool/octeontx/rte_mempool_octeontx.c   |  4 +--
 drivers/mempool/octeontx2/Makefile            |  3 ++
 drivers/mempool/octeontx2/meson.build         |  3 ++
 drivers/mempool/octeontx2/otx2_mempool_ops.c  |  4 +--
 lib/librte_mempool/rte_mempool.h              | 28 +++++++++++++++--
 lib/librte_mempool/rte_mempool_ops_default.c  | 31 +++++++++++++++----
 lib/librte_mempool/rte_mempool_version.map    |  2 ++
 14 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/drivers/mempool/bucket/Makefile b/drivers/mempool/bucket/Makefile
index 7364916bc..47be6b5f7 100644
--- a/drivers/mempool/bucket/Makefile
+++ b/drivers/mempool/bucket/Makefile
@@ -15,6 +15,8 @@ LIB = librte_mempool_bucket.a
 
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
+# for mempool populate/calc_mem_size helpers
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 
 LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
 
diff --git a/drivers/mempool/bucket/meson.build b/drivers/mempool/bucket/meson.build
index 618d79128..29df6751c 100644
--- a/drivers/mempool/bucket/meson.build
+++ b/drivers/mempool/bucket/meson.build
@@ -6,4 +6,7 @@
 # This software was jointly developed between OKTET Labs (under contract
 # for Solarflare) and Solarflare Communications, Inc.
 
+# for mempool populate/calc_mem_size helpers
+allow_experimental_apis = true
+
 sources = files('rte_mempool_bucket.c')
diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
index 78d2b9d04..dfeaf4e45 100644
--- a/drivers/mempool/bucket/rte_mempool_bucket.c
+++ b/drivers/mempool/bucket/rte_mempool_bucket.c
@@ -585,7 +585,7 @@ bucket_populate(struct rte_mempool *mp, unsigned int max_objs,
 
 		hdr->fill_cnt = 0;
 		hdr->lcore_id = LCORE_ID_ANY;
-		rc = rte_mempool_op_populate_default(mp,
+		rc = rte_mempool_op_populate_helper(mp,
 						     RTE_MIN(bd->obj_per_bucket,
 							     max_objs - n_objs),
 						     iter + bucket_header_sz,
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index a25697f05..27736e6c2 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -341,7 +341,7 @@ dpaa_populate(struct rte_mempool *mp, unsigned int max_objs,
 	 */
 	TAILQ_INSERT_HEAD(&rte_dpaa_memsegs, ms, next);
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, paddr, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index f26c30b00..8f8dbeada 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -421,7 +421,7 @@ dpaa2_populate(struct rte_mempool *mp, unsigned int max_objs,
 	/* Insert entry into the PA->VA Table */
 	dpaax_iova_table_update(paddr, vaddr, len);
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, paddr, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/drivers/mempool/octeontx/Makefile b/drivers/mempool/octeontx/Makefile
index a3e1dce88..6540a2ab4 100644
--- a/drivers/mempool/octeontx/Makefile
+++ b/drivers/mempool/octeontx/Makefile
@@ -11,6 +11,9 @@ LIB = librte_mempool_octeontx.a
 
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -I$(RTE_SDK)/drivers/common/octeontx/
+# for mempool populate/calc_mem_size helpers
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 EXPORT_MAP := rte_mempool_octeontx_version.map
 
 LIBABIVER := 1
diff --git a/drivers/mempool/octeontx/meson.build b/drivers/mempool/octeontx/meson.build
index 3baaf7db2..f293afa96 100644
--- a/drivers/mempool/octeontx/meson.build
+++ b/drivers/mempool/octeontx/meson.build
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Cavium, Inc
 
+# for mempool populate/calc_mem_size helpers
+allow_experimental_apis = true
+
 sources = files('octeontx_fpavf.c',
 		'rte_mempool_octeontx.c'
 )
diff --git a/drivers/mempool/octeontx/rte_mempool_octeontx.c b/drivers/mempool/octeontx/rte_mempool_octeontx.c
index ab94dfe91..fff33e5c6 100644
--- a/drivers/mempool/octeontx/rte_mempool_octeontx.c
+++ b/drivers/mempool/octeontx/rte_mempool_octeontx.c
@@ -137,7 +137,7 @@ octeontx_fpavf_calc_mem_size(const struct rte_mempool *mp,
 	 * Simply need space for one more object to be able to
 	 * fulfil alignment requirements.
 	 */
-	mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
+	mem_size = rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1,
 							pg_shift,
 							min_chunk_size, align);
 	if (mem_size >= 0) {
@@ -184,7 +184,7 @@ octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
 	if (ret < 0)
 		return ret;
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/drivers/mempool/octeontx2/Makefile b/drivers/mempool/octeontx2/Makefile
index 87cce22c6..8f55305c5 100644
--- a/drivers/mempool/octeontx2/Makefile
+++ b/drivers/mempool/octeontx2/Makefile
@@ -23,6 +23,9 @@ CFLAGS += -diag-disable 2259
 endif
 endif
 
+# for mempool populate/calc_mem_size helpers
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 EXPORT_MAP := rte_mempool_octeontx2_version.map
 
 LIBABIVER := 1
diff --git a/drivers/mempool/octeontx2/meson.build b/drivers/mempool/octeontx2/meson.build
index 9fde40f0e..5f93bb495 100644
--- a/drivers/mempool/octeontx2/meson.build
+++ b/drivers/mempool/octeontx2/meson.build
@@ -2,6 +2,9 @@
 # Copyright(C) 2019 Marvell International Ltd.
 #
 
+# for mempool populate/calc_mem_size helpers
+allow_experimental_apis = true
+
 sources = files('otx2_mempool_ops.c',
 		'otx2_mempool.c',
 		'otx2_mempool_irq.c',
diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index d769575f4..3aea92a01 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -717,7 +717,7 @@ otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
 	 * Simply need space for one more object to be able to
 	 * fulfill alignment requirements.
 	 */
-	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1, pg_shift,
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1, pg_shift,
 						    min_chunk_size, align);
 }
 
@@ -749,7 +749,7 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
 		return -EBUSY;
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 1282414ba..26a98af30 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -465,7 +465,7 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		size_t *min_chunk_size, size_t *align);
 
 /**
- * Default way to calculate memory size required to store given number of
+ * Helper to calculate memory size required to store given number of
  * objects.
  *
  * If page boundaries may be ignored, it is just a product of total
@@ -480,6 +480,18 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  * Minimum size of memory chunk is the total element size.
  * Required memory chunk alignment is the cache line size.
  */
+__rte_experimental
+ssize_t rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
+		uint32_t obj_num, uint32_t pg_shift,
+		size_t *min_chunk_size, size_t *align);
+
+/**
+ * Default way to calculate memory size required to store given number of
+ * objects.
+ *
+ * Equivalent to rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+ * min_chunk_size, align).
+ */
 ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
 		size_t *min_chunk_size, size_t *align);
@@ -533,9 +545,21 @@ typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
 
 /**
- * Default way to populate memory pool object using provided memory
+ * Helper to populate memory pool object using provided memory
  * chunk: just slice objects one by one.
  */
+__rte_experimental
+int rte_mempool_op_populate_helper(struct rte_mempool *mp,
+		unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
+
+/**
+ * Default way to populate memory pool object using provided memory chunk.
+ *
+ * Equivalent to rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+ * len, obj_cb, obj_cb_arg).
+ */
 int rte_mempool_op_populate_default(struct rte_mempool *mp,
 		unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index f6aea7662..0bfc63497 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -7,9 +7,9 @@
 #include <rte_mempool.h>
 
 ssize_t
-rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
-				     uint32_t obj_num, uint32_t pg_shift,
-				     size_t *min_chunk_size, size_t *align)
+rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
+				uint32_t obj_num, uint32_t pg_shift,
+				size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
 	size_t obj_per_page, pg_sz, objs_in_last_page;
@@ -61,10 +61,19 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+ssize_t
+rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
+				uint32_t obj_num, uint32_t pg_shift,
+				size_t *min_chunk_size, size_t *align)
+{
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						min_chunk_size, align);
+}
+
 int
-rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
-		void *vaddr, rte_iova_t iova, size_t len,
-		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int max_objs,
+			void *vaddr, rte_iova_t iova, size_t len,
+			rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
 	size_t total_elt_sz;
 	size_t off;
@@ -84,3 +93,13 @@ rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 
 	return i;
 }
+
+int
+rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
+				void *vaddr, rte_iova_t iova, size_t len,
+				rte_mempool_populate_obj_cb_t *obj_cb,
+				void *obj_cb_arg)
+{
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+					len, obj_cb, obj_cb_arg);
+}
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 4eff2767d..b5e9c91a9 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -57,5 +57,7 @@ EXPERIMENTAL {
 	global:
 
 	rte_mempool_get_page_size;
+	rte_mempool_op_calc_mem_size_helper;
+	rte_mempool_op_populate_helper;
 	rte_mempool_ops_get_info;
 };
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 6/7] mempool: prevent objects from being across pages
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
                                             ` (4 preceding siblings ...)
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-05 12:22                             ` Andrew Rybchenko
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment Olivier Matz
  6 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

When populating a mempool, ensure that objects are not located across
several pages, except if the user did not request IOVA-contiguous objects.
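
The core test is whether an object's first and last bytes fall in the same
page; a simplified, self-contained version of that check (illustrative only,
assuming pg_sz is a power of two) looks like:

#include <stdint.h>
#include <stddef.h>

/* Returns 1 if an object of elt_sz bytes starting at obj would span two
 * pages, 0 otherwise. pg_sz == 0 means page boundaries do not matter, and
 * an object larger than a page cannot avoid crossing one anyway.
 */
static int obj_crosses_page(uintptr_t obj, size_t pg_sz, size_t elt_sz)
{
	if (pg_sz == 0 || elt_sz > pg_sz)
		return 0;
	return (obj & ~(uintptr_t)(pg_sz - 1)) !=
	       ((obj + elt_sz - 1) & ~(uintptr_t)(pg_sz - 1));
}

When the check fires, the populate helper simply skips ahead to the next page
boundary before placing the object.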

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 drivers/mempool/bucket/rte_mempool_bucket.c   |  2 +-
 drivers/mempool/dpaa/dpaa_mempool.c           |  4 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  4 +-
 .../mempool/octeontx/rte_mempool_octeontx.c   | 21 +++---
 drivers/mempool/octeontx2/Makefile            |  3 +
 drivers/mempool/octeontx2/meson.build         |  3 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c  | 21 +++---
 lib/librte_mempool/rte_mempool.c              | 23 ++-----
 lib/librte_mempool/rte_mempool.h              | 24 +++++--
 lib/librte_mempool/rte_mempool_ops_default.c  | 66 +++++++++++++++----
 10 files changed, 115 insertions(+), 56 deletions(-)

diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
index dfeaf4e45..b978fd220 100644
--- a/drivers/mempool/bucket/rte_mempool_bucket.c
+++ b/drivers/mempool/bucket/rte_mempool_bucket.c
@@ -585,7 +585,7 @@ bucket_populate(struct rte_mempool *mp, unsigned int max_objs,
 
 		hdr->fill_cnt = 0;
 		hdr->lcore_id = LCORE_ID_ANY;
-		rc = rte_mempool_op_populate_helper(mp,
+		rc = rte_mempool_op_populate_helper(mp, 0,
 						     RTE_MIN(bd->obj_per_bucket,
 							     max_objs - n_objs),
 						     iter + bucket_header_sz,
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 27736e6c2..3a2528331 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -341,8 +341,8 @@ dpaa_populate(struct rte_mempool *mp, unsigned int max_objs,
 	 */
 	TAILQ_INSERT_HEAD(&rte_dpaa_memsegs, ms, next);
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, paddr,
+					       len, obj_cb, obj_cb_arg);
 }
 
 static const struct rte_mempool_ops dpaa_mpool_ops = {
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 8f8dbeada..36c93decf 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -421,8 +421,8 @@ dpaa2_populate(struct rte_mempool *mp, unsigned int max_objs,
 	/* Insert entry into the PA->VA Table */
 	dpaax_iova_table_update(paddr, vaddr, len);
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, paddr,
+					       len, obj_cb, obj_cb_arg);
 }
 
 static const struct rte_mempool_ops dpaa2_mpool_ops = {
diff --git a/drivers/mempool/octeontx/rte_mempool_octeontx.c b/drivers/mempool/octeontx/rte_mempool_octeontx.c
index fff33e5c6..bd0070020 100644
--- a/drivers/mempool/octeontx/rte_mempool_octeontx.c
+++ b/drivers/mempool/octeontx/rte_mempool_octeontx.c
@@ -132,14 +132,15 @@ octeontx_fpavf_calc_mem_size(const struct rte_mempool *mp,
 			     size_t *min_chunk_size, size_t *align)
 {
 	ssize_t mem_size;
+	size_t total_elt_sz;
 
-	/*
-	 * Simply need space for one more object to be able to
-	 * fulfil alignment requirements.
+	/* Need space for one more obj on each chunk to fulfill
+	 * alignment requirements.
 	 */
-	mem_size = rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1,
-							pg_shift,
-							min_chunk_size, align);
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+	mem_size = rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						total_elt_sz, min_chunk_size,
+						align);
 	if (mem_size >= 0) {
 		/*
 		 * Memory area which contains objects must be physically
@@ -168,7 +169,7 @@ octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	/* align object start address to a multiple of total_elt_sz */
-	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
 
 	if (len < off)
 		return -EINVAL;
@@ -184,8 +185,10 @@ octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
 	if (ret < 0)
 		return ret;
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp,
+					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+					max_objs, vaddr, iova, len,
+					obj_cb, obj_cb_arg);
 }
 
 static struct rte_mempool_ops octeontx_fpavf_ops = {
diff --git a/drivers/mempool/octeontx2/Makefile b/drivers/mempool/octeontx2/Makefile
index 8f55305c5..62e90f277 100644
--- a/drivers/mempool/octeontx2/Makefile
+++ b/drivers/mempool/octeontx2/Makefile
@@ -30,6 +30,9 @@ EXPORT_MAP := rte_mempool_octeontx2_version.map
 
 LIBABIVER := 1
 
+# for rte_mempool_get_page_size
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 #
 # all source are stored in SRCS-y
 #
diff --git a/drivers/mempool/octeontx2/meson.build b/drivers/mempool/octeontx2/meson.build
index 5f93bb495..883b643da 100644
--- a/drivers/mempool/octeontx2/meson.build
+++ b/drivers/mempool/octeontx2/meson.build
@@ -24,3 +24,6 @@ foreach flag: extra_flags
 endforeach
 
 deps += ['eal', 'mbuf', 'kvargs', 'bus_pci', 'common_octeontx2', 'mempool']
+
+# for rte_mempool_get_page_size
+allow_experimental_apis = true
diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index 3aea92a01..ea4b1c45d 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -713,12 +713,15 @@ static ssize_t
 otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
 		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
 {
-	/*
-	 * Simply need space for one more object to be able to
-	 * fulfill alignment requirements.
+	size_t total_elt_sz;
+
+	/* Need space for one more obj on each chunk to fulfill
+	 * alignment requirements.
 	 */
-	return rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1, pg_shift,
-						    min_chunk_size, align);
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						total_elt_sz, min_chunk_size,
+						align);
 }
 
 static int
@@ -735,7 +738,7 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	/* Align object start address to a multiple of total_elt_sz */
-	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
 
 	if (len < off)
 		return -EINVAL;
@@ -749,8 +752,10 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
 		return -EBUSY;
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp,
+					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+					max_objs, vaddr, iova, len,
+					obj_cb, obj_cb_arg);
 }
 
 static struct rte_mempool_ops otx2_npa_ops = {
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 758c5410b..d3db9273d 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
 
 	if (!need_iova_contig_obj)
 		*pg_sz = 0;
-	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
-		*pg_sz = 0;
 	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
 		*pg_sz = get_min_page_size(mp->socket_id);
 	else
@@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * then just set page shift and page size to 0, because the user has
 	 * indicated that there's no need to care about anything.
 	 *
-	 * if we do need contiguous objects, there is also an option to reserve
-	 * the entire mempool memory as one contiguous block of memory, in
-	 * which case the page shift and alignment wouldn't matter as well.
+	 * if we do need contiguous objects (if a mempool driver has its
+	 * own calc_size() method returning min_chunk_size = mem_size),
+	 * there is also an option to reserve the entire mempool memory
+	 * as one contiguous block of memory.
 	 *
 	 * if we require contiguous objects, but not necessarily the entire
-	 * mempool reserved space to be contiguous, then there are two options.
-	 *
-	 * if our IO addresses are virtual, not actual physical (IOVA as VA
-	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous IO memory as far as the hardware is concerned, so
-	 * act as if we're getting contiguous memory.
+	 * mempool reserved space to be contiguous, pg_sz will be != 0,
+	 * and the default ops->populate() will take care of not placing
+	 * objects across pages.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
 	 * pages, or we might get memory from smaller pages, and how much of it
@@ -504,11 +500,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * If we fail to get enough contiguous memory, then we'll go and
 	 * reserve space in smaller chunks.
-	 *
-	 * We also have to take into account the fact that memory that we're
-	 * going to allocate from can belong to an externally allocated memory
-	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 26a98af30..f1cba3521 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -473,6 +473,10 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  * Otherwise, it is a number of pages required to store given number of
  * objects without crossing page boundary.
  *
+ * The chunk_reserve argument is the amount of memory that must be
+ * reserved at the beginning of each page, or at the beginning of the
+ * memory area if pg_shift is 0.
+ *
  * Note that if object size is bigger than page size, then it assumes
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
@@ -482,7 +486,7 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  */
 __rte_experimental
 ssize_t rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
-		uint32_t obj_num, uint32_t pg_shift,
+		uint32_t obj_num, uint32_t pg_shift, size_t chunk_reserve,
 		size_t *min_chunk_size, size_t *align);
 
 /**
@@ -490,7 +494,7 @@ ssize_t rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
  * objects.
  *
  * Equivalent to rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
- * min_chunk_size, align).
+ * 0, min_chunk_size, align).
  */
 ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
@@ -544,20 +548,30 @@ typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
 
+/**
+ * Align objects on addresses multiple of total_elt_sz.
+ */
+#define RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ 0x0001
+
 /**
  * Helper to populate memory pool object using provided memory
- * chunk: just slice objects one by one.
+ * chunk: just slice objects one by one, taking care of not
+ * crossing page boundaries.
+ *
+ * If RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is set in flags, the addresses
+ * of object headers will be aligned on a multiple of total_elt_sz.
+ * This feature is used by octeontx hardware.
  */
 __rte_experimental
 int rte_mempool_op_populate_helper(struct rte_mempool *mp,
-		unsigned int max_objs,
+		unsigned int flags, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
 
 /**
  * Default way to populate memory pool object using provided memory chunk.
  *
- * Equivalent to rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+ * Equivalent to rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, iova,
  * len, obj_cb, obj_cb_arg).
  */
 int rte_mempool_op_populate_default(struct rte_mempool *mp,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 0bfc63497..e6be7152b 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -9,6 +9,7 @@
 ssize_t
 rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
+				size_t chunk_reserve,
 				size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
@@ -19,10 +20,12 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 	if (total_elt_sz == 0) {
 		mem_size = 0;
 	} else if (pg_shift == 0) {
-		mem_size = total_elt_sz * obj_num;
+		mem_size = total_elt_sz * obj_num + chunk_reserve;
 	} else {
 		pg_sz = (size_t)1 << pg_shift;
-		obj_per_page = pg_sz / total_elt_sz;
+		if (chunk_reserve >= pg_sz)
+			return -EINVAL;
+		obj_per_page = (pg_sz - chunk_reserve) / total_elt_sz;
 		if (obj_per_page == 0) {
 			/*
 			 * Note that if object size is bigger than page size,
@@ -30,8 +33,8 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 			 * of physically continuous pages big enough to store
 			 * at least one object.
 			 */
-			mem_size =
-				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
+			mem_size = RTE_ALIGN_CEIL(total_elt_sz + chunk_reserve,
+						pg_sz) * obj_num;
 		} else {
 			/* In the best case, the allocator will return a
 			 * page-aligned address. For example, with 5 objs,
@@ -42,7 +45,8 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 			 */
 			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
 			/* room required for the last page */
-			mem_size = objs_in_last_page * total_elt_sz;
+			mem_size = objs_in_last_page * total_elt_sz +
+				chunk_reserve;
 			/* room required for other pages */
 			mem_size += ((obj_num - objs_in_last_page) /
 				obj_per_page) << pg_shift;
@@ -67,24 +71,60 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				size_t *min_chunk_size, size_t *align)
 {
 	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
-						min_chunk_size, align);
+						0, min_chunk_size, align);
+}
+
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
 }
 
 int
-rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int max_objs,
-			void *vaddr, rte_iova_t iova, size_t len,
-			rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
+			unsigned int max_objs, void *vaddr, rte_iova_t iova,
+			size_t len, rte_mempool_populate_obj_cb_t *obj_cb,
+			void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
+	int ret;
+
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
+		off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
+	else
+		off = 0;
+	for (i = 0; i < max_objs; i++) {
+		/* avoid objects to cross page boundaries */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+			if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
+				off += total_elt_sz -
+					(((uintptr_t)(va + off - 1) %
+						total_elt_sz) + 1);
+		}
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
-		obj = (char *)vaddr + off;
+		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
@@ -100,6 +140,6 @@ rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 				rte_mempool_populate_obj_cb_t *obj_cb,
 				void *obj_cb_arg)
 {
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+	return rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, iova,
 					len, obj_cb, obj_cb_arg);
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
                                             ` (5 preceding siblings ...)
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 6/7] mempool: prevent objects from being across pages Olivier Matz
@ 2019-11-04 15:12                           ` Olivier Matz
  2019-11-05 12:15                             ` Andrew Rybchenko
  6 siblings, 1 reply; 251+ messages in thread
From: Olivier Matz @ 2019-11-04 15:12 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

For consistency, RTE_MEMPOOL_ALIGN should be used in place of
RTE_CACHE_LINE_SIZE. They have the same value, because the only arch
that was defining a specific value for it has been removed from dpdk.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 drivers/mempool/bucket/rte_mempool_bucket.c  | 8 +++++++-
 lib/librte_mempool/rte_mempool.c             | 2 +-
 lib/librte_mempool/rte_mempool.h             | 3 +++
 lib/librte_mempool/rte_mempool_ops_default.c | 2 +-
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
index b978fd220..5ce1ef16f 100644
--- a/drivers/mempool/bucket/rte_mempool_bucket.c
+++ b/drivers/mempool/bucket/rte_mempool_bucket.c
@@ -401,6 +401,11 @@ bucket_alloc(struct rte_mempool *mp)
 	struct bucket_data *bd;
 	unsigned int i;
 	unsigned int bucket_header_size;
+	size_t pg_sz;
+
+	rc = rte_mempool_get_page_size(mp, &pg_sz);
+	if (rc < 0)
+		return rc;
 
 	bd = rte_zmalloc_socket("bucket_pool", sizeof(*bd),
 				RTE_CACHE_LINE_SIZE, mp->socket_id);
@@ -416,7 +421,8 @@ bucket_alloc(struct rte_mempool *mp)
 	RTE_BUILD_BUG_ON(sizeof(struct bucket_header) > RTE_CACHE_LINE_SIZE);
 	bd->header_size = mp->header_size + bucket_header_size;
 	bd->total_elt_size = mp->header_size + mp->elt_size + mp->trailer_size;
-	bd->bucket_mem_size = RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB * 1024;
+	bd->bucket_mem_size = RTE_MIN(pg_sz,
+			(size_t)(RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB * 1024));
 	bd->obj_per_bucket = (bd->bucket_mem_size - bucket_header_size) /
 		bd->total_elt_size;
 	bd->bucket_page_mask = ~(rte_align64pow2(bd->bucket_mem_size) - 1);
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index d3db9273d..40cae3eb6 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -329,7 +329,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
 		off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr;
 	else
-		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
+		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_MEMPOOL_ALIGN) - vaddr;
 
 	if (off > len) {
 		ret = -EINVAL;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index f1cba3521..e9989a87e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -116,6 +116,9 @@ struct rte_mempool_objsz {
 #define	MEMPOOL_PG_NUM_DEFAULT	1
 
 #ifndef RTE_MEMPOOL_ALIGN
+/**
+ * Alignment of elements inside mempool.
+ */
 #define RTE_MEMPOOL_ALIGN	RTE_CACHE_LINE_SIZE
 #endif
 
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index e6be7152b..22fccf9d7 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -60,7 +60,7 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 	}
 
 	*min_chunk_size = total_elt_sz;
-	*align = RTE_CACHE_LINE_SIZE;
+	*align = RTE_MEMPOOL_ALIGN;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v12 0/2] add IOVA=VA mode support
  2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
                                         ` (3 preceding siblings ...)
  2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 4/4] kni: add IOVA=VA support in kernel module vattunuru
@ 2019-11-05 11:04                       ` vattunuru
  2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 1/2] kni: " vattunuru
                                           ` (4 more replies)
  4 siblings, 5 replies; 251+ messages in thread
From: vattunuru @ 2019-11-05 11:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V12 Changes:
* Removed previously added `--legacy-kni` eal option.
* Removed previously added kni specific mempool create routines
and mempool populate routines.

This patch set (V12) depends on the following patch set, since the
mempool-related support needed to enable KNI in IOVA=VA mode is
handled there.

   https://patchwork.dpdk.org/cover/62376/

V11 Changes:
* Added IOVA-to-KVA address translation routines in the kernel module
to make it work in IOVA=VA mode, which enables DPDK to create KNI
devices backed by any kind of device/memory.
* Added ``--legacy-kni`` eal option to make existing KNI applications
work with DPDK 19.11 and later versions.
* Removed previously added pci device info from kni device info struct.
 
V10 Changes:
* Fixed function return code on failure when min_chunk_size > pg_sz.
* Marked new mempool populate routine as EXPERIMENTAL.
 
V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in the
new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.
 
V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones
with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.
 
V7 Changes:
* Removed previously proposed mempool flag and made those page
boundary checks the default in mempool populate(), except for objects
bigger than the page size.
* Removed KNI example application related changes since pool related
requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated the IOVA mode checks that were enforcing IOVA=PA mode when
IOVA=VA mode is enabled.
 
V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered across
page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new mempool
flag.
 
V5 changes:
* Fixed build issue with 32b build
 
V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernel above 4.4.0
 
V3 Changes:
* Add new approach to make KNI work in IOVA=VA mode using the
iommu_iova_to_phys API.

Vamsi Attunuru (2):
  kni: add IOVA=VA mode support
  kni: add IOVA=VA support in kernel module

 doc/guides/prog_guide/kernel_nic_interface.rst    |  9 ++++
 doc/guides/rel_notes/release_19_11.rst            |  5 ++
 kernel/linux/kni/compat.h                         | 15 ++++++
 kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
 kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
 kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
 lib/librte_eal/linux/eal/eal.c                    | 29 ++++++-----
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/rte_kni.c                          |  7 +--
 9 files changed, 170 insertions(+), 39 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v12 1/2] kni: add IOVA=VA mode support
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
@ 2019-11-05 11:04                         ` vattunuru
  2019-11-14 10:57                           ` David Marchand
  2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 2/2] kni: add IOVA=VA support in kernel module vattunuru
                                           ` (3 subsequent siblings)
  4 siblings, 1 reply; 251+ messages in thread
From: vattunuru @ 2019-11-05 11:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

The current KNI implementation only operates in IOVA_PA mode.
This patch adds the required functionality to enable KNI in
IOVA_VA mode.

KNI can operate in IOVA_VA mode when this mode is requested
with the EAL option (`iova-mode=va`) or when the bus IOMMU
scheme selects RTE_IOVA_VA.

During KNI creation, the application's iova_mode is passed to
the KNI kernel module; the kernel module then translates
PA/IOVA addresses to KVA and vice versa.
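
A minimal user-space sketch of the same mode check (illustrative only;
the function name kni_report_iova_mode() is made up, and the real flag
is filled in by rte_kni_alloc() as shown in the diff below):

    #include <stdio.h>
    #include <rte_eal.h>

    /* Report which translation path the KNI kernel module will use,
     * based on the IOVA mode selected by EAL at init time.
     */
    static void kni_report_iova_mode(void)
    {
        if (rte_eal_iova_mode() == RTE_IOVA_VA)
            printf("KNI will use IOVA=VA (iova_mode = 1)\n");
        else
            printf("KNI will use the legacy IOVA=PA path (iova_mode = 0)\n");
    }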

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst    |  9 +++++++
 doc/guides/rel_notes/release_19_11.rst            |  5 ++++
 lib/librte_eal/linux/eal/eal.c                    | 29 +++++++++++++----------
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/rte_kni.c                          |  7 ++----
 5 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 2fd58e1..f869493 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -300,6 +300,15 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI operates in IOVA_VA scheme when
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) and
+- eal option `iova-mode=va` is passed or bus IOVA scheme in the DPDK is selected
+  as RTE_IOVA_VA.
+
 Ethtool
 -------
 
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index ae8e7b2..e5d2996 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -231,6 +231,11 @@ New Features
   * Added a console command to testpmd app, ``show port (port_id) ptypes`` which
     gives ability to print port supported ptypes in different protocol layers.
 
+* **Added IOVA as VA support for KNI.**
+
+  Added IOVA = VA support for KNI, KNI can operate in IOVA = VA mode when
+  `iova-mode=va` eal option is passed to the application or when bus IOVA
+  scheme is selected as RTE_IOVA_VA.
 
 Removed Items
 -------------
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 9e2d50c..a1c5bf6 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -922,6 +922,19 @@ static int rte_eal_vfio_setup(void)
 }
 #endif
 
+static enum rte_iova_mode
+rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode)
+{
+	if (iova_mode == RTE_IOVA_VA) {
+#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
+		iova_mode = RTE_IOVA_PA;
+		RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
+#endif
+	}
+
+	return iova_mode;
+}
+
 static void rte_eal_init_alert(const char *msg)
 {
 	fprintf(stderr, "EAL: FATAL: %s\n", msg);
@@ -1085,24 +1098,16 @@ rte_eal_init(int argc, char **argv)
 				RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
 			}
 		}
-#ifdef RTE_LIBRTE_KNI
-		/* Workaround for KNI which requires physical address to work */
-		if (iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			if (phys_addrs) {
-				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
-			} else {
-				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
-			}
-		}
-#endif
 		rte_eal_get_configuration()->iova_mode = iova_mode;
 	} else {
 		rte_eal_get_configuration()->iova_mode =
 			internal_config.iova_mode;
 	}
 
+	if (rte_eal_check_module("rte_kni") == 1)
+		rte_eal_get_configuration()->iova_mode =
+				rte_eal_kni_get_iova_mode(rte_eal_iova_mode());
+
 	if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
 		rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
 		rte_errno = EINVAL;
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 46f75a7..2427a96 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -125,6 +125,7 @@ struct rte_kni_device_info {
 	unsigned int min_mtu;
 	unsigned int max_mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 7fbcf22..7221280 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -97,11 +97,6 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
-	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
-		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
-		return -1;
-	}
-
 	/* Check FD and open */
 	if (kni_fd < 0) {
 		kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
@@ -302,6 +297,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v12 2/2] kni: add IOVA=VA support in kernel module
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
  2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 1/2] kni: " vattunuru
@ 2019-11-05 11:04                         ` vattunuru
  2019-11-06 10:49                         ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support Thomas Monjalon
                                           ` (2 subsequent siblings)
  4 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-11-05 11:04 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

This patch adds support for the kernel module to work in
IOVA = VA mode by providing address translation routines
that convert IOVA (i.e. user-space VA) to kernel virtual
addresses.

Compared with IOVA=PA mode, there is no performance drop
with this approach, and the patch has no performance impact
on IOVA=PA mode either.

This approach does not work with kernel versions older
than 4.6.0.
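
A condensed view of the translation chain added below (sketch only;
iova_to_kva_sketch() is an illustrative name, assuming the
GET_USER_PAGES_REMOTE_API_V3 case from compat.h and the same headers
as kni_dev.h in the diff):

    /* Resolve one IOVA (user-space VA) to a kernel VA: pin the backing
     * page, convert it to a physical address, then use the kernel
     * linear mapping.
     */
    static void *iova_to_kva_sketch(struct task_struct *tsk,
                                    unsigned long iova)
    {
        unsigned long offset = iova & (PAGE_SIZE - 1);
        struct page *page = NULL;
        phys_addr_t pa;

        if (get_user_pages_remote(tsk, tsk->mm, iova, 1,
                                  FOLL_TOUCH, &page, NULL, NULL) < 0)
            return NULL;

        pa = page_to_phys(page) | offset;
        put_page(page);

        return phys_to_virt(pa);
    }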

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 kernel/linux/kni/compat.h   | 15 +++++++++++
 kernel/linux/kni/kni_dev.h  | 42 ++++++++++++++++++++++++++++++
 kernel/linux/kni/kni_misc.c | 39 +++++++++++++++++++++-------
 kernel/linux/kni/kni_net.c  | 62 +++++++++++++++++++++++++++++++++++----------
 4 files changed, 136 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..6591dd7 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,18 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 6, 0) <= LINUX_VERSION_CODE
+
+#define HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+
+#if (KERNEL_VERSION(4, 6, 0) <= LINUX_VERSION_CODE) && \
+	(KERNEL_VERSION(4, 9, 0) > LINUX_VERSION_CODE)
+#define GET_USER_PAGES_REMOTE_API_V1
+#elif KERNEL_VERSION(4, 9, 0) == LINUX_VERSION_CODE
+#define GET_USER_PAGES_REMOTE_API_V2
+#else
+#define GET_USER_PAGES_REMOTE_API_V3
+#endif
+
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..fb641b6 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -41,6 +41,8 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
@@ -84,8 +86,48 @@ struct kni_dev {
 	void *va[MBUF_BURST_SZ];
 	void *alloc_pa[MBUF_BURST_SZ];
 	void *alloc_va[MBUF_BURST_SZ];
+
+	struct task_struct *usr_tsk;
 };
 
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+static inline phys_addr_t iova_to_phys(struct task_struct *tsk,
+				       unsigned long iova)
+{
+	phys_addr_t offset, phys_addr;
+	struct page *page = NULL;
+	long ret;
+
+	offset = iova & (PAGE_SIZE - 1);
+
+	/* Read one page struct info */
+#ifdef GET_USER_PAGES_REMOTE_API_V3
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    FOLL_TOUCH, &page, NULL, NULL);
+#endif
+#ifdef GET_USER_PAGES_REMOTE_API_V2
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    FOLL_TOUCH, &page, NULL);
+#endif
+#ifdef GET_USER_PAGES_REMOTE_API_V1
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    0, 0, &page, NULL);
+#endif
+	if (ret < 0)
+		return 0;
+
+	phys_addr = page_to_phys(page) | offset;
+	put_page(page);
+
+	return phys_addr;
+}
+
+static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova)
+{
+	return phys_to_virt(iova_to_phys(tsk, iova));
+}
+#endif
+
 void kni_net_release_fifo_phy(struct kni_dev *kni);
 void kni_net_rx(struct kni_dev *kni);
 void kni_net_init(struct net_device *dev);
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 84ef03b..cda71bd 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -348,15 +348,36 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+		kni->tx_q = iova_to_kva(current, dev_info.tx_phys);
+		kni->rx_q = iova_to_kva(current, dev_info.rx_phys);
+		kni->alloc_q = iova_to_kva(current, dev_info.alloc_phys);
+		kni->free_q = iova_to_kva(current, dev_info.free_phys);
+
+		kni->req_q = iova_to_kva(current, dev_info.req_phys);
+		kni->resp_q = iova_to_kva(current, dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = iova_to_kva(current, dev_info.sync_phys);
+		kni->usr_tsk = current;
+		kni->iova_mode = 1;
+#else
+		pr_err("KNI module does not support IOVA to VA translation\n");
+		return -EINVAL;
+#endif
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index f25b127..1ba9b1b 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *iova)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, (unsigned long)iova));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, m->buf_physaddr) +
+			    m->data_off);
+}
+#endif
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +78,26 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+#endif
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+#endif
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +214,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +302,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +374,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +473,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +485,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +553,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +586,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment Olivier Matz
@ 2019-11-05 12:15                             ` Andrew Rybchenko
  2019-11-05 12:48                               ` Olivier Matz
  0 siblings, 1 reply; 251+ messages in thread
From: Andrew Rybchenko @ 2019-11-05 12:15 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

On 11/4/19 6:12 PM, Olivier Matz wrote:
> For consistency, RTE_MEMPOOL_ALIGN should be used in place of
> RTE_CACHE_LINE_SIZE. They have the same value, because the only arch
> that was defining a specific value for it has been removed from dpdk.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
> ---
>  drivers/mempool/bucket/rte_mempool_bucket.c  | 8 +++++++-
>  lib/librte_mempool/rte_mempool.c             | 2 +-
>  lib/librte_mempool/rte_mempool.h             | 3 +++
>  lib/librte_mempool/rte_mempool_ops_default.c | 2 +-
>  4 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
> index b978fd220..5ce1ef16f 100644
> --- a/drivers/mempool/bucket/rte_mempool_bucket.c
> +++ b/drivers/mempool/bucket/rte_mempool_bucket.c
> @@ -401,6 +401,11 @@ bucket_alloc(struct rte_mempool *mp)
>  	struct bucket_data *bd;
>  	unsigned int i;
>  	unsigned int bucket_header_size;
> +	size_t pg_sz;
> +
> +	rc = rte_mempool_get_page_size(mp, &pg_sz);
> +	if (rc < 0)
> +		return rc;

Looks unrelated to the patch.

>  	bd = rte_zmalloc_socket("bucket_pool", sizeof(*bd),
>  				RTE_CACHE_LINE_SIZE, mp->socket_id);

[snip]



^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v3 5/7] mempool: introduce helpers for populate and calc mem size
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
@ 2019-11-05 12:19                             ` Andrew Rybchenko
  0 siblings, 0 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-11-05 12:19 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

On 11/4/19 6:12 PM, Olivier Matz wrote:
> Introduce new functions that can be used by mempool drivers to
> calculate required memory size and to populate mempool.
>
> For now, these helpers just replace the *_default() functions
> without change. They will be enhanced in next commit.
>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> Acked-by: Nipun Gupta <nipun.gupta@nxp.com>

Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v3 6/7] mempool: prevent objects from being across pages
  2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 6/7] mempool: prevent objects from being across pages Olivier Matz
@ 2019-11-05 12:22                             ` Andrew Rybchenko
  0 siblings, 0 replies; 251+ messages in thread
From: Andrew Rybchenko @ 2019-11-05 12:22 UTC (permalink / raw)
  To: Olivier Matz, dev
  Cc: Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

On 11/4/19 6:12 PM, Olivier Matz wrote:
> When populating a mempool, ensure that objects are not located across
> several pages, except if user did not request iova contiguous objects.
>
> Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> Acked-by: Nipun Gupta <nipun.gupta@nxp.com>

Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>



^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment
  2019-11-05 12:15                             ` Andrew Rybchenko
@ 2019-11-05 12:48                               ` Olivier Matz
  0 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 12:48 UTC (permalink / raw)
  To: Andrew Rybchenko
  Cc: dev, Anatoly Burakov, Ferruh Yigit, Giridharan, Ganesan,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta

On Tue, Nov 05, 2019 at 03:15:13PM +0300, Andrew Rybchenko wrote:
> On 11/4/19 6:12 PM, Olivier Matz wrote:
> > For consistency, RTE_MEMPOOL_ALIGN should be used in place of
> > RTE_CACHE_LINE_SIZE. They have the same value, because the only arch
> > that was defining a specific value for it has been removed from dpdk.
> >
> > Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> > Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
> > Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
> > ---
> >  drivers/mempool/bucket/rte_mempool_bucket.c  | 8 +++++++-
> >  lib/librte_mempool/rte_mempool.c             | 2 +-
> >  lib/librte_mempool/rte_mempool.h             | 3 +++
> >  lib/librte_mempool/rte_mempool_ops_default.c | 2 +-
> >  4 files changed, 12 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
> > index b978fd220..5ce1ef16f 100644
> > --- a/drivers/mempool/bucket/rte_mempool_bucket.c
> > +++ b/drivers/mempool/bucket/rte_mempool_bucket.c
> > @@ -401,6 +401,11 @@ bucket_alloc(struct rte_mempool *mp)
> >  	struct bucket_data *bd;
> >  	unsigned int i;
> >  	unsigned int bucket_header_size;
> > +	size_t pg_sz;
> > +
> > +	rc = rte_mempool_get_page_size(mp, &pg_sz);
> > +	if (rc < 0)
> > +		return rc;
> 
> Looks unrelated to the patch.

Oops, indeed, every change in rte_mempool_bucket.c should be in patch 6/7
"mempool: prevent objects from being across pages". Thanks for spotting it.


> 
> >  	bd = rte_zmalloc_socket("bucket_pool", sizeof(*bd),
> >  				RTE_CACHE_LINE_SIZE, mp->socket_id);
> 
> [snip]
> 
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages
  2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
                                           ` (8 preceding siblings ...)
  2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
@ 2019-11-05 15:36                         ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
                                             ` (8 more replies)
  9 siblings, 9 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:36 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

KNI assumes that mbufs are contiguous in kernel virtual memory. This
may not be true when using the IOVA=VA mode. To fix this, a possibility
is to ensure that objects do not cross page boundaries in the mempool.
This patchset implements this in patch 6/7.

The previous patches prepare the job:
- allow populating with an unaligned virtual area (1/7).
- reduce the space wasted in the mempool size calculation when not
  using the iova-contiguous allocation (2/7).
- remove the iova-contiguous allocation when populating the mempool
  (3/7): a va-contiguous alloc does the job as well if we want to
  populate without crossing page boundaries, so simplify the mempool
  populate function.
- export a function to get the minimum page size used in a mempool
  (4/7); a short usage sketch follows this list.
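
A short sketch of how a mempool driver can use the new helper from
patch 4/7 (my_drv_alloc() is hypothetical; only
rte_mempool_get_page_size() is from this series, and it is
experimental, so callers build with -DALLOW_EXPERIMENTAL_API as in the
octeontx2 Makefile change):

    #include <rte_mempool.h>

    static int my_drv_alloc(struct rte_mempool *mp)
    {
        size_t pg_sz;
        int rc;

        /* minimum page size backing the mempool objects;
         * pg_sz == 0 means no page-boundary constraint applies
         */
        rc = rte_mempool_get_page_size(mp, &pg_sz);
        if (rc < 0)
            return rc;

        /* driver-specific setup sized against pg_sz would go here */
        return 0;
    }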

Memory consumption impact when using hugepages:
- worst case: + ~0.1% for a mbuf pool (objsize ~= 2368)
- best case: -50% if the pool size is just above the page size

With 4K pages in IOVA=VA mode, however, an mbuf pool could consume up
to 75% more memory, because there will be only 1 mbuf per page. Not
sure how common this use case is.
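
A back-of-the-envelope check of that 4K-page figure (illustrative
only, using the ~2368 byte mbuf object size quoted above):

    #include <stdio.h>

    int main(void)
    {
        size_t pg_sz = 4096;          /* 4K page */
        size_t total_elt_sz = 2368;   /* header + mbuf + trailer */
        size_t obj_per_page = pg_sz / total_elt_sz;      /* = 1 */
        double overhead = (double)(pg_sz - obj_per_page * total_elt_sz) /
                          (double)(obj_per_page * total_elt_sz);

        /* prints: objs/page = 1, overhead = 73% */
        printf("objs/page = %zu, overhead = %.0f%%\n",
               obj_per_page, overhead * 100.0);
        return 0;
    }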

Caveat: this changes the behavior of the mempool (calc_mem_size and
populate), and there is a small risk of breaking things, especially
with alternate mempool drivers.

v4

* remove useless comments in Makefiles and meson.build (sugg by David)
* add EXPERIMENTAL banner on new functions in API comments (David)
* sort by version in rte_mempool_version.map (David)
* remove duplicated -DALLOW_EXPERIMENTAL_API flag in octeontx2
  mempool driver
* enhance API comments for new helpers

v3

* introduce new helpers to calculate required memory size and to
  populate mempool, use them in drivers: the alignment constraint
  of octeontx/octeontx2 is managed in this common code.
* fix octeontx mempool driver by taking the alignment constraint into
  account like in octeontx2
* fix bucket mempool driver with 4K pages: limit bucket size in this
  case to ensure that objects do not cross page boundaries. With larger
  pages, it was already ok, because bucket size (64K) is smaller than
  a page.
* fix some api comments in mempool header file

v2

* update octeontx2 driver to keep alignment constraint (issue seen by
  Vamsi)
* add a new patch to use RTE_MEMPOOL_ALIGN (Andrew)
* fix initialization of for loop in rte_mempool_populate_virt() (Andrew)
* use rte_mempool_populate_iova() if mz_flags has
  RTE_MEMZONE_IOVA_CONTIG (Andrew)
* check rte_mempool_get_page_size() return value (Andrew)
* some other minor style improvements

rfc -> v1

* remove first cleanup patch, it was pushed separately
  a2b5a8722f20 ("mempool: clarify default populate function")
* add missing change in rte_mempool_op_calc_mem_size_default()
* allow unaligned addr/len in populate virt
* better split patches
* try to better explain the change
* use DPDK align macros when relevant

Olivier Matz (7):
  mempool: allow unaligned addr/len in populate virt
  mempool: reduce wasted space on mempool populate
  mempool: remove optimistic IOVA-contiguous allocation
  mempool: introduce function to get mempool page size
  mempool: introduce helpers for populate and calc mem size
  mempool: prevent objects from being across pages
  mempool: use the specific macro for object alignment

 drivers/mempool/bucket/Makefile               |   1 +
 drivers/mempool/bucket/meson.build            |   2 +
 drivers/mempool/bucket/rte_mempool_bucket.c   |  10 +-
 drivers/mempool/dpaa/dpaa_mempool.c           |   4 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |   4 +-
 drivers/mempool/octeontx/Makefile             |   2 +
 drivers/mempool/octeontx/meson.build          |   2 +
 .../mempool/octeontx/rte_mempool_octeontx.c   |  21 +--
 drivers/mempool/octeontx2/Makefile            |   2 +
 drivers/mempool/octeontx2/meson.build         |   2 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c  |  21 ++-
 lib/librte_mempool/rte_mempool.c              | 147 +++++++-----------
 lib/librte_mempool/rte_mempool.h              | 114 ++++++++++++--
 lib/librte_mempool/rte_mempool_ops.c          |   4 +-
 lib/librte_mempool/rte_mempool_ops_default.c  | 113 +++++++++++---
 lib/librte_mempool/rte_mempool_version.map    |   6 +
 16 files changed, 312 insertions(+), 143 deletions(-)

-- 
2.20.1


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 1/7] mempool: allow unaligned addr/len in populate virt
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 2/7] mempool: reduce wasted space on mempool populate Olivier Matz
                                             ` (7 subsequent siblings)
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

rte_mempool_populate_virt() currently requires that both addr
and length are page-aligned.

Remove this unneeded constraint, which can be annoying with big
hugepages (ex: 1GB).
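
A sketch of what this enables (populate_from_ext_buf() and
ext_buf_free() are made-up names for a hypothetical caller-owned
buffer; only rte_mempool_populate_virt() is from the mempool API):

    #include <rte_mempool.h>

    static void ext_buf_free(struct rte_mempool_memhdr *memhdr __rte_unused,
                             void *opaque __rte_unused)
    {
        /* nothing to do: the caller keeps ownership of the buffer */
    }

    /* Populate from a virtual area whose start address and length are
     * not page-aligned, which this patch now allows.
     */
    static int populate_from_ext_buf(struct rte_mempool *mp, char *buf,
                                     size_t len, size_t pg_sz)
    {
        return rte_mempool_populate_virt(mp, buf, len, pg_sz,
                                         ext_buf_free, NULL);
    }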

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c | 23 +++++++++++------------
 lib/librte_mempool/rte_mempool.h |  3 +--
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 0f29e8712..88e49c751 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -368,17 +368,11 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	size_t off, phys_len;
 	int ret, cnt = 0;
 
-	/* address and len must be page-aligned */
-	if (RTE_PTR_ALIGN_CEIL(addr, pg_sz) != addr)
-		return -EINVAL;
-	if (RTE_ALIGN_CEIL(len, pg_sz) != len)
-		return -EINVAL;
-
 	if (mp->flags & MEMPOOL_F_NO_IOVA_CONTIG)
 		return rte_mempool_populate_iova(mp, addr, RTE_BAD_IOVA,
 			len, free_cb, opaque);
 
-	for (off = 0; off + pg_sz <= len &&
+	for (off = 0; off < len &&
 		     mp->populated_size < mp->size; off += phys_len) {
 
 		iova = rte_mem_virt2iova(addr + off);
@@ -389,12 +383,18 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 		}
 
 		/* populate with the largest group of contiguous pages */
-		for (phys_len = pg_sz; off + phys_len < len; phys_len += pg_sz) {
+		for (phys_len = RTE_MIN(
+			(size_t)(RTE_PTR_ALIGN_CEIL(addr + off + 1, pg_sz) -
+				(addr + off)),
+			len - off);
+		     off + phys_len < len;
+		     phys_len = RTE_MIN(phys_len + pg_sz, len - off)) {
 			rte_iova_t iova_tmp;
 
 			iova_tmp = rte_mem_virt2iova(addr + off + phys_len);
 
-			if (iova_tmp != iova + phys_len)
+			if (iova_tmp == RTE_BAD_IOVA ||
+					iova_tmp != iova + phys_len)
 				break;
 		}
 
@@ -575,8 +575,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags,
-					RTE_MAX(pg_sz, align));
+					mp->socket_id, flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -601,7 +600,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 				(void *)(uintptr_t)mz);
 		else
 			ret = rte_mempool_populate_virt(mp, mz->addr,
-				RTE_ALIGN_FLOOR(mz->len, pg_sz), pg_sz,
+				mz->len, pg_sz,
 				rte_mempool_memchunk_mz_free,
 				(void *)(uintptr_t)mz);
 		if (ret < 0) {
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 8053f7a04..0fe8aa7b8 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1042,9 +1042,8 @@ int rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
  *   A pointer to the mempool structure.
  * @param addr
  *   The virtual address of memory that should be used to store objects.
- *   Must be page-aligned.
  * @param len
- *   The length of memory in bytes. Must be page-aligned.
+ *   The length of memory in bytes.
  * @param pg_sz
  *   The size of memory pages in this virtual area.
  * @param free_cb
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 2/7] mempool: reduce wasted space on mempool populate
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 3/7] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
                                             ` (6 subsequent siblings)
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

The size returned by rte_mempool_op_calc_mem_size_default() is aligned
to the specified page size. Therefore, with big pages, the returned size
can be much more than what we really need to populate the mempool.

For instance, populating a mempool that requires 1.1GB of memory with
1GB hugepages can result in allocating 2GB of memory.

This problem is hidden most of the time due to the allocation method of
rte_mempool_populate_default(): when try_iova_contig_mempool=true, it
first tries to allocate an iova contiguous area, without the alignment
constraint. If it fails, it falls back to an aligned allocation that does
not need to be iova-contiguous. This can also fall back to several
smaller aligned allocations.

This commit changes rte_mempool_op_calc_mem_size_default() to relax the
alignment constraint to a cache line and to return a smaller size.
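
A rough numeric illustration of the two formulas (sketch mirroring the
old and new code in rte_mempool_op_calc_mem_size_default(); the object
count and size are picked to match the 1.1GB example above):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t pg_sz = 1ULL << 30;     /* 1GB hugepage */
        uint64_t total_elt_sz = 2368;    /* mbuf-sized object */
        uint64_t obj_num = 500000;       /* ~1.18GB of objects */
        uint64_t obj_per_page = pg_sz / total_elt_sz;

        /* old: round up to a whole number of pages */
        uint64_t old_sz =
            ((obj_num + obj_per_page - 1) / obj_per_page) * pg_sz;

        /* new: room really needed, plus a total_elt_sz - 1 margin */
        uint64_t objs_last = ((obj_num - 1) % obj_per_page) + 1;
        uint64_t new_sz = objs_last * total_elt_sz +
            ((obj_num - objs_last) / obj_per_page) * pg_sz +
            total_elt_sz - 1;

        /* prints: old = 2.15 GB, new = 1.18 GB */
        printf("old = %.2f GB, new = %.2f GB\n",
               old_sz / 1e9, new_sz / 1e9);
        return 0;
    }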

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c             |  7 ++---
 lib/librte_mempool/rte_mempool.h             |  9 +++----
 lib/librte_mempool/rte_mempool_ops.c         |  4 ++-
 lib/librte_mempool/rte_mempool_ops_default.c | 28 +++++++++++++++-----
 4 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 88e49c751..4e0d576f5 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -477,11 +477,8 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * wasting some space this way, but it's much nicer than looping around
 	 * trying to reserve each and every page size.
 	 *
-	 * However, since size calculation will produce page-aligned sizes, it
-	 * makes sense to first try and see if we can reserve the entire memzone
-	 * in one contiguous chunk as well (otherwise we might end up wasting a
-	 * 1G page on a 10MB memzone). If we fail to get enough contiguous
-	 * memory, then we'll go and reserve space page-by-page.
+	 * If we fail to get enough contiguous memory, then we'll go and
+	 * reserve space in smaller chunks.
 	 *
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 0fe8aa7b8..78b687bb6 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -458,7 +458,7 @@ typedef unsigned (*rte_mempool_get_count)(const struct rte_mempool *mp);
  * @param[out] align
  *   Location for required memory chunk alignment.
  * @return
- *   Required memory size aligned at page boundary.
+ *   Required memory size.
  */
 typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		uint32_t obj_num,  uint32_t pg_shift,
@@ -477,11 +477,8 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  * that pages are grouped in subsets of physically continuous pages big
  * enough to store at least one object.
  *
- * Minimum size of memory chunk is a maximum of the page size and total
- * element size.
- *
- * Required memory chunk alignment is a maximum of page size and cache
- * line size.
+ * Minimum size of memory chunk is the total element size.
+ * Required memory chunk alignment is the cache line size.
  */
 ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops.c b/lib/librte_mempool/rte_mempool_ops.c
index e02eb702c..22c5251eb 100644
--- a/lib/librte_mempool/rte_mempool_ops.c
+++ b/lib/librte_mempool/rte_mempool_ops.c
@@ -100,7 +100,9 @@ rte_mempool_ops_get_count(const struct rte_mempool *mp)
 	return ops->get_count(mp);
 }
 
-/* wrapper to notify new memory area to external mempool */
+/* wrapper to calculate the memory size required to store given number
+ * of objects
+ */
 ssize_t
 rte_mempool_ops_calc_mem_size(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 4e2bfc82d..f6aea7662 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -12,7 +12,7 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				     size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
-	size_t obj_per_page, pg_num, pg_sz;
+	size_t obj_per_page, pg_sz, objs_in_last_page;
 	size_t mem_size;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
@@ -33,14 +33,30 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 			mem_size =
 				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
 		} else {
-			pg_num = (obj_num + obj_per_page - 1) / obj_per_page;
-			mem_size = pg_num << pg_shift;
+			/* In the best case, the allocator will return a
+			 * page-aligned address. For example, with 5 objs,
+			 * the required space is as below:
+			 *  |     page0     |     page1     |  page2 (last) |
+			 *  |obj0 |obj1 |xxx|obj2 |obj3 |xxx|obj4|
+			 *  <------------- mem_size ------------->
+			 */
+			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
+			/* room required for the last page */
+			mem_size = objs_in_last_page * total_elt_sz;
+			/* room required for other pages */
+			mem_size += ((obj_num - objs_in_last_page) /
+				obj_per_page) << pg_shift;
+
+			/* In the worst case, the allocator returns a
+			 * non-aligned pointer, wasting up to
+			 * total_elt_sz. Add a margin for that.
+			 */
+			 mem_size += total_elt_sz - 1;
 		}
 	}
 
-	*min_chunk_size = RTE_MAX((size_t)1 << pg_shift, total_elt_sz);
-
-	*align = RTE_MAX((size_t)RTE_CACHE_LINE_SIZE, (size_t)1 << pg_shift);
+	*min_chunk_size = total_elt_sz;
+	*align = RTE_CACHE_LINE_SIZE;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 3/7] mempool: remove optimistic IOVA-contiguous allocation
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 2/7] mempool: reduce wasted space on mempool populate Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 4/7] mempool: introduce function to get mempool page size Olivier Matz
                                             ` (5 subsequent siblings)
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

The previous commit reduced the amount of memory required when
populating the mempool with non-IOVA-contiguous memory.

Since there is no big advantage to having a fully IOVA-contiguous
mempool when it is not explicitly requested, remove this code; it
simplifies the populate function.
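
For reference, the resulting decision in the populate loop boils down
to the following (a condensed sketch of the hunk below, no new logic):

  /* request an IOVA-contiguous memzone only when the mempool driver
   * itself requires the whole area as a single chunk
   */
  if (min_chunk_size == (size_t)mem_size)
          mz_flags |= RTE_MEMZONE_IOVA_CONTIG;

  mz = rte_memzone_reserve_aligned(mz_name, mem_size,
          mp->socket_id, mz_flags, align);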

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c | 47 ++++++--------------------------
 1 file changed, 8 insertions(+), 39 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 4e0d576f5..213e574fc 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -430,7 +430,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool try_iova_contig_mempool;
 	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
@@ -483,9 +482,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * We also have to take into account the fact that memory that we're
 	 * going to allocate from can belong to an externally allocated memory
 	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold. We should also try
-	 * to go for contiguous memory even if we're in no-huge mode, because
-	 * external memory may in fact be IOVA-contiguous.
+	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	/* check if we can retrieve a valid socket ID */
@@ -494,7 +491,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		return -EINVAL;
 	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
-	try_iova_contig_mempool = false;
 
 	if (!need_iova_contig_obj) {
 		pg_sz = 0;
@@ -503,7 +499,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		pg_sz = 0;
 		pg_shift = 0;
 	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		try_iova_contig_mempool = true;
 		pg_sz = get_min_page_size(mp->socket_id);
 		pg_shift = rte_bsf32(pg_sz);
 	} else {
@@ -513,14 +508,9 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
-		unsigned int flags;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					0, &min_chunk_size, &align);
-		else
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
+		mem_size = rte_mempool_ops_calc_mem_size(
+			mp, n, pg_shift, &min_chunk_size, &align);
 
 		if (mem_size < 0) {
 			ret = mem_size;
@@ -534,36 +524,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			goto fail;
 		}
 
-		flags = mz_flags;
-
 		/* if we're trying to reserve contiguous memory, add appropriate
 		 * memzone flag.
 		 */
-		if (try_iova_contig_mempool)
-			flags |= RTE_MEMZONE_IOVA_CONTIG;
+		if (min_chunk_size == (size_t)mem_size)
+			mz_flags |= RTE_MEMZONE_IOVA_CONTIG;
 
 		mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-
-		/* if we were trying to allocate contiguous memory, failed and
-		 * minimum required contiguous chunk fits minimum page, adjust
-		 * memzone size to the page size, and try again.
-		 */
-		if (mz == NULL && try_iova_contig_mempool &&
-				min_chunk_size <= pg_sz) {
-			try_iova_contig_mempool = false;
-			flags &= ~RTE_MEMZONE_IOVA_CONTIG;
-
-			mem_size = rte_mempool_ops_calc_mem_size(mp, n,
-					pg_shift, &min_chunk_size, &align);
-			if (mem_size < 0) {
-				ret = mem_size;
-				goto fail;
-			}
+				mp->socket_id, mz_flags, align);
 
-			mz = rte_memzone_reserve_aligned(mz_name, mem_size,
-				mp->socket_id, flags, align);
-		}
 		/* don't try reserving with 0 size if we were asked to reserve
 		 * IOVA-contiguous memory.
 		 */
@@ -572,7 +541,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 			 * have
 			 */
 			mz = rte_memzone_reserve_aligned(mz_name, 0,
-					mp->socket_id, flags, align);
+					mp->socket_id, mz_flags, align);
 		}
 		if (mz == NULL) {
 			ret = -rte_errno;
@@ -590,7 +559,7 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 		else
 			iova = RTE_BAD_IOVA;
 
-		if (try_iova_contig_mempool || pg_sz == 0)
+		if (pg_sz == 0 || (mz_flags & RTE_MEMZONE_IOVA_CONTIG))
 			ret = rte_mempool_populate_iova(mp, mz->addr,
 				iova, mz->len,
 				rte_mempool_memchunk_mz_free,
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 4/7] mempool: introduce function to get mempool page size
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
                                             ` (2 preceding siblings ...)
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 3/7] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
                                             ` (4 subsequent siblings)
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

In rte_mempool_populate_default(), we determine the page size,
which is needed for calc_size and allocation of memory.

Move this logic into a function and export it; it will be used in a
later commit.
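
As a rough usage sketch (hypothetical driver code, not part of this
patch; only rte_mempool_get_page_size() is taken from the diff below):

  #include <rte_mempool.h>

  /* query the page size the pool will be populated with before
   * sizing any per-page bookkeeping
   */
  static int
  example_mempool_alloc(struct rte_mempool *mp)
  {
          size_t pg_sz;
          int ret;

          ret = rte_mempool_get_page_size(mp, &pg_sz);
          if (ret < 0)
                  return ret;

          /* pg_sz == 0: page boundaries can be ignored for this pool;
           * otherwise size driver structures so that no object
           * crosses a pg_sz boundary.
           */
          return 0;
  }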

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c           | 51 ++++++++++++++--------
 lib/librte_mempool/rte_mempool.h           | 11 +++++
 lib/librte_mempool/rte_mempool_version.map |  4 ++
 3 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 213e574fc..758c5410b 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -414,6 +414,33 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr,
 	return ret;
 }
 
+/* Get the minimal page size used in a mempool before populating it. */
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
+{
+	bool need_iova_contig_obj;
+	bool alloc_in_ext_mem;
+	int ret;
+
+	/* check if we can retrieve a valid socket ID */
+	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
+	if (ret < 0)
+		return -EINVAL;
+	alloc_in_ext_mem = (ret == 1);
+	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+
+	if (!need_iova_contig_obj)
+		*pg_sz = 0;
+	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
+		*pg_sz = 0;
+	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
+		*pg_sz = get_min_page_size(mp->socket_id);
+	else
+		*pg_sz = getpagesize();
+
+	return 0;
+}
+
 /* Default function to populate the mempool: allocate memory in memzones,
  * and populate them. Return the number of objects added, or a negative
  * value on error.
@@ -425,12 +452,11 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	char mz_name[RTE_MEMZONE_NAMESIZE];
 	const struct rte_memzone *mz;
 	ssize_t mem_size;
-	size_t align, pg_sz, pg_shift;
+	size_t align, pg_sz, pg_shift = 0;
 	rte_iova_t iova;
 	unsigned mz_id, n;
 	int ret;
 	bool need_iova_contig_obj;
-	bool alloc_in_ext_mem;
 
 	ret = mempool_ops_alloc_once(mp);
 	if (ret != 0)
@@ -485,26 +511,13 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
-	/* check if we can retrieve a valid socket ID */
-	ret = rte_malloc_heap_socket_is_external(mp->socket_id);
-	if (ret < 0)
-		return -EINVAL;
-	alloc_in_ext_mem = (ret == 1);
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
-	if (!need_iova_contig_obj) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA) {
-		pg_sz = 0;
-		pg_shift = 0;
-	} else if (rte_eal_has_hugepages() || alloc_in_ext_mem) {
-		pg_sz = get_min_page_size(mp->socket_id);
-		pg_shift = rte_bsf32(pg_sz);
-	} else {
-		pg_sz = getpagesize();
+	if (pg_sz != 0)
 		pg_shift = rte_bsf32(pg_sz);
-	}
 
 	for (mz_id = 0, n = mp->size; n > 0; mz_id++, n -= ret) {
 		size_t min_chunk_size;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 78b687bb6..9e41f595e 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -1688,6 +1688,17 @@ uint32_t rte_mempool_calc_obj_size(uint32_t elt_size, uint32_t flags,
 void rte_mempool_walk(void (*func)(struct rte_mempool *, void *arg),
 		      void *arg);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Get page size used for mempool object allocation.
+ * This function is internal to mempool library and mempool drivers.
+ */
+__rte_experimental
+int
+rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index 17cbca460..aa75cf086 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -56,5 +56,9 @@ DPDK_18.05 {
 EXPERIMENTAL {
 	global:
 
+	# added in 18.05
 	rte_mempool_ops_get_info;
+
+	# added in 19.11
+	rte_mempool_get_page_size;
 };
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 5/7] mempool: introduce helpers for populate and calc mem size
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
                                             ` (3 preceding siblings ...)
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 4/7] mempool: introduce function to get mempool page size Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 6/7] mempool: prevent objects from being across pages Olivier Matz
                                             ` (3 subsequent siblings)
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

Introduce new functions that can be used by mempool drivers to
calculate the required memory size and to populate a mempool.

For now, these helpers just replace the *_default() functions without
change. They will be enhanced in the next commit.
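
For example, a driver's ops can simply delegate to the new helpers
(illustrative callbacks with made-up names; the helper signatures are
the ones introduced here, before the extra arguments added in the next
commit):

  static ssize_t
  example_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
                        uint32_t pg_shift, size_t *min_chunk_size,
                        size_t *align)
  {
          /* driver-specific constraints could be applied around the
           * generic computation
           */
          return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
                                                     min_chunk_size, align);
  }

  static int
  example_populate(struct rte_mempool *mp, unsigned int max_objs,
                   void *vaddr, rte_iova_t iova, size_t len,
                   rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
  {
          return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
                                                len, obj_cb, obj_cb_arg);
  }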

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 drivers/mempool/bucket/Makefile               |  1 +
 drivers/mempool/bucket/meson.build            |  2 +
 drivers/mempool/bucket/rte_mempool_bucket.c   |  2 +-
 drivers/mempool/dpaa/dpaa_mempool.c           |  2 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  2 +-
 drivers/mempool/octeontx/Makefile             |  2 +
 drivers/mempool/octeontx/meson.build          |  2 +
 .../mempool/octeontx/rte_mempool_octeontx.c   |  4 +-
 drivers/mempool/octeontx2/Makefile            |  2 +
 drivers/mempool/octeontx2/meson.build         |  2 +
 drivers/mempool/octeontx2/otx2_mempool_ops.c  |  4 +-
 lib/librte_mempool/rte_mempool.h              | 71 ++++++++++++++++++-
 lib/librte_mempool/rte_mempool_ops_default.c  | 31 ++++++--
 lib/librte_mempool/rte_mempool_version.map    |  2 +
 14 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/drivers/mempool/bucket/Makefile b/drivers/mempool/bucket/Makefile
index 7364916bc..d6660b09c 100644
--- a/drivers/mempool/bucket/Makefile
+++ b/drivers/mempool/bucket/Makefile
@@ -15,6 +15,7 @@ LIB = librte_mempool_bucket.a
 
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 
 LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
 
diff --git a/drivers/mempool/bucket/meson.build b/drivers/mempool/bucket/meson.build
index 618d79128..2fd77f9d4 100644
--- a/drivers/mempool/bucket/meson.build
+++ b/drivers/mempool/bucket/meson.build
@@ -6,4 +6,6 @@
 # This software was jointly developed between OKTET Labs (under contract
 # for Solarflare) and Solarflare Communications, Inc.
 
+allow_experimental_apis = true
+
 sources = files('rte_mempool_bucket.c')
diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
index 78d2b9d04..dfeaf4e45 100644
--- a/drivers/mempool/bucket/rte_mempool_bucket.c
+++ b/drivers/mempool/bucket/rte_mempool_bucket.c
@@ -585,7 +585,7 @@ bucket_populate(struct rte_mempool *mp, unsigned int max_objs,
 
 		hdr->fill_cnt = 0;
 		hdr->lcore_id = LCORE_ID_ANY;
-		rc = rte_mempool_op_populate_default(mp,
+		rc = rte_mempool_op_populate_helper(mp,
 						     RTE_MIN(bd->obj_per_bucket,
 							     max_objs - n_objs),
 						     iter + bucket_header_sz,
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index a25697f05..27736e6c2 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -341,7 +341,7 @@ dpaa_populate(struct rte_mempool *mp, unsigned int max_objs,
 	 */
 	TAILQ_INSERT_HEAD(&rte_dpaa_memsegs, ms, next);
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, paddr, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index f26c30b00..8f8dbeada 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -421,7 +421,7 @@ dpaa2_populate(struct rte_mempool *mp, unsigned int max_objs,
 	/* Insert entry into the PA->VA Table */
 	dpaax_iova_table_update(paddr, vaddr, len);
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, paddr, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/drivers/mempool/octeontx/Makefile b/drivers/mempool/octeontx/Makefile
index a3e1dce88..25efc5cf2 100644
--- a/drivers/mempool/octeontx/Makefile
+++ b/drivers/mempool/octeontx/Makefile
@@ -11,6 +11,8 @@ LIB = librte_mempool_octeontx.a
 
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -I$(RTE_SDK)/drivers/common/octeontx/
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 EXPORT_MAP := rte_mempool_octeontx_version.map
 
 LIBABIVER := 1
diff --git a/drivers/mempool/octeontx/meson.build b/drivers/mempool/octeontx/meson.build
index 3baaf7db2..195435950 100644
--- a/drivers/mempool/octeontx/meson.build
+++ b/drivers/mempool/octeontx/meson.build
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2017 Cavium, Inc
 
+allow_experimental_apis = true
+
 sources = files('octeontx_fpavf.c',
 		'rte_mempool_octeontx.c'
 )
diff --git a/drivers/mempool/octeontx/rte_mempool_octeontx.c b/drivers/mempool/octeontx/rte_mempool_octeontx.c
index ab94dfe91..fff33e5c6 100644
--- a/drivers/mempool/octeontx/rte_mempool_octeontx.c
+++ b/drivers/mempool/octeontx/rte_mempool_octeontx.c
@@ -137,7 +137,7 @@ octeontx_fpavf_calc_mem_size(const struct rte_mempool *mp,
 	 * Simply need space for one more object to be able to
 	 * fulfil alignment requirements.
 	 */
-	mem_size = rte_mempool_op_calc_mem_size_default(mp, obj_num + 1,
+	mem_size = rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1,
 							pg_shift,
 							min_chunk_size, align);
 	if (mem_size >= 0) {
@@ -184,7 +184,7 @@ octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
 	if (ret < 0)
 		return ret;
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/drivers/mempool/octeontx2/Makefile b/drivers/mempool/octeontx2/Makefile
index 87cce22c6..4961a4a35 100644
--- a/drivers/mempool/octeontx2/Makefile
+++ b/drivers/mempool/octeontx2/Makefile
@@ -23,6 +23,8 @@ CFLAGS += -diag-disable 2259
 endif
 endif
 
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
 EXPORT_MAP := rte_mempool_octeontx2_version.map
 
 LIBABIVER := 1
diff --git a/drivers/mempool/octeontx2/meson.build b/drivers/mempool/octeontx2/meson.build
index 9fde40f0e..8255377df 100644
--- a/drivers/mempool/octeontx2/meson.build
+++ b/drivers/mempool/octeontx2/meson.build
@@ -2,6 +2,8 @@
 # Copyright(C) 2019 Marvell International Ltd.
 #
 
+allow_experimental_apis = true
+
 sources = files('otx2_mempool_ops.c',
 		'otx2_mempool.c',
 		'otx2_mempool_irq.c',
diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index d769575f4..3aea92a01 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -717,7 +717,7 @@ otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
 	 * Simply need space for one more object to be able to
 	 * fulfill alignment requirements.
 	 */
-	return rte_mempool_op_calc_mem_size_default(mp, obj_num + 1, pg_shift,
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1, pg_shift,
 						    min_chunk_size, align);
 }
 
@@ -749,7 +749,7 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
 		return -EBUSY;
 
-	return rte_mempool_op_populate_default(mp, max_objs, vaddr, iova, len,
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
 					       obj_cb, obj_cb_arg);
 }
 
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 9e41f595e..ad7cc6ad2 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -465,8 +465,13 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
 		size_t *min_chunk_size, size_t *align);
 
 /**
- * Default way to calculate memory size required to store given number of
- * objects.
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Helper to calculate memory size required to store given
+ * number of objects.
+ *
+ * This function is internal to mempool library and mempool drivers.
  *
  * If page boundaries may be ignored, it is just a product of total
  * object size including header and trailer and number of objects.
@@ -479,6 +484,32 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  *
  * Minimum size of memory chunk is the total element size.
  * Required memory chunk alignment is the cache line size.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] obj_num
+ *   Number of objects to be added in mempool.
+ * @param[in] pg_shift
+ *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @param[out] min_chunk_size
+ *   Location for minimum size of the memory chunk which may be used to
+ *   store memory pool objects.
+ * @param[out] align
+ *   Location for required memory chunk alignment.
+ * @return
+ *   Required memory size.
+ */
+__rte_experimental
+ssize_t rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
+		uint32_t obj_num, uint32_t pg_shift,
+		size_t *min_chunk_size, size_t *align);
+
+/**
+ * Default way to calculate memory size required to store given number of
+ * objects.
+ *
+ * Equivalent to rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+ * min_chunk_size, align).
  */
 ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
@@ -533,8 +564,42 @@ typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
 
 /**
- * Default way to populate memory pool object using provided memory
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * @internal Helper to populate memory pool object using provided memory
  * chunk: just slice objects one by one.
+ *
+ * This function is internal to mempool library and mempool drivers.
+ *
+ * @param[in] mp
+ *   A pointer to the mempool structure.
+ * @param[in] max_objs
+ *   Maximum number of objects to be added in mempool.
+ * @param[in] vaddr
+ *   The virtual address of memory that should be used to store objects.
+ * @param[in] iova
+ *   The IO address corresponding to vaddr, or RTE_BAD_IOVA.
+ * @param[in] len
+ *   The length of memory in bytes.
+ * @param[in] obj_cb
+ *   Callback function to be executed for each populated object.
+ * @param[in] obj_cb_arg
+ *   An opaque pointer passed to the callback function.
+ * @return
+ *   The number of objects added in mempool.
+ */
+__rte_experimental
+int rte_mempool_op_populate_helper(struct rte_mempool *mp,
+		unsigned int max_objs,
+		void *vaddr, rte_iova_t iova, size_t len,
+		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
+
+/**
+ * Default way to populate memory pool object using provided memory chunk.
+ *
+ * Equivalent to rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+ * len, obj_cb, obj_cb_arg).
  */
 int rte_mempool_op_populate_default(struct rte_mempool *mp,
 		unsigned int max_objs,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index f6aea7662..0bfc63497 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -7,9 +7,9 @@
 #include <rte_mempool.h>
 
 ssize_t
-rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
-				     uint32_t obj_num, uint32_t pg_shift,
-				     size_t *min_chunk_size, size_t *align)
+rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
+				uint32_t obj_num, uint32_t pg_shift,
+				size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
 	size_t obj_per_page, pg_sz, objs_in_last_page;
@@ -61,10 +61,19 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 	return mem_size;
 }
 
+ssize_t
+rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
+				uint32_t obj_num, uint32_t pg_shift,
+				size_t *min_chunk_size, size_t *align)
+{
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						min_chunk_size, align);
+}
+
 int
-rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
-		void *vaddr, rte_iova_t iova, size_t len,
-		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int max_objs,
+			void *vaddr, rte_iova_t iova, size_t len,
+			rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
 {
 	size_t total_elt_sz;
 	size_t off;
@@ -84,3 +93,13 @@ rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 
 	return i;
 }
+
+int
+rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
+				void *vaddr, rte_iova_t iova, size_t len,
+				rte_mempool_populate_obj_cb_t *obj_cb,
+				void *obj_cb_arg)
+{
+	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+					len, obj_cb, obj_cb_arg);
+}
diff --git a/lib/librte_mempool/rte_mempool_version.map b/lib/librte_mempool/rte_mempool_version.map
index aa75cf086..ce9e791e7 100644
--- a/lib/librte_mempool/rte_mempool_version.map
+++ b/lib/librte_mempool/rte_mempool_version.map
@@ -61,4 +61,6 @@ EXPERIMENTAL {
 
 	# added in 19.11
 	rte_mempool_get_page_size;
+	rte_mempool_op_calc_mem_size_helper;
+	rte_mempool_op_populate_helper;
 };
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 6/7] mempool: prevent objects from being across pages
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
                                             ` (4 preceding siblings ...)
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 7/7] mempool: use the specific macro for object alignment Olivier Matz
                                             ` (2 subsequent siblings)
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

When populating a mempool, ensure that objects do not span several
pages, unless the user explicitly opted out of IOVA-contiguous objects.
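
The rule enforced by the default populate helper can be summarised by
the following check (a condensed restatement of check_obj_bounds() in
the rte_mempool_ops_default.c hunk below, not additional behaviour):

  #include <rte_common.h>	/* for RTE_PTR_ALIGN */

  /* returns non-zero if an object of elt_sz bytes placed at obj would
   * straddle a pg_sz page boundary; pg_sz == 0 disables the check and
   * objects larger than a page are allowed to cross
   */
  static int
  obj_crosses_page(char *obj, size_t pg_sz, size_t elt_sz)
  {
          if (pg_sz == 0 || elt_sz > pg_sz)
                  return 0;
          return RTE_PTR_ALIGN(obj, pg_sz) !=
                  RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz);
  }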

Signed-off-by: Vamsi Krishna Attunuru <vattunuru@marvell.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
Acked-by: Andrew Rybchenko <arybchenko@solarflare.com>
---
 drivers/mempool/bucket/rte_mempool_bucket.c   | 10 ++-
 drivers/mempool/dpaa/dpaa_mempool.c           |  4 +-
 drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |  4 +-
 .../mempool/octeontx/rte_mempool_octeontx.c   | 21 +++---
 drivers/mempool/octeontx2/otx2_mempool_ops.c  | 21 +++---
 lib/librte_mempool/rte_mempool.c              | 23 ++-----
 lib/librte_mempool/rte_mempool.h              | 27 ++++++--
 lib/librte_mempool/rte_mempool_ops_default.c  | 66 +++++++++++++++----
 8 files changed, 119 insertions(+), 57 deletions(-)

diff --git a/drivers/mempool/bucket/rte_mempool_bucket.c b/drivers/mempool/bucket/rte_mempool_bucket.c
index dfeaf4e45..5ce1ef16f 100644
--- a/drivers/mempool/bucket/rte_mempool_bucket.c
+++ b/drivers/mempool/bucket/rte_mempool_bucket.c
@@ -401,6 +401,11 @@ bucket_alloc(struct rte_mempool *mp)
 	struct bucket_data *bd;
 	unsigned int i;
 	unsigned int bucket_header_size;
+	size_t pg_sz;
+
+	rc = rte_mempool_get_page_size(mp, &pg_sz);
+	if (rc < 0)
+		return rc;
 
 	bd = rte_zmalloc_socket("bucket_pool", sizeof(*bd),
 				RTE_CACHE_LINE_SIZE, mp->socket_id);
@@ -416,7 +421,8 @@ bucket_alloc(struct rte_mempool *mp)
 	RTE_BUILD_BUG_ON(sizeof(struct bucket_header) > RTE_CACHE_LINE_SIZE);
 	bd->header_size = mp->header_size + bucket_header_size;
 	bd->total_elt_size = mp->header_size + mp->elt_size + mp->trailer_size;
-	bd->bucket_mem_size = RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB * 1024;
+	bd->bucket_mem_size = RTE_MIN(pg_sz,
+			(size_t)(RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB * 1024));
 	bd->obj_per_bucket = (bd->bucket_mem_size - bucket_header_size) /
 		bd->total_elt_size;
 	bd->bucket_page_mask = ~(rte_align64pow2(bd->bucket_mem_size) - 1);
@@ -585,7 +591,7 @@ bucket_populate(struct rte_mempool *mp, unsigned int max_objs,
 
 		hdr->fill_cnt = 0;
 		hdr->lcore_id = LCORE_ID_ANY;
-		rc = rte_mempool_op_populate_helper(mp,
+		rc = rte_mempool_op_populate_helper(mp, 0,
 						     RTE_MIN(bd->obj_per_bucket,
 							     max_objs - n_objs),
 						     iter + bucket_header_sz,
diff --git a/drivers/mempool/dpaa/dpaa_mempool.c b/drivers/mempool/dpaa/dpaa_mempool.c
index 27736e6c2..3a2528331 100644
--- a/drivers/mempool/dpaa/dpaa_mempool.c
+++ b/drivers/mempool/dpaa/dpaa_mempool.c
@@ -341,8 +341,8 @@ dpaa_populate(struct rte_mempool *mp, unsigned int max_objs,
 	 */
 	TAILQ_INSERT_HEAD(&rte_dpaa_memsegs, ms, next);
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, paddr,
+					       len, obj_cb, obj_cb_arg);
 }
 
 static const struct rte_mempool_ops dpaa_mpool_ops = {
diff --git a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
index 8f8dbeada..36c93decf 100644
--- a/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
+++ b/drivers/mempool/dpaa2/dpaa2_hw_mempool.c
@@ -421,8 +421,8 @@ dpaa2_populate(struct rte_mempool *mp, unsigned int max_objs,
 	/* Insert entry into the PA->VA Table */
 	dpaax_iova_table_update(paddr, vaddr, len);
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, paddr, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, paddr,
+					       len, obj_cb, obj_cb_arg);
 }
 
 static const struct rte_mempool_ops dpaa2_mpool_ops = {
diff --git a/drivers/mempool/octeontx/rte_mempool_octeontx.c b/drivers/mempool/octeontx/rte_mempool_octeontx.c
index fff33e5c6..bd0070020 100644
--- a/drivers/mempool/octeontx/rte_mempool_octeontx.c
+++ b/drivers/mempool/octeontx/rte_mempool_octeontx.c
@@ -132,14 +132,15 @@ octeontx_fpavf_calc_mem_size(const struct rte_mempool *mp,
 			     size_t *min_chunk_size, size_t *align)
 {
 	ssize_t mem_size;
+	size_t total_elt_sz;
 
-	/*
-	 * Simply need space for one more object to be able to
-	 * fulfil alignment requirements.
+	/* Need space for one more obj on each chunk to fulfill
+	 * alignment requirements.
 	 */
-	mem_size = rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1,
-							pg_shift,
-							min_chunk_size, align);
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+	mem_size = rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						total_elt_sz, min_chunk_size,
+						align);
 	if (mem_size >= 0) {
 		/*
 		 * Memory area which contains objects must be physically
@@ -168,7 +169,7 @@ octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	/* align object start address to a multiple of total_elt_sz */
-	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
 
 	if (len < off)
 		return -EINVAL;
@@ -184,8 +185,10 @@ octeontx_fpavf_populate(struct rte_mempool *mp, unsigned int max_objs,
 	if (ret < 0)
 		return ret;
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp,
+					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+					max_objs, vaddr, iova, len,
+					obj_cb, obj_cb_arg);
 }
 
 static struct rte_mempool_ops octeontx_fpavf_ops = {
diff --git a/drivers/mempool/octeontx2/otx2_mempool_ops.c b/drivers/mempool/octeontx2/otx2_mempool_ops.c
index 3aea92a01..ea4b1c45d 100644
--- a/drivers/mempool/octeontx2/otx2_mempool_ops.c
+++ b/drivers/mempool/octeontx2/otx2_mempool_ops.c
@@ -713,12 +713,15 @@ static ssize_t
 otx2_npa_calc_mem_size(const struct rte_mempool *mp, uint32_t obj_num,
 		       uint32_t pg_shift, size_t *min_chunk_size, size_t *align)
 {
-	/*
-	 * Simply need space for one more object to be able to
-	 * fulfill alignment requirements.
+	size_t total_elt_sz;
+
+	/* Need space for one more obj on each chunk to fulfill
+	 * alignment requirements.
 	 */
-	return rte_mempool_op_calc_mem_size_helper(mp, obj_num + 1, pg_shift,
-						    min_chunk_size, align);
+	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
+	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
+						total_elt_sz, min_chunk_size,
+						align);
 }
 
 static int
@@ -735,7 +738,7 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
 	/* Align object start address to a multiple of total_elt_sz */
-	off = total_elt_sz - ((uintptr_t)vaddr % total_elt_sz);
+	off = total_elt_sz - ((((uintptr_t)vaddr - 1) % total_elt_sz) + 1);
 
 	if (len < off)
 		return -EINVAL;
@@ -749,8 +752,10 @@ otx2_npa_populate(struct rte_mempool *mp, unsigned int max_objs, void *vaddr,
 	if (npa_lf_aura_range_update_check(mp->pool_id) < 0)
 		return -EBUSY;
 
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova, len,
-					       obj_cb, obj_cb_arg);
+	return rte_mempool_op_populate_helper(mp,
+					RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ,
+					max_objs, vaddr, iova, len,
+					obj_cb, obj_cb_arg);
 }
 
 static struct rte_mempool_ops otx2_npa_ops = {
diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index 758c5410b..d3db9273d 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -431,8 +431,6 @@ rte_mempool_get_page_size(struct rte_mempool *mp, size_t *pg_sz)
 
 	if (!need_iova_contig_obj)
 		*pg_sz = 0;
-	else if (!alloc_in_ext_mem && rte_eal_iova_mode() == RTE_IOVA_VA)
-		*pg_sz = 0;
 	else if (rte_eal_has_hugepages() || alloc_in_ext_mem)
 		*pg_sz = get_min_page_size(mp->socket_id);
 	else
@@ -481,17 +479,15 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 * then just set page shift and page size to 0, because the user has
 	 * indicated that there's no need to care about anything.
 	 *
-	 * if we do need contiguous objects, there is also an option to reserve
-	 * the entire mempool memory as one contiguous block of memory, in
-	 * which case the page shift and alignment wouldn't matter as well.
+	 * if we do need contiguous objects (if a mempool driver has its
+	 * own calc_size() method returning min_chunk_size = mem_size),
+	 * there is also an option to reserve the entire mempool memory
+	 * as one contiguous block of memory.
 	 *
 	 * if we require contiguous objects, but not necessarily the entire
-	 * mempool reserved space to be contiguous, then there are two options.
-	 *
-	 * if our IO addresses are virtual, not actual physical (IOVA as VA
-	 * case), then no page shift needed - our memory allocation will give us
-	 * contiguous IO memory as far as the hardware is concerned, so
-	 * act as if we're getting contiguous memory.
+	 * mempool reserved space to be contiguous, pg_sz will be != 0,
+	 * and the default ops->populate() will take care of not placing
+	 * objects across pages.
 	 *
 	 * if our IO addresses are physical, we may get memory from bigger
 	 * pages, or we might get memory from smaller pages, and how much of it
@@ -504,11 +500,6 @@ rte_mempool_populate_default(struct rte_mempool *mp)
 	 *
 	 * If we fail to get enough contiguous memory, then we'll go and
 	 * reserve space in smaller chunks.
-	 *
-	 * We also have to take into account the fact that memory that we're
-	 * going to allocate from can belong to an externally allocated memory
-	 * area, in which case the assumption of IOVA as VA mode being
-	 * synonymous with IOVA contiguousness will not hold.
 	 */
 
 	need_iova_contig_obj = !(mp->flags & MEMPOOL_F_NO_IOVA_CONTIG);
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index ad7cc6ad2..225bf9fc9 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -491,6 +491,9 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  *   Number of objects to be added in mempool.
  * @param[in] pg_shift
  *   LOG2 of the physical pages size. If set to 0, ignore page boundaries.
+ * @param[in] chunk_reserve
+ *   Amount of memory that must be reserved at the beginning of each page,
+ *   or at the beginning of the memory area if pg_shift is 0.
  * @param[out] min_chunk_size
  *   Location for minimum size of the memory chunk which may be used to
  *   store memory pool objects.
@@ -501,7 +504,7 @@ typedef ssize_t (*rte_mempool_calc_mem_size_t)(const struct rte_mempool *mp,
  */
 __rte_experimental
 ssize_t rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
-		uint32_t obj_num, uint32_t pg_shift,
+		uint32_t obj_num, uint32_t pg_shift, size_t chunk_reserve,
 		size_t *min_chunk_size, size_t *align);
 
 /**
@@ -509,7 +512,7 @@ ssize_t rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
  * objects.
  *
  * Equivalent to rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
- * min_chunk_size, align).
+ * 0, min_chunk_size, align).
  */
 ssize_t rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 		uint32_t obj_num, uint32_t pg_shift,
@@ -563,17 +566,31 @@ typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
 
+/**
+ * Align objects on addresses multiple of total_elt_sz.
+ */
+#define RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ 0x0001
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice.
  *
  * @internal Helper to populate memory pool object using provided memory
- * chunk: just slice objects one by one.
+ * chunk: just slice objects one by one, taking care of not
+ * crossing page boundaries.
+ *
+ * If RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ is set in flags, the addresses
+ * of object headers will be aligned on a multiple of total_elt_sz.
+ * This feature is used by octeontx hardware.
  *
  * This function is internal to mempool library and mempool drivers.
  *
  * @param[in] mp
  *   A pointer to the mempool structure.
+ * @param[in] flags
+ *   Logical OR of following flags:
+ *   - RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ: align objects on addresses
+ *     multiple of total_elt_sz.
  * @param[in] max_objs
  *   Maximum number of objects to be added in mempool.
  * @param[in] vaddr
@@ -591,14 +608,14 @@ typedef int (*rte_mempool_populate_t)(struct rte_mempool *mp,
  */
 __rte_experimental
 int rte_mempool_op_populate_helper(struct rte_mempool *mp,
-		unsigned int max_objs,
+		unsigned int flags, unsigned int max_objs,
 		void *vaddr, rte_iova_t iova, size_t len,
 		rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg);
 
 /**
  * Default way to populate memory pool object using provided memory chunk.
  *
- * Equivalent to rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+ * Equivalent to rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, iova,
  * len, obj_cb, obj_cb_arg).
  */
 int rte_mempool_op_populate_default(struct rte_mempool *mp,
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index 0bfc63497..e6be7152b 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -9,6 +9,7 @@
 ssize_t
 rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 				uint32_t obj_num, uint32_t pg_shift,
+				size_t chunk_reserve,
 				size_t *min_chunk_size, size_t *align)
 {
 	size_t total_elt_sz;
@@ -19,10 +20,12 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 	if (total_elt_sz == 0) {
 		mem_size = 0;
 	} else if (pg_shift == 0) {
-		mem_size = total_elt_sz * obj_num;
+		mem_size = total_elt_sz * obj_num + chunk_reserve;
 	} else {
 		pg_sz = (size_t)1 << pg_shift;
-		obj_per_page = pg_sz / total_elt_sz;
+		if (chunk_reserve >= pg_sz)
+			return -EINVAL;
+		obj_per_page = (pg_sz - chunk_reserve) / total_elt_sz;
 		if (obj_per_page == 0) {
 			/*
 			 * Note that if object size is bigger than page size,
@@ -30,8 +33,8 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 			 * of physically continuous pages big enough to store
 			 * at least one object.
 			 */
-			mem_size =
-				RTE_ALIGN_CEIL(total_elt_sz, pg_sz) * obj_num;
+			mem_size = RTE_ALIGN_CEIL(total_elt_sz + chunk_reserve,
+						pg_sz) * obj_num;
 		} else {
 			/* In the best case, the allocator will return a
 			 * page-aligned address. For example, with 5 objs,
@@ -42,7 +45,8 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 			 */
 			objs_in_last_page = ((obj_num - 1) % obj_per_page) + 1;
 			/* room required for the last page */
-			mem_size = objs_in_last_page * total_elt_sz;
+			mem_size = objs_in_last_page * total_elt_sz +
+				chunk_reserve;
 			/* room required for other pages */
 			mem_size += ((obj_num - objs_in_last_page) /
 				obj_per_page) << pg_shift;
@@ -67,24 +71,60 @@ rte_mempool_op_calc_mem_size_default(const struct rte_mempool *mp,
 				size_t *min_chunk_size, size_t *align)
 {
 	return rte_mempool_op_calc_mem_size_helper(mp, obj_num, pg_shift,
-						min_chunk_size, align);
+						0, min_chunk_size, align);
+}
+
+/* Returns -1 if object crosses a page boundary, else returns 0 */
+static int
+check_obj_bounds(char *obj, size_t pg_sz, size_t elt_sz)
+{
+	if (pg_sz == 0)
+		return 0;
+	if (elt_sz > pg_sz)
+		return 0;
+	if (RTE_PTR_ALIGN(obj, pg_sz) != RTE_PTR_ALIGN(obj + elt_sz - 1, pg_sz))
+		return -1;
+	return 0;
 }
 
 int
-rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int max_objs,
-			void *vaddr, rte_iova_t iova, size_t len,
-			rte_mempool_populate_obj_cb_t *obj_cb, void *obj_cb_arg)
+rte_mempool_op_populate_helper(struct rte_mempool *mp, unsigned int flags,
+			unsigned int max_objs, void *vaddr, rte_iova_t iova,
+			size_t len, rte_mempool_populate_obj_cb_t *obj_cb,
+			void *obj_cb_arg)
 {
-	size_t total_elt_sz;
+	char *va = vaddr;
+	size_t total_elt_sz, pg_sz;
 	size_t off;
 	unsigned int i;
 	void *obj;
+	int ret;
+
+	ret = rte_mempool_get_page_size(mp, &pg_sz);
+	if (ret < 0)
+		return ret;
 
 	total_elt_sz = mp->header_size + mp->elt_size + mp->trailer_size;
 
-	for (off = 0, i = 0; off + total_elt_sz <= len && i < max_objs; i++) {
+	if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
+		off = total_elt_sz - (((uintptr_t)(va - 1) % total_elt_sz) + 1);
+	else
+		off = 0;
+	for (i = 0; i < max_objs; i++) {
+		/* avoid objects to cross page boundaries */
+		if (check_obj_bounds(va + off, pg_sz, total_elt_sz) < 0) {
+			off += RTE_PTR_ALIGN_CEIL(va + off, pg_sz) - (va + off);
+			if (flags & RTE_MEMPOOL_POPULATE_F_ALIGN_OBJ)
+				off += total_elt_sz -
+					(((uintptr_t)(va + off - 1) %
+						total_elt_sz) + 1);
+		}
+
+		if (off + total_elt_sz > len)
+			break;
+
 		off += mp->header_size;
-		obj = (char *)vaddr + off;
+		obj = va + off;
 		obj_cb(mp, obj_cb_arg, obj,
 		       (iova == RTE_BAD_IOVA) ? RTE_BAD_IOVA : (iova + off));
 		rte_mempool_ops_enqueue_bulk(mp, &obj, 1);
@@ -100,6 +140,6 @@ rte_mempool_op_populate_default(struct rte_mempool *mp, unsigned int max_objs,
 				rte_mempool_populate_obj_cb_t *obj_cb,
 				void *obj_cb_arg)
 {
-	return rte_mempool_op_populate_helper(mp, max_objs, vaddr, iova,
+	return rte_mempool_op_populate_helper(mp, 0, max_objs, vaddr, iova,
 					len, obj_cb, obj_cb_arg);
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v4 7/7] mempool: use the specific macro for object alignment
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
                                             ` (5 preceding siblings ...)
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 6/7] mempool: prevent objects from being across pages Olivier Matz
@ 2019-11-05 15:37                           ` Olivier Matz
  2019-11-05 16:03                           ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
  2019-11-06 10:39                           ` Thomas Monjalon
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 15:37 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

For consistency, RTE_MEMPOOL_ALIGN should be used in place of
RTE_CACHE_LINE_SIZE. They currently have the same value, because the
only architecture that defined a specific value for it has been removed
from DPDK.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
Reviewed-by: Andrew Rybchenko <arybchenko@solarflare.com>
Acked-by: Nipun Gupta <nipun.gupta@nxp.com>
---
 lib/librte_mempool/rte_mempool.c             | 2 +-
 lib/librte_mempool/rte_mempool.h             | 3 +++
 lib/librte_mempool/rte_mempool_ops_default.c | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/librte_mempool/rte_mempool.c b/lib/librte_mempool/rte_mempool.c
index d3db9273d..40cae3eb6 100644
--- a/lib/librte_mempool/rte_mempool.c
+++ b/lib/librte_mempool/rte_mempool.c
@@ -329,7 +329,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr,
 	if (mp->flags & MEMPOOL_F_NO_CACHE_ALIGN)
 		off = RTE_PTR_ALIGN_CEIL(vaddr, 8) - vaddr;
 	else
-		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_CACHE_LINE_SIZE) - vaddr;
+		off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_MEMPOOL_ALIGN) - vaddr;
 
 	if (off > len) {
 		ret = -EINVAL;
diff --git a/lib/librte_mempool/rte_mempool.h b/lib/librte_mempool/rte_mempool.h
index 225bf9fc9..f81152af9 100644
--- a/lib/librte_mempool/rte_mempool.h
+++ b/lib/librte_mempool/rte_mempool.h
@@ -116,6 +116,9 @@ struct rte_mempool_objsz {
 #define	MEMPOOL_PG_NUM_DEFAULT	1
 
 #ifndef RTE_MEMPOOL_ALIGN
+/**
+ * Alignment of elements inside mempool.
+ */
 #define RTE_MEMPOOL_ALIGN	RTE_CACHE_LINE_SIZE
 #endif
 
diff --git a/lib/librte_mempool/rte_mempool_ops_default.c b/lib/librte_mempool/rte_mempool_ops_default.c
index e6be7152b..22fccf9d7 100644
--- a/lib/librte_mempool/rte_mempool_ops_default.c
+++ b/lib/librte_mempool/rte_mempool_ops_default.c
@@ -60,7 +60,7 @@ rte_mempool_op_calc_mem_size_helper(const struct rte_mempool *mp,
 	}
 
 	*min_chunk_size = total_elt_sz;
-	*align = RTE_CACHE_LINE_SIZE;
+	*align = RTE_MEMPOOL_ALIGN;
 
 	return mem_size;
 }
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
                                             ` (6 preceding siblings ...)
  2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 7/7] mempool: use the specific macro for object alignment Olivier Matz
@ 2019-11-05 16:03                           ` Olivier Matz
  2019-11-06 10:39                           ` Thomas Monjalon
  8 siblings, 0 replies; 251+ messages in thread
From: Olivier Matz @ 2019-11-05 16:03 UTC (permalink / raw)
  To: dev
  Cc: Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Thomas Monjalon, Vamsi Krishna Attunuru,
	Hemant Agrawal, Nipun Gupta, David Marchand

On Tue, Nov 05, 2019 at 04:36:59PM +0100, Olivier Matz wrote:
> KNI supposes that mbufs are contiguous in kernel virtual memory. This
> may not be true when using the IOVA=VA mode. To fix this, a possibility
> is to ensure that objects do not cross page boundaries in mempool. This
> patchset implements this in the last patch (5/5).
> 
> The previous patches prepare the job:
> - allow to populate with an unaligned virtual area (1/5).
> - reduce space wasted in the mempool size calculation when not using
>   the iova-contiguous allocation (2/5).
> - remove the iova-contiguous allocation when populating mempool (3/5):
>   a va-contiguous alloc does the job as well if we want to populate
>   without crossing page boundaries, so simplify the mempool populate
>   function.
> - export a function to get the minimum page used in a mempool (4/5)
> 
> Memory consumption impact when using hugepages:
> - worst case: + ~0.1% for a mbuf pool (objsize ~= 2368)
> - best case: -50% for if pool size is just above page size
> 
> The memory consumption with 4K pages in IOVA=VA mode could however
> be up to 75% higher for a mbuf pool, because there will
> be only 1 mbuf per page. Not sure how common this usecase is.
> 
> Caveat: this changes the behavior of the mempool (calc_mem_size and
> populate), and there is a small risk to break things, especially with
> alternate mempool drivers.
> 
> v4
> 
> * remove useless comments in Makefiles and meson.build (sugg by David)
> * add EXPERIMENTAL banner on new functions in API comments (David)
> * sort by version in rte_mempool_version.map (David)
> * remove duplicated -DALLOW_EXPERIMENTAL_API flag in octeontx2
>   mempool driver
> * enhance API comments for new helpers

I forgot:

* move bucket mempool driver modifications from patch 7 to
  patch 6 (seen by Andrew)

> 
> v3
> 
> * introduce new helpers to calculate required memory size and to
>   populate mempool, use them in drivers: the alignment constraint
>   of octeontx/octeontx2 is managed in this common code.
> * fix octeontx mempool driver by taking alignment constraint in account
>   like in octeontx2
> * fix bucket mempool driver with 4K pages: limit bucket size in this
>   case to ensure that objects do not cross page boundaries. With larger
>   pages, it was already ok, because bucket size (64K) is smaller than
>   a page.
> * fix some api comments in mempool header file
> 
> v2
> 
> * update octeontx2 driver to keep alignment constraint (issue seen by
>   Vamsi)
> * add a new patch to use RTE_MEMPOOL_ALIGN (Andrew)
> * fix initialization of for loop in rte_mempool_populate_virt() (Andrew)
> * use rte_mempool_populate_iova() if mz_flags has
>   RTE_MEMZONE_IOVA_CONTIG (Andrew)
> * check rte_mempool_get_page_size() return value (Andrew)
> * some other minor style improvements
> 
> rfc -> v1
> 
> * remove first cleanup patch, it was pushed separately
>   a2b5a8722f20 ("mempool: clarify default populate function")
> * add missing change in rte_mempool_op_calc_mem_size_default()
> * allow unaligned addr/len in populate virt
> * better split patches
> * try to better explain the change
> * use DPDK align macros when relevant
> 
> Olivier Matz (7):
>   mempool: allow unaligned addr/len in populate virt
>   mempool: reduce wasted space on mempool populate
>   mempool: remove optimistic IOVA-contiguous allocation
>   mempool: introduce function to get mempool page size
>   mempool: introduce helpers for populate and calc mem size
>   mempool: prevent objects from being across pages
>   mempool: use the specific macro for object alignment
> 
>  drivers/mempool/bucket/Makefile               |   1 +
>  drivers/mempool/bucket/meson.build            |   2 +
>  drivers/mempool/bucket/rte_mempool_bucket.c   |  10 +-
>  drivers/mempool/dpaa/dpaa_mempool.c           |   4 +-
>  drivers/mempool/dpaa2/dpaa2_hw_mempool.c      |   4 +-
>  drivers/mempool/octeontx/Makefile             |   2 +
>  drivers/mempool/octeontx/meson.build          |   2 +
>  .../mempool/octeontx/rte_mempool_octeontx.c   |  21 +--
>  drivers/mempool/octeontx2/Makefile            |   2 +
>  drivers/mempool/octeontx2/meson.build         |   2 +
>  drivers/mempool/octeontx2/otx2_mempool_ops.c  |  21 ++-
>  lib/librte_mempool/rte_mempool.c              | 147 +++++++-----------
>  lib/librte_mempool/rte_mempool.h              | 114 ++++++++++++--
>  lib/librte_mempool/rte_mempool_ops.c          |   4 +-
>  lib/librte_mempool/rte_mempool_ops_default.c  | 113 +++++++++++---
>  lib/librte_mempool/rte_mempool_version.map    |   6 +
>  16 files changed, 312 insertions(+), 143 deletions(-)
> 
> -- 
> 2.20.1
> 

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages
  2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
                                             ` (7 preceding siblings ...)
  2019-11-05 16:03                           ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
@ 2019-11-06 10:39                           ` Thomas Monjalon
  8 siblings, 0 replies; 251+ messages in thread
From: Thomas Monjalon @ 2019-11-06 10:39 UTC (permalink / raw)
  To: Olivier Matz
  Cc: dev, Anatoly Burakov, Andrew Rybchenko, Ferruh Yigit, Giridharan,
	Ganesan, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Stephen Hemminger, Vamsi Krishna Attunuru, Hemant Agrawal,
	Nipun Gupta, David Marchand

05/11/2019 16:36, Olivier Matz:
> Olivier Matz (7):
>   mempool: allow unaligned addr/len in populate virt
>   mempool: reduce wasted space on mempool populate
>   mempool: remove optimistic IOVA-contiguous allocation
>   mempool: introduce function to get mempool page size
>   mempool: introduce helpers for populate and calc mem size
>   mempool: prevent objects from being across pages
>   mempool: use the specific macro for object alignment

Applied, thanks

Release notes were added.





^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
  2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 1/2] kni: " vattunuru
  2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 2/2] kni: add IOVA=VA support in kernel module vattunuru
@ 2019-11-06 10:49                         ` Thomas Monjalon
  2019-11-06 11:09                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-11-07 19:53                         ` [dpdk-dev] " Ferruh Yigit
  2019-11-15 11:18                         ` [dpdk-dev] [PATCH v13 0/2] kni: support IOVA mode vattunuru
  4 siblings, 1 reply; 251+ messages in thread
From: Thomas Monjalon @ 2019-11-06 10:49 UTC (permalink / raw)
  To: vattunuru
  Cc: dev, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen

For info, the mempool patches are merged now.

05/11/2019 12:04, vattunuru@marvell.com:
> Vamsi Attunuru (2):
>   kni: add IOVA=VA mode support
>   kni: add IOVA=VA support in kernel module

Should the kernel support be the first patch?

About the titles, can it be simply "support IOVA mode"?



^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-06 10:49                         ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support Thomas Monjalon
@ 2019-11-06 11:09                           ` Vamsi Krishna Attunuru
  2019-11-06 11:53                             ` Thomas Monjalon
  2019-11-07 10:34                             ` Vamsi Krishna Attunuru
  0 siblings, 2 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-06 11:09 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, ferruh.yigit, anatoly.burakov, arybchenko, stephen



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Wednesday, November 6, 2019 4:19 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: dev@dpdk.org; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
> Kumar Kokkilagadda <kirankumark@marvell.com>; olivier.matz@6wind.com;
> ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; stephen@networkplumber.org
> Subject: [EXT] Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
> 
> External Email
> 
> ----------------------------------------------------------------------
> For info, the mempool patches are merged now.
> 
> 05/11/2019 12:04, vattunuru@marvell.com:
> > Vamsi Attunuru (2):
> >   kni: add IOVA=VA mode support
> >   kni: add IOVA=VA support in kernel module
> 
> Should the kernel support be the first patch?

But the required variable `iova_mode` is defined in the first patch currently.
Either one is fine with me. Please let me know if you want the kernel support patch to come first, and I will send the next version accordingly.

> 
> About the titles, can it be simply "support IOVA mode"?

Yes, "support IOVA_VA mode" should be fine.
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-06 11:09                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-11-06 11:53                             ` Thomas Monjalon
  2019-11-06 11:59                               ` Vamsi Krishna Attunuru
  2019-11-07 10:34                             ` Vamsi Krishna Attunuru
  1 sibling, 1 reply; 251+ messages in thread
From: Thomas Monjalon @ 2019-11-06 11:53 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: dev, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, ferruh.yigit, anatoly.burakov, arybchenko, stephen

06/11/2019 12:09, Vamsi Krishna Attunuru:
> From: Thomas Monjalon <thomas@monjalon.net>
> > 05/11/2019 12:04, vattunuru@marvell.com:
> > > Vamsi Attunuru (2):
> > >   kni: add IOVA=VA mode support
> > >   kni: add IOVA=VA support in kernel module
> > 
> > Should the kernel support be the first patch?
> 
> But required variable `iova_mode` is defined in first patch currently. 
> Either one is fine to me. Please let me know  if want to put kernel support patch at first, I will send next version accordingly.
> 
> > 
> > About the titles, can it be simply "support IOVA mode"?
> 
> Yes, "support IOVA_VA mode" should be fine.

I really dislike the name IOVA_VA.

You mean support virtual IO addressing?



^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-06 11:53                             ` Thomas Monjalon
@ 2019-11-06 11:59                               ` Vamsi Krishna Attunuru
  0 siblings, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-06 11:59 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, ferruh.yigit, anatoly.burakov, arybchenko, stephen



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Wednesday, November 6, 2019 5:23 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: dev@dpdk.org; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
> Kumar Kokkilagadda <kirankumark@marvell.com>; olivier.matz@6wind.com;
> ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; stephen@networkplumber.org
> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
> 
> 06/11/2019 12:09, Vamsi Krishna Attunuru:
> > From: Thomas Monjalon <thomas@monjalon.net>
> > > 05/11/2019 12:04, vattunuru@marvell.com:
> > > > Vamsi Attunuru (2):
> > > >   kni: add IOVA=VA mode support
> > > >   kni: add IOVA=VA support in kernel module
> > >
> > > Should the kernel support be the first patch?
> >
> > But required variable `iova_mode` is defined in first patch currently.
> > Either one is fine to me. Please let me know  if want to put kernel support
> patch at first, I will send next version accordingly.
> >
> > >
> > > About the titles, can it be simply "support IOVA mode"?
> >
> > Yes, "support IOVA_VA mode" should be fine.
> 
> I really dislike the name IOVA_VA.
> 
> You mean support virtual IO addressing?
> 
Yes.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-06 11:09                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-11-06 11:53                             ` Thomas Monjalon
@ 2019-11-07 10:34                             ` Vamsi Krishna Attunuru
  1 sibling, 0 replies; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-07 10:34 UTC (permalink / raw)
  To: ferruh.yigit
  Cc: dev, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen,
	Thomas Monjalon

Hi Ferruh,

Could you please review v12?

 Regards,
A Vamsi

> -----Original Message-----
> From: Vamsi Krishna Attunuru
> Sent: Wednesday, November 6, 2019 4:39 PM
> To: Thomas Monjalon <thomas@monjalon.net>
> Cc: dev@dpdk.org; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
> Kumar Kokkilagadda <kirankumark@marvell.com>; olivier.matz@6wind.com;
> ferruh.yigit@intel.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; stephen@networkplumber.org
> Subject: RE: [EXT] Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
> 
> 
> 
> > -----Original Message-----
> > From: Thomas Monjalon <thomas@monjalon.net>
> > Sent: Wednesday, November 6, 2019 4:19 PM
> > To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Cc: dev@dpdk.org; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> > Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> > olivier.matz@6wind.com; ferruh.yigit@intel.com;
> > anatoly.burakov@intel.com; arybchenko@solarflare.com;
> > stephen@networkplumber.org
> > Subject: [EXT] Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > For info, the mempool patches are merged now.
> >
> > 05/11/2019 12:04, vattunuru@marvell.com:
> > > Vamsi Attunuru (2):
> > >   kni: add IOVA=VA mode support
> > >   kni: add IOVA=VA support in kernel module
> >
> > Should the kernel support be the first patch?
> 
> But required variable `iova_mode` is defined in first patch currently.
> Either one is fine to me. Please let me know  if want to put kernel support patch
> at first, I will send next version accordingly.
> 
> >
> > About the titles, can it be simply "support IOVA mode"?
> 
> Yes, "support IOVA_VA mode" should be fine.
> >


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
                                           ` (2 preceding siblings ...)
  2019-11-06 10:49                         ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support Thomas Monjalon
@ 2019-11-07 19:53                         ` Ferruh Yigit
  2019-11-08  4:16                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-11-15 11:18                         ` [dpdk-dev] [PATCH v13 0/2] kni: support IOVA mode vattunuru
  4 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-11-07 19:53 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, anatoly.burakov,
	arybchenko, stephen

On 11/5/2019 11:04 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> ---
> V12 Changes:
> * Removed previously added `--legacy-kni` eal option.
> * Removed previously added kni specific mempool create routines
> and mempool populate routines.
> 
> This patch set(V12) is dependent on following patch set, since the mempool
> related support to enable KNI in IOVA=VA mode is taken care in below
> patchset.
> 
>    https://patchwork.dpdk.org/cover/62376/

Hi Vasim, Jerin,

Overall looks good and I am not getting any functional error, but I am observing a
huge performance drop with this update, 3.8Mpps to 0.7Mpps [1].

I don't really know what to do. I think we need to make a decision as a community,
and even if we go with the patch we should document this performance drop clearly
and document how to mitigate it.



[1]
This is with kni sample application,
a) IOVA=VA mode selected
./examples/kni/build/kni -l0,40-47 --log-level=*:debug -- -p 0x3 -P --config
"(0,44,45,40),(1,46,47,41)"

forwarding performance is around 0.7Mpps and 'kni_single' kernel thread consumes
all cpu.

b) IOVA=PA mode forced
./examples/kni/build/kni -l0,40-47 --log-level=*:debug --iova=pa -- -p 0x3 -P
--config "(0,44,45,40),(1,46,47,41)"

forwarding performance is around 3.8Mpps and 'kni_single' core utilization is ~80%.

I am on 5.1.20-300.fc30.x86_64 kernel.
kni module inserted as: "insmod ./build/kmod/rte_kni.ko lo_mode=lo_mode_fifo"

> 
> V11 Changes:
> * Added iova to kva address translation routines in kernel module to
> make it work in iova=va mode which enables DPDK to create kni devices
> on any kind of backed device/memory.
> * Added ``--legacy-kni`` eal option to make existing KNI applications
> work with DPDK 19.11 and later versions.
> * Removed previously added pci device info from kni device info struct.
>  
> V10 Changes:
> * Fixed function return code on failure when min_chunk_size > pg_sz.
> * Marked new mempool populate routine as EXPERIMENTAL.
>  
> V9 Changes:
> * Used rte_mempool_ops_calc_mem_size() instead of default handler in the
> new mempool populate routine.
> * Check min_chunk_size and return values.
> * Removed ethdev_info memset to '0' and moved pci dev_info populate into
> kni_dev_pci_addr_get() routine.
> * Addressed misc. review comments.
>  
> V8 Changes:
> * Remove default mempool populate() routine changes.
> * Add kni app specific mempool create & free routines.
> * Add new mempool populate routine to allocate page-aligned memzones
> with page size to make sure all mempool objects reside on a page.
> * Update release notes and map files.
>  
> V7 Changes:
> * Removed previously proposed mempool flag and made those page
> boundary checks default in mempool populate() except for the objects size
> bigger than the size of page.
> * Removed KNI example application related changes since pool related
> requirement is taken care in mempool lib.
> * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> * Added wrapper functions in KNI module to hide IOVA checks and make
> address translation routines more readable.
> * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
> mode is enabled.
>  
> V6 Changes:
> * Added new mempool flag to ensure mbuf memory is not scattered across
> page boundaries.
> * Added KNI kernel module required PCI device information.
> * Modified KNI example application to create mempool with new mempool
> flag.
>  
> V5 changes:
> * Fixed build issue with 32b build
>  
> V4 changes:
> * Fixed build issues with older kernel versions
> * This approach will only work with kernel above 4.4.0
>  
> V3 Changes:
> * Add new approach to work kni with IOVA=VA mode using
> iommu_iova_to_phys API.
> 
> Vamsi Attunuru (2):
>   kni: add IOVA=VA mode support
>   kni: add IOVA=VA support in kernel module
> 
>  doc/guides/prog_guide/kernel_nic_interface.rst    |  9 ++++
>  doc/guides/rel_notes/release_19_11.rst            |  5 ++
>  kernel/linux/kni/compat.h                         | 15 ++++++
>  kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
>  kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
>  kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
>  lib/librte_eal/linux/eal/eal.c                    | 29 ++++++-----
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
>  lib/librte_kni/rte_kni.c                          |  7 +--
>  9 files changed, 170 insertions(+), 39 deletions(-)
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re:  [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-07 19:53                         ` [dpdk-dev] " Ferruh Yigit
@ 2019-11-08  4:16                           ` Vamsi Krishna Attunuru
  2019-11-08 14:26                             ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-08  4:16 UTC (permalink / raw)
  To: Ferruh Yigit, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen



> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Friday, November 8, 2019 1:23 AM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
> Kiran Kumar Kokkilagadda <kirankumark@marvell.com>;
> olivier.matz@6wind.com; anatoly.burakov@intel.com;
> arybchenko@solarflare.com; stephen@networkplumber.org
> Subject: [EXT] Re: [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support
> 
> External Email
> 
> ----------------------------------------------------------------------
> On 11/5/2019 11:04 AM, vattunuru@marvell.com wrote:
> 
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> >
> 
> > ---
> 
> > V12 Changes:
> 
> > * Removed previously added `--legacy-kni` eal option.
> 
> > * Removed previously added kni specific mempool create routines
> 
> > and mempool populate routines.
> 
> >
> 
> > This patch set(V12) is dependent on following patch set, since the mempool
> 
> > related support to enable KNI in IOVA=VA mode is taken care in below
> 
> > patchset.
> 
> >
> 
> >    https://patchwork.dpdk.org/cover/62376/
> 
> 
> 
> Hi Vasim, Jerin,
> 
> 
> 
> Overall looks good and I not getting any functional error but I am observing a
> 
> huge performance drop with this update, 3.8Mpps to 0.7Mpps [1].	

Hi Ferruh,

When it comes to actual kernel netdev test cases like iperf or any other real use cases, there would not be any impact on performance. I think a synthetic test case like loopback mode might not be the only test case to depend on, now that the kernel module is able to work with any kind of device (pdev or vdev). Users can always fall back to PA mode with the command line option.

Please share your thoughts on what test case to use to evaluate the performance difference.

> 
> 
> 
> I don't know really what to do, I think we need to give a decision as community,
> 
> and even we go with the patch we should document this performance drop
> clearly
> 
> and document how to mitigate it.
> 
> 
> 
> 
> 
> 
> 
> [1]
> 
> This is with kni sample application,
> 
> a) IOVA=VA mode selected
> 
> ./examples/kni/build/kni -l0,40-47 --log-level=*:debug -- -p 0x3 -P --config
> 
> "(0,44,45,40),(1,46,47,41)"
> 
> 
> 
> forwarding performance is around 0.7Mpps and 'kni_single' kernel thread
> consumes
> 
> all cpu.
> 
> 
> 
> b) IOVA=PA mode forced
> 
> ./examples/kni/build/kni -l0,40-47 --log-level=*:debug --iova=pa -- -p 0x3 -P
> 
> --config "(0,44,45,40),(1,46,47,41)"
> 
> 
> 
> forwarding performance is around 3.8Mpps and 'kni_single' core utilization is
> ~80%.
> 
> 
> 
> I am on 5.1.20-300.fc30.x86_64 kernel.
> 
> kni module inserted as: "insmod ./build/kmod/rte_kni.ko
> lo_mode=lo_mode_fifo"
> 
> 
> 
> >
> 
> > V11 Changes:
> 
> > * Added iova to kva address translation routines in kernel module to
> 
> > make it work in iova=va mode which enables DPDK to create kni devices
> 
> > on any kind of backed device/memory.
> 
> > * Added ``--legacy-kni`` eal option to make existing KNI applications
> 
> > work with DPDK 19.11 and later versions.
> 
> > * Removed previously added pci device info from kni device info struct.
> 
> >
> 
> > V10 Changes:
> 
> > * Fixed function return code on failure when min_chunk_size > pg_sz.
> 
> > * Marked new mempool populate routine as EXPERIMENTAL.
> 
> >
> 
> > V9 Changes:
> 
> > * Used rte_mempool_ops_calc_mem_size() instead of default handler in the
> 
> > new mempool populate routine.
> 
> > * Check min_chunk_size and return values.
> 
> > * Removed ethdev_info memset to '0' and moved pci dev_info populate into
> 
> > kni_dev_pci_addr_get() routine.
> 
> > * Addressed misc. review comments.
> 
> >
> 
> > V8 Changes:
> 
> > * Remove default mempool populate() routine changes.
> 
> > * Add kni app specific mempool create & free routines.
> 
> > * Add new mempool populate routine to allocate page-aligned memzones
> 
> > with page size to make sure all mempool objects reside on a page.
> 
> > * Update release notes and map files.
> 
> >
> 
> > V7 Changes:
> 
> > * Removed previously proposed mempool flag and made those page
> 
> > boundary checks default in mempool populate() except for the objects size
> 
> > bigger than the size of page.
> 
> > * Removed KNI example application related changes since pool related
> 
> > requirement is taken care in mempool lib.
> 
> > * All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
> 
> > * Added wrapper functions in KNI module to hide IOVA checks and make
> 
> > address translation routines more readable.
> 
> > * Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
> 
> > mode is enabled.
> 
> >
> 
> > V6 Changes:
> 
> > * Added new mempool flag to ensure mbuf memory is not scattered across
> 
> > page boundaries.
> 
> > * Added KNI kernel module required PCI device information.
> 
> > * Modified KNI example application to create mempool with new mempool
> 
> > flag.
> 
> >
> 
> > V5 changes:
> 
> > * Fixed build issue with 32b build
> 
> >
> 
> > V4 changes:
> 
> > * Fixed build issues with older kernel versions
> 
> > * This approach will only work with kernel above 4.4.0
> 
> >
> 
> > V3 Changes:
> 
> > * Add new approach to work kni with IOVA=VA mode using
> 
> > iommu_iova_to_phys API.
> 
> >
> 
> > Vamsi Attunuru (2):
> 
> >   kni: add IOVA=VA mode support
> 
> >   kni: add IOVA=VA support in kernel module
> 
> >
> 
> >  doc/guides/prog_guide/kernel_nic_interface.rst    |  9 ++++
> 
> >  doc/guides/rel_notes/release_19_11.rst            |  5 ++
> 
> >  kernel/linux/kni/compat.h                         | 15 ++++++
> 
> >  kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
> 
> >  kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
> 
> >  kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
> 
> >  lib/librte_eal/linux/eal/eal.c                    | 29 ++++++-----
> 
> >  lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
> 
> >  lib/librte_kni/rte_kni.c                          |  7 +--
> 
> >  9 files changed, 170 insertions(+), 39 deletions(-)
> 
> >
> 
> 


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re:  [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-08  4:16                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-11-08 14:26                             ` Ferruh Yigit
  2019-11-08 14:54                               ` Jerin Jacob
  0 siblings, 1 reply; 251+ messages in thread
From: Ferruh Yigit @ 2019-11-08 14:26 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, dev
  Cc: thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen


>> Hi Vasim, Jerin,
>>
>> Overall looks good and I not getting any functional error but I am observing a
>> huge performance drop with this update, 3.8Mpps to 0.7Mpps [1].	
> 
> Hi Ferruh,
> When it comes to actual kernel netdev test cases  like iperf or any other use cases, there would not be any impact on performance. I think synthetic test case like loopback mode might not be the actual test case alone to depend on when the kernel module is featured to work with kind of devices(pdev or vdev). Users can always fallback to pa mode with cmd line option.
> 
> Please suggest your thoughts on considering what test case to use & evaluate the performance difference.

Hi Vasim,

I also assume the real-life test cases will be affected less, but the loopback
performance test is good to show the performance impact of the change.

(Stephen's predictions that KNI is not as fast as tun/tap are getting more real
over time :))

At least I think the possible performance drop and how to mitigate it should be
documented both in release notes and kni documentation.

For the final decision, I am not objecting to it, but I would like to see more acks
from the community to confirm that we trade off iova=va functionality against
performance.

@Jerin, @Thomas, should we conclude this in techboard? Perhaps we can get it for
rc2 and drop it back if rejected in techboard?


Regards,
ferruh

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-08 14:26                             ` Ferruh Yigit
@ 2019-11-08 14:54                               ` Jerin Jacob
  2019-11-13  6:33                                 ` Vamsi Krishna Attunuru
  0 siblings, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-11-08 14:54 UTC (permalink / raw)
  To: Ferruh Yigit
  Cc: Vamsi Krishna Attunuru, dev, thomas, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, olivier.matz, anatoly.burakov,
	arybchenko, stephen

On Fri, Nov 8, 2019 at 7:56 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>
>
> >> Hi Vasim, Jerin,
> >>
> >> Overall looks good and I not getting any functional error but I am observing a
> >> huge performance drop with this update, 3.8Mpps to 0.7Mpps [1].
> >
> > Hi Ferruh,
> > When it comes to actual kernel netdev test cases  like iperf or any other use cases, there would not be any impact on performance. I think synthetic test case like loopback mode might not be the actual test case alone to depend on when the kernel module is featured to work with kind of devices(pdev or vdev). Users can always fallback to pa mode with cmd line option.
> >
> > Please suggest your thoughts on considering what test case to use & evaluate the performance difference.
>
> Hi Vasim,
>
> I also assume the real life test cases will be affected less, but the loopback
> performance testing is good to show performance impact of the change.

Yes. In the real-world case the Linux kernel stack will be the bottleneck.

>
> (Stephen's predictions that KNI is not as fast as tun/tap are getting more real
> by time J)
>
> At least I think the possible performance drop and how to mitigate it should be
> documented both in release notes and kni documentation.

+1 for adding documentation. Setting iova-mode=pa will be the mitigation
if the application does not care about the iova-mode.

>
> For the final decision, I am not objecting it but I would like to see more ack
> from community to confirm that we trade off iova=va functionality against
> performance.

In my view, in the IOVA as VA mode case the translation cannot be avoided, and
we have the requirement that it needs to work with vdev (which is not backed by
any IOMMU context), so I am not sure how to avoid the translation cost. Since we
have support for both modes, i.e. the existing IOVA as PA path still exists, I
don't think we are losing anything.
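
To make the point that both modes remain selectable concrete, here is a
minimal sketch (rte_eal_init(), rte_eal_iova_mode() and rte_eal_cleanup()
are the real EAL calls; the printed messages are only illustrative) showing
how an application or test harness can see which addressing mode EAL
selected, and therefore which KNI path the kernel module will take:

#include <stdio.h>
#include <rte_eal.h>

int main(int argc, char **argv)
{
	/* EAL picks the IOVA mode from the bus scan or the --iova-mode option */
	if (rte_eal_init(argc, argv) < 0)
		return 1;

	if (rte_eal_iova_mode() == RTE_IOVA_VA)
		printf("IOVA=VA: KNI translates each address via user pages\n");
	else
		printf("IOVA=PA: KNI keeps the direct phys_to_virt() path\n");

	rte_eal_cleanup();
	return 0;
}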

> @Jerin, @Thomas, should we conclude this in techboard? Perhaps we can get it for
> rc2 and drop it back if rejected in techboard?
>
> Regards,
> ferruh

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-08 14:54                               ` Jerin Jacob
@ 2019-11-13  6:33                                 ` Vamsi Krishna Attunuru
  2019-11-13 12:32                                   ` Ferruh Yigit
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-13  6:33 UTC (permalink / raw)
  To: Jerin Jacob, Ferruh Yigit
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen


> -----Original Message-----
> From: Jerin Jacob <jerinjacobk@gmail.com>
> Sent: Friday, November 8, 2019 8:24 PM
> To: Ferruh Yigit <ferruh.yigit@intel.com>
> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
> Kumar Kokkilagadda <kirankumark@marvell.com>; olivier.matz@6wind.com;
> anatoly.burakov@intel.com; arybchenko@solarflare.com;
> stephen@networkplumber.org
> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
> 
> On Fri, Nov 8, 2019 at 7:56 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> >
> >
> > >> Hi Vasim, Jerin,
> > >>
> > >> Overall looks good and I not getting any functional error but I am
> > >> observing a huge performance drop with this update, 3.8Mpps to 0.7Mpps
> [1].
> > >
> > > Hi Ferruh,
> > > When it comes to actual kernel netdev test cases  like iperf or any other use
> cases, there would not be any impact on performance. I think synthetic test case
> like loopback mode might not be the actual test case alone to depend on when
> the kernel module is featured to work with kind of devices(pdev or vdev). Users
> can always fallback to pa mode with cmd line option.
> > >
> > > Please suggest your thoughts on considering what test case to use &
> evaluate the performance difference.
> >
> > Hi Vasim,
> >
> > I also assume the real life test cases will be affected less, but the
> > loopback performance testing is good to show performance impact of the
> change.
> 
> Yes. real-world case Linux kernel stack will be the bottleneck.
> 
> >
> > (Stephen's predictions that KNI is not as fast as tun/tap are getting
> > more real by time J)
> >
> > At least I think the possible performance drop and how to mitigate it
> > should be documented both in release notes and kni documentation.
> 
> +1 for adding documentation. Setting iova-mode=pa will be mitigation
> if the application does
> not care about iova-mode.
> 
> >
> > For the final decision, I am not objecting it but I would like to see
> > more ack from community to confirm that we trade off iova=va
> > functionality against performance.
> 
> In my view, IOVA as VA mode case, translation cannot be avoided and we have
> the requirement where it needs to work with vdev(where is not backed by any
> IOMMU context) so I am not sure how to avoid the translation cost. Since we
> have support for both modes,i.e existing IOVA as PA path still exists, I don't
> think, we are losing anything.
> 
> > @Jerin, @Thomas, should we conclude this in techboard? Perhaps we can
> > get it for
> > rc2 and drop it back if rejected in techboard?

Hi Ferruh,

Any update on the conclusion about the acceptance of the patch set? I will send the next version of the patch set with the documentation updated on the performance impact, if there are no other concerns.

Regards
A Vamsi

> >
> > Regards,
> > ferruh

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
  2019-11-13  6:33                                 ` Vamsi Krishna Attunuru
@ 2019-11-13 12:32                                   ` Ferruh Yigit
  0 siblings, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-11-13 12:32 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru, Jerin Jacob
  Cc: dev, thomas, Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	olivier.matz, anatoly.burakov, arybchenko, stephen

On 11/13/2019 6:33 AM, Vamsi Krishna Attunuru wrote:
> 
>> -----Original Message-----
>> From: Jerin Jacob <jerinjacobk@gmail.com>
>> Sent: Friday, November 8, 2019 8:24 PM
>> To: Ferruh Yigit <ferruh.yigit@intel.com>
>> Cc: Vamsi Krishna Attunuru <vattunuru@marvell.com>; dev@dpdk.org;
>> thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Kiran
>> Kumar Kokkilagadda <kirankumark@marvell.com>; olivier.matz@6wind.com;
>> anatoly.burakov@intel.com; arybchenko@solarflare.com;
>> stephen@networkplumber.org
>> Subject: Re: [dpdk-dev] [EXT] Re: [PATCH v12 0/2] add IOVA=VA mode support
>>
>> On Fri, Nov 8, 2019 at 7:56 PM Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>>>
>>>
>>>>> Hi Vasim, Jerin,
>>>>>
>>>>> Overall looks good and I not getting any functional error but I am
>>>>> observing a huge performance drop with this update, 3.8Mpps to 0.7Mpps
>> [1].
>>>>
>>>> Hi Ferruh,
>>>> When it comes to actual kernel netdev test cases  like iperf or any other use
>> cases, there would not be any impact on performance. I think synthetic test case
>> like loopback mode might not be the actual test case alone to depend on when
>> the kernel module is featured to work with kind of devices(pdev or vdev). Users
>> can always fallback to pa mode with cmd line option.
>>>>
>>>> Please suggest your thoughts on considering what test case to use &
>> evaluate the performance difference.
>>>
>>> Hi Vasim,
>>>
>>> I also assume the real life test cases will be affected less, but the
>>> loopback performance testing is good to show performance impact of the
>> change.
>>
>> Yes. real-world case Linux kernel stack will be the bottleneck.
>>
>>>
>>> (Stephen's predictions that KNI is not as fast as tun/tap are getting
>>> more real by time J)
>>>
>>> At least I think the possible performance drop and how to mitigate it
>>> should be documented both in release notes and kni documentation.
>>
>> +1 for adding documentation. Setting iova-mode=pa will be mitigation
>> if the application does
>> not care about iova-mode.
>>
>>>
>>> For the final decision, I am not objecting it but I would like to see
>>> more ack from community to confirm that we trade off iova=va
>>> functionality against performance.
>>
>> In my view, IOVA as VA mode case, translation cannot be avoided and we have
>> the requirement where it needs to work with vdev(where is not backed by any
>> IOMMU context) so I am not sure how to avoid the translation cost. Since we
>> have support for both modes,i.e existing IOVA as PA path still exists, I don't
>> think, we are losing anything.
>>
>>> @Jerin, @Thomas, should we conclude this in techboard? Perhaps we can
>>> get it for
>>> rc2 and drop it back if rejected in techboard?
> 
> Hi Ferruh,
> 
> Any update on the conclusion about the acceptance of patch set.  I will send the next version of patch set with updated documentation part on performance impact if there are no other concerns. 
> 

Hi Vamsi,

From my perspective there is no change request other than the documentation
update, so I suggest sending a new version.

For the final decision, it can go to the technical board, but there is no meeting
this week, so I will start an offline discussion.

Thanks,
ferruh

^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v12 1/2] kni: add IOVA=VA mode support
  2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 1/2] kni: " vattunuru
@ 2019-11-14 10:57                           ` David Marchand
  2019-11-14 11:13                             ` David Marchand
  2019-11-14 17:48                             ` [dpdk-dev] " David Marchand
  0 siblings, 2 replies; 251+ messages in thread
From: David Marchand @ 2019-11-14 10:57 UTC (permalink / raw)
  To: Vamsi Attunuru
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran, kirankumark,
	Olivier Matz, Yigit, Ferruh, Burakov, Anatoly, Andrew Rybchenko,
	Stephen Hemminger

On Tue, Nov 5, 2019 at 12:05 PM <vattunuru@marvell.com> wrote:
>
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Current KNI implementation only operates in IOVA_PA mode
> patch adds required functionality to enable KNI in
> IOVA_VA mode.
>
> KNI can operate in IOVA_VA mode when this mode is requested
> using eal option (`iova-mode=va`) or when bus iommu scheme is
> selected as RTE_IOVA_VA.
>
> During KNI creation, app's iova_mode details are passed to
> the KNI kernel module, accordingly kernel module translates
> PA/IOVA addresses to KVA and vice-versa.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> ---
>  doc/guides/prog_guide/kernel_nic_interface.rst    |  9 +++++++
>  doc/guides/rel_notes/release_19_11.rst            |  5 ++++
>  lib/librte_eal/linux/eal/eal.c                    | 29 +++++++++++++----------
>  lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
>  lib/librte_kni/rte_kni.c                          |  7 ++----
>  5 files changed, 34 insertions(+), 17 deletions(-)
>
> diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
> index 2fd58e1..f869493 100644
> --- a/doc/guides/prog_guide/kernel_nic_interface.rst
> +++ b/doc/guides/prog_guide/kernel_nic_interface.rst
> @@ -300,6 +300,15 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
>  The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
>  It then puts the mbuf back in the cache.
>
> +IOVA = VA: Support
> +------------------
> +
> +KNI operates in IOVA_VA scheme when
> +
> +- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) and
> +- eal option `iova-mode=va` is passed or bus IOVA scheme in the DPDK is selected
> +  as RTE_IOVA_VA.
> +
>  Ethtool
>  -------
>
> diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
> index ae8e7b2..e5d2996 100644
> --- a/doc/guides/rel_notes/release_19_11.rst
> +++ b/doc/guides/rel_notes/release_19_11.rst
> @@ -231,6 +231,11 @@ New Features
>    * Added a console command to testpmd app, ``show port (port_id) ptypes`` which
>      gives ability to print port supported ptypes in different protocol layers.
>
> +* **Added IOVA as VA support for KNI.**
> +
> +  Added IOVA = VA support for KNI, KNI can operate in IOVA = VA mode when
> +  `iova-mode=va` eal option is passed to the application or when bus IOVA
> +  scheme is selected as RTE_IOVA_VA.

Add a mention about the kernel requirement here too please.

Missing newline.

>
>  Removed Items
>  -------------
> diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
> index 9e2d50c..a1c5bf6 100644
> --- a/lib/librte_eal/linux/eal/eal.c
> +++ b/lib/librte_eal/linux/eal/eal.c
> @@ -922,6 +922,19 @@ static int rte_eal_vfio_setup(void)
>  }
>  #endif
>
> +static enum rte_iova_mode
> +rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode)
> +{
> +       if (iova_mode == RTE_IOVA_VA) {
> +#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE

I understood that IOVA as VA is possible with kernel >= 4.6.
Should it be < ?


> +               iova_mode = RTE_IOVA_PA;
> +               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module does not support VA\n");
> +#endif
> +       }
> +
> +       return iova_mode;
> +}
> +
>  static void rte_eal_init_alert(const char *msg)
>  {
>         fprintf(stderr, "EAL: FATAL: %s\n", msg);
> @@ -1085,24 +1098,16 @@ rte_eal_init(int argc, char **argv)
>                                 RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
>                         }
>                 }
> -#ifdef RTE_LIBRTE_KNI

You removed this part, but we don't have to consider kni things if
RTE_LIBRTE_KNI is not compiled in.


> -               /* Workaround for KNI which requires physical address to work */
> -               if (iova_mode == RTE_IOVA_VA &&
> -                               rte_eal_check_module("rte_kni") == 1) {
> -                       if (phys_addrs) {

The check on physical address availability has been lost.


> -                               iova_mode = RTE_IOVA_PA;
> -                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
> -                       } else {
> -                               RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
> -                       }
> -               }
> -#endif
>                 rte_eal_get_configuration()->iova_mode = iova_mode;
>         } else {
>                 rte_eal_get_configuration()->iova_mode =
>                         internal_config.iova_mode;
>         }
>
> +       if (rte_eal_check_module("rte_kni") == 1)
> +               rte_eal_get_configuration()->iova_mode =
> +                               rte_eal_kni_get_iova_mode(rte_eal_iova_mode());
> +

You are forcing this even if the user specified an iova mode.


In the end, can't we just change the check on the workaround?
Something like:

@@ -1085,7 +1085,7 @@ rte_eal_init(int argc, char **argv)
                                RTE_LOG(DEBUG, EAL, "IOMMU is not
available, selecting IOVA as PA mode.\n");
                        }
                }
-#ifdef RTE_LIBRTE_KNI
+#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) < LINUX_VERSION_CODE
                /* Workaround for KNI which requires physical address to work */
                if (iova_mode == RTE_IOVA_VA &&
                                rte_eal_check_module("rte_kni") == 1) {


It would be the only change in the whole file lib/librte_eal/linux/eal/eal.c


>         if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
>                 rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
>                 rte_errno = EINVAL;
> diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> index 46f75a7..2427a96 100644
> --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
> +++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
> @@ -125,6 +125,7 @@ struct rte_kni_device_info {
>         unsigned int min_mtu;
>         unsigned int max_mtu;
>         uint8_t mac_addr[6];
> +       uint8_t iova_mode;
>  };
>
>  #define KNI_DEVICE "kni"
> diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
> index 7fbcf22..7221280 100644
> --- a/lib/librte_kni/rte_kni.c
> +++ b/lib/librte_kni/rte_kni.c
> @@ -97,11 +97,6 @@ static volatile int kni_fd = -1;
>  int
>  rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
>  {

Should this check still apply with kernel < 4.6?
Like:
+ #if KERNEL_VERSION(4, 6, 0) < LINUX_VERSION_CODE
> -       if (rte_eal_iova_mode() != RTE_IOVA_PA) {
> -               RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
> -               return -1;
> -       }
> -
+ #endif


>         /* Check FD and open */
>         if (kni_fd < 0) {
>                 kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
> @@ -302,6 +297,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
>         kni->group_id = conf->group_id;
>         kni->mbuf_size = conf->mbuf_size;
>
> +       dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
> +
>         ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
>         if (ret < 0)
>                 goto ioctl_fail;
> --
> 2.8.4
>


-- 
David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v12 1/2] kni: add IOVA=VA mode support
  2019-11-14 10:57                           ` David Marchand
@ 2019-11-14 11:13                             ` David Marchand
  2019-11-14 12:10                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-11-14 17:48                             ` [dpdk-dev] " David Marchand
  1 sibling, 1 reply; 251+ messages in thread
From: David Marchand @ 2019-11-14 11:13 UTC (permalink / raw)
  To: Vamsi Attunuru
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran, kirankumark,
	Olivier Matz, Yigit, Ferruh, Burakov, Anatoly, Andrew Rybchenko,
	Stephen Hemminger, Luca Boccassi

On Thu, Nov 14, 2019 at 11:57 AM David Marchand
<david.marchand@redhat.com> wrote:
> On Tue, Nov 5, 2019 at 12:05 PM <vattunuru@marvell.com> wrote:
> > diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
> > index 9e2d50c..a1c5bf6 100644
> > --- a/lib/librte_eal/linux/eal/eal.c
> > +++ b/lib/librte_eal/linux/eal/eal.c
> > @@ -922,6 +922,19 @@ static int rte_eal_vfio_setup(void)
> >  }
> >  #endif
> >
> > +static enum rte_iova_mode
> > +rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode)
> > +{
> > +       if (iova_mode == RTE_IOVA_VA) {
> > +#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
>
> I understood that IOVA as VA is possible with kernel >= 4.6.
> Should it be < ?

I meant "Should the #if check be < ?".

I did not mention this first, but thinking again, don't we have an issue here?
This won't work for distributions that ship dpdk with kni built with dkms.

Cc: Luca.


-- 
David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 1/2] kni: add IOVA=VA mode support
  2019-11-14 11:13                             ` David Marchand
@ 2019-11-14 12:10                               ` Vamsi Krishna Attunuru
  2019-11-14 12:25                                 ` David Marchand
  0 siblings, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-14 12:10 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Olivier Matz, Yigit, Ferruh, Burakov,
	Anatoly, Andrew Rybchenko, Stephen Hemminger, Luca Boccassi



> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Thursday, November 14, 2019 4:43 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> Cc: dev <dev@dpdk.org>; Thomas Monjalon <thomas@monjalon.net>; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; Olivier Matz <olivier.matz@6wind.com>; Yigit,
> Ferruh <ferruh.yigit@intel.com>; Burakov, Anatoly
> <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Stephen Hemminger
> <stephen@networkplumber.org>; Luca Boccassi <bluca@debian.org>
> Subject: [EXT] Re: [dpdk-dev] [PATCH v12 1/2] kni: add IOVA=VA mode support
> 
> External Email
> 
> ----------------------------------------------------------------------
> On Thu, Nov 14, 2019 at 11:57 AM David Marchand
> <david.marchand@redhat.com> wrote:
> > On Tue, Nov 5, 2019 at 12:05 PM <vattunuru@marvell.com> wrote:
> > > diff --git a/lib/librte_eal/linux/eal/eal.c
> > > b/lib/librte_eal/linux/eal/eal.c index 9e2d50c..a1c5bf6 100644
> > > --- a/lib/librte_eal/linux/eal/eal.c
> > > +++ b/lib/librte_eal/linux/eal/eal.c
> > > @@ -922,6 +922,19 @@ static int rte_eal_vfio_setup(void)  }  #endif
> > >
> > > +static enum rte_iova_mode
> > > +rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode) {
> > > +       if (iova_mode == RTE_IOVA_VA) { #if KERNEL_VERSION(4, 6, 0)
> > > +> LINUX_VERSION_CODE
> >
> > I understood that IOVA as VA is possible with kernel >= 4.6.
> > Should it be < ?
> 
> I meant "Should the #if check be < ?".
> 
> I did not mention this first, but thinking again, don't we have an issue here?
> This won't work for distributions that ship dpdk with kni built with dkms.
> 

Hi David,

Are you referring to the following problem: dpdk was built against some older kernel version (<4.6) and shipped, and later the kernel was updated to 4.8 and the kni module got built with the newer kernel? In this case, the kni application will continue to work in PA mode without any issues. The KNI module decides its iova mode from what the application has communicated during kni allocation. If the user expectation is to make the KNI application work in VA mode, then the check has to be dynamic. Please correct me if the anticipated issue is something different.

BTW, eal has a few of these kernel version checks in header files too (rte_vfio.h & eal_vfio.h); not sure if it would cause similar issues.
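
For reference, here is a minimal sketch (real librte_kni API; the interface
name and mbuf size are placeholder values) of the allocation path being
discussed. There is no IOVA argument here: with the patch under review,
rte_kni_alloc() fills dev_info.iova_mode from rte_eal_iova_mode() before
issuing the create ioctl, and the kernel module chooses its translation path
from that flag:

#include <string.h>
#include <rte_mempool.h>
#include <rte_kni.h>

static struct rte_kni *
create_kni_iface(struct rte_mempool *pktmbuf_pool)
{
	struct rte_kni_conf conf;

	memset(&conf, 0, sizeof(conf));
	strncpy(conf.name, "vEth0", RTE_KNI_NAMESIZE); /* placeholder name */
	conf.mbuf_size = 2048;                         /* placeholder size */

	/* The IOVA mode is taken from EAL inside rte_kni_alloc(), not here. */
	return rte_kni_alloc(pktmbuf_pool, &conf, NULL);
}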

Regards
A Vamsi

> Cc: Luca.
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re: [PATCH v12 1/2] kni: add IOVA=VA mode support
  2019-11-14 12:10                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-11-14 12:25                                 ` David Marchand
  0 siblings, 0 replies; 251+ messages in thread
From: David Marchand @ 2019-11-14 12:25 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Olivier Matz, Yigit, Ferruh, Burakov,
	Anatoly, Andrew Rybchenko, Stephen Hemminger, Luca Boccassi

On Thu, Nov 14, 2019 at 1:10 PM Vamsi Krishna Attunuru
<vattunuru@marvell.com> wrote:
>
>
>
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > Sent: Thursday, November 14, 2019 4:43 PM
> > To: Vamsi Krishna Attunuru <vattunuru@marvell.com>
> > Cc: dev <dev@dpdk.org>; Thomas Monjalon <thomas@monjalon.net>; Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > <kirankumark@marvell.com>; Olivier Matz <olivier.matz@6wind.com>; Yigit,
> > Ferruh <ferruh.yigit@intel.com>; Burakov, Anatoly
> > <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Stephen Hemminger
> > <stephen@networkplumber.org>; Luca Boccassi <bluca@debian.org>
> > Subject: [EXT] Re: [dpdk-dev] [PATCH v12 1/2] kni: add IOVA=VA mode support
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > On Thu, Nov 14, 2019 at 11:57 AM David Marchand
> > <david.marchand@redhat.com> wrote:
> > > On Tue, Nov 5, 2019 at 12:05 PM <vattunuru@marvell.com> wrote:
> > > > diff --git a/lib/librte_eal/linux/eal/eal.c
> > > > b/lib/librte_eal/linux/eal/eal.c index 9e2d50c..a1c5bf6 100644
> > > > --- a/lib/librte_eal/linux/eal/eal.c
> > > > +++ b/lib/librte_eal/linux/eal/eal.c
> > > > @@ -922,6 +922,19 @@ static int rte_eal_vfio_setup(void)  }  #endif
> > > >
> > > > +static enum rte_iova_mode
> > > > +rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode) {
> > > > +       if (iova_mode == RTE_IOVA_VA) { #if KERNEL_VERSION(4, 6, 0)
> > > > +> LINUX_VERSION_CODE
> > >
> > > I understood that IOVA as VA is possible with kernel >= 4.6.
> > > Should it be < ?
> >
> > I meant "Should the #if check be < ?".
> >
> > I did not mention this first, but thinking again, don't we have an issue here?
> > This won't work for distributions that ship dpdk with kni built with dkms.
> >
>
> Hi David,
>
> Are you referring the following problem, like if dpdk was build with some older kernel (<4.6) version and shipped, later kernel was updated to 4.8 and kni module got build with newer kernel.?, In this case, kni application will continue work in PA mode without any issues. KNI module decides it's iova mode that application has communicated during kni allocation. If the user expectation is to make KNI application work in VA mode, the check has to be dynamic than. Please correct me if the issue that is anticipated is something different.

The dpdk application will request PA (since it is compiled with no regard
to the actual kni capabilities) and will refuse to run as a non-root
user.
Which is already the case before this change.
Ok, maybe something to improve later if people want to use kni as non-root.


My other comments still apply though.

--
David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v12 1/2] kni: add IOVA=VA mode support
  2019-11-14 10:57                           ` David Marchand
  2019-11-14 11:13                             ` David Marchand
@ 2019-11-14 17:48                             ` David Marchand
  1 sibling, 0 replies; 251+ messages in thread
From: David Marchand @ 2019-11-14 17:48 UTC (permalink / raw)
  To: Vamsi Attunuru
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Olivier Matz, Yigit, Ferruh, Burakov,
	Anatoly, Andrew Rybchenko, Stephen Hemminger

On Thu, Nov 14, 2019 at 11:57 AM David Marchand
<david.marchand@redhat.com> wrote:
> On Tue, Nov 5, 2019 at 12:05 PM <vattunuru@marvell.com> wrote:
> > diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
> > index 9e2d50c..a1c5bf6 100644
> > --- a/lib/librte_eal/linux/eal/eal.c
> > +++ b/lib/librte_eal/linux/eal/eal.c
> > @@ -922,6 +922,19 @@ static int rte_eal_vfio_setup(void)
> >  }
> >  #endif
> >
> > +static enum rte_iova_mode
> > +rte_eal_kni_get_iova_mode(enum rte_iova_mode iova_mode)
> > +{
> > +       if (iova_mode == RTE_IOVA_VA) {
> > +#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
>
> I understood that IOVA as VA is possible with kernel >= 4.6.
> Should it be < ?

I had read it backwards.
So, the check is ok.
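
As a side note, since the direction of that comparison confused a few of us,
here is a tiny standalone illustration (not taken from the patch;
<linux/version.h> is also available to user space builds, and the messages
are illustrative). The guarded branch is only compiled against kernel
headers older than 4.6, which is exactly where the IOVA=VA translation
support is missing:

#include <stdio.h>
#include <linux/version.h>

int main(void)
{
#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
	/* headers older than 4.6: KNI has to stay with IOVA as PA */
	printf("pre-4.6 kernel headers: forcing IOVA as PA for KNI\n");
#else
	/* 4.6 or newer: the IOVA as VA translation path is available */
	printf("kernel headers >= 4.6: IOVA as VA is usable with KNI\n");
#endif
	return 0;
}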

[snip]

> In the end, can't we just change the check on the workaround?
> Something like:
>
> @@ -1085,7 +1085,7 @@ rte_eal_init(int argc, char **argv)
>                                 RTE_LOG(DEBUG, EAL, "IOMMU is not
> available, selecting IOVA as PA mode.\n");
>                         }
>                 }
> -#ifdef RTE_LIBRTE_KNI
> +#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) < LINUX_VERSION_CODE

Then it becomes:

+#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE

>                 /* Workaround for KNI which requires physical address to work */
>                 if (iova_mode == RTE_IOVA_VA &&
>                                 rte_eal_check_module("rte_kni") == 1) {
>
>
> It would be the only change in the whole file lib/librte_eal/linux/eal/eal.c


-- 
David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v13 0/2] kni: support IOVA mode
  2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
                                           ` (3 preceding siblings ...)
  2019-11-07 19:53                         ` [dpdk-dev] " Ferruh Yigit
@ 2019-11-15 11:18                         ` vattunuru
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 1/2] kni: support IOVA mode in kernel module vattunuru
                                             ` (2 more replies)
  4 siblings, 3 replies; 251+ messages in thread
From: vattunuru @ 2019-11-15 11:18 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, david.marchand,
	Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V13 Changes:
* For KNI case, to restore the existing behaviour, forcing IOVA as PA
if bus iommu returns IOVA_DC.
* Restored phys_addr and kni_lib defination checks.
* Simplified patch titles.
* Updated documentation about patch set's impact on performance and
kernel version requirement.

V12 Changes:
* Removed previously added `--legacy-kni` eal option.
* Removed previously added kni specific mempool create routines
and mempool populate routines.

V11 Changes:
* Added iova to kva address translation routines in kernel module to
make it work in iova=va mode which enables DPDK to create kni devices
on any kind of backed device/memory.
* Added ``--legacy-kni`` eal option to make existing KNI applications
work with DPDK 19.11 and later versions.
* Removed previously added pci device info from kni device info struct.
 
V10 Changes:
* Fixed function return code on failure when min_chunk_size > pg_sz.
* Marked new mempool populate routine as EXPERIMENTAL.
 
V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in the
new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.
 
V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones
with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.
 
V7 Changes:
* Removed previously proposed mempool flag and made those page
boundary checks default in mempool populate() except for the objects size
bigger than the size of page.
* Removed KNI example application related changes since pool related
requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
mode is enabled.
 
V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered across
page boundaries.
* Added KNI kernel module required PCI device information.
* Modified KNI example application to create mempool with new mempool
flag.
 
V5 changes:
* Fixed build issue with 32b build
 
V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernel above 4.4.0

Vamsi Attunuru (2):
  kni: support IOVA mode in kernel module
  kni: support IOVA mode

 doc/guides/prog_guide/kernel_nic_interface.rst    | 14 +++++
 doc/guides/rel_notes/release_19_11.rst            | 12 ++++-
 kernel/linux/kni/compat.h                         | 14 +++++
 kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
 kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
 kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
 lib/librte_eal/linux/eal/eal.c                    | 19 ++++---
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/rte_kni.c                          |  6 +++
 9 files changed, 180 insertions(+), 29 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v13 1/2] kni: support IOVA mode in kernel module
  2019-11-15 11:18                         ` [dpdk-dev] [PATCH v13 0/2] kni: support IOVA mode vattunuru
@ 2019-11-15 11:18                           ` vattunuru
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode vattunuru
  2019-11-15 17:07                           ` [dpdk-dev] [PATCH v14 0/2] " vattunuru
  2 siblings, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-11-15 11:18 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, david.marchand,
	Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Patch adds support for kernel module to work in
IOVA = VA mode by providing address translation
routines to convert IOVA aka user space VA to
kernel virtual addresses.

When compared with IOVA = PA mode, KNI netdev ports
do not see any performance impact with this
approach; there is also no performance impact on
IOVA = PA mode with this patch.

This approach does not work with kernel versions
less than 4.6.0.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 kernel/linux/kni/compat.h                         | 14 +++++
 kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
 kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
 kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 5 files changed, 136 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..062b170 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,17 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 6, 0) <= LINUX_VERSION_CODE
+
+#define HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+
+#if KERNEL_VERSION(4, 9, 0) > LINUX_VERSION_CODE
+#define GET_USER_PAGES_REMOTE_API_V1
+#elif KERNEL_VERSION(4, 9, 0) == LINUX_VERSION_CODE
+#define GET_USER_PAGES_REMOTE_API_V2
+#else
+#define GET_USER_PAGES_REMOTE_API_V3
+#endif
+
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..fb641b6 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -41,6 +41,8 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
@@ -84,8 +86,48 @@ struct kni_dev {
 	void *va[MBUF_BURST_SZ];
 	void *alloc_pa[MBUF_BURST_SZ];
 	void *alloc_va[MBUF_BURST_SZ];
+
+	struct task_struct *usr_tsk;
 };
 
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+static inline phys_addr_t iova_to_phys(struct task_struct *tsk,
+				       unsigned long iova)
+{
+	phys_addr_t offset, phys_addr;
+	struct page *page = NULL;
+	long ret;
+
+	offset = iova & (PAGE_SIZE - 1);
+
+	/* Read one page struct info */
+#ifdef GET_USER_PAGES_REMOTE_API_V3
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    FOLL_TOUCH, &page, NULL, NULL);
+#endif
+#ifdef GET_USER_PAGES_REMOTE_API_V2
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    FOLL_TOUCH, &page, NULL);
+#endif
+#ifdef GET_USER_PAGES_REMOTE_API_V1
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    0, 0, &page, NULL);
+#endif
+	if (ret < 0)
+		return 0;
+
+	phys_addr = page_to_phys(page) | offset;
+	put_page(page);
+
+	return phys_addr;
+}
+
+static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova)
+{
+	return phys_to_virt(iova_to_phys(tsk, iova));
+}
+#endif
+
 void kni_net_release_fifo_phy(struct kni_dev *kni);
 void kni_net_rx(struct kni_dev *kni);
 void kni_net_init(struct net_device *dev);
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 84ef03b..cda71bd 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -348,15 +348,36 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+		kni->tx_q = iova_to_kva(current, dev_info.tx_phys);
+		kni->rx_q = iova_to_kva(current, dev_info.rx_phys);
+		kni->alloc_q = iova_to_kva(current, dev_info.alloc_phys);
+		kni->free_q = iova_to_kva(current, dev_info.free_phys);
+
+		kni->req_q = iova_to_kva(current, dev_info.req_phys);
+		kni->resp_q = iova_to_kva(current, dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = iova_to_kva(current, dev_info.sync_phys);
+		kni->usr_tsk = current;
+		kni->iova_mode = 1;
+#else
+		pr_err("KNI module does not support IOVA to VA translation\n");
+		return -EINVAL;
+#endif
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index f25b127..1ba9b1b 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *iova)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, (unsigned long)iova));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, m->buf_physaddr) +
+			    m->data_off);
+}
+#endif
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +78,26 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+#endif
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+#endif
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +214,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +302,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +374,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +473,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +485,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +553,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +586,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 46f75a7..2427a96 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -125,6 +125,7 @@ struct rte_kni_device_info {
 	unsigned int min_mtu;
 	unsigned int max_mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 11:18                         ` [dpdk-dev] [PATCH v13 0/2] kni: support IOVA mode vattunuru
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 1/2] kni: support IOVA mode in kernel module vattunuru
@ 2019-11-15 11:18                           ` vattunuru
  2019-11-15 12:11                             ` Ferruh Yigit
  2019-11-15 12:59                             ` David Marchand
  2019-11-15 17:07                           ` [dpdk-dev] [PATCH v14 0/2] " vattunuru
  2 siblings, 2 replies; 251+ messages in thread
From: vattunuru @ 2019-11-15 11:18 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, david.marchand,
	Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

Current KNI implementation only operates in IOVA_PA mode
patch adds required functionality to enable KNI in
IOVA_VA mode.

KNI loopback mode tests will have performance impact in
this mode due to IOVA to KVA address translations.
However, In KNI real world use cases, the performace
impact will be based on Linux kernel stack and scheduler
latencies. Performance varies based on the KNI use case.
If bus iommu scheme is IOVA_DC and KNI module is loaded,
DPDK chooses IOVA as PA as existing behaviour.

During KNI creation, app's iova_mode details are passed to
the KNI kernel module, accordingly kernel module translates
PA/IOVA addresses to KVA and vice-versa.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
---
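Note for reviewers: nothing changes in the application-facing KNI API; the
IOVA mode is fixed at EAL init time and rte_kni_alloc() forwards it to the
kernel module. A minimal usage sketch, illustrative only, with error
handling trimmed:

#include <stdio.h>
#include <rte_eal.h>
#include <rte_kni.h>

int main(int argc, char **argv)
{
	/* e.g. started with "--iova-mode=va" on a >= 4.6 kernel, or left
	 * to the bus IOVA autodetection. */
	if (rte_eal_init(argc, argv) < 0)
		return -1;

	printf("KNI will use IOVA as %s\n",
	       rte_eal_iova_mode() == RTE_IOVA_VA ? "VA" : "PA");

	/* rte_kni_init()/rte_kni_alloc() calls are unchanged; the library
	 * now also fills dev_info.iova_mode before the create ioctl. */
	rte_kni_init(0);

	return 0;
}
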
 doc/guides/prog_guide/kernel_nic_interface.rst | 15 +++++++++++++++
 doc/guides/rel_notes/release_19_11.rst         | 15 ++++++++++++++-
 lib/librte_eal/linux/eal/eal.c                 | 23 ++++++++++++++++-------
 lib/librte_kni/rte_kni.c                       |  6 ++++++
 4 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 2fd58e1..3bed18f 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -300,6 +300,21 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI operates in IOVA_VA scheme when
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) and
+- eal option `iova-mode=va` is passed or bus IOVA scheme in the DPDK is selected
+  as RTE_IOVA_VA.
+
+KNI loopback mode tests will have performance impact in this mode due to IOVA to
+KVA address translations. However, In KNI real world use cases, the performace
+impact will be based on Linux kernel stack and scheduler latencies. Performance
+varies based on the KNI use case. If bus iommu scheme is IOVA_DC and KNI module
+is loaded, DPDK chooses IOVA as PA as existing behaviour.
+
 Ethtool
 -------
 
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 682c1bd..c207c93 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -292,8 +292,21 @@ New Features
   compilers store their internal representation of the source code that
   the linker uses at the final stage of compilation process.
 
-  See :doc:`../prog_guide/lto` for more information:
+* **Added IOVA as VA support for KNI.**
+
+  * Added IOVA = VA support for KNI, KNI can operate in IOVA = VA mode when
+    `iova-mode=va` eal option is passed to the application or when bus IOVA
+    scheme is selected as RTE_IOVA_VA. This mode only works on Linux Kernel
+    versions 4.6.0 and above.
 
+  * KNI loopback mode tests will have performance impact in this mode due
+    to IOVA to KVA address translations. However, In KNI real world use cases,
+    the performace impact will be based on Linux kernel stack and scheduler
+    latencies. Performance varies based on the KNI use case. If bus iommu
+    scheme is IOVA_DC and KNI module is loaded, DPDK chooses IOVA as PA as
+    existing behaviour.
+
+  See :doc:`../prog_guide/lto` for more information:
 
 
 Removed Items
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 9e2d50c..53ca84b 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1086,14 +1086,23 @@ rte_eal_init(int argc, char **argv)
 			}
 		}
 #ifdef RTE_LIBRTE_KNI
-		/* Workaround for KNI which requires physical address to work */
-		if (iova_mode == RTE_IOVA_VA &&
-				rte_eal_check_module("rte_kni") == 1) {
-			if (phys_addrs) {
+		if (rte_eal_check_module("rte_kni") == 1) {
+#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
+			if (iova_mode == RTE_IOVA_VA) {
 				iova_mode = RTE_IOVA_PA;
-				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
-			} else {
-				RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
+				RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because "
+						"Kernel version supports only 'PA' mode for KNI module\n");
+			}
+#endif
+			if (rte_bus_get_iommu_class() == RTE_IOVA_DC)
+				iova_mode = RTE_IOVA_PA;
+
+			if (iova_mode == RTE_IOVA_PA) {
+				if (phys_addrs && is_iommu_enabled())
+					RTE_LOG(WARNING, EAL, "Forced IOVA as 'PA' because KNI module is loaded\n");
+
+				if (!phys_addrs)
+					RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
 			}
 		}
 #endif
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 7fbcf22..a609d2f 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -6,6 +6,8 @@
 #error "KNI is not supported"
 #endif
 
+#include <linux/version.h>
+
 #include <string.h>
 #include <fcntl.h>
 #include <unistd.h>
@@ -97,10 +99,12 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
+#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
 	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
 		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
 		return -1;
 	}
+#endif
 
 	/* Check FD and open */
 	if (kni_fd < 0) {
@@ -302,6 +306,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode vattunuru
@ 2019-11-15 12:11                             ` Ferruh Yigit
  2019-11-15 12:59                             ` David Marchand
  1 sibling, 0 replies; 251+ messages in thread
From: Ferruh Yigit @ 2019-11-15 12:11 UTC (permalink / raw)
  To: vattunuru, dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, anatoly.burakov,
	arybchenko, stephen, david.marchand

On 11/15/2019 11:18 AM, vattunuru@marvell.com wrote:
> From: Vamsi Attunuru <vattunuru@marvell.com>
> 
> Current KNI implementation only operates in IOVA_PA mode
> patch adds required functionality to enable KNI in
> IOVA_VA mode.
> 
> KNI loopback mode tests will have performance impact in
> this mode due to IOVA to KVA address translations.
> However, In KNI real world use cases, the performace
> impact will be based on Linux kernel stack and scheduler
> latencies. Performance varies based on the KNI use case.
> If bus iommu scheme is IOVA_DC and KNI module is loaded,
> DPDK chooses IOVA as PA as existing behaviour.
> 
> During KNI creation, app's iova_mode details are passed to
> the KNI kernel module, accordingly kernel module translates
> PA/IOVA addresses to KVA and vice-versa.
> 
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> ---
>  doc/guides/prog_guide/kernel_nic_interface.rst | 15 +++++++++++++++
>  doc/guides/rel_notes/release_19_11.rst         | 15 ++++++++++++++-
>  lib/librte_eal/linux/eal/eal.c                 | 23 ++++++++++++++++-------
>  lib/librte_kni/rte_kni.c                       |  6 ++++++
>  4 files changed, 51 insertions(+), 8 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
> index 2fd58e1..3bed18f 100644
> --- a/doc/guides/prog_guide/kernel_nic_interface.rst
> +++ b/doc/guides/prog_guide/kernel_nic_interface.rst
> @@ -300,6 +300,21 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
>  The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
>  It then puts the mbuf back in the cache.
>  
> +IOVA = VA: Support
> +------------------
> +
> +KNI operates in IOVA_VA scheme when
> +
> +- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) and
> +- eal option `iova-mode=va` is passed or bus IOVA scheme in the DPDK is selected
> +  as RTE_IOVA_VA.
> +
> +KNI loopback mode tests will have performance impact in this mode due to IOVA to
> +KVA address translations. However, In KNI real world use cases, the performace
> +impact will be based on Linux kernel stack and scheduler latencies. Performance
> +varies based on the KNI use case. If bus iommu scheme is IOVA_DC and KNI module
> +is loaded, DPDK chooses IOVA as PA as existing behaviour.

I understand you are trying to downplay the effect, but mentioning the Linux
kernel stack and scheduler latency etc. is confusing I think. Also, I am for
mentioning the availability of the eal "--iova-mode=pa" option; what about
the following, simpler message:

"
Due to IOVA to KVA address translations, based on the KNI use case there can be
a performance impact. For mitigation forcing IOVA to PA via eal "--iova-mode=pa"
option can be used.
"

The IOVA_DC bus iommu scheme resulting in IOVA as PA can be another paragraph I
think, as a single sentence.


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode vattunuru
  2019-11-15 12:11                             ` Ferruh Yigit
@ 2019-11-15 12:59                             ` David Marchand
  2019-11-15 13:35                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
  2019-11-15 13:40                               ` [dpdk-dev] " Jerin Jacob
  1 sibling, 2 replies; 251+ messages in thread
From: David Marchand @ 2019-11-15 12:59 UTC (permalink / raw)
  To: Vamsi Attunuru, Yigit, Ferruh
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Olivier Matz, Burakov, Anatoly,
	Andrew Rybchenko, Stephen Hemminger

I can't see an interest in splitting this patch from the kmod update.
Ferruh, what do you think?


On Fri, Nov 15, 2019 at 12:19 PM <vattunuru@marvell.com> wrote:
>
> From: Vamsi Attunuru <vattunuru@marvell.com>
>
> Current KNI implementation only operates in IOVA_PA mode
> patch adds required functionality to enable KNI in
> IOVA_VA mode.
>
> KNI loopback mode tests will have performance impact in
> this mode due to IOVA to KVA address translations.
> However, In KNI real world use cases, the performace

performance

> impact will be based on Linux kernel stack and scheduler
> latencies. Performance varies based on the KNI use case.
> If bus iommu scheme is IOVA_DC and KNI module is loaded,
> DPDK chooses IOVA as PA as existing behaviour.
>
> During KNI creation, app's iova_mode details are passed to
> the KNI kernel module, accordingly kernel module translates
> PA/IOVA addresses to KVA and vice-versa.
>
> Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> ---
>  doc/guides/prog_guide/kernel_nic_interface.rst | 15 +++++++++++++++
>  doc/guides/rel_notes/release_19_11.rst         | 15 ++++++++++++++-
>  lib/librte_eal/linux/eal/eal.c                 | 23 ++++++++++++++++-------
>  lib/librte_kni/rte_kni.c                       |  6 ++++++
>  4 files changed, 51 insertions(+), 8 deletions(-)
>

[snip]

> diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
> index 9e2d50c..53ca84b 100644
> --- a/lib/librte_eal/linux/eal/eal.c
> +++ b/lib/librte_eal/linux/eal/eal.c
> @@ -1086,14 +1086,23 @@ rte_eal_init(int argc, char **argv)
>                         }
>                 }
>  #ifdef RTE_LIBRTE_KNI
> -               /* Workaround for KNI which requires physical address to work */
> -               if (iova_mode == RTE_IOVA_VA &&
> -                               rte_eal_check_module("rte_kni") == 1) {
> -                       if (phys_addrs) {
> +               if (rte_eal_check_module("rte_kni") == 1) {
> +#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
> +                       if (iova_mode == RTE_IOVA_VA) {
>                                 iova_mode = RTE_IOVA_PA;
> -                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
> -                       } else {
> -                               RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
> +                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because "
> +                                               "Kernel version supports only 'PA' mode for KNI module\n");
> +                       }
> +#endif
> +                       if (rte_bus_get_iommu_class() == RTE_IOVA_DC)
> +                               iova_mode = RTE_IOVA_PA;

If physical addresses are unavailable, this code forces PA anyway.


> +
> +                       if (iova_mode == RTE_IOVA_PA) {
> +                               if (phys_addrs && is_iommu_enabled())
> +                                       RTE_LOG(WARNING, EAL, "Forced IOVA as 'PA' because KNI module is loaded\n");
> +
> +                               if (!phys_addrs)
> +                                       RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
>                         }

Checking physical address availability afterwards and having a log is not enough.


So far, KNI could not work with IOVA as VA.
Your patchset adds support for IOVA as VA if kernel is >= 4.6.
Repeating my proposal (as far as eal.c is concerned) of just changing:

@@ -1085,7 +1085,7 @@ rte_eal_init(int argc, char **argv)
                                RTE_LOG(DEBUG, EAL, "IOMMU is not
available, selecting IOVA as PA mode.\n");
                        }
                }
-#ifdef RTE_LIBRTE_KNI
+#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
                /* Workaround for KNI which requires physical address to work */
                if (iova_mode == RTE_IOVA_VA &&



--
David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re:  [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 12:59                             ` David Marchand
@ 2019-11-15 13:35                               ` Vamsi Krishna Attunuru
  2019-11-15 13:40                                 ` David Marchand
  2019-11-15 13:40                               ` [dpdk-dev] " Jerin Jacob
  1 sibling, 1 reply; 251+ messages in thread
From: Vamsi Krishna Attunuru @ 2019-11-15 13:35 UTC (permalink / raw)
  To: David Marchand, Yigit, Ferruh
  Cc: dev, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Olivier Matz, Burakov, Anatoly,
	Andrew Rybchenko, Stephen Hemminger


> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Friday, November 15, 2019 6:29 PM
> To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Yigit, Ferruh
> <ferruh.yigit@intel.com>
> Cc: dev <dev@dpdk.org>; Thomas Monjalon <thomas@monjalon.net>; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> <kirankumark@marvell.com>; Olivier Matz <olivier.matz@6wind.com>;
> Burakov, Anatoly <anatoly.burakov@intel.com>; Andrew Rybchenko
> <arybchenko@solarflare.com>; Stephen Hemminger
> <stephen@networkplumber.org>
> Subject: [EXT] Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
> 
> External Email
> 
> ----------------------------------------------------------------------
> I can't see an interest in splitting this patch from the kmod update.
> Ferruh, what do you think?
> 
> 
> On Fri, Nov 15, 2019 at 12:19 PM <vattunuru@marvell.com> wrote:
> >
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Current KNI implementation only operates in IOVA_PA mode patch adds
> > required functionality to enable KNI in IOVA_VA mode.
> >
> > KNI loopback mode tests will have performance impact in this mode due
> > to IOVA to KVA address translations.
> > However, In KNI real world use cases, the performace
> 
> performance
> 
> > impact will be based on Linux kernel stack and scheduler latencies.
> > Performance varies based on the KNI use case.
> > If bus iommu scheme is IOVA_DC and KNI module is loaded, DPDK chooses
> > IOVA as PA as existing behaviour.
> >
> > During KNI creation, app's iova_mode details are passed to the KNI
> > kernel module, accordingly kernel module translates PA/IOVA addresses
> > to KVA and vice-versa.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> > ---
> >  doc/guides/prog_guide/kernel_nic_interface.rst | 15 +++++++++++++++
> >  doc/guides/rel_notes/release_19_11.rst         | 15 ++++++++++++++-
> >  lib/librte_eal/linux/eal/eal.c                 | 23 ++++++++++++++++-------
> >  lib/librte_kni/rte_kni.c                       |  6 ++++++
> >  4 files changed, 51 insertions(+), 8 deletions(-)
> >
> 
> [snip]
> 
> > diff --git a/lib/librte_eal/linux/eal/eal.c
> > b/lib/librte_eal/linux/eal/eal.c index 9e2d50c..53ca84b 100644
> > --- a/lib/librte_eal/linux/eal/eal.c
> > +++ b/lib/librte_eal/linux/eal/eal.c
> > @@ -1086,14 +1086,23 @@ rte_eal_init(int argc, char **argv)
> >                         }
> >                 }
> >  #ifdef RTE_LIBRTE_KNI
> > -               /* Workaround for KNI which requires physical address to work */
> > -               if (iova_mode == RTE_IOVA_VA &&
> > -                               rte_eal_check_module("rte_kni") == 1) {
> > -                       if (phys_addrs) {
> > +               if (rte_eal_check_module("rte_kni") == 1) { #if
> > +KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
> > +                       if (iova_mode == RTE_IOVA_VA) {
> >                                 iova_mode = RTE_IOVA_PA;
> > -                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI
> module is loaded\n");
> > -                       } else {
> > -                               RTE_LOG(DEBUG, EAL, "KNI can not work since physical
> addresses are unavailable\n");
> > +                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because "
> > +                                               "Kernel version supports only 'PA' mode for KNI
> module\n");
> > +                       }
> > +#endif
> > +                       if (rte_bus_get_iommu_class() == RTE_IOVA_DC)
> > +                               iova_mode = RTE_IOVA_PA;
> 
> If physical addresses are unavailable, this code forces PA anyway.
> 
> 
> > +
> > +                       if (iova_mode == RTE_IOVA_PA) {
> > +                               if (phys_addrs && is_iommu_enabled())
> > +                                       RTE_LOG(WARNING, EAL, "Forced
> > + IOVA as 'PA' because KNI module is loaded\n");
> > +
> > +                               if (!phys_addrs)
> > +                                       RTE_LOG(DEBUG, EAL, "KNI can
> > + not work since physical addresses are unavailable\n");
> >                         }
> 
> Checking physical addresses availability after, and having a log is not enough.
> 
> 
> So far, KNI could not work with IOVA as VA.
> Your patchset adds support for IOVA as VA if kernel is >= 4.6.
> Repeating my proposal (as far as eal.c is concerned) of just changing:

To keep the existing behavior intact when bus iommu returns IOVA_DC, I had to handle that case here as well.

How about the below scheme:

#ifdef RTE_LIBRTE_KNI
-               /* Workaround for KNI which requires physical address to work */
-               if (iova_mode == RTE_IOVA_VA &&
-                               rte_eal_check_module("rte_kni") == 1) {
-                       if (phys_addrs) {
-                               iova_mode = RTE_IOVA_PA;
-                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
-                       } else {
-                               RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
+               if (rte_eal_check_module("rte_kni") == 1) {
+                       bool force_iova_as_pa = false;
+#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
+                       if (iova_mode == RTE_IOVA_VA) {
+                               force_iova_as_pa = true;
+                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because "
+                                               "Kernel version supports only 'PA' mode for KNI module\n");
+                       }
+#endif
+                       if (rte_bus_get_iommu_class() == RTE_IOVA_DC)
+                               force_iova_as_pa = true;
+
+                       if (force_iova_as_pa) {
+                               if (phys_addrs) {
+                                       iova_mode = RTE_IOVA_PA;
+                                       RTE_LOG(WARNING, EAL, "Forced IOVA as 'PA' because KNI module is loaded\n");
+                               } else
+                                       RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
                        }
                }
 #endif



> 
> @@ -1085,7 +1085,7 @@ rte_eal_init(int argc, char **argv)
>                                 RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting
> IOVA as PA mode.\n");
>                         }
>                 }
> -#ifdef RTE_LIBRTE_KNI
> +#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) >
> +LINUX_VERSION_CODE
>                 /* Workaround for KNI which requires physical address to work */
>                 if (iova_mode == RTE_IOVA_VA &&
> 
> 
> 
> --
> David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [EXT] Re:  [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 13:35                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-11-15 13:40                                 ` David Marchand
  0 siblings, 0 replies; 251+ messages in thread
From: David Marchand @ 2019-11-15 13:40 UTC (permalink / raw)
  To: Vamsi Krishna Attunuru
  Cc: Yigit, Ferruh, dev, Thomas Monjalon, Jerin Jacob Kollanukkaran,
	Kiran Kumar Kokkilagadda, Olivier Matz, Burakov, Anatoly,
	Andrew Rybchenko, Stephen Hemminger

On Fri, Nov 15, 2019 at 2:36 PM Vamsi Krishna Attunuru
<vattunuru@marvell.com> wrote:
>
>
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > Sent: Friday, November 15, 2019 6:29 PM
> > To: Vamsi Krishna Attunuru <vattunuru@marvell.com>; Yigit, Ferruh
> > <ferruh.yigit@intel.com>
> > Cc: dev <dev@dpdk.org>; Thomas Monjalon <thomas@monjalon.net>; Jerin
> > Jacob Kollanukkaran <jerinj@marvell.com>; Kiran Kumar Kokkilagadda
> > <kirankumark@marvell.com>; Olivier Matz <olivier.matz@6wind.com>;
> > Burakov, Anatoly <anatoly.burakov@intel.com>; Andrew Rybchenko
> > <arybchenko@solarflare.com>; Stephen Hemminger
> > <stephen@networkplumber.org>
> > Subject: [EXT] Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > I can't see an interest in splitting this patch from the kmod update.
> > Ferruh, what do you think?
> >
> >
> > On Fri, Nov 15, 2019 at 12:19 PM <vattunuru@marvell.com> wrote:
> > >
> > > From: Vamsi Attunuru <vattunuru@marvell.com>
> > >
> > > Current KNI implementation only operates in IOVA_PA mode patch adds
> > > required functionality to enable KNI in IOVA_VA mode.
> > >
> > > KNI loopback mode tests will have performance impact in this mode due
> > > to IOVA to KVA address translations.
> > > However, In KNI real world use cases, the performace
> >
> > performance
> >
> > > impact will be based on Linux kernel stack and scheduler latencies.
> > > Performance varies based on the KNI use case.
> > > If bus iommu scheme is IOVA_DC and KNI module is loaded, DPDK chooses
> > > IOVA as PA as existing behaviour.
> > >
> > > During KNI creation, app's iova_mode details are passed to the KNI
> > > kernel module, accordingly kernel module translates PA/IOVA addresses
> > > to KVA and vice-versa.
> > >
> > > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > > Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> > > ---
> > >  doc/guides/prog_guide/kernel_nic_interface.rst | 15 +++++++++++++++
> > >  doc/guides/rel_notes/release_19_11.rst         | 15 ++++++++++++++-
> > >  lib/librte_eal/linux/eal/eal.c                 | 23 ++++++++++++++++-------
> > >  lib/librte_kni/rte_kni.c                       |  6 ++++++
> > >  4 files changed, 51 insertions(+), 8 deletions(-)
> > >
> >
> > [snip]
> >
> > > diff --git a/lib/librte_eal/linux/eal/eal.c
> > > b/lib/librte_eal/linux/eal/eal.c index 9e2d50c..53ca84b 100644
> > > --- a/lib/librte_eal/linux/eal/eal.c
> > > +++ b/lib/librte_eal/linux/eal/eal.c
> > > @@ -1086,14 +1086,23 @@ rte_eal_init(int argc, char **argv)
> > >                         }
> > >                 }
> > >  #ifdef RTE_LIBRTE_KNI
> > > -               /* Workaround for KNI which requires physical address to work */
> > > -               if (iova_mode == RTE_IOVA_VA &&
> > > -                               rte_eal_check_module("rte_kni") == 1) {
> > > -                       if (phys_addrs) {
> > > +               if (rte_eal_check_module("rte_kni") == 1) { #if
> > > +KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
> > > +                       if (iova_mode == RTE_IOVA_VA) {
> > >                                 iova_mode = RTE_IOVA_PA;
> > > -                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI
> > module is loaded\n");
> > > -                       } else {
> > > -                               RTE_LOG(DEBUG, EAL, "KNI can not work since physical
> > addresses are unavailable\n");
> > > +                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because "
> > > +                                               "Kernel version supports only 'PA' mode for KNI
> > module\n");
> > > +                       }
> > > +#endif
> > > +                       if (rte_bus_get_iommu_class() == RTE_IOVA_DC)
> > > +                               iova_mode = RTE_IOVA_PA;
> >
> > If physical addresses are unavailable, this code forces PA anyway.
> >
> >
> > > +
> > > +                       if (iova_mode == RTE_IOVA_PA) {
> > > +                               if (phys_addrs && is_iommu_enabled())
> > > +                                       RTE_LOG(WARNING, EAL, "Forced
> > > + IOVA as 'PA' because KNI module is loaded\n");
> > > +
> > > +                               if (!phys_addrs)
> > > +                                       RTE_LOG(DEBUG, EAL, "KNI can
> > > + not work since physical addresses are unavailable\n");
> > >                         }
> >
> > Checking physical addresses availability after, and having a log is not enough.
> >
> >
> > So far, KNI could not work with IOVA as VA.
> > Your patchset adds support for IOVA as VA if kernel is >= 4.6.
> > Repeating my proposal (as far as eal.c is concerned) of just changing:
>
> To keep the existing behavior intact when bus iommu returns IOVA_DC, had to handle those case also here.

No.
If you want to change something, do it in a separate patch with full
explanation.

-- 
David Marchand


^ permalink raw reply	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 12:59                             ` David Marchand
  2019-11-15 13:35                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
@ 2019-11-15 13:40                               ` Jerin Jacob
  2019-11-15 14:56                                 ` David Marchand
  1 sibling, 1 reply; 251+ messages in thread
From: Jerin Jacob @ 2019-11-15 13:40 UTC (permalink / raw)
  To: David Marchand
  Cc: Vamsi Attunuru, Yigit, Ferruh, dev, Thomas Monjalon,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Olivier Matz, Burakov, Anatoly, Andrew Rybchenko,
	Stephen Hemminger

On Fri, Nov 15, 2019 at 6:29 PM David Marchand
<david.marchand@redhat.com> wrote:
>
> I can't see an interest in splitting this patch from the kmod update.
> Ferruh, what do you think?
>
>
> On Fri, Nov 15, 2019 at 12:19 PM <vattunuru@marvell.com> wrote:
> >
> > From: Vamsi Attunuru <vattunuru@marvell.com>
> >
> > Current KNI implementation only operates in IOVA_PA mode
> > patch adds required functionality to enable KNI in
> > IOVA_VA mode.
> >
> > KNI loopback mode tests will have performance impact in
> > this mode due to IOVA to KVA address translations.
> > However, In KNI real world use cases, the performace
>
> performance
>
> > impact will be based on Linux kernel stack and scheduler
> > latencies. Performance varies based on the KNI use case.
> > If bus iommu scheme is IOVA_DC and KNI module is loaded,
> > DPDK chooses IOVA as PA as existing behaviour.
> >
> > During KNI creation, app's iova_mode details are passed to
> > the KNI kernel module, accordingly kernel module translates
> > PA/IOVA addresses to KVA and vice-versa.
> >
> > Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
> > Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
> > Suggested-by: Ferruh Yigit <ferruh.yigit@intel.com>
> > ---
> >  doc/guides/prog_guide/kernel_nic_interface.rst | 15 +++++++++++++++
> >  doc/guides/rel_notes/release_19_11.rst         | 15 ++++++++++++++-
> >  lib/librte_eal/linux/eal/eal.c                 | 23 ++++++++++++++++-------
> >  lib/librte_kni/rte_kni.c                       |  6 ++++++
> >  4 files changed, 51 insertions(+), 8 deletions(-)
> >
>
> [snip]
>
> > diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
> > index 9e2d50c..53ca84b 100644
> > --- a/lib/librte_eal/linux/eal/eal.c
> > +++ b/lib/librte_eal/linux/eal/eal.c
> > @@ -1086,14 +1086,23 @@ rte_eal_init(int argc, char **argv)
> >                         }
> >                 }
> >  #ifdef RTE_LIBRTE_KNI
> > -               /* Workaround for KNI which requires physical address to work */
> > -               if (iova_mode == RTE_IOVA_VA &&
> > -                               rte_eal_check_module("rte_kni") == 1) {
> > -                       if (phys_addrs) {
> > +               if (rte_eal_check_module("rte_kni") == 1) {
> > +#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
> > +                       if (iova_mode == RTE_IOVA_VA) {
> >                                 iova_mode = RTE_IOVA_PA;
> > -                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
> > -                       } else {
> > -                               RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
> > +                               RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because "
> > +                                               "Kernel version supports only 'PA' mode for KNI module\n");
> > +                       }
> > +#endif
> > +                       if (rte_bus_get_iommu_class() == RTE_IOVA_DC)
> > +                               iova_mode = RTE_IOVA_PA;
>
> If physical addresses are unavailable, this code forces PA anyway.
>
>
> > +
> > +                       if (iova_mode == RTE_IOVA_PA) {
> > +                               if (phys_addrs && is_iommu_enabled())
> > +                                       RTE_LOG(WARNING, EAL, "Forced IOVA as 'PA' because KNI module is loaded\n");
> > +
> > +                               if (!phys_addrs)
> > +                                       RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
> >                         }
>
> Checking physical addresses availability after, and having a log is not enough.
>
>
> So far, KNI could not work with IOVA as VA.
> Your patchset adds support for IOVA as VA if kernel is >= 4.6.
> Repeating my proposal (as far as eal.c is concerned) of just changing:

We need to achieve the following.

IOVA as PA has performance implications for the KNI case, so we need to
make IOVA as PA when the KNI module is loaded.
Your suggestion makes IOVA as PA when the bus returns IOVA as VA, due to
the fact that the KNI check comes later. We need to move that up.

How about the following

[master]dell[dpdk.org] $ git diff
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 9e2d50cfb..085fde767 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1064,6 +1064,21 @@ rte_eal_init(int argc, char **argv)
                /* autodetect the IOVA mapping mode */
                enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();

+#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
+               /* IOVA as PA gives better performance for KNI. Choose IOVA as
+                 * PA when bus returns RTE_IOVA_DC and KNI module is present.
+                 */
+               if (iova_mode == RTE_IOVA_DC &&
+                               rte_eal_check_module("rte_kni") == 1) {
+                       if (phys_addrs) {
+                               iova_mode = RTE_IOVA_PA;
+                               RTE_LOG(WARNING, EAL, "Forcing IOVA as
'PA' because KNI module is loaded\n");
+                       } else {
+                               RTE_LOG(DEBUG, EAL, "KNI can not work
since physical addresses are unavailable\n");
+                       }
+               }
+#endif
+
                if (iova_mode == RTE_IOVA_DC) {
                        RTE_LOG(DEBUG, EAL, "Buses did not request a
specific IOVA mode.\n");

@@ -1085,18 +1100,6 @@ rte_eal_init(int argc, char **argv)
                                RTE_LOG(DEBUG, EAL, "IOMMU is not
available, selecting IOVA as PA mode.\n");
                        }
                }
-#ifdef RTE_LIBRTE_KNI
-               /* Workaround for KNI which requires physical address to work */
-               if (iova_mode == RTE_IOVA_VA &&
-                               rte_eal_check_module("rte_kni") == 1) {
-                       if (phys_addrs) {
-                               iova_mode = RTE_IOVA_PA;
-                               RTE_LOG(WARNING, EAL, "Forcing IOVA as
'PA' because KNI module is loaded\n");
-                       } else {
-                               RTE_LOG(DEBUG, EAL, "KNI can not work
since physical addresses are unavailable\n");
-                       }
-               }
-#endif
                rte_eal_get_configuration()->iova_mode = iova_mode;
        } else {
                rte_eal_get_configuration()->iova_mode =
[master]dell[dpdk.org] $
>
> @@ -1085,7 +1085,7 @@ rte_eal_init(int argc, char **argv)
>                                 RTE_LOG(DEBUG, EAL, "IOMMU is not
> available, selecting IOVA as PA mode.\n");
>                         }
>                 }
> -#ifdef RTE_LIBRTE_KNI
> +#if defined(RTE_LIBRTE_KNI) && KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
>                 /* Workaround for KNI which requires physical address to work */
>                 if (iova_mode == RTE_IOVA_VA &&
>
>
>
> --
> David Marchand
>

^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 13:40                               ` [dpdk-dev] " Jerin Jacob
@ 2019-11-15 14:56                                 ` David Marchand
  2019-11-15 15:22                                   ` Jerin Jacob
  0 siblings, 1 reply; 251+ messages in thread
From: David Marchand @ 2019-11-15 14:56 UTC (permalink / raw)
  To: Jerin Jacob
  Cc: Vamsi Attunuru, Yigit, Ferruh, dev, Thomas Monjalon,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Olivier Matz, Burakov, Anatoly, Andrew Rybchenko,
	Stephen Hemminger

On Fri, Nov 15, 2019 at 2:40 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
> On Fri, Nov 15, 2019 at 6:29 PM David Marchand
> <david.marchand@redhat.com> wrote:
> > So far, KNI could not work with IOVA as VA.
> > Your patchset adds support for IOVA as VA if kernel is >= 4.6.
>
> We need achive the following.
>
> IOVA as PA  has performance implication on KNI case. So we need to
> make IOVA as PA when KNI module is loaded.

- The current behaviour is:
  * buses announce PA, nothing to do wrt KNI,
  * buses announce VA or DC (whatever the considerations on iommu), if
physical addresses are available, then PA is used and KNI works,

- Now, with this new feature (on kernels >= 4.6), we can have KNI work
with VA, so the previous workaround can be skipped.
There is another consideration wrt performance: as a consequence, for
kernels >= 4.6, if physical addresses are available, we can select PA for
KNI.

_But_ I did not see people complaining about the current behavior.
I see no reason to change this if the VA support can't be used (kernels < 4.6).

So how about (I inverted the #if LINUX_VERSION_CODE >=
KERNEL_VERSION(4, 6, 0), it was causing me headaches):

diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 9e2d50cfb..33f3c674c 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1073,6 +1073,11 @@ rte_eal_init(int argc, char **argv)
                                 */
                                iova_mode = RTE_IOVA_VA;
                                RTE_LOG(DEBUG, EAL, "Physical
addresses are unavailable, selecting IOVA as VA mode.\n");
+#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+                       } else if (rte_eal_check_module("rte_kni") == 1) {
+                               iova_mode = RTE_IOVA_PA;
+                               RTE_LOG(DEBUG, EAL, "Forcing IOVA as
'PA' for better performance with KNI\n");
+#endif
                        } else if (is_iommu_enabled()) {
                                /* we have an IOMMU, pick IOVA as VA mode */
                                iova_mode = RTE_IOVA_VA;
@@ -1085,7 +1090,7 @@ rte_eal_init(int argc, char **argv)
                                RTE_LOG(DEBUG, EAL, "IOMMU is not
available, selecting IOVA as PA mode.\n");
                        }
                }
-#ifdef RTE_LIBRTE_KNI
-               /* Workaround for KNI which requires physical address to work */
+#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0)
+               /* Workaround for KNI which requires physical address
to work with kernels < 4.6 */
                if (iova_mode == RTE_IOVA_VA &&
                                rte_eal_check_module("rte_kni") == 1) {


-- 
David Marchand


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* Re: [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode
  2019-11-15 14:56                                 ` David Marchand
@ 2019-11-15 15:22                                   ` Jerin Jacob
  0 siblings, 0 replies; 251+ messages in thread
From: Jerin Jacob @ 2019-11-15 15:22 UTC (permalink / raw)
  To: David Marchand
  Cc: Vamsi Attunuru, Yigit, Ferruh, dev, Thomas Monjalon,
	Jerin Jacob Kollanukkaran, Kiran Kumar Kokkilagadda,
	Olivier Matz, Burakov, Anatoly, Andrew Rybchenko,
	Stephen Hemminger

On Fri, Nov 15, 2019 at 8:27 PM David Marchand
<david.marchand@redhat.com> wrote:
>
> On Fri, Nov 15, 2019 at 2:40 PM Jerin Jacob <jerinjacobk@gmail.com> wrote:
> > On Fri, Nov 15, 2019 at 6:29 PM David Marchand
> > <david.marchand@redhat.com> wrote:
> > > So far, KNI could not work with IOVA as VA.
> > > Your patchset adds support for IOVA as VA if kernel is >= 4.6.
> >
> > We need achive the following.
> >
> > IOVA as PA  has performance implication on KNI case. So we need to
> > make IOVA as PA when KNI module is loaded.
>
> - The current behaviour is:
>   * buses announce PA, nothing to do wrt KNI,
>   * buses announce VA or DC (whatever the considerations on iommu), if
> physical addresses are available, then PA is used and KNI works,
>
> - Now, with this new feature (on kernels >= 4.6), we can have KNI work
> with VA, the previous workaround can be skipped.
> There is another consideration wrt performance, as a consequence, for
> kernels 4.6, if physical addresses are available, we can select PA for
> KNI.
>
> _But_ I did not see people complaining about the current behavior.
> I see no reason to change this if the VA support can't be used (kernels < 4.6).
>
> So how about (I inverted the #if LINUX_VERSION_CODE >=
> KERNEL_VERSION(4, 6, 0), it was causing me headaches):


That works too, David. Thanks for the review.

>
> diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
> index 9e2d50cfb..33f3c674c 100644
> --- a/lib/librte_eal/linux/eal/eal.c
> +++ b/lib/librte_eal/linux/eal/eal.c
> @@ -1073,6 +1073,11 @@ rte_eal_init(int argc, char **argv)
>                                  */
>                                 iova_mode = RTE_IOVA_VA;
>                                 RTE_LOG(DEBUG, EAL, "Physical
> addresses are unavailable, selecting IOVA as VA mode.\n");
> +#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
> +                       } else if (rte_eal_check_module("rte_kni") == 1) {
> +                               iova_mode = RTE_IOVA_PA;
> +                               RTE_LOG(DEBUG, EAL, "Forcing IOVA as
> 'PA' for better performance with KNI\n");
> +#endif
>                         } else if (is_iommu_enabled()) {
>                                 /* we have an IOMMU, pick IOVA as VA mode */
>                                 iova_mode = RTE_IOVA_VA;
> @@ -1085,7 +1090,7 @@ rte_eal_init(int argc, char **argv)
>                                 RTE_LOG(DEBUG, EAL, "IOMMU is not
> available, selecting IOVA as PA mode.\n");
>                         }
>                 }
> -#ifdef RTE_LIBRTE_KNI
> -               /* Workaround for KNI which requires physical address to work */
> +#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0)
> +               /* Workaround for KNI which requires physical address
> to work with kernels < 4.6 */
>                 if (iova_mode == RTE_IOVA_VA &&
>                                 rte_eal_check_module("rte_kni") == 1) {
>
>
> --
> David Marchand
>

^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v14 0/2] kni: support IOVA mode
  2019-11-15 11:18                         ` [dpdk-dev] [PATCH v13 0/2] kni: support IOVA mode vattunuru
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 1/2] kni: support IOVA mode in kernel module vattunuru
  2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode vattunuru
@ 2019-11-15 17:07                           ` vattunuru
  2019-11-15 17:07                             ` [dpdk-dev] [PATCH v14 1/2] kni: support IOVA mode in kernel module vattunuru
  2019-11-15 17:07                             ` [dpdk-dev] [PATCH v14 2/2] kni: support IOVA mode vattunuru
  2 siblings, 2 replies; 251+ messages in thread
From: vattunuru @ 2019-11-15 17:07 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, david.marchand,
	Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

---
V14 Changes:
* Simplified forcing IOVA as PA scheme.
* Updated documentation

V13 Changes:
* For the KNI case, to restore the existing behaviour, force IOVA as PA
if bus iommu returns IOVA_DC.
* Restored phys_addr and kni_lib definition checks.
* Simplified patch titles.
* Updated documentation about patch set's impact on performance and
kernel version requirement.

V12 Changes:
* Removed previously added `--legacy-kni` eal option.
* Removed previously added kni specific mempool create routines
and mempool populate routines.

V11 Changes:
* Added iova to kva address translation routines in kernel module to
make it work in iova=va mode which enables DPDK to create kni devices
on any kind of backed device/memory.
* Added ``--legacy-kni`` eal option to make existing KNI applications
work with DPDK 19.11 and later versions.
* Removed previously added pci device info from kni device info struct.
 
V10 Changes:
* Fixed function return code on failure when min_chunk_size > pg_sz.
* Marked new mempool populate routine as EXPERIMENTAL.
 
V9 Changes:
* Used rte_mempool_ops_calc_mem_size() instead of default handler in the
new mempool populate routine.
* Check min_chunk_size and return values.
* Removed ethdev_info memset to '0' and moved pci dev_info populate into
kni_dev_pci_addr_get() routine.
* Addressed misc. review comments.
 
V8 Changes:
* Remove default mempool populate() routine changes.
* Add kni app specific mempool create & free routines.
* Add new mempool populate routine to allocate page-aligned memzones
with page size to make sure all mempool objects reside on a page.
* Update release notes and map files.
 
V7 Changes:
* Removed previously proposed mempool flag and made those page
boundary checks default in mempool populate() except for the objects size
bigger than the size of page.
* Removed KNI example application related changes since pool related
requirement is taken care in mempool lib.
* All PCI dev related info is moved under rte_eal_iova_mode() == VA check.
* Added wrapper functions in KNI module to hide IOVA checks and make
address translation routines more readable.
* Updated IOVA mode checks that enforcing IOVA=PA mode when IOVA=VA
mode is enabled.
 
V6 Changes:
* Added new mempool flag to ensure mbuf memory is not scattered across
page boundaries.
* Added the PCI device information required by the KNI kernel module.
* Modified KNI example application to create mempool with new mempool
flag.
 
V5 changes:
* Fixed build issue with 32b build
 
V4 changes:
* Fixed build issues with older kernel versions
* This approach will only work with kernels above 4.4.0

David Marchand (1):
  kni: support IOVA mode

Vamsi Attunuru (1):
  kni: support IOVA mode in kernel module

 doc/guides/prog_guide/kernel_nic_interface.rst    | 14 +++++
 doc/guides/rel_notes/release_19_11.rst            | 13 ++++-
 kernel/linux/kni/compat.h                         | 14 +++++
 kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
 kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
 kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
 lib/librte_eal/linux/eal/eal.c                    | 11 +++-
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 lib/librte_kni/rte_kni.c                          |  5 ++
 9 files changed, 176 insertions(+), 25 deletions(-)

-- 
2.8.4


^ permalink raw reply	[flat|nested] 251+ messages in thread

* [dpdk-dev] [PATCH v14 1/2] kni: support IOVA mode in kernel module
  2019-11-15 17:07                           ` [dpdk-dev] [PATCH v14 0/2] " vattunuru
@ 2019-11-15 17:07                             ` vattunuru
  2019-11-15 17:07                             ` [dpdk-dev] [PATCH v14 2/2] kni: support IOVA mode vattunuru
  1 sibling, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-11-15 17:07 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, david.marchand,
	Vamsi Attunuru

From: Vamsi Attunuru <vattunuru@marvell.com>

This patch adds support for the kernel module to work
in IOVA = VA mode by providing address translation
routines that convert IOVA (i.e. user-space VA) to
kernel virtual addresses.

When compared with IOVA = PA mode, KNI netdev ports
see no performance impact with this approach; there
is also no performance impact on IOVA = PA mode with
this patch.

This approach does not work with kernel versions
older than 4.6.0.

Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 kernel/linux/kni/compat.h                         | 14 +++++
 kernel/linux/kni/kni_dev.h                        | 42 +++++++++++++++
 kernel/linux/kni/kni_misc.c                       | 39 ++++++++++----
 kernel/linux/kni/kni_net.c                        | 62 ++++++++++++++++++-----
 lib/librte_eal/linux/eal/include/rte_kni_common.h |  1 +
 5 files changed, 136 insertions(+), 22 deletions(-)

diff --git a/kernel/linux/kni/compat.h b/kernel/linux/kni/compat.h
index 562d8bf..062b170 100644
--- a/kernel/linux/kni/compat.h
+++ b/kernel/linux/kni/compat.h
@@ -121,3 +121,17 @@
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
 #define HAVE_SIGNAL_FUNCTIONS_OWN_HEADER
 #endif
+
+#if KERNEL_VERSION(4, 6, 0) <= LINUX_VERSION_CODE
+
+#define HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+
+#if KERNEL_VERSION(4, 9, 0) > LINUX_VERSION_CODE
+#define GET_USER_PAGES_REMOTE_API_V1
+#elif KERNEL_VERSION(4, 9, 0) == LINUX_VERSION_CODE
+#define GET_USER_PAGES_REMOTE_API_V2
+#else
+#define GET_USER_PAGES_REMOTE_API_V3
+#endif
+
+#endif
diff --git a/kernel/linux/kni/kni_dev.h b/kernel/linux/kni/kni_dev.h
index c1ca678..fb641b6 100644
--- a/kernel/linux/kni/kni_dev.h
+++ b/kernel/linux/kni/kni_dev.h
@@ -41,6 +41,8 @@ struct kni_dev {
 	/* kni list */
 	struct list_head list;
 
+	uint8_t iova_mode;
+
 	uint32_t core_id;            /* Core ID to bind */
 	char name[RTE_KNI_NAMESIZE]; /* Network device name */
 	struct task_struct *pthread;
@@ -84,8 +86,48 @@ struct kni_dev {
 	void *va[MBUF_BURST_SZ];
 	void *alloc_pa[MBUF_BURST_SZ];
 	void *alloc_va[MBUF_BURST_SZ];
+
+	struct task_struct *usr_tsk;
 };
 
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+static inline phys_addr_t iova_to_phys(struct task_struct *tsk,
+				       unsigned long iova)
+{
+	phys_addr_t offset, phys_addr;
+	struct page *page = NULL;
+	long ret;
+
+	offset = iova & (PAGE_SIZE - 1);
+
+	/* Read one page struct info */
+#ifdef GET_USER_PAGES_REMOTE_API_V3
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    FOLL_TOUCH, &page, NULL, NULL);
+#endif
+#ifdef GET_USER_PAGES_REMOTE_API_V2
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    FOLL_TOUCH, &page, NULL);
+#endif
+#ifdef GET_USER_PAGES_REMOTE_API_V1
+	ret = get_user_pages_remote(tsk, tsk->mm, iova, 1,
+				    0, 0, &page, NULL);
+#endif
+	if (ret < 0)
+		return 0;
+
+	phys_addr = page_to_phys(page) | offset;
+	put_page(page);
+
+	return phys_addr;
+}
+
+static inline void *iova_to_kva(struct task_struct *tsk, unsigned long iova)
+{
+	return phys_to_virt(iova_to_phys(tsk, iova));
+}
+#endif
+
 void kni_net_release_fifo_phy(struct kni_dev *kni);
 void kni_net_rx(struct kni_dev *kni);
 void kni_net_init(struct net_device *dev);
diff --git a/kernel/linux/kni/kni_misc.c b/kernel/linux/kni/kni_misc.c
index 84ef03b..cda71bd 100644
--- a/kernel/linux/kni/kni_misc.c
+++ b/kernel/linux/kni/kni_misc.c
@@ -348,15 +348,36 @@ kni_ioctl_create(struct net *net, uint32_t ioctl_num,
 	strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
 
 	/* Translate user space info into kernel space info */
-	kni->tx_q = phys_to_virt(dev_info.tx_phys);
-	kni->rx_q = phys_to_virt(dev_info.rx_phys);
-	kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-	kni->free_q = phys_to_virt(dev_info.free_phys);
-
-	kni->req_q = phys_to_virt(dev_info.req_phys);
-	kni->resp_q = phys_to_virt(dev_info.resp_phys);
-	kni->sync_va = dev_info.sync_va;
-	kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+	if (dev_info.iova_mode) {
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+		kni->tx_q = iova_to_kva(current, dev_info.tx_phys);
+		kni->rx_q = iova_to_kva(current, dev_info.rx_phys);
+		kni->alloc_q = iova_to_kva(current, dev_info.alloc_phys);
+		kni->free_q = iova_to_kva(current, dev_info.free_phys);
+
+		kni->req_q = iova_to_kva(current, dev_info.req_phys);
+		kni->resp_q = iova_to_kva(current, dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = iova_to_kva(current, dev_info.sync_phys);
+		kni->usr_tsk = current;
+		kni->iova_mode = 1;
+#else
+		pr_err("KNI module does not support IOVA to VA translation\n");
+		return -EINVAL;
+#endif
+	} else {
+
+		kni->tx_q = phys_to_virt(dev_info.tx_phys);
+		kni->rx_q = phys_to_virt(dev_info.rx_phys);
+		kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
+		kni->free_q = phys_to_virt(dev_info.free_phys);
+
+		kni->req_q = phys_to_virt(dev_info.req_phys);
+		kni->resp_q = phys_to_virt(dev_info.resp_phys);
+		kni->sync_va = dev_info.sync_va;
+		kni->sync_kva = phys_to_virt(dev_info.sync_phys);
+		kni->iova_mode = 0;
+	}
 
 	kni->mbuf_size = dev_info.mbuf_size;
 
diff --git a/kernel/linux/kni/kni_net.c b/kernel/linux/kni/kni_net.c
index f25b127..1ba9b1b 100644
--- a/kernel/linux/kni/kni_net.c
+++ b/kernel/linux/kni/kni_net.c
@@ -36,6 +36,22 @@ static void kni_net_rx_normal(struct kni_dev *kni);
 /* kni rx function pointer, with default to normal rx */
 static kni_net_rx_t kni_net_rx_func = kni_net_rx_normal;
 
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+/* iova to kernel virtual address */
+static inline void *
+iova2kva(struct kni_dev *kni, void *iova)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, (unsigned long)iova));
+}
+
+static inline void *
+iova2data_kva(struct kni_dev *kni, struct rte_kni_mbuf *m)
+{
+	return phys_to_virt(iova_to_phys(kni->usr_tsk, m->buf_physaddr) +
+			    m->data_off);
+}
+#endif
+
 /* physical address to kernel virtual address */
 static void *
 pa2kva(void *pa)
@@ -62,6 +78,26 @@ kva2data_kva(struct rte_kni_mbuf *m)
 	return phys_to_virt(m->buf_physaddr + m->data_off);
 }
 
+static inline void *
+get_kva(struct kni_dev *kni, void *pa)
+{
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+	if (kni->iova_mode == 1)
+		return iova2kva(kni, pa);
+#endif
+	return pa2kva(pa);
+}
+
+static inline void *
+get_data_kva(struct kni_dev *kni, void *pkt_kva)
+{
+#ifdef HAVE_IOVA_TO_KVA_MAPPING_SUPPORT
+	if (kni->iova_mode == 1)
+		return iova2data_kva(kni, pkt_kva);
+#endif
+	return kva2data_kva(pkt_kva);
+}
+
 /*
  * It can be called to process the request.
  */
@@ -178,7 +214,7 @@ kni_fifo_trans_pa2va(struct kni_dev *kni,
 			return;
 
 		for (i = 0; i < num_rx; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			kva_nb_segs = kva->nb_segs;
@@ -266,8 +302,8 @@ kni_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (likely(ret == 1)) {
 		void *data_kva;
 
-		pkt_kva = pa2kva(pkt_pa);
-		data_kva = kva2data_kva(pkt_kva);
+		pkt_kva = get_kva(kni, pkt_pa);
+		data_kva = get_data_kva(kni, pkt_kva);
 		pkt_va = pa2va(pkt_pa, pkt_kva);
 
 		len = skb->len;
@@ -338,9 +374,9 @@ kni_net_rx_normal(struct kni_dev *kni)
 
 	/* Transfer received packets to netif */
 	for (i = 0; i < num_rx; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -437,9 +473,9 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 		num = ret;
 		/* Copy mbufs */
 		for (i = 0; i < num; i++) {
-			kva = pa2kva(kni->pa[i]);
+			kva = get_kva(kni, kni->pa[i]);
 			len = kva->data_len;
-			data_kva = kva2data_kva(kva);
+			data_kva = get_data_kva(kni, kva);
 			kni->va[i] = pa2va(kni->pa[i], kva);
 
 			while (kva->next) {
@@ -449,8 +485,8 @@ kni_net_rx_lo_fifo(struct kni_dev *kni)
 				kva = next_kva;
 			}
 
-			alloc_kva = pa2kva(kni->alloc_pa[i]);
-			alloc_data_kva = kva2data_kva(alloc_kva);
+			alloc_kva = get_kva(kni, kni->alloc_pa[i]);
+			alloc_data_kva = get_data_kva(kni, alloc_kva);
 			kni->alloc_va[i] = pa2va(kni->alloc_pa[i], alloc_kva);
 
 			memcpy(alloc_data_kva, data_kva, len);
@@ -517,9 +553,9 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 
 	/* Copy mbufs to sk buffer and then call tx interface */
 	for (i = 0; i < num; i++) {
-		kva = pa2kva(kni->pa[i]);
+		kva = get_kva(kni, kni->pa[i]);
 		len = kva->pkt_len;
-		data_kva = kva2data_kva(kva);
+		data_kva = get_data_kva(kni, kva);
 		kni->va[i] = pa2va(kni->pa[i], kva);
 
 		skb = netdev_alloc_skb(dev, len);
@@ -550,8 +586,8 @@ kni_net_rx_lo_fifo_skb(struct kni_dev *kni)
 					break;
 
 				prev_kva = kva;
-				kva = pa2kva(kva->next);
-				data_kva = kva2data_kva(kva);
+				kva = get_kva(kni, kva->next);
+				data_kva = get_data_kva(kni, kva);
 				/* Convert physical address to virtual address */
 				prev_kva->next = pa2va(prev_kva->next, kva);
 			}
diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h
index 46f75a7..2427a96 100644
--- a/lib/librte_eal/linux/eal/include/rte_kni_common.h
+++ b/lib/librte_eal/linux/eal/include/rte_kni_common.h
@@ -125,6 +125,7 @@ struct rte_kni_device_info {
 	unsigned int min_mtu;
 	unsigned int max_mtu;
 	uint8_t mac_addr[6];
+	uint8_t iova_mode;
 };
 
 #define KNI_DEVICE "kni"
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

* [dpdk-dev]  [PATCH v14 2/2] kni: support IOVA mode
  2019-11-15 17:07                           ` [dpdk-dev] [PATCH v14 0/2] " vattunuru
  2019-11-15 17:07                             ` [dpdk-dev] [PATCH v14 1/2] kni: support IOVA mode in kernel module vattunuru
@ 2019-11-15 17:07                             ` vattunuru
  1 sibling, 0 replies; 251+ messages in thread
From: vattunuru @ 2019-11-15 17:07 UTC (permalink / raw)
  To: dev
  Cc: thomas, jerinj, kirankumark, olivier.matz, ferruh.yigit,
	anatoly.burakov, arybchenko, stephen, david.marchand,
	Vamsi Attunuru

From: David Marchand <david.marchand@redhat.com>

The current KNI implementation only operates in IOVA_PA
mode; this patch adds the functionality required to enable
KNI in IOVA_VA mode.

Due to the IOVA to KVA address translations, there can be a
performance impact depending on the KNI use case.

To mitigate the performance impact, IOVA can be forced to PA
via the EAL "--iova-mode=pa" option; the IOVA_DC bus iommu
scheme can also result in IOVA as PA.

During KNI creation, the application's iova_mode details are
passed to the KNI kernel module, and the kernel module then
translates PA/IOVA addresses to KVA and vice versa.
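
For illustration, a minimal EAL initialization sketch showing the
"--iova-mode=pa" mitigation mentioned above; the argument vector and the
application flow are only examples, not taken from this patch.

	#include <rte_eal.h>

	int main(void)
	{
		/* Force IOVA=PA to avoid the per-packet IOVA->KVA translation
		 * cost (only possible when physical addresses are available).
		 */
		char *eal_args[] = { "app", "--iova-mode=pa" };
		int ret = rte_eal_init(2, eal_args);

		if (ret < 0)
			return -1;

		/* ... create KNI interfaces and run the datapath here ... */

		rte_eal_cleanup();
		return 0;
	}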

Signed-off-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 doc/guides/prog_guide/kernel_nic_interface.rst | 14 ++++++++++++++
 doc/guides/rel_notes/release_19_11.rst         | 13 ++++++++++++-
 lib/librte_eal/linux/eal/eal.c                 | 11 +++++++++--
 lib/librte_kni/rte_kni.c                       |  5 +++++
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/doc/guides/prog_guide/kernel_nic_interface.rst b/doc/guides/prog_guide/kernel_nic_interface.rst
index 2fd58e1..e688efc 100644
--- a/doc/guides/prog_guide/kernel_nic_interface.rst
+++ b/doc/guides/prog_guide/kernel_nic_interface.rst
@@ -300,6 +300,20 @@ The sk_buff is then freed and the mbuf sent in the tx_q FIFO.
 The DPDK TX thread dequeues the mbuf and sends it to the PMD via ``rte_eth_tx_burst()``.
 It then puts the mbuf back in the cache.
 
+IOVA = VA: Support
+------------------
+
+KNI operates in IOVA_VA scheme when
+
+- LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0) and
+- eal option `iova-mode=va` is passed or bus IOVA scheme in the DPDK is selected
+  as RTE_IOVA_VA.
+
+Due to IOVA to KVA address translations, based on the KNI use case there
+can be a performance impact. For mitigation, forcing IOVA to PA via eal
+"--iova-mode=pa" option can be used, IOVA_DC bus iommu scheme can also
+result in IOVA as PA.
+
 Ethtool
 -------
 
diff --git a/doc/guides/rel_notes/release_19_11.rst b/doc/guides/rel_notes/release_19_11.rst
index 682c1bd..bed3344 100644
--- a/doc/guides/rel_notes/release_19_11.rst
+++ b/doc/guides/rel_notes/release_19_11.rst
@@ -292,8 +292,19 @@ New Features
   compilers store their internal representation of the source code that
   the linker uses at the final stage of compilation process.
 
-  See :doc:`../prog_guide/lto` for more information:
+* **Added IOVA as VA support for KNI.**
+
+  * Added IOVA = VA support for KNI, KNI can operate in IOVA = VA mode when
+    `iova-mode=va` eal option is passed to the application or when bus IOVA
+    scheme is selected as RTE_IOVA_VA. This mode only works on Linux Kernel
+    versions 4.6.0 and above.
 
+  * Due to IOVA to KVA address translations, based on the KNI use case there
+    can be a performance impact. For mitigation, forcing IOVA to PA via eal
+    "--iova-mode=pa" option can be used, IOVA_DC bus iommu scheme can also
+    result in IOVA as PA.
+
+  See :doc:`../prog_guide/lto` for more information:
 
 
 Removed Items
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
index 9e2d50c..b5b7150 100644
--- a/lib/librte_eal/linux/eal/eal.c
+++ b/lib/librte_eal/linux/eal/eal.c
@@ -1073,6 +1073,11 @@ rte_eal_init(int argc, char **argv)
 				 */
 				iova_mode = RTE_IOVA_VA;
 				RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n");
+#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0)
+			} else if (rte_eal_check_module("rte_kni") == 1) {
+				iova_mode = RTE_IOVA_PA;
+				RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI performance.\n");
+#endif
 			} else if (is_iommu_enabled()) {
 				/* we have an IOMMU, pick IOVA as VA mode */
 				iova_mode = RTE_IOVA_VA;
@@ -1085,8 +1090,10 @@ rte_eal_init(int argc, char **argv)
 				RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
 			}
 		}
-#ifdef RTE_LIBRTE_KNI
-		/* Workaround for KNI which requires physical address to work */
+#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 6, 0)
+		/* Workaround for KNI which requires physical address to work
+		 * in kernels < 4.6
+		 */
 		if (iova_mode == RTE_IOVA_VA &&
 				rte_eal_check_module("rte_kni") == 1) {
 			if (phys_addrs) {
diff --git a/lib/librte_kni/rte_kni.c b/lib/librte_kni/rte_kni.c
index 7fbcf22..0507a48 100644
--- a/lib/librte_kni/rte_kni.c
+++ b/lib/librte_kni/rte_kni.c
@@ -10,6 +10,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/ioctl.h>
+#include <linux/version.h>
 
 #include <rte_spinlock.h>
 #include <rte_string_fns.h>
@@ -97,10 +98,12 @@ static volatile int kni_fd = -1;
 int
 rte_kni_init(unsigned int max_kni_ifaces __rte_unused)
 {
+#if KERNEL_VERSION(4, 6, 0) > LINUX_VERSION_CODE
 	if (rte_eal_iova_mode() != RTE_IOVA_PA) {
 		RTE_LOG(ERR, KNI, "KNI requires IOVA as PA\n");
 		return -1;
 	}
+#endif
 
 	/* Check FD and open */
 	if (kni_fd < 0) {
@@ -302,6 +305,8 @@ rte_kni_alloc(struct rte_mempool *pktmbuf_pool,
 	kni->group_id = conf->group_id;
 	kni->mbuf_size = conf->mbuf_size;
 
+	dev_info.iova_mode = (rte_eal_iova_mode() == RTE_IOVA_VA) ? 1 : 0;
+
 	ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
 	if (ret < 0)
 		goto ioctl_fail;
-- 
2.8.4


^ permalink raw reply related	[flat|nested] 251+ messages in thread

end of thread, other threads:[~2019-11-15 17:08 UTC | newest]

Thread overview: 251+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-27 10:49 [PATCH] kni: add IOVA va support for kni Kiran Kumar
2018-09-27 10:58 ` Burakov, Anatoly
2018-10-02 17:05 ` Ferruh Yigit
2019-04-01 17:30   ` Jerin Jacob Kollanukkaran
2019-04-01 18:20     ` Ferruh Yigit
2019-04-01  9:51 ` [PATCH v2] " Kiran Kumar Kokkilagadda
2019-04-03 16:29   ` Ferruh Yigit
2019-04-04  5:03     ` [dpdk-dev] [EXT] " Kiran Kumar Kokkilagadda
2019-04-04 11:20       ` Ferruh Yigit
2019-04-04 13:29         ` Burakov, Anatoly
2019-04-04  9:57     ` Burakov, Anatoly
2019-04-04 11:21       ` Ferruh Yigit
2019-04-16  4:55   ` [dpdk-dev] [PATCH v3] " kirankumark
2019-04-19 10:38     ` Thomas Monjalon
2019-04-22  4:39     ` [dpdk-dev] [PATCH v4] " kirankumark
2019-04-22  6:15       ` [dpdk-dev] [PATCH v5] " kirankumark
2019-04-26  9:11         ` Burakov, Anatoly
2019-06-25  3:56         ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI vattunuru
2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 1/4] lib/mempool: skip populating mempool objs that falls on page boundaries vattunuru
2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 2/4] lib/kni: add PCI related information vattunuru
2019-06-25 17:41             ` Stephen Hemminger
2019-06-26  3:48               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-06-26 14:58                 ` Stephen Hemminger
2019-06-27  9:43                   ` Vamsi Krishna Attunuru
2019-07-11 16:22             ` [dpdk-dev] " Ferruh Yigit
2019-07-12 11:02               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-07-12 11:11                 ` Ferruh Yigit
2019-06-25  3:56           ` [dpdk-dev] [PATCH v6 3/4] example/kni: add IOVA support for kni application vattunuru
2019-07-11 16:23             ` Ferruh Yigit
2019-06-25  3:57           ` [dpdk-dev] [PATCH v6 4/4] kernel/linux/kni: add IOVA support in kni module vattunuru
2019-07-11 16:30             ` Ferruh Yigit
2019-07-12 10:38               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-07-12 11:10                 ` Ferruh Yigit
2019-07-12 12:27                   ` Vamsi Krishna Attunuru
2019-07-12 16:29                   ` Vamsi Krishna Attunuru
2019-07-15 11:26                     ` Ferruh Yigit
2019-07-15 13:06                       ` Vamsi Krishna Attunuru
2019-07-11 16:43             ` [dpdk-dev] " Stephen Hemminger
2019-06-25 10:00           ` [dpdk-dev] [PATCH v6 0/4] add IOVA = VA support in KNI Burakov, Anatoly
2019-06-25 11:15             ` Jerin Jacob Kollanukkaran
2019-06-25 11:30               ` Burakov, Anatoly
2019-06-25 13:38                 ` Burakov, Anatoly
2019-06-27  9:34                   ` Jerin Jacob Kollanukkaran
2019-07-01 13:51                     ` Vamsi Krishna Attunuru
2019-07-04  6:42                       ` Vamsi Krishna Attunuru
2019-07-04  9:48                         ` Jerin Jacob Kollanukkaran
2019-07-11 16:21                           ` Ferruh Yigit
2019-07-17  9:04           ` [dpdk-dev] [PATCH v7 0/4] kni: add IOVA=VA support vattunuru
2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 1/4] mempool: modify mempool populate() to skip objects from page boundaries vattunuru
2019-07-17 13:36               ` Andrew Rybchenko
2019-07-17 13:47                 ` Olivier Matz
2019-07-17 17:31                 ` Vamsi Krishna Attunuru
2019-07-18  9:28                   ` Andrew Rybchenko
2019-07-18 14:16                     ` Vamsi Krishna Attunuru
2019-07-19 13:38                       ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations across pages Olivier Matz
2019-07-19 13:38                         ` [dpdk-dev] [RFC 1/4] mempool: clarify default populate function Olivier Matz
2019-07-19 15:42                           ` Andrew Rybchenko
2019-10-08  9:36                             ` Olivier Matz
2019-07-19 13:38                         ` [dpdk-dev] [RFC 2/4] mempool: unalign size when calculating required mem amount Olivier Matz
2019-08-07 15:21                           ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
2019-10-28 14:06                             ` Olivier Matz
2019-07-19 13:38                         ` [dpdk-dev] [RFC 3/4] mempool: introduce function to get mempool page size Olivier Matz
2019-08-07 15:21                           ` Andrew Rybchenko
2019-10-28 14:06                             ` Olivier Matz
2019-07-19 13:38                         ` [dpdk-dev] [RFC 4/4] mempool: prevent objects from being across pages Olivier Matz
2019-07-19 14:03                           ` Burakov, Anatoly
2019-10-28 14:07                             ` Olivier Matz
2019-07-19 14:11                           ` Burakov, Anatoly
2019-08-07 15:21                           ` Andrew Rybchenko
2019-10-28 14:07                             ` Olivier Matz
2019-10-29 11:03                               ` Andrew Rybchenko
2019-07-23  5:37                         ` [dpdk-dev] [RFC 0/4] mempool: avoid objects allocations " Vamsi Krishna Attunuru
2019-08-07 15:21                         ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
2019-10-28 14:06                           ` Olivier Matz
2019-10-28 14:01                         ` [dpdk-dev] [PATCH 0/5] " Olivier Matz
2019-10-28 14:01                           ` [dpdk-dev] [PATCH 1/5] mempool: allow unaligned addr/len in populate virt Olivier Matz
2019-10-29  9:02                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-10-29  9:13                               ` Olivier Matz
2019-10-29  9:18                                 ` Vamsi Krishna Attunuru
2019-10-29  9:21                             ` [dpdk-dev] " Andrew Rybchenko
2019-10-29 17:02                               ` Olivier Matz
2019-10-28 14:01                           ` [dpdk-dev] [PATCH 2/5] mempool: reduce wasted space on mempool populate Olivier Matz
2019-10-29 10:09                             ` Andrew Rybchenko
2019-10-29 17:09                               ` Olivier Matz
2019-10-28 14:01                           ` [dpdk-dev] [PATCH 3/5] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
2019-10-29 10:25                             ` Andrew Rybchenko
2019-10-29 17:20                               ` Olivier Matz
2019-10-30  7:36                                 ` Andrew Rybchenko
2019-10-30  7:44                                   ` Andrew Rybchenko
2019-10-30 10:38                                     ` Olivier Matz
2019-10-28 14:01                           ` [dpdk-dev] [PATCH 4/5] mempool: introduce function to get mempool page size Olivier Matz
2019-10-29 10:31                             ` Andrew Rybchenko
2019-10-29 17:20                               ` Olivier Matz
2019-10-30  8:32                                 ` Olivier Matz
2019-10-30 14:29                                   ` Olivier Matz
2019-10-28 14:01                           ` [dpdk-dev] [PATCH 5/5] mempool: prevent objects from being across pages Olivier Matz
2019-10-29 10:59                             ` [dpdk-dev] ***Spam*** " Andrew Rybchenko
2019-10-29 17:34                               ` Olivier Matz
2019-10-30  7:56                                 ` [dpdk-dev] " Andrew Rybchenko
2019-10-29 17:25                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-10-30  3:55                               ` Vamsi Krishna Attunuru
2019-10-30  7:46                               ` Andrew Rybchenko
2019-10-30  8:38                                 ` Jerin Jacob
2019-10-30 14:33                                   ` Olivier Matz
2019-10-30 14:54                                     ` Jerin Jacob
2019-10-30  8:42                                 ` Olivier Matz
2019-10-30 14:36                         ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations " Olivier Matz
2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 1/6] mempool: allow unaligned addr/len in populate virt Olivier Matz
2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 2/6] mempool: reduce wasted space on mempool populate Olivier Matz
2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 3/6] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 4/6] mempool: introduce function to get mempool page size Olivier Matz
2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 5/6] mempool: prevent objects from being across pages Olivier Matz
2019-10-31  6:54                             ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-10-31  8:19                               ` Jerin Jacob
2019-10-31  8:29                                 ` Olivier Matz
2019-10-31  8:24                               ` Olivier Matz
2019-10-31  8:33                                 ` Andrew Rybchenko
2019-10-31  8:45                                   ` Olivier Matz
2019-10-30 14:36                           ` [dpdk-dev] [PATCH v2 6/6] mempool: use the specific macro for object alignment Olivier Matz
2019-10-30 14:55                             ` Andrew Rybchenko
2019-11-01  3:56                           ` [dpdk-dev] [PATCH v2 0/6] mempool: avoid objects allocations across pages Nipun Gupta
2019-11-04 15:12                         ` [dpdk-dev] [PATCH v3 0/7] " Olivier Matz
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 2/7] mempool: reduce wasted space on mempool populate Olivier Matz
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 3/7] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 4/7] mempool: introduce function to get mempool page size Olivier Matz
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
2019-11-05 12:19                             ` Andrew Rybchenko
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 6/7] mempool: prevent objects from being across pages Olivier Matz
2019-11-05 12:22                             ` Andrew Rybchenko
2019-11-04 15:12                           ` [dpdk-dev] [PATCH v3 7/7] mempool: use the specific macro for object alignment Olivier Matz
2019-11-05 12:15                             ` Andrew Rybchenko
2019-11-05 12:48                               ` Olivier Matz
2019-11-05 15:36                         ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 1/7] mempool: allow unaligned addr/len in populate virt Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 2/7] mempool: reduce wasted space on mempool populate Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 3/7] mempool: remove optimistic IOVA-contiguous allocation Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 4/7] mempool: introduce function to get mempool page size Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 5/7] mempool: introduce helpers for populate and calc mem size Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 6/7] mempool: prevent objects from being across pages Olivier Matz
2019-11-05 15:37                           ` [dpdk-dev] [PATCH v4 7/7] mempool: use the specific macro for object alignment Olivier Matz
2019-11-05 16:03                           ` [dpdk-dev] [PATCH v4 0/7] mempool: avoid objects allocations across pages Olivier Matz
2019-11-06 10:39                           ` Thomas Monjalon
2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 2/4] kni: add IOVA = VA support in KNI lib vattunuru
2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 3/4] kni: add IOVA=VA support in KNI module vattunuru
2019-07-17  9:04             ` [dpdk-dev] [PATCH v7 4/4] kni: modify IOVA mode checks to support VA vattunuru
2019-07-23  5:38             ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support vattunuru
2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 1/5] mempool: populate mempool with page sized chunks of memory vattunuru
2019-07-23 11:08                 ` Andrew Rybchenko
2019-07-23 12:28                   ` Vamsi Krishna Attunuru
2019-07-23 19:33                     ` Andrew Rybchenko
2019-07-24  7:09                       ` Vamsi Krishna Attunuru
2019-07-24  7:27                         ` Andrew Rybchenko
2019-07-29  6:25                           ` Vamsi Krishna Attunuru
2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 2/5] add IOVA -VA support in KNI lib vattunuru
2019-07-23 10:54                 ` Andrew Rybchenko
2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 3/5] kni: add app specific mempool create & free routine vattunuru
2019-07-23 10:50                 ` Andrew Rybchenko
2019-07-23 11:01                   ` Vamsi Krishna Attunuru
2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 4/5] kni: add IOVA=VA support in KNI module vattunuru
2019-07-23  5:38               ` [dpdk-dev] [PATCH v8 5/5] kni: modify IOVA mode checks to support VA vattunuru
2019-07-24  7:14               ` [dpdk-dev] [PATCH v8 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
2019-07-29 12:13               ` [dpdk-dev] [PATCH v9 " vattunuru
2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 1/5] mempool: populate mempool with the page sized chunks of memory vattunuru
2019-07-29 12:41                   ` Andrew Rybchenko
2019-07-29 13:33                     ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-08-16  6:12                   ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support vattunuru
2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 1/5] mempool: populate mempool with the page sized chunks vattunuru
2019-10-08  9:26                       ` Olivier Matz
2019-10-09  5:29                         ` Vamsi Krishna Attunuru
2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 2/5] kni: add IOVA=VA support in KNI lib vattunuru
2019-10-15 15:36                       ` Yigit, Ferruh
2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 3/5] kni: add app specific mempool create and free routines vattunuru
2019-10-15 15:40                       ` Yigit, Ferruh
2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 4/5] kni: add IOVA=VA support in KNI module vattunuru
2019-10-15 15:43                       ` Yigit, Ferruh
2019-10-15 15:46                         ` Stephen Hemminger
2019-10-16 11:26                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-10-16 14:37                             ` Vamsi Krishna Attunuru
2019-10-16 16:14                             ` Ferruh Yigit
2019-10-18 17:15                               ` Vamsi Krishna Attunuru
2019-10-21 11:45                                 ` Ferruh Yigit
2019-08-16  6:12                     ` [dpdk-dev] [PATCH v10 5/5] kni: modify IOVA mode checks to support VA vattunuru
2019-09-25  4:00                     ` [dpdk-dev] [PATCH v10 0/5] kni: add IOVA=VA support Vamsi Krishna Attunuru
2019-10-08  5:08                       ` Vamsi Krishna Attunuru
2019-10-14  4:05                         ` Vamsi Krishna Attunuru
2019-10-15 15:34                     ` Yigit, Ferruh
2019-10-16 12:17                       ` Vamsi Krishna Attunuru
2019-10-16 16:21                         ` Ferruh Yigit
2019-10-18 16:42                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-10-21  8:03                     ` [dpdk-dev] [PATCH v11 0/4] kni: add IOVA=VA mode support vattunuru
2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 1/4] mempool: populate mempool with the page sized chunks vattunuru
2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 2/4] eal: add legacy kni option vattunuru
2019-10-21 11:55                         ` Ferruh Yigit
2019-10-21 13:13                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-10-21 13:32                             ` Ferruh Yigit
2019-10-21 14:38                               ` Vamsi Krishna Attunuru
2019-10-22  9:29                                 ` Vamsi Krishna Attunuru
2019-10-22 12:28                                 ` Andrew Rybchenko
2019-10-22 13:31                                   ` Vamsi Krishna Attunuru
2019-10-23 10:12                                     ` Jerin Jacob
2019-10-23 14:47                                       ` Olivier Matz
2019-10-23 15:02                                         ` Jerin Jacob
2019-10-24 17:35                                           ` Olivier Matz
2019-10-24 19:30                                             ` Jerin Jacob
2019-10-25  9:20                                               ` Vamsi Krishna Attunuru
2019-10-26 12:25                                                 ` Olivier Matz
2019-10-26 14:09                                                   ` Vamsi Krishna Attunuru
2019-10-28 14:05                                                     ` Olivier Matz
2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 3/4] kni: add IOVA=VA support vattunuru
2019-10-21  8:03                       ` [dpdk-dev] [PATCH v11 4/4] kni: add IOVA=VA support in kernel module vattunuru
2019-10-21 12:02                         ` Ferruh Yigit
2019-11-05 11:04                       ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support vattunuru
2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 1/2] kni: " vattunuru
2019-11-14 10:57                           ` David Marchand
2019-11-14 11:13                             ` David Marchand
2019-11-14 12:10                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-11-14 12:25                                 ` David Marchand
2019-11-14 17:48                             ` [dpdk-dev] " David Marchand
2019-11-05 11:04                         ` [dpdk-dev] [PATCH v12 2/2] kni: add IOVA=VA support in kernel module vattunuru
2019-11-06 10:49                         ` [dpdk-dev] [PATCH v12 0/2] add IOVA=VA mode support Thomas Monjalon
2019-11-06 11:09                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-11-06 11:53                             ` Thomas Monjalon
2019-11-06 11:59                               ` Vamsi Krishna Attunuru
2019-11-07 10:34                             ` Vamsi Krishna Attunuru
2019-11-07 19:53                         ` [dpdk-dev] " Ferruh Yigit
2019-11-08  4:16                           ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-11-08 14:26                             ` Ferruh Yigit
2019-11-08 14:54                               ` Jerin Jacob
2019-11-13  6:33                                 ` Vamsi Krishna Attunuru
2019-11-13 12:32                                   ` Ferruh Yigit
2019-11-15 11:18                         ` [dpdk-dev] [PATCH v13 0/2] kni: support IOVA mode vattunuru
2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 1/2] kni: support IOVA mode in kernel module vattunuru
2019-11-15 11:18                           ` [dpdk-dev] [PATCH v13 2/2] kni: support IOVA mode vattunuru
2019-11-15 12:11                             ` Ferruh Yigit
2019-11-15 12:59                             ` David Marchand
2019-11-15 13:35                               ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-11-15 13:40                                 ` David Marchand
2019-11-15 13:40                               ` [dpdk-dev] " Jerin Jacob
2019-11-15 14:56                                 ` David Marchand
2019-11-15 15:22                                   ` Jerin Jacob
2019-11-15 17:07                           ` [dpdk-dev] [PATCH v14 0/2] " vattunuru
2019-11-15 17:07                             ` [dpdk-dev] [PATCH v14 1/2] kni: support IOVA mode in kernel module vattunuru
2019-11-15 17:07                             ` [dpdk-dev] [PATCH v14 2/2] kni: support IOVA mode vattunuru
2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 2/5] kni: add IOVA=VA support in KNI lib vattunuru
2019-07-29 12:24                   ` Igor Ryzhov
2019-07-29 13:22                     ` [dpdk-dev] [EXT] " Vamsi Krishna Attunuru
2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 3/5] kni: add app specific mempool create & free routine vattunuru
2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 4/5] kni: add IOVA=VA support in KNI module vattunuru
2019-07-29 12:13                 ` [dpdk-dev] [PATCH v9 5/5] kni: modify IOVA mode checks to support VA vattunuru
2019-04-23  8:56       ` [dpdk-dev] [PATCH v4] kni: add IOVA va support for kni Burakov, Anatoly
