All of lore.kernel.org
 help / color / mirror / Atom feed
From: Anatoly Burakov <anatoly.burakov@intel.com>
To: dev@dpdk.org
Cc: tiwei.bie@intel.com, ray.kinsella@intel.com,
	zhihong.wang@intel.com, maxime.coquelin@redhat.com,
	kuralamudhan.ramakrishnan@intel.com
Subject: [PATCH v3 9/9] mem: support using memfd segments for in-memory mode
Date: Tue,  4 Sep 2018 16:15:50 +0100	[thread overview]
Message-ID: <a6299ff133a0e4ccb7d9681b7c912b0863602bd6.1536073997.git.anatoly.burakov@intel.com> (raw)
In-Reply-To: <cover.1536073996.git.anatoly.burakov@intel.com>
In-Reply-To: <cover.1536073996.git.anatoly.burakov@intel.com>

Enable using memfd-created segments if supported by the system.

This will allow having real fd's for pages but without hugetlbfs
mounts, which will enable in-memory mode to be used with virtio.

The implementation is mostly piggy-backing on existing real-fd
code, except that we no longer need to unlink any files or track
per-page locks in single-file segments mode, because in-memory
mode does not support secondary processes anyway.

We move some checks from EAL command-line parsing code to memalloc
because it is now possible to use single-file segments mode with
in-memory mode, but only if memfd is supported.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_eal/common/eal_common_options.c |   6 +-
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 265 ++++++++++++++++++---
 2 files changed, 235 insertions(+), 36 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 873099acc..ddd624110 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -1384,10 +1384,10 @@ eal_check_common_options(struct internal_config *internal_cfg)
 			" is only supported in non-legacy memory mode\n");
 	}
 	if (internal_cfg->single_file_segments &&
-			internal_cfg->hugepage_unlink) {
+			internal_cfg->hugepage_unlink &&
+			!internal_cfg->in_memory) {
 		RTE_LOG(ERR, EAL, "Option --"OPT_SINGLE_FILE_SEGMENTS" is "
-			"not compatible with neither --"OPT_IN_MEMORY" nor "
-			"--"OPT_HUGE_UNLINK"\n");
+			"not compatible with --"OPT_HUGE_UNLINK"\n");
 		return -1;
 	}
 	if (internal_cfg->legacy_mem &&
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index 6fc9dc0ca..b2e2a9599 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -52,6 +52,23 @@ const int anonymous_hugepages_supported =
 #define RTE_MAP_HUGE_SHIFT 26
 #endif
 
+/*
+ * we don't actually care if memfd itself is supported - we only need to check
+ * if memfd supports hugetlbfs, as that already implies memfd support.
+ *
+ * also, this is not a constant, because while we may be *compiled* with memfd
+ * hugetlbfs support, we might not be *running* on a system that supports memfd
+ * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
+ * runtime, and fall back to anonymous memory.
+ */
+int memfd_create_supported =
+#ifdef MFD_HUGETLB
+#define MEMFD_SUPPORTED
+		1;
+#else
+		0;
+#endif
+
 /*
  * not all kernel version support fallocate on hugetlbfs, so fall back to
  * ftruncate and disallow deallocation if fallocate is not supported.
@@ -191,6 +208,31 @@ get_file_size(int fd)
 	return st.st_size;
 }
 
+static inline uint32_t
+bsf64(uint64_t v)
+{
+	return (uint32_t)__builtin_ctzll(v);
+}
+
+static inline uint32_t
+log2_u64(uint64_t v)
+{
+	if (v == 0)
+		return 0;
+	v = rte_align64pow2(v);
+	return bsf64(v);
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+	/* as per mmap() manpage, all page sizes are log2 of page size
+	 * shifted by MAP_HUGE_SHIFT
+	 */
+	int log2 = log2_u64(page_sz);
+	return log2 << RTE_MAP_HUGE_SHIFT;
+}
+
 /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
 static int lock(int fd, int type)
 {
@@ -287,12 +329,64 @@ static int unlock_segment(int list_idx, int seg_idx)
 	return 0;
 }
 
+static int
+get_seg_memfd(struct hugepage_info *hi __rte_unused,
+		unsigned int list_idx __rte_unused,
+		unsigned int seg_idx __rte_unused)
+{
+#ifdef MEMFD_SUPPORTED
+	int fd;
+	char segname[250]; /* as per manpage, limit is 249 bytes plus null */
+
+	if (internal_config.single_file_segments) {
+		fd = fd_list[list_idx].memseg_list_fd;
+
+		if (fd < 0) {
+			int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+			snprintf(segname, sizeof(segname), "seg_%i", list_idx);
+			fd = memfd_create(segname, flags);
+			if (fd < 0) {
+				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+					__func__, strerror(errno));
+				return -1;
+			}
+			fd_list[list_idx].memseg_list_fd = fd;
+		}
+	} else {
+		fd = fd_list[list_idx].fds[seg_idx];
+
+		if (fd < 0) {
+			int flags = MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+			snprintf(segname, sizeof(segname), "seg_%i-%i",
+					list_idx, seg_idx);
+			fd = memfd_create(segname, flags);
+			if (fd < 0) {
+				RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+					__func__, strerror(errno));
+				return -1;
+			}
+			fd_list[list_idx].fds[seg_idx] = fd;
+		}
+	}
+	return fd;
+#endif
+	return -1;
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 		unsigned int list_idx, unsigned int seg_idx)
 {
 	int fd;
 
+	/* for in-memory mode, we only make it here when we're sure we support
+	 * memfd, and this is a special case.
+	 */
+	if (internal_config.in_memory)
+		return get_seg_memfd(hi, list_idx, seg_idx);
+
 	if (internal_config.single_file_segments) {
 		/* create a hugepage file path */
 		eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
@@ -347,6 +441,33 @@ resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
 		uint64_t fa_offset, uint64_t page_sz, bool grow)
 {
 	bool again = false;
+
+	/* in-memory mode is a special case, because we don't need to perform
+	 * any locking, and we can be sure that fallocate() is supported.
+	 */
+	if (internal_config.in_memory) {
+		int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+				FALLOC_FL_KEEP_SIZE;
+		int ret;
+
+		/* grow or shrink the file */
+		ret = fallocate(fd, flags, fa_offset, page_sz);
+
+		if (ret < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+					__func__,
+					strerror(errno));
+			return -1;
+		}
+		/* increase/decrease total segment count */
+		fd_list[list_idx].count += (grow ? 1 : -1);
+		if (!grow && fd_list[list_idx].count == 0) {
+			close(fd_list[list_idx].memseg_list_fd);
+			fd_list[list_idx].memseg_list_fd = -1;
+		}
+		return 0;
+	}
+
 	do {
 		if (fallocate_supported == 0) {
 			/* we cannot deallocate memory if fallocate() is not
@@ -496,26 +617,34 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	void *new_addr;
 
 	alloc_sz = hi->hugepage_sz;
-	if (!internal_config.single_file_segments &&
-			internal_config.in_memory &&
-			anonymous_hugepages_supported) {
-		int log2, flags;
-
-		log2 = rte_log2_u32(alloc_sz);
-		/* as per mmap() manpage, all page sizes are log2 of page size
-		 * shifted by MAP_HUGE_SHIFT
-		 */
-		flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
+
+	/* these are checked at init, but code analyzers don't know that */
+	if (internal_config.in_memory && !anonymous_hugepages_supported) {
+		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
+		return -1;
+	}
+	if (internal_config.in_memory && !memfd_create_supported &&
+			internal_config.single_file_segments) {
+		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
+		return -1;
+	}
+
+	/* in-memory without memfd is a special case */
+	int mmap_flags;
+
+	if (internal_config.in_memory && !memfd_create_supported) {
+		int pagesz_flag, flags;
+
+		pagesz_flag = pagesz_flags(alloc_sz);
+		flags = pagesz_flag | MAP_HUGETLB | MAP_FIXED |
 				MAP_PRIVATE | MAP_ANONYMOUS;
 		fd = -1;
-		va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
+		mmap_flags = flags;
 
-		/* single-file segments codepath will never be active because
-		 * in-memory mode is incompatible with it and it's stopped at
-		 * EAL initialization stage, however the compiler doesn't know
-		 * that and complains about map_offset being used uninitialized
-		 * on failure codepaths while having in-memory mode enabled. so,
-		 * assign a value here.
+		/* single-file segments codepath will never be active
+		 * here because in-memory mode is incompatible with the
+		 * fallback path, and it's stopped at EAL initialization
+		 * stage.
 		 */
 		map_offset = 0;
 	} else {
@@ -539,7 +668,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 					__func__, strerror(errno));
 				goto resized;
 			}
-			if (internal_config.hugepage_unlink) {
+			if (internal_config.hugepage_unlink &&
+					!internal_config.in_memory) {
 				if (unlink(path)) {
 					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
 						__func__, strerror(errno));
@@ -547,16 +677,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 				}
 			}
 		}
-
-		/*
-		 * map the segment, and populate page tables, the kernel fills
-		 * this segment with zeros if it's a new page.
-		 */
-		va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
-				MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
-				map_offset);
+		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
 	}
 
+	/*
+	 * map the segment, and populate page tables, the kernel fills
+	 * this segment with zeros if it's a new page.
+	 */
+	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
+			map_offset);
+
 	if (va == MAP_FAILED) {
 		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
 			strerror(errno));
@@ -663,7 +793,8 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 {
 	uint64_t map_offset;
 	char path[PATH_MAX];
-	int fd, ret;
+	int fd, ret = 0;
+	bool exit_early;
 
 	/* erase page data */
 	memset(ms->addr, 0, ms->len);
@@ -675,8 +806,17 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 		return -1;
 	}
 
+	exit_early = false;
+
+	/* if we're using anonymous hugepages, nothing to be done */
+	if (internal_config.in_memory && !memfd_create_supported)
+		exit_early = true;
+
 	/* if we've already unlinked the page, nothing needs to be done */
-	if (internal_config.hugepage_unlink) {
+	if (!internal_config.in_memory && internal_config.hugepage_unlink)
+		exit_early = true;
+
+	if (exit_early) {
 		memset(ms, 0, sizeof(*ms));
 		return 0;
 	}
@@ -699,11 +839,13 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 		/* if we're able to take out a write lock, we're the last one
 		 * holding onto this page.
 		 */
-		ret = lock(fd, LOCK_EX);
-		if (ret >= 0) {
-			/* no one else is using this page */
-			if (ret == 1)
-				unlink(path);
+		if (!internal_config.in_memory) {
+			ret = lock(fd, LOCK_EX);
+			if (ret >= 0) {
+				/* no one else is using this page */
+				if (ret == 1)
+					unlink(path);
+			}
 		}
 		/* closing fd will drop the lock */
 		close(fd);
@@ -1406,6 +1548,35 @@ eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
 	return fd;
 }
 
+static int
+test_memfd_create(void)
+{
+#ifdef MEMFD_SUPPORTED
+	unsigned int i;
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+		uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
+		int pagesz_flag = pagesz_flags(pagesz);
+		int flags;
+
+		flags = pagesz_flag | MFD_HUGETLB;
+		int fd = memfd_create("test", flags);
+		if (fd < 0) {
+			/* we failed - let memalloc know this isn't working */
+			if (errno == EINVAL) {
+				memfd_create_supported = 0;
+				return 0; /* not supported */
+			}
+
+			/* we got other error - something's wrong */
+			return -1; /* error */
+		}
+		close(fd);
+		return 1; /* supported */
+	}
+#endif
+	return 0; /* not supported */
+}
+
 int
 eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
 {
@@ -1436,6 +1607,34 @@ eal_memalloc_init(void)
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 		if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
 			return -1;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+			internal_config.in_memory) {
+		int mfd_res = test_memfd_create();
+
+		if (mfd_res < 0) {
+			RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
+			return -1;
+		}
+		if (mfd_res == 1)
+			RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+		else
+			RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
+
+		/* we only support single-file segments mode with in-memory mode
+		 * if we support hugetlbfs with memfd_create. this code will
+		 * test if we do.
+		 */
+		if (internal_config.single_file_segments &&
+				mfd_res != 1) {
+			RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
+			return -1;
+		}
+		/* this cannot ever happen but better safe than sorry */
+		if (!anonymous_hugepages_supported) {
+			RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
+			return -1;
+		}
+	}
 
 	/* initialize all of the fd lists */
 	if (rte_memseg_list_walk(fd_list_create_walk, NULL))
-- 
2.17.1

  parent reply	other threads:[~2018-09-04 15:15 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-08-23 16:59 [PATCH 0/8] Improve running DPDK without hugetlbfs mounpoint Anatoly Burakov
2018-08-23 16:59 ` [PATCH 1/8] fbarray: fix detach in noshconf mode Anatoly Burakov
2018-08-23 16:59 ` [PATCH 2/8] eal: don't allow legacy mode with in-memory mode Anatoly Burakov
2018-08-23 16:59 ` [PATCH 3/8] mem: raise maximum fd limit unconditionally Anatoly Burakov
2018-08-23 16:59 ` [PATCH 4/8] memalloc: rename lock list to fd list Anatoly Burakov
2018-08-23 16:59 ` [PATCH 5/8] memalloc: track page fd's in non-single file mode Anatoly Burakov
2018-08-23 16:59 ` [PATCH 6/8] memalloc: add EAL-internal API to get and set segment fd's Anatoly Burakov
2018-08-23 16:59 ` [PATCH 7/8] mem: add external API to retrieve page fd from EAL Anatoly Burakov
2018-08-23 16:59 ` [PATCH 8/8] mem: support using memfd segments for in-memory mode Anatoly Burakov
2018-08-24  4:39   ` Jerin Jacob
2018-08-24  8:56     ` Burakov, Anatoly
2018-09-04 15:01 ` [PATCH v2 0/9] Improve running DPDK without hugetlbfs mounpoint Anatoly Burakov
2018-09-04 15:15   ` [PATCH v3 " Anatoly Burakov
2018-09-19 13:04     ` Thomas Monjalon
2018-09-19 13:55       ` Burakov, Anatoly
2018-09-19 14:15         ` Thomas Monjalon
2018-09-04 15:15   ` [PATCH v3 1/9] fbarray: fix detach in noshconf mode Anatoly Burakov
2018-09-13 13:00     ` Maxime Coquelin
2018-09-04 15:15   ` [PATCH v3 2/9] eal: don't allow legacy mode with in-memory mode Anatoly Burakov
2018-09-13 13:06     ` Maxime Coquelin
2018-09-17  9:49       ` Burakov, Anatoly
2018-09-04 15:15   ` [PATCH v3 3/9] mem: raise maximum fd limit unconditionally Anatoly Burakov
2018-09-13 13:12     ` Maxime Coquelin
2018-09-04 15:15   ` [PATCH v3 4/9] memalloc: rename lock list to fd list Anatoly Burakov
2018-09-13 15:19     ` Maxime Coquelin
2018-09-04 15:15   ` [PATCH v3 5/9] memalloc: track page fd's in non-single file mode Anatoly Burakov
2018-09-13 15:56     ` Maxime Coquelin
2018-09-04 15:15   ` [PATCH v3 6/9] memalloc: add EAL-internal API to get and set segment fd's Anatoly Burakov
2018-09-14  7:54     ` Maxime Coquelin
2018-09-17  9:53       ` Burakov, Anatoly
2018-09-04 15:15   ` [PATCH v3 7/9] mem: add external API to retrieve page fd from EAL Anatoly Burakov
2018-09-14  8:00     ` Maxime Coquelin
2018-09-04 15:15   ` [PATCH v3 8/9] mem: allow querying offset into segment fd Anatoly Burakov
2018-09-14  7:57     ` Maxime Coquelin
2018-09-04 15:15   ` Anatoly Burakov [this message]
2018-09-14  8:06     ` [PATCH v3 9/9] mem: support using memfd segments for in-memory mode Maxime Coquelin
2018-09-04 15:01 ` [PATCH v2 1/9] fbarray: fix detach in noshconf mode Anatoly Burakov
2018-09-04 15:01 ` [PATCH v2 2/9] eal: don't allow legacy mode with in-memory mode Anatoly Burakov
2018-09-04 15:01 ` [PATCH v2 3/9] mem: raise maximum fd limit unconditionally Anatoly Burakov
2018-09-04 15:01 ` [PATCH v2 4/9] memalloc: rename lock list to fd list Anatoly Burakov
2018-09-04 15:01 ` [PATCH v2 5/9] memalloc: track page fd's in non-single file mode Anatoly Burakov
2018-09-04 15:01 ` [PATCH v2 6/9] memalloc: add EAL-internal API to get and set segment fd's Anatoly Burakov
2018-09-04 15:02 ` [PATCH v2 7/9] mem: add external API to retrieve page fd from EAL Anatoly Burakov
2018-09-04 15:02 ` [PATCH v2 8/9] mem: allow querying offset into segment fd Anatoly Burakov
2018-09-04 15:02 ` [PATCH v2 9/9] mem: support using memfd segments for in-memory mode Anatoly Burakov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a6299ff133a0e4ccb7d9681b7c912b0863602bd6.1536073997.git.anatoly.burakov@intel.com \
    --to=anatoly.burakov@intel.com \
    --cc=dev@dpdk.org \
    --cc=kuralamudhan.ramakrishnan@intel.com \
    --cc=maxime.coquelin@redhat.com \
    --cc=ray.kinsella@intel.com \
    --cc=tiwei.bie@intel.com \
    --cc=zhihong.wang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.