All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] eal: add cgroup-aware resource self discovery
@ 2016-01-24 18:49 Jianfeng Tan
  2016-01-25 13:46 ` Neil Horman
                   ` (6 more replies)
  0 siblings, 7 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-01-24 18:49 UTC (permalink / raw)
  To: dev; +Cc: yuanhan.liu

Current issue: DPDK is not that friendly to container environment usage.
This is because it pre-allocates resources like cores and hugepages from
command-line options. So for a DPDK application, it is necessary to check
how much of a resource is allocated to a container and then use that as a
reference.

To address that, this patch introduces two APIs:
   a. rte_eal_res_self_discovery, to query how much resource can be used.
   b. rte_eal_res_self_discovery_apply, to apply self-discovered resource
      into DPDK.

Currently only Linux CGroup is added, similarly, we can add BSD jail as
well in the future. And even in Linux, there could be other way to query
and apply resources, like through a centralized daemon.

Known issue: current way to read individual attributes of cgroups directly
instead of via systemd's API is not a long-term solution. Please refer to
http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
for more information.

Test example:
    a. cgcreate -g cpuset,hugetlb:/test-subgroup
    b. cgset -r cpuset.cpus=2-3 test-subgroup
    c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
    d. cgexec -g cpuset,hugetlb:test-subgroup \
	    ./examples/l2fwd/build/l2fwd --self-discovery=cgroup -n 4 -- -p 3

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_options.c      |  39 ++++
 lib/librte_eal/common/eal_internal_cfg.h        |   1 +
 lib/librte_eal/common/eal_options.h             |   2 +
 lib/librte_eal/common/include/rte_eal.h         |  34 +++
 lib/librte_eal/linuxapp/eal/Makefile            |   1 +
 lib/librte_eal/linuxapp/eal/eal_cgroup.c        | 294 ++++++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |   5 +
 7 files changed, 376 insertions(+)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_cgroup.c

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..7235473 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_SELF_DISCOVERY,    1, NULL, OPT_SELF_DISCOVERY_NUM   },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -128,6 +129,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
 	internal_cfg->force_nchannel = 0;
 	internal_cfg->hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
 	internal_cfg->hugepage_dir = NULL;
+	internal_cfg->self_discovery = NULL;
 	internal_cfg->force_sockets = 0;
 	/* zero out the NUMA config */
 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
@@ -755,6 +757,24 @@ eal_parse_proc_type(const char *arg)
 }
 
 int
+__attribute__((weak))
+rte_eal_res_self_discovery(const char *type __rte_unused,
+			   char **p_corelist __rte_unused,
+			   uint64_t *p_memory __rte_unused)
+{
+	return -1;
+}
+
+int
+__attribute__((weak))
+rte_eal_res_self_discovery_apply(const char *type __rte_unused,
+				 int enable_core __rte_unused,
+				 int enable_mem __rte_unused)
+{
+	return -1;
+}
+
+int
 eal_parse_common_option(int opt, const char *optarg,
 			struct internal_config *conf)
 {
@@ -897,6 +917,25 @@ eal_parse_common_option(int opt, const char *optarg,
 		}
 		break;
 
+	case OPT_SELF_DISCOVERY_NUM: {
+		char *corelist;
+
+		if (rte_eal_res_self_discovery(optarg, &corelist, NULL) < 0) {
+			RTE_LOG(ERR, EAL, "invalid parameter for --"
+				OPT_SELF_DISCOVERY "\n");
+			return -1;
+		}
+
+		if (eal_parse_corelist(corelist) < 0) {
+			RTE_LOG(ERR, EAL, "invalid core list\n");
+			return -1;
+		}
+		/* Save it here for memory limit */
+		internal_config.self_discovery = strdup(optarg);
+
+		break;
+	}
+
 	/* don't know what to do, leave this to caller */
 	default:
 		return 1;
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..f3c8e31 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -83,6 +83,7 @@ struct internal_config {
 	volatile enum rte_intr_mode vfio_intr_mode;
 	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
 	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
+	const char *self_discovery;       /**< specific type of self_discovery */
 
 	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
 	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..a499d73 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_SELF_DISCOVERY    "self-discovery"
+	OPT_SELF_DISCOVERY_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index d2816a8..ff81484 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -220,6 +220,40 @@ int rte_eal_has_hugepages(void);
 int rte_sys_gettid(void);
 
 /**
+ * An API to query resource self discovery.
+ *
+ * @type
+ *   Type of self resource discovery.
+ * @p_corelist
+ *   On success, filled with the list of usable cores. The caller must
+ *   free the returned string.
+ * @p_memory
+ *   On success, filled with how much memory (in bytes) can be used.
+ *
+ * @return
+ *   - (-1), if failed.
+ *   - 0, if succeed.
+ */
+int rte_eal_res_self_discovery(const char *type,
+			       char **p_corelist, uint64_t *p_memory);
+/**
+ * An API to apply resource through self discovery.
+ *
+ * @type
+ *   Type of self resource discovery.
+ * @enable_core
+ *   If non-zero, apply the discovered core resource.
+ * @enable_mem
+ *   If non-zero, apply the discovered memory resource.
+ *
+ * @return
+ *   - (-1), if failed.
+ *   - 0, if succeed.
+ */
+int rte_eal_res_self_discovery_apply(const char *type,
+				     int enable_core, int enable_mem);
+
+
+/**
  * Get system unique thread id.
  *
  * @return
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..834ae2f 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -87,6 +87,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_devargs.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_dev.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_options.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_cgroup.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += rte_malloc.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_elem.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_heap.c
diff --git a/lib/librte_eal/linuxapp/eal/eal_cgroup.c b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
new file mode 100644
index 0000000..d6a04ee
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
@@ -0,0 +1,294 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <mntent.h>
+#include <inttypes.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_common.h>
+
+#include "eal_internal_cfg.h"
+
+static int pid;
+
+static char *
+find_controller_dfs(const char *dir_path)
+{
+	FILE *f;
+	char *line;
+	char *ret;
+	size_t len;
+	ssize_t read;
+	DIR *dir;
+	struct dirent *ent;
+	char filepath[PATH_MAX];
+
+	// 1. check if this process belongs to this cgroup
+	snprintf(filepath, sizeof(filepath)-1, "%s/tasks", dir_path);
+	f = fopen(filepath, "r");
+	if (f == NULL)
+		return NULL;
+	len = 0;
+	line = NULL;
+	while ((read = getline(&line, &len, f)) != -1) {
+		int _pid = atoi(line);
+		free(line);
+		if (_pid == pid)
+			break;
+		len = 0;
+		line = NULL;
+	}
+	fclose(f);
+	if (read != -1)
+		return strdup(dir_path);
+
+	// 2. check its child cgroup
+	if (!(dir = opendir(dir_path)))
+		return NULL;
+
+	ret = NULL;
+	while ((ent = readdir(dir)) != NULL) {
+		if (ent->d_type != DT_DIR)
+			continue;
+		if (strcmp(ent->d_name, ".") == 0 ||
+		    strcmp(ent->d_name, "..") == 0)
+			continue;
+
+		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
+			 dir_path, ent->d_name);
+
+		ret = find_controller_dfs(filepath);
+		if (ret != NULL)
+			break;
+	}
+
+	closedir(dir);
+	return ret;
+}
+
/*
 * Locate the cgroup directory of the given controller (e.g. "cpuset"
 * or "hugetlb") that the current process belongs to.
 *
 * The controller's mount point is looked up in /proc/mounts (a mount
 * of type "cgroup" whose mount options name the controller), then the
 * hierarchy below that mount point is searched for the cgroup whose
 * tasks file lists our pid.
 *
 * Returns a malloc'd directory path (caller must free), or NULL if the
 * controller is not mounted or the process cannot be located.
 */
static char *
find_controller(const char *controller)
{
	FILE *f;
	char *path;
	struct mntent *ent;

	static const char *proc_mounts = "/proc/mounts";
	static const char *fs_type = "cgroup";

	f = setmntent(proc_mounts, "r");
	if (f == NULL) {
		RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_mounts);
		return NULL;
	}

	/* stop at the first cgroup mount whose options include the
	 * requested controller name (hasmntopt matches option words)
	 */
	while (NULL != (ent = getmntent(f))) {
		if (strcmp(ent->mnt_type, fs_type) != 0)
			continue;
		if (hasmntopt(ent, controller) == NULL)
			continue;
		break;
	}

	/* ent == NULL means we ran off the end without a match */
	if (ent == NULL) {
		path = NULL;
		goto end;
	}

	/* walk the mounted hierarchy looking for our own cgroup */
	path = find_controller_dfs(ent->mnt_dir);
end:
	endmntent(f);
	return path;
}
+
/*
 * Read the first line of the file at "path" into a freshly allocated
 * buffer, with any trailing newline stripped.
 *
 * Returns a malloc'd string (caller must free), or NULL if the file
 * cannot be opened or contains no line.
 */
static inline char *
get_oneline_from_file(const char *path)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	if (NULL == (f = fopen(path, "r")))
		return NULL;
	if (getline(&line, &len, f) == -1) {
		/* fix: the original dereferenced line (NULL) here;
		 * getline() may also allocate a buffer even on failure
		 */
		free(line);
		line = NULL;
	} else {
		/* strip the trailing newline, if any */
		line[strcspn(line, "\n")] = 0;
	}
	fclose(f);
	return line;
}
+
+static int
+cgroup_cpuset(char **p_corelist, int enable __rte_unused)
+{
+	char filepath[PATH_MAX];
+	char *controller;
+
+       	controller = find_controller("cpuset");
+	if (controller == NULL)
+		return -1;
+
+	snprintf(filepath, sizeof(filepath)-1, "%s/cpuset.cpus", controller);
+	*p_corelist = get_oneline_from_file(filepath);
+	RTE_LOG(INFO, EAL, "cgroup cpuset: %s\n", *p_corelist);
+	return 0;
+
+}
+
/*
 * Parse the numeric limit stored in a hugetlb limit_in_bytes file.
 *
 * Returns the limit in bytes, or 0 if the file cannot be read or does
 * not start with an unsigned integer.
 */
static inline uint64_t
get_hugetlb_limit(const char *path)
{
	uint64_t limit = 0;
	char *str;

	str = get_oneline_from_file(path);
	/* fix: str may be NULL (unreadable file); the original passed
	 * it straight to sscanf and left limit uninitialized on a
	 * failed parse
	 */
	if (str != NULL) {
		if (sscanf(str, "%"PRIu64, &limit) != 1)
			limit = 0;
		free(str);
	}
	return limit;
}
+
+static int
+cgroup_hugetlb(uint64_t *p_memory, int enable)
+{
+	unsigned i;
+	char filepath[PATH_MAX];
+	char *controller;
+	DIR *dir;
+	struct dirent *ent;
+	uint64_t memory = 0;
+	static char prefix[] = "hugetlb";
+	static int prefix_len = sizeof(prefix) - 1;
+	static char suffix[] = "limit_in_bytes";
+
+       	controller = find_controller("hugetlb");
+	if (controller == NULL)
+		return -1;
+
+	if (!(dir = opendir(controller)))
+		return -1;
+
+	while ((ent = readdir(dir)) != NULL) {
+		if (strncmp(ent->d_name, prefix, prefix_len) != 0)
+			continue;
+
+		char *sz_beg = ent->d_name + prefix_len + 1;
+		char *sz_end = strchr(sz_beg, '.');
+
+		if (strcmp(sz_end + 1, suffix) != 0)
+			continue;
+
+		char *tmp = strndup(sz_beg, sz_end - sz_beg);
+		uint64_t pagesize = rte_str_to_size(tmp);
+		free(tmp);
+
+		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
+			 controller, ent->d_name);
+		uint64_t m_limit = get_hugetlb_limit(filepath);
+		memory += m_limit;
+
+		/* Record those information into internal_config if hugepages
+		 * are already initialized.
+		 */
+		if (! enable)
+			continue;
+		for (i = 0; i < internal_config.num_hugepage_sizes; ++i) {
+			struct hugepage_info *hp;
+
+		       	hp = &internal_config.hugepage_info[i];
+			if (hp->hugepage_sz != pagesize)
+				continue;
+
+			if (m_limit < hp->hugepage_sz * hp->num_pages[0])
+				hp->num_pages[0] = m_limit / hp->hugepage_sz;
+		}
+	}
+
+	closedir(dir);
+	*p_memory = memory;
+	RTE_LOG(INFO, EAL, "cgroup hugetlb: %"PRIx64"\n", *p_memory);
+	return 0;
+}
+
+static int
+resource_self_discovery(const char *type, char **p_corelist, int enable_core,
+			uint64_t *p_memory, int enable_mem)
+{
+	if (strcmp(type, "cgroup") != 0) {
+		RTE_LOG(ERR, EAL, "type not supported: %s\n", type);
+		return -1;
+	}
+
+	pid = getpid();
+
+	if (p_corelist != NULL && cgroup_cpuset(p_corelist, enable_core) < 0) {
+		RTE_LOG(ERR, EAL, "Failed when discover resource cpuset\n");
+		return -1;
+	}
+	if (p_memory != NULL && cgroup_hugetlb(p_memory, enable_mem) < 0) {
+		RTE_LOG(ERR, EAL, "Failed when discover resource hugetlb\n");
+		return -1;
+	}
+
+	return 0;
+}
+
/*
 * Public query entry point: discover (but do not apply) the core list
 * and/or memory limit available to this process. Both enable flags are
 * passed as 0, so the helpers only report values and write nothing back
 * into internal_config. See rte_eal.h for the parameter contract.
 */
int
rte_eal_res_self_discovery(const char *type, char **p_corelist,
			   uint64_t *p_memory)
{
	return resource_self_discovery(type, p_corelist, 0, p_memory, 0);
}
+
/*
 * Public apply entry point: discover resources and let the helpers
 * write the results into DPDK's internal configuration. Only resources
 * whose enable flag is non-zero are queried; the discovered values
 * themselves are discarded here.
 */
int
rte_eal_res_self_discovery_apply(const char *type, int enable_core,
				 int enable_mem)
{
	char *corelist;
	uint64_t mem;

	return resource_self_discovery(type,
				       enable_core ? &corelist : NULL,
				       enable_core,
				       enable_mem ? &mem : NULL,
				       enable_mem);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 18858e2..a6b6548 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -354,6 +354,11 @@ eal_hugepage_info_init(void)
 	qsort(&internal_config.hugepage_info[0], num_sizes,
 	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
 
+	/* Apply cgroup hugetlb limit before we really use hugepages */
+	if (internal_config.self_discovery)
+		rte_eal_res_self_discovery_apply(internal_config.self_discovery,
+						 0, 1);
+
 	/* now we have all info, check we have at least one valid size */
 	for (i = 0; i < num_sizes; i++)
 		if (internal_config.hugepage_info[i].hugedir != NULL &&
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
@ 2016-01-25 13:46 ` Neil Horman
  2016-01-26  2:22   ` Tan, Jianfeng
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 63+ messages in thread
From: Neil Horman @ 2016-01-25 13:46 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, yuanhan.liu

On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> Current issue: DPDK is not that friendly to container environment usage.
> It's because that it pre-alloc resource like cores and hugepages from cmd
> line options. So for a DPDK application, it's necessary to check how much
> resource is allocated to a container and then use that as an reference.
> 
> To address that, this patch introduces two APIs:
>    a. rte_eal_res_self_discovery, to query how much resource can be used.
>    b. rte_eal_res_self_discovery_apply, to apply self-discovered resource
>       into DPDK.
> 
> Currently only Linux CGroup is added, similarly, we can add BSD jail as
> well in the future. And even in Linux, there could be other way to query
> and apply resources, like through a centralized daemon.
> 
> Known issue: current way to read individual attributes of cgroups directly
> instead of via systemd's API is not a long-term solution. Please refer to
> http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
> for more information.
> 
> Test example:
>     a. cgcreate -g cpuset,hugetlb:/test-subgroup
>     b. cgset -r cpuset.cpus=2-3 test-subgroup
>     c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>     d. cgexec -g cpuset,hugetlb:test-subgroup \
> 	    ./examples/l2fwd/build/l2fwd --self-discovery=cgroup -n 4 -- -p 3
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_options.c      |  39 ++++
>  lib/librte_eal/common/eal_internal_cfg.h        |   1 +
>  lib/librte_eal/common/eal_options.h             |   2 +
>  lib/librte_eal/common/include/rte_eal.h         |  34 +++
>  lib/librte_eal/linuxapp/eal/Makefile            |   1 +
>  lib/librte_eal/linuxapp/eal/eal_cgroup.c        | 294 ++++++++++++++++++++++++
>  lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |   5 +
>  7 files changed, 376 insertions(+)
>  create mode 100644 lib/librte_eal/linuxapp/eal/eal_cgroup.c
> 
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..7235473 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>  	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>  	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>  	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_SELF_DISCOVERY,    1, NULL, OPT_SELF_DISCOVERY_NUM   },
>  	{0,                     0, NULL, 0                        }
>  };
>  
> @@ -128,6 +129,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
>  	internal_cfg->force_nchannel = 0;
>  	internal_cfg->hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
>  	internal_cfg->hugepage_dir = NULL;
> +	internal_cfg->self_discovery = NULL;
>  	internal_cfg->force_sockets = 0;
>  	/* zero out the NUMA config */
>  	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> @@ -755,6 +757,24 @@ eal_parse_proc_type(const char *arg)
>  }
>  
>  int
> +__attribute__((weak))
> +rte_eal_res_self_discovery(const char *type __rte_unused,
> +			   char **p_corelist __rte_unused,
> +			   uint64_t *p_memory __rte_unused)
> +{
> +	return -1;
> +}
> +
> +int
> +__attribute__((weak))
> +rte_eal_res_self_discovery_apply(const char *type __rte_unused,
> +				 int enable_core __rte_unused,
> +				 int enable_mem __rte_unused)
> +{
> +	return -1;
> +}
> +
> +int
>  eal_parse_common_option(int opt, const char *optarg,
>  			struct internal_config *conf)
>  {
> @@ -897,6 +917,25 @@ eal_parse_common_option(int opt, const char *optarg,
>  		}
>  		break;
>  
> +	case OPT_SELF_DISCOVERY_NUM: {
> +		char *corelist;
> +
> +		if (rte_eal_res_self_discovery(optarg, &corelist, NULL) < 0) {
> +			RTE_LOG(ERR, EAL, "invalid parameter for --"
> +				OPT_SELF_DISCOVERY "\n");
> +			return -1;
> +		}
> +
> +		if (eal_parse_corelist(corelist) < 0) {
> +			RTE_LOG(ERR, EAL, "invalid core list\n");
> +			return -1;
> +		}
> +		/* Save it here for memory limit */
> +		internal_config.self_discovery = strdup(optarg);
> +
> +		break;
> +	}
> +
>  	/* don't know what to do, leave this to caller */
>  	default:
>  		return 1;
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..f3c8e31 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -83,6 +83,7 @@ struct internal_config {
>  	volatile enum rte_intr_mode vfio_intr_mode;
>  	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
>  	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
> +	const char *self_discovery;       /**< specific type of self_discovery */
>  
>  	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
>  	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..a499d73 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>  	OPT_VMWARE_TSC_MAP_NUM,
>  #define OPT_XEN_DOM0          "xen-dom0"
>  	OPT_XEN_DOM0_NUM,
> +#define OPT_SELF_DISCOVERY    "self-discovery"
> +	OPT_SELF_DISCOVERY_NUM,
>  	OPT_LONG_MAX_NUM
>  };
>  
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index d2816a8..ff81484 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -220,6 +220,40 @@ int rte_eal_has_hugepages(void);
>  int rte_sys_gettid(void);
>  
>  /**
> + * An API to query resource self discovery.
> + *
> + * @type
> + *   Type of self resource discovery.
> + * @p_corelist
> + *   If succeed, fill core list which can be used. Caller to free.
> + * @p_memory
> + *   If succeed, fill how many (bytes) memory can be used.
> + *
> + * @return
> + *   - (-1), if failed.
> + *   - 0, if succeed.
> + */
> +int rte_eal_res_self_discovery(const char *type,
> +			       char **p_corelist, uint64_t *p_memory);
> +/**
> + * An API to apply resource through self discovery.
> + *
> + * @type
> + *   Type of self resource discovery.
> + * @enable_core
> + *   If succeed, apply core resource.
> + * @p_memory
> + *   If succeed, apply memory resource.
> + *
> + * @return
> + *   - (-1), if failed.
> + *   - 0, if succeed.
> + */
> +int rte_eal_res_self_discovery_apply(const char *type,
> +				     int enable_core, int enable_mem);
> +
> +
> +/**
>   * Get system unique thread id.
>   *
>   * @return
> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
> index 26eced5..834ae2f 100644
> --- a/lib/librte_eal/linuxapp/eal/Makefile
> +++ b/lib/librte_eal/linuxapp/eal/Makefile
> @@ -87,6 +87,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_devargs.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_dev.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_options.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_thread.c
> +SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_cgroup.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += rte_malloc.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_elem.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_heap.c
> diff --git a/lib/librte_eal/linuxapp/eal/eal_cgroup.c b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
> new file mode 100644
> index 0000000..d6a04ee
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
> @@ -0,0 +1,294 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +#include <sys/stat.h>
> +#include <sys/file.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <mntent.h>
> +#include <inttypes.h>
> +
> +#include <rte_log.h>
> +#include <rte_eal.h>
> +#include <rte_common.h>
> +
> +#include "eal_internal_cfg.h"
> +
> +static int pid;
> +
> +static char *
> +find_controller_dfs(const char *dir_path)
> +{
> +	FILE *f;
> +	char *line;
> +	char *ret;
> +	size_t len;
> +	ssize_t read;
> +	DIR *dir;
> +	struct dirent *ent;
> +	char filepath[PATH_MAX];
> +
> +	// 1. check if this process belongs to this cgroup
> +	snprintf(filepath, sizeof(filepath)-1, "%s/tasks", dir_path);
> +	f = fopen(filepath, "r");
> +	if (f == NULL)
> +		return NULL;
> +	len = 0;
> +	line = NULL;
> +	while ((read = getline(&line, &len, f)) != -1) {
> +		int _pid = atoi(line);
> +		free(line);
> +		if (_pid == pid)
> +			break;
> +		len = 0;
> +		line = NULL;
> +	}
> +	fclose(f);
> +	if (read != -1)
> +		return strdup(dir_path);
> +
> +	// 2. check its child cgroup
> +	if (!(dir = opendir(dir_path)))
> +		return NULL;
> +
> +	ret = NULL;
> +	while ((ent = readdir(dir)) != NULL) {
> +		if (ent->d_type != DT_DIR)
> +			continue;
> +		if (strcmp(ent->d_name, ".") == 0 ||
> +		    strcmp(ent->d_name, "..") == 0)
> +			continue;
> +
> +		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
> +			 dir_path, ent->d_name);
> +
> +		ret = find_controller_dfs(filepath);
> +		if (ret != NULL)
> +			break;
> +	}
> +
> +	closedir(dir);
> +	return ret;
> +}
> +
> +static char *
> +find_controller(const char *controller)
> +{
> +	FILE *f;
> +	char *path;
> +	struct mntent *ent;
> +
> +	static const char *proc_mounts = "/proc/mounts";
> +	static const char *fs_type = "cgroup";
> +
> +	f = setmntent(proc_mounts, "r");
> +	if (f == NULL) {
> +		RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_mounts);
> +		return NULL;
> +	}
> +
> +	while (NULL != (ent = getmntent(f))) {
> +		if (strcmp(ent->mnt_type, fs_type) != 0)
> +			continue;
> +		if (hasmntopt(ent, controller) == NULL)
> +			continue;
> +		break;
> +	}
> +
> +	if (ent == NULL) {
> +		path = NULL;
> +		goto end;
> +	}
> +
> +	path = find_controller_dfs(ent->mnt_dir);
> +end:
> +	endmntent(f);
> +	return path;
> +}
> +
> +static inline char *
> +get_oneline_from_file(const char *path)
> +{
> +	FILE *f;
> +	char *line = NULL;
> +	size_t len = 0;
> +
> +	if (NULL == (f = fopen(path, "r")))
> +		return NULL;
> +	if (getline(&line, &len, f) == -1)
> +		line = NULL;
> +	line[strcspn(line, "\n")] = 0;
> +	fclose(f);
> +	return line;
> +}
> +
> +static int
> +cgroup_cpuset(char **p_corelist, int enable __rte_unused)
> +{
> +	char filepath[PATH_MAX];
> +	char *controller;
> +
> +       	controller = find_controller("cpuset");
> +	if (controller == NULL)
> +		return -1;
> +
> +	snprintf(filepath, sizeof(filepath)-1, "%s/cpuset.cpus", controller);
> +	*p_corelist = get_oneline_from_file(filepath);
> +	RTE_LOG(INFO, EAL, "cgroup cpuset: %s\n", *p_corelist);
> +	return 0;
> +
> +}
> +
> +static inline uint64_t
> +get_hugetlb_limit(const char *path)
> +{
> +	uint64_t limit;
> +	char *str;
> +
> +       	str = get_oneline_from_file(path);
> +	sscanf(str, "%"PRIu64, &limit);
> +	free(str);
> +	return limit;
> +}
> +
> +static int
> +cgroup_hugetlb(uint64_t *p_memory, int enable)
> +{
> +	unsigned i;
> +	char filepath[PATH_MAX];
> +	char *controller;
> +	DIR *dir;
> +	struct dirent *ent;
> +	uint64_t memory = 0;
> +	static char prefix[] = "hugetlb";
> +	static int prefix_len = sizeof(prefix) - 1;
> +	static char suffix[] = "limit_in_bytes";
> +
> +       	controller = find_controller("hugetlb");
> +	if (controller == NULL)
> +		return -1;
> +
> +	if (!(dir = opendir(controller)))
> +		return -1;
> +
> +	while ((ent = readdir(dir)) != NULL) {
> +		if (strncmp(ent->d_name, prefix, prefix_len) != 0)
> +			continue;
> +
> +		char *sz_beg = ent->d_name + prefix_len + 1;
> +		char *sz_end = strchr(sz_beg, '.');
> +
> +		if (strcmp(sz_end + 1, suffix) != 0)
> +			continue;
> +
> +		char *tmp = strndup(sz_beg, sz_end - sz_beg);
> +		uint64_t pagesize = rte_str_to_size(tmp);
> +		free(tmp);
> +
> +		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
> +			 controller, ent->d_name);
> +		uint64_t m_limit = get_hugetlb_limit(filepath);
> +		memory += m_limit;
> +
> +		/* Record those information into internal_config if hugepages
> +		 * are already initialized.
> +		 */
> +		if (! enable)
> +			continue;
> +		for (i = 0; i < internal_config.num_hugepage_sizes; ++i) {
> +			struct hugepage_info *hp;
> +
> +		       	hp = &internal_config.hugepage_info[i];
> +			if (hp->hugepage_sz != pagesize)
> +				continue;
> +
> +			if (m_limit < hp->hugepage_sz * hp->num_pages[0])
> +				hp->num_pages[0] = m_limit / hp->hugepage_sz;
> +		}
> +	}
> +
> +	closedir(dir);
> +	*p_memory = memory;
> +	RTE_LOG(INFO, EAL, "cgroup hugetlb: %"PRIx64"\n", *p_memory);
> +	return 0;
> +}
> +
> +static int
> +resource_self_discovery(const char *type, char **p_corelist, int enable_core,
> +			uint64_t *p_memory, int enable_mem)
> +{
> +	if (strcmp(type, "cgroup") != 0) {
> +		RTE_LOG(ERR, EAL, "type not supported: %s\n", type);
> +		return -1;
> +	}
> +
> +	pid = getpid();
> +
> +	if (p_corelist != NULL && cgroup_cpuset(p_corelist, enable_core) < 0) {
> +		RTE_LOG(ERR, EAL, "Failed when discover resource cpuset\n");
> +		return -1;
> +	}
> +	if (p_memory != NULL && cgroup_hugetlb(p_memory, enable_mem) < 0) {
> +		RTE_LOG(ERR, EAL, "Failed when discover resource hugetlb\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +int
> +rte_eal_res_self_discovery(const char *type, char **p_corelist,
> +			   uint64_t *p_memory)
> +{
> +	return resource_self_discovery(type, p_corelist, 0, p_memory, 0);
> +}
> +
> +int
> +rte_eal_res_self_discovery_apply(const char *type, int enable_core,
> +				 int enable_mem)
> +{
> +	char *corelist, **pc = NULL;
> +	uint64_t mem, *pm = NULL;
> +	
> +	if (enable_core)
> +		pc = &corelist;
> +	if (enable_mem)
> +		pm = &mem;
> +
> +	return resource_self_discovery(type, pc, enable_core,
> +				       pm, enable_mem);
> +}
> diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> index 18858e2..a6b6548 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> @@ -354,6 +354,11 @@ eal_hugepage_info_init(void)
>  	qsort(&internal_config.hugepage_info[0], num_sizes,
>  	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
>  
> +	/* Apply cgroup hugetlb limit before we really use hugepages */
> +	if (internal_config.self_discovery)
> +		rte_eal_res_self_discovery_apply(internal_config.self_discovery,
> +						 0, 1);
> +
>  	/* now we have all info, check we have at least one valid size */
>  	for (i = 0; i < num_sizes; i++)
>  		if (internal_config.hugepage_info[i].hugedir != NULL &&
> -- 
> 2.1.4
> 
> 


This doesn't make a whole lot of sense, for several reasons:

1) Applications, as a general rule shouldn't be interrogating the cgroups
interface at all.  

2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
(the isolcpus command line argument, or a taskset on a parent process for
instance, but there are several others).

Instead of trying to figure out what cpuset is valid for your process by
interrogating the cgroups hierarchy, you should follow the prescribed
method of calling sched_getaffinity after calling sched_setaffinity.  That will
give you the canonical cpuset that you are executing on, taking all cpuset
filters into account (including cgroups and any other restrictions).  It's far
simpler as well, as it doesn't require a ton of file/string processing.

Neil

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-25 13:46 ` Neil Horman
@ 2016-01-26  2:22   ` Tan, Jianfeng
  2016-01-26 14:19     ` Neil Horman
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-01-26  2:22 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev, yuanhan.liu


Hi Neil,

On 1/25/2016 9:46 PM, Neil Horman wrote:
> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
...
>> -- 
>> 2.1.4
>>
>>
>
> This doesn't make a whole lot of sense, for several reasons:
>
> 1) Applications, as a general rule shouldn't be interrogating the cgroups
> interface at all.

The main reason to do this in DPDK is that DPDK obtains resource 
information from sysfs and proc, which are not well containerized so 
far. And DPDK pre-allocates resources instead of allocating them 
gradually on demand.

>
> 2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> (the isolcpus command line argument, or a taskset on a parent process for
> instance, but there are several others).

Yes, I agree. To enable that, I'd like to design the new API for resource 
self discovery in a flexible way. A parameter "type" is used to specify 
the discovery method. In addition, I'm considering adding a 
callback function pointer so that users can write their own resource 
discovery functions.

>
> Instead of trying to figure out what cpuset is valid for your process by
> interrogating the cgroups heirarchy, instead you should follow the proscribed
> method of calling sched_getaffinity after calling sched_setaffinity.  That will
> give you the canonical cpuset that you are executing on, taking all cpuset
> filters into account (including cgroups and any other restrictions).  Its far
> simpler as well, as it doesn't require a ton of file/string processing.

Yes, this way is much better for cpuset discovery. But is there such a 
syscall for hugepages?

Thanks,
Jianfeng

>
> Neil
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-26  2:22   ` Tan, Jianfeng
@ 2016-01-26 14:19     ` Neil Horman
  2016-01-27 12:02       ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Neil Horman @ 2016-01-26 14:19 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, yuanhan.liu

On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> 
> Hi Neil,
> 
> On 1/25/2016 9:46 PM, Neil Horman wrote:
> >On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> ...
> >>-- 
> >>2.1.4
> >>
> >>
> >
> >This doesn't make a whole lot of sense, for several reasons:
> >
> >1) Applications, as a general rule shouldn't be interrogating the cgroups
> >interface at all.
> 
> The main reason to do this in DPDK is that DPDK obtains resource information
> from sysfs and proc, which are not well containerized so far. And DPDK
> pre-allocates resource instead of on-demand gradual allocating.
> 
Not disagreeing with this, just suggesting that:

1) Interrogating cgroups really isn't the best way to collect that information
2) Pre-allocating those resources isn't particularly wise without some mechanism
to reallocate it, as resource constraints can change (consider your cpuset
getting rewritten)

> >
> >2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> >(the isolcpus command line argument, or a taskset on a parent process for
> >instance, but there are several others).
> 
> Yes, I agree. To enable that, I'd like design the new API for resource self
> discovery in a flexible way. A parameter "type" is used to specify the
> solution to discovery way. In addition, I'm considering to add a callback
> function pointer so that users can write their own resource discovery
> functions.
> 
Why?  You don't need an API for this, or if you really want one, it can be very
generic if you use POSIX apis to gather the information.  What you have here is
going to be very linux specific, and will need reimplementing for BSD or other
operating systems.  To use the cpuset example, instead of reading and parsing
the mask files in the cgroup filesystem module to find your task and
corresponding mask, just call sched_setaffinity with an all f's mask, then call
sched_getaffinity.  The returned mask will be all the cpus your process is
allowed to execute on, taking into account every limiting filter the system you
are running on offers.

There are similar OS-level POSIX APIs for most resources out there.  You really
don't need to dig through cgroups just to learn what some of those resources are.

> >
> >Instead of trying to figure out what cpuset is valid for your process by
> >interrogating the cgroups heirarchy, instead you should follow the proscribed
> >method of calling sched_getaffinity after calling sched_setaffinity.  That will
> >give you the canonical cpuset that you are executing on, taking all cpuset
> >filters into account (including cgroups and any other restrictions).  Its far
> >simpler as well, as it doesn't require a ton of file/string processing.
> 
> Yes, this way is much better for cpuset discovery. But is there such a
> syscall for hugepages?
> 
In what capacity?  Interrogating how many hugepages you have, or what node
they are affined to?  Capacity would require reading the requisite proc file, as
there's no POSIX API for this resource.  Node affinity can be implied by setting
the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
kernel will attempt to distribute hugepages evenly among the tasks' numa policy
configuration.

That said, I would advise that you strongly consider not exporting hugepages as
a resource, as:

a) Applications generally don't need to know that they are using hugepages, and
so they dont need to know where said hugepages live, they just allocate memory
via your allocation api and you give them something appropriate

b) Hugepages are a resource that is very specific to Linux, and to x86 Linux at
that.  Some OSes implement similar resources, but they may have very different
semantics.  And other arches may or may not implement various forms of compound
paging at all.  As the DPDK expands to support more OSes and arches, it would
be nice to ensure that the programming surfaces that you expose have a
broader level of support.

Neil

> Thanks,
> Jianfeng
> 
> >
> >Neil
> >
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-26 14:19     ` Neil Horman
@ 2016-01-27 12:02       ` Tan, Jianfeng
  2016-01-27 17:30         ` Neil Horman
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-01-27 12:02 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev, yuanhan.liu

Hi Neil,

On 1/26/2016 10:19 PM, Neil Horman wrote:
> On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
>> Hi Neil,
>>
>> On 1/25/2016 9:46 PM, Neil Horman wrote:
>>> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
>> ...
>>>> -- 
>>>> 2.1.4
>>>>
>>>>
>>> This doesn't make a whole lot of sense, for several reasons:
>>>
>>> 1) Applications, as a general rule shouldn't be interrogating the cgroups
>>> interface at all.
>> The main reason to do this in DPDK is that DPDK obtains resource information
>> from sysfs and proc, which are not well containerized so far. And DPDK
>> pre-allocates resource instead of on-demand gradual allocating.
>>
> Not disagreeing with this, just suggesting that:
>
> 1) Interrogating cgroups really isn't the best way to collect that information
> 2) Pre-allocating those resources isn't particularly wise without some mechanism
> to reallocate it, as resource constraints can change (consider your cpuset
> getting rewritten)

In the case of reallocation,
For cpuset, DPDK panics in the initialization if set_affinity fails, but 
after that, a cpuset rewrite will not bring any problems, I believe.
For memory, if a running application uses 2G hugepages and the admin then 
decreases the hugetlb cgroup to 1G, the application will not get killed, 
unless it tries to access more hugepages (I'll double check this).

So another way to address this problem is to add an option so that DPDK 
tries its best to allocate those resources, and if that fails, it just posts 
a warning and uses the resources it did allocate, instead of panicking. What 
do you think?

>
>>> 2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
>>> (the isolcpus command line argument, or a taskset on a parent process for
>>> instance, but there are several others).
>> Yes, I agree. To enable that, I'd like design the new API for resource self
>> discovery in a flexible way. A parameter "type" is used to specify the
>> solution to discovery way. In addition, I'm considering to add a callback
>> function pointer so that users can write their own resource discovery
>> functions.
>>
> Why?  You don't need an API for this, or if you really want one, it can be very
> generic if you use POSIX apis to gather the information.  What you have here is
> going to be very linux specific, and will need reimplementing for BSD or other
> operating systems.  To use the cpuset example, instead of reading and parsing
> the mask files in the cgroup filesystem module to find your task and
> corresponding mask, just call sched_setaffinity with an all f's mask, then call
> sched_getaffinity.  The returned mask will be all the cpus your process is
> allowed to execute on, taking into account every limiting filter the system you
> are running on offers.

Yes, it makes sense on cpu's side.

>
> There are simmilar OS level POSIX apis for most resources out there.  You really
> don't need to dig through cgroups just to learn what some of those reources are.
>
>>> Instead of trying to figure out what cpuset is valid for your process by
>>> interrogating the cgroups heirarchy, instead you should follow the proscribed
>>> method of calling sched_getaffinity after calling sched_setaffinity.  That will
>>> give you the canonical cpuset that you are executing on, taking all cpuset
>>> filters into account (including cgroups and any other restrictions).  Its far
>>> simpler as well, as it doesn't require a ton of file/string processing.
>> Yes, this way is much better for cpuset discovery. But is there such a
>> syscall for hugepages?
>>
> In what capacity?  Interrogating how many hugepages you have, or to what node
> they are affined to?  Capacity would require reading the requisite proc file, as
> theres no posix api for this resource.  Node affinity can be implied by setting
> the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
> kernel will attempt to distribute hugepages evenly among the tasks' numa policy
> configuration.

For memory affinity, I believe the existing way of reading 
/proc/self/pagemap already handle the problem. What I was asking is how 
much memory (or hugepages in Linux's case) can be used. By the way, what 
is /proc/nr_hugepages?

>
> That said, I would advise that you strongly consider not exporting hugepages as
> a resource, as:
>
> a) Applications generally don't need to know that they are using hugepages, and
> so they dont need to know where said hugepages live, they just allocate memory
> via your allocation api and you give them something appropriate

But the allocation api provider, DPDK library, needs to know if it's 
using hugepages or not.

> b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
> that.  Some OS implement simmilar resources, but they may have very different
> semantics.  And other Arches may or may not implement various forms of compound
> paging at all.  As the DPDK expands to support more OS'es and arches, it would
> be nice to ensure that the programming surfaces that you expose have a more
> broad level of support.

That's why I put the current implementation in lib/librte_eal/linuxapp/. And 
the new API uses the words cores and memory, which are very generic IMO. 
In Linux's context, memory is interpreted as hugepages (maybe not 
correct because DPDK can be used with 4K memory). For other OSes, we 
could add similar limitations in their semantics.


Thanks,
Jianfeng

>
> Neil
>
>> Thanks,
>> Jianfeng
>>
>>> Neil
>>>
>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-27 12:02       ` Tan, Jianfeng
@ 2016-01-27 17:30         ` Neil Horman
  0 siblings, 0 replies; 63+ messages in thread
From: Neil Horman @ 2016-01-27 17:30 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, yuanhan.liu

On Wed, Jan 27, 2016 at 08:02:27PM +0800, Tan, Jianfeng wrote:
> Hi Neil,
> 
> On 1/26/2016 10:19 PM, Neil Horman wrote:
> >On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> >>Hi Neil,
> >>
> >>On 1/25/2016 9:46 PM, Neil Horman wrote:
> >>>On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> >>...
> >>>>-- 
> >>>>2.1.4
> >>>>
> >>>>
> >>>This doesn't make a whole lot of sense, for several reasons:
> >>>
> >>>1) Applications, as a general rule shouldn't be interrogating the cgroups
> >>>interface at all.
> >>The main reason to do this in DPDK is that DPDK obtains resource information
> >>from sysfs and proc, which are not well containerized so far. And DPDK
> >>pre-allocates resource instead of on-demand gradual allocating.
> >>
> >Not disagreeing with this, just suggesting that:
> >
> >1) Interrogating cgroups really isn't the best way to collect that information
> >2) Pre-allocating those resources isn't particularly wise without some mechanism
> >to reallocate it, as resource constraints can change (consider your cpuset
> >getting rewritten)
> 
> In the case of reallocate,
> For cpuset, DPDK panics in the initialization if set_affinity fails, but
> after that, cpuset rewritten will not bring any problem I believe.
Yes, that seems reasonable, but I think you need to update
rte_thread_set_affinity to not assume that success in pthread_setaffinity_np
means that all cpus in the provided mask are available.  That is to say, cpusetp
is subsequently stored in lcore information after the set, but may not reflect
the actual working set of processors; you should follow a successful set with a
call to pthread_getaffinity_np to retrieve the actual working cpuset.

As for subsequent changes to the cpuset, I'm not sure how you want to handle
that. I would think that you might want to run a check periodically or allow for
a SIGHUP or some other signal to trigger a rescan of your working cpuset so as
to keep the application in sync with the system.

> For memory, a running application uses 2G hugepages, then admin decreases
> hugetlb cgroup into 1G, the application will not get killed, unless it tries
> to access more hugepages (I'll double check this).
> 
No, the semantics should be identical to malloc/mmap (if you use the alloc_hugepages
api or the mmap api).  You should get a NULL return or other non-fatal indicator
if you allocate more than is available.

> So another way to address this problem is to add an option that DPDK tries
> best to allocate those resources, and if fails, it just posts a warning and
> uses those allocated resources, instead of panic. What do you think?
> 
Yes, that makes sense

> >
> >>>2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> >>>(the isolcpus command line argument, or a taskset on a parent process for
> >>>instance, but there are several others).
> >>Yes, I agree. To enable that, I'd like design the new API for resource self
> >>discovery in a flexible way. A parameter "type" is used to specify the
> >>solution to discovery way. In addition, I'm considering to add a callback
> >>function pointer so that users can write their own resource discovery
> >>functions.
> >>
> >Why?  You don't need an API for this, or if you really want one, it can be very
> >generic if you use POSIX apis to gather the information.  What you have here is
> >going to be very linux specific, and will need reimplementing for BSD or other
> >operating systems.  To use the cpuset example, instead of reading and parsing
> >the mask files in the cgroup filesystem module to find your task and
> >corresponding mask, just call sched_setaffinity with an all f's mask, then call
> >sched_getaffinity.  The returned mask will be all the cpus your process is
> >allowed to execute on, taking into account every limiting filter the system you
> >are running on offers.
> 
> Yes, it makes sense on cpu's side.
> 
> >
> >There are simmilar OS level POSIX apis for most resources out there.  You really
> >don't need to dig through cgroups just to learn what some of those reources are.
> >
> >>>Instead of trying to figure out what cpuset is valid for your process by
> >>>interrogating the cgroups heirarchy, instead you should follow the proscribed
> >>>method of calling sched_getaffinity after calling sched_setaffinity.  That will
> >>>give you the canonical cpuset that you are executing on, taking all cpuset
> >>>filters into account (including cgroups and any other restrictions).  Its far
> >>>simpler as well, as it doesn't require a ton of file/string processing.
> >>Yes, this way is much better for cpuset discovery. But is there such a
> >>syscall for hugepages?
> >>
> >In what capacity?  Interrogating how many hugepages you have, or to what node
> >they are affined to?  Capacity would require reading the requisite proc file, as
> >theres no posix api for this resource.  Node affinity can be implied by setting
> >the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
> >kernel will attempt to distribute hugepages evenly among the tasks' numa policy
> >configuration.
> 
> For memory affinity, I believe the existing way of reading
> /proc/self/pagemap already handle the problem. What I was asking is how much
> memory (or hugepages in Linux's case) can be used. By the way, what is
> /proc/nr_hugepages?
> 
For affinity, you can parse /proc/self/pagemap or any number of other procfiles,
but again, doing so is going to be very OS specific, and doesn't get you much in
terms of resource management. It only tells you where the pages reside now.

/proc/nr_hugepages is the proc tunable that lets you allocate/realocate
hugepages.

> >
> >That said, I would advise that you strongly consider not exporting hugepages as
> >a resource, as:
> >
> >a) Applications generally don't need to know that they are using hugepages, and
> >so they dont need to know where said hugepages live, they just allocate memory
> >via your allocation api and you give them something appropriate
> 
> But the allocation api provider, DPDK library, needs to know if it's using
> hugepages or not.
> 
Right, but your purpose was to expose this library to applications.  I'm
saying you really don't need to expose such a library API to applications. If
you just want to use it internally in dpdk, that's fine.

> >b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
> >that.  Some OS implement simmilar resources, but they may have very different
> >semantics.  And other Arches may or may not implement various forms of compound
> >paging at all.  As the DPDK expands to support more OS'es and arches, it would
> >be nice to ensure that the programming surfaces that you expose have a more
> >broad level of support.
> 
> That's why I put current implement in lib/librte_eal/linuxapp/. And the new
> API uses the words of cores and memory, which is very generic IMO. In
> Linux's context, memory is interpreted into hugepages (maybe not correct
> because DPDK can be used with 4K memory). For other OSes, we could add
> similar limitation in their semantics.
> 
> 
> Thanks,
> Jianfeng
> 
> >
> >Neil
> >
> >>Thanks,
> >>Jianfeng
> >>
> >>>Neil
> >>>
> >>
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] eal: make resource initialization more robust
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
  2016-01-25 13:46 ` Neil Horman
@ 2016-01-29 11:22 ` Jianfeng Tan
  2016-02-01 18:08   ` Neil Horman
                     ` (2 more replies)
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
                   ` (4 subsequent siblings)
  6 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-01-29 11:22 UTC (permalink / raw)
  To: dev

Current issue: DPDK is not that friendly to container environments, which
is caused by the fact that it pre-allocates resources like cores and
hugepages. But there are various resource limitations, for example, cgroup,
rlimit, cpuset, etc.

For cores, this patch makes use of pthread_getaffinity_np to further
narrow down detected cores before parsing coremask (-c), corelist (-l),
and coremap (--lcores).

For hugepages, this patch adds a recovery mechanism for the case that
there are not as many hugepages available as requested. It relies on a
memory access to fault in hugepages, and if that fails with SIGBUS,
recovers to the previously saved stack environment with siglongjmp().

Test example:
    a. cgcreate -g cpuset,hugetlb:/test-subgroup
    b. cgset -r cpuset.cpus=2-3 test-subgroup
    c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
    d. cgexec -g cpuset,hugetlb:test-subgroup \
	    ./examples/l2fwd/build/l2fwd -n 4 -- -p 3

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_lcore.c | 10 +++-
 lib/librte_eal/linuxapp/eal/eal_memory.c | 78 ++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index a4263ba..8e9c675 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,13 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t cpuset;
+	pthread_t tid;
+
+	tid = pthread_self();
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			CPU_SET(lcore_id, &cpuset);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +77,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &cpuset)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 846fd31..837fd9e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
@@ -405,11 +413,26 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
 			hugepg_tbl[i].orig_va = virtaddr;
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (sigsetjmp(jmpenv, 0)) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				return i;
+			}
 			memset(virtaddr, 0, hugepage_sz);
 		}
 		else {
@@ -421,7 +444,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -429,7 +452,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1075,6 +1098,31 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover = 0;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover)
+		sigaction(SIGBUS, &action_old, NULL);
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1161,8 +1209,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1176,10 +1227,19 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			internal_config.memory -=
+				hpi->hugepage_sz * (pages_old - pages_new);
+			nr_hugepages -= (pages_old - pages_new);
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1226,6 +1286,8 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
@ 2016-02-01 18:08   ` Neil Horman
  2016-02-22  6:08   ` Tan, Jianfeng
  2016-02-28 21:12   ` Thomas Monjalon
  2 siblings, 0 replies; 63+ messages in thread
From: Neil Horman @ 2016-02-01 18:08 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev

On Fri, Jan 29, 2016 at 07:22:02PM +0800, Jianfeng Tan wrote:
> Current issue: DPDK is not that friendly to container environments,
> because it pre-allocates resources like cores and hugepages. But there
> are various resource limitations, for example, cgroup, rlimit,
> cpuset, etc.
> 
> For cores, this patch makes use of pthread_getaffinity_np to further
> narrow down detected cores before parsing coremask (-c), corelist (-l),
> and coremap (--lcores).
> 
> For hugepages, this patch adds a recovery mechanism for the case that
> there are not as many hugepages available as requested. It relies on a
> memory access to fault-in hugepages, and if that fails with SIGBUS,
> recovers to the previously saved stack environment with siglongjmp().
> 
> Test example:
>     a. cgcreate -g cpuset,hugetlb:/test-subgroup
>     b. cgset -r cpuset.cpus=2-3 test-subgroup
>     c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>     d. cgexec -g cpuset,hugetlb:test-subgroup \
> 	    ./examples/l2fwd/build/l2fwd -n 4 -- -p 3
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_lcore.c | 10 +++-
>  lib/librte_eal/linuxapp/eal/eal_memory.c | 78 ++++++++++++++++++++++++++++----
>  2 files changed, 79 insertions(+), 9 deletions(-)
> 

This looks alot better.  One minor comment, the sigbus handler, you should
probably store the previous bus handler and restore it after you map all the
hugepages you want (lest you overwrite something an application is doing with
sigbus).


Other than that, nice work.
Neil
 
> diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> index a4263ba..8e9c675 100644
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -57,6 +57,13 @@ rte_eal_cpu_init(void)
>  	struct rte_config *config = rte_eal_get_configuration();
>  	unsigned lcore_id;
>  	unsigned count = 0;
> +	rte_cpuset_t cpuset;
> +	pthread_t tid;
> +
> +	tid = pthread_self();
> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +			CPU_SET(lcore_id, &cpuset);
>  
>  	/*
>  	 * Parse the maximum set of logical cores, detect the subset of running
> @@ -70,7 +77,8 @@ rte_eal_cpu_init(void)
>  
>  		/* in 1:1 mapping, record related cpu detected state */
>  		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
> -		if (lcore_config[lcore_id].detected == 0) {
> +		if (lcore_config[lcore_id].detected == 0 ||
> +		    !CPU_ISSET(lcore_id, &cpuset)) {
>  			config->lcore_role[lcore_id] = ROLE_OFF;
>  			lcore_config[lcore_id].core_index = -1;
>  			continue;
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 846fd31..837fd9e 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,8 @@
>  #include <errno.h>
>  #include <sys/ioctl.h>
>  #include <sys/time.h>
> +#include <signal.h>
> +#include <setjmp.h>
>  
>  #include <rte_log.h>
>  #include <rte_memory.h>
> @@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>  	return addr;
>  }
>  
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
>  /*
>   * Mmap all hugepages of hugepage table: it first open a file in
>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		if (fd < 0) {
>  			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>  					strerror(errno));
> -			return -1;
> +			return i;
>  		}
>  
>  		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
> @@ -405,11 +413,26 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>  					strerror(errno));
>  			close(fd);
> -			return -1;
> +			return i;
>  		}
>  
>  		if (orig) {
>  			hugepg_tbl[i].orig_va = virtaddr;
> +			/* In linux, hugetlb limitations, like cgroup, are
> +			 * enforced at fault time instead of mmap(), even
> +			 * with the option of MAP_POPULATE. Kernel will send
> +			 * a SIGBUS signal. To avoid to be killed, save stack
> +			 * environment here, if SIGBUS happens, we can jump
> +			 * back here.
> +			 */
> +			if (sigsetjmp(jmpenv, 0)) {
> +				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
> +					"hugepages of size %u MB\n",
> +					(unsigned)(hugepage_sz / 0x100000));
> +				munmap(virtaddr, hugepage_sz);
> +				close(fd);
> +				return i;
> +			}
>  			memset(virtaddr, 0, hugepage_sz);
>  		}
>  		else {
> @@ -421,7 +444,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>  				__func__, strerror(errno));
>  			close(fd);
> -			return -1;
> +			return i;
>  		}
>  
>  		close(fd);
> @@ -429,7 +452,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		vma_addr = (char *)vma_addr + hugepage_sz;
>  		vma_len -= hugepage_sz;
>  	}
> -	return 0;
> +	return i;
>  }
>  
>  #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> @@ -1075,6 +1098,31 @@ calc_num_pages_per_socket(uint64_t * memory,
>  	return total_num_pages;
>  }
>  
> +static struct sigaction action_old;
> +static int need_recover = 0;
> +
> +static void
> +register_sigbus(void)
> +{
> +	sigset_t mask;
> +	struct sigaction action;
> +
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGBUS);
> +	action.sa_flags = 0;
> +	action.sa_mask = mask;
> +	action.sa_handler = sigbus_handler;
> +
> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +	if (need_recover)
> +		sigaction(SIGBUS, &action_old, NULL);
> +}
> +
>  /*
>   * Prepare physical memory mapping: fill configuration structure with
>   * these infos, return 0 on success.
> @@ -1161,8 +1209,11 @@ rte_eal_hugepage_init(void)
>  
>  	hp_offset = 0; /* where we start the current page size entries */
>  
> +	register_sigbus();
> +
>  	/* map all hugepages and sort them */
>  	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
> +		int pages_old, pages_new;
>  		struct hugepage_info *hpi;
>  
>  		/*
> @@ -1176,10 +1227,19 @@ rte_eal_hugepage_init(void)
>  			continue;
>  
>  		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			internal_config.memory -=
> +				hpi->hugepage_sz * (pages_old - pages_new);
> +			nr_hugepages -= (pages_old - pages_new);
> +			hpi->num_pages[0] = pages_new;
> +			if (pages_new == 0)
> +				continue;
>  		}
>  
>  		/* find physical addresses and sockets for each hugepage */
> @@ -1226,6 +1286,8 @@ rte_eal_hugepage_init(void)
>  #endif
>  	}
>  
> +	recover_sigbus();
> +
>  #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
>  	nr_hugefiles = 0;
>  	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
> -- 
> 2.1.4
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
  2016-02-01 18:08   ` Neil Horman
@ 2016-02-22  6:08   ` Tan, Jianfeng
  2016-02-22 13:18     ` Neil Horman
  2016-02-28 21:12   ` Thomas Monjalon
  2 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-02-22  6:08 UTC (permalink / raw)
  To: dev, nhorman

Hi Neil,

Sorry — due to a previous misconfiguration of my email client, I missed
this email.

> This looks alot better.  One minor comment, the sigbus handler, you should
> probably store the previous bus handler and restore it after you map 
> all the
> hugepages you want (lest you overwrite something an application is 
> doing with
> sigbus).
>

I did not catch your point. I did store it "static struct sigaction 
action_old" and recover it after mapping all the hugepages. Can you give 
more details on this?

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-02-22  6:08   ` Tan, Jianfeng
@ 2016-02-22 13:18     ` Neil Horman
  0 siblings, 0 replies; 63+ messages in thread
From: Neil Horman @ 2016-02-22 13:18 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev

On Mon, Feb 22, 2016 at 02:08:51PM +0800, Tan, Jianfeng wrote:
> Hi Neil,
> 
> Sorry that for my previous misconfiguration of email agent, I missed this
> email.
> 
> >This looks alot better.  One minor comment, the sigbus handler, you should
> >probably store the previous bus handler and restore it after you map all
> >the
> >hugepages you want (lest you overwrite something an application is doing
> >with
> >sigbus).
> >
> 
> I did not catch your point. I did store it "static struct sigaction
> action_old" and recover it after mapping all the hugepages. Can you give
> more details on this?
> 
Nope, I can't because I missed the fact you had done that.  Apologies, it looks
good.

Acked-by: Neil Horman <nhorman@tuxdriver.com>

> Thanks,
> Jianfeng
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
  2016-02-01 18:08   ` Neil Horman
  2016-02-22  6:08   ` Tan, Jianfeng
@ 2016-02-28 21:12   ` Thomas Monjalon
  2016-02-29  1:50     ` Tan, Jianfeng
  2 siblings, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-02-28 21:12 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev

Hi,

2016-01-29 19:22, Jianfeng Tan:
> Current issue: DPDK is not that friendly to container environment, which
> caused by that it pre-alloc resource like cores and hugepages. But there
> are this or that resource limitations, for examples, cgroup, rlimit,
> cpuset, etc.
> 
> For cores, this patch makes use of pthread_getaffinity_np to further
> narrow down detected cores before parsing coremask (-c), corelist (-l),
> and coremap (--lcores).
> 
> For hugepages, this patch adds a recover mechanism to the case that
> there are no that many hugepages can be used. It relys on a mem access
> to fault-in hugepages, and if fails with SIGBUS, recover to previously
> saved stack environment with siglongjmp().

These are some interesting ideas.
However, I am not sure a library should try to be so smart silently.
It needs more feedback to decide whether it can be the default behaviour
or an option.

Please send coremask and hugepage mapping as separate patches as they
are totally different and may be integrated separately.

Thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-02-28 21:12   ` Thomas Monjalon
@ 2016-02-29  1:50     ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-02-29  1:50 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

On 2/29/2016 5:12 AM, Thomas Monjalon wrote:
> Hi,
>
> 2016-01-29 19:22, Jianfeng Tan:
>> Current issue: DPDK is not that friendly to container environment, which
>> caused by that it pre-alloc resource like cores and hugepages. But there
>> are this or that resource limitations, for examples, cgroup, rlimit,
>> cpuset, etc.
>>
>> For cores, this patch makes use of pthread_getaffinity_np to further
>> narrow down detected cores before parsing coremask (-c), corelist (-l),
>> and coremap (--lcores).
>>
>> For hugepages, this patch adds a recover mechanism to the case that
>> there are no that many hugepages can be used. It relys on a mem access
>> to fault-in hugepages, and if fails with SIGBUS, recover to previously
>> saved stack environment with siglongjmp().
> They are some interesting ideas.
> However, I am not sure a library should try to be so smart silently.
> It needs more feedback to decide wether it can be the default behaviour
> or an option.
>
> Please send coremask and hugepage mapping as separate patches as they
> are totally different and may be integrated separately.

Good advice, thanks! I'll do it.

And one more thing FYI: the coremask derived from pthread_getaffinity_np()
may have an issue on some Linux versions or distros: it excludes isolcpus.
This was reported by Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>,
and I'm still working it out.

Thanks,
Jianfeng

>
> Thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] eal: add option --avail-cores to detect lcores
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
  2016-01-25 13:46 ` Neil Horman
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
@ 2016-03-04 10:05 ` Jianfeng Tan
  2016-03-08  8:54   ` Panu Matilainen
  2016-04-26 12:39   ` Tan, Jianfeng
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-03-04 10:05 UTC (permalink / raw)
  To: dev

This patch adds option, --avail-cores, to use lcores which are available
by calling pthread_getaffinity_np() to narrow down detected cores before
parsing coremask (-c), corelist (-l), and coremap (--lcores).

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld \
		--avail-cores -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
 lib/librte_eal/common/eal_common_options.c | 52 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/eal_options.h        |  2 ++
 2 files changed, 54 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..dc4882d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_AVAIL_CORES,       0, NULL, OPT_AVAIL_CORES_NUM      },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -681,6 +682,37 @@ err:
 }
 
 static int
+eal_parse_avail_cores(void)
+{
+	int i, count;
+	pthread_t tid;
+	rte_cpuset_t cpuset;
+	struct rte_config *cfg = rte_eal_get_configuration();
+
+	tid = pthread_self();
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
+		return -1;
+
+	for (i = 0, count = 0; i < RTE_MAX_LCORE; i++) {
+		if (lcore_config[i].detected && !CPU_ISSET(i, &cpuset)) {
+			RTE_LOG(DEBUG, EAL, "Flag lcore %u as undetected\n", i);
+			lcore_config[i].detected = 0;
+			lcore_config[i].core_index = -1;
+			cfg->lcore_role[i] = ROLE_OFF;
+			count++;
+		}
+	}
+	cfg->lcore_count -= count;
+	if (cfg->lcore_count == 0) {
+		RTE_LOG(ERR, EAL, "No lcores available\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+
+static int
 eal_parse_syslog(const char *facility, struct internal_config *conf)
 {
 	int i;
@@ -754,6 +786,10 @@ eal_parse_proc_type(const char *arg)
 	return RTE_PROC_INVALID;
 }
 
+static int param_coremask;
+static int param_corelist;
+static int param_coremap;
+
 int
 eal_parse_common_option(int opt, const char *optarg,
 			struct internal_config *conf)
@@ -775,6 +811,7 @@ eal_parse_common_option(int opt, const char *optarg,
 		break;
 	/* coremask */
 	case 'c':
+		param_coremask = 1;
 		if (eal_parse_coremask(optarg) < 0) {
 			RTE_LOG(ERR, EAL, "invalid coremask\n");
 			return -1;
@@ -782,6 +819,7 @@ eal_parse_common_option(int opt, const char *optarg,
 		break;
 	/* corelist */
 	case 'l':
+		param_corelist = 1;
 		if (eal_parse_corelist(optarg) < 0) {
 			RTE_LOG(ERR, EAL, "invalid core list\n");
 			return -1;
@@ -890,12 +928,25 @@ eal_parse_common_option(int opt, const char *optarg,
 		break;
 	}
 	case OPT_LCORES_NUM:
+		param_coremap = 1;
 		if (eal_parse_lcores(optarg) < 0) {
 			RTE_LOG(ERR, EAL, "invalid parameter for --"
 				OPT_LCORES "\n");
 			return -1;
 		}
 		break;
+	case OPT_AVAIL_CORES_NUM:
+		if (param_coremask || param_corelist || param_coremap) {
+			RTE_LOG(ERR, EAL, "should put --" OPT_AVAIL_CORES
+				" before -c, -l and --" OPT_LCORES "\n");
+			return -1;
+		}
+		if (eal_parse_avail_cores() < 0) {
+			RTE_LOG(ERR, EAL, "failed to use --"
+				OPT_AVAIL_CORES "\n");
+			return -1;
+		}
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
@@ -990,6 +1041,7 @@ eal_common_usage(void)
 	       "                      ',' is used for single number separator.\n"
 	       "                      '( )' can be omitted for single element group,\n"
 	       "                      '@' can be omitted if cpus and lcores have the same value\n"
+	       "  --"OPT_AVAIL_CORES"       Use pthread_getaffinity_np() to detect cores to be used\n"
 	       "  --"OPT_MASTER_LCORE" ID   Core ID that is used as master\n"
 	       "  -n CHANNELS         Number of memory channels\n"
 	       "  -m MB               Memory to allocate (see also --"OPT_SOCKET_MEM")\n"
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..b2ddea3 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_AVAIL_CORES       "avail-cores"
+	OPT_AVAIL_CORES_NUM,
 	OPT_LONG_MAX_NUM
 };
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH] eal: make hugetlb initialization more robust
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (2 preceding siblings ...)
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
@ 2016-03-04 10:58 ` Jianfeng Tan
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
                     ` (2 more replies)
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
                   ` (2 subsequent siblings)
  6 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-03-04 10:58 UTC (permalink / raw)
  To: dev

This patch adds an option, --huge-trybest, to use a recovery mechanism for
the case that not as many hugepages (declared in sysfs) can be used as
requested. It relies on a memory access to fault-in hugepages, and if that
fails with SIGBUS, recovers to the previously saved stack environment with
siglongjmp().

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
 lib/librte_eal/common/eal_common_options.c |  4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |  1 +
 lib/librte_eal/common/eal_options.h        |  2 +
 lib/librte_eal/linuxapp/eal/eal.c          |  1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 95 +++++++++++++++++++++++++++---
 5 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..1766d7f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -407,7 +415,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +425,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (sigsetjmp(jmpenv, 0)) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +459,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1065,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1178,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1197,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1261,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1450,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
@ 2016-03-08  1:42   ` Jianfeng Tan
  2016-03-08  8:46     ` Tan, Jianfeng
                       ` (2 more replies)
  2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
  2 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-03-08  1:42 UTC (permalink / raw)
  To: dev

This patch adds an option, --huge-trybest, to use a recovery mechanism for
the case that not as many hugepages (declared in sysfs) can be used as
requested. It relies on a memory access to fault-in hugepages, and if that
fails with SIGBUS, recovers to the previously saved stack environment with
siglongjmp().

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 ++++++++++++++++++++++++++---
 5 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..e4e1f3b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling setjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_setjmp(void)
+{
+	return setjmp(jmpenv);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_setjmp()) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
@ 2016-03-08  8:46     ` Tan, Jianfeng
  2016-05-04 11:07     ` Sergio Gonzalez Monroy
  2016-05-04 12:25     ` Sergio Gonzalez Monroy
  2 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-08  8:46 UTC (permalink / raw)
  To: dev



On 3/8/2016 9:42 AM, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
> 	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>

Sorry, forgot to add ack from Neil.
Acked-by: Neil Horman <nhorman@tuxdriver.com>

> ---
> v2:
>   - Address the compiling error by move setjmp into a wrap method.
>
>   lib/librte_eal/common/eal_common_options.c |   4 ++
>   lib/librte_eal/common/eal_internal_cfg.h   |   1 +
>   lib/librte_eal/common/eal_options.h        |   2 +
>   lib/librte_eal/linuxapp/eal/eal.c          |   1 +
>   lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 ++++++++++++++++++++++++++---
>   5 files changed, 104 insertions(+), 8 deletions(-)
>
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..8ff6a2e 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>   	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>   	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>   	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
>   	{0,                     0, NULL, 0                        }
>   };
>   
> @@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
>   			return -1;
>   		}
>   		break;
> +	case OPT_HUGE_TRYBEST_NUM:
> +		internal_config.huge_trybest = 1;
> +		break;
>   
>   	/* don't know what to do, leave this to caller */
>   	default:
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..90a3533 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -64,6 +64,7 @@ struct internal_config {
>   	volatile unsigned force_nchannel; /**< force number of channels */
>   	volatile unsigned force_nrank;    /**< force number of ranks */
>   	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
> +	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
>   	unsigned hugepage_unlink;         /**< true to unlink backing files */
>   	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
>   	volatile unsigned no_pci;         /**< true to disable PCI */
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..02397c5 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>   	OPT_VMWARE_TSC_MAP_NUM,
>   #define OPT_XEN_DOM0          "xen-dom0"
>   	OPT_XEN_DOM0_NUM,
> +#define OPT_HUGE_TRYBEST      "huge-trybest"
> +	OPT_HUGE_TRYBEST_NUM,
>   	OPT_LONG_MAX_NUM
>   };
>   
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index ceac435..3e23877 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -343,6 +343,7 @@ eal_usage(const char *prgname)
>   	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
>   	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
>   	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
> +	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
>   	       "\n");
>   	/* Allow the application to print its usage message too if hook is set */
>   	if ( rte_application_usage_hook ) {
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..e4e1f3b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,8 @@
>   #include <errno.h>
>   #include <sys/ioctl.h>
>   #include <sys/time.h>
> +#include <signal.h>
> +#include <setjmp.h>
>   
>   #include <rte_log.h>
>   #include <rte_memory.h>
> @@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>   	return addr;
>   }
>   
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +	return setjmp(jmpenv);
> +}
>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		if (fd < 0) {
>   			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>   					strerror(errno));
> -			return -1;
> +			return i;
>   		}
>   
>   		/* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>   					strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>   
>   		if (orig) {
> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			hugepg_tbl[i].final_va = virtaddr;
>   		}
>   
> +		if (orig && internal_config.huge_trybest) {
> +			/* In linux, hugetlb limitations, like cgroup, are
> +			 * enforced at fault time instead of mmap(), even
> +			 * with the option of MAP_POPULATE. Kernel will send
> +			 * a SIGBUS signal. To avoid to be killed, save stack
> +			 * environment here, if SIGBUS happens, we can jump
> +			 * back here.
> +			 */
> +			if (wrap_setjmp()) {
> +				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
> +					"hugepages of size %u MB\n",
> +					(unsigned)(hugepage_sz / 0x100000));
> +				munmap(virtaddr, hugepage_sz);
> +				close(fd);
> +				unlink(hugepg_tbl[i].filepath);
> +				return i;
> +			}
> +			*(int *)virtaddr = 0;
> +		}
> +
> +
>   		/* set shared flock on the file. */
>   		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>   			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>   				__func__, strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>   
>   		close(fd);
> @@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		vma_addr = (char *)vma_addr + hugepage_sz;
>   		vma_len -= hugepage_sz;
>   	}
> -	return 0;
> +	return i;
>   }
>   
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> @@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
>   	return total_num_pages;
>   }
>   
> +static struct sigaction action_old;
> +static int need_recover;
> +
> +static void
> +register_sigbus(void)
> +{
> +	sigset_t mask;
> +	struct sigaction action;
> +
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGBUS);
> +	action.sa_flags = 0;
> +	action.sa_mask = mask;
> +	action.sa_handler = sigbus_handler;
> +
> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +	if (need_recover) {
> +		sigaction(SIGBUS, &action_old, NULL);
> +		need_recover = 0;
> +	}
> +}
> +
>   /*
>    * Prepare physical memory mapping: fill configuration structure with
>    * these infos, return 0 on success.
> @@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
>   
>   	hp_offset = 0; /* where we start the current page size entries */
>   
> +	if (internal_config.huge_trybest)
> +		register_sigbus();
> +
>   	/* map all hugepages and sort them */
>   	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
> +		int pages_old, pages_new;
>   		struct hugepage_info *hpi;
>   
>   		/*
> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>   			continue;
>   
>   		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			if (internal_config.huge_trybest) {
> +				int pages = pages_old - pages_new;
> +
> +				internal_config.memory -=
> +					hpi->hugepage_sz * pages;
> +				nr_hugepages -= pages;
> +				hpi->num_pages[0] = pages_new;
> +				if (pages_new == 0)
> +					continue;
> +			} else
> +				goto fail;
>   		}
>   
>   		/* find physical addresses and sockets for each hugepage */
> @@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
>   #endif
>   	}
>   
> +	if (internal_config.huge_trybest)
> +		recover_sigbus();
> +
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
>   	nr_hugefiles = 0;
>   	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
> @@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
>   	return 0;
>   
>   fail:
> +	if (internal_config.huge_trybest)
> +		recover_sigbus();
>   	free(tmp_hp);
>   	return -1;
>   }

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
@ 2016-03-08  8:54   ` Panu Matilainen
  2016-03-08 17:38     ` Tan, Jianfeng
  2016-04-26 12:39   ` Tan, Jianfeng
  1 sibling, 1 reply; 63+ messages in thread
From: Panu Matilainen @ 2016-03-08  8:54 UTC (permalink / raw)
  To: Jianfeng Tan, dev

On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> This patch adds option, --avail-cores, to use lcores which are available
> by calling pthread_getaffinity_np() to narrow down detected cores before
> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> 		--avail-cores -m 1024
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Neil Horman <nhorman@tuxdriver.com>

Hmm, to me this sounds like something that should always be done, so 
there's no need for an option. Or, if there's a chance it might do the 
wrong thing in some rare circumstance, then perhaps there should be a 
disabler option instead?

Or am I just missing something?

	- Panu -

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-08  8:54   ` Panu Matilainen
@ 2016-03-08 17:38     ` Tan, Jianfeng
  2016-03-09 13:05       ` Panu Matilainen
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-08 17:38 UTC (permalink / raw)
  To: Panu Matilainen, dev

Hi Panu,

On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>> This patch adds option, --avail-cores, to use lcores which are available
>> by calling pthread_getaffinity_np() to narrow down detected cores before
>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>
>> Test example:
>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>         --avail-cores -m 1024
>>
>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>
> Hmm, to me this sounds like something that should be done always so 
> there's no need for an option. Or if there's a chance it might do the 
> wrong thing in some rare circumstance then perhaps there should be a 
> disabler option instead?

Thanks for comments.

Yes, there's a use case that we cannot handle.

If we make it the default, DPDK applications may fail to start when a user 
specifies a core in isolcpus and the parent process (say, bash) has a 
cpuset affinity that excludes isolcpus. Originally, DPDK applications 
just blindly call pthread_setaffinity_np(), and it always succeeds because 
the process always has root privilege to change any CPU affinity.

Now, if we do the checking in rte_eal_cpu_init(), those lcores will be 
flagged as undetected (in my older implementation), and that leads to 
failure. To make it work correctly, we would always have to prepend 
"taskset <mask>" (or use some other means) to DPDK application command lines.

What do you think?

Thanks,
Jianfeng

>
> Or am I just missing something?
>
>     - Panu -
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-08 17:38     ` Tan, Jianfeng
@ 2016-03-09 13:05       ` Panu Matilainen
  2016-03-09 13:53         ` Tan, Jianfeng
  2016-05-18 12:46         ` David Marchand
  0 siblings, 2 replies; 63+ messages in thread
From: Panu Matilainen @ 2016-03-09 13:05 UTC (permalink / raw)
  To: Tan, Jianfeng, dev

On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> Hi Panu,
>
> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>> This patch adds option, --avail-cores, to use lcores which are available
>>> by calling pthread_getaffinity_np() to narrow down detected cores before
>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>
>>> Test example:
>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>         --avail-cores -m 1024
>>>
>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>
>> Hmm, to me this sounds like something that should be done always so
>> there's no need for an option. Or if there's a chance it might do the
>> wrong thing in some rare circumstance then perhaps there should be a
>> disabler option instead?
>
> Thanks for comments.
>
> Yes, there's a use case that we cannot handle.
>
> If we make it as default, DPDK applications may fail to start, when user
> specifies a core in isolcpus and its parent process (say bash) has a
> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> just blindly do pthread_setaffinity_np() and it always succeeds because
> it always has root privilege to change any cpu affinity.
>
> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> flagged as undetected (in my older implementation) and leads to failure.
> To make it correct, we would always add "taskset mask" (or other ways)
> before DPDK application cmd lines.
>
> How do you think?

I still think it sounds like something that should be done by default 
and maybe be overridable with some flag, rather than the other way 
around. Another alternative might be to always detect the cores, but when 
running as root, override the detection and emit a warning.

But I don't know, just wondering. To look at it from another angle: why 
would somebody use this new --avail-cores option, and in what situation, 
if things "just work" otherwise anyway?

	- Panu -

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 13:05       ` Panu Matilainen
@ 2016-03-09 13:53         ` Tan, Jianfeng
  2016-03-09 14:01           ` Ananyev, Konstantin
  2016-05-18 12:46         ` David Marchand
  1 sibling, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 13:53 UTC (permalink / raw)
  To: Panu Matilainen, dev



On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>> Hi Panu,
>>
>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>> This patch adds option, --avail-cores, to use lcores which are 
>>>> available
>>>> by calling pthread_getaffinity_np() to narrow down detected cores 
>>>> before
>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>
>>>> Test example:
>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>         --avail-cores -m 1024
>>>>
>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>
>>> Hmm, to me this sounds like something that should be done always so
>>> there's no need for an option. Or if there's a chance it might do the
>>> wrong thing in some rare circumstance then perhaps there should be a
>>> disabler option instead?
>>
>> Thanks for comments.
>>
>> Yes, there's a use case that we cannot handle.
>>
>> If we make it as default, DPDK applications may fail to start, when user
>> specifies a core in isolcpus and its parent process (say bash) has a
>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>> just blindly do pthread_setaffinity_np() and it always succeeds because
>> it always has root privilege to change any cpu affinity.
>>
>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>> flagged as undetected (in my older implementation) and leads to failure.
>> To make it correct, we would always add "taskset mask" (or other ways)
>> before DPDK application cmd lines.
>>
>> How do you think?
>
> I still think it sounds like something that should be done by default 
> and maybe be overridable with some flag, rather than the other way 
> around. Another alternative might be detecting the cores always but if 
> running as root, override but with a warning.

Regarding your second solution: can only root set affinity to isolcpus?
Your first solution seems like a promising way to me.

>
> But I dont know, just wondering. To look at it from another angle: why 
> would somebody use this new --avail-cores option and in what 
> situation, if things "just work" otherwise anyway?

For DPDK applications, the most common way to initialize DPDK is like 
this: "$dpdk-app [options for DPDK] -- [options for app]", so users need 
to specify which cores to run on and how many hugepages to use. Suppose 
we need this dpdk-app to run in a container: users already provide that 
information when they set up the cgroup for it to run inside, and this 
option (or this patch) makes DPDK smart enough to discover how much 
resource can be used. Does that make sense?

Thanks,
Jianfeng


>
>     - Panu -
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 13:53         ` Tan, Jianfeng
@ 2016-03-09 14:01           ` Ananyev, Konstantin
  2016-03-09 14:17             ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 14:01 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
> Sent: Wednesday, March 09, 2016 1:53 PM
> To: Panu Matilainen; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> 
> 
> 
> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> > On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >> Hi Panu,
> >>
> >> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>> This patch adds option, --avail-cores, to use lcores which are
> >>>> available
> >>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>> before
> >>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>
> >>>> Test example:
> >>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>         --avail-cores -m 1024
> >>>>
> >>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>
> >>> Hmm, to me this sounds like something that should be done always so
> >>> there's no need for an option. Or if there's a chance it might do the
> >>> wrong thing in some rare circumstance then perhaps there should be a
> >>> disabler option instead?
> >>
> >> Thanks for comments.
> >>
> >> Yes, there's a use case that we cannot handle.
> >>
> >> If we make it as default, DPDK applications may fail to start, when user
> >> specifies a core in isolcpus and its parent process (say bash) has a
> >> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >> just blindly do pthread_setaffinity_np() and it always succeeds because
> >> it always has root privilege to change any cpu affinity.
> >>
> >> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >> flagged as undetected (in my older implementation) and leads to failure.
> >> To make it correct, we would always add "taskset mask" (or other ways)
> >> before DPDK application cmd lines.
> >>
> >> How do you think?
> >
> > I still think it sounds like something that should be done by default
> > and maybe be overridable with some flag, rather than the other way
> > around. Another alternative might be detecting the cores always but if
> > running as root, override but with a warning.
> 
> For your second solution, only root can setaffinity to isolcpus?
> Your first solution seems like a promising way for me.
> 
> >
> > But I dont know, just wondering. To look at it from another angle: why
> > would somebody use this new --avail-cores option and in what
> > situation, if things "just work" otherwise anyway?
> 
> For DPDK applications, the most common case to initialize DPDK is like
> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> to specify which cores to run and how much hugepages are used. Suppose
> we need this dpdk-app to run in a container, users already give those
> information when they build up the cgroup for it to run inside, this
> option or this patch is to make DPDK more smart to discover how much
> resource will be used. Make sense?

But then, all we need might be just a script that would extract this information from the system
and form a proper cmdline parameter for DPDK? 
Konstantin

> 
> Thanks,
> Jianfeng
> 
> 
> >
> >     - Panu -
> >


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:01           ` Ananyev, Konstantin
@ 2016-03-09 14:17             ` Tan, Jianfeng
  2016-03-09 14:44               ` Ananyev, Konstantin
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 14:17 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev



On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
>> Sent: Wednesday, March 09, 2016 1:53 PM
>> To: Panu Matilainen; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>
>>
>>
>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>>> Hi Panu,
>>>>
>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>> available
>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>> before
>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>
>>>>>> Test example:
>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>          --avail-cores -m 1024
>>>>>>
>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>> Hmm, to me this sounds like something that should be done always so
>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>> disabler option instead?
>>>> Thanks for comments.
>>>>
>>>> Yes, there's a use case that we cannot handle.
>>>>
>>>> If we make it as default, DPDK applications may fail to start, when user
>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>> it always has root privilege to change any cpu affinity.
>>>>
>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>> before DPDK application cmd lines.
>>>>
>>>> How do you think?
>>> I still think it sounds like something that should be done by default
>>> and maybe be overridable with some flag, rather than the other way
>>> around. Another alternative might be detecting the cores always but if
>>> running as root, override but with a warning.
>> For your second solution, only root can setaffinity to isolcpus?
>> Your first solution seems like a promising way for me.
>>
>>> But I dont know, just wondering. To look at it from another angle: why
>>> would somebody use this new --avail-cores option and in what
>>> situation, if things "just work" otherwise anyway?
>> For DPDK applications, the most common case to initialize DPDK is like
>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>> to specify which cores to run and how much hugepages are used. Suppose
>> we need this dpdk-app to run in a container, users already give those
>> information when they build up the cgroup for it to run inside, this
>> option or this patch is to make DPDK more smart to discover how much
>> resource will be used. Make sense?
> But then, all we need might be just a script that would extract this information from the system
> and form a proper cmdline parameter for DPDK?

Yes, a script would work. Or we could construct (argc, argv) to call 
rte_eal_init() in the application. But as Neil Horman once suggested, a 
simple pthread_getaffinity_np() gets everything done. So is it worth 
a patch here?

Thanks,
Jianfeng

> Konstantin
>
>> Thanks,
>> Jianfeng
>>
>>
>>>      - Panu -
>>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:17             ` Tan, Jianfeng
@ 2016-03-09 14:44               ` Ananyev, Konstantin
  2016-03-09 14:55                 ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 14:44 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev



> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Wednesday, March 09, 2016 2:17 PM
> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> 
> 
> 
> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
> >
> >> -----Original Message-----
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
> >> Sent: Wednesday, March 09, 2016 1:53 PM
> >> To: Panu Matilainen; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> >>
> >>
> >>
> >> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> >>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >>>> Hi Panu,
> >>>>
> >>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>>> This patch adds option, --avail-cores, to use lcores which are
> >>>>>> available
> >>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>>>> before
> >>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>>
> >>>>>> Test example:
> >>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>>          --avail-cores -m 1024
> >>>>>>
> >>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>> Hmm, to me this sounds like something that should be done always so
> >>>>> there's no need for an option. Or if there's a chance it might do the
> >>>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>>> disabler option instead?
> >>>> Thanks for comments.
> >>>>
> >>>> Yes, there's a use case that we cannot handle.
> >>>>
> >>>> If we make it as default, DPDK applications may fail to start, when user
> >>>> specifies a core in isolcpus and its parent process (say bash) has a
> >>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>>> it always has root privilege to change any cpu affinity.
> >>>>
> >>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>>> flagged as undetected (in my older implementation) and leads to failure.
> >>>> To make it correct, we would always add "taskset mask" (or other ways)
> >>>> before DPDK application cmd lines.
> >>>>
> >>>> How do you think?
> >>> I still think it sounds like something that should be done by default
> >>> and maybe be overridable with some flag, rather than the other way
> >>> around. Another alternative might be detecting the cores always but if
> >>> running as root, override but with a warning.
> >> For your second solution, only root can setaffinity to isolcpus?
> >> Your first solution seems like a promising way for me.
> >>
> >>> But I dont know, just wondering. To look at it from another angle: why
> >>> would somebody use this new --avail-cores option and in what
> >>> situation, if things "just work" otherwise anyway?
> >> For DPDK applications, the most common case to initialize DPDK is like
> >> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> >> to specify which cores to run and how much hugepages are used. Suppose
> >> we need this dpdk-app to run in a container, users already give those
> >> information when they build up the cgroup for it to run inside, this
> >> option or this patch is to make DPDK more smart to discover how much
> >> resource will be used. Make sense?
> > But then, all we need might be just a script that would extract this information from the system
> > and form a proper cmdline parameter for DPDK?
> 
> Yes, a script will work. Or to construct (argc, argv) to call
> rte_eal_init() in the application. But as Neil Horman once suggested, a
> simple pthread_getaffinity_np() will get all things done. So if it worth
> a patch here?

Don't know...
Personally I would prefer not to put extra logic inside EAL.
For me - there are too many different options already.
From other side looking at the patch itself:
You are updating lcore_count and lcore_config[],based on physical cpu availability,
but these days it is not always one-to-one mapping between EAL lcore and physical cpu. 
Shouldn't that be taken into account?
Konstantin
 



^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:44               ` Ananyev, Konstantin
@ 2016-03-09 14:55                 ` Tan, Jianfeng
  2016-03-09 15:17                   ` Ananyev, Konstantin
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 14:55 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev

Hi Konstantin,

On 3/9/2016 10:44 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: Tan, Jianfeng
>> Sent: Wednesday, March 09, 2016 2:17 PM
>> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>
>>
>>
>> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
>>>> -----Original Message-----
>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
>>>> Sent: Wednesday, March 09, 2016 1:53 PM
>>>> To: Panu Matilainen; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>>>
>>>>
>>>>
>>>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
>>>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>>>>> Hi Panu,
>>>>>>
>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>>>> available
>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>>>> before
>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>>>
>>>>>>>> Test example:
>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>>>           --avail-cores -m 1024
>>>>>>>>
>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>>>> Hmm, to me this sounds like something that should be done always so
>>>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>>>> disabler option instead?
>>>>>> Thanks for comments.
>>>>>>
>>>>>> Yes, there's a use case that we cannot handle.
>>>>>>
>>>>>> If we make it as default, DPDK applications may fail to start, when user
>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>>>> it always has root privilege to change any cpu affinity.
>>>>>>
>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>>>> before DPDK application cmd lines.
>>>>>>
>>>>>> How do you think?
>>>>> I still think it sounds like something that should be done by default
>>>>> and maybe be overridable with some flag, rather than the other way
>>>>> around. Another alternative might be detecting the cores always but if
>>>>> running as root, override but with a warning.
>>>> For your second solution, only root can setaffinity to isolcpus?
>>>> Your first solution seems like a promising way for me.
>>>>
>>>>> But I dont know, just wondering. To look at it from another angle: why
>>>>> would somebody use this new --avail-cores option and in what
>>>>> situation, if things "just work" otherwise anyway?
>>>> For DPDK applications, the most common case to initialize DPDK is like
>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>>>> to specify which cores to run and how much hugepages are used. Suppose
>>>> we need this dpdk-app to run in a container, users already give those
>>>> information when they build up the cgroup for it to run inside, this
>>>> option or this patch is to make DPDK more smart to discover how much
>>>> resource will be used. Make sense?
>>> But then, all we need might be just a script that would extract this information from the system
>>> and form a proper cmdline parameter for DPDK?
>> Yes, a script will work. Or to construct (argc, argv) to call
>> rte_eal_init() in the application. But as Neil Horman once suggested, a
>> simple pthread_getaffinity_np() will get all things done. So if it worth
>> a patch here?
> Don't know...
> Personally I would prefer not to put extra logic inside EAL.
> For me - there are too many different options already.

Then how about making it the default in rte_eal_cpu_init()? It is already 
known that this will cause trouble for users of isolcpus; they would need 
to add "taskset [mask]" before starting a DPDK app.

>  From other side looking at the patch itself:
> You are updating lcore_count and lcore_config[],based on physical cpu availability,
> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
> Shouldn't that be taken into account?

I have not seen the problem so far, because this work is done before 
parsing the coremask (-c), corelist (-l), and coremap (--lcores). If a core 
is disabled here, it is as if it were not detected in rte_eal_cpu_init(). 
Could you please give more hints?

Thanks,
Jianfeng

> Konstantin
>   
>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:55                 ` Tan, Jianfeng
@ 2016-03-09 15:17                   ` Ananyev, Konstantin
  2016-03-09 17:45                     ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 15:17 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev

Hi Jianfeng,

> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Wednesday, March 09, 2016 2:56 PM
> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> 
> Hi Konstantin,
> 
> On 3/9/2016 10:44 PM, Ananyev, Konstantin wrote:
> >
> >> -----Original Message-----
> >> From: Tan, Jianfeng
> >> Sent: Wednesday, March 09, 2016 2:17 PM
> >> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> >>
> >>
> >>
> >> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
> >>>> -----Original Message-----
> >>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
> >>>> Sent: Wednesday, March 09, 2016 1:53 PM
> >>>> To: Panu Matilainen; dev@dpdk.org
> >>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> >>>>
> >>>>
> >>>>
> >>>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> >>>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >>>>>> Hi Panu,
> >>>>>>
> >>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>>>>> This patch adds option, --avail-cores, to use lcores which are
> >>>>>>>> available
> >>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>>>>>> before
> >>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>>>>
> >>>>>>>> Test example:
> >>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>>>>           --avail-cores -m 1024
> >>>>>>>>
> >>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>>>> Hmm, to me this sounds like something that should be done always so
> >>>>>>> there's no need for an option. Or if there's a chance it might do the
> >>>>>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>>>>> disabler option instead?
> >>>>>> Thanks for comments.
> >>>>>>
> >>>>>> Yes, there's a use case that we cannot handle.
> >>>>>>
> >>>>>> If we make it as default, DPDK applications may fail to start, when user
> >>>>>> specifies a core in isolcpus and its parent process (say bash) has a
> >>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>>>>> it always has root privilege to change any cpu affinity.
> >>>>>>
> >>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>>>>> flagged as undetected (in my older implementation) and leads to failure.
> >>>>>> To make it correct, we would always add "taskset mask" (or other ways)
> >>>>>> before DPDK application cmd lines.
> >>>>>>
> >>>>>> How do you think?
> >>>>> I still think it sounds like something that should be done by default
> >>>>> and maybe be overridable with some flag, rather than the other way
> >>>>> around. Another alternative might be detecting the cores always but if
> >>>>> running as root, override but with a warning.
> >>>> For your second solution, only root can setaffinity to isolcpus?
> >>>> Your first solution seems like a promising way for me.
> >>>>
> >>>>> But I dont know, just wondering. To look at it from another angle: why
> >>>>> would somebody use this new --avail-cores option and in what
> >>>>> situation, if things "just work" otherwise anyway?
> >>>> For DPDK applications, the most common case to initialize DPDK is like
> >>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> >>>> to specify which cores to run and how much hugepages are used. Suppose
> >>>> we need this dpdk-app to run in a container, users already give those
> >>>> information when they build up the cgroup for it to run inside, this
> >>>> option or this patch is to make DPDK more smart to discover how much
> >>>> resource will be used. Make sense?
> >>> But then, all we need might be just a script that would extract this information from the system
> >>> and form a proper cmdline parameter for DPDK?
> >> Yes, a script will work. Or to construct (argc, argv) to call
> >> rte_eal_init() in the application. But as Neil Horman once suggested, a
> >> simple pthread_getaffinity_np() will get all things done. So if it worth
> >> a patch here?
> > Don't know...
> > Personally I would prefer not to put extra logic inside EAL.
> > For me - there are too many different options already.
> 
> Then how about make it default in rte_eal_cpu_init()? And it is already
> known it will bring trouble to those use isolcpus users, they need to
> add "taskset [mask]" before starting a DPDK app.

As I said - provide a script?
Same might be for amount of hugepage memory available to the user? 

> 
> >  From other side looking at the patch itself:
> > You are updating lcore_count and lcore_config[],based on physical cpu availability,
> > but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
> > Shouldn't that be taken into account?
> 
> I have not see the problem so far, because this work is done before
> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
> could you please give more hints?

I didn't try your changes, so probably I am missing something.
Let's say the user is allowed to use only cpus 0-3.
If he were to type:
 --avail-cores  --lcores='(1-7)@2',
then only lcores 1-3 would be started.
Again, if the user specified '2@(1-7)', it would also go undetected
that cpus 4-7 are not available to the user.
Is that so?

Konstantin

> 
> Thanks,
> Jianfeng
> 
> > Konstantin
> >
> >
> >


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 15:17                   ` Ananyev, Konstantin
@ 2016-03-09 17:45                     ` Tan, Jianfeng
  2016-03-09 19:33                       ` Ananyev, Konstantin
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 17:45 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev

Hi Konstantin,

On 3/9/2016 11:17 PM, Ananyev, Konstantin wrote:
> Hi Jianfeng,
>
>> -----Original Message-----
>> From: Tan, Jianfeng
>> Sent: Wednesday, March 09, 2016 2:56 PM
>> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>
>> Hi Konstantin,
>>
>> On 3/9/2016 10:44 PM, Ananyev, Konstantin wrote:
>>>> -----Original Message-----
>>>> From: Tan, Jianfeng
>>>> Sent: Wednesday, March 09, 2016 2:17 PM
>>>> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>>>
>>>>
>>>>
>>>> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
>>>>>> -----Original Message-----
>>>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
>>>>>> Sent: Wednesday, March 09, 2016 1:53 PM
>>>>>> To: Panu Matilainen; dev@dpdk.org
>>>>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
>>>>>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>>>>>>> Hi Panu,
>>>>>>>>
>>>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>>>>>> available
>>>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>>>>>> before
>>>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>>>>>
>>>>>>>>>> Test example:
>>>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>>>>>            --avail-cores -m 1024
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>>>>>> Hmm, to me this sounds like something that should be done always so
>>>>>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>>>>>> disabler option instead?
>>>>>>>> Thanks for comments.
>>>>>>>>
>>>>>>>> Yes, there's a use case that we cannot handle.
>>>>>>>>
>>>>>>>> If we make it as default, DPDK applications may fail to start, when user
>>>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>>>>>> it always has root privilege to change any cpu affinity.
>>>>>>>>
>>>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>>>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>>>>>> before DPDK application cmd lines.
>>>>>>>>
>>>>>>>> How do you think?
>>>>>>> I still think it sounds like something that should be done by default
>>>>>>> and maybe be overridable with some flag, rather than the other way
>>>>>>> around. Another alternative might be detecting the cores always but if
>>>>>>> running as root, override but with a warning.
>>>>>> For your second solution, only root can setaffinity to isolcpus?
>>>>>> Your first solution seems like a promising way for me.
>>>>>>
>>>>>>> But I dont know, just wondering. To look at it from another angle: why
>>>>>>> would somebody use this new --avail-cores option and in what
>>>>>>> situation, if things "just work" otherwise anyway?
>>>>>> For DPDK applications, the most common case to initialize DPDK is like
>>>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>>>>>> to specify which cores to run and how much hugepages are used. Suppose
>>>>>> we need this dpdk-app to run in a container, users already give those
>>>>>> information when they build up the cgroup for it to run inside, this
>>>>>> option or this patch is to make DPDK more smart to discover how much
>>>>>> resource will be used. Make sense?
>>>>> But then, all we need might be just a script that would extract this information from the system
>>>>> and form a proper cmdline parameter for DPDK?
>>>> Yes, a script will work. Or to construct (argc, argv) to call
>>>> rte_eal_init() in the application. But as Neil Horman once suggested, a
>>>> simple pthread_getaffinity_np() will get all things done. So if it worth
>>>> a patch here?
>>> Don't know...
>>> Personally I would prefer not to put extra logic inside EAL.
>>> For me - there are too many different options already.
>> Then how about make it default in rte_eal_cpu_init()? And it is already
>> known it will bring trouble to those use isolcpus users, they need to
>> add "taskset [mask]" before starting a DPDK app.
> As I said - provide a script?

Yes. But what I want to say is that this script is hard to get right if 
there are different kinds of limitations. (That barely ever happens, though :-) )

> Same might be for amount of hugepage memory available to the user?

Ditto. Limitations like hugetlbfs quota, cgroup hugetlb, some are used 
by app themself (more like an artificial argument) ...
>
>>>   From other side looking at the patch itself:
>>> You are updating lcore_count and lcore_config[],based on physical cpu availability,
>>> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
>>> Shouldn't that be taken into account?
>> I have not see the problem so far, because this work is done before
>> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
>> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
>> could you please give more hints?
> I didn't test try changes, so probably I am missing something.
> Let say iuser allowed to use only cpus 0-3.
> If he would type with:
>   --avail-cores  --lcores='(1-7)@2',
> then only lcores 1-3 would be started.
> Again if user would specify '2@(1-7)' it would also be undetected
> that cpus 4-7 are note available to the user.
> Is that so?

After reading the code:
For case --lcores='(1-7)@2', lcores 1-7 would be started, and bind to 
pcore 2.
For case --lcores='2@(1-7)', this will fail with "core 4 unavailable".

It's because:
a. a 1:1 mapping is built up and flagged as detected if the pcore is 
found in sysfs. (ROLE_RTE, cpuset, detected is true)
b. in the beginning of eal_parse_lcores(), "reset lcore config". 
(ROLE_OFF, cpuset is empty, detected is still true)
c. pcore cpuset will be checked by convert_to_cpuset using the previous 
"detected" value.

I have tested it with the patch. Result aligns above analysis.
For case --lcores='(1-7)@2': sudo taskset 0xf 
./examples/helloworld/build/helloworld --avail-cores --lcores='(1-7)@2'
...
hello from core 2
hello from core 3
hello from core 4
hello from core 5
hello from core 6
hello from core 7
hello from core 1

For case --lcores='2@(1-7)': sudo taskset 0xf 
./examples/helloworld/build/helloworld --avail-cores --lcores='2@(1-7)'
...
EAL: core 4 unavailable
EAL: invalid parameter for --lcores
...

One thing may be worth mentioning: should "detected" be maintained in struct 
lcore_config? Maybe we need to maintain a data structure for pcores?

Thanks,
Jianfeng

>
> Konstantin
>
>> Thanks,
>> Jianfeng
>>
>>> Konstantin
>>>
>>>
>>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 17:45                     ` Tan, Jianfeng
@ 2016-03-09 19:33                       ` Ananyev, Konstantin
  2016-03-10  1:36                         ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 19:33 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev



> >>>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
> >>>>>>>>>> available
> >>>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>>>>>>>> before
> >>>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>>>>>>
> >>>>>>>>>> Test example:
> >>>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>>>>>>            --avail-cores -m 1024
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>>>>>> Hmm, to me this sounds like something that should be done always so
> >>>>>>>>> there's no need for an option. Or if there's a chance it might do the
> >>>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>>>>>>> disabler option instead?
> >>>>>>>> Thanks for comments.
> >>>>>>>>
> >>>>>>>> Yes, there's a use case that we cannot handle.
> >>>>>>>>
> >>>>>>>> If we make it as default, DPDK applications may fail to start, when user
> >>>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
> >>>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>>>>>>> it always has root privilege to change any cpu affinity.
> >>>>>>>>
> >>>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>>>>>>> flagged as undetected (in my older implementation) and leads to failure.
> >>>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
> >>>>>>>> before DPDK application cmd lines.
> >>>>>>>>
> >>>>>>>> How do you think?
> >>>>>>> I still think it sounds like something that should be done by default
> >>>>>>> and maybe be overridable with some flag, rather than the other way
> >>>>>>> around. Another alternative might be detecting the cores always but if
> >>>>>>> running as root, override but with a warning.
> >>>>>> For your second solution, only root can setaffinity to isolcpus?
> >>>>>> Your first solution seems like a promising way for me.
> >>>>>>
> >>>>>>> But I dont know, just wondering. To look at it from another angle: why
> >>>>>>> would somebody use this new --avail-cores option and in what
> >>>>>>> situation, if things "just work" otherwise anyway?
> >>>>>> For DPDK applications, the most common case to initialize DPDK is like
> >>>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> >>>>>> to specify which cores to run and how much hugepages are used. Suppose
> >>>>>> we need this dpdk-app to run in a container, users already give those
> >>>>>> information when they build up the cgroup for it to run inside, this
> >>>>>> option or this patch is to make DPDK more smart to discover how much
> >>>>>> resource will be used. Make sense?
> >>>>> But then, all we need might be just a script that would extract this information from the system
> >>>>> and form a proper cmdline parameter for DPDK?
> >>>> Yes, a script will work. Or to construct (argc, argv) to call
> >>>> rte_eal_init() in the application. But as Neil Horman once suggested, a
> >>>> simple pthread_getaffinity_np() will get all things done. So if it worth
> >>>> a patch here?
> >>> Don't know...
> >>> Personally I would prefer not to put extra logic inside EAL.
> >>> For me - there are too many different options already.
> >> Then how about make it default in rte_eal_cpu_init()? And it is already
> >> known it will bring trouble to those use isolcpus users, they need to
> >> add "taskset [mask]" before starting a DPDK app.
> > As I said - provide a script?
> 
> Yes. But what I want to say is this script is hard to be right, if there
> are different kinds of limitations. (Barely happen though :-) )

My thought was to keep the dpdk code untouched - i.e. let it still blindly call pthread_setaffinity_np()
based on the input parameters, and in addition provide a script for those who want to run
in '--avail-cores' mode. 
So it could do 'taskset -p $$' and then either form a -c parameter list for the app,
or check the existing -c/-l/--lcores parameters and complain if a disallowed pcpu is detected.
But ok, maybe it is easier and more convenient to have this logic inside EAL
than in a separate script.

> 
> > Same might be for amount of hugepage memory available to the user?
> 
> Ditto. Limitations like hugetlbfs quota, cgroup hugetlb, some are used
> by app themself (more like an artificial argument) ...
> >
> >>>   From other side looking at the patch itself:
> >>> You are updating lcore_count and lcore_config[],based on physical cpu availability,
> >>> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
> >>> Shouldn't that be taken into account?
> >> I have not see the problem so far, because this work is done before
> >> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
> >> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
> >> could you please give more hints?
> > I didn't test try changes, so probably I am missing something.
> > Let say iuser allowed to use only cpus 0-3.
> > If he would type with:
> >   --avail-cores  --lcores='(1-7)@2',
> > then only lcores 1-3 would be started.
> > Again if user would specify '2@(1-7)' it would also be undetected
> > that cpus 4-7 are note available to the user.
> > Is that so?
> 
> After reading the code:
> For case --lcores='(1-7)@2', lcores 1-7 would be started, and bind to
> pcore 2.
> For case --lcores='2@(1-7)', this will fail with "core 4 unavailable".
> 
> It's because:
> a.  although 1:1 mapping is built-up and flagged as detected if pcore is
> found in sysfs. (ROLE_RTE, cpuset, detected is true)
> b. in the beginning of eal_parse_lcores(), "reset lcore config".
> (ROLE_OFF, cpuset is empty, detected is still true)
> c. pcore cpuset will be checked by convert_to_cpuset using the previous
> "detected" value.

Ok, my bad then - I misunderstood the code.
Thanks for explanation.
So if I get it right now - first inside lib/librte_eal/common/eal_common_lcore.c
Both lcore_count and lcore_config relate to the pcpus.
Then later, at lib/librte_eal/common/eal_common_options.c
they are overwritten related to lcores information.
Except for lcore_config[].detected, which seems to be kept intact.
Is that correct? 

> 
> I have tested it with the patch. Result aligns above analysis.
> For case --lcores='(1-7)@2': sudo taskset 0xf
> ./examples/helloworld/build/helloworld --avail-cores --lcores='(1-7)@2'
> ...
> hello from core 2
> hello from core 3
> hello from core 4
> hello from core 5
> hello from core 6
> hello from core 7
> hello from core 1
> 
> For case --lcores='2@(1-7)': sudo taskset 0xf
> ./examples/helloworld/build/helloworld --avail-cores --lcores='2@(1-7)'
> ...
> EAL: core 4 unavailable
> EAL: invalid parameter for --lcores
> ...
> 
> One thing may worth mention: shall "detected" be maintained in struct
> lcore_config? Maybe we need to maintain an data structure for pcores?

Yes, it might be good to split pcpu and lcores information somehow,
as it is a bit confusing right now.
But I suppose this is a subject for another patch/discussion.
Konstantin



^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 19:33                       ` Ananyev, Konstantin
@ 2016-03-10  1:36                         ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-10  1:36 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev



On 3/10/2016 3:33 AM, Ananyev, Konstantin wrote:
>
>>>>>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>>>>>>>> available
>>>>>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>>>>>>>> before
>>>>>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>>>>>>>
>>>>>>>>>>>> Test example:
>>>>>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>>>>>>>             --avail-cores -m 1024
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>>>>>>>> Hmm, to me this sounds like something that should be done always so
>>>>>>>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>>>>>>>> disabler option instead?
>>>>>>>>>> Thanks for comments.
>>>>>>>>>>
>>>>>>>>>> Yes, there's a use case that we cannot handle.
>>>>>>>>>>
>>>>>>>>>> If we make it as default, DPDK applications may fail to start, when user
>>>>>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>>>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>>>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>>>>>>>> it always has root privilege to change any cpu affinity.
>>>>>>>>>>
>>>>>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>>>>>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>>>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>>>>>>>> before DPDK application cmd lines.
>>>>>>>>>>
>>>>>>>>>> How do you think?
>>>>>>>>> I still think it sounds like something that should be done by default
>>>>>>>>> and maybe be overridable with some flag, rather than the other way
>>>>>>>>> around. Another alternative might be detecting the cores always but if
>>>>>>>>> running as root, override but with a warning.
>>>>>>>> For your second solution, only root can setaffinity to isolcpus?
>>>>>>>> Your first solution seems like a promising way for me.
>>>>>>>>
>>>>>>>>> But I dont know, just wondering. To look at it from another angle: why
>>>>>>>>> would somebody use this new --avail-cores option and in what
>>>>>>>>> situation, if things "just work" otherwise anyway?
>>>>>>>> For DPDK applications, the most common case to initialize DPDK is like
>>>>>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>>>>>>>> to specify which cores to run and how much hugepages are used. Suppose
>>>>>>>> we need this dpdk-app to run in a container, users already give those
>>>>>>>> information when they build up the cgroup for it to run inside, this
>>>>>>>> option or this patch is to make DPDK more smart to discover how much
>>>>>>>> resource will be used. Make sense?
>>>>>>> But then, all we need might be just a script that would extract this information from the system
>>>>>>> and form a proper cmdline parameter for DPDK?
>>>>>> Yes, a script will work. Or to construct (argc, argv) to call
>>>>>> rte_eal_init() in the application. But as Neil Horman once suggested, a
>>>>>> simple pthread_getaffinity_np() will get all things done. So if it worth
>>>>>> a patch here?
>>>>> Don't know...
>>>>> Personally I would prefer not to put extra logic inside EAL.
>>>>> For me - there are too many different options already.
>>>> Then how about make it default in rte_eal_cpu_init()? And it is already
>>>> known it will bring trouble to those use isolcpus users, they need to
>>>> add "taskset [mask]" before starting a DPDK app.
>>> As I said - provide a script?
>> Yes. But what I want to say is this script is hard to get right, if there
>> are different kinds of limitations. (Barely happen though :-) )
> My thought was to keep dpdk code untouched - i.e. let it still blindly set_pthread_affinity()
> based on the input parameters, and in addition provide a script for those who want to run
> in '--avail-cores' mode.
> So it could do 'taskset -p $$' and then either form -c parameter list  for the app,
> or check existing -c/-l/--lcores parameter and complain if not allowed pcpu detected.
> But ok, might be it is easier and more convenient to have this logic inside EAL,
> then in a separate script.
>
>>> Same might be for amount of hugepage memory available to the user?
>> Ditto. Limitations like hugetlbfs quota, cgroup hugetlb, some are used
>> by app themself (more like an artificial argument) ...
>>>>>    From other side looking at the patch itself:
>>>>> You are updating lcore_count and lcore_config[],based on physical cpu availability,
>>>>> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
>>>>> Shouldn't that be taken into account?
>>>> I have not see the problem so far, because this work is done before
>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
>>>> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
>>>> could you please give more hints?
>>> I didn't test try changes, so probably I am missing something.
>>> Let's say the user is allowed to use only cpus 0-3.
>>> If he were to type:
>>>    --avail-cores  --lcores='(1-7)@2',
>>> then only lcores 1-3 would be started.
>>> Again if user would specify '2@(1-7)' it would also be undetected
>>> that cpus 4-7 are not available to the user.
>>> Is that so?
>> After reading the code:
>> For case --lcores='(1-7)@2', lcores 1-7 would be started, and bind to
>> pcore 2.
>> For case --lcores='2@(1-7)', this will fail with "core 4 unavailable".
>>
>> It's because:
>> a.  although 1:1 mapping is built-up and flagged as detected if pcore is
>> found in sysfs. (ROLE_RTE, cpuset, detected is true)
>> b. in the beginning of eal_parse_lcores(), "reset lcore config".
>> (ROLE_OFF, cpuset is empty, detected is still true)
>> c. pcore cpuset will be checked by convert_to_cpuset using the previous
>> "detected" value.
> Ok, my bad then - I misunderstood the code.
> Thanks for explanation.
> So if I get it right now - first inside lib/librte_eal/common/eal_common_lcore.c
> Both lcore_count and lcore_config relate to the pcpus.
> Then later, at lib/librte_eal/common/eal_common_options.c
> they are overwritten related to lcores information.
> Except lcore_config[].detected, which seems kept intact.
> Is that correct?

Yes, exactly. And really appreciate that you raise up this question for 
discussion.

>
>> I have tested it with the patch. Result aligns above analysis.
>> For case --lcores='(1-7)@2': sudo taskset 0xf
>> ./examples/helloworld/build/helloworld --avail-cores --lcores='(1-7)@2'
>> ...
>> hello from core 2
>> hello from core 3
>> hello from core 4
>> hello from core 5
>> hello from core 6
>> hello from core 7
>> hello from core 1
>>
>> For case --lcores='2@(1-7)': sudo taskset 0xf
>> ./examples/helloworld/build/helloworld --avail-cores --lcores='2@(1-7)'
>> ...
>> EAL: core 4 unavailable
>> EAL: invalid parameter for --lcores
>> ...
>>
>> One thing may worth mention: shall "detected" be maintained in struct
>> lcore_config? Maybe we need to maintain a data structure for pcores?
> Yes, it might be good to split pcpu and lcores information somehow,
> as it is a bit confusing right now.
> But I suppose this is a subject for another patch/discussion.

Yes, just another topic.

Thanks,
Jianfeng

> Konstantin
>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
  2016-03-08  8:54   ` Panu Matilainen
@ 2016-04-26 12:39   ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-04-26 12:39 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, sergio.gonzalez.monroy, nhorman, konstantin.ananyev

Hi,

Since some guys are asking about the status of this patch, I'd like to 
ping if anyone still has concerns.
Current conclusion is: with option --avail-cores.

Thanks,
Jianfeng

On 3/4/2016 6:05 PM, Jianfeng Tan wrote:
> This patch adds option, --avail-cores, to use lcores which are available
> by calling pthread_getaffinity_np() to narrow down detected cores before
> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> 		--avail-cores -m 1024
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>   lib/librte_eal/common/eal_common_options.c | 52 ++++++++++++++++++++++++++++++
>   lib/librte_eal/common/eal_options.h        |  2 ++
>   2 files changed, 54 insertions(+)
>
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..dc4882d 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>   	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>   	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>   	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_AVAIL_CORES,       0, NULL, OPT_AVAIL_CORES_NUM      },
>   	{0,                     0, NULL, 0                        }
>   };
>   
> @@ -681,6 +682,37 @@ err:
>   }
>   
>   static int
> +eal_parse_avail_cores(void)
> +{
> +	int i, count;
> +	pthread_t tid;
> +	rte_cpuset_t cpuset;
> +	struct rte_config *cfg = rte_eal_get_configuration();
> +
> +	tid = pthread_self();
> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
> +		return -1;
> +
> +	for (i = 0, count = 0; i < RTE_MAX_LCORE; i++) {
> +		if (lcore_config[i].detected && !CPU_ISSET(i, &cpuset)) {
> +			RTE_LOG(DEBUG, EAL, "Flag lcore %u as undetected\n", i);
> +			lcore_config[i].detected = 0;
> +			lcore_config[i].core_index = -1;
> +			cfg->lcore_role[i] = ROLE_OFF;
> +			count++;
> +		}
> +	}
> +	cfg->lcore_count -= count;
> +	if (cfg->lcore_count == 0) {
> +		RTE_LOG(ERR, EAL, "No lcores available\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +
> +static int
>   eal_parse_syslog(const char *facility, struct internal_config *conf)
>   {
>   	int i;
> @@ -754,6 +786,10 @@ eal_parse_proc_type(const char *arg)
>   	return RTE_PROC_INVALID;
>   }
>   
> +static int param_coremask;
> +static int param_corelist;
> +static int param_coremap;
> +
>   int
>   eal_parse_common_option(int opt, const char *optarg,
>   			struct internal_config *conf)
> @@ -775,6 +811,7 @@ eal_parse_common_option(int opt, const char *optarg,
>   		break;
>   	/* coremask */
>   	case 'c':
> +		param_coremask = 1;
>   		if (eal_parse_coremask(optarg) < 0) {
>   			RTE_LOG(ERR, EAL, "invalid coremask\n");
>   			return -1;
> @@ -782,6 +819,7 @@ eal_parse_common_option(int opt, const char *optarg,
>   		break;
>   	/* corelist */
>   	case 'l':
> +		param_corelist = 1;
>   		if (eal_parse_corelist(optarg) < 0) {
>   			RTE_LOG(ERR, EAL, "invalid core list\n");
>   			return -1;
> @@ -890,12 +928,25 @@ eal_parse_common_option(int opt, const char *optarg,
>   		break;
>   	}
>   	case OPT_LCORES_NUM:
> +		param_coremap = 1;
>   		if (eal_parse_lcores(optarg) < 0) {
>   			RTE_LOG(ERR, EAL, "invalid parameter for --"
>   				OPT_LCORES "\n");
>   			return -1;
>   		}
>   		break;
> +	case OPT_AVAIL_CORES_NUM:
> +		if (param_coremask || param_corelist || param_coremap) {
> +			RTE_LOG(ERR, EAL, "should put --" OPT_AVAIL_CORES
> +				" before -c, -l and --" OPT_LCORES "\n");
> +			return -1;
> +		}
> +		if (eal_parse_avail_cores() < 0) {
> +			RTE_LOG(ERR, EAL, "failed to use --"
> +				OPT_AVAIL_CORES "\n");
> +			return -1;
> +		}
> +		break;
>   
>   	/* don't know what to do, leave this to caller */
>   	default:
> @@ -990,6 +1041,7 @@ eal_common_usage(void)
>   	       "                      ',' is used for single number separator.\n"
>   	       "                      '( )' can be omitted for single element group,\n"
>   	       "                      '@' can be omitted if cpus and lcores have the same value\n"
> +	       "  --"OPT_AVAIL_CORES"       Use pthread_getaffinity_np() to detect cores to be used\n"
>   	       "  --"OPT_MASTER_LCORE" ID   Core ID that is used as master\n"
>   	       "  -n CHANNELS         Number of memory channels\n"
>   	       "  -m MB               Memory to allocate (see also --"OPT_SOCKET_MEM")\n"
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..b2ddea3 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>   	OPT_VMWARE_TSC_MAP_NUM,
>   #define OPT_XEN_DOM0          "xen-dom0"
>   	OPT_XEN_DOM0_NUM,
> +#define OPT_AVAIL_CORES       "avail-cores"
> +	OPT_AVAIL_CORES_NUM,
>   	OPT_LONG_MAX_NUM
>   };
>   

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
  2016-03-08  8:46     ` Tan, Jianfeng
@ 2016-05-04 11:07     ` Sergio Gonzalez Monroy
  2016-05-04 11:28       ` Tan, Jianfeng
  2016-05-04 12:25     ` Sergio Gonzalez Monroy
  2 siblings, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-04 11:07 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: david.marchand, nhorman, konstantin.ananyev

On 08/03/2016 01:42, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
> 	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

I think you should mention in the commit message that this option also 
covers the case
of hugetlbfs mount with quota.

>   
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +	return setjmp(jmpenv);
> +}

Use sigsetjmp instead of setjmp and restore the signal masks.

>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		if (fd < 0) {
>   			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>   					strerror(errno));
> -			return -1;
> +			return i;

When using --try-best, we could get an error and still work as expected.
It can be confusing for users to see an error when it is expected behavior.

Any thoughts?

>   		}
>   
>   		/* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>   					strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>   

Same comment as above

>   		/* set shared flock on the file. */
>   		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>   			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>   				__func__, strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;

Same comment as above

> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>   			continue;
>   
>   		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			if (internal_config.huge_trybest) {
> +				int pages = pages_old - pages_new;
> +
> +				internal_config.memory -=
> +					hpi->hugepage_sz * pages;
> +				nr_hugepages -= pages;
> +				hpi->num_pages[0] = pages_new;
> +				if (pages_new == 0)
> +					continue;
> +			} else
> +				goto fail;
>   		}

There is another call to map_all_hugepages that you are not updating the 
check of the return value.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-05-04 11:07     ` Sergio Gonzalez Monroy
@ 2016-05-04 11:28       ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-04 11:28 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy, dev; +Cc: david.marchand, nhorman, konstantin.ananyev

Hi Sergio,


On 5/4/2016 7:07 PM, Sergio Gonzalez Monroy wrote:
> On 08/03/2016 01:42, Jianfeng Tan wrote:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if 
>> fails
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Test example:
>>    a. cgcreate -g hugetlb:/test-subgroup
>>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>>    c. cgexec -g hugetlb:test-subgroup \
>>       ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
>
> I think you should mention in the commit message that this option also 
> covers the case
> of hugetlbfs mount with quota.

Yes, I should do that.

>
>>   +static sigjmp_buf jmpenv;
>> +
>> +static void sigbus_handler(int signo __rte_unused)
>> +{
>> +    siglongjmp(jmpenv, 1);
>> +}
>> +
>> +/* Put setjmp into a wrap method to avoid compiling error. Any 
>> non-volatile,
>> + * non-static local variable in the stack frame calling setjmp might be
>> + * clobbered by a call to longjmp.
>> + */
>> +static int wrap_setjmp(void)
>> +{
>> +    return setjmp(jmpenv);
>> +}
>
> Use sigsetjmp instead of setjmp and restore the signal masks.

The difference lies in whether signal mask will be saved for further 
restore. And you are right we should keep either sigsetjmp(xxx, 
1)/siglongjmp(xxx, 1) or setjmp()/longjmp. Nice catch! I'll go with the 
former.

>
>>   /*
>>    * Mmap all hugepages of hugepage table: it first open a file in
>>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
>> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>           if (fd < 0) {
>>               RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>>                       strerror(errno));
>> -            return -1;
>> +            return i;
>
> When using --try-best, we could get an error and still work as expected.
> It can be confusing for users to see an error when it is expected 
> behavior.
>
> Any thoughts?

Shall we remove those RTE_LOG complaints, because the failure here does 
not mean we cannot satisfy the requirements of applications?

...
> There is another call to map_all_hugepages that you are not updating 
> the check of the return value.

You are right, the second map_all_hugepages's return value check should 
be changed to compare with the total hugepages owned by the hpi. I'll 
fix this.

Thanks,
Jianfeng

>
> Sergio
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
  2016-03-08  8:46     ` Tan, Jianfeng
  2016-05-04 11:07     ` Sergio Gonzalez Monroy
@ 2016-05-04 12:25     ` Sergio Gonzalez Monroy
  2 siblings, 0 replies; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-04 12:25 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: david.marchand, nhorman, konstantin.ananyev

On 08/03/2016 01:42, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
> 	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

I think you should mention in the commit message that this option also 
covers the case
of hugetlbfs mount with quota.

>
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +	return setjmp(jmpenv);
> +}

Use sigsetjmp instead of setjmp and restore the signal masks.

>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		if (fd < 0) {
>   			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>   					strerror(errno));
> -			return -1;
> +			return i;

When using --try-best, we could get an error and still work as expected.
It can be confusing for users to see an error when it is expected behavior.

Any thoughts?

>   		}
>
>   		/* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>   					strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>

Same comment as above

>   		/* set shared flock on the file. */
>   		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>   			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>   				__func__, strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;

Same comment as above

> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>   			continue;
>
>   		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			if (internal_config.huge_trybest) {
> +				int pages = pages_old - pages_new;
> +
> +				internal_config.memory -=
> +					hpi->hugepage_sz * pages;
> +				nr_hugepages -= pages;
> +				hpi->num_pages[0] = pages_new;
> +				if (pages_new == 0)
> +					continue;
> +			} else
> +				goto fail;
>   		}

There is another call to map_all_hugepages that you are not updating the 
check of the return value.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v3] eal: make hugetlb initialization more robust
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
@ 2016-05-09 10:48   ` Jianfeng Tan
  2016-05-10  8:54     ` Sergio Gonzalez Monroy
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
  2 siblings, 1 reply; 63+ messages in thread
From: Jianfeng Tan @ 2016-05-09 10:48 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, sergio.gonzalez.monroy, nhorman, Jianfeng Tan

This patch adds an option, --huge-trybest, to use a recover mechanism to
the case that there are not so many hugepages (declared in sysfs), which
can be used. It relies on a mem access to fault-in hugepages, and if it fails
with SIGBUS, recover to previously saved stack environment with
siglongjmp().

Besides, this solution fixes an issue when hugetlbfs is specified with an
option of size. Currently DPDK does not respect the quota of a hugetlbfs
mount. It fails to init the EAL because it tries to map the number of free
hugepages in the system rather than using the number specified in the quota
for that mount.

It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
fail to remap hugepages as it relies on having mapped all free hugepages
in the system.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 +
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 115 +++++++++++++++++++++++++----
 5 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 3efc90f..e9a111d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -899,6 +900,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..eeb1d4e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..cb0df76 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_sigsetjmp(void)
+{
+	return sigsetjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1255,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if ((uint32_t)map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1271,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1460,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: make hugetlb initialization more robust
  2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
@ 2016-05-10  8:54     ` Sergio Gonzalez Monroy
  2016-05-10  9:11       ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-10  8:54 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: david.marchand, nhorman


Hi Jianfeng,

On 09/05/2016 11:48, Jianfeng Tan wrote:

>   		/* find physical addresses and sockets for each hugepage */
> @@ -1172,8 +1255,9 @@ rte_eal_hugepage_init(void)
>   		hp_offset += new_pages_count[i];
>   #else
>   		/* remap all hugepages */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
> +		if ((uint32_t)map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
> +		    hpi->num_pages[0]) {

It probably makes more sense to have map_all_hugepages return uint32_t 
instead.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: make hugetlb initialization more robust
  2016-05-10  8:54     ` Sergio Gonzalez Monroy
@ 2016-05-10  9:11       ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-10  9:11 UTC (permalink / raw)
  To: Gonzalez Monroy, Sergio, dev; +Cc: david.marchand, nhorman

Hi Sergio,

> -----Original Message-----
> From: Gonzalez Monroy, Sergio
> Sent: Tuesday, May 10, 2016 4:55 PM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: david.marchand@6wind.com; nhorman@tuxdriver.com
> Subject: Re: [PATCH v3] eal: make hugetlb initialization more robust
> 
> 
> Hi Jianfeng,
> 
> On 09/05/2016 11:48, Jianfeng Tan wrote:
> 
> >   		/* find physical addresses and sockets for each hugepage */
> > @@ -1172,8 +1255,9 @@ rte_eal_hugepage_init(void)
> >   		hp_offset += new_pages_count[i];
> >   #else
> >   		/* remap all hugepages */
> > -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
> > -			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB
> pages\n",
> > +		if ((uint32_t)map_all_hugepages(&tmp_hp[hp_offset], hpi,
> 0) !=
> > +		    hpi->num_pages[0]) {
> 
> It probably makes more sense to have map_all_hugepages return uint32_t
> instead.

Yes, I agree. I was wrongly expecting there to be a FreeBSD version of map_all_hugepages with the same function type.

I'll fix this in next version.

Thanks,
Jianfeng

> 
> Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v4] eal: make hugetlb initialization more robust
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
  2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
@ 2016-05-12  0:44   ` Jianfeng Tan
  2016-05-17 16:39     ` David Marchand
  2016-05-17 16:40     ` Thomas Monjalon
  2 siblings, 2 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-05-12  0:44 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, sergio.gonzalez.monroy, nhorman, Jianfeng Tan

This patch adds an option, --huge-trybest, to use a recovery mechanism for
the case that there are not as many usable hugepages as declared in sysfs.
It relies on a memory access to fault in hugepages, and if that fails
with SIGBUS, recovers to the previously saved stack environment with
siglongjmp().

Besides, this solution fixes an issue when hugetlbfs is specified with a
size option. Currently DPDK does not respect the quota of a hugetlbfs
mount. It fails to init the EAL because it tries to map the number of free
hugepages in the system rather than using the number specified in the quota
for that mount.

It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
fail to remap hugepages as it relies on having mapped all free hugepages
in the system.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 +
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 118 +++++++++++++++++++++++++----
 5 files changed, 112 insertions(+), 14 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 3efc90f..e9a111d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -899,6 +900,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..eeb1d4e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..8c77010 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_sigsetjmp(void)
+{
+	return sigsetjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1188,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1207,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1256,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1272,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1461,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
@ 2016-05-17 16:39     ` David Marchand
  2016-05-18  7:56       ` Sergio Gonzalez Monroy
  2016-05-19  2:00       ` Tan, Jianfeng
  2016-05-17 16:40     ` Thomas Monjalon
  1 sibling, 2 replies; 63+ messages in thread
From: David Marchand @ 2016-05-17 16:39 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, Sergio Gonzalez Monroy, Neil Horman

Hello Jianfeng,

On Thu, May 12, 2016 at 2:44 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Besides, this solution fixes an issue when hugetlbfs is specified with an
> option of size. Currently DPDK does not respect the quota of a hugetblfs
> mount. It fails to init the EAL because it tries to map the number of free
> hugepages in the system rather than using the number specified in the quota
> for that mount.
>
> It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
> this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
> fail to remap hugepages as it relies on having mapped all free hugepages
> in the system.

For such a case, maybe having a warning log message when it
fails would help the user.
+ a known issue in the release notes?


> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..8c77010 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>                         hugepg_tbl[i].final_va = virtaddr;
>                 }
>
> +               if (orig && internal_config.huge_trybest) {
> +                       /* In linux, hugetlb limitations, like cgroup, are
> +                        * enforced at fault time instead of mmap(), even
> +                        * with the option of MAP_POPULATE. Kernel will send
> +                        * a SIGBUS signal. To avoid to be killed, save stack
> +                        * environment here, if SIGBUS happens, we can jump
> +                        * back here.
> +                        */
> +                       if (wrap_sigsetjmp()) {
> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
> +                                       "hugepages of size %u MB\n",
> +                                       (unsigned)(hugepage_sz / 0x100000));
> +                               munmap(virtaddr, hugepage_sz);
> +                               close(fd);
> +                               unlink(hugepg_tbl[i].filepath);
> +                               return i;
> +                       }
> +                       *(int *)virtaddr = 0;
> +               }
> +
> +
>                 /* set shared flock on the file. */
>                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
> -                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
> +                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>                                 __func__, strerror(errno));
>                         close(fd);
> -                       return -1;
> +                       return i;
>                 }
>
>                 close(fd);

Maybe I missed something, but we are writing into some hugepage before
the flock has been called.
Are we sure there is nobody else using this hugepage ?

Especially, can't this cause trouble to a primary process running if
we start the exact same primary process ?


-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
  2016-05-17 16:39     ` David Marchand
@ 2016-05-17 16:40     ` Thomas Monjalon
  2016-05-18  8:06       ` Sergio Gonzalez Monroy
  1 sibling, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-05-17 16:40 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, sergio.gonzalez.monroy, nhorman

2016-05-12 00:44, Jianfeng Tan:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails

relys -> relies

> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
> 
> Besides, this solution fixes an issue when hugetlbfs is specified with an
> option of size. Currently DPDK does not respect the quota of a hugetblfs
> mount. It fails to init the EAL because it tries to map the number of free
> hugepages in the system rather than using the number specified in the quota
> for that mount.

It looks to be a bug. Why adding an option?
What is the benefit of the old behaviour, not using --try-best?

> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling sigsetjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_sigsetjmp(void)
> +{
> +	return sigsetjmp(jmpenv, 1);
> +}

Please add the word "huge" to these variables and functions.

> +static struct sigaction action_old;
> +static int need_recover;
> +
> +static void
> +register_sigbus(void)
> +{
> +	sigset_t mask;
> +	struct sigaction action;
> +
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGBUS);
> +	action.sa_flags = 0;
> +	action.sa_mask = mask;
> +	action.sa_handler = sigbus_handler;
> +
> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +	if (need_recover) {
> +		sigaction(SIGBUS, &action_old, NULL);
> +		need_recover = 0;
> +	}
> +}

Idem, Please add the word "huge".

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-17 16:39     ` David Marchand
@ 2016-05-18  7:56       ` Sergio Gonzalez Monroy
  2016-05-18  9:34         ` David Marchand
  2016-05-19  2:00       ` Tan, Jianfeng
  1 sibling, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-18  7:56 UTC (permalink / raw)
  To: David Marchand, Jianfeng Tan; +Cc: dev, Neil Horman

On 17/05/2016 17:39, David Marchand wrote:
> Hello Jianfeng,
>
> On Thu, May 12, 2016 at 2:44 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if fails
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>> mount. It fails to init the EAL because it tries to map the number of free
>> hugepages in the system rather than using the number specified in the quota
>> for that mount.
>>
>> It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
>> this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
>> fail to remap hugepages as it relies on having mapped all free hugepages
>> in the system.
> For such a case case, maybe having some warning log message when it
> fails would help the user.
> + a known issue in the release notes ?
>
>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index 5b9132c..8c77010 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>                          hugepg_tbl[i].final_va = virtaddr;
>>                  }
>>
>> +               if (orig && internal_config.huge_trybest) {
>> +                       /* In linux, hugetlb limitations, like cgroup, are
>> +                        * enforced at fault time instead of mmap(), even
>> +                        * with the option of MAP_POPULATE. Kernel will send
>> +                        * a SIGBUS signal. To avoid to be killed, save stack
>> +                        * environment here, if SIGBUS happens, we can jump
>> +                        * back here.
>> +                        */
>> +                       if (wrap_sigsetjmp()) {
>> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
>> +                                       "hugepages of size %u MB\n",
>> +                                       (unsigned)(hugepage_sz / 0x100000));
>> +                               munmap(virtaddr, hugepage_sz);
>> +                               close(fd);
>> +                               unlink(hugepg_tbl[i].filepath);
>> +                               return i;
>> +                       }
>> +                       *(int *)virtaddr = 0;
>> +               }
>> +
>> +
>>                  /* set shared flock on the file. */
>>                  if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>> -                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>> +                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>>                                  __func__, strerror(errno));
>>                          close(fd);
>> -                       return -1;
>> +                       return i;
>>                  }
>>
>>                  close(fd);
> Maybe I missed something, but we are writing into some hugepage before
> the flock has been called.
> Are we sure there is nobody else using this hugepage ?
>
> Especially, can't this cause trouble to a primary process running if
> we start the exact same primary process ?
>

We lock the hugepage directory during eal_hugepage_info_init(), and we 
do not unlock
until we have finished eal_memory_init.

I think that takes care of that case.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-17 16:40     ` Thomas Monjalon
@ 2016-05-18  8:06       ` Sergio Gonzalez Monroy
  2016-05-18  9:38         ` David Marchand
  2016-05-19  2:11         ` Tan, Jianfeng
  0 siblings, 2 replies; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-18  8:06 UTC (permalink / raw)
  To: Thomas Monjalon, Jianfeng Tan; +Cc: dev, david.marchand, nhorman

On 17/05/2016 17:40, Thomas Monjalon wrote:
> 2016-05-12 00:44, Jianfeng Tan:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if fails
> relys -> relies
>
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>> mount. It fails to init the EAL because it tries to map the number of free
>> hugepages in the system rather than using the number specified in the quota
>> for that mount.
> It looks to be a bug. Why adding an option?
> What is the benefit of the old behaviour, not using --try-best?

I do not see any benefit to the old behavior.
Given that we need the signal handling for the cgroup use case, I would 
be inclined to use
this method as the default instead of trying to figure out how many 
hugepages we have free, etc.

Thoughts?

Sergio

>> +static sigjmp_buf jmpenv;
>> +
>> +static void sigbus_handler(int signo __rte_unused)
>> +{
>> +	siglongjmp(jmpenv, 1);
>> +}
>> +
>> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
>> + * non-static local variable in the stack frame calling sigsetjmp might be
>> + * clobbered by a call to longjmp.
>> + */
>> +static int wrap_sigsetjmp(void)
>> +{
>> +	return sigsetjmp(jmpenv, 1);
>> +}
> Please add the word "huge" to these variables and functions.
>
>> +static struct sigaction action_old;
>> +static int need_recover;
>> +
>> +static void
>> +register_sigbus(void)
>> +{
>> +	sigset_t mask;
>> +	struct sigaction action;
>> +
>> +	sigemptyset(&mask);
>> +	sigaddset(&mask, SIGBUS);
>> +	action.sa_flags = 0;
>> +	action.sa_mask = mask;
>> +	action.sa_handler = sigbus_handler;
>> +
>> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
>> +}
>> +
>> +static void
>> +recover_sigbus(void)
>> +{
>> +	if (need_recover) {
>> +		sigaction(SIGBUS, &action_old, NULL);
>> +		need_recover = 0;
>> +	}
>> +}
> Idem, Please add the word "huge".
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-18  7:56       ` Sergio Gonzalez Monroy
@ 2016-05-18  9:34         ` David Marchand
  0 siblings, 0 replies; 63+ messages in thread
From: David Marchand @ 2016-05-18  9:34 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy; +Cc: Jianfeng Tan, dev, Neil Horman

Hello Sergio,

On Wed, May 18, 2016 at 9:56 AM, Sergio Gonzalez Monroy
<sergio.gonzalez.monroy@intel.com> wrote:
> On 17/05/2016 17:39, David Marchand wrote:
>>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> index 5b9132c..8c77010 100644
>>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>                          hugepg_tbl[i].final_va = virtaddr;
>>>                  }
>>>
>>> +               if (orig && internal_config.huge_trybest) {
>>> +                       /* In linux, hugetlb limitations, like cgroup,
>>> are
>>> +                        * enforced at fault time instead of mmap(), even
>>> +                        * with the option of MAP_POPULATE. Kernel will
>>> send
>>> +                        * a SIGBUS signal. To avoid to be killed, save
>>> stack
>>> +                        * environment here, if SIGBUS happens, we can
>>> jump
>>> +                        * back here.
>>> +                        */
>>> +                       if (wrap_sigsetjmp()) {
>>> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap
>>> more "
>>> +                                       "hugepages of size %u MB\n",
>>> +                                       (unsigned)(hugepage_sz /
>>> 0x100000));
>>> +                               munmap(virtaddr, hugepage_sz);
>>> +                               close(fd);
>>> +                               unlink(hugepg_tbl[i].filepath);
>>> +                               return i;
>>> +                       }
>>> +                       *(int *)virtaddr = 0;
>>> +               }
>>> +
>>> +
>>>                  /* set shared flock on the file. */
>>>                  if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>>> -                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s
>>> \n",
>>> +                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s
>>> \n",
>>>                                  __func__, strerror(errno));
>>>                          close(fd);
>>> -                       return -1;
>>> +                       return i;
>>>                  }
>>>
>>>                  close(fd);
>>
>> Maybe I missed something, but we are writing into some hugepage before
>> the flock has been called.
>> Are we sure there is nobody else using this hugepage ?
>>
>> Especially, can't this cause trouble to a primary process running if
>> we start the exact same primary process ?
>>
>
> We lock the hugepage directory during eal_hugepage_info_init(), and we do
> not unlock
> until we have finished eal_memory_init.
>
> I think that takes care of that case.

Yes, thanks.

-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-18  8:06       ` Sergio Gonzalez Monroy
@ 2016-05-18  9:38         ` David Marchand
  2016-05-19  2:11         ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: David Marchand @ 2016-05-18  9:38 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy; +Cc: Thomas Monjalon, Jianfeng Tan, dev, Neil Horman

On Wed, May 18, 2016 at 10:06 AM, Sergio Gonzalez Monroy
<sergio.gonzalez.monroy@intel.com> wrote:
> On 17/05/2016 17:40, Thomas Monjalon wrote:
>>
>> 2016-05-12 00:44, Jianfeng Tan:
>>>
>>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>>> the case that there are not so many hugepages (declared in sysfs), which
>>> can be used. It relys on a mem access to fault-in hugepages, and if fails
>>
>> relys -> relies
>>
>>> with SIGBUS, recover to previously saved stack environment with
>>> siglongjmp().
>>>
>>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>>> mount. It fails to init the EAL because it tries to map the number of
>>> free
>>> hugepages in the system rather than using the number specified in the
>>> quota
>>> for that mount.
>>
>> It looks to be a bug. Why adding an option?
>> What is the benefit of the old behaviour, not using --try-best?
>
>
> I do not see any benefit to the old behavior.
> Given that we need the signal handling for the cgroup use case, I would be
> inclined to use
> this method as the default instead of trying to figure out how many
> hugepages we have free, etc.

+1


-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 13:05       ` Panu Matilainen
  2016-03-09 13:53         ` Tan, Jianfeng
@ 2016-05-18 12:46         ` David Marchand
  2016-05-19  2:25           ` Tan, Jianfeng
  1 sibling, 1 reply; 63+ messages in thread
From: David Marchand @ 2016-05-18 12:46 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, Panu Matilainen

Hello Jianfeng,

On Wed, Mar 9, 2016 at 2:05 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>
>> Hi Panu,
>>
>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>
>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>
>>>> This patch adds option, --avail-cores, to use lcores which are available
>>>> by calling pthread_getaffinity_np() to narrow down detected cores before
>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>
>>>> Test example:
>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>         --avail-cores -m 1024
>>>>
>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>
>>>
>>> Hmm, to me this sounds like something that should be done always so
>>> there's no need for an option. Or if there's a chance it might do the
>>> wrong thing in some rare circumstance then perhaps there should be a
>>> disabler option instead?
>>
>>
>> Thanks for comments.
>>
>> Yes, there's a use case that we cannot handle.
>>
>> If we make it as default, DPDK applications may fail to start, when user
>> specifies a core in isolcpus and its parent process (say bash) has a
>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>> just blindly do pthread_setaffinity_np() and it always succeeds because
>> it always has root privilege to change any cpu affinity.
>>
>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>> flagged as undetected (in my older implementation) and leads to failure.
>> To make it correct, we would always add "taskset mask" (or other ways)
>> before DPDK application cmd lines.
>>
>> What do you think?
>
>
> I still think it sounds like something that should be done by default and
> maybe be overridable with some flag, rather than the other way around.
> Another alternative might be detecting the cores always but if running as
> root, override but with a warning.
>
> But I dont know, just wondering. To look at it from another angle: why would
> somebody use this new --avail-cores option and in what situation, if things
> "just work" otherwise anyway?

+1 and I don't even see why we should have an option to disable this,
since taskset would do the job.

Looking at your special case, if the user did set an isolcpus option
for another use, with no -c/-l, I understand the dpdk application
won't care too much about it.
So, this seems like somehow rude to the rest of the system and unwanted.

We can still help the user starting its application as root (without
taskset) by adding a warning message if a requested cpu (-c / -l ..)
is not part of the available cpus.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-17 16:39     ` David Marchand
  2016-05-18  7:56       ` Sergio Gonzalez Monroy
@ 2016-05-19  2:00       ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-19  2:00 UTC (permalink / raw)
  To: David Marchand; +Cc: dev, Sergio Gonzalez Monroy, Neil Horman

Hi David,


On 5/18/2016 12:39 AM, David Marchand wrote:
> Hello Jianfeng,
>
> On Thu, May 12, 2016 at 2:44 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if fails
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>> mount. It fails to init the EAL because it tries to map the number of free
>> hugepages in the system rather than using the number specified in the quota
>> for that mount.
>>
>> It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
>> this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
>> fail to remap hugepages as it relies on having mapped all free hugepages
>> in the system.
>
>
>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index 5b9132c..8c77010 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>                          hugepg_tbl[i].final_va = virtaddr;
>>                  }
>>
>> +               if (orig && internal_config.huge_trybest) {
>> +                       /* In linux, hugetlb limitations, like cgroup, are
>> +                        * enforced at fault time instead of mmap(), even
>> +                        * with the option of MAP_POPULATE. Kernel will send
>> +                        * a SIGBUS signal. To avoid to be killed, save stack
>> +                        * environment here, if SIGBUS happens, we can jump
>> +                        * back here.
>> +                        */
>> +                       if (wrap_sigsetjmp()) {
>> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
>> +                                       "hugepages of size %u MB\n",
>> +                                       (unsigned)(hugepage_sz / 0x100000));
> For such a case case, maybe having some warning log message when it
> fails would help the user.
> + a known issue in the release notes ?

Do you mean when sigbus is triggered, like here, warn the user that "it 
fails to hold all free hugepages as sysfs shows", and
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
/*we need to return error from rte_eal_init_memory */
#endif

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-18  8:06       ` Sergio Gonzalez Monroy
  2016-05-18  9:38         ` David Marchand
@ 2016-05-19  2:11         ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-19  2:11 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy, Thomas Monjalon; +Cc: dev, david.marchand, nhorman

Hi Thomas & Sergio,


On 5/18/2016 4:06 PM, Sergio Gonzalez Monroy wrote:
> On 17/05/2016 17:40, Thomas Monjalon wrote:
>> 2016-05-12 00:44, Jianfeng Tan:
>>> This patch adds an option, --huge-trybest, to use a recover 
>>> mechanism to
>>> the case that there are not so many hugepages (declared in sysfs), 
>>> which
>>> can be used. It relys on a mem access to fault-in hugepages, and if 
>>> fails
>> relys -> relies
>>
>>> with SIGBUS, recover to previously saved stack environment with
>>> siglongjmp().
>>>
>>> Besides, this solution fixes an issue when hugetlbfs is specified 
>>> with an
>>> option of size. Currently DPDK does not respect the quota of a 
>>> hugetblfs
>>> mount. It fails to init the EAL because it tries to map the number 
>>> of free
>>> hugepages in the system rather than using the number specified in 
>>> the quota
>>> for that mount.
>> It looks to be a bug. Why adding an option?
>> What is the benefit of the old behaviour, not using --try-best?
>
> I do not see any benefit to the old behavior.
> Given that we need the signal handling for the cgroup use case, I 
> would be inclined to use
> this method as the default instead of trying to figure out how many 
> hugepages we have free, etc.
>
> Thoughts?

I tend to use this method as the default too, with some warning logs as 
suggested by David, and return error from rte_eal_memory() when sigbus 
is triggered under the case of RTE_EAL_SINGLE_FILE_SEGMENTS.

Thomas, all other trivial issues will be fixed in next version. Thank you!

Thanks,
Jianfeng

>
> Sergio
>
>>> +static sigjmp_buf jmpenv;
>>> +
>>> +static void sigbus_handler(int signo __rte_unused)
>>> +{
>>> +    siglongjmp(jmpenv, 1);
>>> +}
>>> +
>>> +/* Put setjmp into a wrap method to avoid compiling error. Any 
>>> non-volatile,
>>> + * non-static local variable in the stack frame calling sigsetjmp 
>>> might be
>>> + * clobbered by a call to longjmp.
>>> + */
>>> +static int wrap_sigsetjmp(void)
>>> +{
>>> +    return sigsetjmp(jmpenv, 1);
>>> +}
>> Please add the word "huge" to these variables and functions.
>>
>>> +static struct sigaction action_old;
>>> +static int need_recover;
>>> +
>>> +static void
>>> +register_sigbus(void)
>>> +{
>>> +    sigset_t mask;
>>> +    struct sigaction action;
>>> +
>>> +    sigemptyset(&mask);
>>> +    sigaddset(&mask, SIGBUS);
>>> +    action.sa_flags = 0;
>>> +    action.sa_mask = mask;
>>> +    action.sa_handler = sigbus_handler;
>>> +
>>> +    need_recover = !sigaction(SIGBUS, &action, &action_old);
>>> +}
>>> +
>>> +static void
>>> +recover_sigbus(void)
>>> +{
>>> +    if (need_recover) {
>>> +        sigaction(SIGBUS, &action_old, NULL);
>>> +        need_recover = 0;
>>> +    }
>>> +}
>> Idem, Please add the word "huge".
>>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-05-18 12:46         ` David Marchand
@ 2016-05-19  2:25           ` Tan, Jianfeng
  2016-06-30 13:43             ` Thomas Monjalon
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-19  2:25 UTC (permalink / raw)
  To: David Marchand; +Cc: dev, Panu Matilainen

Hi David,


On 5/18/2016 8:46 PM, David Marchand wrote:
> Hello Jianfeng,
>
> On Wed, Mar 9, 2016 at 2:05 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>> Hi Panu,
>>>
>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>> This patch adds option, --avail-cores, to use lcores which are available
>>>>> by calling pthread_getaffinity_np() to narrow down detected cores before
>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>
>>>>> Test example:
>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>          --avail-cores -m 1024
>>>>>
>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>
>>>> Hmm, to me this sounds like something that should be done always so
>>>> there's no need for an option. Or if there's a chance it might do the
>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>> disabler option instead?
>>>
>>> Thanks for comments.
>>>
>>> Yes, there's a use case that we cannot handle.
>>>
>>> If we make it as default, DPDK applications may fail to start, when user
>>> specifies a core in isolcpus and its parent process (say bash) has a
>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>> it always has root privilege to change any cpu affinity.
>>>
>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>> flagged as undetected (in my older implementation) and leads to failure.
>>> To make it correct, we would always add "taskset mask" (or other ways)
>>> before DPDK application cmd lines.
>>>
>>> What do you think?
>>
>> I still think it sounds like something that should be done by default and
>> maybe be overridable with some flag, rather than the other way around.
>> Another alternative might be detecting the cores always but if running as
>> root, override but with a warning.
>>
>> But I dont know, just wondering. To look at it from another angle: why would
>> somebody use this new --avail-cores option and in what situation, if things
>> "just work" otherwise anyway?
> +1 and I don't even see why we should have an option to disable this,
> since taskset would do the job.
>
> Looking at your special case, if the user did set an isolcpus option
> for another use, with no -c/-l, I understand the dpdk application
> won't care too much about it.
> So, this seems like somehow rude to the rest of the system and unwanted.

The case you mentioned above is not the case I mean. But you make your 
point about this one.
The case I originally mean: user sets an isolcpus option for DPDK 
applications. Originally, DPDK apps would be started without any 
problem. But for now, fail to start them because the required cores are 
excluded before -c/-l. As per your comments following, we can add a 
warning message (or should we quit on this situation?). But it indeed 
has an effect on old users (they should change to use "taskset 
./dpdk_app ..."). Do you think it's a problem?

Thanks,
Jianfeng


>
> We can still help the user starting its application as root (without
> taskset) by adding a warning message if a requested cpu (-c / -l ..)
> is not part of the available cpus.
>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v5] eal: fix allocating all free hugepages
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (3 preceding siblings ...)
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
@ 2016-05-31  3:37 ` Jianfeng Tan
  2016-06-06  2:49   ` Pei, Yulong
  2016-06-08 11:27   ` Sergio Gonzalez Monroy
  2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  6 siblings, 2 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-05-31  3:37 UTC (permalink / raw)
  To: dev
  Cc: sergio.gonzalez.monroy, nhorman, david.marchand, thomas.monjalon,
	Jianfeng Tan

EAL memory init allocates all free hugepages of the whole system,
which seen from sysfs, even when applications do not ask so many.
When there is a limitation on how many hugepages an application can
use (such as cgroup.hugetlb), or hugetlbfs is specified with an
option of size (exceeding the quota of the fs), it just fails to
start even when there are enough hugepages allocated.

To fix above issue, this patch:
 - Changes the logic to continue memory init to see if hugetlb
   requirement of application can be addressed by already allocated
   hugepages.
 - To make sure each hugepage is allocated successfully, we add a
   recover mechanism, which relies on a mem access to fault-in
   hugepages, and if it fails with SIGBUS, recover to previously
   saved stack environment with siglongjmp().

For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by
default when compiling IVSHMEM target), it's indispensable to
map all free hugepages in the system. Under this case, it fails
to start when allocating fails.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
          ./examples/helloworld/build/helloworld -c 0x2 -n 4

       
Fixes: af75078fece ("first public release")

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v5:
 - Make this method as default instead of using an option.
 - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
   just return error.
 - Add prefix "huge_" to newly added function and static variables.
 - Move the internal_config.memory assignment after the page allocations.
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
 lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
 2 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..4a8dfbd 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -465,24 +465,6 @@ eal_parse_vfio_intr(const char *mode)
 	return -1;
 }
 
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
-	uint64_t size = 0;
-	unsigned i, j;
-
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL) {
-			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-				size += hpi->hugepage_sz * hpi->num_pages[j];
-			}
-		}
-	}
-
-	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
 /* Parse the arguments for --log-level only */
 static void
 eal_log_level_parse(int argc, char **argv)
@@ -766,8 +748,6 @@ rte_eal_init(int argc, char **argv)
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
 		if (internal_config.no_hugetlbfs)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
-		else
-			internal_config.memory = eal_get_hugepage_mem_size();
 	}
 
 	if (internal_config.vmware_tsc_map == 1) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..dc6f49b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (huge_wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,51 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+	uint64_t size = 0;
+	unsigned i, j;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+		if (hpi->hugedir != NULL) {
+			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+				size += hpi->hugepage_sz * hpi->num_pages[j];
+			}
+		}
+	}
+
+	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1206,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	huge_register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1224,28 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			RTE_LOG(ERR, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
+#else
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+
+			int pages = pages_old - pages_new;
+
+			nr_hugepages -= pages;
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
+#endif
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1277,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1293,11 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	huge_recover_sigbus();
+
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+		internal_config.memory = eal_get_hugepage_mem_size();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1484,7 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	huge_recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] eal: fix allocating all free hugepages
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
@ 2016-06-06  2:49   ` Pei, Yulong
  2016-06-08 11:27   ` Sergio Gonzalez Monroy
  1 sibling, 0 replies; 63+ messages in thread
From: Pei, Yulong @ 2016-06-06  2:49 UTC (permalink / raw)
  To: Tan, Jianfeng, dev
  Cc: Gonzalez Monroy, Sergio, nhorman, david.marchand,
	thomas.monjalon, Tan, Jianfeng

Tested-by: Yulong Pei <Yulong.pei@intel.com>

1. Run dpdk app with multiple mount points, it works as expected.
2. Create new cgroup with limited hugepages like the following, and Run dpdk app with the newly created cgroup, it works as expected.

#cgcreate -g hugetlb:/test-subgroup
# cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
# cgexec -g hugetlb:test-subgroup ./x86_64-native-linuxapp-gcc/app/testpmd -c 0x3 -n 4 -- -i

Best Regards
Yulong Pei

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
Sent: Tuesday, May 31, 2016 11:37 AM
To: dev@dpdk.org
Cc: Gonzalez Monroy, Sergio <sergio.gonzalez.monroy@intel.com>; nhorman@tuxdriver.com; david.marchand@6wind.com; thomas.monjalon@6wind.com; Tan, Jianfeng <jianfeng.tan@intel.com>
Subject: [dpdk-dev] [PATCH v5] eal: fix allocating all free hugepages

EAL memory init allocates all free hugepages of the whole system, which seen from sysfs, even when applications do not ask so many.
When there is a limitation on how many hugepages an application can use (such as cgroup.hugetlb), or hugetlbfs is specified with an option of size (exceeding the quota of the fs), it just fails to start even when there are enough hugepages allocated.

To fix above issue, this patch:
 - Changes the logic to continue memory init to see if hugetlb
   requirement of application can be addressed by already allocated
   hugepages.
 - To make sure each hugepage is allocated successfully, we add a
   recover mechanism, which relies on a mem access to fault-in
   hugepages, and if it fails with SIGBUS, recover to previously
   saved stack environment with siglongjmp().

For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by default when compiling IVSHMEM target), it's indispensable to map all free hugepages in the system. Under this case, it fails to start when allocating fails.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
          ./examples/helloworld/build/helloworld -c 0x2 -n 4

       
Fixes: af75078fece ("first public release")

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v5:
 - Make this method as default instead of using an option.
 - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
   just return error.
 - Add prefix "huge_" to newly added function and static variables.
 - Move the internal_config.memory assignment after the page allocations.
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
 lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
 2 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..4a8dfbd 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -465,24 +465,6 @@ eal_parse_vfio_intr(const char *mode)
 	return -1;
 }
 
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
-	uint64_t size = 0;
-	unsigned i, j;
-
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL) {
-			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-				size += hpi->hugepage_sz * hpi->num_pages[j];
-			}
-		}
-	}
-
-	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
 /* Parse the arguments for --log-level only */  static void  eal_log_level_parse(int argc, char **argv) @@ -766,8 +748,6 @@ rte_eal_init(int argc, char **argv)
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
 		if (internal_config.no_hugetlbfs)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
-		else
-			internal_config.memory = eal_get_hugepage_mem_size();
 	}
 
 	if (internal_config.vmware_tsc_map == 1) { diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..dc6f49b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused) {
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any 
+non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might 
+be
+ * clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the @@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables, @@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (huge_wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,51 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+	uint64_t size = 0;
+	unsigned i, j;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+		if (hpi->hugedir != NULL) {
+			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+				size += hpi->hugepage_sz * hpi->num_pages[j];
+			}
+		}
+	}
+
+	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; }
+
+static struct sigaction huge_action_old; static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); }
+
+static void
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1206,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	huge_register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1224,28 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			RTE_LOG(ERR, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
+#else
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+
+			int pages = pages_old - pages_new;
+
+			nr_hugepages -= pages;
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
+#endif
 		}
 
 		/* find physical addresses and sockets for each hugepage */ @@ -1172,8 +1277,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1293,11 @@ rte_eal_hugepage_init(void)  #endif
 	}
 
+	huge_recover_sigbus();
+
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+		internal_config.memory = eal_get_hugepage_mem_size();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { @@ -1373,6 +1484,7 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	huge_recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
--
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] eal: fix allocating all free hugepages
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
  2016-06-06  2:49   ` Pei, Yulong
@ 2016-06-08 11:27   ` Sergio Gonzalez Monroy
  2016-06-30 13:34     ` Thomas Monjalon
  1 sibling, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-06-08 11:27 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: nhorman, david.marchand, thomas.monjalon

On 31/05/2016 04:37, Jianfeng Tan wrote:
> EAL memory init allocates all free hugepages of the whole system,
> which seen from sysfs, even when applications do not ask so many.
> When there is a limitation on how many hugepages an application can
> use (such as cgroup.hugetlb), or hugetlbfs is specified with an
> option of size (exceeding the quota of the fs), it just fails to
> start even there are enough hugepages allocated.
>
> To fix above issue, this patch:
>   - Changes the logic to continue memory init to see if hugetlb
>     requirement of application can be addressed by already allocated
>     hugepages.
>   - To make sure each hugepage is allocated successfully, we add a
>     recover mechanism, which relies on a mem access to fault-in
>     hugepages, and if it fails with SIGBUS, recover to previously
>     saved stack environment with siglongjmp().
>
> For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by
> default when compiling IVSHMEM target), it's indispensable to
> map all free hugepages in the system. In this case, it fails
> to start when allocation fails.
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
>            ./examples/helloworld/build/helloworld -c 0x2 -n 4
>
>         
> Fixes: af75078fece ("first public release")
>
> Signed-off-by: Jianfeng Tan<jianfeng.tan@intel.com>
> Acked-by: Neil Horman<nhorman@tuxdriver.com>
> ---
> v5:
>   - Make this method as default instead of using an option.
>   - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
>     just return error.
>   - Add prefix "huge_" to newly added function and static variables.
>   - Move the internal_config.memory assignment after the page allocations.
> v4:
>   - Change map_all_hugepages to return unsigned instead of int.
> v3:
>   - Reword commit message to include it fixes the hugetlbfs quota issue.
>   - setjmp -> sigsetjmp.
>   - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
>     so far.
>   - Fix the second map_all_hugepages's return value check.
> v2:
>   - Address the compiling error by move setjmp into a wrap method.
>
>   lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
>   lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
>   2 files changed, 125 insertions(+), 33 deletions(-)
>

Acked-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] eal: fix allocating all free hugepages
  2016-06-08 11:27   ` Sergio Gonzalez Monroy
@ 2016-06-30 13:34     ` Thomas Monjalon
  0 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-06-30 13:34 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: Sergio Gonzalez Monroy, dev, nhorman, david.marchand

> > EAL memory init allocates all free hugepages of the whole system,
> > which seen from sysfs, even when applications do not ask so many.
> > When there is a limitation on how many hugepages an application can
> > use (such as cgroup.hugetlb), or hugetlbfs is specified with an
> > option of size (exceeding the quota of the fs), it just fails to
> > start even there are enough hugepages allocated.
> >
> > To fix above issue, this patch:
> >   - Changes the logic to continue memory init to see if hugetlb
> >     requirement of application can be addressed by already allocated
> >     hugepages.
> >   - To make sure each hugepage is allocated successfully, we add a
> >     recover mechanism, which relies on a mem access to fault-in
> >     hugepages, and if it fails with SIGBUS, recover to previously
> >     saved stack environment with siglongjmp().
> >
> > For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by
> > default when compiling IVSHMEM target), it's indispensable to
> > map all free hugepages in the system. In this case, it fails
> > to start when allocation fails.
[...]
> > Signed-off-by: Jianfeng Tan<jianfeng.tan@intel.com>
> > Acked-by: Neil Horman<nhorman@tuxdriver.com>
> 
> Acked-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-05-19  2:25           ` Tan, Jianfeng
@ 2016-06-30 13:43             ` Thomas Monjalon
  2016-07-01  0:52               ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-06-30 13:43 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, David Marchand, Panu Matilainen

2016-05-19 10:25, Tan, Jianfeng:
> On 5/18/2016 8:46 PM, David Marchand wrote:
> > On Wed, Mar 9, 2016 at 2:05 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
> >> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>> This patch adds option, --avail-cores, to use lcores which are available
> >>>>> by calling pthread_getaffinity_np() to narrow down detected cores before
> >>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>
> >>>>> Test example:
> >>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>          --avail-cores -m 1024
> >>>>>
> >>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>
> >>>> Hmm, to me this sounds like something that should be done always so
> >>>> there's no need for an option. Or if there's a chance it might do the
> >>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>> disabler option instead?
> >>>
> >>> Thanks for comments.
> >>>
> >>> Yes, there's a use case that we cannot handle.
> >>>
> >>> If we make it as default, DPDK applications may fail to start, when user
> >>> specifies a core in isolcpus and its parent process (say bash) has a
> >>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>> it always has root privilege to change any cpu affinity.
> >>>
> >>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>> flagged as undetected (in my older implementation) and leads to failure.
> >>> To make it correct, we would always add "taskset mask" (or other ways)
> >>> before DPDK application cmd lines.
> >>>
> >>> How do you think?
> >>
> >> I still think it sounds like something that should be done by default and
> >> maybe be overridable with some flag, rather than the other way around.
> >> Another alternative might be detecting the cores always but if running as
> >> root, override but with a warning.
> >>
> >> But I dont know, just wondering. To look at it from another angle: why would
> >> somebody use this new --avail-cores option and in what situation, if things
> >> "just work" otherwise anyway?
> > +1 and I don't even see why we should have an option to disable this,
> > since taskset would do the job.
> >
> > Looking at your special case, if the user did set an isolcpus option
> > for another use, with no -c/-l, I understand the dpdk application
> > won't care too much about it.
> > So, this seems like somehow rude to the rest of the system and unwanted.
> 
> The case you mentioned above is not the case I mean. But you make your 
> point about this one.
> The case I originally mean: user sets an isolcpus option for DPDK 
> applications. Originally, DPDK apps would be started without any 
> problem. But for now, fail to start them because the required cores are 
> excluded before -c/-l. As per your comments following, we can add a 
> warning message (or should we quit on this situation?). But it indeed 
> has an effect on old users (they should changed to use "taskset 
> ./dpdk_app ..."). Do you think it's a problem?

There is no activity on this patch.
Jianfeng, do not hesitate to ping if needed.
Should we class this patch as "changes requested"?

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-06-30 13:43             ` Thomas Monjalon
@ 2016-07-01  0:52               ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-07-01  0:52 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, David Marchand, Panu Matilainen

Hi Thomas,

> > >
> > > Looking at your special case, if the user did set an isolcpus option
> > > for another use, with no -c/-l, I understand the dpdk application
> > > won't care too much about it.
> > > So, this seems like somehow rude to the rest of the system and
> unwanted.
> >
> > The case you mentioned above is not the case I mean. But you make your
> > point about this one.
> > The case I originally mean: user sets an isolcpus option for DPDK
> > applications. Originally, DPDK apps would be started without any
> > problem. But for now, fail to start them because the required cores are
> > excluded before -c/-l. As per your comments following, we can add a
> > warning message (or should we quit on this situation?). But it indeed
> > has an effect on old users (they should changed to use "taskset
> > ./dpdk_app ..."). Do you think it's a problem?
> 
> There is no activity on this patch.
> Jianfeng, do not hesitate to ping if needed.
> Should we class this patch as "changes requested"?

Yes, according to latest comments, it should be classified as "changes requested" (I've done that).

I'll resent a new version.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v2] eal: restrict cores detection
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (4 preceding siblings ...)
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
@ 2016-08-31  3:07 ` Jianfeng Tan
  2016-08-31 15:30   ` Stephen Hemminger
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  6 siblings, 1 reply; 63+ messages in thread
From: Jianfeng Tan @ 2016-08-31  3:07 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, pmatilai, thomas.monjalon, Jianfeng Tan

This patch uses pthread_getaffinity_np() to narrow down detected
cores before parsing coremask (-c), corelist (-l), and coremap
(--lcores).

The purpose of this patch is to leave out these core related options
when DPDK applications are deployed under container env, so that
users only specify core restriction as starting the instance.

Note: previously, some users are using isolated CPUs, which could
be excluded by default. Please add commands like taskset to use
those cores.

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v2:
  - Make it as default instead of adding the new options.
 lib/librte_eal/common/eal_common_lcore.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index 2cd4132..62e4f67 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,14 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t cs;
+	pthread_t tid = pthread_self();
+
+	/* Add below method to obtain core restrictions, like ulimit,
+	 * cgroup.cpuset, etc. Will not use those cores, which are rebuffed.
+	 */
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cs) < 0)
+		CPU_ZERO(&cs);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +78,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &cs)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: restrict cores detection
  2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
@ 2016-08-31 15:30   ` Stephen Hemminger
  2016-09-01  1:15     ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Stephen Hemminger @ 2016-08-31 15:30 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, thomas.monjalon

On Wed, 31 Aug 2016 03:07:10 +0000
Jianfeng Tan <jianfeng.tan@intel.com> wrote:

> This patch uses pthread_getaffinity_np() to narrow down detected
> cores before parsing coremask (-c), corelist (-l), and coremap
> (--lcores).
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users only specify core restriction as starting the instance.
> 
> Note: previously, some users are using isolated CPUs, which could
> be excluded by default. Please add commands like taskset to use
> those cores.
> 
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> ---
> v2:
>   - Make it as default instead of adding the new options.
>  lib/librte_eal/common/eal_common_lcore.c | 11 ++++++++++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> index 2cd4132..62e4f67 100644
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -57,6 +57,14 @@ rte_eal_cpu_init(void)
>  	struct rte_config *config = rte_eal_get_configuration();
>  	unsigned lcore_id;
>  	unsigned count = 0;
> +	rte_cpuset_t cs;
> +	pthread_t tid = pthread_self();
> +
> +	/* Add below method to obtain core restrictions, like ulimit,
> +	 * cgroup.cpuset, etc. Will not use those cores, which are rebuffed.
> +	 */
> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cs) < 0)
> +		CPU_ZERO(&cs);
>  

This patch makes sense but the comment is hard to read because of wording
and grammar.

If you choose variable names better then there really is no need for
a comment in many cases. Code is often easier to read/write than comments
for non-native English speakers.

Remove the comment and rename 'cs' as 'affinity_set' or something equally
as descriptive.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: restrict cores detection
  2016-08-31 15:30   ` Stephen Hemminger
@ 2016-09-01  1:15     ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-09-01  1:15 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, david.marchand, pmatilai, thomas.monjalon

Hi Stephen,

> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, August 31, 2016 11:31 PM
> To: Tan, Jianfeng
> Cc: dev@dpdk.org; david.marchand@6wind.com; pmatilai@redhat.com;
> thomas.monjalon@6wind.com
> Subject: Re: [dpdk-dev] [PATCH v2] eal: restrict cores detection
> 
> On Wed, 31 Aug 2016 03:07:10 +0000
> Jianfeng Tan <jianfeng.tan@intel.com> wrote:
> 
> > This patch uses pthread_getaffinity_np() to narrow down detected
> > cores before parsing coremask (-c), corelist (-l), and coremap
> > (--lcores).
> >
> > The purpose of this patch is to leave out these core related options
> > when DPDK applications are deployed under container env, so that
> > users only specify core restriction as starting the instance.
> >
> > Note: previously, some users are using isolated CPUs, which could
> > be excluded by default. Please add commands like taskset to use
> > those cores.
> >
> > Test example:
> > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> >
> > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > Acked-by: Neil Horman <nhorman@tuxdriver.com>
> > ---
> > v2:
> >   - Make it as default instead of adding the new options.
> >  lib/librte_eal/common/eal_common_lcore.c | 11 ++++++++++-
> >  1 file changed, 10 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_eal/common/eal_common_lcore.c
> b/lib/librte_eal/common/eal_common_lcore.c
> > index 2cd4132..62e4f67 100644
> > --- a/lib/librte_eal/common/eal_common_lcore.c
> > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > @@ -57,6 +57,14 @@ rte_eal_cpu_init(void)
> >  	struct rte_config *config = rte_eal_get_configuration();
> >  	unsigned lcore_id;
> >  	unsigned count = 0;
> > +	rte_cpuset_t cs;
> > +	pthread_t tid = pthread_self();
> > +
> > +	/* Add below method to obtain core restrictions, like ulimit,
> > +	 * cgroup.cpuset, etc. Will not use those cores, which are rebuffed.
> > +	 */
> > +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cs) < 0)
> > +		CPU_ZERO(&cs);
> >
> 
> This patch makes sense but the comment is hard to read because of wording
> and grammar.
> 
> If you choose variable names better then there really is no need for
> a comment in many cases. Code is often easier to read/write than comments
> for non-native English speakers.
> 
> Remove the comment and rename 'cs' as 'affinity_set' or something equally
> as descriptive.

Great suggestion. I'll resend one as you suggest.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v3] eal: restrict cores detection
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (5 preceding siblings ...)
  2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
@ 2016-09-01  1:31 ` Jianfeng Tan
  2016-09-02 16:53   ` Bruce Richardson
                     ` (2 more replies)
  6 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-09-01  1:31 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, pmatilai, thomas.monjalon, stephen, Jianfeng Tan

This patch uses pthread_getaffinity_np() to narrow down detected
cores before parsing coremask (-c), corelist (-l), and coremap
(--lcores).

The purpose of this patch is to leave out these core related options
when DPDK applications are deployed under container env, so that
users only specify core restriction as starting the instance.

Note: previously, some users are using isolated CPUs, which could
be excluded by default. Please add commands like taskset to use
those cores.

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v3:
  - Choose a more descriptive variable name, and remove comments
    as suggested by Stephen Hemminger.
v2:
  - Make it as default instead of adding the new options.
 lib/librte_eal/common/eal_common_lcore.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index 2cd4132..71c575c 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,12 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t affinity_set;
+	pthread_t tid = pthread_self();
+
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t),
+				   &affinity_set) < 0)
+		CPU_ZERO(&affinity_set);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +76,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &affinity_set)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: restrict cores detection
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
@ 2016-09-02 16:53   ` Bruce Richardson
  2016-09-16 14:04     ` Thomas Monjalon
  2016-09-16 14:02   ` Thomas Monjalon
  2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
  2 siblings, 1 reply; 63+ messages in thread
From: Bruce Richardson @ 2016-09-02 16:53 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, thomas.monjalon, stephen

On Thu, Sep 01, 2016 at 01:31:47AM +0000, Jianfeng Tan wrote:
> This patch uses pthread_getaffinity_np() to narrow down detected
> cores before parsing coremask (-c), corelist (-l), and coremap
> (--lcores).
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users only specify core restriction as starting the instance.
> 
> Note: previously, some users are using isolated CPUs, which could
> be excluded by default. Please add commands like taskset to use
> those cores.
> 
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> 

So, to be clear, does this patch mean that DPDK cannot use isolated cores
any more unless you explicitly run the app using taskset?
Is so, NAK, since isolating cores has been part of standard DPDK setup since
the first versions, and I don't believe that we should break that behaviour.

/Bruce

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: restrict cores detection
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  2016-09-02 16:53   ` Bruce Richardson
@ 2016-09-16 14:02   ` Thomas Monjalon
  2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
  2 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-09-16 14:02 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, stephen

2016-09-01 01:31, Jianfeng Tan:
> This patch uses pthread_getaffinity_np() to narrow down detected
> cores before parsing coremask (-c), corelist (-l), and coremap
> (--lcores).
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users only specify core restriction as starting the instance.
[...]
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -57,6 +57,12 @@ rte_eal_cpu_init(void)
>  	struct rte_config *config = rte_eal_get_configuration();
>  	unsigned lcore_id;
>  	unsigned count = 0;
> +	rte_cpuset_t affinity_set;
> +	pthread_t tid = pthread_self();
> +

A comment is needed here to explain which errors we are checking.

> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t),
> +				   &affinity_set) < 0)
> +		CPU_ZERO(&affinity_set);
>  
>  	/*
>  	 * Parse the maximum set of logical cores, detect the subset of running
> @@ -70,7 +76,8 @@ rte_eal_cpu_init(void)
>  
>  		/* in 1:1 mapping, record related cpu detected state */
>  		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
> -		if (lcore_config[lcore_id].detected == 0) {
> +		if (lcore_config[lcore_id].detected == 0 ||
> +		    !CPU_ISSET(lcore_id, &affinity_set)) {
>  			config->lcore_role[lcore_id] = ROLE_OFF;
>  			lcore_config[lcore_id].core_index = -1;
>  			continue;
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: restrict cores detection
  2016-09-02 16:53   ` Bruce Richardson
@ 2016-09-16 14:04     ` Thomas Monjalon
  0 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-09-16 14:04 UTC (permalink / raw)
  To: Bruce Richardson, Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, stephen

2016-09-02 17:53, Bruce Richardson:
> On Thu, Sep 01, 2016 at 01:31:47AM +0000, Jianfeng Tan wrote:

It would help the discussion to have a problem statement here.

> > This patch uses pthread_getaffinity_np() to narrow down detected
> > cores before parsing coremask (-c), corelist (-l), and coremap
> > (--lcores).
> > 
> > The purpose of this patch is to leave out these core related options
> > when DPDK applications are deployed under container env, so that
> > users only need to specify the core restriction when starting the instance.
> > 
> > Note: previously, some users are using isolated CPUs, which could
> > be excluded by default. Please add commands like taskset to use
> > those cores.
> > 
> > Test example:
> > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> > 
> 
> So, to be clear, does this patch mean that DPDK cannot use isolated cores
> any more unless you explicitly run the app using taskset?
> Is so, NAK, since isolating cores has been part of standard DPDK setup since
> the first versions, and I don't believe that we should break that behaviour.

So how could we help the container use-case?
Any suggestions?

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v4] eal: restrict cores auto detection
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  2016-09-02 16:53   ` Bruce Richardson
  2016-09-16 14:02   ` Thomas Monjalon
@ 2016-12-02 17:48   ` Jianfeng Tan
  2016-12-08 18:19     ` Thomas Monjalon
  2 siblings, 1 reply; 63+ messages in thread
From: Jianfeng Tan @ 2016-12-02 17:48 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, pmatilai, bruce.richardson, Jianfeng Tan

This patch uses pthread_getaffinity_np() to narrow down used
cores when none of below options is specified:
  * coremask (-c)
  * corelist (-l)
  * and coremap (--lcores)

The purpose of this patch is to leave out these core related options
when DPDK applications are deployed under container env, so that
users do not need to decide the core-related parameters when developing
applications. Instead, when applications are deployed in containers,
use cpu-set to constrain which cores can be used inside this container
instance. And DPDK application inside containers just rely on this
auto detect mechanism to start polling threads.

Note: previously, some users are using isolated CPUs, which could
be excluded by default. Please add commands like taskset to use
those cores.

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v4:
  - Address Bruce's comment: only enable this auto detection
    mechanism when none of core options is specified.
  - More detailed use case on how it helps in containers.
v3:
  - Choose a more descriptive variable name, and remove comments
    as suggested by Stephen Hemminger.
v2:
  - Make it as default instead of adding the new options.
---
 lib/librte_eal/common/eal_common_options.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 6ca8af1..d192de1 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -126,6 +126,7 @@ static const char dpdk_solib_path[] __attribute__((used)) =
 
 static int master_lcore_parsed;
 static int mem_parsed;
+static int core_specified;
 
 void
 eal_reset_internal_config(struct internal_config *internal_cfg)
@@ -797,6 +798,7 @@ eal_parse_common_option(int opt, const char *optarg,
 			RTE_LOG(ERR, EAL, "invalid coremask\n");
 			return -1;
 		}
+		core_specified = 1;
 		break;
 	/* corelist */
 	case 'l':
@@ -804,6 +806,7 @@ eal_parse_common_option(int opt, const char *optarg,
 			RTE_LOG(ERR, EAL, "invalid core list\n");
 			return -1;
 		}
+		core_specified = 1;
 		break;
 	/* size of memory */
 	case 'm':
@@ -912,6 +915,7 @@ eal_parse_common_option(int opt, const char *optarg,
 				OPT_LCORES "\n");
 			return -1;
 		}
+		core_specified = 1;
 		break;
 
 	/* don't know what to do, leave this to caller */
@@ -923,12 +927,38 @@ eal_parse_common_option(int opt, const char *optarg,
 	return 0;
 }
 
+static void
+eal_auto_detect_cores(struct rte_config *cfg)
+{
+	unsigned int lcore_id;
+	unsigned int removed = 0;
+	rte_cpuset_t affinity_set;
+	pthread_t tid = pthread_self();
+
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t),
+				&affinity_set) < 0)
+		CPU_ZERO(&affinity_set);
+
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		if (cfg->lcore_role[lcore_id] == ROLE_RTE &&
+		    !CPU_ISSET(lcore_id, &affinity_set)) {
+			cfg->lcore_role[lcore_id] = ROLE_OFF;
+			removed++;
+		}
+	}
+
+	cfg->lcore_count -= removed;
+}
+
 int
 eal_adjust_config(struct internal_config *internal_cfg)
 {
 	int i;
 	struct rte_config *cfg = rte_eal_get_configuration();
 
+	if (!core_specified)
+		eal_auto_detect_cores(cfg);
+
 	if (internal_config.process_type == RTE_PROC_AUTO)
 		internal_config.process_type = eal_proc_type_detect();
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: restrict cores auto detection
  2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
@ 2016-12-08 18:19     ` Thomas Monjalon
  2016-12-09 15:14       ` Bruce Richardson
  0 siblings, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-12-08 18:19 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, bruce.richardson

2016-12-02 17:48, Jianfeng Tan:
> This patch uses pthread_getaffinity_np() to narrow down used
> cores when none of below options is specified:
>   * coremask (-c)
>   * corelist (-l)
>   * and coremap (--lcores)
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users do not need to decide the core-related parameters when developing
> applications. Instead, when applications are deployed in containers,
> use cpu-set to constrain which cores can be used inside this container
> instance. And DPDK application inside containers just rely on this
> auto detect mechanism to start polling threads.
> 
> Note: previously, some users are using isolated CPUs, which could
> be excluded by default. Please add commands like taskset to use
> those cores.
> 
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Bruce, what do you think of this version?
It requires taskset only if -c, -l and --lcores are not used.

>  static int master_lcore_parsed;
>  static int mem_parsed;
> +static int core_specified;

I think it's better to keep the word "parsed" as others.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: restrict cores auto detection
  2016-12-08 18:19     ` Thomas Monjalon
@ 2016-12-09 15:14       ` Bruce Richardson
  2016-12-21 14:31         ` Thomas Monjalon
  0 siblings, 1 reply; 63+ messages in thread
From: Bruce Richardson @ 2016-12-09 15:14 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Jianfeng Tan, dev, david.marchand, pmatilai

On Thu, Dec 08, 2016 at 07:19:41PM +0100, Thomas Monjalon wrote:
> 2016-12-02 17:48, Jianfeng Tan:
> > This patch uses pthread_getaffinity_np() to narrow down used
> > cores when none of below options is specified:
> >   * coremask (-c)
> >   * corelist (-l)
> >   * and coremap (--lcores)
> > 
> > The purpose of this patch is to leave out these core related options
> > when DPDK applications are deployed under container env, so that
> > users do not need to decide the core-related parameters when developing
> > applications. Instead, when applications are deployed in containers,
> > use cpu-set to constrain which cores can be used inside this container
> > instance. And DPDK application inside containers just rely on this
> > auto detect mechanism to start polling threads.
> > 
> > Note: previously, some users are using isolated CPUs, which could
> > be excluded by default. Please add commands like taskset to use
> > those cores.
> > 
> > Test example:
> > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> 
> Bruce, what do you think of this version?
> It requires taskset only if -c, -l and --lcores are not used.
> 
I'm fine with that since it maintains backward compatibility for those
options.

Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: restrict cores auto detection
  2016-12-09 15:14       ` Bruce Richardson
@ 2016-12-21 14:31         ` Thomas Monjalon
  0 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-12-21 14:31 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: Bruce Richardson, dev, david.marchand, pmatilai

2016-12-09 15:14, Bruce Richardson:
> On Thu, Dec 08, 2016 at 07:19:41PM +0100, Thomas Monjalon wrote:
> > 2016-12-02 17:48, Jianfeng Tan:
> > > This patch uses pthread_getaffinity_np() to narrow down used
> > > cores when none of below options is specified:
> > >   * coremask (-c)
> > >   * corelist (-l)
> > >   * and coremap (--lcores)
> > > 
> > > The purpose of this patch is to leave out these core related options
> > > when DPDK applications are deployed under container env, so that
> > > users do not need to decide the core-related parameters when developing
> > > applications. Instead, when applications are deployed in containers,
> > > use cpu-set to constrain which cores can be used inside this container
> > > instance. And DPDK application inside containers just rely on this
> > > auto detect mechanism to start polling threads.
> > > 
> > > Note: previously, some users are using isolated CPUs, which could
> > > be excluded by default. Please add commands like taskset to use
> > > those cores.
> > > 
> > > Test example:
> > > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> > 
> > Bruce, what do you think of this version?
> > It requires taskset only if -c, -l and --lcores are not used.
> > 
> I'm fine with that since it maintains backward compatibility for those
> options.
> 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Applied with "s/specified/parsed/", thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2016-12-21 14:31 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
2016-01-25 13:46 ` Neil Horman
2016-01-26  2:22   ` Tan, Jianfeng
2016-01-26 14:19     ` Neil Horman
2016-01-27 12:02       ` Tan, Jianfeng
2016-01-27 17:30         ` Neil Horman
2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
2016-02-01 18:08   ` Neil Horman
2016-02-22  6:08   ` Tan, Jianfeng
2016-02-22 13:18     ` Neil Horman
2016-02-28 21:12   ` Thomas Monjalon
2016-02-29  1:50     ` Tan, Jianfeng
2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
2016-03-08  8:54   ` Panu Matilainen
2016-03-08 17:38     ` Tan, Jianfeng
2016-03-09 13:05       ` Panu Matilainen
2016-03-09 13:53         ` Tan, Jianfeng
2016-03-09 14:01           ` Ananyev, Konstantin
2016-03-09 14:17             ` Tan, Jianfeng
2016-03-09 14:44               ` Ananyev, Konstantin
2016-03-09 14:55                 ` Tan, Jianfeng
2016-03-09 15:17                   ` Ananyev, Konstantin
2016-03-09 17:45                     ` Tan, Jianfeng
2016-03-09 19:33                       ` Ananyev, Konstantin
2016-03-10  1:36                         ` Tan, Jianfeng
2016-05-18 12:46         ` David Marchand
2016-05-19  2:25           ` Tan, Jianfeng
2016-06-30 13:43             ` Thomas Monjalon
2016-07-01  0:52               ` Tan, Jianfeng
2016-04-26 12:39   ` Tan, Jianfeng
2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
2016-03-08  8:46     ` Tan, Jianfeng
2016-05-04 11:07     ` Sergio Gonzalez Monroy
2016-05-04 11:28       ` Tan, Jianfeng
2016-05-04 12:25     ` Sergio Gonzalez Monroy
2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
2016-05-10  8:54     ` Sergio Gonzalez Monroy
2016-05-10  9:11       ` Tan, Jianfeng
2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
2016-05-17 16:39     ` David Marchand
2016-05-18  7:56       ` Sergio Gonzalez Monroy
2016-05-18  9:34         ` David Marchand
2016-05-19  2:00       ` Tan, Jianfeng
2016-05-17 16:40     ` Thomas Monjalon
2016-05-18  8:06       ` Sergio Gonzalez Monroy
2016-05-18  9:38         ` David Marchand
2016-05-19  2:11         ` Tan, Jianfeng
2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
2016-06-06  2:49   ` Pei, Yulong
2016-06-08 11:27   ` Sergio Gonzalez Monroy
2016-06-30 13:34     ` Thomas Monjalon
2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
2016-08-31 15:30   ` Stephen Hemminger
2016-09-01  1:15     ` Tan, Jianfeng
2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
2016-09-02 16:53   ` Bruce Richardson
2016-09-16 14:04     ` Thomas Monjalon
2016-09-16 14:02   ` Thomas Monjalon
2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
2016-12-08 18:19     ` Thomas Monjalon
2016-12-09 15:14       ` Bruce Richardson
2016-12-21 14:31         ` Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.