All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] eal: add cgroup-aware resource self discovery
@ 2016-01-24 18:49 Jianfeng Tan
  2016-01-25 13:46 ` Neil Horman
                   ` (6 more replies)
  0 siblings, 7 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-01-24 18:49 UTC (permalink / raw)
  To: dev; +Cc: yuanhan.liu

Current issue: DPDK is not that friendly to container environment usage.
This is because it pre-allocates resources like cores and hugepages from
command-line options. So for a DPDK application, it is necessary to check
how much of a resource is allocated to a container and then use that as a
reference.

To address that, this patch introduces two APIs:
   a. rte_eal_res_self_discovery, to query how much resource can be used.
   b. rte_eal_res_self_discovery_apply, to apply self-discovered resource
      into DPDK.

Currently only Linux CGroup is added, similarly, we can add BSD jail as
well in the future. And even in Linux, there could be other way to query
and apply resources, like through a centralized daemon.

Known issue: current way to read individual attributes of cgroups directly
instead of via systemd's API is not a long-term solution. Please refer to
http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
for more information.

Test example:
    a. cgcreate -g cpuset,hugetlb:/test-subgroup
    b. cgset -r cpuset.cpus=2-3 test-subgroup
    c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
    d. cgexec -g cpuset,hugetlb:test-subgroup \
	    ./examples/l2fwd/build/l2fwd --self-discovery=cgroup -n 4 -- -p 3

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_options.c      |  39 ++++
 lib/librte_eal/common/eal_internal_cfg.h        |   1 +
 lib/librte_eal/common/eal_options.h             |   2 +
 lib/librte_eal/common/include/rte_eal.h         |  34 +++
 lib/librte_eal/linuxapp/eal/Makefile            |   1 +
 lib/librte_eal/linuxapp/eal/eal_cgroup.c        | 294 ++++++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |   5 +
 7 files changed, 376 insertions(+)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_cgroup.c

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..7235473 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_SELF_DISCOVERY,    1, NULL, OPT_SELF_DISCOVERY_NUM   },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -128,6 +129,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
 	internal_cfg->force_nchannel = 0;
 	internal_cfg->hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
 	internal_cfg->hugepage_dir = NULL;
+	internal_cfg->self_discovery = NULL;
 	internal_cfg->force_sockets = 0;
 	/* zero out the NUMA config */
 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
@@ -755,6 +757,24 @@ eal_parse_proc_type(const char *arg)
 }
 
 int
+__attribute__((weak))
+rte_eal_res_self_discovery(const char *type __rte_unused,
+			   char **p_corelist __rte_unused,
+			   uint64_t *p_memory __rte_unused)
+{
+	return -1;
+}
+
+int
+__attribute__((weak))
+rte_eal_res_self_discovery_apply(const char *type __rte_unused,
+				 int enable_core __rte_unused,
+				 int enable_mem __rte_unused)
+{
+	return -1;
+}
+
+int
 eal_parse_common_option(int opt, const char *optarg,
 			struct internal_config *conf)
 {
@@ -897,6 +917,25 @@ eal_parse_common_option(int opt, const char *optarg,
 		}
 		break;
 
+	case OPT_SELF_DISCOVERY_NUM: {
+		char *corelist;
+
+		if (rte_eal_res_self_discovery(optarg, &corelist, NULL) < 0) {
+			RTE_LOG(ERR, EAL, "invalid parameter for --"
+				OPT_SELF_DISCOVERY "\n");
+			return -1;
+		}
+
+		if (eal_parse_corelist(corelist) < 0) {
+			RTE_LOG(ERR, EAL, "invalid core list\n");
+			return -1;
+		}
+		/* Save it here for memory limit */
+		internal_config.self_discovery = strdup(optarg);
+
+		break;
+	}
+
 	/* don't know what to do, leave this to caller */
 	default:
 		return 1;
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..f3c8e31 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -83,6 +83,7 @@ struct internal_config {
 	volatile enum rte_intr_mode vfio_intr_mode;
 	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
 	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
+	const char *self_discovery;       /**< specific type of self_discovery */
 
 	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
 	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..a499d73 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_SELF_DISCOVERY    "self-discovery"
+	OPT_SELF_DISCOVERY_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index d2816a8..ff81484 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -220,6 +220,40 @@ int rte_eal_has_hugepages(void);
 int rte_sys_gettid(void);
 
 /**
+ * An API to query resource self discovery.
+ *
+ * @type
+ *   Type of self resource discovery.
+ * @p_corelist
+ *   On success, filled with the list of usable cores. The caller must
+ *   free the returned string.
+ * @p_memory
+ *   On success, filled with how much memory (in bytes) can be used.
+ *
+ * @return
+ *   - (-1), if failed.
+ *   - 0, if succeed.
+ */
+int rte_eal_res_self_discovery(const char *type,
+			       char **p_corelist, uint64_t *p_memory);
+/**
+ * An API to apply resource through self discovery.
+ *
+ * @type
+ *   Type of self resource discovery.
+ * @enable_core
+ *   If non-zero, apply the discovered core resource.
+ * @enable_mem
+ *   If non-zero, apply the discovered memory resource.
+ *
+ * @return
+ *   - (-1), if failed.
+ *   - 0, if succeed.
+ */
+int rte_eal_res_self_discovery_apply(const char *type,
+				     int enable_core, int enable_mem);
+
+
+/**
  * Get system unique thread id.
  *
  * @return
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..834ae2f 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -87,6 +87,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_devargs.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_dev.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_options.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_cgroup.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += rte_malloc.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_elem.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_heap.c
diff --git a/lib/librte_eal/linuxapp/eal/eal_cgroup.c b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
new file mode 100644
index 0000000..d6a04ee
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
@@ -0,0 +1,294 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <mntent.h>
+#include <inttypes.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_common.h>
+
+#include "eal_internal_cfg.h"
+
+static int pid;
+
+static char *
+find_controller_dfs(const char *dir_path)
+{
+	FILE *f;
+	char *line;
+	char *ret;
+	size_t len;
+	ssize_t read;
+	DIR *dir;
+	struct dirent *ent;
+	char filepath[PATH_MAX];
+
+	// 1. check if this process belongs to this cgroup
+	snprintf(filepath, sizeof(filepath)-1, "%s/tasks", dir_path);
+	f = fopen(filepath, "r");
+	if (f == NULL)
+		return NULL;
+	len = 0;
+	line = NULL;
+	while ((read = getline(&line, &len, f)) != -1) {
+		int _pid = atoi(line);
+		free(line);
+		if (_pid == pid)
+			break;
+		len = 0;
+		line = NULL;
+	}
+	fclose(f);
+	if (read != -1)
+		return strdup(dir_path);
+
+	// 2. check its child cgroup
+	if (!(dir = opendir(dir_path)))
+		return NULL;
+
+	ret = NULL;
+	while ((ent = readdir(dir)) != NULL) {
+		if (ent->d_type != DT_DIR)
+			continue;
+		if (strcmp(ent->d_name, ".") == 0 ||
+		    strcmp(ent->d_name, "..") == 0)
+			continue;
+
+		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
+			 dir_path, ent->d_name);
+
+		ret = find_controller_dfs(filepath);
+		if (ret != NULL)
+			break;
+	}
+
+	closedir(dir);
+	return ret;
+}
+
/*
 * Locate the cgroup directory of the given controller (e.g. "cpuset"
 * or "hugetlb") that the current process belongs to.
 *
 * The controller's mount point is looked up in /proc/mounts (a mount
 * of type "cgroup" whose mount options name the controller), then the
 * hierarchy below that mount point is searched for the cgroup whose
 * tasks file lists our pid.
 *
 * Returns a malloc'd directory path (caller must free), or NULL if the
 * controller is not mounted or the process cannot be located.
 */
static char *
find_controller(const char *controller)
{
	FILE *f;
	char *path;
	struct mntent *ent;

	static const char *proc_mounts = "/proc/mounts";
	static const char *fs_type = "cgroup";

	f = setmntent(proc_mounts, "r");
	if (f == NULL) {
		RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_mounts);
		return NULL;
	}

	/* stop at the first cgroup mount whose options include the
	 * requested controller name (hasmntopt matches option words)
	 */
	while (NULL != (ent = getmntent(f))) {
		if (strcmp(ent->mnt_type, fs_type) != 0)
			continue;
		if (hasmntopt(ent, controller) == NULL)
			continue;
		break;
	}

	/* ent == NULL means we ran off the end without a match */
	if (ent == NULL) {
		path = NULL;
		goto end;
	}

	/* walk the mounted hierarchy looking for our own cgroup */
	path = find_controller_dfs(ent->mnt_dir);
end:
	endmntent(f);
	return path;
}
+
/*
 * Read the first line of the file at "path" into a freshly allocated
 * buffer, with any trailing newline stripped.
 *
 * Returns a malloc'd string (caller must free), or NULL if the file
 * cannot be opened or contains no line.
 */
static inline char *
get_oneline_from_file(const char *path)
{
	FILE *f;
	char *line = NULL;
	size_t len = 0;

	if (NULL == (f = fopen(path, "r")))
		return NULL;
	if (getline(&line, &len, f) == -1) {
		/* fix: the original dereferenced line (NULL) here;
		 * getline() may also allocate a buffer even on failure
		 */
		free(line);
		line = NULL;
	} else {
		/* strip the trailing newline, if any */
		line[strcspn(line, "\n")] = 0;
	}
	fclose(f);
	return line;
}
+
+static int
+cgroup_cpuset(char **p_corelist, int enable __rte_unused)
+{
+	char filepath[PATH_MAX];
+	char *controller;
+
+       	controller = find_controller("cpuset");
+	if (controller == NULL)
+		return -1;
+
+	snprintf(filepath, sizeof(filepath)-1, "%s/cpuset.cpus", controller);
+	*p_corelist = get_oneline_from_file(filepath);
+	RTE_LOG(INFO, EAL, "cgroup cpuset: %s\n", *p_corelist);
+	return 0;
+
+}
+
/*
 * Parse the numeric limit stored in a hugetlb limit_in_bytes file.
 *
 * Returns the limit in bytes, or 0 if the file cannot be read or does
 * not start with an unsigned integer.
 */
static inline uint64_t
get_hugetlb_limit(const char *path)
{
	uint64_t limit = 0;
	char *str;

	str = get_oneline_from_file(path);
	/* fix: str may be NULL (unreadable file); the original passed
	 * it straight to sscanf and left limit uninitialized on a
	 * failed parse
	 */
	if (str != NULL) {
		if (sscanf(str, "%"PRIu64, &limit) != 1)
			limit = 0;
		free(str);
	}
	return limit;
}
+
+static int
+cgroup_hugetlb(uint64_t *p_memory, int enable)
+{
+	unsigned i;
+	char filepath[PATH_MAX];
+	char *controller;
+	DIR *dir;
+	struct dirent *ent;
+	uint64_t memory = 0;
+	static char prefix[] = "hugetlb";
+	static int prefix_len = sizeof(prefix) - 1;
+	static char suffix[] = "limit_in_bytes";
+
+       	controller = find_controller("hugetlb");
+	if (controller == NULL)
+		return -1;
+
+	if (!(dir = opendir(controller)))
+		return -1;
+
+	while ((ent = readdir(dir)) != NULL) {
+		if (strncmp(ent->d_name, prefix, prefix_len) != 0)
+			continue;
+
+		char *sz_beg = ent->d_name + prefix_len + 1;
+		char *sz_end = strchr(sz_beg, '.');
+
+		if (strcmp(sz_end + 1, suffix) != 0)
+			continue;
+
+		char *tmp = strndup(sz_beg, sz_end - sz_beg);
+		uint64_t pagesize = rte_str_to_size(tmp);
+		free(tmp);
+
+		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
+			 controller, ent->d_name);
+		uint64_t m_limit = get_hugetlb_limit(filepath);
+		memory += m_limit;
+
+		/* Record those information into internal_config if hugepages
+		 * are already initialized.
+		 */
+		if (! enable)
+			continue;
+		for (i = 0; i < internal_config.num_hugepage_sizes; ++i) {
+			struct hugepage_info *hp;
+
+		       	hp = &internal_config.hugepage_info[i];
+			if (hp->hugepage_sz != pagesize)
+				continue;
+
+			if (m_limit < hp->hugepage_sz * hp->num_pages[0])
+				hp->num_pages[0] = m_limit / hp->hugepage_sz;
+		}
+	}
+
+	closedir(dir);
+	*p_memory = memory;
+	RTE_LOG(INFO, EAL, "cgroup hugetlb: %"PRIx64"\n", *p_memory);
+	return 0;
+}
+
+static int
+resource_self_discovery(const char *type, char **p_corelist, int enable_core,
+			uint64_t *p_memory, int enable_mem)
+{
+	if (strcmp(type, "cgroup") != 0) {
+		RTE_LOG(ERR, EAL, "type not supported: %s\n", type);
+		return -1;
+	}
+
+	pid = getpid();
+
+	if (p_corelist != NULL && cgroup_cpuset(p_corelist, enable_core) < 0) {
+		RTE_LOG(ERR, EAL, "Failed when discover resource cpuset\n");
+		return -1;
+	}
+	if (p_memory != NULL && cgroup_hugetlb(p_memory, enable_mem) < 0) {
+		RTE_LOG(ERR, EAL, "Failed when discover resource hugetlb\n");
+		return -1;
+	}
+
+	return 0;
+}
+
/*
 * Public query entry point: discover (but do not apply) the core list
 * and/or memory limit available to this process. Both enable flags are
 * passed as 0, so the helpers only report values and write nothing back
 * into internal_config. See rte_eal.h for the parameter contract.
 */
int
rte_eal_res_self_discovery(const char *type, char **p_corelist,
			   uint64_t *p_memory)
{
	return resource_self_discovery(type, p_corelist, 0, p_memory, 0);
}
+
/*
 * Public apply entry point: discover resources and let the helpers
 * write the results into DPDK's internal configuration. Only resources
 * whose enable flag is non-zero are queried; the discovered values
 * themselves are discarded here.
 */
int
rte_eal_res_self_discovery_apply(const char *type, int enable_core,
				 int enable_mem)
{
	char *corelist;
	uint64_t mem;

	return resource_self_discovery(type,
				       enable_core ? &corelist : NULL,
				       enable_core,
				       enable_mem ? &mem : NULL,
				       enable_mem);
}
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 18858e2..a6b6548 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -354,6 +354,11 @@ eal_hugepage_info_init(void)
 	qsort(&internal_config.hugepage_info[0], num_sizes,
 	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
 
+	/* Apply cgroup hugetlb limit before we really use hugepages */
+	if (internal_config.self_discovery)
+		rte_eal_res_self_discovery_apply(internal_config.self_discovery,
+						 0, 1);
+
 	/* now we have all info, check we have at least one valid size */
 	for (i = 0; i < num_sizes; i++)
 		if (internal_config.hugepage_info[i].hugedir != NULL &&
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
@ 2016-01-25 13:46 ` Neil Horman
  2016-01-26  2:22   ` Tan, Jianfeng
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 63+ messages in thread
From: Neil Horman @ 2016-01-25 13:46 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, yuanhan.liu

On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> Current issue: DPDK is not that friendly to container environment usage.
> It's because that it pre-alloc resource like cores and hugepages from cmd
> line options. So for a DPDK application, it's necessary to check how much
> resource is allocated to a container and then use that as an reference.
> 
> To address that, this patch introduces two APIs:
>    a. rte_eal_res_self_discovery, to query how much resource can be used.
>    b. rte_eal_res_self_discovery_apply, to apply self-discovered resource
>       into DPDK.
> 
> Currently only Linux CGroup is added, similarly, we can add BSD jail as
> well in the future. And even in Linux, there could be other way to query
> and apply resources, like through a centralized daemon.
> 
> Known issue: current way to read individual attributes of cgroups directly
> instead of via systemd's API is not a long-term solution. Please refer to
> http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
> for more information.
> 
> Test example:
>     a. cgcreate -g cpuset,hugetlb:/test-subgroup
>     b. cgset -r cpuset.cpus=2-3 test-subgroup
>     c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>     d. cgexec -g cpuset,hugetlb:test-subgroup \
> 	    ./examples/l2fwd/build/l2fwd --self-discovery=cgroup -n 4 -- -p 3
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_options.c      |  39 ++++
>  lib/librte_eal/common/eal_internal_cfg.h        |   1 +
>  lib/librte_eal/common/eal_options.h             |   2 +
>  lib/librte_eal/common/include/rte_eal.h         |  34 +++
>  lib/librte_eal/linuxapp/eal/Makefile            |   1 +
>  lib/librte_eal/linuxapp/eal/eal_cgroup.c        | 294 ++++++++++++++++++++++++
>  lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |   5 +
>  7 files changed, 376 insertions(+)
>  create mode 100644 lib/librte_eal/linuxapp/eal/eal_cgroup.c
> 
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..7235473 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>  	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>  	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>  	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_SELF_DISCOVERY,    1, NULL, OPT_SELF_DISCOVERY_NUM   },
>  	{0,                     0, NULL, 0                        }
>  };
>  
> @@ -128,6 +129,7 @@ eal_reset_internal_config(struct internal_config *internal_cfg)
>  	internal_cfg->force_nchannel = 0;
>  	internal_cfg->hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
>  	internal_cfg->hugepage_dir = NULL;
> +	internal_cfg->self_discovery = NULL;
>  	internal_cfg->force_sockets = 0;
>  	/* zero out the NUMA config */
>  	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> @@ -755,6 +757,24 @@ eal_parse_proc_type(const char *arg)
>  }
>  
>  int
> +__attribute__((weak))
> +rte_eal_res_self_discovery(const char *type __rte_unused,
> +			   char **p_corelist __rte_unused,
> +			   uint64_t *p_memory __rte_unused)
> +{
> +	return -1;
> +}
> +
> +int
> +__attribute__((weak))
> +rte_eal_res_self_discovery_apply(const char *type __rte_unused,
> +				 int enable_core __rte_unused,
> +				 int enable_mem __rte_unused)
> +{
> +	return -1;
> +}
> +
> +int
>  eal_parse_common_option(int opt, const char *optarg,
>  			struct internal_config *conf)
>  {
> @@ -897,6 +917,25 @@ eal_parse_common_option(int opt, const char *optarg,
>  		}
>  		break;
>  
> +	case OPT_SELF_DISCOVERY_NUM: {
> +		char *corelist;
> +
> +		if (rte_eal_res_self_discovery(optarg, &corelist, NULL) < 0) {
> +			RTE_LOG(ERR, EAL, "invalid parameter for --"
> +				OPT_SELF_DISCOVERY "\n");
> +			return -1;
> +		}
> +
> +		if (eal_parse_corelist(corelist) < 0) {
> +			RTE_LOG(ERR, EAL, "invalid core list\n");
> +			return -1;
> +		}
> +		/* Save it here for memory limit */
> +		internal_config.self_discovery = strdup(optarg);
> +
> +		break;
> +	}
> +
>  	/* don't know what to do, leave this to caller */
>  	default:
>  		return 1;
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..f3c8e31 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -83,6 +83,7 @@ struct internal_config {
>  	volatile enum rte_intr_mode vfio_intr_mode;
>  	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
>  	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
> +	const char *self_discovery;       /**< specific type of self_discovery */
>  
>  	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
>  	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..a499d73 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>  	OPT_VMWARE_TSC_MAP_NUM,
>  #define OPT_XEN_DOM0          "xen-dom0"
>  	OPT_XEN_DOM0_NUM,
> +#define OPT_SELF_DISCOVERY    "self-discovery"
> +	OPT_SELF_DISCOVERY_NUM,
>  	OPT_LONG_MAX_NUM
>  };
>  
> diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
> index d2816a8..ff81484 100644
> --- a/lib/librte_eal/common/include/rte_eal.h
> +++ b/lib/librte_eal/common/include/rte_eal.h
> @@ -220,6 +220,40 @@ int rte_eal_has_hugepages(void);
>  int rte_sys_gettid(void);
>  
>  /**
> + * An API to query resource self discovery.
> + *
> + * @type
> + *   Type of self resource discovery.
> + * @p_corelist
> + *   If succeed, fill core list which can be used. Caller to free.
> + * @p_memory
> + *   If succeed, fill how many (bytes) memory can be used.
> + *
> + * @return
> + *   - (-1), if failed.
> + *   - 0, if succeed.
> + */
> +int rte_eal_res_self_discovery(const char *type,
> +			       char **p_corelist, uint64_t *p_memory);
> +/**
> + * An API to apply resource through self discovery.
> + *
> + * @type
> + *   Type of self resource discovery.
> + * @enable_core
> + *   If succeed, apply core resource.
> + * @p_memory
> + *   If succeed, apply memory resource.
> + *
> + * @return
> + *   - (-1), if failed.
> + *   - 0, if succeed.
> + */
> +int rte_eal_res_self_discovery_apply(const char *type,
> +				     int enable_core, int enable_mem);
> +
> +
> +/**
>   * Get system unique thread id.
>   *
>   * @return
> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
> index 26eced5..834ae2f 100644
> --- a/lib/librte_eal/linuxapp/eal/Makefile
> +++ b/lib/librte_eal/linuxapp/eal/Makefile
> @@ -87,6 +87,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_devargs.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_dev.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_options.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_thread.c
> +SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_cgroup.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += rte_malloc.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_elem.c
>  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_heap.c
> diff --git a/lib/librte_eal/linuxapp/eal/eal_cgroup.c b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
> new file mode 100644
> index 0000000..d6a04ee
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
> @@ -0,0 +1,294 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <stdlib.h>
> +#include <sys/stat.h>
> +#include <sys/file.h>
> +#include <sys/types.h>
> +#include <dirent.h>
> +#include <mntent.h>
> +#include <inttypes.h>
> +
> +#include <rte_log.h>
> +#include <rte_eal.h>
> +#include <rte_common.h>
> +
> +#include "eal_internal_cfg.h"
> +
> +static int pid;
> +
> +static char *
> +find_controller_dfs(const char *dir_path)
> +{
> +	FILE *f;
> +	char *line;
> +	char *ret;
> +	size_t len;
> +	ssize_t read;
> +	DIR *dir;
> +	struct dirent *ent;
> +	char filepath[PATH_MAX];
> +
> +	// 1. check if this process belongs to this cgroup
> +	snprintf(filepath, sizeof(filepath)-1, "%s/tasks", dir_path);
> +	f = fopen(filepath, "r");
> +	if (f == NULL)
> +		return NULL;
> +	len = 0;
> +	line = NULL;
> +	while ((read = getline(&line, &len, f)) != -1) {
> +		int _pid = atoi(line);
> +		free(line);
> +		if (_pid == pid)
> +			break;
> +		len = 0;
> +		line = NULL;
> +	}
> +	fclose(f);
> +	if (read != -1)
> +		return strdup(dir_path);
> +
> +	// 2. check its child cgroup
> +	if (!(dir = opendir(dir_path)))
> +		return NULL;
> +
> +	ret = NULL;
> +	while ((ent = readdir(dir)) != NULL) {
> +		if (ent->d_type != DT_DIR)
> +			continue;
> +		if (strcmp(ent->d_name, ".") == 0 ||
> +		    strcmp(ent->d_name, "..") == 0)
> +			continue;
> +
> +		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
> +			 dir_path, ent->d_name);
> +
> +		ret = find_controller_dfs(filepath);
> +		if (ret != NULL)
> +			break;
> +	}
> +
> +	closedir(dir);
> +	return ret;
> +}
> +
> +static char *
> +find_controller(const char *controller)
> +{
> +	FILE *f;
> +	char *path;
> +	struct mntent *ent;
> +
> +	static const char *proc_mounts = "/proc/mounts";
> +	static const char *fs_type = "cgroup";
> +
> +	f = setmntent(proc_mounts, "r");
> +	if (f == NULL) {
> +		RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_mounts);
> +		return NULL;
> +	}
> +
> +	while (NULL != (ent = getmntent(f))) {
> +		if (strcmp(ent->mnt_type, fs_type) != 0)
> +			continue;
> +		if (hasmntopt(ent, controller) == NULL)
> +			continue;
> +		break;
> +	}
> +
> +	if (ent == NULL) {
> +		path = NULL;
> +		goto end;
> +	}
> +
> +	path = find_controller_dfs(ent->mnt_dir);
> +end:
> +	endmntent(f);
> +	return path;
> +}
> +
> +static inline char *
> +get_oneline_from_file(const char *path)
> +{
> +	FILE *f;
> +	char *line = NULL;
> +	size_t len = 0;
> +
> +	if (NULL == (f = fopen(path, "r")))
> +		return NULL;
> +	if (getline(&line, &len, f) == -1)
> +		line = NULL;
> +	line[strcspn(line, "\n")] = 0;
> +	fclose(f);
> +	return line;
> +}
> +
> +static int
> +cgroup_cpuset(char **p_corelist, int enable __rte_unused)
> +{
> +	char filepath[PATH_MAX];
> +	char *controller;
> +
> +       	controller = find_controller("cpuset");
> +	if (controller == NULL)
> +		return -1;
> +
> +	snprintf(filepath, sizeof(filepath)-1, "%s/cpuset.cpus", controller);
> +	*p_corelist = get_oneline_from_file(filepath);
> +	RTE_LOG(INFO, EAL, "cgroup cpuset: %s\n", *p_corelist);
> +	return 0;
> +
> +}
> +
> +static inline uint64_t
> +get_hugetlb_limit(const char *path)
> +{
> +	uint64_t limit;
> +	char *str;
> +
> +       	str = get_oneline_from_file(path);
> +	sscanf(str, "%"PRIu64, &limit);
> +	free(str);
> +	return limit;
> +}
> +
> +static int
> +cgroup_hugetlb(uint64_t *p_memory, int enable)
> +{
> +	unsigned i;
> +	char filepath[PATH_MAX];
> +	char *controller;
> +	DIR *dir;
> +	struct dirent *ent;
> +	uint64_t memory = 0;
> +	static char prefix[] = "hugetlb";
> +	static int prefix_len = sizeof(prefix) - 1;
> +	static char suffix[] = "limit_in_bytes";
> +
> +       	controller = find_controller("hugetlb");
> +	if (controller == NULL)
> +		return -1;
> +
> +	if (!(dir = opendir(controller)))
> +		return -1;
> +
> +	while ((ent = readdir(dir)) != NULL) {
> +		if (strncmp(ent->d_name, prefix, prefix_len) != 0)
> +			continue;
> +
> +		char *sz_beg = ent->d_name + prefix_len + 1;
> +		char *sz_end = strchr(sz_beg, '.');
> +
> +		if (strcmp(sz_end + 1, suffix) != 0)
> +			continue;
> +
> +		char *tmp = strndup(sz_beg, sz_end - sz_beg);
> +		uint64_t pagesize = rte_str_to_size(tmp);
> +		free(tmp);
> +
> +		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
> +			 controller, ent->d_name);
> +		uint64_t m_limit = get_hugetlb_limit(filepath);
> +		memory += m_limit;
> +
> +		/* Record those information into internal_config if hugepages
> +		 * are already initialized.
> +		 */
> +		if (! enable)
> +			continue;
> +		for (i = 0; i < internal_config.num_hugepage_sizes; ++i) {
> +			struct hugepage_info *hp;
> +
> +		       	hp = &internal_config.hugepage_info[i];
> +			if (hp->hugepage_sz != pagesize)
> +				continue;
> +
> +			if (m_limit < hp->hugepage_sz * hp->num_pages[0])
> +				hp->num_pages[0] = m_limit / hp->hugepage_sz;
> +		}
> +	}
> +
> +	closedir(dir);
> +	*p_memory = memory;
> +	RTE_LOG(INFO, EAL, "cgroup hugetlb: %"PRIx64"\n", *p_memory);
> +	return 0;
> +}
> +
> +static int
> +resource_self_discovery(const char *type, char **p_corelist, int enable_core,
> +			uint64_t *p_memory, int enable_mem)
> +{
> +	if (strcmp(type, "cgroup") != 0) {
> +		RTE_LOG(ERR, EAL, "type not supported: %s\n", type);
> +		return -1;
> +	}
> +
> +	pid = getpid();
> +
> +	if (p_corelist != NULL && cgroup_cpuset(p_corelist, enable_core) < 0) {
> +		RTE_LOG(ERR, EAL, "Failed when discover resource cpuset\n");
> +		return -1;
> +	}
> +	if (p_memory != NULL && cgroup_hugetlb(p_memory, enable_mem) < 0) {
> +		RTE_LOG(ERR, EAL, "Failed when discover resource hugetlb\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +int
> +rte_eal_res_self_discovery(const char *type, char **p_corelist,
> +			   uint64_t *p_memory)
> +{
> +	return resource_self_discovery(type, p_corelist, 0, p_memory, 0);
> +}
> +
> +int
> +rte_eal_res_self_discovery_apply(const char *type, int enable_core,
> +				 int enable_mem)
> +{
> +	char *corelist, **pc = NULL;
> +	uint64_t mem, *pm = NULL;
> +	
> +	if (enable_core)
> +		pc = &corelist;
> +	if (enable_mem)
> +		pm = &mem;
> +
> +	return resource_self_discovery(type, pc, enable_core,
> +				       pm, enable_mem);
> +}
> diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> index 18858e2..a6b6548 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> @@ -354,6 +354,11 @@ eal_hugepage_info_init(void)
>  	qsort(&internal_config.hugepage_info[0], num_sizes,
>  	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
>  
> +	/* Apply cgroup hugetlb limit before we really use hugepages */
> +	if (internal_config.self_discovery)
> +		rte_eal_res_self_discovery_apply(internal_config.self_discovery,
> +						 0, 1);
> +
>  	/* now we have all info, check we have at least one valid size */
>  	for (i = 0; i < num_sizes; i++)
>  		if (internal_config.hugepage_info[i].hugedir != NULL &&
> -- 
> 2.1.4
> 
> 


This doesn't make a whole lot of sense, for several reasons:

1) Applications, as a general rule shouldn't be interrogating the cgroups
interface at all.  

2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
(the isolcpus command line argument, or a taskset on a parent process for
instance, but there are several others).

Instead of trying to figure out what cpuset is valid for your process by
interrogating the cgroups hierarchy, you should follow the prescribed
method of calling sched_getaffinity after calling sched_setaffinity.  That will
give you the canonical cpuset that you are executing on, taking all cpuset
filters into account (including cgroups and any other restrictions).  It's far
simpler as well, as it doesn't require a ton of file/string processing.

Neil

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-25 13:46 ` Neil Horman
@ 2016-01-26  2:22   ` Tan, Jianfeng
  2016-01-26 14:19     ` Neil Horman
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-01-26  2:22 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev, yuanhan.liu


Hi Neil,

On 1/25/2016 9:46 PM, Neil Horman wrote:
> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
...
>> -- 
>> 2.1.4
>>
>>
>
> This doesn't make a whole lot of sense, for several reasons:
>
> 1) Applications, as a general rule shouldn't be interrogating the cgroups
> interface at all.

The main reason to do this in DPDK is that DPDK obtains resource 
information from sysfs and proc, which are not well containerized so 
far. And DPDK pre-allocates resources instead of allocating them 
gradually on demand.

>
> 2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> (the isolcpus command line argument, or a taskset on a parent process for
> instance, but there are several others).

Yes, I agree. To enable that, I'd like to design the new API for resource 
self discovery in a flexible way. A parameter "type" is used to specify 
the discovery method. In addition, I'm considering adding a 
callback function pointer so that users can write their own resource 
discovery functions.

>
> Instead of trying to figure out what cpuset is valid for your process by
> interrogating the cgroups heirarchy, instead you should follow the proscribed
> method of calling sched_getaffinity after calling sched_setaffinity.  That will
> give you the canonical cpuset that you are executing on, taking all cpuset
> filters into account (including cgroups and any other restrictions).  Its far
> simpler as well, as it doesn't require a ton of file/string processing.

Yes, this way is much better for cpuset discovery. But is there such a 
syscall for hugepages?

Thanks,
Jianfeng

>
> Neil
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-26  2:22   ` Tan, Jianfeng
@ 2016-01-26 14:19     ` Neil Horman
  2016-01-27 12:02       ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Neil Horman @ 2016-01-26 14:19 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, yuanhan.liu

On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> 
> Hi Neil,
> 
> On 1/25/2016 9:46 PM, Neil Horman wrote:
> >On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> ...
> >>-- 
> >>2.1.4
> >>
> >>
> >
> >This doesn't make a whole lot of sense, for several reasons:
> >
> >1) Applications, as a general rule shouldn't be interrogating the cgroups
> >interface at all.
> 
> The main reason to do this in DPDK is that DPDK obtains resource information
> from sysfs and proc, which are not well containerized so far. And DPDK
> pre-allocates resource instead of on-demand gradual allocating.
> 
Not disagreeing with this, just suggesting that:

1) Interrogating cgroups really isn't the best way to collect that information
2) Pre-allocating those resources isn't particularly wise without some mechanism
to reallocate it, as resource constraints can change (consider your cpuset
getting rewritten)

> >
> >2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> >(the isolcpus command line argument, or a taskset on a parent process for
> >instance, but there are several others).
> 
> Yes, I agree. To enable that, I'd like design the new API for resource self
> discovery in a flexible way. A parameter "type" is used to specify the
> solution to discovery way. In addition, I'm considering to add a callback
> function pointer so that users can write their own resource discovery
> functions.
> 
Why?  You don't need an API for this, or if you really want one, it can be very
generic if you use POSIX apis to gather the information.  What you have here is
going to be very linux specific, and will need reimplementing for BSD or other
operating systems.  To use the cpuset example, instead of reading and parsing
the mask files in the cgroup filesystem module to find your task and
corresponding mask, just call sched_setaffinity with an all f's mask, then call
sched_getaffinity.  The returned mask will be all the cpus your process is
allowed to execute on, taking into account every limiting filter the system you
are running on offers.

There are similar OS-level POSIX APIs for most resources out there.  You really
don't need to dig through cgroups just to learn what some of those resources are.

> >
> >Instead of trying to figure out what cpuset is valid for your process by
> >interrogating the cgroups heirarchy, instead you should follow the proscribed
> >method of calling sched_getaffinity after calling sched_setaffinity.  That will
> >give you the canonical cpuset that you are executing on, taking all cpuset
> >filters into account (including cgroups and any other restrictions).  Its far
> >simpler as well, as it doesn't require a ton of file/string processing.
> 
> Yes, this way is much better for cpuset discovery. But is there such a
> syscall for hugepages?
> 
In what capacity?  Interrogating how many hugepages you have, or what node
they are affined to?  Capacity would require reading the requisite proc file, as
there's no POSIX API for this resource.  Node affinity can be implied by setting
the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
kernel will attempt to distribute hugepages evenly among the tasks' numa policy
configuration.

That said, I would advise that you strongly consider not exporting hugepages as
a resource, as:

a) Applications generally don't need to know that they are using hugepages, and
so they dont need to know where said hugepages live, they just allocate memory
via your allocation api and you give them something appropriate

b) Hugepages are a resource that is very specific to Linux, and to x86 Linux at
that.  Some OSes implement similar resources, but they may have very different
semantics.  And other arches may or may not implement various forms of compound
paging at all.  As the DPDK expands to support more OSes and arches, it would
be nice to ensure that the programming surfaces that you expose have a
broader level of support.

Neil

> Thanks,
> Jianfeng
> 
> >
> >Neil
> >
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-26 14:19     ` Neil Horman
@ 2016-01-27 12:02       ` Tan, Jianfeng
  2016-01-27 17:30         ` Neil Horman
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-01-27 12:02 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev, yuanhan.liu

Hi Neil,

On 1/26/2016 10:19 PM, Neil Horman wrote:
> On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
>> Hi Neil,
>>
>> On 1/25/2016 9:46 PM, Neil Horman wrote:
>>> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
>> ...
>>>> -- 
>>>> 2.1.4
>>>>
>>>>
>>> This doesn't make a whole lot of sense, for several reasons:
>>>
>>> 1) Applications, as a general rule shouldn't be interrogating the cgroups
>>> interface at all.
>> The main reason to do this in DPDK is that DPDK obtains resource information
>> from sysfs and proc, which are not well containerized so far. And DPDK
>> pre-allocates resource instead of on-demand gradual allocating.
>>
> Not disagreeing with this, just suggesting that:
>
> 1) Interrogating cgroups really isn't the best way to collect that information
> 2) Pre-allocating those resources isn't particularly wise without some mechanism
> to reallocate it, as resource constraints can change (consider your cpuset
> getting rewritten)

In the case of reallocation,
For cpuset, DPDK panics in the initialization if set_affinity fails, but 
after that, a cpuset rewrite will not bring any problems, I believe.
For memory, if a running application uses 2G hugepages and the admin then 
decreases the hugetlb cgroup to 1G, the application will not get killed, 
unless it tries to access more hugepages (I'll double check this).

So another way to address this problem is to add an option so that DPDK 
tries its best to allocate those resources, and if that fails, it just posts 
a warning and uses the resources it did allocate, instead of panicking. What 
do you think?

>
>>> 2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
>>> (the isolcpus command line argument, or a taskset on a parent process for
>>> instance, but there are several others).
>> Yes, I agree. To enable that, I'd like design the new API for resource self
>> discovery in a flexible way. A parameter "type" is used to specify the
>> solution to discovery way. In addition, I'm considering to add a callback
>> function pointer so that users can write their own resource discovery
>> functions.
>>
> Why?  You don't need an API for this, or if you really want one, it can be very
> generic if you use POSIX apis to gather the information.  What you have here is
> going to be very linux specific, and will need reimplementing for BSD or other
> operating systems.  To use the cpuset example, instead of reading and parsing
> the mask files in the cgroup filesystem module to find your task and
> corresponding mask, just call sched_setaffinity with an all f's mask, then call
> sched_getaffinity.  The returned mask will be all the cpus your process is
> allowed to execute on, taking into account every limiting filter the system you
> are running on offers.

Yes, it makes sense on cpu's side.

>
> There are simmilar OS level POSIX apis for most resources out there.  You really
> don't need to dig through cgroups just to learn what some of those reources are.
>
>>> Instead of trying to figure out what cpuset is valid for your process by
>>> interrogating the cgroups heirarchy, instead you should follow the proscribed
>>> method of calling sched_getaffinity after calling sched_setaffinity.  That will
>>> give you the canonical cpuset that you are executing on, taking all cpuset
>>> filters into account (including cgroups and any other restrictions).  Its far
>>> simpler as well, as it doesn't require a ton of file/string processing.
>> Yes, this way is much better for cpuset discovery. But is there such a
>> syscall for hugepages?
>>
> In what capacity?  Interrogating how many hugepages you have, or to what node
> they are affined to?  Capacity would require reading the requisite proc file, as
> theres no posix api for this resource.  Node affinity can be implied by setting
> the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
> kernel will attempt to distribute hugepages evenly among the tasks' numa policy
> configuration.

For memory affinity, I believe the existing way of reading 
/proc/self/pagemap already handle the problem. What I was asking is how 
much memory (or hugepages in Linux's case) can be used. By the way, what 
is /proc/nr_hugepages?

>
> That said, I would advise that you strongly consider not exporting hugepages as
> a resource, as:
>
> a) Applications generally don't need to know that they are using hugepages, and
> so they dont need to know where said hugepages live, they just allocate memory
> via your allocation api and you give them something appropriate

But the allocation api provider, DPDK library, needs to know if it's 
using hugepages or not.

> b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
> that.  Some OS implement simmilar resources, but they may have very different
> semantics.  And other Arches may or may not implement various forms of compound
> paging at all.  As the DPDK expands to support more OS'es and arches, it would
> be nice to ensure that the programming surfaces that you expose have a more
> broad level of support.

That's why I put the current implementation in lib/librte_eal/linuxapp/. And 
the new API uses the words cores and memory, which are very generic IMO. 
In Linux's context, memory is interpreted as hugepages (maybe not 
correct because DPDK can be used with 4K memory). For other OSes, we 
could add similar limitations in their semantics.


Thanks,
Jianfeng

>
> Neil
>
>> Thanks,
>> Jianfeng
>>
>>> Neil
>>>
>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [RFC] eal: add cgroup-aware resource self discovery
  2016-01-27 12:02       ` Tan, Jianfeng
@ 2016-01-27 17:30         ` Neil Horman
  0 siblings, 0 replies; 63+ messages in thread
From: Neil Horman @ 2016-01-27 17:30 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, yuanhan.liu

On Wed, Jan 27, 2016 at 08:02:27PM +0800, Tan, Jianfeng wrote:
> Hi Neil,
> 
> On 1/26/2016 10:19 PM, Neil Horman wrote:
> >On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> >>Hi Neil,
> >>
> >>On 1/25/2016 9:46 PM, Neil Horman wrote:
> >>>On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> >>...
> >>>>-- 
> >>>>2.1.4
> >>>>
> >>>>
> >>>This doesn't make a whole lot of sense, for several reasons:
> >>>
> >>>1) Applications, as a general rule shouldn't be interrogating the cgroups
> >>>interface at all.
> >>The main reason to do this in DPDK is that DPDK obtains resource information
> >>from sysfs and proc, which are not well containerized so far. And DPDK
> >>pre-allocates resource instead of on-demand gradual allocating.
> >>
> >Not disagreeing with this, just suggesting that:
> >
> >1) Interrogating cgroups really isn't the best way to collect that information
> >2) Pre-allocating those resources isn't particularly wise without some mechanism
> >to reallocate it, as resource constraints can change (consider your cpuset
> >getting rewritten)
> 
> In the case of reallocate,
> For cpuset, DPDK panics in the initialization if set_affinity fails, but
> after that, cpuset rewritten will not bring any problem I believe.
Yes, that seems reasonable, but I think you need to update
rte_thread_set_affinity to not assume that success in pthread_setaffinity_np
means that all cpus in the provided mask are available.  That is to say, cpusetp
is subsequently stored in lcore information after the set, but may not reflect
the actual working set of processors; you should follow a successful set with a
call to pthread_getaffinity_np to retrieve the actual working cpuset.

As for subsequent changes to the cpuset, I'm not sure how you want to handle
that. I would think that you might want to run a check periodically or allow for
a SIGHUP or some other signal to trigger a rescan of your working cpuset so as
to keep the application in sync with the system.

> For memory, a running application uses 2G hugepages, then admin decreases
> hugetlb cgroup into 1G, the application will not get killed, unless it tries
> to access more hugepages (I'll double check this).
> 
No, the semantics should be identical to malloc/mmap (if you use the alloc_hugepages
api or the mmap api).  You should get a NULL return or other non-fatal indicator
if you allocate more than is available.

> So another way to address this problem is to add an option that DPDK tries
> best to allocate those resources, and if fails, it just posts a warning and
> uses those allocated resources, instead of panic. What do you think?
> 
Yes, that makes sense

> >
> >>>2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> >>>(the isolcpus command line argument, or a taskset on a parent process for
> >>>instance, but there are several others).
> >>Yes, I agree. To enable that, I'd like design the new API for resource self
> >>discovery in a flexible way. A parameter "type" is used to specify the
> >>solution to discovery way. In addition, I'm considering to add a callback
> >>function pointer so that users can write their own resource discovery
> >>functions.
> >>
> >Why?  You don't need an API for this, or if you really want one, it can be very
> >generic if you use POSIX apis to gather the information.  What you have here is
> >going to be very linux specific, and will need reimplementing for BSD or other
> >operating systems.  To use the cpuset example, instead of reading and parsing
> >the mask files in the cgroup filesystem module to find your task and
> >corresponding mask, just call sched_setaffinity with an all f's mask, then call
> >sched_getaffinity.  The returned mask will be all the cpus your process is
> >allowed to execute on, taking into account every limiting filter the system you
> >are running on offers.
> 
> Yes, it makes sense on cpu's side.
> 
> >
> >There are simmilar OS level POSIX apis for most resources out there.  You really
> >don't need to dig through cgroups just to learn what some of those reources are.
> >
> >>>Instead of trying to figure out what cpuset is valid for your process by
> >>>interrogating the cgroups heirarchy, instead you should follow the proscribed
> >>>method of calling sched_getaffinity after calling sched_setaffinity.  That will
> >>>give you the canonical cpuset that you are executing on, taking all cpuset
> >>>filters into account (including cgroups and any other restrictions).  Its far
> >>>simpler as well, as it doesn't require a ton of file/string processing.
> >>Yes, this way is much better for cpuset discovery. But is there such a
> >>syscall for hugepages?
> >>
> >In what capacity?  Interrogating how many hugepages you have, or to what node
> >they are affined to?  Capacity would require reading the requisite proc file, as
> >theres no posix api for this resource.  Node affinity can be implied by setting
> >the numa policy of the dpdk and then writing to /proc/nr_hugepages, as the
> >kernel will attempt to distribute hugepages evenly among the tasks' numa policy
> >configuration.
> 
> For memory affinity, I believe the existing way of reading
> /proc/self/pagemap already handle the problem. What I was asking is how much
> memory (or hugepages in Linux's case) can be used. By the way, what is
> /proc/nr_hugepages?
> 
For affinity, you can parse /proc/self/pagemap or any number of other procfiles,
but again, doing so is going to be very OS specific, and doesn't get you much in
terms of resource management. It only tells you where the pages reside now.

/proc/nr_hugepages is the proc tunable that lets you allocate/realocate
hugepages.

> >
> >That said, I would advise that you strongly consider not exporting hugepages as
> >a resource, as:
> >
> >a) Applications generally don't need to know that they are using hugepages, and
> >so they dont need to know where said hugepages live, they just allocate memory
> >via your allocation api and you give them something appropriate
> 
> But the allocation api provider, DPDK library, needs to know if it's using
> hugepages or not.
> 
Right, but your purpose was to expose this library to applications.  I'm
saying you really don't need to expose such a library API to applications. If
you just want to use it internally in dpdk, that's fine.

> >b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
> >that.  Some OS implement simmilar resources, but they may have very different
> >semantics.  And other Arches may or may not implement various forms of compound
> >paging at all.  As the DPDK expands to support more OS'es and arches, it would
> >be nice to ensure that the programming surfaces that you expose have a more
> >broad level of support.
> 
> That's why I put current implement in lib/librte_eal/linuxapp/. And the new
> API uses the words of cores and memory, which is very generic IMO. In
> Linux's context, memory is interpreted into hugepages (maybe not correct
> because DPDK can be used with 4K memory). For other OSes, we could add
> similar limitation in their semantics.
> 
> 
> Thanks,
> Jianfeng
> 
> >
> >Neil
> >
> >>Thanks,
> >>Jianfeng
> >>
> >>>Neil
> >>>
> >>
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] eal: make resource initialization more robust
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
  2016-01-25 13:46 ` Neil Horman
@ 2016-01-29 11:22 ` Jianfeng Tan
  2016-02-01 18:08   ` Neil Horman
                     ` (2 more replies)
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
                   ` (4 subsequent siblings)
  6 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-01-29 11:22 UTC (permalink / raw)
  To: dev

Current issue: DPDK is not that friendly to container environments, which
is caused by the fact that it pre-allocates resources like cores and
hugepages. But there are various resource limitations, for example, cgroup,
rlimit, cpuset, etc.

For cores, this patch makes use of pthread_getaffinity_np to further
narrow down detected cores before parsing coremask (-c), corelist (-l),
and coremap (--lcores).

For hugepages, this patch adds a recovery mechanism for the case that
there are not as many hugepages available as requested. It relies on a
memory access to fault in hugepages, and if that fails with SIGBUS,
recovers to the previously saved stack environment with siglongjmp().

Test example:
    a. cgcreate -g cpuset,hugetlb:/test-subgroup
    b. cgset -r cpuset.cpus=2-3 test-subgroup
    c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
    d. cgexec -g cpuset,hugetlb:test-subgroup \
	    ./examples/l2fwd/build/l2fwd -n 4 -- -p 3

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_lcore.c | 10 +++-
 lib/librte_eal/linuxapp/eal/eal_memory.c | 78 ++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 9 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index a4263ba..8e9c675 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,13 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t cpuset;
+	pthread_t tid;
+
+	tid = pthread_self();
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
+		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+			CPU_SET(lcore_id, &cpuset);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +77,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &cpuset)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 846fd31..837fd9e 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
@@ -405,11 +413,26 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
 			hugepg_tbl[i].orig_va = virtaddr;
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (sigsetjmp(jmpenv, 0)) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				return i;
+			}
 			memset(virtaddr, 0, hugepage_sz);
 		}
 		else {
@@ -421,7 +444,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -429,7 +452,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1075,6 +1098,31 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover = 0;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover)
+		sigaction(SIGBUS, &action_old, NULL);
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1161,8 +1209,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1176,10 +1227,19 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			internal_config.memory -=
+				hpi->hugepage_sz * (pages_old - pages_new);
+			nr_hugepages -= (pages_old - pages_new);
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1226,6 +1286,8 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
@ 2016-02-01 18:08   ` Neil Horman
  2016-02-22  6:08   ` Tan, Jianfeng
  2016-02-28 21:12   ` Thomas Monjalon
  2 siblings, 0 replies; 63+ messages in thread
From: Neil Horman @ 2016-02-01 18:08 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev

On Fri, Jan 29, 2016 at 07:22:02PM +0800, Jianfeng Tan wrote:
> Current issue: DPDK is not that friendly to container environments,
> because it pre-allocates resources like cores and hugepages. But there
> are various resource limitations, for example, cgroup, rlimit,
> cpuset, etc.
> 
> For cores, this patch makes use of pthread_getaffinity_np to further
> narrow down detected cores before parsing coremask (-c), corelist (-l),
> and coremap (--lcores).
> 
> For hugepages, this patch adds a recovery mechanism for the case that
> there are not as many hugepages available as requested. It relies on a
> memory access to fault-in hugepages, and if that fails with SIGBUS,
> recovers to the previously saved stack environment with siglongjmp().
> 
> Test example:
>     a. cgcreate -g cpuset,hugetlb:/test-subgroup
>     b. cgset -r cpuset.cpus=2-3 test-subgroup
>     c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>     d. cgexec -g cpuset,hugetlb:test-subgroup \
> 	    ./examples/l2fwd/build/l2fwd -n 4 -- -p 3
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_lcore.c | 10 +++-
>  lib/librte_eal/linuxapp/eal/eal_memory.c | 78 ++++++++++++++++++++++++++++----
>  2 files changed, 79 insertions(+), 9 deletions(-)
> 

This looks alot better.  One minor comment, the sigbus handler, you should
probably store the previous bus handler and restore it after you map all the
hugepages you want (lest you overwrite something an application is doing with
sigbus).


Other than that, nice work.
Neil
 
> diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> index a4263ba..8e9c675 100644
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -57,6 +57,13 @@ rte_eal_cpu_init(void)
>  	struct rte_config *config = rte_eal_get_configuration();
>  	unsigned lcore_id;
>  	unsigned count = 0;
> +	rte_cpuset_t cpuset;
> +	pthread_t tid;
> +
> +	tid = pthread_self();
> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
> +		for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
> +			CPU_SET(lcore_id, &cpuset);
>  
>  	/*
>  	 * Parse the maximum set of logical cores, detect the subset of running
> @@ -70,7 +77,8 @@ rte_eal_cpu_init(void)
>  
>  		/* in 1:1 mapping, record related cpu detected state */
>  		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
> -		if (lcore_config[lcore_id].detected == 0) {
> +		if (lcore_config[lcore_id].detected == 0 ||
> +		    !CPU_ISSET(lcore_id, &cpuset)) {
>  			config->lcore_role[lcore_id] = ROLE_OFF;
>  			lcore_config[lcore_id].core_index = -1;
>  			continue;
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 846fd31..837fd9e 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,8 @@
>  #include <errno.h>
>  #include <sys/ioctl.h>
>  #include <sys/time.h>
> +#include <signal.h>
> +#include <setjmp.h>
>  
>  #include <rte_log.h>
>  #include <rte_memory.h>
> @@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>  	return addr;
>  }
>  
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
>  /*
>   * Mmap all hugepages of hugepage table: it first open a file in
>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		if (fd < 0) {
>  			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>  					strerror(errno));
> -			return -1;
> +			return i;
>  		}
>  
>  		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
> @@ -405,11 +413,26 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>  					strerror(errno));
>  			close(fd);
> -			return -1;
> +			return i;
>  		}
>  
>  		if (orig) {
>  			hugepg_tbl[i].orig_va = virtaddr;
> +			/* In linux, hugetlb limitations, like cgroup, are
> +			 * enforced at fault time instead of mmap(), even
> +			 * with the option of MAP_POPULATE. Kernel will send
> +			 * a SIGBUS signal. To avoid to be killed, save stack
> +			 * environment here, if SIGBUS happens, we can jump
> +			 * back here.
> +			 */
> +			if (sigsetjmp(jmpenv, 0)) {
> +				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
> +					"hugepages of size %u MB\n",
> +					(unsigned)(hugepage_sz / 0x100000));
> +				munmap(virtaddr, hugepage_sz);
> +				close(fd);
> +				return i;
> +			}
>  			memset(virtaddr, 0, hugepage_sz);
>  		}
>  		else {
> @@ -421,7 +444,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>  				__func__, strerror(errno));
>  			close(fd);
> -			return -1;
> +			return i;
>  		}
>  
>  		close(fd);
> @@ -429,7 +452,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		vma_addr = (char *)vma_addr + hugepage_sz;
>  		vma_len -= hugepage_sz;
>  	}
> -	return 0;
> +	return i;
>  }
>  
>  #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> @@ -1075,6 +1098,31 @@ calc_num_pages_per_socket(uint64_t * memory,
>  	return total_num_pages;
>  }
>  
> +static struct sigaction action_old;
> +static int need_recover = 0;
> +
> +static void
> +register_sigbus(void)
> +{
> +	sigset_t mask;
> +	struct sigaction action;
> +
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGBUS);
> +	action.sa_flags = 0;
> +	action.sa_mask = mask;
> +	action.sa_handler = sigbus_handler;
> +
> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +	if (need_recover)
> +		sigaction(SIGBUS, &action_old, NULL);
> +}
> +
>  /*
>   * Prepare physical memory mapping: fill configuration structure with
>   * these infos, return 0 on success.
> @@ -1161,8 +1209,11 @@ rte_eal_hugepage_init(void)
>  
>  	hp_offset = 0; /* where we start the current page size entries */
>  
> +	register_sigbus();
> +
>  	/* map all hugepages and sort them */
>  	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
> +		int pages_old, pages_new;
>  		struct hugepage_info *hpi;
>  
>  		/*
> @@ -1176,10 +1227,19 @@ rte_eal_hugepage_init(void)
>  			continue;
>  
>  		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			internal_config.memory -=
> +				hpi->hugepage_sz * (pages_old - pages_new);
> +			nr_hugepages -= (pages_old - pages_new);
> +			hpi->num_pages[0] = pages_new;
> +			if (pages_new == 0)
> +				continue;
>  		}
>  
>  		/* find physical addresses and sockets for each hugepage */
> @@ -1226,6 +1286,8 @@ rte_eal_hugepage_init(void)
>  #endif
>  	}
>  
> +	recover_sigbus();
> +
>  #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
>  	nr_hugefiles = 0;
>  	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
> -- 
> 2.1.4
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
  2016-02-01 18:08   ` Neil Horman
@ 2016-02-22  6:08   ` Tan, Jianfeng
  2016-02-22 13:18     ` Neil Horman
  2016-02-28 21:12   ` Thomas Monjalon
  2 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-02-22  6:08 UTC (permalink / raw)
  To: dev, nhorman

Hi Neil,

Sorry — due to a previous misconfiguration of my email client, I missed
this email.

> This looks alot better.  One minor comment, the sigbus handler, you should
> probably store the previous bus handler and restore it after you map 
> all the
> hugepages you want (lest you overwrite something an application is 
> doing with
> sigbus).
>

I did not catch your point. I did store it "static struct sigaction 
action_old" and recover it after mapping all the hugepages. Can you give 
more details on this?

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-02-22  6:08   ` Tan, Jianfeng
@ 2016-02-22 13:18     ` Neil Horman
  0 siblings, 0 replies; 63+ messages in thread
From: Neil Horman @ 2016-02-22 13:18 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev

On Mon, Feb 22, 2016 at 02:08:51PM +0800, Tan, Jianfeng wrote:
> Hi Neil,
> 
> Sorry that for my previous misconfiguration of email agent, I missed this
> email.
> 
> >This looks alot better.  One minor comment, the sigbus handler, you should
> >probably store the previous bus handler and restore it after you map all
> >the
> >hugepages you want (lest you overwrite something an application is doing
> >with
> >sigbus).
> >
> 
> I did not catch your point. I did store it "static struct sigaction
> action_old" and recover it after mapping all the hugepages. Can you give
> more details on this?
> 
Nope, I can't because I missed the fact you had done that.  Apologies, it looks
good.

Acked-by: Neil Horman <nhorman@tuxdriver.com>

> Thanks,
> Jianfeng
> 
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
  2016-02-01 18:08   ` Neil Horman
  2016-02-22  6:08   ` Tan, Jianfeng
@ 2016-02-28 21:12   ` Thomas Monjalon
  2016-02-29  1:50     ` Tan, Jianfeng
  2 siblings, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-02-28 21:12 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev

Hi,

2016-01-29 19:22, Jianfeng Tan:
> Current issue: DPDK is not that friendly to container environment, which
> caused by that it pre-alloc resource like cores and hugepages. But there
> are this or that resource limitations, for examples, cgroup, rlimit,
> cpuset, etc.
> 
> For cores, this patch makes use of pthread_getaffinity_np to further
> narrow down detected cores before parsing coremask (-c), corelist (-l),
> and coremap (--lcores).
> 
> For hugepages, this patch adds a recover mechanism to the case that
> there are no that many hugepages can be used. It relys on a mem access
> to fault-in hugepages, and if fails with SIGBUS, recover to previously
> saved stack environment with siglongjmp().

These are some interesting ideas.
However, I am not sure a library should try to be so smart silently.
It needs more feedback to decide whether it can be the default behaviour
or an option.

Please send coremask and hugepage mapping as separate patches as they
are totally different and may be integrated separately.

Thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: make resource initialization more robust
  2016-02-28 21:12   ` Thomas Monjalon
@ 2016-02-29  1:50     ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-02-29  1:50 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

Hi Thomas,

On 2/29/2016 5:12 AM, Thomas Monjalon wrote:
> Hi,
>
> 2016-01-29 19:22, Jianfeng Tan:
>> Current issue: DPDK is not that friendly to container environment, which
>> caused by that it pre-alloc resource like cores and hugepages. But there
>> are this or that resource limitations, for examples, cgroup, rlimit,
>> cpuset, etc.
>>
>> For cores, this patch makes use of pthread_getaffinity_np to further
>> narrow down detected cores before parsing coremask (-c), corelist (-l),
>> and coremap (--lcores).
>>
>> For hugepages, this patch adds a recover mechanism to the case that
>> there are no that many hugepages can be used. It relys on a mem access
>> to fault-in hugepages, and if fails with SIGBUS, recover to previously
>> saved stack environment with siglongjmp().
> They are some interesting ideas.
> However, I am not sure a library should try to be so smart silently.
> It needs more feedback to decide wether it can be the default behaviour
> or an option.
>
> Please send coremask and hugepage mapping as separate patches as they
> are totally different and may be integrated separately.

Good advice, thanks! I'll do it.

And one more thing FYI: the coremask derived from pthread_getaffinity_np()
may have an issue on some Linux versions or distros: it excludes isolcpus.
This was reported by Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>,
and I'm still working it out.

Thanks,
Jianfeng

>
> Thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] eal: add option --avail-cores to detect lcores
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
  2016-01-25 13:46 ` Neil Horman
  2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
@ 2016-03-04 10:05 ` Jianfeng Tan
  2016-03-08  8:54   ` Panu Matilainen
  2016-04-26 12:39   ` Tan, Jianfeng
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-03-04 10:05 UTC (permalink / raw)
  To: dev

This patch adds option, --avail-cores, to use lcores which are available
by calling pthread_getaffinity_np() to narrow down detected cores before
parsing coremask (-c), corelist (-l), and coremap (--lcores).

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld \
		--avail-cores -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
 lib/librte_eal/common/eal_common_options.c | 52 ++++++++++++++++++++++++++++++
 lib/librte_eal/common/eal_options.h        |  2 ++
 2 files changed, 54 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..dc4882d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_AVAIL_CORES,       0, NULL, OPT_AVAIL_CORES_NUM      },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -681,6 +682,37 @@ err:
 }
 
 static int
+eal_parse_avail_cores(void)
+{
+	int i, count;
+	pthread_t tid;
+	rte_cpuset_t cpuset;
+	struct rte_config *cfg = rte_eal_get_configuration();
+
+	tid = pthread_self();
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
+		return -1;
+
+	for (i = 0, count = 0; i < RTE_MAX_LCORE; i++) {
+		if (lcore_config[i].detected && !CPU_ISSET(i, &cpuset)) {
+			RTE_LOG(DEBUG, EAL, "Flag lcore %u as undetected\n", i);
+			lcore_config[i].detected = 0;
+			lcore_config[i].core_index = -1;
+			cfg->lcore_role[i] = ROLE_OFF;
+			count++;
+		}
+	}
+	cfg->lcore_count -= count;
+	if (cfg->lcore_count == 0) {
+		RTE_LOG(ERR, EAL, "No lcores available\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+
+static int
 eal_parse_syslog(const char *facility, struct internal_config *conf)
 {
 	int i;
@@ -754,6 +786,10 @@ eal_parse_proc_type(const char *arg)
 	return RTE_PROC_INVALID;
 }
 
+static int param_coremask;
+static int param_corelist;
+static int param_coremap;
+
 int
 eal_parse_common_option(int opt, const char *optarg,
 			struct internal_config *conf)
@@ -775,6 +811,7 @@ eal_parse_common_option(int opt, const char *optarg,
 		break;
 	/* coremask */
 	case 'c':
+		param_coremask = 1;
 		if (eal_parse_coremask(optarg) < 0) {
 			RTE_LOG(ERR, EAL, "invalid coremask\n");
 			return -1;
@@ -782,6 +819,7 @@ eal_parse_common_option(int opt, const char *optarg,
 		break;
 	/* corelist */
 	case 'l':
+		param_corelist = 1;
 		if (eal_parse_corelist(optarg) < 0) {
 			RTE_LOG(ERR, EAL, "invalid core list\n");
 			return -1;
@@ -890,12 +928,25 @@ eal_parse_common_option(int opt, const char *optarg,
 		break;
 	}
 	case OPT_LCORES_NUM:
+		param_coremap = 1;
 		if (eal_parse_lcores(optarg) < 0) {
 			RTE_LOG(ERR, EAL, "invalid parameter for --"
 				OPT_LCORES "\n");
 			return -1;
 		}
 		break;
+	case OPT_AVAIL_CORES_NUM:
+		if (param_coremask || param_corelist || param_coremap) {
+			RTE_LOG(ERR, EAL, "should put --" OPT_AVAIL_CORES
+				" before -c, -l and --" OPT_LCORES "\n");
+			return -1;
+		}
+		if (eal_parse_avail_cores() < 0) {
+			RTE_LOG(ERR, EAL, "failed to use --"
+				OPT_AVAIL_CORES "\n");
+			return -1;
+		}
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
@@ -990,6 +1041,7 @@ eal_common_usage(void)
 	       "                      ',' is used for single number separator.\n"
 	       "                      '( )' can be omitted for single element group,\n"
 	       "                      '@' can be omitted if cpus and lcores have the same value\n"
+	       "  --"OPT_AVAIL_CORES"       Use pthread_getaffinity_np() to detect cores to be used\n"
 	       "  --"OPT_MASTER_LCORE" ID   Core ID that is used as master\n"
 	       "  -n CHANNELS         Number of memory channels\n"
 	       "  -m MB               Memory to allocate (see also --"OPT_SOCKET_MEM")\n"
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..b2ddea3 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_AVAIL_CORES       "avail-cores"
+	OPT_AVAIL_CORES_NUM,
 	OPT_LONG_MAX_NUM
 };
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH] eal: make hugetlb initialization more robust
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (2 preceding siblings ...)
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
@ 2016-03-04 10:58 ` Jianfeng Tan
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
                     ` (2 more replies)
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
                   ` (2 subsequent siblings)
  6 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-03-04 10:58 UTC (permalink / raw)
  To: dev

This patch adds an option, --huge-trybest, to use a recovery mechanism for
the case that not as many hugepages (declared in sysfs) can be used as
requested. It relies on a memory access to fault-in hugepages, and if that
fails with SIGBUS, recovers to the previously saved stack environment with
siglongjmp().

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
 lib/librte_eal/common/eal_common_options.c |  4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |  1 +
 lib/librte_eal/common/eal_options.h        |  2 +
 lib/librte_eal/linuxapp/eal/eal.c          |  1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 95 +++++++++++++++++++++++++++---
 5 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..1766d7f 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,12 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +404,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -407,7 +415,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +425,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (sigsetjmp(jmpenv, 0)) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +459,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1065,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1178,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1197,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1261,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1450,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
@ 2016-03-08  1:42   ` Jianfeng Tan
  2016-03-08  8:46     ` Tan, Jianfeng
                       ` (2 more replies)
  2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
  2 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-03-08  1:42 UTC (permalink / raw)
  To: dev

This patch adds an option, --huge-trybest, to use a recovery mechanism for
the case that not as many hugepages (declared in sysfs) can be used as
requested. It relies on a memory access to fault-in hugepages, and if that
fails with SIGBUS, recovers to the previously saved stack environment with
siglongjmp().

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 ++++++++++++++++++++++++++---
 5 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..e4e1f3b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling setjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_setjmp(void)
+{
+	return setjmp(jmpenv);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		if (fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_setjmp()) {
+				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
 			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
@ 2016-03-08  8:46     ` Tan, Jianfeng
  2016-05-04 11:07     ` Sergio Gonzalez Monroy
  2016-05-04 12:25     ` Sergio Gonzalez Monroy
  2 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-08  8:46 UTC (permalink / raw)
  To: dev



On 3/8/2016 9:42 AM, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
> 	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>

Sorry, forgot to add ack from Neil.
Acked-by: Neil Horman <nhorman@tuxdriver.com>

> ---
> v2:
>   - Address the compiling error by move setjmp into a wrap method.
>
>   lib/librte_eal/common/eal_common_options.c |   4 ++
>   lib/librte_eal/common/eal_internal_cfg.h   |   1 +
>   lib/librte_eal/common/eal_options.h        |   2 +
>   lib/librte_eal/linuxapp/eal/eal.c          |   1 +
>   lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 ++++++++++++++++++++++++++---
>   5 files changed, 104 insertions(+), 8 deletions(-)
>
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..8ff6a2e 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>   	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>   	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>   	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
>   	{0,                     0, NULL, 0                        }
>   };
>   
> @@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
>   			return -1;
>   		}
>   		break;
> +	case OPT_HUGE_TRYBEST_NUM:
> +		internal_config.huge_trybest = 1;
> +		break;
>   
>   	/* don't know what to do, leave this to caller */
>   	default:
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..90a3533 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -64,6 +64,7 @@ struct internal_config {
>   	volatile unsigned force_nchannel; /**< force number of channels */
>   	volatile unsigned force_nrank;    /**< force number of ranks */
>   	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
> +	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
>   	unsigned hugepage_unlink;         /**< true to unlink backing files */
>   	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
>   	volatile unsigned no_pci;         /**< true to disable PCI */
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..02397c5 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>   	OPT_VMWARE_TSC_MAP_NUM,
>   #define OPT_XEN_DOM0          "xen-dom0"
>   	OPT_XEN_DOM0_NUM,
> +#define OPT_HUGE_TRYBEST      "huge-trybest"
> +	OPT_HUGE_TRYBEST_NUM,
>   	OPT_LONG_MAX_NUM
>   };
>   
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
> index ceac435..3e23877 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -343,6 +343,7 @@ eal_usage(const char *prgname)
>   	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
>   	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
>   	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
> +	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
>   	       "\n");
>   	/* Allow the application to print its usage message too if hook is set */
>   	if ( rte_application_usage_hook ) {
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..e4e1f3b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,8 @@
>   #include <errno.h>
>   #include <sys/ioctl.h>
>   #include <sys/time.h>
> +#include <signal.h>
> +#include <setjmp.h>
>   
>   #include <rte_log.h>
>   #include <rte_memory.h>
> @@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>   	return addr;
>   }
>   
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +	return setjmp(jmpenv);
> +}
>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		if (fd < 0) {
>   			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>   					strerror(errno));
> -			return -1;
> +			return i;
>   		}
>   
>   		/* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>   					strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>   
>   		if (orig) {
> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			hugepg_tbl[i].final_va = virtaddr;
>   		}
>   
> +		if (orig && internal_config.huge_trybest) {
> +			/* In linux, hugetlb limitations, like cgroup, are
> +			 * enforced at fault time instead of mmap(), even
> +			 * with the option of MAP_POPULATE. Kernel will send
> +			 * a SIGBUS signal. To avoid to be killed, save stack
> +			 * environment here, if SIGBUS happens, we can jump
> +			 * back here.
> +			 */
> +			if (wrap_setjmp()) {
> +				RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
> +					"hugepages of size %u MB\n",
> +					(unsigned)(hugepage_sz / 0x100000));
> +				munmap(virtaddr, hugepage_sz);
> +				close(fd);
> +				unlink(hugepg_tbl[i].filepath);
> +				return i;
> +			}
> +			*(int *)virtaddr = 0;
> +		}
> +
> +
>   		/* set shared flock on the file. */
>   		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>   			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>   				__func__, strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>   
>   		close(fd);
> @@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		vma_addr = (char *)vma_addr + hugepage_sz;
>   		vma_len -= hugepage_sz;
>   	}
> -	return 0;
> +	return i;
>   }
>   
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> @@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
>   	return total_num_pages;
>   }
>   
> +static struct sigaction action_old;
> +static int need_recover;
> +
> +static void
> +register_sigbus(void)
> +{
> +	sigset_t mask;
> +	struct sigaction action;
> +
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGBUS);
> +	action.sa_flags = 0;
> +	action.sa_mask = mask;
> +	action.sa_handler = sigbus_handler;
> +
> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +	if (need_recover) {
> +		sigaction(SIGBUS, &action_old, NULL);
> +		need_recover = 0;
> +	}
> +}
> +
>   /*
>    * Prepare physical memory mapping: fill configuration structure with
>    * these infos, return 0 on success.
> @@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
>   
>   	hp_offset = 0; /* where we start the current page size entries */
>   
> +	if (internal_config.huge_trybest)
> +		register_sigbus();
> +
>   	/* map all hugepages and sort them */
>   	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
> +		int pages_old, pages_new;
>   		struct hugepage_info *hpi;
>   
>   		/*
> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>   			continue;
>   
>   		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			if (internal_config.huge_trybest) {
> +				int pages = pages_old - pages_new;
> +
> +				internal_config.memory -=
> +					hpi->hugepage_sz * pages;
> +				nr_hugepages -= pages;
> +				hpi->num_pages[0] = pages_new;
> +				if (pages_new == 0)
> +					continue;
> +			} else
> +				goto fail;
>   		}
>   
>   		/* find physical addresses and sockets for each hugepage */
> @@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
>   #endif
>   	}
>   
> +	if (internal_config.huge_trybest)
> +		recover_sigbus();
> +
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
>   	nr_hugefiles = 0;
>   	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
> @@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
>   	return 0;
>   
>   fail:
> +	if (internal_config.huge_trybest)
> +		recover_sigbus();
>   	free(tmp_hp);
>   	return -1;
>   }

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
@ 2016-03-08  8:54   ` Panu Matilainen
  2016-03-08 17:38     ` Tan, Jianfeng
  2016-04-26 12:39   ` Tan, Jianfeng
  1 sibling, 1 reply; 63+ messages in thread
From: Panu Matilainen @ 2016-03-08  8:54 UTC (permalink / raw)
  To: Jianfeng Tan, dev

On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> This patch adds option, --avail-cores, to use lcores which are available
> by calling pthread_getaffinity_np() to narrow down detected cores before
> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> 		--avail-cores -m 1024
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Neil Horman <nhorman@tuxdriver.com>

Hmm, to me this sounds like something that should always be done, so 
there's no need for an option. Or, if there's a chance it might do the 
wrong thing in some rare circumstance, then perhaps there should be a 
disabler option instead?

Or am I just missing something?

	- Panu -

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-08  8:54   ` Panu Matilainen
@ 2016-03-08 17:38     ` Tan, Jianfeng
  2016-03-09 13:05       ` Panu Matilainen
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-08 17:38 UTC (permalink / raw)
  To: Panu Matilainen, dev

Hi Panu,

On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>> This patch adds option, --avail-cores, to use lcores which are available
>> by calling pthread_getaffinity_np() to narrow down detected cores before
>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>
>> Test example:
>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>         --avail-cores -m 1024
>>
>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>
> Hmm, to me this sounds like something that should be done always so 
> there's no need for an option. Or if there's a chance it might do the 
> wrong thing in some rare circumstance then perhaps there should be a 
> disabler option instead?

Thanks for comments.

Yes, there's a use case that we cannot handle.

If we make it the default, DPDK applications may fail to start when a user 
specifies a core in isolcpus and the parent process (say, bash) has a 
cpuset affinity that excludes isolcpus. Originally, DPDK applications 
just blindly call pthread_setaffinity_np(), and it always succeeds because 
the process always has root privilege to change any CPU affinity.

Now, if we do the checking in rte_eal_cpu_init(), those lcores will be 
flagged as undetected (in my older implementation), and that leads to 
failure. To make it work correctly, we would always have to prepend 
"taskset <mask>" (or use some other means) to DPDK application command lines.

What do you think?

Thanks,
Jianfeng

>
> Or am I just missing something?
>
>     - Panu -
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-08 17:38     ` Tan, Jianfeng
@ 2016-03-09 13:05       ` Panu Matilainen
  2016-03-09 13:53         ` Tan, Jianfeng
  2016-05-18 12:46         ` David Marchand
  0 siblings, 2 replies; 63+ messages in thread
From: Panu Matilainen @ 2016-03-09 13:05 UTC (permalink / raw)
  To: Tan, Jianfeng, dev

On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> Hi Panu,
>
> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>> This patch adds option, --avail-cores, to use lcores which are available
>>> by calling pthread_getaffinity_np() to narrow down detected cores before
>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>
>>> Test example:
>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>         --avail-cores -m 1024
>>>
>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>
>> Hmm, to me this sounds like something that should be done always so
>> there's no need for an option. Or if there's a chance it might do the
>> wrong thing in some rare circumstance then perhaps there should be a
>> disabler option instead?
>
> Thanks for comments.
>
> Yes, there's a use case that we cannot handle.
>
> If we make it as default, DPDK applications may fail to start, when user
> specifies a core in isolcpus and its parent process (say bash) has a
> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> just blindly do pthread_setaffinity_np() and it always succeeds because
> it always has root privilege to change any cpu affinity.
>
> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> flagged as undetected (in my older implementation) and leads to failure.
> To make it correct, we would always add "taskset mask" (or other ways)
> before DPDK application cmd lines.
>
> How do you think?

I still think it sounds like something that should be done by default 
and maybe be overridable with some flag, rather than the other way 
around. Another alternative might be to always detect the cores, but when 
running as root, override the detection and emit a warning.

But I don't know, just wondering. To look at it from another angle: why 
would somebody use this new --avail-cores option, and in what situation, 
if things "just work" otherwise anyway?

	- Panu -

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 13:05       ` Panu Matilainen
@ 2016-03-09 13:53         ` Tan, Jianfeng
  2016-03-09 14:01           ` Ananyev, Konstantin
  2016-05-18 12:46         ` David Marchand
  1 sibling, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 13:53 UTC (permalink / raw)
  To: Panu Matilainen, dev



On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>> Hi Panu,
>>
>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>> This patch adds option, --avail-cores, to use lcores which are 
>>>> available
>>>> by calling pthread_getaffinity_np() to narrow down detected cores 
>>>> before
>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>
>>>> Test example:
>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>         --avail-cores -m 1024
>>>>
>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>
>>> Hmm, to me this sounds like something that should be done always so
>>> there's no need for an option. Or if there's a chance it might do the
>>> wrong thing in some rare circumstance then perhaps there should be a
>>> disabler option instead?
>>
>> Thanks for comments.
>>
>> Yes, there's a use case that we cannot handle.
>>
>> If we make it as default, DPDK applications may fail to start, when user
>> specifies a core in isolcpus and its parent process (say bash) has a
>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>> just blindly do pthread_setaffinity_np() and it always succeeds because
>> it always has root privilege to change any cpu affinity.
>>
>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>> flagged as undetected (in my older implementation) and leads to failure.
>> To make it correct, we would always add "taskset mask" (or other ways)
>> before DPDK application cmd lines.
>>
>> How do you think?
>
> I still think it sounds like something that should be done by default 
> and maybe be overridable with some flag, rather than the other way 
> around. Another alternative might be detecting the cores always but if 
> running as root, override but with a warning.

Regarding your second solution: can only root set affinity to isolcpus?
Your first solution seems like a promising way to me.

>
> But I dont know, just wondering. To look at it from another angle: why 
> would somebody use this new --avail-cores option and in what 
> situation, if things "just work" otherwise anyway?

For DPDK applications, the most common way to initialize DPDK is like 
this: "$dpdk-app [options for DPDK] -- [options for app]", so users need 
to specify which cores to run on and how many hugepages to use. Suppose 
we need this dpdk-app to run in a container: users already provide that 
information when they set up the cgroup for it to run inside, and this 
option (or this patch) makes DPDK smart enough to discover how much 
resource can be used. Does that make sense?

Thanks,
Jianfeng


>
>     - Panu -
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 13:53         ` Tan, Jianfeng
@ 2016-03-09 14:01           ` Ananyev, Konstantin
  2016-03-09 14:17             ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 14:01 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
> Sent: Wednesday, March 09, 2016 1:53 PM
> To: Panu Matilainen; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> 
> 
> 
> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> > On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >> Hi Panu,
> >>
> >> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>> This patch adds option, --avail-cores, to use lcores which are
> >>>> available
> >>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>> before
> >>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>
> >>>> Test example:
> >>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>         --avail-cores -m 1024
> >>>>
> >>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>
> >>> Hmm, to me this sounds like something that should be done always so
> >>> there's no need for an option. Or if there's a chance it might do the
> >>> wrong thing in some rare circumstance then perhaps there should be a
> >>> disabler option instead?
> >>
> >> Thanks for comments.
> >>
> >> Yes, there's a use case that we cannot handle.
> >>
> >> If we make it as default, DPDK applications may fail to start, when user
> >> specifies a core in isolcpus and its parent process (say bash) has a
> >> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >> just blindly do pthread_setaffinity_np() and it always succeeds because
> >> it always has root privilege to change any cpu affinity.
> >>
> >> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >> flagged as undetected (in my older implementation) and leads to failure.
> >> To make it correct, we would always add "taskset mask" (or other ways)
> >> before DPDK application cmd lines.
> >>
> >> How do you think?
> >
> > I still think it sounds like something that should be done by default
> > and maybe be overridable with some flag, rather than the other way
> > around. Another alternative might be detecting the cores always but if
> > running as root, override but with a warning.
> 
> For your second solution, only root can setaffinity to isolcpus?
> Your first solution seems like a promising way for me.
> 
> >
> > But I dont know, just wondering. To look at it from another angle: why
> > would somebody use this new --avail-cores option and in what
> > situation, if things "just work" otherwise anyway?
> 
> For DPDK applications, the most common case to initialize DPDK is like
> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> to specify which cores to run and how much hugepages are used. Suppose
> we need this dpdk-app to run in a container, users already give those
> information when they build up the cgroup for it to run inside, this
> option or this patch is to make DPDK more smart to discover how much
> resource will be used. Make sense?

But then, all we need might be just a script that would extract this information from the system
and form a proper cmdline parameter for DPDK? 
Konstantin

> 
> Thanks,
> Jianfeng
> 
> 
> >
> >     - Panu -
> >


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:01           ` Ananyev, Konstantin
@ 2016-03-09 14:17             ` Tan, Jianfeng
  2016-03-09 14:44               ` Ananyev, Konstantin
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 14:17 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev



On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
>> Sent: Wednesday, March 09, 2016 1:53 PM
>> To: Panu Matilainen; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>
>>
>>
>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>>> Hi Panu,
>>>>
>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>> available
>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>> before
>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>
>>>>>> Test example:
>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>          --avail-cores -m 1024
>>>>>>
>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>> Hmm, to me this sounds like something that should be done always so
>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>> disabler option instead?
>>>> Thanks for comments.
>>>>
>>>> Yes, there's a use case that we cannot handle.
>>>>
>>>> If we make it as default, DPDK applications may fail to start, when user
>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>> it always has root privilege to change any cpu affinity.
>>>>
>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>> before DPDK application cmd lines.
>>>>
>>>> How do you think?
>>> I still think it sounds like something that should be done by default
>>> and maybe be overridable with some flag, rather than the other way
>>> around. Another alternative might be detecting the cores always but if
>>> running as root, override but with a warning.
>> For your second solution, only root can setaffinity to isolcpus?
>> Your first solution seems like a promising way for me.
>>
>>> But I dont know, just wondering. To look at it from another angle: why
>>> would somebody use this new --avail-cores option and in what
>>> situation, if things "just work" otherwise anyway?
>> For DPDK applications, the most common case to initialize DPDK is like
>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>> to specify which cores to run and how much hugepages are used. Suppose
>> we need this dpdk-app to run in a container, users already give those
>> information when they build up the cgroup for it to run inside, this
>> option or this patch is to make DPDK more smart to discover how much
>> resource will be used. Make sense?
> But then, all we need might be just a script that would extract this information from the system
> and form a proper cmdline parameter for DPDK?

Yes, a script would work. Or we could construct (argc, argv) to call 
rte_eal_init() in the application. But as Neil Horman once suggested, a 
simple pthread_getaffinity_np() gets everything done. So is it worth 
a patch here?

Thanks,
Jianfeng

> Konstantin
>
>> Thanks,
>> Jianfeng
>>
>>
>>>      - Panu -
>>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:17             ` Tan, Jianfeng
@ 2016-03-09 14:44               ` Ananyev, Konstantin
  2016-03-09 14:55                 ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 14:44 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev



> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Wednesday, March 09, 2016 2:17 PM
> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> 
> 
> 
> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
> >
> >> -----Original Message-----
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
> >> Sent: Wednesday, March 09, 2016 1:53 PM
> >> To: Panu Matilainen; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> >>
> >>
> >>
> >> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> >>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >>>> Hi Panu,
> >>>>
> >>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>>> This patch adds option, --avail-cores, to use lcores which are
> >>>>>> available
> >>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>>>> before
> >>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>>
> >>>>>> Test example:
> >>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>>          --avail-cores -m 1024
> >>>>>>
> >>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>> Hmm, to me this sounds like something that should be done always so
> >>>>> there's no need for an option. Or if there's a chance it might do the
> >>>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>>> disabler option instead?
> >>>> Thanks for comments.
> >>>>
> >>>> Yes, there's a use case that we cannot handle.
> >>>>
> >>>> If we make it as default, DPDK applications may fail to start, when user
> >>>> specifies a core in isolcpus and its parent process (say bash) has a
> >>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>>> it always has root privilege to change any cpu affinity.
> >>>>
> >>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>>> flagged as undetected (in my older implementation) and leads to failure.
> >>>> To make it correct, we would always add "taskset mask" (or other ways)
> >>>> before DPDK application cmd lines.
> >>>>
> >>>> How do you think?
> >>> I still think it sounds like something that should be done by default
> >>> and maybe be overridable with some flag, rather than the other way
> >>> around. Another alternative might be detecting the cores always but if
> >>> running as root, override but with a warning.
> >> For your second solution, only root can setaffinity to isolcpus?
> >> Your first solution seems like a promising way for me.
> >>
> >>> But I dont know, just wondering. To look at it from another angle: why
> >>> would somebody use this new --avail-cores option and in what
> >>> situation, if things "just work" otherwise anyway?
> >> For DPDK applications, the most common case to initialize DPDK is like
> >> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> >> to specify which cores to run and how much hugepages are used. Suppose
> >> we need this dpdk-app to run in a container, users already give those
> >> information when they build up the cgroup for it to run inside, this
> >> option or this patch is to make DPDK more smart to discover how much
> >> resource will be used. Make sense?
> > But then, all we need might be just a script that would extract this information from the system
> > and form a proper cmdline parameter for DPDK?
> 
> Yes, a script will work. Or to construct (argc, argv) to call
> rte_eal_init() in the application. But as Neil Horman once suggested, a
> simple pthread_getaffinity_np() will get all things done. So if it worth
> a patch here?

Don't know...
Personally I would prefer not to put extra logic inside EAL.
For me - there are too many different options already.
From other side looking at the patch itself:
You are updating lcore_count and lcore_config[],based on physical cpu availability,
but these days it is not always one-to-one mapping between EAL lcore and physical cpu. 
Shouldn't that be taken into account?
Konstantin
 



^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:44               ` Ananyev, Konstantin
@ 2016-03-09 14:55                 ` Tan, Jianfeng
  2016-03-09 15:17                   ` Ananyev, Konstantin
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 14:55 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev

Hi Konstantin,

On 3/9/2016 10:44 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: Tan, Jianfeng
>> Sent: Wednesday, March 09, 2016 2:17 PM
>> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>
>>
>>
>> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
>>>> -----Original Message-----
>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
>>>> Sent: Wednesday, March 09, 2016 1:53 PM
>>>> To: Panu Matilainen; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>>>
>>>>
>>>>
>>>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
>>>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>>>>> Hi Panu,
>>>>>>
>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>>>> available
>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>>>> before
>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>>>
>>>>>>>> Test example:
>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>>>           --avail-cores -m 1024
>>>>>>>>
>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>>>> Hmm, to me this sounds like something that should be done always so
>>>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>>>> disabler option instead?
>>>>>> Thanks for comments.
>>>>>>
>>>>>> Yes, there's a use case that we cannot handle.
>>>>>>
>>>>>> If we make it as default, DPDK applications may fail to start, when user
>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>>>> it always has root privilege to change any cpu affinity.
>>>>>>
>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>>>> before DPDK application cmd lines.
>>>>>>
>>>>>> How do you think?
>>>>> I still think it sounds like something that should be done by default
>>>>> and maybe be overridable with some flag, rather than the other way
>>>>> around. Another alternative might be detecting the cores always but if
>>>>> running as root, override but with a warning.
>>>> For your second solution, only root can setaffinity to isolcpus?
>>>> Your first solution seems like a promising way for me.
>>>>
>>>>> But I dont know, just wondering. To look at it from another angle: why
>>>>> would somebody use this new --avail-cores option and in what
>>>>> situation, if things "just work" otherwise anyway?
>>>> For DPDK applications, the most common case to initialize DPDK is like
>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>>>> to specify which cores to run and how much hugepages are used. Suppose
>>>> we need this dpdk-app to run in a container, users already give those
>>>> information when they build up the cgroup for it to run inside, this
>>>> option or this patch is to make DPDK more smart to discover how much
>>>> resource will be used. Make sense?
>>> But then, all we need might be just a script that would extract this information from the system
>>> and form a proper cmdline parameter for DPDK?
>> Yes, a script will work. Or to construct (argc, argv) to call
>> rte_eal_init() in the application. But as Neil Horman once suggested, a
>> simple pthread_getaffinity_np() will get all things done. So if it worth
>> a patch here?
> Don't know...
> Personally I would prefer not to put extra logic inside EAL.
> For me - there are too many different options already.

Then how about making it the default in rte_eal_cpu_init()? It is already 
known that this will cause trouble for users of isolcpus; they would need 
to add "taskset [mask]" before starting a DPDK app.

>  From other side looking at the patch itself:
> You are updating lcore_count and lcore_config[],based on physical cpu availability,
> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
> Shouldn't that be taken into account?

I have not seen the problem so far, because this work is done before 
parsing the coremask (-c), corelist (-l), and coremap (--lcores). If a core 
is disabled here, it is as if it were not detected in rte_eal_cpu_init(). 
Could you please give more hints?

Thanks,
Jianfeng

> Konstantin
>   
>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 14:55                 ` Tan, Jianfeng
@ 2016-03-09 15:17                   ` Ananyev, Konstantin
  2016-03-09 17:45                     ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 15:17 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev

Hi Jianfeng,

> -----Original Message-----
> From: Tan, Jianfeng
> Sent: Wednesday, March 09, 2016 2:56 PM
> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> 
> Hi Konstantin,
> 
> On 3/9/2016 10:44 PM, Ananyev, Konstantin wrote:
> >
> >> -----Original Message-----
> >> From: Tan, Jianfeng
> >> Sent: Wednesday, March 09, 2016 2:17 PM
> >> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> >>
> >>
> >>
> >> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
> >>>> -----Original Message-----
> >>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
> >>>> Sent: Wednesday, March 09, 2016 1:53 PM
> >>>> To: Panu Matilainen; dev@dpdk.org
> >>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
> >>>>
> >>>>
> >>>>
> >>>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
> >>>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >>>>>> Hi Panu,
> >>>>>>
> >>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>>>>> This patch adds option, --avail-cores, to use lcores which are
> >>>>>>>> available
> >>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>>>>>> before
> >>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>>>>
> >>>>>>>> Test example:
> >>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>>>>           --avail-cores -m 1024
> >>>>>>>>
> >>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>>>> Hmm, to me this sounds like something that should be done always so
> >>>>>>> there's no need for an option. Or if there's a chance it might do the
> >>>>>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>>>>> disabler option instead?
> >>>>>> Thanks for comments.
> >>>>>>
> >>>>>> Yes, there's a use case that we cannot handle.
> >>>>>>
> >>>>>> If we make it as default, DPDK applications may fail to start, when user
> >>>>>> specifies a core in isolcpus and its parent process (say bash) has a
> >>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>>>>> it always has root privilege to change any cpu affinity.
> >>>>>>
> >>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>>>>> flagged as undetected (in my older implementation) and leads to failure.
> >>>>>> To make it correct, we would always add "taskset mask" (or other ways)
> >>>>>> before DPDK application cmd lines.
> >>>>>>
> >>>>>> How do you think?
> >>>>> I still think it sounds like something that should be done by default
> >>>>> and maybe be overridable with some flag, rather than the other way
> >>>>> around. Another alternative might be detecting the cores always but if
> >>>>> running as root, override but with a warning.
> >>>> For your second solution, only root can setaffinity to isolcpus?
> >>>> Your first solution seems like a promising way for me.
> >>>>
> >>>>> But I dont know, just wondering. To look at it from another angle: why
> >>>>> would somebody use this new --avail-cores option and in what
> >>>>> situation, if things "just work" otherwise anyway?
> >>>> For DPDK applications, the most common case to initialize DPDK is like
> >>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> >>>> to specify which cores to run and how much hugepages are used. Suppose
> >>>> we need this dpdk-app to run in a container, users already give those
> >>>> information when they build up the cgroup for it to run inside, this
> >>>> option or this patch is to make DPDK more smart to discover how much
> >>>> resource will be used. Make sense?
> >>> But then, all we need might be just a script that would extract this information from the system
> >>> and form a proper cmdline parameter for DPDK?
> >> Yes, a script will work. Or to construct (argc, argv) to call
> >> rte_eal_init() in the application. But as Neil Horman once suggested, a
> >> simple pthread_getaffinity_np() will get all things done. So if it worth
> >> a patch here?
> > Don't know...
> > Personally I would prefer not to put extra logic inside EAL.
> > For me - there are too many different options already.
> 
> Then how about make it default in rte_eal_cpu_init()? And it is already
> known it will bring trouble to those use isolcpus users, they need to
> add "taskset [mask]" before starting a DPDK app.

As I said - provide a script?
Same might be for amount of hugepage memory available to the user? 

> 
> >  From other side looking at the patch itself:
> > You are updating lcore_count and lcore_config[],based on physical cpu availability,
> > but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
> > Shouldn't that be taken into account?
> 
> I have not see the problem so far, because this work is done before
> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
> could you please give more hints?

I didn't try your changes, so probably I am missing something.
Let's say the user is allowed to use only cpus 0-3.
If he were to type:
 --avail-cores  --lcores='(1-7)@2',
then only lcores 1-3 would be started.
Again, if the user specified '2@(1-7)', it would also go undetected
that cpus 4-7 are not available to the user.
Is that so?

Konstantin

> 
> Thanks,
> Jianfeng
> 
> > Konstantin
> >
> >
> >


^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 15:17                   ` Ananyev, Konstantin
@ 2016-03-09 17:45                     ` Tan, Jianfeng
  2016-03-09 19:33                       ` Ananyev, Konstantin
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-09 17:45 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev

Hi Konstantin,

On 3/9/2016 11:17 PM, Ananyev, Konstantin wrote:
> Hi Jianfeng,
>
>> -----Original Message-----
>> From: Tan, Jianfeng
>> Sent: Wednesday, March 09, 2016 2:56 PM
>> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>
>> Hi Konstantin,
>>
>> On 3/9/2016 10:44 PM, Ananyev, Konstantin wrote:
>>>> -----Original Message-----
>>>> From: Tan, Jianfeng
>>>> Sent: Wednesday, March 09, 2016 2:17 PM
>>>> To: Ananyev, Konstantin; Panu Matilainen; dev@dpdk.org
>>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>>>
>>>>
>>>>
>>>> On 3/9/2016 10:01 PM, Ananyev, Konstantin wrote:
>>>>>> -----Original Message-----
>>>>>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tan, Jianfeng
>>>>>> Sent: Wednesday, March 09, 2016 1:53 PM
>>>>>> To: Panu Matilainen; dev@dpdk.org
>>>>>> Subject: Re: [dpdk-dev] [PATCH] eal: add option --avail-cores to detect lcores
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 3/9/2016 9:05 PM, Panu Matilainen wrote:
>>>>>>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>>>>>>> Hi Panu,
>>>>>>>>
>>>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>>>>>> available
>>>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>>>>>> before
>>>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>>>>>
>>>>>>>>>> Test example:
>>>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>>>>>            --avail-cores -m 1024
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>>>>>> Hmm, to me this sounds like something that should be done always so
>>>>>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>>>>>> disabler option instead?
>>>>>>>> Thanks for comments.
>>>>>>>>
>>>>>>>> Yes, there's a use case that we cannot handle.
>>>>>>>>
>>>>>>>> If we make it as default, DPDK applications may fail to start, when user
>>>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>>>>>> it always has root privilege to change any cpu affinity.
>>>>>>>>
>>>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>>>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>>>>>> before DPDK application cmd lines.
>>>>>>>>
>>>>>>>> How do you think?
>>>>>>> I still think it sounds like something that should be done by default
>>>>>>> and maybe be overridable with some flag, rather than the other way
>>>>>>> around. Another alternative might be detecting the cores always but if
>>>>>>> running as root, override but with a warning.
>>>>>> For your second solution, only root can setaffinity to isolcpus?
>>>>>> Your first solution seems like a promising way for me.
>>>>>>
>>>>>>> But I dont know, just wondering. To look at it from another angle: why
>>>>>>> would somebody use this new --avail-cores option and in what
>>>>>>> situation, if things "just work" otherwise anyway?
>>>>>> For DPDK applications, the most common case to initialize DPDK is like
>>>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>>>>>> to specify which cores to run and how much hugepages are used. Suppose
>>>>>> we need this dpdk-app to run in a container, users already give those
>>>>>> information when they build up the cgroup for it to run inside, this
>>>>>> option or this patch is to make DPDK more smart to discover how much
>>>>>> resource will be used. Make sense?
>>>>> But then, all we need might be just a script that would extract this information from the system
>>>>> and form a proper cmdline parameter for DPDK?
>>>> Yes, a script will work. Or to construct (argc, argv) to call
>>>> rte_eal_init() in the application. But as Neil Horman once suggested, a
>>>> simple pthread_getaffinity_np() will get all things done. So if it worth
>>>> a patch here?
>>> Don't know...
>>> Personally I would prefer not to put extra logic inside EAL.
>>> For me - there are too many different options already.
>> Then how about make it default in rte_eal_cpu_init()? And it is already
>> known it will bring trouble to those use isolcpus users, they need to
>> add "taskset [mask]" before starting a DPDK app.
> As I said - provide a script?

Yes. But what I want to say is that this script is hard to get right if 
there are different kinds of limitations. (That barely ever happens, though :-) )

> Same might be for amount of hugepage memory available to the user?

Ditto. Limitations like hugetlbfs quota, cgroup hugetlb, some are used 
by app themself (more like an artificial argument) ...
>
>>>   From other side looking at the patch itself:
>>> You are updating lcore_count and lcore_config[],based on physical cpu availability,
>>> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
>>> Shouldn't that be taken into account?
>> I have not see the problem so far, because this work is done before
>> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
>> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
>> could you please give more hints?
> I didn't test try changes, so probably I am missing something.
> Let say iuser allowed to use only cpus 0-3.
> If he would type with:
>   --avail-cores  --lcores='(1-7)@2',
> then only lcores 1-3 would be started.
> Again if user would specify '2@(1-7)' it would also be undetected
> that cpus 4-7 are note available to the user.
> Is that so?

After reading the code:
For case --lcores='(1-7)@2', lcores 1-7 would be started, and bind to 
pcore 2.
For case --lcores='2@(1-7)', this will fail with "core 4 unavailable".

It's because:
a. a 1:1 mapping is built up and flagged as detected if the pcore is 
found in sysfs. (ROLE_RTE, cpuset, detected is true)
b. in the beginning of eal_parse_lcores(), "reset lcore config". 
(ROLE_OFF, cpuset is empty, detected is still true)
c. pcore cpuset will be checked by convert_to_cpuset using the previous 
"detected" value.

I have tested it with the patch. Result aligns above analysis.
For case --lcores='(1-7)@2': sudo taskset 0xf 
./examples/helloworld/build/helloworld --avail-cores --lcores='(1-7)@2'
...
hello from core 2
hello from core 3
hello from core 4
hello from core 5
hello from core 6
hello from core 7
hello from core 1

For case --lcores='2@(1-7)': sudo taskset 0xf 
./examples/helloworld/build/helloworld --avail-cores --lcores='2@(1-7)'
...
EAL: core 4 unavailable
EAL: invalid parameter for --lcores
...

One thing may be worth mentioning: should "detected" be maintained in struct 
lcore_config? Maybe we need to maintain a data structure for pcores?

Thanks,
Jianfeng

>
> Konstantin
>
>> Thanks,
>> Jianfeng
>>
>>> Konstantin
>>>
>>>
>>>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 17:45                     ` Tan, Jianfeng
@ 2016-03-09 19:33                       ` Ananyev, Konstantin
  2016-03-10  1:36                         ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Ananyev, Konstantin @ 2016-03-09 19:33 UTC (permalink / raw)
  To: Tan, Jianfeng, Panu Matilainen, dev



> >>>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
> >>>>>>>>>> available
> >>>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
> >>>>>>>>>> before
> >>>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>>>>>>
> >>>>>>>>>> Test example:
> >>>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>>>>>>            --avail-cores -m 1024
> >>>>>>>>>>
> >>>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>>>>>> Hmm, to me this sounds like something that should be done always so
> >>>>>>>>> there's no need for an option. Or if there's a chance it might do the
> >>>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>>>>>>> disabler option instead?
> >>>>>>>> Thanks for comments.
> >>>>>>>>
> >>>>>>>> Yes, there's a use case that we cannot handle.
> >>>>>>>>
> >>>>>>>> If we make it as default, DPDK applications may fail to start, when user
> >>>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
> >>>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>>>>>>> it always has root privilege to change any cpu affinity.
> >>>>>>>>
> >>>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>>>>>>> flagged as undetected (in my older implementation) and leads to failure.
> >>>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
> >>>>>>>> before DPDK application cmd lines.
> >>>>>>>>
> >>>>>>>> How do you think?
> >>>>>>> I still think it sounds like something that should be done by default
> >>>>>>> and maybe be overridable with some flag, rather than the other way
> >>>>>>> around. Another alternative might be detecting the cores always but if
> >>>>>>> running as root, override but with a warning.
> >>>>>> For your second solution, only root can setaffinity to isolcpus?
> >>>>>> Your first solution seems like a promising way for me.
> >>>>>>
> >>>>>>> But I dont know, just wondering. To look at it from another angle: why
> >>>>>>> would somebody use this new --avail-cores option and in what
> >>>>>>> situation, if things "just work" otherwise anyway?
> >>>>>> For DPDK applications, the most common case to initialize DPDK is like
> >>>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
> >>>>>> to specify which cores to run and how much hugepages are used. Suppose
> >>>>>> we need this dpdk-app to run in a container, users already give those
> >>>>>> information when they build up the cgroup for it to run inside, this
> >>>>>> option or this patch is to make DPDK more smart to discover how much
> >>>>>> resource will be used. Make sense?
> >>>>> But then, all we need might be just a script that would extract this information from the system
> >>>>> and form a proper cmdline parameter for DPDK?
> >>>> Yes, a script will work. Or to construct (argc, argv) to call
> >>>> rte_eal_init() in the application. But as Neil Horman once suggested, a
> >>>> simple pthread_getaffinity_np() will get all things done. So if it worth
> >>>> a patch here?
> >>> Don't know...
> >>> Personally I would prefer not to put extra logic inside EAL.
> >>> For me - there are too many different options already.
> >> Then how about make it default in rte_eal_cpu_init()? And it is already
> >> known it will bring trouble to those use isolcpus users, they need to
> >> add "taskset [mask]" before starting a DPDK app.
> > As I said - provide a script?
> 
> Yes. But what I want to say is this script is hard to be right, if there
> are different kinds of limitations. (Barely happen though :-) )

My thought was to keep the dpdk code untouched - i.e. let it still blindly call pthread_setaffinity_np()
based on the input parameters, and in addition provide a script for those who want to run
in '--avail-cores' mode. 
So it could do 'taskset -p $$' and then either form a -c parameter list for the app,
or check the existing -c/-l/--lcores parameters and complain if a disallowed pcpu is detected.
But ok, maybe it is easier and more convenient to have this logic inside EAL
than in a separate script.

> 
> > Same might be for amount of hugepage memory available to the user?
> 
> Ditto. Limitations like hugetlbfs quota, cgroup hugetlb, some are used
> by app themself (more like an artificial argument) ...
> >
> >>>   From other side looking at the patch itself:
> >>> You are updating lcore_count and lcore_config[],based on physical cpu availability,
> >>> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
> >>> Shouldn't that be taken into account?
> >> I have not see the problem so far, because this work is done before
> >> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
> >> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
> >> could you please give more hints?
> > I didn't test try changes, so probably I am missing something.
> > Let say iuser allowed to use only cpus 0-3.
> > If he would type with:
> >   --avail-cores  --lcores='(1-7)@2',
> > then only lcores 1-3 would be started.
> > Again if user would specify '2@(1-7)' it would also be undetected
> > that cpus 4-7 are note available to the user.
> > Is that so?
> 
> After reading the code:
> For case --lcores='(1-7)@2', lcores 1-7 would be started, and bind to
> pcore 2.
> For case --lcores='2@(1-7)', this will fail with "core 4 unavailable".
> 
> It's because:
> a.  although 1:1 mapping is built-up and flagged as detected if pcore is
> found in sysfs. (ROLE_RTE, cpuset, detected is true)
> b. in the beginning of eal_parse_lcores(), "reset lcore config".
> (ROLE_OFF, cpuset is empty, detected is still true)
> c. pcore cpuset will be checked by convert_to_cpuset using the previous
> "detected" value.

Ok, my bad then - I misunderstood the code.
Thanks for explanation.
So if I get it right now - first inside lib/librte_eal/common/eal_common_lcore.c
Both lcore_count and lcore_config relate to the pcpus.
Then later, at lib/librte_eal/common/eal_common_options.c
they are overwritten related to lcores information.
Except for lcore_config[].detected, which seems to be kept intact.
Is that correct? 

> 
> I have tested it with the patch. Result aligns above analysis.
> For case --lcores='(1-7)@2': sudo taskset 0xf
> ./examples/helloworld/build/helloworld --avail-cores --lcores='(1-7)@2'
> ...
> hello from core 2
> hello from core 3
> hello from core 4
> hello from core 5
> hello from core 6
> hello from core 7
> hello from core 1
> 
> For case --lcores='2@(1-7)': sudo taskset 0xf
> ./examples/helloworld/build/helloworld --avail-cores --lcores='2@(1-7)'
> ...
> EAL: core 4 unavailable
> EAL: invalid parameter for --lcores
> ...
> 
> One thing may worth mention: shall "detected" be maintained in struct
> lcore_config? Maybe we need to maintain an data structure for pcores?

Yes, it might be good to split pcpu and lcores information somehow,
as it is a bit confusing right now.
But I suppose this is a subject for another patch/discussion.
Konstantin



^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 19:33                       ` Ananyev, Konstantin
@ 2016-03-10  1:36                         ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-03-10  1:36 UTC (permalink / raw)
  To: Ananyev, Konstantin, Panu Matilainen, dev



On 3/10/2016 3:33 AM, Ananyev, Konstantin wrote:
>
>>>>>>>>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>>>>>>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>>>>>>>>> This patch adds option, --avail-cores, to use lcores which are
>>>>>>>>>>>> available
>>>>>>>>>>>> by calling pthread_getaffinity_np() to narrow down detected cores
>>>>>>>>>>>> before
>>>>>>>>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>>>>>>>>
>>>>>>>>>>>> Test example:
>>>>>>>>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>>>>>>>>             --avail-cores -m 1024
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>>>>>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>>>>>>>> Hmm, to me this sounds like something that should be done always so
>>>>>>>>>>> there's no need for an option. Or if there's a chance it might do the
>>>>>>>>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>>>>>>>>> disabler option instead?
>>>>>>>>>> Thanks for comments.
>>>>>>>>>>
>>>>>>>>>> Yes, there's a use case that we cannot handle.
>>>>>>>>>>
>>>>>>>>>> If we make it as default, DPDK applications may fail to start, when user
>>>>>>>>>> specifies a core in isolcpus and its parent process (say bash) has a
>>>>>>>>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>>>>>>>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>>>>>>>>> it always has root privilege to change any cpu affinity.
>>>>>>>>>>
>>>>>>>>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>>>>>>>>> flagged as undetected (in my older implementation) and leads to failure.
>>>>>>>>>> To make it correct, we would always add "taskset mask" (or other ways)
>>>>>>>>>> before DPDK application cmd lines.
>>>>>>>>>>
>>>>>>>>>> How do you think?
>>>>>>>>> I still think it sounds like something that should be done by default
>>>>>>>>> and maybe be overridable with some flag, rather than the other way
>>>>>>>>> around. Another alternative might be detecting the cores always but if
>>>>>>>>> running as root, override but with a warning.
>>>>>>>> For your second solution, only root can setaffinity to isolcpus?
>>>>>>>> Your first solution seems like a promising way for me.
>>>>>>>>
>>>>>>>>> But I dont know, just wondering. To look at it from another angle: why
>>>>>>>>> would somebody use this new --avail-cores option and in what
>>>>>>>>> situation, if things "just work" otherwise anyway?
>>>>>>>> For DPDK applications, the most common case to initialize DPDK is like
>>>>>>>> this: "$dpdk-app [options for DPDK] -- [options for app]", so users need
>>>>>>>> to specify which cores to run and how much hugepages are used. Suppose
>>>>>>>> we need this dpdk-app to run in a container, users already give those
>>>>>>>> information when they build up the cgroup for it to run inside, this
>>>>>>>> option or this patch is to make DPDK more smart to discover how much
>>>>>>>> resource will be used. Make sense?
>>>>>>> But then, all we need might be just a script that would extract this information from the system
>>>>>>> and form a proper cmdline parameter for DPDK?
>>>>>> Yes, a script will work. Or to construct (argc, argv) to call
>>>>>> rte_eal_init() in the application. But as Neil Horman once suggested, a
>>>>>> simple pthread_getaffinity_np() will get all things done. So if it worth
>>>>>> a patch here?
>>>>> Don't know...
>>>>> Personally I would prefer not to put extra logic inside EAL.
>>>>> For me - there are too many different options already.
>>>> Then how about make it default in rte_eal_cpu_init()? And it is already
>>>> known it will bring trouble to those use isolcpus users, they need to
>>>> add "taskset [mask]" before starting a DPDK app.
>>> As I said - provide a script?
>> Yes. But what I want to say is this script is hard to get right, if there
>> are different kinds of limitations. (Barely happen though :-) )
> My thought was to keep dpdk code untouched - i.e. let it still blindly set_pthread_affinity()
> based on the input parameters, and in addition provide a script for those who want to run
> in '--avail-cores' mode.
> So it could do 'taskset -p $$' and then either form -c parameter list  for the app,
> or check existing -c/-l/--lcores parameter and complain if not allowed pcpu detected.
> But ok, might be it is easier and more convenient to have this logic inside EAL,
> then in a separate script.
>
>>> Same might be for amount of hugepage memory available to the user?
>> Ditto. Limitations like hugetlbfs quota, cgroup hugetlb, some are used
>> by app themself (more like an artificial argument) ...
>>>>>    From other side looking at the patch itself:
>>>>> You are updating lcore_count and lcore_config[],based on physical cpu availability,
>>>>> but these days it is not always one-to-one mapping between EAL lcore and physical cpu.
>>>>> Shouldn't that be taken into account?
>>>> I have not see the problem so far, because this work is done before
>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores). If a core
>>>> is disabled here, it's like it is not detected in rte_eal_cpu_init(). Or
>>>> could you please give more hints?
>>> I didn't test try changes, so probably I am missing something.
>>> Let's say the user is allowed to use only cpus 0-3.
>>> If he were to type:
>>>    --avail-cores  --lcores='(1-7)@2',
>>> then only lcores 1-3 would be started.
>>> Again if user would specify '2@(1-7)' it would also be undetected
>>> that cpus 4-7 are not available to the user.
>>> Is that so?
>> After reading the code:
>> For case --lcores='(1-7)@2', lcores 1-7 would be started, and bind to
>> pcore 2.
>> For case --lcores='2@(1-7)', this will fail with "core 4 unavailable".
>>
>> It's because:
>> a.  although 1:1 mapping is built-up and flagged as detected if pcore is
>> found in sysfs. (ROLE_RTE, cpuset, detected is true)
>> b. in the beginning of eal_parse_lcores(), "reset lcore config".
>> (ROLE_OFF, cpuset is empty, detected is still true)
>> c. pcore cpuset will be checked by convert_to_cpuset using the previous
>> "detected" value.
> Ok, my bad then - I misunderstood the code.
> Thanks for explanation.
> So if I get it right now - first inside lib/librte_eal/common/eal_common_lcore.c
> Both lcore_count and lcore_config relate to the pcpus.
> Then later, at lib/librte_eal/common/eal_common_options.c
> they are overwritten related to lcores information.
> Except lcore_config[].detected, which seems kept intact.
> Is that correct?

Yes, exactly. And really appreciate that you raise up this question for 
discussion.

>
>> I have tested it with the patch. Result aligns above analysis.
>> For case --lcores='(1-7)@2': sudo taskset 0xf
>> ./examples/helloworld/build/helloworld --avail-cores --lcores='(1-7)@2'
>> ...
>> hello from core 2
>> hello from core 3
>> hello from core 4
>> hello from core 5
>> hello from core 6
>> hello from core 7
>> hello from core 1
>>
>> For case --lcores='2@(1-7)': sudo taskset 0xf
>> ./examples/helloworld/build/helloworld --avail-cores --lcores='2@(1-7)'
>> ...
>> EAL: core 4 unavailable
>> EAL: invalid parameter for --lcores
>> ...
>>
>> One thing may worth mention: shall "detected" be maintained in struct
>> lcore_config? Maybe we need to maintain a data structure for pcores?
> Yes, it might be good to split pcpu and lcores information somehow,
> as it is a bit confusing right now.
> But I suppose this is a subject for another patch/discussion.

Yes, just another topic.

Thanks,
Jianfeng

> Konstantin
>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
  2016-03-08  8:54   ` Panu Matilainen
@ 2016-04-26 12:39   ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-04-26 12:39 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, sergio.gonzalez.monroy, nhorman, konstantin.ananyev

Hi,

Since some guys are asking about the status of this patch, I'd like to 
ping if anyone still has concerns.
Current conclusion is: with option --avail-cores.

Thanks,
Jianfeng

On 3/4/2016 6:05 PM, Jianfeng Tan wrote:
> This patch adds option, --avail-cores, to use lcores which are available
> by calling pthread_getaffinity_np() to narrow down detected cores before
> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> 		--avail-cores -m 1024
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> ---
>   lib/librte_eal/common/eal_common_options.c | 52 ++++++++++++++++++++++++++++++
>   lib/librte_eal/common/eal_options.h        |  2 ++
>   2 files changed, 54 insertions(+)
>
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..dc4882d 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>   	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>   	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>   	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_AVAIL_CORES,       0, NULL, OPT_AVAIL_CORES_NUM      },
>   	{0,                     0, NULL, 0                        }
>   };
>   
> @@ -681,6 +682,37 @@ err:
>   }
>   
>   static int
> +eal_parse_avail_cores(void)
> +{
> +	int i, count;
> +	pthread_t tid;
> +	rte_cpuset_t cpuset;
> +	struct rte_config *cfg = rte_eal_get_configuration();
> +
> +	tid = pthread_self();
> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cpuset) != 0)
> +		return -1;
> +
> +	for (i = 0, count = 0; i < RTE_MAX_LCORE; i++) {
> +		if (lcore_config[i].detected && !CPU_ISSET(i, &cpuset)) {
> +			RTE_LOG(DEBUG, EAL, "Flag lcore %u as undetected\n", i);
> +			lcore_config[i].detected = 0;
> +			lcore_config[i].core_index = -1;
> +			cfg->lcore_role[i] = ROLE_OFF;
> +			count++;
> +		}
> +	}
> +	cfg->lcore_count -= count;
> +	if (cfg->lcore_count == 0) {
> +		RTE_LOG(ERR, EAL, "No lcores available\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +
> +static int
>   eal_parse_syslog(const char *facility, struct internal_config *conf)
>   {
>   	int i;
> @@ -754,6 +786,10 @@ eal_parse_proc_type(const char *arg)
>   	return RTE_PROC_INVALID;
>   }
>   
> +static int param_coremask;
> +static int param_corelist;
> +static int param_coremap;
> +
>   int
>   eal_parse_common_option(int opt, const char *optarg,
>   			struct internal_config *conf)
> @@ -775,6 +811,7 @@ eal_parse_common_option(int opt, const char *optarg,
>   		break;
>   	/* coremask */
>   	case 'c':
> +		param_coremask = 1;
>   		if (eal_parse_coremask(optarg) < 0) {
>   			RTE_LOG(ERR, EAL, "invalid coremask\n");
>   			return -1;
> @@ -782,6 +819,7 @@ eal_parse_common_option(int opt, const char *optarg,
>   		break;
>   	/* corelist */
>   	case 'l':
> +		param_corelist = 1;
>   		if (eal_parse_corelist(optarg) < 0) {
>   			RTE_LOG(ERR, EAL, "invalid core list\n");
>   			return -1;
> @@ -890,12 +928,25 @@ eal_parse_common_option(int opt, const char *optarg,
>   		break;
>   	}
>   	case OPT_LCORES_NUM:
> +		param_coremap = 1;
>   		if (eal_parse_lcores(optarg) < 0) {
>   			RTE_LOG(ERR, EAL, "invalid parameter for --"
>   				OPT_LCORES "\n");
>   			return -1;
>   		}
>   		break;
> +	case OPT_AVAIL_CORES_NUM:
> +		if (param_coremask || param_corelist || param_coremap) {
> +			RTE_LOG(ERR, EAL, "should put --" OPT_AVAIL_CORES
> +				" before -c, -l and --" OPT_LCORES "\n");
> +			return -1;
> +		}
> +		if (eal_parse_avail_cores() < 0) {
> +			RTE_LOG(ERR, EAL, "failed to use --"
> +				OPT_AVAIL_CORES "\n");
> +			return -1;
> +		}
> +		break;
>   
>   	/* don't know what to do, leave this to caller */
>   	default:
> @@ -990,6 +1041,7 @@ eal_common_usage(void)
>   	       "                      ',' is used for single number separator.\n"
>   	       "                      '( )' can be omitted for single element group,\n"
>   	       "                      '@' can be omitted if cpus and lcores have the same value\n"
> +	       "  --"OPT_AVAIL_CORES"       Use pthread_getaffinity_np() to detect cores to be used\n"
>   	       "  --"OPT_MASTER_LCORE" ID   Core ID that is used as master\n"
>   	       "  -n CHANNELS         Number of memory channels\n"
>   	       "  -m MB               Memory to allocate (see also --"OPT_SOCKET_MEM")\n"
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62..b2ddea3 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>   	OPT_VMWARE_TSC_MAP_NUM,
>   #define OPT_XEN_DOM0          "xen-dom0"
>   	OPT_XEN_DOM0_NUM,
> +#define OPT_AVAIL_CORES       "avail-cores"
> +	OPT_AVAIL_CORES_NUM,
>   	OPT_LONG_MAX_NUM
>   };
>   

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
  2016-03-08  8:46     ` Tan, Jianfeng
@ 2016-05-04 11:07     ` Sergio Gonzalez Monroy
  2016-05-04 11:28       ` Tan, Jianfeng
  2016-05-04 12:25     ` Sergio Gonzalez Monroy
  2 siblings, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-04 11:07 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: david.marchand, nhorman, konstantin.ananyev

On 08/03/2016 01:42, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
> 	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

I think you should mention in the commit message that this option also 
covers the case
of hugetlbfs mount with quota.

>   
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +	return setjmp(jmpenv);
> +}

Use sigsetjmp instead of setjmp and restore the signal masks.

>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		if (fd < 0) {
>   			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>   					strerror(errno));
> -			return -1;
> +			return i;

When using --try-best, we could get an error and still work as expected.
It can be confusing for users to see an error when it is expected behavior.

Any thoughts?

>   		}
>   
>   		/* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>   					strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>   

Same comment as above

>   		/* set shared flock on the file. */
>   		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>   			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>   				__func__, strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;

Same comment as above

> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>   			continue;
>   
>   		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			if (internal_config.huge_trybest) {
> +				int pages = pages_old - pages_new;
> +
> +				internal_config.memory -=
> +					hpi->hugepage_sz * pages;
> +				nr_hugepages -= pages;
> +				hpi->num_pages[0] = pages_new;
> +				if (pages_new == 0)
> +					continue;
> +			} else
> +				goto fail;
>   		}

There is another call to map_all_hugepages that you are not updating the 
check of the return value.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-05-04 11:07     ` Sergio Gonzalez Monroy
@ 2016-05-04 11:28       ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-04 11:28 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy, dev; +Cc: david.marchand, nhorman, konstantin.ananyev

Hi Sergio,


On 5/4/2016 7:07 PM, Sergio Gonzalez Monroy wrote:
> On 08/03/2016 01:42, Jianfeng Tan wrote:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if 
>> fails
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Test example:
>>    a. cgcreate -g hugetlb:/test-subgroup
>>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>>    c. cgexec -g hugetlb:test-subgroup \
>>       ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
>
> I think you should mention in the commit message that this option also 
> covers the case
> of hugetlbfs mount with quota.

Yes, I should do that.

>
>>   +static sigjmp_buf jmpenv;
>> +
>> +static void sigbus_handler(int signo __rte_unused)
>> +{
>> +    siglongjmp(jmpenv, 1);
>> +}
>> +
>> +/* Put setjmp into a wrap method to avoid compiling error. Any 
>> non-volatile,
>> + * non-static local variable in the stack frame calling setjmp might be
>> + * clobbered by a call to longjmp.
>> + */
>> +static int wrap_setjmp(void)
>> +{
>> +    return setjmp(jmpenv);
>> +}
>
> Use sigsetjmp instead of setjmp and restore the signal masks.

The difference lies in whether signal mask will be saved for further 
restore. And you are right we should keep either sigsetjmp(xxx, 
1)/siglongjmp(xxx, 1) or setjmp()/longjmp. Nice catch! I'll go with the 
former.

>
>>   /*
>>    * Mmap all hugepages of hugepage table: it first open a file in
>>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
>> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>           if (fd < 0) {
>>               RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>>                       strerror(errno));
>> -            return -1;
>> +            return i;
>
> When using --try-best, we could get an error and still work as expected.
> It can be confusing for users to see an error when it is expected 
> behavior.
>
> Any thoughts?

Shall we remove those RTE_LOG complaints, because the failure here does 
not mean we cannot satisfy the requirements of applications?

...
> There is another call to map_all_hugepages that you are not updating 
> the check of the return value.

You are right, the second map_all_hugepages's return value check should 
be changed to compare with the total hugepages owned by the hpi. I'll 
fix this.

Thanks,
Jianfeng

>
> Sergio
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: make hugetlb initialization more robust
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
  2016-03-08  8:46     ` Tan, Jianfeng
  2016-05-04 11:07     ` Sergio Gonzalez Monroy
@ 2016-05-04 12:25     ` Sergio Gonzalez Monroy
  2 siblings, 0 replies; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-04 12:25 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: david.marchand, nhorman, konstantin.ananyev

On 08/03/2016 01:42, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
> 	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

I think you should mention in the commit message that this option also 
covers the case
of hugetlbfs mount with quota.

>
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +	return setjmp(jmpenv);
> +}

Use sigsetjmp instead of setjmp and restore the signal masks.

>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   		if (fd < 0) {
>   			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>   					strerror(errno));
> -			return -1;
> +			return i;

When using --try-best, we could get an error and still work as expected.
It can be confusing for users to see an error when it is expected behavior.

Any thoughts?

>   		}
>
>   		/* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>   			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>   					strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;
>   		}
>

Same comment as above

>   		/* set shared flock on the file. */
>   		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>   			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>   				__func__, strerror(errno));
>   			close(fd);
> -			return -1;
> +			return i;

Same comment as above

> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>   			continue;
>
>   		/* map all hugepages available */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -					(unsigned)(hpi->hugepage_sz / 0x100000));
> -			goto fail;
> +		pages_old = hpi->num_pages[0];
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		if (pages_new < pages_old) {
> +			RTE_LOG(DEBUG, EAL,
> +				"%d not %d hugepages of size %u MB allocated\n",
> +				pages_new, pages_old,
> +				(unsigned)(hpi->hugepage_sz / 0x100000));
> +			if (internal_config.huge_trybest) {
> +				int pages = pages_old - pages_new;
> +
> +				internal_config.memory -=
> +					hpi->hugepage_sz * pages;
> +				nr_hugepages -= pages;
> +				hpi->num_pages[0] = pages_new;
> +				if (pages_new == 0)
> +					continue;
> +			} else
> +				goto fail;
>   		}

There is another call to map_all_hugepages that you are not updating the 
check of the return value.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v3] eal: make hugetlb initialization more robust
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
@ 2016-05-09 10:48   ` Jianfeng Tan
  2016-05-10  8:54     ` Sergio Gonzalez Monroy
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
  2 siblings, 1 reply; 63+ messages in thread
From: Jianfeng Tan @ 2016-05-09 10:48 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, sergio.gonzalez.monroy, nhorman, Jianfeng Tan

This patch adds an option, --huge-trybest, to use a recover mechanism to
the case that there are not so many hugepages (declared in sysfs), which
can be used. It relies on a mem access to fault-in hugepages, and if it fails
with SIGBUS, recover to previously saved stack environment with
siglongjmp().

Besides, this solution fixes an issue when hugetlbfs is specified with an
option of size. Currently DPDK does not respect the quota of a hugetlbfs
mount. It fails to init the EAL because it tries to map the number of free
hugepages in the system rather than using the number specified in the quota
for that mount.

It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
fail to remap hugepages as it relies on having mapped all free hugepages
in the system.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 +
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 115 +++++++++++++++++++++++++----
 5 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 3efc90f..e9a111d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -899,6 +900,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..eeb1d4e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..cb0df76 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_sigsetjmp(void)
+{
+	return sigsetjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		int pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1255,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if ((uint32_t)map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1271,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1460,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: make hugetlb initialization more robust
  2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
@ 2016-05-10  8:54     ` Sergio Gonzalez Monroy
  2016-05-10  9:11       ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-10  8:54 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: david.marchand, nhorman


Hi Jianfeng,

On 09/05/2016 11:48, Jianfeng Tan wrote:

>   		/* find physical addresses and sockets for each hugepage */
> @@ -1172,8 +1255,9 @@ rte_eal_hugepage_init(void)
>   		hp_offset += new_pages_count[i];
>   #else
>   		/* remap all hugepages */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
> -			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
> +		if ((uint32_t)map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
> +		    hpi->num_pages[0]) {

It probably makes more sense to have map_all_hugepages return uint32_t 
instead.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: make hugetlb initialization more robust
  2016-05-10  8:54     ` Sergio Gonzalez Monroy
@ 2016-05-10  9:11       ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-10  9:11 UTC (permalink / raw)
  To: Gonzalez Monroy, Sergio, dev; +Cc: david.marchand, nhorman

Hi Sergio,

> -----Original Message-----
> From: Gonzalez Monroy, Sergio
> Sent: Tuesday, May 10, 2016 4:55 PM
> To: Tan, Jianfeng; dev@dpdk.org
> Cc: david.marchand@6wind.com; nhorman@tuxdriver.com
> Subject: Re: [PATCH v3] eal: make hugetlb initialization more robust
> 
> 
> Hi Jianfeng,
> 
> On 09/05/2016 11:48, Jianfeng Tan wrote:
> 
> >   		/* find physical addresses and sockets for each hugepage */
> > @@ -1172,8 +1255,9 @@ rte_eal_hugepage_init(void)
> >   		hp_offset += new_pages_count[i];
> >   #else
> >   		/* remap all hugepages */
> > -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
> > -			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB
> pages\n",
> > +		if ((uint32_t)map_all_hugepages(&tmp_hp[hp_offset], hpi,
> 0) !=
> > +		    hpi->num_pages[0]) {
> 
> It probably makes more sense to have map_all_hugepages return uint32_t
> instead.

Yes, I agree. I was wrongly expecting there to be a FreeBSD version of map_all_hugepages with the same function type.

I'll fix this in next version.

Thanks,
Jianfeng

> 
> Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v4] eal: make hugetlb initialization more robust
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
  2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
  2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
@ 2016-05-12  0:44   ` Jianfeng Tan
  2016-05-17 16:39     ` David Marchand
  2016-05-17 16:40     ` Thomas Monjalon
  2 siblings, 2 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-05-12  0:44 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, sergio.gonzalez.monroy, nhorman, Jianfeng Tan

This patch adds an option, --huge-trybest, to use a recovery mechanism for
the case that there are not as many usable hugepages as declared in sysfs.
It relies on a memory access to fault in hugepages, and if that fails
with SIGBUS, recovers to the previously saved stack environment with
siglongjmp().

Besides, this solution fixes an issue when hugetlbfs is specified with a
size option. Currently DPDK does not respect the quota of a hugetlbfs
mount. It fails to init the EAL because it tries to map the number of free
hugepages in the system rather than using the number specified in the quota
for that mount.

It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
fail to remap hugepages as it relies on having mapped all free hugepages
in the system.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
	  ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/common/eal_common_options.c |   4 +
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 118 +++++++++++++++++++++++++----
 5 files changed, 112 insertions(+), 14 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 3efc90f..e9a111d 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -899,6 +900,9 @@ eal_parse_common_option(int opt, const char *optarg,
 			return -1;
 		}
 		break;
+	case OPT_HUGE_TRYBEST_NUM:
+		internal_config.huge_trybest = 1;
+		break;
 
 	/* don't know what to do, leave this to caller */
 	default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
 	volatile unsigned force_nchannel; /**< force number of channels */
 	volatile unsigned force_nrank;    /**< force number of ranks */
 	volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+	volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
 	unsigned hugepage_unlink;         /**< true to unlink backing files */
 	volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/
 	volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+	OPT_HUGE_TRYBEST_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..eeb1d4e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
 	       "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
 	       "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
 	       "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without hugetlbfs\n"
+	       "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
 	       "\n");
 	/* Allow the application to print its usage message too if hook is set */
 	if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..8c77010 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_sigsetjmp(void)
+{
+	return sigsetjmp(jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig && internal_config.huge_trybest) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,33 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = sigbus_handler;
+
+	need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+	if (need_recover) {
+		sigaction(SIGBUS, &action_old, NULL);
+		need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1188,12 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	if (internal_config.huge_trybest)
+		register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1207,24 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
-			goto fail;
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+			if (internal_config.huge_trybest) {
+				int pages = pages_old - pages_new;
+
+				internal_config.memory -=
+					hpi->hugepage_sz * pages;
+				nr_hugepages -= pages;
+				hpi->num_pages[0] = pages_new;
+				if (pages_new == 0)
+					continue;
+			} else
+				goto fail;
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1256,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1272,9 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	if (internal_config.huge_trybest)
+		recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1461,8 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	if (internal_config.huge_trybest)
+		recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
@ 2016-05-17 16:39     ` David Marchand
  2016-05-18  7:56       ` Sergio Gonzalez Monroy
  2016-05-19  2:00       ` Tan, Jianfeng
  2016-05-17 16:40     ` Thomas Monjalon
  1 sibling, 2 replies; 63+ messages in thread
From: David Marchand @ 2016-05-17 16:39 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, Sergio Gonzalez Monroy, Neil Horman

Hello Jianfeng,

On Thu, May 12, 2016 at 2:44 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails
> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
>
> Besides, this solution fixes an issue when hugetlbfs is specified with an
> option of size. Currently DPDK does not respect the quota of a hugetblfs
> mount. It fails to init the EAL because it tries to map the number of free
> hugepages in the system rather than using the number specified in the quota
> for that mount.
>
> It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
> this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
> fail to remap hugepages as it relies on having mapped all free hugepages
> in the system.

For such a case, maybe having a warning log message when it
fails would help the user.
+ a known issue in the release notes?


> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..8c77010 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>                         hugepg_tbl[i].final_va = virtaddr;
>                 }
>
> +               if (orig && internal_config.huge_trybest) {
> +                       /* In linux, hugetlb limitations, like cgroup, are
> +                        * enforced at fault time instead of mmap(), even
> +                        * with the option of MAP_POPULATE. Kernel will send
> +                        * a SIGBUS signal. To avoid to be killed, save stack
> +                        * environment here, if SIGBUS happens, we can jump
> +                        * back here.
> +                        */
> +                       if (wrap_sigsetjmp()) {
> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
> +                                       "hugepages of size %u MB\n",
> +                                       (unsigned)(hugepage_sz / 0x100000));
> +                               munmap(virtaddr, hugepage_sz);
> +                               close(fd);
> +                               unlink(hugepg_tbl[i].filepath);
> +                               return i;
> +                       }
> +                       *(int *)virtaddr = 0;
> +               }
> +
> +
>                 /* set shared flock on the file. */
>                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
> -                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
> +                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>                                 __func__, strerror(errno));
>                         close(fd);
> -                       return -1;
> +                       return i;
>                 }
>
>                 close(fd);

Maybe I missed something, but we are writing into some hugepage before
the flock has been called.
Are we sure there is nobody else using this hugepage ?

Especially, can't this cause trouble to a primary process running if
we start the exact same primary process ?


-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
  2016-05-17 16:39     ` David Marchand
@ 2016-05-17 16:40     ` Thomas Monjalon
  2016-05-18  8:06       ` Sergio Gonzalez Monroy
  1 sibling, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-05-17 16:40 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, sergio.gonzalez.monroy, nhorman

2016-05-12 00:44, Jianfeng Tan:
> This patch adds an option, --huge-trybest, to use a recover mechanism to
> the case that there are not so many hugepages (declared in sysfs), which
> can be used. It relys on a mem access to fault-in hugepages, and if fails

relys -> relies

> with SIGBUS, recover to previously saved stack environment with
> siglongjmp().
> 
> Besides, this solution fixes an issue when hugetlbfs is specified with an
> option of size. Currently DPDK does not respect the quota of a hugetblfs
> mount. It fails to init the EAL because it tries to map the number of free
> hugepages in the system rather than using the number specified in the quota
> for that mount.

It looks to be a bug. Why adding an option?
What is the benefit of the old behaviour, not using --try-best?

> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +	siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling sigsetjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_sigsetjmp(void)
> +{
> +	return sigsetjmp(jmpenv, 1);
> +}

Please add the word "huge" to these variables and functions.

> +static struct sigaction action_old;
> +static int need_recover;
> +
> +static void
> +register_sigbus(void)
> +{
> +	sigset_t mask;
> +	struct sigaction action;
> +
> +	sigemptyset(&mask);
> +	sigaddset(&mask, SIGBUS);
> +	action.sa_flags = 0;
> +	action.sa_mask = mask;
> +	action.sa_handler = sigbus_handler;
> +
> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +	if (need_recover) {
> +		sigaction(SIGBUS, &action_old, NULL);
> +		need_recover = 0;
> +	}
> +}

Idem, Please add the word "huge".

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-17 16:39     ` David Marchand
@ 2016-05-18  7:56       ` Sergio Gonzalez Monroy
  2016-05-18  9:34         ` David Marchand
  2016-05-19  2:00       ` Tan, Jianfeng
  1 sibling, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-18  7:56 UTC (permalink / raw)
  To: David Marchand, Jianfeng Tan; +Cc: dev, Neil Horman

On 17/05/2016 17:39, David Marchand wrote:
> Hello Jianfeng,
>
> On Thu, May 12, 2016 at 2:44 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if fails
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>> mount. It fails to init the EAL because it tries to map the number of free
>> hugepages in the system rather than using the number specified in the quota
>> for that mount.
>>
>> It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
>> this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
>> fail to remap hugepages as it relies on having mapped all free hugepages
>> in the system.
> For such a case case, maybe having some warning log message when it
> fails would help the user.
> + a known issue in the release notes ?
>
>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index 5b9132c..8c77010 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>                          hugepg_tbl[i].final_va = virtaddr;
>>                  }
>>
>> +               if (orig && internal_config.huge_trybest) {
>> +                       /* In linux, hugetlb limitations, like cgroup, are
>> +                        * enforced at fault time instead of mmap(), even
>> +                        * with the option of MAP_POPULATE. Kernel will send
>> +                        * a SIGBUS signal. To avoid to be killed, save stack
>> +                        * environment here, if SIGBUS happens, we can jump
>> +                        * back here.
>> +                        */
>> +                       if (wrap_sigsetjmp()) {
>> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
>> +                                       "hugepages of size %u MB\n",
>> +                                       (unsigned)(hugepage_sz / 0x100000));
>> +                               munmap(virtaddr, hugepage_sz);
>> +                               close(fd);
>> +                               unlink(hugepg_tbl[i].filepath);
>> +                               return i;
>> +                       }
>> +                       *(int *)virtaddr = 0;
>> +               }
>> +
>> +
>>                  /* set shared flock on the file. */
>>                  if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>> -                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>> +                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>>                                  __func__, strerror(errno));
>>                          close(fd);
>> -                       return -1;
>> +                       return i;
>>                  }
>>
>>                  close(fd);
> Maybe I missed something, but we are writing into some hugepage before
> the flock has been called.
> Are we sure there is nobody else using this hugepage ?
>
> Especially, can't this cause trouble to a primary process running if
> we start the exact same primary process ?
>

We lock the hugepage directory during eal_hugepage_info_init(), and we 
do not unlock
until we have finished eal_memory_init.

I think that takes care of that case.

Sergio

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-17 16:40     ` Thomas Monjalon
@ 2016-05-18  8:06       ` Sergio Gonzalez Monroy
  2016-05-18  9:38         ` David Marchand
  2016-05-19  2:11         ` Tan, Jianfeng
  0 siblings, 2 replies; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-05-18  8:06 UTC (permalink / raw)
  To: Thomas Monjalon, Jianfeng Tan; +Cc: dev, david.marchand, nhorman

On 17/05/2016 17:40, Thomas Monjalon wrote:
> 2016-05-12 00:44, Jianfeng Tan:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if fails
> relys -> relies
>
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>> mount. It fails to init the EAL because it tries to map the number of free
>> hugepages in the system rather than using the number specified in the quota
>> for that mount.
> It looks to be a bug. Why adding an option?
> What is the benefit of the old behaviour, not using --try-best?

I do not see any benefit to the old behavior.
Given that we need the signal handling for the cgroup use case, I would 
be inclined to use
this method as the default instead of trying to figure out how many 
hugepages we have free, etc.

Thoughts?

Sergio

>> +static sigjmp_buf jmpenv;
>> +
>> +static void sigbus_handler(int signo __rte_unused)
>> +{
>> +	siglongjmp(jmpenv, 1);
>> +}
>> +
>> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
>> + * non-static local variable in the stack frame calling sigsetjmp might be
>> + * clobbered by a call to longjmp.
>> + */
>> +static int wrap_sigsetjmp(void)
>> +{
>> +	return sigsetjmp(jmpenv, 1);
>> +}
> Please add the word "huge" to these variables and functions.
>
>> +static struct sigaction action_old;
>> +static int need_recover;
>> +
>> +static void
>> +register_sigbus(void)
>> +{
>> +	sigset_t mask;
>> +	struct sigaction action;
>> +
>> +	sigemptyset(&mask);
>> +	sigaddset(&mask, SIGBUS);
>> +	action.sa_flags = 0;
>> +	action.sa_mask = mask;
>> +	action.sa_handler = sigbus_handler;
>> +
>> +	need_recover = !sigaction(SIGBUS, &action, &action_old);
>> +}
>> +
>> +static void
>> +recover_sigbus(void)
>> +{
>> +	if (need_recover) {
>> +		sigaction(SIGBUS, &action_old, NULL);
>> +		need_recover = 0;
>> +	}
>> +}
> Idem, Please add the word "huge".
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-18  7:56       ` Sergio Gonzalez Monroy
@ 2016-05-18  9:34         ` David Marchand
  0 siblings, 0 replies; 63+ messages in thread
From: David Marchand @ 2016-05-18  9:34 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy; +Cc: Jianfeng Tan, dev, Neil Horman

Hello Sergio,

On Wed, May 18, 2016 at 9:56 AM, Sergio Gonzalez Monroy
<sergio.gonzalez.monroy@intel.com> wrote:
> On 17/05/2016 17:39, David Marchand wrote:
>>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> index 5b9132c..8c77010 100644
>>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>>                          hugepg_tbl[i].final_va = virtaddr;
>>>                  }
>>>
>>> +               if (orig && internal_config.huge_trybest) {
>>> +                       /* In linux, hugetlb limitations, like cgroup,
>>> are
>>> +                        * enforced at fault time instead of mmap(), even
>>> +                        * with the option of MAP_POPULATE. Kernel will
>>> send
>>> +                        * a SIGBUS signal. To avoid to be killed, save
>>> stack
>>> +                        * environment here, if SIGBUS happens, we can
>>> jump
>>> +                        * back here.
>>> +                        */
>>> +                       if (wrap_sigsetjmp()) {
>>> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap
>>> more "
>>> +                                       "hugepages of size %u MB\n",
>>> +                                       (unsigned)(hugepage_sz /
>>> 0x100000));
>>> +                               munmap(virtaddr, hugepage_sz);
>>> +                               close(fd);
>>> +                               unlink(hugepg_tbl[i].filepath);
>>> +                               return i;
>>> +                       }
>>> +                       *(int *)virtaddr = 0;
>>> +               }
>>> +
>>> +
>>>                  /* set shared flock on the file. */
>>>                  if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>>> -                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s
>>> \n",
>>> +                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s
>>> \n",
>>>                                  __func__, strerror(errno));
>>>                          close(fd);
>>> -                       return -1;
>>> +                       return i;
>>>                  }
>>>
>>>                  close(fd);
>>
>> Maybe I missed something, but we are writing into some hugepage before
>> the flock has been called.
>> Are we sure there is nobody else using this hugepage ?
>>
>> Especially, can't this cause trouble to a primary process running if
>> we start the exact same primary process ?
>>
>
> We lock the hugepage directory during eal_hugepage_info_init(), and we do
> not unlock
> until we have finished eal_memory_init.
>
> I think that takes care of that case.

Yes, thanks.

-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-18  8:06       ` Sergio Gonzalez Monroy
@ 2016-05-18  9:38         ` David Marchand
  2016-05-19  2:11         ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: David Marchand @ 2016-05-18  9:38 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy; +Cc: Thomas Monjalon, Jianfeng Tan, dev, Neil Horman

On Wed, May 18, 2016 at 10:06 AM, Sergio Gonzalez Monroy
<sergio.gonzalez.monroy@intel.com> wrote:
> On 17/05/2016 17:40, Thomas Monjalon wrote:
>>
>> 2016-05-12 00:44, Jianfeng Tan:
>>>
>>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>>> the case that there are not so many hugepages (declared in sysfs), which
>>> can be used. It relys on a mem access to fault-in hugepages, and if fails
>>
>> relys -> relies
>>
>>> with SIGBUS, recover to previously saved stack environment with
>>> siglongjmp().
>>>
>>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>>> mount. It fails to init the EAL because it tries to map the number of
>>> free
>>> hugepages in the system rather than using the number specified in the
>>> quota
>>> for that mount.
>>
>> It looks to be a bug. Why adding an option?
>> What is the benefit of the old behaviour, not using --try-best?
>
>
> I do not see any benefit to the old behavior.
> Given that we need the signal handling for the cgroup use case, I would be
> inclined to use
> this method as the default instead of trying to figure out how many
> hugepages we have free, etc.

+1


-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-03-09 13:05       ` Panu Matilainen
  2016-03-09 13:53         ` Tan, Jianfeng
@ 2016-05-18 12:46         ` David Marchand
  2016-05-19  2:25           ` Tan, Jianfeng
  1 sibling, 1 reply; 63+ messages in thread
From: David Marchand @ 2016-05-18 12:46 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, Panu Matilainen

Hello Jianfeng,

On Wed, Mar 9, 2016 at 2:05 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>
>> Hi Panu,
>>
>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>
>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>
>>>> This patch adds option, --avail-cores, to use lcores which are available
>>>> by calling pthread_getaffinity_np() to narrow down detected cores before
>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>
>>>> Test example:
>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>         --avail-cores -m 1024
>>>>
>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>
>>>
>>> Hmm, to me this sounds like something that should be done always so
>>> there's no need for an option. Or if there's a chance it might do the
>>> wrong thing in some rare circumstance then perhaps there should be a
>>> disabler option instead?
>>
>>
>> Thanks for comments.
>>
>> Yes, there's a use case that we cannot handle.
>>
>> If we make it as default, DPDK applications may fail to start, when user
>> specifies a core in isolcpus and its parent process (say bash) has a
>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>> just blindly do pthread_setaffinity_np() and it always succeeds because
>> it always has root privilege to change any cpu affinity.
>>
>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>> flagged as undetected (in my older implementation) and leads to failure.
>> To make it correct, we would always add "taskset mask" (or other ways)
>> before DPDK application cmd lines.
>>
>> What do you think?
>
>
> I still think it sounds like something that should be done by default and
> maybe be overridable with some flag, rather than the other way around.
> Another alternative might be detecting the cores always but if running as
> root, override but with a warning.
>
> But I dont know, just wondering. To look at it from another angle: why would
> somebody use this new --avail-cores option and in what situation, if things
> "just work" otherwise anyway?

+1 and I don't even see why we should have an option to disable this,
since taskset would do the job.

Looking at your special case, if the user did set an isolcpus option
for another use, with no -c/-l, I understand the dpdk application
won't care too much about it.
So, this seems like somehow rude to the rest of the system and unwanted.

We can still help the user starting its application as root (without
taskset) by adding a warning message if a requested cpu (-c / -l ..)
is not part of the available cpus.


-- 
David Marchand

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-17 16:39     ` David Marchand
  2016-05-18  7:56       ` Sergio Gonzalez Monroy
@ 2016-05-19  2:00       ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-19  2:00 UTC (permalink / raw)
  To: David Marchand; +Cc: dev, Sergio Gonzalez Monroy, Neil Horman

Hi David,


On 5/18/2016 12:39 AM, David Marchand wrote:
> Hello Jianfeng,
>
> On Thu, May 12, 2016 at 2:44 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
>> This patch adds an option, --huge-trybest, to use a recover mechanism to
>> the case that there are not so many hugepages (declared in sysfs), which
>> can be used. It relys on a mem access to fault-in hugepages, and if fails
>> with SIGBUS, recover to previously saved stack environment with
>> siglongjmp().
>>
>> Besides, this solution fixes an issue when hugetlbfs is specified with an
>> option of size. Currently DPDK does not respect the quota of a hugetblfs
>> mount. It fails to init the EAL because it tries to map the number of free
>> hugepages in the system rather than using the number specified in the quota
>> for that mount.
>>
>> It's still an open issue with CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. Under
>> this case (such as IVSHMEM target), having hugetlbfs mounts with quota will
>> fail to remap hugepages as it relies on having mapped all free hugepages
>> in the system.
>
>
>
>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> index 5b9132c..8c77010 100644
>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>                          hugepg_tbl[i].final_va = virtaddr;
>>                  }
>>
>> +               if (orig && internal_config.huge_trybest) {
>> +                       /* In linux, hugetlb limitations, like cgroup, are
>> +                        * enforced at fault time instead of mmap(), even
>> +                        * with the option of MAP_POPULATE. Kernel will send
>> +                        * a SIGBUS signal. To avoid to be killed, save stack
>> +                        * environment here, if SIGBUS happens, we can jump
>> +                        * back here.
>> +                        */
>> +                       if (wrap_sigsetjmp()) {
>> +                               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
>> +                                       "hugepages of size %u MB\n",
>> +                                       (unsigned)(hugepage_sz / 0x100000));
> For such a case case, maybe having some warning log message when it
> fails would help the user.
> + a known issue in the release notes ?

Do you mean when sigbus is triggered, like here, warn the user that "it 
fails to hold all free hugepages as sysfs shows", and
#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
/*we need to return error from rte_eal_init_memory */
#endif

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: make hugetlb initialization more robust
  2016-05-18  8:06       ` Sergio Gonzalez Monroy
  2016-05-18  9:38         ` David Marchand
@ 2016-05-19  2:11         ` Tan, Jianfeng
  1 sibling, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-19  2:11 UTC (permalink / raw)
  To: Sergio Gonzalez Monroy, Thomas Monjalon; +Cc: dev, david.marchand, nhorman

Hi Thomas & Sergio,


On 5/18/2016 4:06 PM, Sergio Gonzalez Monroy wrote:
> On 17/05/2016 17:40, Thomas Monjalon wrote:
>> 2016-05-12 00:44, Jianfeng Tan:
>>> This patch adds an option, --huge-trybest, to use a recover 
>>> mechanism to
>>> the case that there are not so many hugepages (declared in sysfs), 
>>> which
>>> can be used. It relys on a mem access to fault-in hugepages, and if 
>>> fails
>> relys -> relies
>>
>>> with SIGBUS, recover to previously saved stack environment with
>>> siglongjmp().
>>>
>>> Besides, this solution fixes an issue when hugetlbfs is specified 
>>> with an
>>> option of size. Currently DPDK does not respect the quota of a 
>>> hugetblfs
>>> mount. It fails to init the EAL because it tries to map the number 
>>> of free
>>> hugepages in the system rather than using the number specified in 
>>> the quota
>>> for that mount.
>> It looks to be a bug. Why adding an option?
>> What is the benefit of the old behaviour, not using --try-best?
>
> I do not see any benefit to the old behavior.
> Given that we need the signal handling for the cgroup use case, I 
> would be inclined to use
> this method as the default instead of trying to figure out how many 
> hugepages we have free, etc.
>
> Thoughts?

I tend to use this method as the default too, with some warning logs as 
suggested by David, and return error from rte_eal_memory() when sigbus 
is triggered under the case of RTE_EAL_SINGLE_FILE_SEGMENTS.

Thomas, all other trivial issues will be fixed in next version. Thank you!

Thanks,
Jianfeng

>
> Sergio
>
>>> +static sigjmp_buf jmpenv;
>>> +
>>> +static void sigbus_handler(int signo __rte_unused)
>>> +{
>>> +    siglongjmp(jmpenv, 1);
>>> +}
>>> +
>>> +/* Put setjmp into a wrap method to avoid compiling error. Any 
>>> non-volatile,
>>> + * non-static local variable in the stack frame calling sigsetjmp 
>>> might be
>>> + * clobbered by a call to longjmp.
>>> + */
>>> +static int wrap_sigsetjmp(void)
>>> +{
>>> +    return sigsetjmp(jmpenv, 1);
>>> +}
>> Please add the word "huge" to these variables and functions.
>>
>>> +static struct sigaction action_old;
>>> +static int need_recover;
>>> +
>>> +static void
>>> +register_sigbus(void)
>>> +{
>>> +    sigset_t mask;
>>> +    struct sigaction action;
>>> +
>>> +    sigemptyset(&mask);
>>> +    sigaddset(&mask, SIGBUS);
>>> +    action.sa_flags = 0;
>>> +    action.sa_mask = mask;
>>> +    action.sa_handler = sigbus_handler;
>>> +
>>> +    need_recover = !sigaction(SIGBUS, &action, &action_old);
>>> +}
>>> +
>>> +static void
>>> +recover_sigbus(void)
>>> +{
>>> +    if (need_recover) {
>>> +        sigaction(SIGBUS, &action_old, NULL);
>>> +        need_recover = 0;
>>> +    }
>>> +}
>> Idem, Please add the word "huge".
>>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-05-18 12:46         ` David Marchand
@ 2016-05-19  2:25           ` Tan, Jianfeng
  2016-06-30 13:43             ` Thomas Monjalon
  0 siblings, 1 reply; 63+ messages in thread
From: Tan, Jianfeng @ 2016-05-19  2:25 UTC (permalink / raw)
  To: David Marchand; +Cc: dev, Panu Matilainen

Hi David,


On 5/18/2016 8:46 PM, David Marchand wrote:
> Hello Jianfeng,
>
> On Wed, Mar 9, 2016 at 2:05 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
>> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
>>> Hi Panu,
>>>
>>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
>>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
>>>>> This patch adds option, --avail-cores, to use lcores which are available
>>>>> by calling pthread_getaffinity_np() to narrow down detected cores before
>>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
>>>>>
>>>>> Test example:
>>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
>>>>>          --avail-cores -m 1024
>>>>>
>>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
>>>>
>>>> Hmm, to me this sounds like something that should be done always so
>>>> there's no need for an option. Or if there's a chance it might do the
>>>> wrong thing in some rare circumstance then perhaps there should be a
>>>> disabler option instead?
>>>
>>> Thanks for comments.
>>>
>>> Yes, there's a use case that we cannot handle.
>>>
>>> If we make it as default, DPDK applications may fail to start, when user
>>> specifies a core in isolcpus and its parent process (say bash) has a
>>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
>>> just blindly do pthread_setaffinity_np() and it always succeeds because
>>> it always has root privilege to change any cpu affinity.
>>>
>>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
>>> flagged as undetected (in my older implementation) and leads to failure.
>>> To make it correct, we would always add "taskset mask" (or other ways)
>>> before DPDK application cmd lines.
>>>
>>> What do you think?
>>
>> I still think it sounds like something that should be done by default and
>> maybe be overridable with some flag, rather than the other way around.
>> Another alternative might be detecting the cores always but if running as
>> root, override but with a warning.
>>
>> But I dont know, just wondering. To look at it from another angle: why would
>> somebody use this new --avail-cores option and in what situation, if things
>> "just work" otherwise anyway?
> +1 and I don't even see why we should have an option to disable this,
> since taskset would do the job.
>
> Looking at your special case, if the user did set an isolcpus option
> for another use, with no -c/-l, I understand the dpdk application
> won't care too much about it.
> So, this seems like somehow rude to the rest of the system and unwanted.

The case you mentioned above is not the case I mean. But you make your 
point about this one.
The case I originally mean: user sets an isolcpus option for DPDK 
applications. Originally, DPDK apps would be started without any 
problem. But for now, fail to start them because the required cores are 
excluded before -c/-l. As per your comments following, we can add a 
warning message (or should we quit on this situation?). But it indeed 
has an effect on old users (they should change to use "taskset 
./dpdk_app ..."). Do you think it's a problem?

Thanks,
Jianfeng


>
> We can still help the user starting its application as root (without
> taskset) by adding a warning message if a requested cpu (-c / -l ..)
> is not part of the available cpus.
>
>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v5] eal: fix allocating all free hugepages
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (3 preceding siblings ...)
  2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
@ 2016-05-31  3:37 ` Jianfeng Tan
  2016-06-06  2:49   ` Pei, Yulong
  2016-06-08 11:27   ` Sergio Gonzalez Monroy
  2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  6 siblings, 2 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-05-31  3:37 UTC (permalink / raw)
  To: dev
  Cc: sergio.gonzalez.monroy, nhorman, david.marchand, thomas.monjalon,
	Jianfeng Tan

EAL memory init allocates all free hugepages of the whole system,
which seen from sysfs, even when applications do not ask so many.
When there is a limitation on how many hugepages an application can
use (such as cgroup.hugetlb), or hugetlbfs is specified with an
option of size (exceeding the quota of the fs), it just fails to
start even when there are enough hugepages allocated.

To fix above issue, this patch:
 - Changes the logic to continue memory init to see if hugetlb
   requirement of application can be addressed by already allocated
   hugepages.
 - To make sure each hugepage is allocated successfully, we add a
   recover mechanism, which relies on a mem access to fault-in
   hugepages, and if it fails with SIGBUS, recover to previously
   saved stack environment with siglongjmp().

For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by
default when compiling IVSHMEM target), it's indispensable to
map all free hugepages in the system. Under this case, it fails
to start when allocating fails.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
          ./examples/helloworld/build/helloworld -c 0x2 -n 4

       
Fixes: af75078fece ("first public release")

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v5:
 - Make this method as default instead of using an option.
 - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
   just return error.
 - Add prefix "huge_" to newly added function and static variables.
 - Move the internal_config.memory assignment after the page allocations.
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
 lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
 2 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..4a8dfbd 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -465,24 +465,6 @@ eal_parse_vfio_intr(const char *mode)
 	return -1;
 }
 
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
-	uint64_t size = 0;
-	unsigned i, j;
-
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL) {
-			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-				size += hpi->hugepage_sz * hpi->num_pages[j];
-			}
-		}
-	}
-
-	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
 /* Parse the arguments for --log-level only */
 static void
 eal_log_level_parse(int argc, char **argv)
@@ -766,8 +748,6 @@ rte_eal_init(int argc, char **argv)
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
 		if (internal_config.no_hugetlbfs)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
-		else
-			internal_config.memory = eal_get_hugepage_mem_size();
 	}
 
 	if (internal_config.vmware_tsc_map == 1) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..dc6f49b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (huge_wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,51 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+	uint64_t size = 0;
+	unsigned i, j;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+		if (hpi->hugedir != NULL) {
+			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+				size += hpi->hugepage_sz * hpi->num_pages[j];
+			}
+		}
+	}
+
+	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1206,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	huge_register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1224,28 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			RTE_LOG(ERR, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
+#else
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+
+			int pages = pages_old - pages_new;
+
+			nr_hugepages -= pages;
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
+#endif
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1277,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1293,11 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	huge_recover_sigbus();
+
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+		internal_config.memory = eal_get_hugepage_mem_size();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1484,7 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	huge_recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] eal: fix allocating all free hugepages
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
@ 2016-06-06  2:49   ` Pei, Yulong
  2016-06-08 11:27   ` Sergio Gonzalez Monroy
  1 sibling, 0 replies; 63+ messages in thread
From: Pei, Yulong @ 2016-06-06  2:49 UTC (permalink / raw)
  To: Tan, Jianfeng, dev
  Cc: Gonzalez Monroy, Sergio, nhorman, david.marchand,
	thomas.monjalon, Tan, Jianfeng

Tested-by: Yulong Pei <Yulong.pei@intel.com>

1. Run dpdk app with multiple mount points, it works as expected.
2. Create new cgroup with limited hugepages like the following, and Run dpdk app with the newly created cgroup, it works as expected.

#cgcreate -g hugetlb:/test-subgroup
# cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
# cgexec -g hugetlb:test-subgroup ./x86_64-native-linuxapp-gcc/app/testpmd -c 0x3 -n 4 -- -i

Best Regards
Yulong Pei

-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jianfeng Tan
Sent: Tuesday, May 31, 2016 11:37 AM
To: dev@dpdk.org
Cc: Gonzalez Monroy, Sergio <sergio.gonzalez.monroy@intel.com>; nhorman@tuxdriver.com; david.marchand@6wind.com; thomas.monjalon@6wind.com; Tan, Jianfeng <jianfeng.tan@intel.com>
Subject: [dpdk-dev] [PATCH v5] eal: fix allocating all free hugepages

EAL memory init allocates all free hugepages of the whole system, which seen from sysfs, even when applications do not ask so many.
When there is a limitation on how many hugepages an application can use (such as cgroup.hugetlb), or hugetlbfs is specified with an option of size (exceeding the quota of the fs), it just fails to start even when there are enough hugepages allocated.

To fix above issue, this patch:
 - Changes the logic to continue memory init to see if hugetlb
   requirement of application can be addressed by already allocated
   hugepages.
 - To make sure each hugepage is allocated successfully, we add a
   recover mechanism, which relies on a mem access to fault-in
   hugepages, and if it fails with SIGBUS, recover to previously
   saved stack environment with siglongjmp().

For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by default when compiling IVSHMEM target), it's indispensable to map all free hugepages in the system. Under this case, it fails to start when allocating fails.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
          ./examples/helloworld/build/helloworld -c 0x2 -n 4

       
Fixes: af75078fece ("first public release")

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v5:
 - Make this method as default instead of using an option.
 - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
   just return error.
 - Add prefix "huge_" to newly added function and static variables.
 - Move the internal_config.memory assignment after the page allocations.
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compiling error by move setjmp into a wrap method.

 lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
 lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
 2 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..4a8dfbd 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -465,24 +465,6 @@ eal_parse_vfio_intr(const char *mode)
 	return -1;
 }
 
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
-	uint64_t size = 0;
-	unsigned i, j;
-
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL) {
-			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-				size += hpi->hugepage_sz * hpi->num_pages[j];
-			}
-		}
-	}
-
-	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
 /* Parse the arguments for --log-level only */  static void  eal_log_level_parse(int argc, char **argv) @@ -766,8 +748,6 @@ rte_eal_init(int argc, char **argv)
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
 		if (internal_config.no_hugetlbfs)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
-		else
-			internal_config.memory = eal_get_hugepage_mem_size();
 	}
 
 	if (internal_config.vmware_tsc_map == 1) { diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..dc6f49b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused) {
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any 
+non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might 
+be
+ * clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the @@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables, @@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (huge_wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,51 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+	uint64_t size = 0;
+	unsigned i, j;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+		if (hpi->hugedir != NULL) {
+			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+				size += hpi->hugepage_sz * hpi->num_pages[j];
+			}
+		}
+	}
+
+	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; }
+
+static struct sigaction huge_action_old; static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); }
+
+static void
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1206,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	huge_register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1224,28 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			RTE_LOG(ERR, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
+#else
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+
+			int pages = pages_old - pages_new;
+
+			nr_hugepages -= pages;
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
+#endif
 		}
 
 		/* find physical addresses and sockets for each hugepage */ @@ -1172,8 +1277,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1293,11 @@ rte_eal_hugepage_init(void)  #endif
 	}
 
+	huge_recover_sigbus();
+
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+		internal_config.memory = eal_get_hugepage_mem_size();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { @@ -1373,6 +1484,7 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	huge_recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
--
2.1.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] eal: fix allocating all free hugepages
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
  2016-06-06  2:49   ` Pei, Yulong
@ 2016-06-08 11:27   ` Sergio Gonzalez Monroy
  2016-06-30 13:34     ` Thomas Monjalon
  1 sibling, 1 reply; 63+ messages in thread
From: Sergio Gonzalez Monroy @ 2016-06-08 11:27 UTC (permalink / raw)
  To: Jianfeng Tan, dev; +Cc: nhorman, david.marchand, thomas.monjalon

On 31/05/2016 04:37, Jianfeng Tan wrote:
> EAL memory init allocates all free hugepages of the whole system,
> which seen from sysfs, even when applications do not ask so many.
> When there is a limitation on how many hugepages an application can
> use (such as cgroup.hugetlb), or hugetlbfs is specified with an
> option of size (exceeding the quota of the fs), it just fails to
> start even there are enough hugepages allocated.
>
> To fix above issue, this patch:
>   - Changes the logic to continue memory init to see if hugetlb
>     requirement of application can be addressed by already allocated
>     hugepages.
>   - To make sure each hugepage is allocated successfully, we add a
>     recover mechanism, which relies on a mem access to fault-in
>     hugepages, and if it fails with SIGBUS, recover to previously
>     saved stack environment with siglongjmp().
>
> For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by
> default when compiling IVSHMEM target), it's indispensable to
> map all free hugepages in the system. In this case, it fails
> to start when allocation fails.
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
>            ./examples/helloworld/build/helloworld -c 0x2 -n 4
>
>         
> Fixes: af75078fece ("first public release")
>
> Signed-off-by: Jianfeng Tan<jianfeng.tan@intel.com>
> Acked-by: Neil Horman<nhorman@tuxdriver.com>
> ---
> v5:
>   - Make this method as default instead of using an option.
>   - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
>     just return error.
>   - Add prefix "huge_" to newly added function and static variables.
>   - Move the internal_config.memory assignment after the page allocations.
> v4:
>   - Change map_all_hugepages to return unsigned instead of int.
> v3:
>   - Reword commit message to include it fixes the hugetlbfs quota issue.
>   - setjmp -> sigsetjmp.
>   - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
>     so far.
>   - Fix the second map_all_hugepages's return value check.
> v2:
>   - Address the compiling error by move setjmp into a wrap method.
>
>   lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
>   lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
>   2 files changed, 125 insertions(+), 33 deletions(-)
>

Acked-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v5] eal: fix allocating all free hugepages
  2016-06-08 11:27   ` Sergio Gonzalez Monroy
@ 2016-06-30 13:34     ` Thomas Monjalon
  0 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-06-30 13:34 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: Sergio Gonzalez Monroy, dev, nhorman, david.marchand

> > EAL memory init allocates all free hugepages of the whole system,
> > which seen from sysfs, even when applications do not ask so many.
> > When there is a limitation on how many hugepages an application can
> > use (such as cgroup.hugetlb), or hugetlbfs is specified with an
> > option of size (exceeding the quota of the fs), it just fails to
> > start even there are enough hugepages allocated.
> >
> > To fix above issue, this patch:
> >   - Changes the logic to continue memory init to see if hugetlb
> >     requirement of application can be addressed by already allocated
> >     hugepages.
> >   - To make sure each hugepage is allocated successfully, we add a
> >     recover mechanism, which relies on a mem access to fault-in
> >     hugepages, and if it fails with SIGBUS, recover to previously
> >     saved stack environment with siglongjmp().
> >
> > For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by
> > default when compiling IVSHMEM target), it's indispensable to
> > map all free hugepages in the system. In this case, it fails
> > to start when allocation fails.
[...]
> > Signed-off-by: Jianfeng Tan<jianfeng.tan@intel.com>
> > Acked-by: Neil Horman<nhorman@tuxdriver.com>
> 
> Acked-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-05-19  2:25           ` Tan, Jianfeng
@ 2016-06-30 13:43             ` Thomas Monjalon
  2016-07-01  0:52               ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-06-30 13:43 UTC (permalink / raw)
  To: Tan, Jianfeng; +Cc: dev, David Marchand, Panu Matilainen

2016-05-19 10:25, Tan, Jianfeng:
> On 5/18/2016 8:46 PM, David Marchand wrote:
> > On Wed, Mar 9, 2016 at 2:05 PM, Panu Matilainen <pmatilai@redhat.com> wrote:
> >> On 03/08/2016 07:38 PM, Tan, Jianfeng wrote:
> >>> On 3/8/2016 4:54 PM, Panu Matilainen wrote:
> >>>> On 03/04/2016 12:05 PM, Jianfeng Tan wrote:
> >>>>> This patch adds option, --avail-cores, to use lcores which are available
> >>>>> by calling pthread_getaffinity_np() to narrow down detected cores before
> >>>>> parsing coremask (-c), corelist (-l), and coremap (--lcores).
> >>>>>
> >>>>> Test example:
> >>>>> $ taskset 0xc0000 ./examples/helloworld/build/helloworld \
> >>>>>          --avail-cores -m 1024
> >>>>>
> >>>>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> >>>>> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> >>>>
> >>>> Hmm, to me this sounds like something that should be done always so
> >>>> there's no need for an option. Or if there's a chance it might do the
> >>>> wrong thing in some rare circumstance then perhaps there should be a
> >>>> disabler option instead?
> >>>
> >>> Thanks for comments.
> >>>
> >>> Yes, there's a use case that we cannot handle.
> >>>
> >>> If we make it as default, DPDK applications may fail to start, when user
> >>> specifies a core in isolcpus and its parent process (say bash) has a
> >>> cpuset affinity that excludes isolcpus. Originally, DPDK applications
> >>> just blindly do pthread_setaffinity_np() and it always succeeds because
> >>> it always has root privilege to change any cpu affinity.
> >>>
> >>> Now, if we do the checking in rte_eal_cpu_init(), those lcores will be
> >>> flagged as undetected (in my older implementation) and leads to failure.
> >>> To make it correct, we would always add "taskset mask" (or other ways)
> >>> before DPDK application cmd lines.
> >>>
> >>> How do you think?
> >>
> >> I still think it sounds like something that should be done by default and
> >> maybe be overridable with some flag, rather than the other way around.
> >> Another alternative might be detecting the cores always but if running as
> >> root, override but with a warning.
> >>
> >> But I dont know, just wondering. To look at it from another angle: why would
> >> somebody use this new --avail-cores option and in what situation, if things
> >> "just work" otherwise anyway?
> > +1 and I don't even see why we should have an option to disable this,
> > since taskset would do the job.
> >
> > Looking at your special case, if the user did set an isolcpus option
> > for another use, with no -c/-l, I understand the dpdk application
> > won't care too much about it.
> > So, this seems like somehow rude to the rest of the system and unwanted.
> 
> The case you mentioned above is not the case I mean. But you make your 
> point about this one.
> The case I originally mean: user sets an isolcpus option for DPDK 
> applications. Originally, DPDK apps would be started without any 
> problem. But for now, fail to start them because the required cores are 
> excluded before -c/-l. As per your comments following, we can add a 
> warning message (or should we quit on this situation?). But it indeed 
> has an effect on old users (they should changed to use "taskset 
> ./dpdk_app ..."). Do you think it's a problem?

There is no activity on this patch.
Jianfeng, do not hesitate to ping if needed.
Should we class this patch as "changes requested"?

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] eal: add option --avail-cores to detect lcores
  2016-06-30 13:43             ` Thomas Monjalon
@ 2016-07-01  0:52               ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-07-01  0:52 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, David Marchand, Panu Matilainen

Hi Thomas,

> > >
> > > Looking at your special case, if the user did set an isolcpus option
> > > for another use, with no -c/-l, I understand the dpdk application
> > > won't care too much about it.
> > > So, this seems like somehow rude to the rest of the system and
> unwanted.
> >
> > The case you mentioned above is not the case I mean. But you make your
> > point about this one.
> > The case I originally mean: user sets an isolcpus option for DPDK
> > applications. Originally, DPDK apps would be started without any
> > problem. But for now, fail to start them because the required cores are
> > excluded before -c/-l. As per your comments following, we can add a
> > warning message (or should we quit on this situation?). But it indeed
> > has an effect on old users (they should changed to use "taskset
> > ./dpdk_app ..."). Do you think it's a problem?
> 
> There is no activity on this patch.
> Jianfeng, do not hesitate to ping if needed.
> Should we class this patch as "changes requested"?

Yes, according to latest comments, it should be classified as "changes requested" (I've done that).

I'll resent a new version.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v2] eal: restrict cores detection
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (4 preceding siblings ...)
  2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
@ 2016-08-31  3:07 ` Jianfeng Tan
  2016-08-31 15:30   ` Stephen Hemminger
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  6 siblings, 1 reply; 63+ messages in thread
From: Jianfeng Tan @ 2016-08-31  3:07 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, pmatilai, thomas.monjalon, Jianfeng Tan

This patch uses pthread_getaffinity_np() to narrow down detected
cores before parsing coremask (-c), corelist (-l), and coremap
(--lcores).

The purpose of this patch is to leave out these core related options
when DPDK applications are deployed under container env, so that
users only specify core restriction as starting the instance.

Note: previously, some users are using isolated CPUs, which could
be excluded by default. Please add commands like taskset to use
those cores.

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v2:
  - Make it as default instead of adding the new options.
 lib/librte_eal/common/eal_common_lcore.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index 2cd4132..62e4f67 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,14 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t cs;
+	pthread_t tid = pthread_self();
+
+	/* Add below method to obtain core restrictions, like ulimit,
+	 * cgroup.cpuset, etc. Will not use those cores, which are rebuffed.
+	 */
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cs) < 0)
+		CPU_ZERO(&cs);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +78,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &cs)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: restrict cores detection
  2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
@ 2016-08-31 15:30   ` Stephen Hemminger
  2016-09-01  1:15     ` Tan, Jianfeng
  0 siblings, 1 reply; 63+ messages in thread
From: Stephen Hemminger @ 2016-08-31 15:30 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, thomas.monjalon

On Wed, 31 Aug 2016 03:07:10 +0000
Jianfeng Tan <jianfeng.tan@intel.com> wrote:

> This patch uses pthread_getaffinity_np() to narrow down detected
> cores before parsing coremask (-c), corelist (-l), and coremap
> (--lcores).
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users only specify core restriction as starting the instance.
> 
> Note: previously, some users are using isolated CPUs, which could
> be excluded by default. Please add commands like taskset to use
> those cores.
> 
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-by: Neil Horman <nhorman@tuxdriver.com>
> ---
> v2:
>   - Make it as default instead of adding the new options.
>  lib/librte_eal/common/eal_common_lcore.c | 11 ++++++++++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
> index 2cd4132..62e4f67 100644
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -57,6 +57,14 @@ rte_eal_cpu_init(void)
>  	struct rte_config *config = rte_eal_get_configuration();
>  	unsigned lcore_id;
>  	unsigned count = 0;
> +	rte_cpuset_t cs;
> +	pthread_t tid = pthread_self();
> +
> +	/* Add below method to obtain core restrictions, like ulimit,
> +	 * cgroup.cpuset, etc. Will not use those cores, which are rebuffed.
> +	 */
> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cs) < 0)
> +		CPU_ZERO(&cs);
>  

This patch makes sense but the comment is hard to read because of wording
and grammar.

If you choose variable names better then there really is no need for
a comment in many cases. Code is often easier to read/write than comments
for non-native English speakers.

Remove the comment and rename 'cs' as 'affinity_set' or something equally
as descriptive.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v2] eal: restrict cores detection
  2016-08-31 15:30   ` Stephen Hemminger
@ 2016-09-01  1:15     ` Tan, Jianfeng
  0 siblings, 0 replies; 63+ messages in thread
From: Tan, Jianfeng @ 2016-09-01  1:15 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, david.marchand, pmatilai, thomas.monjalon

Hi Stephen,

> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: Wednesday, August 31, 2016 11:31 PM
> To: Tan, Jianfeng
> Cc: dev@dpdk.org; david.marchand@6wind.com; pmatilai@redhat.com;
> thomas.monjalon@6wind.com
> Subject: Re: [dpdk-dev] [PATCH v2] eal: restrict cores detection
> 
> On Wed, 31 Aug 2016 03:07:10 +0000
> Jianfeng Tan <jianfeng.tan@intel.com> wrote:
> 
> > This patch uses pthread_getaffinity_np() to narrow down detected
> > cores before parsing coremask (-c), corelist (-l), and coremap
> > (--lcores).
> >
> > The purpose of this patch is to leave out these core related options
> > when DPDK applications are deployed under container env, so that
> > users only specify core restriction as starting the instance.
> >
> > Note: previously, some users are using isolated CPUs, which could
> > be excluded by default. Please add commands like taskset to use
> > those cores.
> >
> > Test example:
> > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> >
> > Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> > Acked-by: Neil Horman <nhorman@tuxdriver.com>
> > ---
> > v2:
> >   - Make it as default instead of adding the new options.
> >  lib/librte_eal/common/eal_common_lcore.c | 11 ++++++++++-
> >  1 file changed, 10 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/librte_eal/common/eal_common_lcore.c
> b/lib/librte_eal/common/eal_common_lcore.c
> > index 2cd4132..62e4f67 100644
> > --- a/lib/librte_eal/common/eal_common_lcore.c
> > +++ b/lib/librte_eal/common/eal_common_lcore.c
> > @@ -57,6 +57,14 @@ rte_eal_cpu_init(void)
> >  	struct rte_config *config = rte_eal_get_configuration();
> >  	unsigned lcore_id;
> >  	unsigned count = 0;
> > +	rte_cpuset_t cs;
> > +	pthread_t tid = pthread_self();
> > +
> > +	/* Add below method to obtain core restrictions, like ulimit,
> > +	 * cgroup.cpuset, etc. Will not use those cores, which are rebuffed.
> > +	 */
> > +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t), &cs) < 0)
> > +		CPU_ZERO(&cs);
> >
> 
> This patch makes sense but the comment is hard to read because of wording
> and grammar.
> 
> If you choose variable names better then there really is no need for
> a comment in many cases. Code is often easier to read/write than comments
> for non-native English speakers.
> 
> Remove the comment and rename 'cs' as 'affinity_set' or something equally
> as descriptive.

Great suggestion. I'll resend one as you suggest.

Thanks,
Jianfeng

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v3] eal: restrict cores detection
  2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
                   ` (5 preceding siblings ...)
  2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
@ 2016-09-01  1:31 ` Jianfeng Tan
  2016-09-02 16:53   ` Bruce Richardson
                     ` (2 more replies)
  6 siblings, 3 replies; 63+ messages in thread
From: Jianfeng Tan @ 2016-09-01  1:31 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, pmatilai, thomas.monjalon, stephen, Jianfeng Tan

This patch uses pthread_getaffinity_np() to narrow down detected
cores before parsing coremask (-c), corelist (-l), and coremap
(--lcores).

The purpose of this patch is to leave out these core related options
when DPDK applications are deployed under container env, so that
users only specify core restriction as starting the instance.

Note: previously, some users are using isolated CPUs, which could
be excluded by default. Please add commands like taskset to use
those cores.

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v3:
  - Choose a more descriptive variable name, and remove comments
    as suggested by Stephen Hemminger.
v2:
  - Make it as default instead of adding the new options.
 lib/librte_eal/common/eal_common_lcore.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/librte_eal/common/eal_common_lcore.c b/lib/librte_eal/common/eal_common_lcore.c
index 2cd4132..71c575c 100644
--- a/lib/librte_eal/common/eal_common_lcore.c
+++ b/lib/librte_eal/common/eal_common_lcore.c
@@ -57,6 +57,12 @@ rte_eal_cpu_init(void)
 	struct rte_config *config = rte_eal_get_configuration();
 	unsigned lcore_id;
 	unsigned count = 0;
+	rte_cpuset_t affinity_set;
+	pthread_t tid = pthread_self();
+
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t),
+				   &affinity_set) < 0)
+		CPU_ZERO(&affinity_set);
 
 	/*
 	 * Parse the maximum set of logical cores, detect the subset of running
@@ -70,7 +76,8 @@ rte_eal_cpu_init(void)
 
 		/* in 1:1 mapping, record related cpu detected state */
 		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
-		if (lcore_config[lcore_id].detected == 0) {
+		if (lcore_config[lcore_id].detected == 0 ||
+		    !CPU_ISSET(lcore_id, &affinity_set)) {
 			config->lcore_role[lcore_id] = ROLE_OFF;
 			lcore_config[lcore_id].core_index = -1;
 			continue;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: restrict cores detection
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
@ 2016-09-02 16:53   ` Bruce Richardson
  2016-09-16 14:04     ` Thomas Monjalon
  2016-09-16 14:02   ` Thomas Monjalon
  2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
  2 siblings, 1 reply; 63+ messages in thread
From: Bruce Richardson @ 2016-09-02 16:53 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, thomas.monjalon, stephen

On Thu, Sep 01, 2016 at 01:31:47AM +0000, Jianfeng Tan wrote:
> This patch uses pthread_getaffinity_np() to narrow down detected
> cores before parsing coremask (-c), corelist (-l), and coremap
> (--lcores).
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users only specify core restriction as starting the instance.
> 
> Note: previously, some users are using isolated CPUs, which could
> be excluded by default. Please add commands like taskset to use
> those cores.
> 
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> 

So, to be clear, does this patch mean that DPDK cannot use isolated cores
any more unless you explicitly run the app using taskset?
Is so, NAK, since isolating cores has been part of standard DPDK setup since
the first versions, and I don't believe that we should break that behaviour.

/Bruce

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: restrict cores detection
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  2016-09-02 16:53   ` Bruce Richardson
@ 2016-09-16 14:02   ` Thomas Monjalon
  2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
  2 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-09-16 14:02 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, stephen

2016-09-01 01:31, Jianfeng Tan:
> This patch uses pthread_getaffinity_np() to narrow down detected
> cores before parsing coremask (-c), corelist (-l), and coremap
> (--lcores).
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users only specify core restriction as starting the instance.
[...]
> --- a/lib/librte_eal/common/eal_common_lcore.c
> +++ b/lib/librte_eal/common/eal_common_lcore.c
> @@ -57,6 +57,12 @@ rte_eal_cpu_init(void)
>  	struct rte_config *config = rte_eal_get_configuration();
>  	unsigned lcore_id;
>  	unsigned count = 0;
> +	rte_cpuset_t affinity_set;
> +	pthread_t tid = pthread_self();
> +

A comment is needed here to explain which errors we are checking.

> +	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t),
> +				   &affinity_set) < 0)
> +		CPU_ZERO(&affinity_set);
>  
>  	/*
>  	 * Parse the maximum set of logical cores, detect the subset of running
> @@ -70,7 +76,8 @@ rte_eal_cpu_init(void)
>  
>  		/* in 1:1 mapping, record related cpu detected state */
>  		lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
> -		if (lcore_config[lcore_id].detected == 0) {
> +		if (lcore_config[lcore_id].detected == 0 ||
> +		    !CPU_ISSET(lcore_id, &affinity_set)) {
>  			config->lcore_role[lcore_id] = ROLE_OFF;
>  			lcore_config[lcore_id].core_index = -1;
>  			continue;
> 

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v3] eal: restrict cores detection
  2016-09-02 16:53   ` Bruce Richardson
@ 2016-09-16 14:04     ` Thomas Monjalon
  0 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-09-16 14:04 UTC (permalink / raw)
  To: Bruce Richardson, Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, stephen

2016-09-02 17:53, Bruce Richardson:
> On Thu, Sep 01, 2016 at 01:31:47AM +0000, Jianfeng Tan wrote:

It would help the discussion to have a problem statement here.

> > This patch uses pthread_getaffinity_np() to narrow down detected
> > cores before parsing coremask (-c), corelist (-l), and coremap
> > (--lcores).
> > 
> > The purpose of this patch is to leave out these core related options
> > when DPDK applications are deployed under container env, so that
> > users only need to specify the core restriction when starting the instance.
> > 
> > Note: previously, some users are using isolated CPUs, which could
> > be excluded by default. Please add commands like taskset to use
> > those cores.
> > 
> > Test example:
> > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> > 
> 
> So, to be clear, does this patch mean that DPDK cannot use isolated cores
> any more unless you explicitly run the app using taskset?
> Is so, NAK, since isolating cores has been part of standard DPDK setup since
> the first versions, and I don't believe that we should break that behaviour.

So how could we help the container use-case?
Any suggestions?

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v4] eal: restrict cores auto detection
  2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
  2016-09-02 16:53   ` Bruce Richardson
  2016-09-16 14:02   ` Thomas Monjalon
@ 2016-12-02 17:48   ` Jianfeng Tan
  2016-12-08 18:19     ` Thomas Monjalon
  2 siblings, 1 reply; 63+ messages in thread
From: Jianfeng Tan @ 2016-12-02 17:48 UTC (permalink / raw)
  To: dev; +Cc: david.marchand, pmatilai, bruce.richardson, Jianfeng Tan

This patch uses pthread_getaffinity_np() to narrow down used
cores when none of below options is specified:
  * coremask (-c)
  * corelist (-l)
  * and coremap (--lcores)

The purpose of this patch is to leave out these core related options
when DPDK applications are deployed under container env, so that
users do not need to decide the core-related parameters when developing
applications. Instead, when applications are deployed in containers,
use cpu-set to constrain which cores can be used inside this container
instance. And DPDK application inside containers just rely on this
auto detect mechanism to start polling threads.

Note: previously, some users are using isolated CPUs, which could
be excluded by default. Please add commands like taskset to use
those cores.

Test example:
$ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
---
v4:
  - Address Bruce's comment: only enable this auto detection
    mechanism when none of core options is specified.
  - More detailed use case on how it helps in containers.
v3:
  - Choose a more descriptive variable name, and remove comments
    as suggested by Stephen Hemminger.
v2:
  - Make it as default instead of adding the new options.
---
 lib/librte_eal/common/eal_common_options.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 6ca8af1..d192de1 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -126,6 +126,7 @@ static const char dpdk_solib_path[] __attribute__((used)) =
 
 static int master_lcore_parsed;
 static int mem_parsed;
+static int core_specified;
 
 void
 eal_reset_internal_config(struct internal_config *internal_cfg)
@@ -797,6 +798,7 @@ eal_parse_common_option(int opt, const char *optarg,
 			RTE_LOG(ERR, EAL, "invalid coremask\n");
 			return -1;
 		}
+		core_specified = 1;
 		break;
 	/* corelist */
 	case 'l':
@@ -804,6 +806,7 @@ eal_parse_common_option(int opt, const char *optarg,
 			RTE_LOG(ERR, EAL, "invalid core list\n");
 			return -1;
 		}
+		core_specified = 1;
 		break;
 	/* size of memory */
 	case 'm':
@@ -912,6 +915,7 @@ eal_parse_common_option(int opt, const char *optarg,
 				OPT_LCORES "\n");
 			return -1;
 		}
+		core_specified = 1;
 		break;
 
 	/* don't know what to do, leave this to caller */
@@ -923,12 +927,38 @@ eal_parse_common_option(int opt, const char *optarg,
 	return 0;
 }
 
+static void
+eal_auto_detect_cores(struct rte_config *cfg)
+{
+	unsigned int lcore_id;
+	unsigned int removed = 0;
+	rte_cpuset_t affinity_set;
+	pthread_t tid = pthread_self();
+
+	if (pthread_getaffinity_np(tid, sizeof(rte_cpuset_t),
+				&affinity_set) < 0)
+		CPU_ZERO(&affinity_set);
+
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+		if (cfg->lcore_role[lcore_id] == ROLE_RTE &&
+		    !CPU_ISSET(lcore_id, &affinity_set)) {
+			cfg->lcore_role[lcore_id] = ROLE_OFF;
+			removed++;
+		}
+	}
+
+	cfg->lcore_count -= removed;
+}
+
 int
 eal_adjust_config(struct internal_config *internal_cfg)
 {
 	int i;
 	struct rte_config *cfg = rte_eal_get_configuration();
 
+	if (!core_specified)
+		eal_auto_detect_cores(cfg);
+
 	if (internal_config.process_type == RTE_PROC_AUTO)
 		internal_config.process_type = eal_proc_type_detect();
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: restrict cores auto detection
  2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
@ 2016-12-08 18:19     ` Thomas Monjalon
  2016-12-09 15:14       ` Bruce Richardson
  0 siblings, 1 reply; 63+ messages in thread
From: Thomas Monjalon @ 2016-12-08 18:19 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: dev, david.marchand, pmatilai, bruce.richardson

2016-12-02 17:48, Jianfeng Tan:
> This patch uses pthread_getaffinity_np() to narrow down used
> cores when none of below options is specified:
>   * coremask (-c)
>   * corelist (-l)
>   * and coremap (--lcores)
> 
> The purpose of this patch is to leave out these core related options
> when DPDK applications are deployed under container env, so that
> users do not need to decide the core-related parameters when developing
> applications. Instead, when applications are deployed in containers,
> use cpu-set to constrain which cores can be used inside this container
> instance. And DPDK application inside containers just rely on this
> auto detect mechanism to start polling threads.
> 
> Note: previously, some users are using isolated CPUs, which could
> be excluded by default. Please add commands like taskset to use
> those cores.
> 
> Test example:
> $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024

Bruce, what do you think of this version?
It requires taskset only if -c, -l and --lcores are not used.

>  static int master_lcore_parsed;
>  static int mem_parsed;
> +static int core_specified;

I think it's better to keep the word "parsed" as others.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: restrict cores auto detection
  2016-12-08 18:19     ` Thomas Monjalon
@ 2016-12-09 15:14       ` Bruce Richardson
  2016-12-21 14:31         ` Thomas Monjalon
  0 siblings, 1 reply; 63+ messages in thread
From: Bruce Richardson @ 2016-12-09 15:14 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Jianfeng Tan, dev, david.marchand, pmatilai

On Thu, Dec 08, 2016 at 07:19:41PM +0100, Thomas Monjalon wrote:
> 2016-12-02 17:48, Jianfeng Tan:
> > This patch uses pthread_getaffinity_np() to narrow down used
> > cores when none of below options is specified:
> >   * coremask (-c)
> >   * corelist (-l)
> >   * and coremap (--lcores)
> > 
> > The purpose of this patch is to leave out these core related options
> > when DPDK applications are deployed under container env, so that
> > users do not need to decide the core-related parameters when developing
> > applications. Instead, when applications are deployed in containers,
> > use cpu-set to constrain which cores can be used inside this container
> > instance. And DPDK application inside containers just rely on this
> > auto detect mechanism to start polling threads.
> > 
> > Note: previously, some users are using isolated CPUs, which could
> > be excluded by default. Please add commands like taskset to use
> > those cores.
> > 
> > Test example:
> > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> 
> Bruce, what do you think of this version?
> It requires taskset only if -c, -l and --lcores are not used.
> 
I'm fine with that since it maintains backward compatibility for those
options.

Acked-by: Bruce Richardson <bruce.richardson@intel.com>

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH v4] eal: restrict cores auto detection
  2016-12-09 15:14       ` Bruce Richardson
@ 2016-12-21 14:31         ` Thomas Monjalon
  0 siblings, 0 replies; 63+ messages in thread
From: Thomas Monjalon @ 2016-12-21 14:31 UTC (permalink / raw)
  To: Jianfeng Tan; +Cc: Bruce Richardson, dev, david.marchand, pmatilai

2016-12-09 15:14, Bruce Richardson:
> On Thu, Dec 08, 2016 at 07:19:41PM +0100, Thomas Monjalon wrote:
> > 2016-12-02 17:48, Jianfeng Tan:
> > > This patch uses pthread_getaffinity_np() to narrow down used
> > > cores when none of below options is specified:
> > >   * coremask (-c)
> > >   * corelist (-l)
> > >   * and coremap (--lcores)
> > > 
> > > The purpose of this patch is to leave out these core related options
> > > when DPDK applications are deployed under container env, so that
> > > users do not need to decide the core-related parameters when developing
> > > applications. Instead, when applications are deployed in containers,
> > > use cpu-set to constrain which cores can be used inside this container
> > > instance. And DPDK application inside containers just rely on this
> > > auto detect mechanism to start polling threads.
> > > 
> > > Note: previously, some users are using isolated CPUs, which could
> > > be excluded by default. Please add commands like taskset to use
> > > those cores.
> > > 
> > > Test example:
> > > $ taskset 0xc0000 ./examples/helloworld/build/helloworld -m 1024
> > 
> > Bruce, what do you think of this version?
> > It requires taskset only if -c, -l and --lcores are not used.
> > 
> I'm fine with that since it maintains backward compatibility for those
> options.
> 
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>

Applied with "s/specified/parsed/", thanks

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2016-12-21 14:31 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-24 18:49 [RFC] eal: add cgroup-aware resource self discovery Jianfeng Tan
2016-01-25 13:46 ` Neil Horman
2016-01-26  2:22   ` Tan, Jianfeng
2016-01-26 14:19     ` Neil Horman
2016-01-27 12:02       ` Tan, Jianfeng
2016-01-27 17:30         ` Neil Horman
2016-01-29 11:22 ` [PATCH] eal: make resource initialization more robust Jianfeng Tan
2016-02-01 18:08   ` Neil Horman
2016-02-22  6:08   ` Tan, Jianfeng
2016-02-22 13:18     ` Neil Horman
2016-02-28 21:12   ` Thomas Monjalon
2016-02-29  1:50     ` Tan, Jianfeng
2016-03-04 10:05 ` [PATCH] eal: add option --avail-cores to detect lcores Jianfeng Tan
2016-03-08  8:54   ` Panu Matilainen
2016-03-08 17:38     ` Tan, Jianfeng
2016-03-09 13:05       ` Panu Matilainen
2016-03-09 13:53         ` Tan, Jianfeng
2016-03-09 14:01           ` Ananyev, Konstantin
2016-03-09 14:17             ` Tan, Jianfeng
2016-03-09 14:44               ` Ananyev, Konstantin
2016-03-09 14:55                 ` Tan, Jianfeng
2016-03-09 15:17                   ` Ananyev, Konstantin
2016-03-09 17:45                     ` Tan, Jianfeng
2016-03-09 19:33                       ` Ananyev, Konstantin
2016-03-10  1:36                         ` Tan, Jianfeng
2016-05-18 12:46         ` David Marchand
2016-05-19  2:25           ` Tan, Jianfeng
2016-06-30 13:43             ` Thomas Monjalon
2016-07-01  0:52               ` Tan, Jianfeng
2016-04-26 12:39   ` Tan, Jianfeng
2016-03-04 10:58 ` [PATCH] eal: make hugetlb initialization more robust Jianfeng Tan
2016-03-08  1:42   ` [PATCH v2] " Jianfeng Tan
2016-03-08  8:46     ` Tan, Jianfeng
2016-05-04 11:07     ` Sergio Gonzalez Monroy
2016-05-04 11:28       ` Tan, Jianfeng
2016-05-04 12:25     ` Sergio Gonzalez Monroy
2016-05-09 10:48   ` [PATCH v3] " Jianfeng Tan
2016-05-10  8:54     ` Sergio Gonzalez Monroy
2016-05-10  9:11       ` Tan, Jianfeng
2016-05-12  0:44   ` [PATCH v4] " Jianfeng Tan
2016-05-17 16:39     ` David Marchand
2016-05-18  7:56       ` Sergio Gonzalez Monroy
2016-05-18  9:34         ` David Marchand
2016-05-19  2:00       ` Tan, Jianfeng
2016-05-17 16:40     ` Thomas Monjalon
2016-05-18  8:06       ` Sergio Gonzalez Monroy
2016-05-18  9:38         ` David Marchand
2016-05-19  2:11         ` Tan, Jianfeng
2016-05-31  3:37 ` [PATCH v5] eal: fix allocating all free hugepages Jianfeng Tan
2016-06-06  2:49   ` Pei, Yulong
2016-06-08 11:27   ` Sergio Gonzalez Monroy
2016-06-30 13:34     ` Thomas Monjalon
2016-08-31  3:07 ` [PATCH v2] eal: restrict cores detection Jianfeng Tan
2016-08-31 15:30   ` Stephen Hemminger
2016-09-01  1:15     ` Tan, Jianfeng
2016-09-01  1:31 ` [PATCH v3] " Jianfeng Tan
2016-09-02 16:53   ` Bruce Richardson
2016-09-16 14:04     ` Thomas Monjalon
2016-09-16 14:02   ` Thomas Monjalon
2016-12-02 17:48   ` [PATCH v4] eal: restrict cores auto detection Jianfeng Tan
2016-12-08 18:19     ` Thomas Monjalon
2016-12-09 15:14       ` Bruce Richardson
2016-12-21 14:31         ` Thomas Monjalon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.