From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Tan, Jianfeng" Subject: Re: [PATCH v2] eal: make hugetlb initialization more robust Date: Tue, 8 Mar 2016 16:46:55 +0800 Message-ID: <56DE917F.8090808@intel.com> References: <1457089092-4128-1-git-send-email-jianfeng.tan@intel.com> <1457401359-132260-1-git-send-email-jianfeng.tan@intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset=windows-1252; format=flowed Content-Transfer-Encoding: 7bit To: dev@dpdk.org Return-path: Received: from mga01.intel.com (mga01.intel.com [192.55.52.88]) by dpdk.org (Postfix) with ESMTP id 7FFCD2C52 for ; Tue, 8 Mar 2016 09:46:57 +0100 (CET) In-Reply-To: <1457401359-132260-1-git-send-email-jianfeng.tan@intel.com> List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" On 3/8/2016 9:42 AM, Jianfeng Tan wrote: > This patch adds an option, --huge-trybest, to use a recovery mechanism for > the case that there are not so many hugepages (declared in sysfs), which > can be used. It relies on a mem access to fault-in hugepages, and if it fails > with SIGBUS, recovers to the previously saved stack environment with > siglongjmp(). > > Test example: > a. cgcreate -g hugetlb:/test-subgroup > b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup > c. cgexec -g hugetlb:test-subgroup \ > ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest > > Signed-off-by: Jianfeng Tan Sorry, forgot to add ack from Neil. Acked-by: Neil Horman > --- > v2: > - Address the compiling error by moving setjmp into a wrap method. 
> > lib/librte_eal/common/eal_common_options.c | 4 ++ > lib/librte_eal/common/eal_internal_cfg.h | 1 + > lib/librte_eal/common/eal_options.h | 2 + > lib/librte_eal/linuxapp/eal/eal.c | 1 + > lib/librte_eal/linuxapp/eal/eal_memory.c | 104 ++++++++++++++++++++++++++--- > 5 files changed, 104 insertions(+), 8 deletions(-) > > diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c > index 29942ea..8ff6a2e 100644 > --- a/lib/librte_eal/common/eal_common_options.c > +++ b/lib/librte_eal/common/eal_common_options.c > @@ -95,6 +95,7 @@ eal_long_options[] = { > {OPT_VFIO_INTR, 1, NULL, OPT_VFIO_INTR_NUM }, > {OPT_VMWARE_TSC_MAP, 0, NULL, OPT_VMWARE_TSC_MAP_NUM }, > {OPT_XEN_DOM0, 0, NULL, OPT_XEN_DOM0_NUM }, > + {OPT_HUGE_TRYBEST, 0, NULL, OPT_HUGE_TRYBEST_NUM }, > {0, 0, NULL, 0 } > }; > > @@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg, > return -1; > } > break; > + case OPT_HUGE_TRYBEST_NUM: > + internal_config.huge_trybest = 1; > + break; > > /* don't know what to do, leave this to caller */ > default: > diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h > index 5f1367e..90a3533 100644 > --- a/lib/librte_eal/common/eal_internal_cfg.h > +++ b/lib/librte_eal/common/eal_internal_cfg.h > @@ -64,6 +64,7 @@ struct internal_config { > volatile unsigned force_nchannel; /**< force number of channels */ > volatile unsigned force_nrank; /**< force number of ranks */ > volatile unsigned no_hugetlbfs; /**< true to disable hugetlbfs */ > + volatile unsigned huge_trybest; /**< try best to allocate hugepages */ > unsigned hugepage_unlink; /**< true to unlink backing files */ > volatile unsigned xen_dom0_support; /**< support app running on Xen Dom0*/ > volatile unsigned no_pci; /**< true to disable PCI */ > diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h > index a881c62..02397c5 100644 > --- a/lib/librte_eal/common/eal_options.h > 
+++ b/lib/librte_eal/common/eal_options.h > @@ -83,6 +83,8 @@ enum { > OPT_VMWARE_TSC_MAP_NUM, > #define OPT_XEN_DOM0 "xen-dom0" > OPT_XEN_DOM0_NUM, > +#define OPT_HUGE_TRYBEST "huge-trybest" > + OPT_HUGE_TRYBEST_NUM, > OPT_LONG_MAX_NUM > }; > > diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c > index ceac435..3e23877 100644 > --- a/lib/librte_eal/linuxapp/eal/eal.c > +++ b/lib/librte_eal/linuxapp/eal/eal.c > @@ -343,6 +343,7 @@ eal_usage(const char *prgname) > " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" > " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" > " --"OPT_XEN_DOM0" Support running on Xen dom0 without hugetlbfs\n" > + " --"OPT_HUGE_TRYBEST" Try best to accommodate hugepages\n" > "\n"); > /* Allow the application to print its usage message too if hook is set */ > if ( rte_application_usage_hook ) { > diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c > index 5b9132c..e4e1f3b 100644 > --- a/lib/librte_eal/linuxapp/eal/eal_memory.c > +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c > @@ -80,6 +80,8 @@ > #include > #include > #include > +#include > +#include > > #include > #include > @@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz) > return addr; > } > > +static sigjmp_buf jmpenv; > + > +static void sigbus_handler(int signo __rte_unused) > +{ > + siglongjmp(jmpenv, 1); > +} > + > +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, > + * non-static local variable in the stack frame calling setjmp might be > + * clobbered by a call to longjmp. > + */ > +static int wrap_setjmp(void) > +{ > + return setjmp(jmpenv); > +} > /* > * Mmap all hugepages of hugepage table: it first open a file in > * hugetlbfs, then mmap() hugepage_sz data in it. 
If orig is set, the > @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, > if (fd < 0) { > RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, > strerror(errno)); > - return -1; > + return i; > } > > /* map the segment, and populate page tables, > @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, > RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, > strerror(errno)); > close(fd); > - return -1; > + return i; > } > > if (orig) { > @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, > hugepg_tbl[i].final_va = virtaddr; > } > > + if (orig && internal_config.huge_trybest) { > + /* In linux, hugetlb limitations, like cgroup, are > + * enforced at fault time instead of mmap(), even > + * with the option of MAP_POPULATE. Kernel will send > + * a SIGBUS signal. To avoid to be killed, save stack > + * environment here, if SIGBUS happens, we can jump > + * back here. > + */ > + if (wrap_setjmp()) { > + RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more " > + "hugepages of size %u MB\n", > + (unsigned)(hugepage_sz / 0x100000)); > + munmap(virtaddr, hugepage_sz); > + close(fd); > + unlink(hugepg_tbl[i].filepath); > + return i; > + } > + *(int *)virtaddr = 0; > + } > + > + > /* set shared flock on the file. 
*/ > if (flock(fd, LOCK_SH | LOCK_NB) == -1) { > RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n", > __func__, strerror(errno)); > close(fd); > - return -1; > + return i; > } > > close(fd); > @@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, > vma_addr = (char *)vma_addr + hugepage_sz; > vma_len -= hugepage_sz; > } > - return 0; > + return i; > } > > #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS > @@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory, > return total_num_pages; > } > > +static struct sigaction action_old; > +static int need_recover; > + > +static void > +register_sigbus(void) > +{ > + sigset_t mask; > + struct sigaction action; > + > + sigemptyset(&mask); > + sigaddset(&mask, SIGBUS); > + action.sa_flags = 0; > + action.sa_mask = mask; > + action.sa_handler = sigbus_handler; > + > + need_recover = !sigaction(SIGBUS, &action, &action_old); > +} > + > +static void > +recover_sigbus(void) > +{ > + if (need_recover) { > + sigaction(SIGBUS, &action_old, NULL); > + need_recover = 0; > + } > +} > + > /* > * Prepare physical memory mapping: fill configuration structure with > * these infos, return 0 on success. 
> @@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void) > > hp_offset = 0; /* where we start the current page size entries */ > > + if (internal_config.huge_trybest) > + register_sigbus(); > + > /* map all hugepages and sort them */ > for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ > + int pages_old, pages_new; > struct hugepage_info *hpi; > > /* > @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void) > continue; > > /* map all hugepages available */ > - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){ > - RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n", > - (unsigned)(hpi->hugepage_sz / 0x100000)); > - goto fail; > + pages_old = hpi->num_pages[0]; > + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1); > + if (pages_new < pages_old) { > + RTE_LOG(DEBUG, EAL, > + "%d not %d hugepages of size %u MB allocated\n", > + pages_new, pages_old, > + (unsigned)(hpi->hugepage_sz / 0x100000)); > + if (internal_config.huge_trybest) { > + int pages = pages_old - pages_new; > + > + internal_config.memory -= > + hpi->hugepage_sz * pages; > + nr_hugepages -= pages; > + hpi->num_pages[0] = pages_new; > + if (pages_new == 0) > + continue; > + } else > + goto fail; > } > > /* find physical addresses and sockets for each hugepage */ > @@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void) > #endif > } > > + if (internal_config.huge_trybest) > + recover_sigbus(); > + > #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS > nr_hugefiles = 0; > for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { > @@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void) > return 0; > > fail: > + if (internal_config.huge_trybest) > + recover_sigbus(); > free(tmp_hp); > return -1; > }