From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755611Ab3BLVck (ORCPT ); Tue, 12 Feb 2013 16:32:40 -0500 Received: from hubcas2.seas.wustl.edu ([128.252.145.2]:24141 "EHLO hubcas2.seas.wustl.edu" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752373Ab3BLVcj (ORCPT ); Tue, 12 Feb 2013 16:32:39 -0500 X-Greylist: delayed 303 seconds by postgrey-1.27 at vger.kernel.org; Tue, 12 Feb 2013 16:32:39 EST Message-ID: <511AB3C5.2080703@seas.wustl.edu> Date: Tue, 12 Feb 2013 15:27:33 -0600 From: Professor Berkley Shands Organization: Department of Computer Science and Engineering User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20130107 Thunderbird/17.0.2 MIME-Version: 1.0 To: linux-kernel Subject: NUMA allocations fail to be numa allocated Content-Type: text/plain; charset="ISO-8859-1"; format=flowed Content-Transfer-Encoding: 7bit X-Originating-IP: [205.158.150.226] Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Using libnuma calls on RedHat 6.3 x86_64 with the default kernel, and with kernels up to 3.4.29, does not allocate memory on the specified NUMA nodes, even when forced with numactl. It appears that setting the NUMA policy and/or the NUMA nodes does little for large allocations. When using HUGETLBFS, you get memory on almost any node BUT the one you asked for. It appears that it allocates on the last node that did a free(). Here is a small program to demonstrate the lack of NUMA awareness from user space. 
#include #include #include #include #include #include // for sched_getcpu() call static const unsigned long HUGE_PAGE_SIZE = 1UL << 21; // a 2MB huge page static const unsigned long HUGE_PAGE_SIZE1 = (1UL << 21) - 1; // less one static const unsigned long PAGE_SIZE = 1UL << 10; // a 4KB page static const unsigned long PAGE_SIZE1 = (1UL << 10) - 1; // less one int VerifyNumaNode(void *ptr, // address int node, // target node int Count); // count of 4KB pages int MoveAddrToNodeMulti(void *ptr, int node, int Count); void *Allocate(size_t length, int OnNode) { int shmid = -1; void *shmaddr = NULL; size_t new_length = length; int MaxNumaNode = numa_max_node(); // find highest NUMA number int LocalNumaNode = numa_node_of_cpu(sched_getcpu()); int NewNumaNode = LocalNumaNode; unsigned long MaskBits[2] = { 0UL, 0UL }; // up to 128 nodes struct bitmask NewMask; NewMask.size = 8; // Max nodes on an HP struct bitmask *CurrentMask = numa_get_membind(); // see if NUMA allocation is desired if (OnNode >= 0) { if (OnNode > MaxNumaNode) { fprintf(stderr, "Invalid NUMA HUGEPages allocation node %d max is %d\n", OnNode, MaxNumaNode); } else { NewNumaNode = OnNode; } } MaskBits[0] = 1UL << NewNumaNode; numa_set_membind(&NewMask); // restrict to this node if (new_length < HUGE_PAGE_SIZE) /* 2MB min alloc for huge pages */ { new_length = HUGE_PAGE_SIZE; } if (new_length & HUGE_PAGE_SIZE1) /* 2MB min alloc for huge pages */ { new_length = ((new_length >> 21) + 1) << 21; } if ((shmid = shmget(IPC_PRIVATE, new_length, /* length */ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) == -1) { fprintf(stderr, "shmget() failed for %ldMB\n", (long) (new_length >> 20)); numa_set_membind(CurrentMask); // unrestrict to this node return NULL; } shmaddr = shmat(shmid, NULL, 0); if (shmaddr == (void *) -1) { shmctl(shmid, IPC_RMID, NULL); numa_set_membind(CurrentMask); // unrestrict to this node return NULL; } else if ((unsigned long) (shmaddr) & (PAGE_SIZE - 1)) { fprintf(stderr, "huge page allocation was not 
page aligned\n"); } memset(shmaddr, 0x00, new_length); if (VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL) > 0) { MoveAddrToNodeMulti(shmaddr, NewNumaNode, new_length / 4096UL); } numa_set_membind(CurrentMask); // unrestrict to this node VerifyNumaNode(shmaddr, NewNumaNode, new_length / 4096UL); /* now delete the ID so it will free itself on exit */ shmctl(shmid, IPC_RMID, NULL); return shmaddr; } void Free(void *addr) { } int NumaNodeFromAddress(void *Address) { int status[1] = { -1 }; void *PTR = Address; void *PTR2[1] = { NULL }; PTR2[0] = &PTR; int retval = move_pages(0, // this thread 1, // just one pointer PTR2, // The given address NULL, // array of nodes, no moving, just asking status, // array of node results MPOL_MF_MOVE); if (retval) { fprintf(stderr, "Invalid Address %p - No NUMA node\n", Address); } return status[0]; } int MoveAddrToNodeMulti(void *ptr, int node, int Count) { unsigned long *PTR = new unsigned long[Count + 1]; unsigned long *PTR2 = new unsigned long[Count + 1]; int *status = new int[Count + 1]; int *NN = new int[Count + 1]; int retval = 0; int i = 0; unsigned long addr = 0; for (i = 0; i < Count; i++) { status[i] = -1; NN[i] = node; addr = ((unsigned long) ptr) + (i * 4096); PTR[i] = (unsigned long) (addr & ~4095UL); PTR2[i] = (unsigned long) &PTR[i]; } retval = move_pages(0, // this thread Count, // lots of pointers (void **) PTR2, // The given address NN, // move to new node please status, // array of node results MPOL_MF_MOVE); if (retval) { fprintf(stderr, "MoveAddrToNodeMulti to failed\n"); } else { retval = 0; for (i = 0; i < Count; i++) { if (status[i] != node) { fprintf(stderr, "Addr 0x%08lx is node %d not %d\n", PTR[i], status[i], node); retval++; } } } delete [] NN; delete [] status; delete [] PTR2; delete [] PTR; return retval; } int VerifyNumaNode(void *ptr, int node, int Count) { unsigned long *PTR = new unsigned long[Count + 1]; unsigned long *PTR2 = new unsigned long[Count + 1]; int *status = new int[Count + 1]; 
int retval = 0; int i = 0; unsigned long addr = 0; for (i = 0; i < Count; i++) { status[i] = -1; addr = ((unsigned long) ptr) + (i * 4096); PTR[i] = (unsigned long) (addr & ~4095UL); PTR2[i] = (unsigned long) &PTR[i]; } retval = move_pages(0, // this thread Count, // lots of pointers (void **) PTR2, // The given address NULL, // no new node status, // array of node results MPOL_MF_MOVE); if (retval) { fprintf(stderr, "VerifyNumaNode move_pages failed\n"); } else { retval = 0; for (i = 0; i < Count; i++) { if (status[i] != node) { fprintf(stderr, "Verify Addr 0x%08lx is node %d not %d\n", PTR[i], status[i], node); retval++; } } } // release temp stuff delete [] status; delete [] PTR2; delete [] PTR; return retval; } // small demo program showing: // // a: huge page allocations via hugetlb are not node allocated // b: huge pages cannot be move_page()'ed // c: Replacing the shm*() with numa_alloc_node() has the exact same problem // d: 4KB pages or 2MB pages act the same. int main(int argc, char **argv) { int Node = -1; unsigned long Size = 32UL * 1024UL *1024UL; // default to 32MB if (argc >= 2) { Node = atoi(argv[1]); } if (argc >= 3) { Size = atol(argv[2]) * 1024UL * 1024UL; } unsigned long *Array = (unsigned long *) Allocate(Size, Node); exit(-1); }