/*
 * Reproducer that sets up a shmem-backed memfd, several private mappings of
 * it that share an anon_vma, and a forked child, then waits for khugepaged to
 * act on one mapping before making the child run MADV_PAGEOUT on its
 * inherited anon page.
 *
 * NOTE(review): the header names after each #include were missing from this
 * file. The list below is reconstructed from the identifiers the program
 * uses (one header per original directive) -- verify against the original.
 */
#define _GNU_SOURCE
#include <assert.h>
#include <err.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21
#endif

/*
 * Evaluate x exactly once; if the result equals -1 converted to x's type
 * (which also matches MAP_FAILED for mmap/mremap), print the failing
 * expression together with errno and exit with status 1. Otherwise yield the
 * result. Uses __typeof__ rather than typeof so it also builds in strict
 * -std=c11 mode.
 */
#define SYSCHK(x) ({                          \
  __typeof__(x) syschk_res_ = (x);            \
  if (syschk_res_ == (__typeof__(x))-1)       \
    err(1, "SYSCHK(" #x ")");                 \
  syschk_res_;                                \
})

/*
 * Write 4*512 pages (8 MiB) of 'P' bytes to fd, one 4 KiB page per pwrite().
 * Exits on error or short write. The write is deliberately NOT wrapped in
 * assert(): with NDEBUG the assert argument is not evaluated, which would
 * silently skip populating the file.
 */
static void fill_file_with_pages(int fd)
{
  char dummy_page[0x1000];
  memset(dummy_page, 'P', sizeof dummy_page);

  for (int i = 0; i < 4*512; i++) {
    ssize_t written = SYSCHK(pwrite(fd, dummy_page, sizeof dummy_page, i*0x1000));
    if (written != (ssize_t)sizeof dummy_page)
      errx(1, "short pwrite at page %d", i);
  }
}

/*
 * Read the first 0x1000 bytes of the smaps file behind smaps_fd and return
 * the value (in kB) of the first "Rss:" line found in them. Exits if no such
 * line is present or if it carries no number.
 */
static long first_rss_kb(int smaps_fd)
{
  char smaps[0x1001]; /* one spare byte for the terminator */
  ssize_t len = SYSCHK(pread(smaps_fd, smaps, 0x1000, 0));
  smaps[len] = '\0';

  const char *line = strstr(smaps, "\nRss:");
  if (!line)
    errx(1, "no Rss in smaps?");

  const char *numptr = line + strlen("\nRss:");
  while (*numptr == ' ' || *numptr == '\t')
    numptr++;

  /* strtol instead of atoi: "no digits" must not be mistaken for Rss == 0 */
  char *end;
  long rss = strtol(numptr, &end, 10);
  if (end == numptr)
    errx(1, "unparseable Rss value in smaps");
  return rss;
}

// NOTE: assumes hugepage support is set to madvise-only and shmem thp is set
// to "force".

int main(void) {
#if 0
  // for better correlation of events in dmesg
  FILE *kmsg_file = fopen("/dev/kmsg", "w");
  if (!kmsg_file)
    errx(1, "open kmsg");
  setlinebuf(kmsg_file);
#else
  // for running unprivileged
  #define kmsg_file stderr
#endif

  // create a memfd containing 8 MiB of data, with some extra back-and-forth
  // to avoid having transhuge shmem pages at this point even in "force" mode
  int fd = SYSCHK(syscall(__NR_memfd_create, "memfd", 0));
  fill_file_with_pages(fd);
  // punch one page out of each 2 MiB region, then write everything again
  SYSCHK(fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, 0x4000, 0x1000));
  SYSCHK(fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, 0x204000, 0x1000));
  SYSCHK(fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, 0x404000, 0x1000));
  SYSCHK(fallocate(fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, 0x604000, 0x1000));
  fill_file_with_pages(fd);

  // create trigger_vma (2 MiB, offset 4 MiB)
  // The mapping itself starts two pages earlier (address and file offset);
  // trigger_vma points past those two pages.
  char *trigger_vma = (char *)SYSCHK(mmap((void *)(0x200000 - 0x2000),
      0x200000 + 0x2000, PROT_READ, MAP_PRIVATE|MAP_FIXED_NOREPLACE, fd,
      0x400000 - 0x2000)) + 0x2000;
  SYSCHK(madvise(trigger_vma-0x2000, 0x200000+0x2000, MADV_DONTFORK));

  // create mapping1 (8 MiB, full file)
  char *mapping1 = SYSCHK(mmap((void *)0x1000000, 0x800000,
      PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED_NOREPLACE, fd, 0));
  // create anon page and anon_vma in mapping1 (at offset 4 MiB)
  *(volatile char *)(mapping1 + 0x400000) = 1;

  // make it so that we end up with two VMAs that share an anon_vma and have the
  // same offset range.
  // start with a placeholder mapping, then mremap over it.
  char *mapping2 = SYSCHK(mmap((void *)0x2000000, 0x800000, PROT_NONE,
      MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED_NOREPLACE, -1, 0));
  // move
  SYSCHK(mremap(mapping1, 0x400000, 0x400000, MREMAP_MAYMOVE|MREMAP_FIXED, mapping2));
  // expand (free the second half of the placeholder first so the in-place
  // mremap below has room to grow into)
  SYSCHK(munmap(mapping2+0x400000, 0x400000));
  SYSCHK(mremap(mapping2, 0x400000, 0x800000, 0));
  // trim
  SYSCHK(munmap(mapping2, 0x400000));

  // replace first half of mapping2 with a new vma, not mergeable but
  // anonvma-mergeable.
  // with an extra page in front to make it go first in the anon_vma interval
  // tree.
  SYSCHK(mmap(mapping2+0x400000-0x1000, 0x1000+0x200000,
      PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_FIXED, fd,
      0x400000-0x1000));

  // don't inherit either half of mapping2
  SYSCHK(madvise(mapping2+0x400000-0x1000, 0x400000+0x1000, MADV_DONTFORK));

  // inherit mapping1 in child
  int child_cont_fd = SYSCHK(eventfd(0, EFD_SEMAPHORE));
  pid_t child = SYSCHK(fork());
  if (child == 0) {
    // die with the parent; the getppid() check covers the case where the
    // parent was already gone before PR_SET_PDEATHSIG took effect
    SYSCHK(prctl(PR_SET_PDEATHSIG, SIGKILL));
    if (getppid() == 1)
      exit(1);

    // block until the parent signals via the eventfd
    eventfd_t dummy_val;
    SYSCHK(eventfd_read(child_cont_fd, &dummy_val));

    fprintf(kmsg_file, "child hitting rmap...\n");
    prctl(PR_SET_NAME, "TEST1");
    SYSCHK(madvise(mapping1+0x400000, 0x1000, MADV_PAGEOUT));
    prctl(PR_SET_NAME, "thp_newbug_child");
    fprintf(kmsg_file, "child done with rmap\n");

    sleep(10);

    // dump the start of both processes' smaps for inspection
    char cmd[1000];
    int cmd_len = snprintf(cmd, sizeof cmd,
        "echo; echo =========== PARENT ===========; head -n120 /proc/%d/smaps; echo; echo =========== CHILD ===========; head -n60 /proc/%d/smaps",
        (int)getppid(), (int)getpid());
    if (cmd_len < 0 || (size_t)cmd_len >= sizeof cmd)
      errx(1, "smaps dump command truncated");
    system(cmd);

    exit(0);
  }

  // populate PTEs in trigger_vma and mapping2 with shmem pages
  // (volatile reads whose values are discarded; only the access matters)
  for (int i=0; i<512; i++)
    ((volatile char *)trigger_vma)[i*0x1000];
  for (int i=0; i<512; i++)
    ((volatile char *)mapping2)[0x400000 + i*0x1000];

  // get rid of parent's mapping of anon page so that MADV_PAGEOUT works
  SYSCHK(madvise(mapping1+0x400000, 0x1000, MADV_DONTNEED));

  sleep(1);

  // enable hugepage support for trigger_vma to make khugepaged look at us
  SYSCHK(madvise(trigger_vma-0x2000, 0x200000+0x2000, MADV_HUGEPAGE));

  // wait for khugepaged (relies on first vma being trigger VMA)
  // polls every 100 ms until the first Rss value in smaps reads 0
  int parent_smaps_fd = SYSCHK(open("/proc/self/smaps", O_RDONLY));
  while (1) {
    long rss = first_rss_kb(parent_smaps_fd);
    printf("Rss = %ld kB\n", rss);
    if (rss == 0)
      break;
    usleep(1000*100);
  }

  fprintf(kmsg_file, "khugepaged is about to write_trylock! associating anon_vma...\n");
  // write to the extra page in front of mapping2's second half
  *(volatile char *)(mapping2 + 0x400000 - 0x1000) = 1;
  fprintf(kmsg_file, "anon_vma associated, telling child to hit rmap now...\n");
  SYSCHK(eventfd_write(child_cont_fd, 1));

  // wait for child
  int wstatus;
  SYSCHK(waitpid(child, &wstatus, 0));
  return 0;
}