From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752690AbdJSNVC (ORCPT ); Thu, 19 Oct 2017 09:21:02 -0400 Received: from mail-oi0-f65.google.com ([209.85.218.65]:47425 "EHLO mail-oi0-f65.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751501AbdJSNVA (ORCPT ); Thu, 19 Oct 2017 09:21:00 -0400 X-Google-Smtp-Source: ABhQp+RmF2GPgXWRPqJLEZ0ziu2YgoLV+ulu9DLGb60/zao6B2S+ZR9UpKX5Kj9GuZbSkol/pgU0Oj8n+TGZXovVbkQ= MIME-Version: 1.0 In-Reply-To: <20171019073050.GC29091@outlook.office365.com> References: <1507760379-21662-2-git-send-email-gs051095@gmail.com> <20171019073050.GC29091@outlook.office365.com> From: Gargi Sharma Date: Thu, 19 Oct 2017 14:20:28 +0100 Message-ID: Subject: Re: [v6,1/2] pid: Replace pid bitmap implementation with IDR API To: Andrei Vagin Cc: linux-kernel@vger.kernel.org, Rik van Riel , Julia Lawall , Andrew Morton , mingo@kernel.org, pasha.tatashin@oracle.com, ktkhai@virtuozzo.com, Oleg Nesterov , "Eric W. Biederman" , Christoph Hellwig , lkp@intel.com, tony.luck@intel.com Content-Type: text/plain; charset="UTF-8" Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Thu, Oct 19, 2017 at 8:30 AM, Andrei Vagin wrote: > Hi Gargi, > > This patch breaks CRIU, because it changes a meaning of ns_last_pid. > > ========================== Run zdtm/static/env00 in h ========================== > DEP env00.d > CC env00.o > LINK env00 > Start test > ./env00 --pidfile=env00.pid --outfile=env00.out --envname=ENV_00_TEST > Run criu dump > Run criu restore > =[log]=> dump/zdtm/static/env00/52/1/restore.log > ------------------------ grep Error ------------------------ > (00.000587) No mountpoints-6.img image > (00.000593) mnt: Reading mountpoint images (id 6 pid 52) > (00.000653) Forking task with 52 pid (flags 0x0) > (00.007568) PID: real 51 virt 52 > (00.010363) 52: Error (criu/cr-restore.c:1787): Pid 51 do not match expected 52 (task 52) > (00.010474) Error (criu/cr-restore.c:2449): Restoring FAILED. > ------------------------ ERROR OVER ------------------------ > > Before this patch, ns_last_pid contains a pid of a last process. With > this patch, it contains a pid of a "next" process. > > In CRIU we use ns_last_pid to restore a process with a specified pid, > and now this logic is broken: Hi Andrei, We looked at the uses of last_pid and did not find critical uses inside the kernel, hence figured it'll be okay to change it to store the next pid. But that was not true, I will fix this in the next version. Thanks, Gargi > > $ uname -a > Linux laptop 4.11.11-200.fc25.x86_64 #1 SMP Mon Jul 17 17:41:12 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux > $ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$' > 20000 > > $ uname -a > Linux fc24 4.14.0-rc5-next-20171018 #1 SMP Wed Oct 18 23:52:43 PDT 2017 x86_64 x86_64 x86_64 GNU/Linux > $ echo 19999 > /proc/sys/kernel/ns_last_pid && sh -c 'echo $$' > 19999 > > Thanks, > Andrei > > On Wed, Oct 11, 2017 at 06:19:38PM -0400, Gargi Sharma wrote: >> This patch replaces the current bitmap implemetation for >> Process ID allocation. Functions that are no longer required, >> for example, free_pidmap(), alloc_pidmap(), etc. are removed. >> The rest of the functions are modified to use the IDR API. >> The change was made to make the PID allocation less complex by >> replacing custom code with calls to generic API. >> >> Signed-off-by: Gargi Sharma >> Reviewed-by: Rik van Riel >> --- >> arch/powerpc/platforms/cell/spufs/sched.c | 2 +- >> fs/proc/loadavg.c | 2 +- >> include/linux/pid_namespace.h | 14 +-- >> init/main.c | 2 +- >> kernel/pid.c | 201 ++++++------------------------ >> kernel/pid_namespace.c | 44 +++---- >> 6 files changed, 57 insertions(+), 208 deletions(-) >> >> diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c >> index 1fbb5da..e47761c 100644 >> --- a/arch/powerpc/platforms/cell/spufs/sched.c >> +++ b/arch/powerpc/platforms/cell/spufs/sched.c >> @@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) >> LOAD_INT(c), LOAD_FRAC(c), >> count_active_contexts(), >> atomic_read(&nr_spu_contexts), >> - task_active_pid_ns(current)->last_pid); >> + idr_get_cursor(&task_active_pid_ns(current)->idr)); >> return 0; >> } >> >> diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c >> index 983fce5..ba3d0e2 100644 >> --- a/fs/proc/loadavg.c >> +++ b/fs/proc/loadavg.c >> @@ -23,7 +23,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) >> LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), >> LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), >> nr_running(), nr_threads, >> - task_active_pid_ns(current)->last_pid); >> + idr_get_cursor(&task_active_pid_ns(current)->idr)); >> return 0; >> } >> >> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h >> index b09136f..f4db4a7 100644 >> --- a/include/linux/pid_namespace.h >> +++ b/include/linux/pid_namespace.h >> @@ -9,15 +9,8 @@ >> #include >> #include >> #include >> +#include >> >> -struct pidmap { >> - atomic_t nr_free; >> - void *page; >> -}; >> - >> -#define BITS_PER_PAGE (PAGE_SIZE * 8) >> -#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) >> -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE) >> >> struct fs_pin; >> >> @@ -29,9 +22,8 @@ enum { /* definitions for pid_namespace's hide_pid field */ >> >> struct pid_namespace { >> struct kref kref; >> - struct pidmap pidmap[PIDMAP_ENTRIES]; >> + struct idr idr; >> struct rcu_head rcu; >> - int last_pid; >> unsigned int nr_hashed; >> struct task_struct *child_reaper; >> struct kmem_cache *pid_cachep; >> @@ -105,6 +97,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) >> >> extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); >> void pidhash_init(void); >> -void pidmap_init(void); >> +void pid_idr_init(void); >> >> #endif /* _LINUX_PID_NS_H */ >> diff --git a/init/main.c b/init/main.c >> index 0ee9c686..9f4db20 100644 >> --- a/init/main.c >> +++ b/init/main.c >> @@ -667,7 +667,7 @@ asmlinkage __visible void __init start_kernel(void) >> if (late_time_init) >> late_time_init(); >> calibrate_delay(); >> - pidmap_init(); >> + pid_idr_init(); >> anon_vma_init(); >> acpi_early_init(); >> #ifdef CONFIG_X86 >> diff --git a/kernel/pid.c b/kernel/pid.c >> index 020dedb..0ce5936 100644 >> --- a/kernel/pid.c >> +++ b/kernel/pid.c >> @@ -39,6 +39,7 @@ >> #include >> #include >> #include >> +#include >> >> #define pid_hashfn(nr, ns) \ >> hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) >> @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; >> int pid_max_min = RESERVED_PIDS + 1; >> int pid_max_max = PID_MAX_LIMIT; >> >> -static inline int mk_pid(struct pid_namespace *pid_ns, >> - struct pidmap *map, int off) >> -{ >> - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; >> -} >> - >> -#define find_next_offset(map, off) \ >> - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) >> >> /* >> * PID-map pages start out as NULL, they get allocated upon >> @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, >> */ >> struct pid_namespace init_pid_ns = { >> .kref = KREF_INIT(2), >> - .pidmap = { >> - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } >> - }, >> - .last_pid = 0, >> + .idr = IDR_INIT, >> .nr_hashed = PIDNS_HASH_ADDING, >> .level = 0, >> .child_reaper = &init_task, >> @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); >> >> static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); >> >> -static void free_pidmap(struct upid *upid) >> -{ >> - int nr = upid->nr; >> - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; >> - int offset = nr & BITS_PER_PAGE_MASK; >> - >> - clear_bit(offset, map->page); >> - atomic_inc(&map->nr_free); >> -} >> - >> -/* >> - * If we started walking pids at 'base', is 'a' seen before 'b'? >> - */ >> -static int pid_before(int base, int a, int b) >> -{ >> - /* >> - * This is the same as saying >> - * >> - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT >> - * and that mapping orders 'a' and 'b' with respect to 'base'. >> - */ >> - return (unsigned)(a - base) < (unsigned)(b - base); >> -} >> - >> -/* >> - * We might be racing with someone else trying to set pid_ns->last_pid >> - * at the pid allocation time (there's also a sysctl for this, but racing >> - * with this one is OK, see comment in kernel/pid_namespace.c about it). >> - * We want the winner to have the "later" value, because if the >> - * "earlier" value prevails, then a pid may get reused immediately. >> - * >> - * Since pids rollover, it is not sufficient to just pick the bigger >> - * value. We have to consider where we started counting from. >> - * >> - * 'base' is the value of pid_ns->last_pid that we observed when >> - * we started looking for a pid. >> - * >> - * 'pid' is the pid that we eventually found. >> - */ >> -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) >> -{ >> - int prev; >> - int last_write = base; >> - do { >> - prev = last_write; >> - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); >> - } while ((prev != last_write) && (pid_before(base, last_write, pid))); >> -} >> - >> -static int alloc_pidmap(struct pid_namespace *pid_ns) >> -{ >> - int i, offset, max_scan, pid, last = pid_ns->last_pid; >> - struct pidmap *map; >> - >> - pid = last + 1; >> - if (pid >= pid_max) >> - pid = RESERVED_PIDS; >> - offset = pid & BITS_PER_PAGE_MASK; >> - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; >> - /* >> - * If last_pid points into the middle of the map->page we >> - * want to scan this bitmap block twice, the second time >> - * we start with offset == 0 (or RESERVED_PIDS). >> - */ >> - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; >> - for (i = 0; i <= max_scan; ++i) { >> - if (unlikely(!map->page)) { >> - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); >> - /* >> - * Free the page if someone raced with us >> - * installing it: >> - */ >> - spin_lock_irq(&pidmap_lock); >> - if (!map->page) { >> - map->page = page; >> - page = NULL; >> - } >> - spin_unlock_irq(&pidmap_lock); >> - kfree(page); >> - if (unlikely(!map->page)) >> - return -ENOMEM; >> - } >> - if (likely(atomic_read(&map->nr_free))) { >> - for ( ; ; ) { >> - if (!test_and_set_bit(offset, map->page)) { >> - atomic_dec(&map->nr_free); >> - set_last_pid(pid_ns, last, pid); >> - return pid; >> - } >> - offset = find_next_offset(map, offset); >> - if (offset >= BITS_PER_PAGE) >> - break; >> - pid = mk_pid(pid_ns, map, offset); >> - if (pid >= pid_max) >> - break; >> - } >> - } >> - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { >> - ++map; >> - offset = 0; >> - } else { >> - map = &pid_ns->pidmap[0]; >> - offset = RESERVED_PIDS; >> - if (unlikely(last == offset)) >> - break; >> - } >> - pid = mk_pid(pid_ns, map, offset); >> - } >> - return -EAGAIN; >> -} >> - >> -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) >> -{ >> - int offset; >> - struct pidmap *map, *end; >> - >> - if (last >= PID_MAX_LIMIT) >> - return -1; >> - >> - offset = (last + 1) & BITS_PER_PAGE_MASK; >> - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; >> - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; >> - for (; map < end; map++, offset = 0) { >> - if (unlikely(!map->page)) >> - continue; >> - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); >> - if (offset < BITS_PER_PAGE) >> - return mk_pid(pid_ns, map, offset); >> - } >> - return -1; >> -} >> - >> void put_pid(struct pid *pid) >> { >> struct pid_namespace *ns; >> @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) >> struct upid *upid = pid->numbers + i; >> struct pid_namespace *ns = upid->ns; >> hlist_del_rcu(&upid->pid_chain); >> - switch(--ns->nr_hashed) { >> + switch (--ns->nr_hashed) { >> case 2: >> case 1: >> /* When all that is left in the pid namespace >> @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) >> schedule_work(&ns->proc_work); >> break; >> } >> + >> + idr_remove(&ns->idr, upid->nr); >> } >> spin_unlock_irqrestore(&pidmap_lock, flags); >> >> - for (i = 0; i <= pid->level; i++) >> - free_pidmap(pid->numbers + i); >> - >> call_rcu(&pid->rcu, delayed_put_pid); >> } >> >> @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) >> >> tmp = ns; >> pid->level = ns->level; >> + >> for (i = ns->level; i >= 0; i--) { >> - nr = alloc_pidmap(tmp); >> + int pid_min = 1; >> + >> + idr_preload(GFP_KERNEL); >> + spin_lock_irq(&pidmap_lock); >> + >> + /* >> + * init really needs pid 1, but after reaching the maximum >> + * wrap back to RESERVED_PIDS >> + */ >> + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) >> + pid_min = RESERVED_PIDS; >> + >> + /* >> + * Store a null pointer so find_pid_ns does not find >> + * a partially initialized PID (see below). >> + */ >> + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, >> + pid_max, GFP_ATOMIC); >> + spin_unlock_irq(&pidmap_lock); >> + idr_preload_end(); >> + >> if (nr < 0) { >> retval = nr; >> goto out_free; >> @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) >> for ( ; upid >= pid->numbers; --upid) { >> hlist_add_head_rcu(&upid->pid_chain, >> &pid_hash[pid_hashfn(upid->nr, upid->ns)]); >> + /* Make the PID visible to find_pid_ns. */ >> + idr_replace(&upid->ns->idr, pid, upid->nr); >> upid->ns->nr_hashed++; >> } >> spin_unlock_irq(&pidmap_lock); >> @@ -350,8 +230,11 @@ struct pid *alloc_pid(struct pid_namespace *ns) >> put_pid_ns(ns); >> >> out_free: >> + spin_lock_irq(&pidmap_lock); >> while (++i <= ns->level) >> - free_pidmap(pid->numbers + i); >> + idr_remove(&ns->idr, (pid->numbers + i)->nr); >> + >> + spin_unlock_irq(&pidmap_lock); >> >> kmem_cache_free(ns->pid_cachep, pid); >> return ERR_PTR(retval); >> @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); >> */ >> struct pid *find_ge_pid(int nr, struct pid_namespace *ns) >> { >> - struct pid *pid; >> - >> - do { >> - pid = find_pid_ns(nr, ns); >> - if (pid) >> - break; >> - nr = next_pidmap(ns, nr); >> - } while (nr > 0); >> - >> - return pid; >> + return idr_get_next(&ns->idr, &nr); >> } >> >> /* >> @@ -578,7 +452,7 @@ void __init pidhash_init(void) >> 0, 4096); >> } >> >> -void __init pidmap_init(void) >> +void __init pid_idr_init(void) >> { >> /* Verify no one has done anything silly: */ >> BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); >> @@ -590,10 +464,7 @@ void __init pidmap_init(void) >> PIDS_PER_CPU_MIN * num_possible_cpus()); >> pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); >> >> - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); >> - /* Reserve PID 0. We never call free_pidmap(0) */ >> - set_bit(0, init_pid_ns.pidmap[0].page); >> - atomic_dec(&init_pid_ns.pidmap[0].nr_free); >> + idr_init(&init_pid_ns.idr); >> >> init_pid_ns.pid_cachep = KMEM_CACHE(pid, >> SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); >> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c >> index 4918314..5009dbe 100644 >> --- a/kernel/pid_namespace.c >> +++ b/kernel/pid_namespace.c >> @@ -21,6 +21,7 @@ >> #include >> #include >> #include >> +#include >> >> struct pid_cache { >> int nr_ids; >> @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns >> struct pid_namespace *ns; >> unsigned int level = parent_pid_ns->level + 1; >> struct ucounts *ucounts; >> - int i; >> int err; >> >> err = -EINVAL; >> @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns >> if (ns == NULL) >> goto out_dec; >> >> - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); >> - if (!ns->pidmap[0].page) >> - goto out_free; >> + idr_init(&ns->idr); >> >> ns->pid_cachep = create_pid_cachep(level + 1); >> if (ns->pid_cachep == NULL) >> - goto out_free_map; >> + goto out_free_idr; >> >> err = ns_alloc_inum(&ns->ns); >> if (err) >> - goto out_free_map; >> + goto out_free_idr; >> ns->ns.ops = &pidns_operations; >> >> kref_init(&ns->kref); >> @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns >> ns->nr_hashed = PIDNS_HASH_ADDING; >> INIT_WORK(&ns->proc_work, proc_cleanup_work); >> >> - set_bit(0, ns->pidmap[0].page); >> - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); >> - >> - for (i = 1; i < PIDMAP_ENTRIES; i++) >> - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); >> - >> return ns; >> >> -out_free_map: >> - kfree(ns->pidmap[0].page); >> -out_free: >> +out_free_idr: >> + idr_destroy(&ns->idr); >> kmem_cache_free(pid_ns_cachep, ns); >> out_dec: >> dec_pid_namespaces(ucounts); >> @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) >> >> static void destroy_pid_namespace(struct pid_namespace *ns) >> { >> - int i; >> - >> ns_free_inum(&ns->ns); >> - for (i = 0; i < PIDMAP_ENTRIES; i++) >> - kfree(ns->pidmap[i].page); >> + >> + idr_destroy(&ns->idr); >> call_rcu(&ns->rcu, delayed_free_pidns); >> } >> >> @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) >> int rc; >> struct task_struct *task, *me = current; >> int init_pids = thread_group_leader(me) ? 1 : 2; >> + struct pid *pid; >> >> /* Don't allow any more processes into the pid namespace */ >> disable_pid_allocation(pid_ns); >> @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) >> * maintain a tasklist for each pid namespace. >> * >> */ >> + rcu_read_lock(); >> read_lock(&tasklist_lock); >> - nr = next_pidmap(pid_ns, 1); >> - while (nr > 0) { >> - rcu_read_lock(); >> - >> - task = pid_task(find_vpid(nr), PIDTYPE_PID); >> + nr = 2; >> + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { >> + task = pid_task(pid, PIDTYPE_PID); >> if (task && !__fatal_signal_pending(task)) >> send_sig_info(SIGKILL, SEND_SIG_FORCED, task); >> - >> - rcu_read_unlock(); >> - >> - nr = next_pidmap(pid_ns, nr); >> } >> read_unlock(&tasklist_lock); >> + rcu_read_unlock(); >> >> /* >> * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. >> @@ -311,7 +297,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, >> * it should synchronize its usage with external means. >> */ >> >> - tmp.data = &pid_ns->last_pid; >> + tmp.data = &pid_ns->idr.idr_next; >> return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); >> } >>