From: Cedric Le Goater <legoater@free.fr>
To: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>,
linux-api@vger.kernel.org, containers@lists.linux-foundation.org,
mpm@selenic.com, linux-kernel@vger.kernel.org,
linux-mm@kvack.org, tglx@linutronix.de, viro@zeniv.linux.org.uk,
hpa@zytor.com, Andrew Morton <akpm@linux-foundation.org>,
torvalds@linux-foundation.org, mingo@elte.hu, xemul@openvz.org
Subject: Re: How much of a mess does OpenVZ make? ;) Was: What can OpenVZ do?
Date: Wed, 11 Mar 2009 09:26:28 +0100 [thread overview]
Message-ID: <49B775B4.1040800@free.fr> (raw)
In-Reply-To: <20090310215305.GA2078@x200.localdomain>
Alexey Dobriyan wrote:
> On Thu, Feb 26, 2009 at 06:57:55PM +0300, Alexey Dobriyan wrote:
>> On Thu, Feb 12, 2009 at 03:04:05PM -0800, Dave Hansen wrote:
>>> dave@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... kernel/cpt/ | diffstat
>
>>> 47 files changed, 20702 insertions(+)
>>>
>>> One important thing that leaves out is the interaction that this code
>>> has with the rest of the kernel. That's critically important when
>>> considering long-term maintenance, and I'd be curious how the OpenVZ
>>> folks view it.
>> OpenVZ as-is in some cases wants some functions to be made global
>> (and if C/R code will be modular, exported). Or probably several
>> iterators added.
>>
>> But it's negligible amount of changes compared to main code.
>
> Here is what C/R code wants from pid allocator.
>
> With the introduction of hierarchical PID namespaces, struct pid can
> have not one but many numbers -- tuple (pid_0, pid_1, ..., pid_N),
> where pid_i is pid number in pid_ns which has level i.
>
> Now root pid_ns of container has level n -- numbers from level n to N
> inclusively should be dumped and restored.
>
> During struct pid creation first n-1 numbers can be anything, because the're
> outside of pid_ns, but the rest should be the same.
>
> Code will be ifdeffed and commented, but anyhow, this is an example of
> change C/R will require from the rest of the kernel.
>
>
>
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -182,6 +182,34 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
> return -1;
> }
>
> +static int set_pidmap(struct pid_namespace *pid_ns, pid_t pid)
> +{
> + int offset;
> + struct pidmap *map;
> +
> + offset = pid & BITS_PER_PAGE_MASK;
> + map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
> + if (unlikely(!map->page)) {
> + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> + /*
> + * Free the page if someone raced with us
> + * installing it:
> + */
> + spin_lock_irq(&pidmap_lock);
> + if (map->page)
> + kfree(page);
> + else
> + map->page = page;
> + spin_unlock_irq(&pidmap_lock);
> + if (unlikely(!map->page))
> + return -ENOMEM;
> + }
> + if (test_and_set_bit(offset, map->page))
> + return -EBUSY;
> + atomic_dec(&map->nr_free);
> + return pid;
> +}
> +
> int next_pidmap(struct pid_namespace *pid_ns, int last)
> {
> int offset;
> @@ -239,7 +267,7 @@ void free_pid(struct pid *pid)
> call_rcu(&pid->rcu, delayed_put_pid);
> }
>
> -struct pid *alloc_pid(struct pid_namespace *ns)
> +struct pid *alloc_pid(struct pid_namespace *ns, int *cr_nr, unsigned int cr_level)
> {
> struct pid *pid;
> enum pid_type type;
> @@ -253,7 +281,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
>
> tmp = ns;
> for (i = ns->level; i >= 0; i--) {
> - nr = alloc_pidmap(tmp);
> + if (cr_nr && ns->level - i <= cr_level)
> + nr = set_pidmap(tmp, cr_nr[ns->level - i]);
> + else
> + nr = alloc_pidmap(tmp);
> if (nr < 0)
> goto out_free;
This patch supposes that the process is restored in a state which took several
clone(CLONE_NEWPID) to reach. if you replay these clone(), which is what restart
is at the end : an optimized replay, you would only need something like below.
Index: 2.6.git/kernel/pid.c
===================================================================
--- 2.6.git.orig/kernel/pid.c
+++ 2.6.git/kernel/pid.c
@@ -122,12 +122,12 @@ static void free_pidmap(struct upid *upi
atomic_inc(&map->nr_free);
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+static int alloc_pidmap(struct pid_namespace *pid_ns, pid_t upid)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
- pid = last + 1;
+ pid = upid ? upid : last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
@@ -239,7 +239,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t next_pid)
{
struct pid *pid;
enum pid_type type;
@@ -253,10 +253,15 @@ struct pid *alloc_pid(struct pid_namespa
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ nr = alloc_pidmap(tmp, next_pid);
if (nr < 0)
goto out_free;
+ /* The next_pid is only applicable for the ns namespace, not
+ * its parents.
+ */
+ next_pid = 0;
+
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
Well, that's how we do it but I'm not against your patch. It fits our need also.
It's just a bit intrusive for the pid bitmap. if we mix both path, we get something
like this fake patch, which is a bit less intrusive IMO. not tested though.
@@ -122,12 +122,12 @@ static void free_pidmap(struct upid *upi
atomic_inc(&map->nr_free);
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+static int alloc_pidmap(struct pid_namespace *pid_ns, pid_t upid)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
- pid = last + 1;
+ pid = upid ? upid : last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
offset = pid & BITS_PER_PAGE_MASK;
@@ -239,7 +267,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int *cr_nr, unsigned int cr_level)
{
struct pid *pid;
enum pid_type type;
@@ -253,7 +281,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ if (cr_nr && ns->level - i <= cr_level)
+ nr = alloc_pidmap(tmp, cr_nr[ns->level - i]);
+ if (nr != cr_nr[ns->level - i])
+ return -EBUSY;
+ else
+ nr = alloc_pidmap(tmp);
if (nr < 0)
goto out_free;
next prev parent reply other threads:[~2009-03-11 8:27 UTC|newest]
Thread overview: 120+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-01-27 17:07 [RFC v13][PATCH 00/14] Kernel based checkpoint/restart Oren Laadan
2009-01-27 17:07 ` [RFC v13][PATCH 01/14] Create syscalls: sys_checkpoint, sys_restart Oren Laadan
2009-01-27 17:20 ` Randy Dunlap
2009-01-27 17:08 ` [RFC v13][PATCH 02/14] Checkpoint/restart: initial documentation Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 03/14] Make file_pos_read/write() public Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 04/14] General infrastructure for checkpoint restart Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 05/14] x86 support for checkpoint/restart Oren Laadan
2009-02-24 7:47 ` Nathan Lynch
2009-02-24 16:06 ` Dave Hansen
2009-03-18 7:21 ` Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 06/14] Dump memory address space Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 07/14] Restore " Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 08/14] Infrastructure for shared objects Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 09/14] Dump open file descriptors Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 10/14] Restore open file descriprtors Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 11/14] External checkpoint of a task other than ourself Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 12/14] Track in-kernel when we expect checkpoint/restart to work Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 13/14] Checkpoint multiple processes Oren Laadan
2009-01-27 17:08 ` [RFC v13][PATCH 14/14] Restart " Oren Laadan
2009-02-10 17:05 ` [RFC v13][PATCH 00/14] Kernel based checkpoint/restart Dave Hansen
2009-02-11 22:14 ` Andrew Morton
2009-02-12 9:17 ` Ingo Molnar
2009-02-12 18:11 ` Dave Hansen
2009-02-12 20:48 ` Serge E. Hallyn
2009-02-13 10:20 ` Ingo Molnar
2009-02-12 18:11 ` Dave Hansen
2009-02-12 19:30 ` Matt Mackall
2009-02-12 19:42 ` Andrew Morton
2009-02-12 21:51 ` What can OpenVZ do? Dave Hansen
2009-02-12 22:10 ` Andrew Morton
2009-02-12 23:04 ` How much of a mess does OpenVZ make? ;) Was: " Dave Hansen
2009-02-26 15:57 ` Alexey Dobriyan
2009-03-10 21:53 ` Alexey Dobriyan
2009-03-10 23:28 ` Serge E. Hallyn
2009-03-11 8:26 ` Cedric Le Goater [this message]
2009-03-12 14:53 ` Serge E. Hallyn
2009-03-12 21:01 ` Greg Kurz
2009-03-12 21:21 ` Serge E. Hallyn
2009-03-13 4:29 ` Ying Han
2009-03-13 5:34 ` Sukadev Bhattiprolu
2009-03-13 6:19 ` Ying Han
2009-03-13 17:27 ` Linus Torvalds
2009-03-13 19:02 ` Serge E. Hallyn
2009-03-13 19:35 ` Alexey Dobriyan
2009-03-13 21:01 ` Linus Torvalds
2009-03-13 21:51 ` Dave Hansen
2009-03-13 22:15 ` Oren Laadan
2009-03-14 0:27 ` Eric W. Biederman
2009-03-14 8:12 ` Ingo Molnar
2009-03-16 22:33 ` Kevin Fox
2009-03-19 21:19 ` Eric W. Biederman
2009-03-14 0:20 ` Alexey Dobriyan
2009-03-14 8:25 ` Ingo Molnar
2009-03-16 6:01 ` Oren Laadan
2009-03-13 20:48 ` Mike Waychison
2009-03-13 22:35 ` Oren Laadan
2009-03-18 18:54 ` Mike Waychison
2009-03-18 19:04 ` Oren Laadan
2009-03-13 15:27 ` Cedric Le Goater
2009-03-13 17:11 ` Greg Kurz
2009-03-13 17:37 ` Serge E. Hallyn
2009-03-13 15:47 ` Cedric Le Goater
2009-03-13 16:35 ` Serge E. Hallyn
2009-03-13 16:53 ` Cedric Le Goater
2009-02-26 16:27 ` Alexey Dobriyan
2009-02-26 17:33 ` Ingo Molnar
2009-02-26 18:30 ` Greg Kurz
2009-02-26 22:17 ` Alexey Dobriyan
2009-02-27 9:19 ` Greg Kurz
2009-02-27 10:53 ` Alexey Dobriyan
2009-02-27 14:33 ` Cedric Le Goater
2009-02-27 9:36 ` Cedric Le Goater
2009-02-26 22:31 ` Alexey Dobriyan
2009-02-27 9:03 ` Ingo Molnar
2009-02-27 9:19 ` Andrew Morton
2009-02-27 10:57 ` Alexey Dobriyan
2009-02-27 9:22 ` Andrew Morton
2009-02-27 10:59 ` Alexey Dobriyan
2009-02-27 16:14 ` Dave Hansen
2009-02-27 21:57 ` Alexey Dobriyan
2009-02-27 21:54 ` Dave Hansen
2009-03-01 1:33 ` Alexey Dobriyan
2009-03-01 20:02 ` Serge E. Hallyn
2009-03-01 20:56 ` Alexey Dobriyan
2009-03-01 22:21 ` Serge E. Hallyn
2009-03-03 16:17 ` Cedric Le Goater
2009-03-03 18:28 ` Serge E. Hallyn
2009-02-13 10:53 ` Ingo Molnar
2009-02-16 20:51 ` Dave Hansen
2009-02-17 22:23 ` Ingo Molnar
2009-02-17 22:30 ` Dave Hansen
2009-02-18 0:32 ` Ingo Molnar
2009-02-18 0:40 ` Dave Hansen
2009-02-18 5:11 ` Alexey Dobriyan
2009-02-18 18:16 ` Ingo Molnar
2009-02-18 21:27 ` Dave Hansen
2009-02-18 23:15 ` Ingo Molnar
2009-02-19 19:06 ` Banning checkpoint (was: Re: What can OpenVZ do?) Alexey Dobriyan
2009-02-19 19:11 ` Dave Hansen
2009-02-24 4:47 ` Alexey Dobriyan
2009-02-24 5:11 ` Dave Hansen
2009-02-24 15:43 ` Serge E. Hallyn
2009-02-24 20:09 ` Alexey Dobriyan
2009-02-12 22:17 ` What can OpenVZ do? Alexey Dobriyan
2009-02-13 10:27 ` Ingo Molnar
2009-02-13 11:32 ` Alexey Dobriyan
2009-02-13 11:45 ` Ingo Molnar
2009-02-13 22:28 ` Alexey Dobriyan
2009-03-14 0:04 ` Eric W. Biederman
2009-03-14 0:26 ` Serge E. Hallyn
2009-02-12 22:57 ` [RFC v13][PATCH 00/14] Kernel based checkpoint/restart Dave Hansen
2009-02-12 23:05 ` Matt Mackall
2009-02-12 23:13 ` Dave Hansen
2009-02-13 23:28 ` Andrew Morton
2009-02-14 23:08 ` Ingo Molnar
2009-02-14 23:31 ` Andrew Morton
2009-02-14 23:50 ` Ingo Molnar
2009-02-16 17:37 ` Dave Hansen
2009-03-13 2:45 ` Oren Laadan
2009-03-13 3:57 ` Oren Laadan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=49B775B4.1040800@free.fr \
--to=legoater@free.fr \
--cc=adobriyan@gmail.com \
--cc=akpm@linux-foundation.org \
--cc=containers@lists.linux-foundation.org \
--cc=dave@linux.vnet.ibm.com \
--cc=hpa@zytor.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mingo@elte.hu \
--cc=mpm@selenic.com \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
--cc=viro@zeniv.linux.org.uk \
--cc=xemul@openvz.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).