linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH -mm 1/9] unshare system call: system call handler function
@ 2005-12-13 22:54 JANAK DESAI
  2005-12-15 19:52 ` Eric W. Biederman
  0 siblings, 1 reply; 17+ messages in thread
From: JANAK DESAI @ 2005-12-13 22:54 UTC (permalink / raw)
  To: viro, chrisw, dwmw2, jamie, serue, mingo, linuxram, jmorris, sds, janak
  Cc: akpm, linux-kernel


[PATCH -mm 1/9] unshare system call: system call handler function 

sys_unshare system call handler function accepts the same flags as
clone system call, checks constraints on each of the flags and invokes
corresponding unshare functions to disassociate respective process
context if it was being shared with another task. 

Changes since the first submission of this patch on 12/12/05:
	- Moved cleaning up of old shared structures outside of the
	  block that holds task_lock (12/13/05)
 
Signed-off-by: Janak Desai <janak@us.ibm.com>
 
---
 
 fork.c |  232 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 232 insertions(+)
 
diff -Naurp 2.6.15-rc5-mm2/kernel/fork.c 2.6.15-rc5-mm2+patch/kernel/fork.c
--- 2.6.15-rc5-mm2/kernel/fork.c	2005-12-12 03:05:59.000000000 +0000
+++ 2.6.15-rc5-mm2+patch/kernel/fork.c	2005-12-13 18:38:26.000000000 +0000
@@ -1330,3 +1330,235 @@ void __init proc_caches_init(void)
 			sizeof(struct mm_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 }
+
+
+/*
+ * Check constraints on flags passed to the unshare system call and
+ * force unsharing of additional process context as appropriate.
+ */
+static inline void check_unshare_flags(unsigned long *flags_ptr)
+{
+	/*
+	 * If unsharing a thread from a thread group, must also
+	 * unshare vm.
+	 */
+	if (*flags_ptr & CLONE_THREAD)
+		*flags_ptr |= CLONE_VM;
+
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (*flags_ptr & CLONE_VM)
+		*flags_ptr |= CLONE_SIGHAND;
+
+	/*
+	 * If unsharing signal handlers and the task was created
+	 * using CLONE_THREAD, then must unshare the thread
+	 */
+	if ((*flags_ptr & CLONE_SIGHAND) &&
+	    (atomic_read(&current->signal->count) > 1))
+		*flags_ptr |= CLONE_THREAD;
+
+	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (*flags_ptr & CLONE_NEWNS)
+		*flags_ptr |= CLONE_FS;
+}
+
+/*
+ * Unsharing of tasks created with CLONE_THREAD is not supported yet
+ */
+static int unshare_thread(unsigned long unshare_flags)
+{
+	if (unshare_flags & CLONE_THREAD)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of fs info for tasks created with CLONE_FS is not supported yet
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+	struct fs_struct *fs = current->fs;
+
+	if ((unshare_flags & CLONE_FS) &&
+	    (fs && atomic_read(&fs->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of namespace for tasks created without CLONE_NEWNS is not
+ * supported yet
+ */
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+{
+	struct namespace *ns = current->namespace;
+
+	if ((unshare_flags & CLONE_NEWNS) &&
+	    (ns && atomic_read(&ns->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
+ * supported yet
+ */
+static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+{
+	struct sighand_struct *sigh = current->sighand;
+
+	if ((unshare_flags & CLONE_SIGHAND) &&
+	    (sigh && atomic_read(&sigh->count) > 1))
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/*
+ * Unsharing of vm for tasks created with CLONE_VM is not supported yet
+ */
+static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+{
+	struct mm_struct *mm = current->mm;
+
+	if ((unshare_flags & CLONE_VM) &&
+	    (mm && atomic_read(&mm->mm_users) > 1))
+		return -EINVAL;
+
+	return 0;
+
+}
+
+/*
+ * Unsharing of files for tasks created with CLONE_FILES is not supported yet
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+	struct files_struct *fd = current->files;
+
+	if ((unshare_flags & CLONE_FILES) &&
+	    (fd && atomic_read(&fd->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
+ * supported yet
+ */
+static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
+{
+	if (unshare_flags & CLONE_SYSVSEM)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone.  copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+	int err = 0;
+	struct fs_struct *fs, *new_fs = NULL;
+	struct namespace *ns, *new_ns = NULL;
+	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
+	struct files_struct *fd, *new_fd = NULL;
+	struct sem_undo_list *new_ulist = NULL;
+
+	check_unshare_flags(&unshare_flags);
+
+	if ((err = unshare_thread(unshare_flags)))
+		goto bad_unshare_out;
+	if ((err = unshare_fs(unshare_flags, &new_fs)))
+		goto bad_unshare_cleanup_thread;
+	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+		goto bad_unshare_cleanup_fs;
+	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+		goto bad_unshare_cleanup_ns;
+	if ((err = unshare_vm(unshare_flags, &new_mm)))
+		goto bad_unshare_cleanup_sigh;
+	if ((err = unshare_fd(unshare_flags, &new_fd)))
+		goto bad_unshare_cleanup_vm;
+	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
+		goto bad_unshare_cleanup_fd;
+
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+
+		task_lock(current);
+
+		if (new_fs) {
+			fs = current->fs;
+			current->fs = new_fs;
+			new_fs = fs;
+		}
+
+		if (new_ns) {
+			ns = current->namespace;
+			current->namespace = new_ns;
+			new_ns = ns;
+		}
+
+		if (new_sigh) {
+			sigh = current->sighand;
+			current->sighand = new_sigh;
+			new_sigh = sigh;
+		}
+
+		if (new_mm) {
+			mm = current->mm;
+			active_mm = current->active_mm;
+			current->mm = new_mm;
+			current->active_mm = new_mm;
+			activate_mm(active_mm, new_mm);
+			new_mm = mm;
+		}
+
+		if (new_fd) {
+			fd = current->files;
+			current->files = new_fd;
+			new_fd = fd;
+		}
+
+		task_unlock(current);
+	}
+
+bad_unshare_cleanup_fd:
+	if (new_fd)
+		put_files_struct(new_fd);
+
+bad_unshare_cleanup_vm:
+	if (new_mm)
+		mmput(new_mm);
+
+bad_unshare_cleanup_sigh:
+	if (new_sigh)
+		if (atomic_dec_and_test(&new_sigh->count))
+			kmem_cache_free(sighand_cachep, new_sigh);
+
+bad_unshare_cleanup_ns:
+	if (new_ns)
+		put_namespace(new_ns);
+
+bad_unshare_cleanup_fs:
+	if (new_fs)
+		put_fs_struct(new_fs);
+
+bad_unshare_cleanup_thread:
+bad_unshare_out:
+	return err;
+}



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-13 22:54 [PATCH -mm 1/9] unshare system call: system call handler function JANAK DESAI
@ 2005-12-15 19:52 ` Eric W. Biederman
  2005-12-15 20:38   ` JANAK DESAI
  0 siblings, 1 reply; 17+ messages in thread
From: Eric W. Biederman @ 2005-12-15 19:52 UTC (permalink / raw)
  To: janak
  Cc: viro, chrisw, dwmw2, jamie, serue, mingo, linuxram, jmorris, sds,
	akpm, linux-kernel

JANAK DESAI <janak@us.ibm.com> writes:

> [PATCH -mm 1/9] unshare system call: system call handler function 
>
> sys_unshare system call handler function accepts the same flags as
> clone system call, checks constraints on each of the flags and invokes
> corresponding unshare functions to disassociate respective process
> context if it was being shared with another task. 
>
> Changes since the first submission of this patch on 12/12/05:
> 	- Moved cleaning up of old shared structures outside of the
> 	  block that holds task_lock (12/13/05)
>  
> Signed-off-by: Janak Desai <janak@us.ibm.com>
>  
> ---
>  
>  fork.c | 232 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 232 insertions(+)
>  
> diff -Naurp 2.6.15-rc5-mm2/kernel/fork.c 2.6.15-rc5-mm2+patch/kernel/fork.c
> --- 2.6.15-rc5-mm2/kernel/fork.c	2005-12-12 03:05:59.000000000 +0000
> +++ 2.6.15-rc5-mm2+patch/kernel/fork.c	2005-12-13 18:38:26.000000000 +0000
> @@ -1330,3 +1330,235 @@ void __init proc_caches_init(void)
>  			sizeof(struct mm_struct), 0,
>  			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
>  }
> +
> +
> +/*
> + * Check constraints on flags passed to the unshare system call and
> + * force unsharing of additional process context as appropriate.
> + */

If it isn't legal how about we deny the unshare call.
Then we can share this code with clone.

Eric

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 19:52 ` Eric W. Biederman
@ 2005-12-15 20:38   ` JANAK DESAI
  2005-12-15 21:13     ` Eric W. Biederman
  2005-12-15 21:28     ` Jamie Lokier
  0 siblings, 2 replies; 17+ messages in thread
From: JANAK DESAI @ 2005-12-15 20:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: viro, chrisw, dwmw2, jamie, serue, mingo, linuxram, jmorris, sds,
	akpm, linux-kernel

Eric W. Biederman wrote:

>JANAK DESAI <janak@us.ibm.com> writes:
>
>  
>
>>[PATCH -mm 1/9] unshare system call: system call handler function 
>>
>>sys_unshare system call handler function accepts the same flags as
>>clone system call, checks constraints on each of the flags and invokes
>>corresponding unshare functions to disassociate respective process
>>context if it was being shared with another task. 
>>
>>Changes since the first submission of this patch on 12/12/05:
>>	- Moved cleaning up of old shared structures outside of the
>>	  block that holds task_lock (12/13/05)
>> 
>>Signed-off-by: Janak Desai <janak@us.ibm.com>
>> 
>>---
>> 
>> fork.c | 232 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 files changed, 232 insertions(+)
>> 
>>diff -Naurp 2.6.15-rc5-mm2/kernel/fork.c 2.6.15-rc5-mm2+patch/kernel/fork.c
>>--- 2.6.15-rc5-mm2/kernel/fork.c	2005-12-12 03:05:59.000000000 +0000
>>+++ 2.6.15-rc5-mm2+patch/kernel/fork.c	2005-12-13 18:38:26.000000000 +0000
>>@@ -1330,3 +1330,235 @@ void __init proc_caches_init(void)
>> 			sizeof(struct mm_struct), 0,
>> 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
>> }
>>+
>>+
>>+/*
>>+ * Check constraints on flags passed to the unshare system call and
>>+ * force unsharing of additional process context as appropriate.
>>+ */
>>    
>>
>
>If it isn't legal how about we deny the unshare call.
>Then we can share this code with clone.
>
>Eric
>
>
>  
>
unshare is doing the reverse of what clone does. So if you are unsharing 
namespace
you want to make sure that you are not sharing fs. That's why the 
CLONE_FS flag
is forced (meaning you are also going to unshare fs). Since namespace is 
shared
by default and fs is not, if clone is called with CLONE_NEWNS, the 
intent is to
create a new namespace, which means you cannot share fs. clone checks to
makes sure that CLONE_NEWNS and CLONE_FS are not specified together, while
unshare will force CLONE_FS if CLONE_NEWNS is spefcified. Basically you
can have a shared namespace and non shared fs, but not the other way 
around and
since clone and unshare are doing opposite things, sharing code between 
them that
checks constraints on the flags can get convoluted.

-Janak

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 20:38   ` JANAK DESAI
@ 2005-12-15 21:13     ` Eric W. Biederman
  2005-12-15 21:32       ` Jamie Lokier
  2005-12-16  4:32       ` JANAK DESAI
  2005-12-15 21:28     ` Jamie Lokier
  1 sibling, 2 replies; 17+ messages in thread
From: Eric W. Biederman @ 2005-12-15 21:13 UTC (permalink / raw)
  To: JANAK DESAI
  Cc: Eric W. Biederman, viro, chrisw, dwmw2, jamie, serue, mingo,
	linuxram, jmorris, sds, akpm, linux-kernel

JANAK DESAI <janak@us.ibm.com> writes:

> Eric W. Biederman wrote:
>
>>If it isn't legal how about we deny the unshare call.
>>Then we can share this code with clone.
>>
>>Eric
>>
>>
>>
>>
> unshare is doing the reverse of what clone does. So if you are unsharing
> namespace
> you want to make sure that you are not sharing fs. That's why the CLONE_FS flag
> is forced (meaning you are also going to unshare fs). Since namespace is shared
> by default and fs is not, if clone is called with CLONE_NEWNS, the intent is to
> create a new namespace, which means you cannot share fs. clone checks to
> makes sure that CLONE_NEWNS and CLONE_FS are not specified together, while
> unshare will force CLONE_FS if CLONE_NEWNS is spefcified. Basically you
> can have a shared namespace and non shared fs, but not the other way around and
> since clone and unshare are doing opposite things, sharing code between them
> that
> checks constraints on the flags can get convoluted.

I follow but I am very disturbed.

You are leaving CLONE_NEWNS to mean you want a new namespace.

For clone CLONE_FS unset means generate an unshared fs_struct
          CLONE_FS set   means share the fs_struct with the parent

But for unshare CLONE_FS unset means share the fs_struct with others
            and CLONE_FS set   means generate an unshared fs_struct

The meaning of CLONE_FS between the two calls in now flipped,
but CLONE_NEWNS is not.  Please let's not implement it this way.

Part of the problem is the double negative in the name, leading
me to suggest that sys_share might almost be a better name.

So please code don't invert the meaning of the bits.  This will
allow sharing of the sanity checks with clone.

In addition this leaves open the possibility that routines like
copy_fs properly refactored can be shared between clone and unshare.


Eric

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 20:38   ` JANAK DESAI
  2005-12-15 21:13     ` Eric W. Biederman
@ 2005-12-15 21:28     ` Jamie Lokier
  2005-12-16  4:35       ` JANAK DESAI
  1 sibling, 1 reply; 17+ messages in thread
From: Jamie Lokier @ 2005-12-15 21:28 UTC (permalink / raw)
  To: JANAK DESAI
  Cc: Eric W. Biederman, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

JANAK DESAI wrote:
> clone checks to
> makes sure that CLONE_NEWNS and CLONE_FS are not specified together, while
> unshare will force CLONE_FS if CLONE_NEWNS is spefcified.
[ ... ]
> since clone and unshare are doing opposite things, sharing code
> between them that checks constraints on the flags can get
> convoluted.

clone() and unshare() are not doing opposite things.  They do the same
thing, which is to unshare some properties while keeping others
shared, and the only differences are that clone() first creates a new
task, and the defaults for flags that aren't specified.

It is the polarity of _some_ flags which is opposite in your unshare()
API: In your API, CLONE_FS means "unshare fs", while with clone() it
means "share fs".  Same for other flags - except for CLONE_NEWNS,
where unshare() and clone() both take it to mean "unshare ns".

That's a bit of a confusing mixture of polarities, in my opinion.

I think it would be better if this:

	pid = clone(flags);

Could be always equivalent to this:

	pid = clone(CLONE_FLAGS_TO_SHARE_EVERYTHING)
	if (pid == 0)
		unshare(flags);

Or, if you don't like that, then this:

	pid = clone(CLONE_FLAGS_TO_SHARE_EVERYTHING)
	if (pid == 0)
		unshare(~flags);

However, if the API you've chosen for unshare() must be kept, then you
can still have the same checks in clone() and unshare().  Just XOR the
flags word with bits for polarities that are different between the two
calls, before doing the checks, and XOR again afterwards to include
any flags that were forced by the checks.

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 21:13     ` Eric W. Biederman
@ 2005-12-15 21:32       ` Jamie Lokier
  2005-12-15 22:34         ` Eric W. Biederman
  2005-12-16  4:36         ` JANAK DESAI
  2005-12-16  4:32       ` JANAK DESAI
  1 sibling, 2 replies; 17+ messages in thread
From: Jamie Lokier @ 2005-12-15 21:32 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: JANAK DESAI, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Eric W. Biederman wrote:
> I follow but I am very disturbed.
> 
> You are leaving CLONE_NEWNS to mean you want a new namespace.
> 
> For clone CLONE_FS unset means generate an unshared fs_struct
>           CLONE_FS set   means share the fs_struct with the parent
> 
> But for unshare CLONE_FS unset means share the fs_struct with others
>             and CLONE_FS set   means generate an unshared fs_struct
> 
> The meaning of CLONE_FS between the two calls in now flipped,
> but CLONE_NEWNS is not.  Please let's not implement it this way.

I agree.

> Part of the problem is the double negative in the name, leading
> me to suggest that sys_share might almost be a better name.

I agree with that suggestion, too.

Alternatively, we could just add a flag to clone(): CLONE_SELF,
meaning don't create a new task, just modify the properties of the
current task.

> So please code don't invert the meaning of the bits.  This will
> allow sharing of the sanity checks with clone.
> In addition this leaves open the possibility that routines like
> copy_fs properly refactored can be shared between clone and unshare.

And also make the API less confusing to document and use.

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 21:32       ` Jamie Lokier
@ 2005-12-15 22:34         ` Eric W. Biederman
  2005-12-16  4:36         ` JANAK DESAI
  1 sibling, 0 replies; 17+ messages in thread
From: Eric W. Biederman @ 2005-12-15 22:34 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: JANAK DESAI, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Jamie Lokier <jamie@shareable.org> writes:

>
>> Part of the problem is the double negative in the name, leading
>> me to suggest that sys_share might almost be a better name.
>
> I agree with that suggestion, too.
>
> Alternatively, we could just add a flag to clone(): CLONE_SELF,
> meaning don't create a new task, just modify the properties of the
> current task.

Internally I doubt it would make much difference.  There are
real differences from modifying current to copying from current.
Mostly it is ref counting but just enough that CLONE_SELF
is unlikely to be a sane thing to do.

Of course we could always implement spawn.  The syscall with
every possible option :)

Eric

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 21:13     ` Eric W. Biederman
  2005-12-15 21:32       ` Jamie Lokier
@ 2005-12-16  4:32       ` JANAK DESAI
  2005-12-16 10:50         ` Jamie Lokier
  2005-12-16 12:39         ` Eric W. Biederman
  1 sibling, 2 replies; 17+ messages in thread
From: JANAK DESAI @ 2005-12-16  4:32 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: viro, chrisw, dwmw2, jamie, serue, mingo, linuxram, jmorris, sds,
	akpm, linux-kernel

Eric W. Biederman wrote:

>JANAK DESAI <janak@us.ibm.com> writes:
>
>  
>
>>Eric W. Biederman wrote:
>>
>>    
>>
>>>If it isn't legal how about we deny the unshare call.
>>>Then we can share this code with clone.
>>>
>>>Eric
>>>
>>>
>>>
>>>
>>>      
>>>
>>unshare is doing the reverse of what clone does. So if you are unsharing
>>namespace
>>you want to make sure that you are not sharing fs. That's why the CLONE_FS flag
>>is forced (meaning you are also going to unshare fs). Since namespace is shared
>>by default and fs is not, if clone is called with CLONE_NEWNS, the intent is to
>>create a new namespace, which means you cannot share fs. clone checks to
>>makes sure that CLONE_NEWNS and CLONE_FS are not specified together, while
>>unshare will force CLONE_FS if CLONE_NEWNS is spefcified. Basically you
>>can have a shared namespace and non shared fs, but not the other way around and
>>since clone and unshare are doing opposite things, sharing code between them
>>that
>>checks constraints on the flags can get convoluted.
>>    
>>
>
>I follow but I am very disturbed.
>
>You are leaving CLONE_NEWNS to mean you want a new namespace.
>
>For clone CLONE_FS unset means generate an unshared fs_struct
>          CLONE_FS set   means share the fs_struct with the parent
>
>But for unshare CLONE_FS unset means share the fs_struct with others
>            and CLONE_FS set   means generate an unshared fs_struct
>
>The meaning of CLONE_FS between the two calls in now flipped,
>but CLONE_NEWNS is not.  Please let's not implement it this way.
>
>  
>
CLONE_FS unset for unshare doesn't mean that share the fs_struct with
others. It just means that you leave the fs_struct alone (which may or
maynot be shared). I agree that it is confusing, but I am having difficulty
in seeing this reversal of flag meaning. clone creates a second task and
allows you to pick what you want to share with the parent. unshare allows
you to pick what you don't want to share anymore. The "what" in both
cases can be same and still you can end up with a task with "share" state
for a perticular structure. For example, if we #define  FS   CLONE_FS,
then clone(FS) will imply that you want to share fs_struct but unshare(FS)
will imply that we want to unshare fs_struct. Does it still appear that
the flag meaning has changed? clone and unshare are doing different
things, the flag just indicates the structure on which they operate.

>Part of the problem is the double negative in the name, leading
>me to suggest that sys_share might almost be a better name.
>
>  
>
I disagree. You cannot start sharing with this system call. You can
only unshare a shared structure. If you don't specify a flag corresponding
to a structure, that structure stays in its current state which may be
shared or unshared. Since the only action you can perform with this
call is unshare, unshare is the more appropriate name.

>So please code don't invert the meaning of the bits.  This will
>allow sharing of the sanity checks with clone.
>
>In addition this leaves open the possibility that routines like
>copy_fs properly refactored can be shared between clone and unshare.
>
>
>  
>
umm.. I am not sure how you were thinking of refactoring it, but my 
attempt at
using copy_* functions introduced race conditions.

>Eric
>
>
>  
>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 21:28     ` Jamie Lokier
@ 2005-12-16  4:35       ` JANAK DESAI
  0 siblings, 0 replies; 17+ messages in thread
From: JANAK DESAI @ 2005-12-16  4:35 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Eric W. Biederman, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Jamie Lokier wrote:

>JANAK DESAI wrote:
>  
>
>>clone checks to
>>makes sure that CLONE_NEWNS and CLONE_FS are not specified together, while
>>unshare will force CLONE_FS if CLONE_NEWNS is spefcified.
>>    
>>
>[ ... ]
>  
>
>>since clone and unshare are doing opposite things, sharing code
>>between them that checks constraints on the flags can get
>>convoluted.
>>    
>>
>
>clone() and unshare() are not doing opposite things.  They do the same
>thing, which is to unshare some properties while keeping others
>shared, and the only differences are that clone() first creates a new
>task, and the defaults for flags that aren't specified.
>
>  
>
Well .. by opposite I meant one allows you to share specific contexts while
the other allows you to unshare specific contexts. In case of clone, by not
specifying flags you can also unshare, however the same is not true for
the unshare system call. That is, if you don't specify a flag, that 
structure
is left alone in its current state. It does not become shared if it was 
not being
shared to begin with.

>It is the polarity of _some_ flags which is opposite in your unshare()
>API: In your API, CLONE_FS means "unshare fs", while with clone() it
>means "share fs".  Same for other flags - except for CLONE_NEWNS,
>where unshare() and clone() both take it to mean "unshare ns".
>
>That's a bit of a confusing mixture of polarities, in my opinion.
>
>I think it would be better if this:
>
>	pid = clone(flags);
>
>Could be always equivalent to this:
>
>	pid = clone(CLONE_FLAGS_TO_SHARE_EVERYTHING)
>	if (pid == 0)
>		unshare(flags);
>
>Or, if you don't like that, then this:
>
>	pid = clone(CLONE_FLAGS_TO_SHARE_EVERYTHING)
>	if (pid == 0)
>		unshare(~flags);
>
>However, if the API you've chosen for unshare() must be kept, then you
>can still have the same checks in clone() and unshare().  Just XOR the
>flags word with bits for polarities that are different between the two
>calls, before doing the checks, and XOR again afterwards to include
>any flags that were forced by the checks.
>
>  
>
I am in the process of writing up a more detailed description for this 
feature.
Description, which will include requirements, detail design and cost. When I
started, I did try to use as much of the existing code (clone, copy_*) as
possible, but I may have missed things. As I write things down, I will try
and see if I can make flags less confusing.

I am off for the next two weeks with limited email access so don't want to
rearrange stuff now.

>-- Jamie
>
>
>  
>


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-15 21:32       ` Jamie Lokier
  2005-12-15 22:34         ` Eric W. Biederman
@ 2005-12-16  4:36         ` JANAK DESAI
  1 sibling, 0 replies; 17+ messages in thread
From: JANAK DESAI @ 2005-12-16  4:36 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Eric W. Biederman, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Jamie Lokier wrote:

>Eric W. Biederman wrote:
>  
>
>>I follow but I am very disturbed.
>>
>>You are leaving CLONE_NEWNS to mean you want a new namespace.
>>
>>For clone CLONE_FS unset means generate an unshared fs_struct
>>          CLONE_FS set   means share the fs_struct with the parent
>>
>>But for unshare CLONE_FS unset means share the fs_struct with others
>>            and CLONE_FS set   means generate an unshared fs_struct
>>
>>The meaning of CLONE_FS between the two calls in now flipped,
>>but CLONE_NEWNS is not.  Please let's not implement it this way.
>>    
>>
>
>I agree.
>
>  
>
>>Part of the problem is the double negative in the name, leading
>>me to suggest that sys_share might almost be a better name.
>>    
>>
>
>I agree with that suggestion, too.
>
>Alternatively, we could just add a flag to clone(): CLONE_SELF,
>meaning don't create a new task, just modify the properties of the
>current task.
>  
>
... and this won't cause confusion :-) ? Clone to me implies that a second
entity is being created.

>  
>
>>So please code don't invert the meaning of the bits.  This will
>>allow sharing of the sanity checks with clone.
>>In addition this leaves open the possibility that routines like
>>copy_fs properly refactored can be shared between clone and unshare.
>>    
>>
>
>And also make the API less confusing to document and use.
>
>-- Jamie
>
>  
>
I am all for less confusing API and saner semantics. I will take a look
at yours and Eric's suggestions to see what can be done.

Thanks.

-Janak


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-16  4:32       ` JANAK DESAI
@ 2005-12-16 10:50         ` Jamie Lokier
  2005-12-16 12:46           ` Eric W. Biederman
  2005-12-16 14:32           ` Serge E. Hallyn
  2005-12-16 12:39         ` Eric W. Biederman
  1 sibling, 2 replies; 17+ messages in thread
From: Jamie Lokier @ 2005-12-16 10:50 UTC (permalink / raw)
  To: JANAK DESAI
  Cc: Eric W. Biederman, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

JANAK DESAI wrote:
> >I follow but I am very disturbed.
> >
> >You are leaving CLONE_NEWNS to mean you want a new namespace.
> >
> >For clone CLONE_FS unset means generate an unshared fs_struct
> >         CLONE_FS set   means share the fs_struct with the parent
> >
> >But for unshare CLONE_FS unset means share the fs_struct with others
> >           and CLONE_FS set   means generate an unshared fs_struct
> >
> >The meaning of CLONE_FS between the two calls in now flipped,
> >but CLONE_NEWNS is not.  Please let's not implement it this way.
> >
> > 
> CLONE_FS unset for unshare doesn't mean that share the fs_struct with
> others. It just means that you leave the fs_struct alone (which may or
> maynot be shared). I agree that it is confusing, but I am having difficulty
> in seeing this reversal of flag meaning. clone creates a second task and
> allows you to pick what you want to share with the parent. unshare allows
> you to pick what you don't want to share anymore. The "what" in both
> cases can be same and still you can end up with a task with "share" state
> for a perticular structure. 

> For example, if we #define  FS   CLONE_FS,
> then clone(FS) will imply that you want to share fs_struct but unshare(FS)
> will imply that we want to unshare fs_struct.

Right.

But clone(CLONE_NEWNS) and unshare(CLONE_NEWNS) _don't_ behave like
that: clone(CLONE_NEWNS) unshares ns, and unshare(CLONE_NEWNFS) _also_
unshares ns.

That is the inconsistency: some flags to clone() mean "keep sharing
this thing", and some flags mean "make a new instance of this thing".
It's not your fault that clone() has that inconsistency.  It's because
clone() needs to treat a value of 0 for any bit as the historically
default behaviour.

Like clone(), unshare() will have to change from year to year, as new
flags are added.  It would be good if the default behaviour of 0 bits
to unshare() also did the right thing, so that programs compiled in
2006 still function as expected in 2010.  Hmm, this
forward-compatibility does not look pretty.

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-16  4:32       ` JANAK DESAI
  2005-12-16 10:50         ` Jamie Lokier
@ 2005-12-16 12:39         ` Eric W. Biederman
  1 sibling, 0 replies; 17+ messages in thread
From: Eric W. Biederman @ 2005-12-16 12:39 UTC (permalink / raw)
  To: JANAK DESAI
  Cc: viro, chrisw, dwmw2, jamie, serue, mingo, linuxram, jmorris, sds,
	akpm, linux-kernel

JANAK DESAI <janak@us.ibm.com> writes:

> Eric W. Biederman wrote:
>
>>JANAK DESAI <janak@us.ibm.com> writes:
>>
>>
>>
>>>Eric W. Biederman wrote:
>>>
>>>
>>>
>>>>If it isn't legal how about we deny the unshare call.
>>>>Then we can share this code with clone.
>>>>
>>>>Eric
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>unshare is doing the reverse of what clone does. So if you are unsharing
>>>namespace
>>>you want to make sure that you are not sharing fs. That's why the CLONE_FS
> flag
>>>is forced (meaning you are also going to unshare fs). Since namespace is
> shared
>>>by default and fs is not, if clone is called with CLONE_NEWNS, the intent is
> to
>>>create a new namespace, which means you cannot share fs. clone checks to
>>>makes sure that CLONE_NEWNS and CLONE_FS are not specified together, while
>>>unshare will force CLONE_FS if CLONE_NEWNS is spefcified. Basically you
>>>can have a shared namespace and non shared fs, but not the other way around
> and
>>>since clone and unshare are doing opposite things, sharing code between them
>>>that
>>>checks constraints on the flags can get convoluted.
>>>
>>>
>>
>>I follow but I am very disturbed.
>>
>>You are leaving CLONE_NEWNS to mean you want a new namespace.
>>
>>For clone CLONE_FS unset means generate an unshared fs_struct
>>          CLONE_FS set   means share the fs_struct with the parent
>>
>>But for unshare CLONE_FS unset means share the fs_struct with others
>>            and CLONE_FS set   means generate an unshared fs_struct
>>
>>The meaning of CLONE_FS between the two calls in now flipped,
>>but CLONE_NEWNS is not.  Please let's not implement it this way.
>>
>>
>>
> CLONE_FS unset for unshare doesn't mean that share the fs_struct with
> others. It just means that you leave the fs_struct alone (which may or
> maynot be shared). 

Right but that leaves that data shared.  It is a bit challenging
describing the bits in a way that makes sense from both the
clone and the unshare perspective.

What I see as fundamental is that after a the syscall a resource
may be in one of two states.
- possibly shared with others
- definitely not shared with others

In the clone case the sharing is guaranteed unless your
parent has just exited.  But if you don't count your parent
then clone and unshare should look alike.

> I agree that it is confusing, but I am having difficulty
> in seeing this reversal of flag meaning. 

Carefully note that CLONE_NEWNS behaves differently than most
of the other clone flags because by default the NS stays shared.

> clone creates a second task and
> allows you to pick what you want to share with the parent. unshare allows
> you to pick what you don't want to share anymore. The "what" in both
> cases can be same and still you can end up with a task with "share" state
> for a perticular structure. For example, if we #define  FS   CLONE_FS,
> then clone(FS) will imply that you want to share fs_struct but unshare(FS)
> will imply that we want to unshare fs_struct. Does it still appear that
> the flag meaning has changed? 

So you are having clone and unshare do opposite things.

If that is the intention you are broken with respect to CLONE_NEWNS.
In your implementation.
clone(NEWNS) implies you don't share struct namespace  and
unshare(NEWNS) implies you don't share struct namespace.

> clone and unshare are doing different
> things, the flag just indicates the structure on which they operate.

Another way to think of it is one way to implement unshare is
to call clone with the appropriate flags and to have the parent
exit.  Not counting pids and the process tree this should
give identical results to unshare.

If I called sys_unshare(0) without changing the meaning of
the bits I would expect my VM to be unshared, my FS state
to be unshared, my file descriptors to be unshared,  my signal
handling to be unshared, my tgid to be unshared, to continue
sharing my namespace, my sysv semaphore undo semantics to be
unshared, and my thread local storage to be unshared.

Basically I expect sys_unshare(0) to take a thread an turn
it into a full process by default.  That preserves the current
defaults about which things should be shared and which things
should be unshared.  Basically I expect fork() unshare(0) to
be a noop, but not clone(...) unshare(0).

Just using the bits as resource identifiers and not preserving
the default share/unshare status leads to confusion from
implementors because we the set of legal bits is different
in the two cases, and it is confusing to users because they
can't just take bits passed into clone and get the same results
passing those bits into unshare.

They are different in that unshare does not create a new process
and the difference between operating on a new process versus
your current process should be the only difference, not something
that is lost in the noise of given fresh meaning to the bits.

The core expectation is that unshare(CLONE_NEWNS) will give
a new namespace.  Currently to meet that expectation you are
changing the set of bits internally, instead of erring about
a bad set of bits being specified.  If however you leave the
defaults to unshare everything but the namespace you don't
have to toggle the bits to meet the expectation of what will
work, and you don't have to change the meaning of the bits.

>>Part of the problem is the double negative in the name, leading
>>me to suggest that sys_share might almost be a better name.
>>
>>
>>
> I disagree. You cannot start sharing with this system call. You can
> only unshare a shared structure. If you don't specify a flag corresponding
> to a structure, that structure stays in its current state which may be
> shared or unshared. Since the only action you can perform with this
> call is unshare, unshare is the more appropriate name.

Of the two names I agree that unshare is more appropriate however
because we have double negatives things get confusing.  That was
the point in mentioning this strawman.  For my vision of a syscall
that unshared practically everything by default I think unshare
is clearly a better name.

>>So please code don't invert the meaning of the bits.  This will
>>allow sharing of the sanity checks with clone.
>>
>>In addition this leaves open the possibility that routines like
>>copy_fs properly refactored can be shared between clone and unshare.
>>
>>
>>
>>
> umm.. I am not sure how you were thinking of refactoring it, but my attempt at
> using copy_* functions introduced race conditions.

We clearly cannot reuse these copy_* functions as is.

I hadn't looked closely yet.  But lets see.

static int share_fs(unsigned long clone_flags, struct fs_struct **fsp)
{
        struct fs_struct *fs = current->fs;
        if (clone_flags & CLONE_FS) {
                atomic_inc(&fs->count);
        }
        else if (likely(&current->fs != fsp) ||	(atomic_read(&fs->count) > 1)) {
		*fsp = __copy_fs_struct(fs);
                if (!*fsp)
                	return -ENOMEM;
        }
        return 0;
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
        return share_fs(clone_flags, &tsk->fs);
}

That should handle both cases.  The only real changes were
passing in the address of a fs_struct pointer, and the additional
test to see if we can avoid copying in the unshare case.

People interested in massive performance tuning of fork might be
concerned but the mere mortals it looks to only be a few cycles
different either way and the gain in maintainability should more than
make up for the performance difference.

Eric

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-16 10:50         ` Jamie Lokier
@ 2005-12-16 12:46           ` Eric W. Biederman
  2005-12-16 17:00             ` Jamie Lokier
  2005-12-16 14:32           ` Serge E. Hallyn
  1 sibling, 1 reply; 17+ messages in thread
From: Eric W. Biederman @ 2005-12-16 12:46 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: JANAK DESAI, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Jamie Lokier <jamie@shareable.org> writes:

> Like clone(), unshare() will have to change from year to year, as new
> flags are added.  It would be good if the default behaviour of 0 bits
> to unshare() also did the right thing, so that programs compiled in
> 2006 still function as expected in 2010.  Hmm, this
> forward-compatibility does not look pretty.

Why all it requires is that whenever someone updates clone they update
unshare.  Given the tiniest bit of refactoring we should be
able to share all of the interesting code paths.

Which should also improve maintenance considerably.

Eric

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-16 10:50         ` Jamie Lokier
  2005-12-16 12:46           ` Eric W. Biederman
@ 2005-12-16 14:32           ` Serge E. Hallyn
  1 sibling, 0 replies; 17+ messages in thread
From: Serge E. Hallyn @ 2005-12-16 14:32 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: JANAK DESAI, Eric W. Biederman, viro, chrisw, dwmw2, serue,
	mingo, linuxram, jmorris, sds, akpm, linux-kernel

Quoting Jamie Lokier (jamie@shareable.org):
> Like clone(), unshare() will have to change from year to year, as new
> flags are added.  It would be good if the default behaviour of 0 bits
> to unshare() also did the right thing, so that programs compiled in
> 2006 still function as expected in 2010.  Hmm, this
> forward-compatibility does not look pretty.

This is a very good point, which I didn't quite appreciate at first.

I suppose it would be a bad idea to define a new set of unshare flags,
and have unshare use UNSH_NS | UNSH_FS ?  Then clone() could do the
proper translation from CLONE_* to UNSH_*, and call unshare after the
clone().  It still requires more work as clone() is maintained, but
at least it won't be as confusing as you've shown the current case to
be.

-serge

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-16 12:46           ` Eric W. Biederman
@ 2005-12-16 17:00             ` Jamie Lokier
  2005-12-17  2:23               ` Eric W. Biederman
  0 siblings, 1 reply; 17+ messages in thread
From: Jamie Lokier @ 2005-12-16 17:00 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: JANAK DESAI, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Eric W. Biederman wrote:
> > Like clone(), unshare() will have to change from year to year, as new
> > flags are added.  It would be good if the default behaviour of 0 bits
> > to unshare() also did the right thing, so that programs compiled in
> > 2006 still function as expected in 2010.  Hmm, this
> > forward-compatibility does not look pretty.
> 
> Why all it requires is that whenever someone updates clone they update
> unshare.  Given the tiniest bit of refactoring we should be
> able to share all of the interesting code paths.

That only works if unshare() should always mean "unshare everything
except specified things", including things that we currently don't
unshare.

I guess that is probably fine.  Anything that would break
unshare()-using programs in future if it unshared by default, would be
likely to break clone()-using programs too.  Is that right?  Any
counterexamples?

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH -mm 1/9] unshare system call: system call handler function
  2005-12-16 17:00             ` Jamie Lokier
@ 2005-12-17  2:23               ` Eric W. Biederman
  0 siblings, 0 replies; 17+ messages in thread
From: Eric W. Biederman @ 2005-12-17  2:23 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: JANAK DESAI, viro, chrisw, dwmw2, serue, mingo, linuxram,
	jmorris, sds, akpm, linux-kernel

Jamie Lokier <jamie@shareable.org> writes:

> Eric W. Biederman wrote:
>> > Like clone(), unshare() will have to change from year to year, as new
>> > flags are added.  It would be good if the default behaviour of 0 bits
>> > to unshare() also did the right thing, so that programs compiled in
>> > 2006 still function as expected in 2010.  Hmm, this
>> > forward-compatibility does not look pretty.
>> 
>> Why all it requires is that whenever someone updates clone they update
>> unshare.  Given the tiniest bit of refactoring we should be
>> able to share all of the interesting code paths.
>
> That only works if unshare() should always mean "unshare everything
> except specified things", including things that we currently don't
> unshare.
>
> I guess that is probably fine.  Anything that would break
> unshare()-using programs in future if it unshared by default, would be
> likely to break clone()-using programs too.  Is that right?  Any
> counterexamples?

The only way I can see to confuse unshare is to add a clone
flag and not implement it in unshare.    If there is enough
in common between the implementations I don't see that being
a problem.

Eric

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH -mm 1/9] unshare system call : system call handler function
@ 2005-12-13 13:42 JANAK DESAI
  0 siblings, 0 replies; 17+ messages in thread
From: JANAK DESAI @ 2005-12-13 13:42 UTC (permalink / raw)
  To: viro, chrisw, dwmw2, jamie, serue, mingo, linuxram, jmorris, sds, janak
  Cc: akpm, linux-kernel


[PATCH -mm 1/9] unshare system call: System call handler function sys_unshare
                                                                                
Signed-off-by: Janak Desai


 fork.c |  235 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 235 insertions(+)
 
 
diff -Naurp 2.6.15-rc5-mm2/kernel/fork.c 2.6.15-rc5-mm2+patch/kernel/fork.c
--- 2.6.15-rc5-mm2/kernel/fork.c	2005-12-12 03:05:59.000000000 +0000
+++ 2.6.15-rc5-mm2+patch/kernel/fork.c	2005-12-12 19:31:48.000000000 +0000
@@ -1330,3 +1330,238 @@ void __init proc_caches_init(void)
 			sizeof(struct mm_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 }
+
+
+/*
+ * Check constraints on flags passed to the unshare system call and
+ * force unsharing of additional process context as appropriate.
+ */
+static inline void check_unshare_flags(unsigned long *flags_ptr)
+{
+	/*
+	 * If unsharing a thread from a thread group, must also
+	 * unshare vm.
+	 */
+	if (*flags_ptr & CLONE_THREAD)
+		*flags_ptr |= CLONE_VM;
+
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (*flags_ptr & CLONE_VM)
+		*flags_ptr |= CLONE_SIGHAND;
+
+	/*
+	 * If unsharing signal handlers and the task was created
+	 * using CLONE_THREAD, then must unshare the thread
+	 */
+	if ((*flags_ptr & CLONE_SIGHAND) &&
+	    (atomic_read(&current->signal->count) > 1))
+		*flags_ptr |= CLONE_THREAD;
+
+	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (*flags_ptr & CLONE_NEWNS)
+		*flags_ptr |= CLONE_FS;
+}
+
+/*
+ * Unsharing of tasks created with CLONE_THREAD is not supported yet
+ */
+static int unshare_thread(unsigned long unshare_flags)
+{
+	if (unshare_flags & CLONE_THREAD)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of fs info for tasks created with CLONE_FS is not supported yet
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+	struct fs_struct *fs = current->fs;
+
+	if ((unshare_flags & CLONE_FS) &&
+	    (fs && atomic_read(&fs->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of namespace for tasks created without CLONE_NEWNS is not
+ * supported yet
+ */
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+{
+	struct namespace *ns = current->namespace;
+
+	if ((unshare_flags & CLONE_NEWNS) &&
+	    (ns && atomic_read(&ns->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
+ * supported yet
+ */
+static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+{
+	struct sighand_struct *sigh = current->sighand;
+
+	if ((unshare_flags & CLONE_SIGHAND) &&
+	    (sigh && atomic_read(&sigh->count) > 1))
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/*
+ * Unsharing of vm for tasks created with CLONE_VM is not supported yet
+ */
+static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+{
+	struct mm_struct *mm = current->mm;
+
+	if ((unshare_flags & CLONE_VM) &&
+	    (mm && atomic_read(&mm->mm_users) > 1))
+		return -EINVAL;
+
+	return 0;
+
+}
+
+/*
+ * Unsharing of files for tasks created with CLONE_FILES is not supported yet
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+	struct files_struct *fd = current->files;
+
+	if ((unshare_flags & CLONE_FILES) &&
+	    (fd && atomic_read(&fd->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
+ * supported yet
+ */
+static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
+{
+	if (unshare_flags & CLONE_SYSVSEM)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone.  copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+	int err = 0;
+	struct fs_struct *fs, *new_fs = NULL;
+	struct namespace *ns, *new_ns = NULL;
+	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
+	struct files_struct *fd, *new_fd = NULL;
+	struct sem_undo_list *new_ulist = NULL;
+
+	check_unshare_flags(&unshare_flags);
+
+	if ((err = unshare_thread(unshare_flags)))
+		goto bad_unshare_out;
+	if ((err = unshare_fs(unshare_flags, &new_fs)))
+		goto bad_unshare_cleanup_thread;
+	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+		goto bad_unshare_cleanup_fs;
+	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+		goto bad_unshare_cleanup_ns;
+	if ((err = unshare_vm(unshare_flags, &new_mm)))
+		goto bad_unshare_cleanup_sigh;
+	if ((err = unshare_fd(unshare_flags, &new_fd)))
+		goto bad_unshare_cleanup_vm;
+	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
+		goto bad_unshare_cleanup_fd;
+
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+
+		task_lock(current);
+
+		if (new_fs) {
+			fs = current->fs;
+			current->fs = new_fs;
+			put_fs_struct(fs);
+		}
+
+		if (new_ns) {
+			ns = current->namespace;
+			current->namespace = new_ns;
+			put_namespace(ns);
+		}
+
+		if (new_sigh) {
+			sigh = current->sighand;
+			current->sighand = new_sigh;
+			if (atomic_dec_and_test(&sigh->count))
+				kmem_cache_free(sighand_cachep, sigh);
+		}
+
+		if (new_mm) {
+			mm = current->mm;
+			active_mm = current->active_mm;
+			current->mm = new_mm;
+			current->active_mm = new_mm;
+			activate_mm(active_mm, new_mm);
+			mmput(mm);
+		}
+
+		if (new_fd) {
+			fd = current->files;
+			current->files = new_fd;
+			put_files_struct(fd);
+		}
+
+		task_unlock(current);
+	}
+
+	return 0;
+
+bad_unshare_cleanup_fd:
+	if (new_fd)
+		put_files_struct(new_fd);
+
+bad_unshare_cleanup_vm:
+	if (new_mm)
+		mmput(new_mm);
+
+bad_unshare_cleanup_sigh:
+	if (new_sigh)
+		if (atomic_dec_and_test(&new_sigh->count))
+			kmem_cache_free(sighand_cachep, new_sigh);
+
+bad_unshare_cleanup_ns:
+	if (new_ns)
+		put_namespace(new_ns);
+
+bad_unshare_cleanup_fs:
+	if (new_fs)
+		put_fs_struct(new_fs);
+
+bad_unshare_cleanup_thread:
+bad_unshare_out:
+	return err;
+}


^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2005-12-17  2:26 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-12-13 22:54 [PATCH -mm 1/9] unshare system call: system call handler function JANAK DESAI
2005-12-15 19:52 ` Eric W. Biederman
2005-12-15 20:38   ` JANAK DESAI
2005-12-15 21:13     ` Eric W. Biederman
2005-12-15 21:32       ` Jamie Lokier
2005-12-15 22:34         ` Eric W. Biederman
2005-12-16  4:36         ` JANAK DESAI
2005-12-16  4:32       ` JANAK DESAI
2005-12-16 10:50         ` Jamie Lokier
2005-12-16 12:46           ` Eric W. Biederman
2005-12-16 17:00             ` Jamie Lokier
2005-12-17  2:23               ` Eric W. Biederman
2005-12-16 14:32           ` Serge E. Hallyn
2005-12-16 12:39         ` Eric W. Biederman
2005-12-15 21:28     ` Jamie Lokier
2005-12-16  4:35       ` JANAK DESAI
  -- strict thread matches above, loose matches on Subject: below --
2005-12-13 13:42 [PATCH -mm 1/9] unshare system call : " JANAK DESAI

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).