linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
@ 2017-04-24 15:39 Cyrill Gorcunov
       [not found] ` <20170424154423.511592110-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Cyrill Gorcunov @ 2017-04-24 15:39 UTC (permalink / raw)
  To: linux-fsdevel, linux-kernel, linux-api
  Cc: viro, akpm, avagin, xemul, mtk.manpages, gorcunov, avagin, jbaron, luto

[-- Attachment #1: kcmp-epoll-4 --]
[-- Type: text/plain, Size: 6865 bytes --]

With the current epoll architecture, target files are addressed
by file_struct and file descriptor number, where the latter
is not unique. Moreover, files can be transferred from another
process via a unix socket, added into the queue and then closed,
so we won't find this descriptor in the task fdinfo list.

Thus to checkpoint and restore such processes CRIU needs to
find out where exactly the target file is present to add it into
epoll queue. For this sake one can use kcmp call where
some particular target file from the queue is compared with
arbitrary file passed as an argument.

Because epoll target files can have same file descriptor
number but different file_struct a caller should explicitly
specify the offset within.

To test whether some particular file matches an entry inside
epoll one has to

 - fill kcmp_epoll_slot structure with epoll file descriptor,
   target file number and target file offset (in case if only
   one target is present then it should be 0)

 - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot)
    - the kernel fetches the file pointer matching file descriptor @fd of pid1
    - looks up the file struct in the epoll queue of pid2 and returns the
      traditional 0,1,2 result for sorting purposes

v2:
 - Use the KCMP_FILE salt for file comparison (for convenience's sake,
   since the pointers are file structs so the user can look them up in a
   previously collected files tree)
 - Make kcmp_epoll_target as a separate helper instead of opencoding
   it with #ifdef

v3:
 - Use less if()s in kcmp_epoll_target for readability sake (by avagin@)
 - Use u32 for kcmp_epoll_slot::toff instead of u64, which reduces
   memory pressure

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Andrey Vagin <avagin@openvz.org>
CC: Al Viro <viro@zeniv.linux.org.uk>
CC: Andrew Morton <akpm@linuxfoundation.org>
CC: Pavel Emelyanov <xemul@virtuozzo.com>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: Jason Baron <jbaron@akamai.com>
CC: Andy Lutomirski <luto@amacapital.net>
---
 fs/eventpoll.c            |   42 +++++++++++++++++++++++++++++++++
 include/linux/eventpoll.h |    3 ++
 include/uapi/linux/kcmp.h |   10 ++++++++
 kernel/kcmp.c             |   57 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+)

Index: linux-ml.git/fs/eventpoll.c
===================================================================
--- linux-ml.git.orig/fs/eventpoll.c
+++ linux-ml.git/fs/eventpoll.c
@@ -1000,6 +1000,48 @@ static struct epitem *ep_find(struct eve
 	return epir;
 }
 
+static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
+{
+	struct rb_node *rbp;
+	struct epitem *epi;
+	/* Return the item at 0-based position toff among those with fd == tfd. */
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (epi->ffd.fd == tfd) {
+			if (toff == 0)
+				return epi;
+			else
+				toff--;	/* not this one: skip the match */
+		}
+		cond_resched();
+	}
+	/* Fewer than toff + 1 items in the tree carry fd number tfd. */
+	return NULL;
+}
+
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
+				     unsigned long toff)
+{
+	struct file *file_raw;
+	struct eventpoll *ep;
+	struct epitem *epi;
+	/* Look up target (tfd, toff) in epoll @file; ERR_PTR() on failure. */
+	if (!is_file_epoll(file))
+		return ERR_PTR(-EINVAL);	/* @file is not an epoll file */
+
+	ep = file->private_data;
+	/* The search and the read of epi->ffd.file both run under ep->mtx. */
+	mutex_lock(&ep->mtx);
+	epi = ep_find_tfd(ep, tfd, toff);
+	if (epi)
+		file_raw = epi->ffd.file;
+	else
+		file_raw = ERR_PTR(-ENOENT);	/* no item matches (tfd, toff) */
+	mutex_unlock(&ep->mtx);
+	/* No get_file() here: the caller receives an unreferenced pointer. */
+	return file_raw;
+}
+
 /*
  * This is the callback that is passed to the wait queue wakeup
  * mechanism. It is called by the stored file descriptors when they
Index: linux-ml.git/include/linux/eventpoll.h
===================================================================
--- linux-ml.git.orig/include/linux/eventpoll.h
+++ linux-ml.git/include/linux/eventpoll.h
@@ -14,6 +14,7 @@
 #define _LINUX_EVENTPOLL_H
 
 #include <uapi/linux/eventpoll.h>
+#include <uapi/linux/kcmp.h>
 
 
 /* Forward declarations to avoid compiler errors */
@@ -22,6 +23,8 @@ struct file;
 
 #ifdef CONFIG_EPOLL
 
+struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
+
 /* Used to initialize the epoll bits inside the "struct file" */
 static inline void eventpoll_init_file(struct file *file)
 {
Index: linux-ml.git/include/uapi/linux/kcmp.h
===================================================================
--- linux-ml.git.orig/include/uapi/linux/kcmp.h
+++ linux-ml.git/include/uapi/linux/kcmp.h
@@ -1,6 +1,8 @@
 #ifndef _UAPI_LINUX_KCMP_H
 #define _UAPI_LINUX_KCMP_H
 
+#include <linux/types.h>
+
 /* Comparison type */
 enum kcmp_type {
 	KCMP_FILE,
@@ -10,8 +12,16 @@ enum kcmp_type {
 	KCMP_SIGHAND,
 	KCMP_IO,
 	KCMP_SYSVSEM,
+	KCMP_EPOLL_TFD,
 
 	KCMP_TYPES,
 };
 
+/* Slot for KCMP_EPOLL_TFD: user memory handed to kcmp() through idx2 */
+struct kcmp_epoll_slot {
+	__u32 efd;		/* epoll file descriptor */
+	__u32 tfd;		/* target file number */
+	__u32 toff;		/* target offset within same numbered sequence */
+};
+
 #endif /* _UAPI_LINUX_KCMP_H */
Index: linux-ml.git/kernel/kcmp.c
===================================================================
--- linux-ml.git.orig/kernel/kcmp.c
+++ linux-ml.git/kernel/kcmp.c
@@ -11,6 +11,10 @@
 #include <linux/bug.h>
 #include <linux/err.h>
 #include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
 
 #include <asm/unistd.h>
 
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, s
 	return err;
 }
 
+#ifdef CONFIG_EPOLL
+/*
+ * Compare @task1's file @idx1 with the epoll target @uslot selects in @task2.
+ */
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	struct file *filp, *filp_epoll, *filp_tgt;
+	struct kcmp_epoll_slot slot;
+	struct files_struct *files;
+
+	if (copy_from_user(&slot, uslot, sizeof(slot)))
+		return -EFAULT;
+
+	filp = get_file_raw_ptr(task1, idx1);
+	if (!filp)
+		return -EBADF;
+
+	files = get_files_struct(task2);
+	if (!files)
+		return -EBADF;
+
+	spin_lock(&files->file_lock);
+	filp_epoll = fcheck_files(files, slot.efd);
+	if (filp_epoll)
+		get_file(filp_epoll);	/* hold it past the unlock below */
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+	if (!filp_epoll)
+		return -EBADF;
+
+	filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+	fput(filp_epoll);
+	if (IS_ERR(filp_tgt))	/* lookup failure must not reach kcmp_ptr() */
+		return PTR_ERR(filp_tgt);
+
+	return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else /* !CONFIG_EPOLL */
+static int kcmp_epoll_target(struct task_struct *task1,
+			     struct task_struct *task2,
+			     unsigned long idx1,
+			     struct kcmp_epoll_slot __user *uslot)
+{
+	return -EOPNOTSUPP;	/* stub for kernels built without CONFIG_EPOLL */
+}
+#endif /* CONFIG_EPOLL */
+
 SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
 		unsigned long, idx1, unsigned long, idx2)
 {
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t
 		ret = -EOPNOTSUPP;
 #endif
 		break;
+	case KCMP_EPOLL_TFD:
+		ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+		break;
 	default:
 		ret = -EINVAL;
 		break;

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found] ` <20170424154423.511592110-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-05-12 22:00   ` Andrew Morton
       [not found]     ` <20170512150018.b931c7f5295dd7484845fcec-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
  2017-05-12 22:41   ` Jann Horn
  2017-09-17 16:01   ` [v4,resend,2/2] " Eugene Syromiatnikov
  2 siblings, 1 reply; 9+ messages in thread
From: Andrew Morton @ 2017-05-12 22:00 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
	akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	avagin-5HdwGun5lf+gSpxsJD1C4w, xemul-5HdwGun5lf+gSpxsJD1C4w,
	mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	gorcunov-GEFAQzZX7r8dnm+yROfE0A, avagin-GEFAQzZX7r8dnm+yROfE0A,
	jbaron-JqFfY2XvxFXQT0dZR+AlfA, luto-kltTT9wpgjJwATOyAt5JVQ

On Mon, 24 Apr 2017 18:39:28 +0300 Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> With current epoll architecture target files are addressed
> with file_struct and file descriptor number, where the last
> is not unique. Moreover files can be transferred from another
> process via unix socket, added into queue and closed then
> so we won't find this descriptor in the task fdinfo list.
> 
> Thus to checkpoint and restore such processes CRIU needs to
> find out where exactly the target file is present to add it into
> epoll queue. For this sake one can use kcmp call where
> some particular target file from the queue is compared with
> arbitrary file passed as an argument.
> 
> Because epoll target files can have same file descriptor
> number but different file_struct a caller should explicitly
> specify the offset within.
> 
> To test if some particular file is matching entry inside
> epoll one have to
> 
>  - fill kcmp_epoll_slot structure with epoll file descriptor,
>    target file number and target file offset (in case if only
>    one target is present then it should be 0)
> 
>  - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot)
>     - the kernel fetch file pointer matching file descriptor @fd of pid1
>     - lookups for file struct in epoll queue of pid2 and returns traditional
>       0,1,2 result for sorting purpose

That's quite a bit more code.  Is there a neat way of making it depend
on a new CONFIG_foo, then select CONFIG_foo if
CONFIG_CHECKPOINT_RESTORE?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found]     ` <20170512150018.b931c7f5295dd7484845fcec-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
@ 2017-05-12 22:14       ` Cyrill Gorcunov
  0 siblings, 0 replies; 9+ messages in thread
From: Cyrill Gorcunov @ 2017-05-12 22:14 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
	akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	avagin-5HdwGun5lf+gSpxsJD1C4w, xemul-5HdwGun5lf+gSpxsJD1C4w,
	mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	avagin-GEFAQzZX7r8dnm+yROfE0A, jbaron-JqFfY2XvxFXQT0dZR+AlfA,
	luto-kltTT9wpgjJwATOyAt5JVQ

On Fri, May 12, 2017 at 03:00:18PM -0700, Andrew Morton wrote:
> On Mon, 24 Apr 2017 18:39:28 +0300 Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> 
> That's quite a bit more code.  Is there a neat way of making it depend
> on a new CONFIG_foo, then select CONFIG_foo if
> CONFIG_CHECKPOINT_RESTORE?

Sure, will do on top and send. Thank you!

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found] ` <20170424154423.511592110-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-12 22:00   ` Andrew Morton
@ 2017-05-12 22:41   ` Jann Horn
  2017-05-12 22:53     ` Cyrill Gorcunov
  2017-09-17 16:01   ` [v4,resend,2/2] " Eugene Syromiatnikov
  2 siblings, 1 reply; 9+ messages in thread
From: Jann Horn @ 2017-05-12 22:41 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, kernel list, Linux API,
	Al Viro, akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	avagin-5HdwGun5lf+gSpxsJD1C4w, xemul-5HdwGun5lf+gSpxsJD1C4w,
	Michael Kerrisk-manpages, Cyrill Gorcunov,
	avagin-GEFAQzZX7r8dnm+yROfE0A, jbaron-JqFfY2XvxFXQT0dZR+AlfA,
	Andy Lutomirski

[resending as plaintext]

On Mon, Apr 24, 2017 at 5:39 PM, Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> With current epoll architecture target files are addressed
> with file_struct and file descriptor number, where the last
> is not unique. Moreover files can be transferred from another
> process via unix socket, added into queue and closed then
> so we won't find this descriptor in the task fdinfo list.
>
> Thus to checkpoint and restore such processes CRIU needs to
> find out where exactly the target file is present to add it into
> epoll queue. For this sake one can use kcmp call where
> some particular target file from the queue is compared with
> arbitrary file passed as an argument.
[...]
> +#ifdef CONFIG_EPOLL
> +static int kcmp_epoll_target(struct task_struct *task1,
> +                            struct task_struct *task2,
> +                            unsigned long idx1,
> +                            struct kcmp_epoll_slot __user *uslot)
> +{
> +       struct file *filp, *filp_epoll, *filp_tgt;
> +       struct kcmp_epoll_slot slot;
> +       struct files_struct *files;
> +
> +       if (copy_from_user(&slot, uslot, sizeof(slot)))
> +               return -EFAULT;
> +
> +       filp = get_file_raw_ptr(task1, idx1);
> +       if (!filp)
> +               return -EBADF;
> +
> +       files = get_files_struct(task2);
> +       if (!files)
> +               return -EBADF;
> +
> +       spin_lock(&files->file_lock);
> +       filp_epoll = fcheck_files(files, slot.efd);
> +       if (filp_epoll)
> +               get_file(filp_epoll);
> +       else
> +               filp_tgt = ERR_PTR(-EBADF);
> +       spin_unlock(&files->file_lock);
> +       put_files_struct(files);
> +
> +       if (filp_epoll) {
> +               filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
> +               fput(filp_epoll);
> +       } else
> +
> +       if (IS_ERR(filp_tgt))
> +               return PTR_ERR(filp_tgt);
> +
> +       return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
> +}

I realize that the existing kcmp code has the same issue, but:

Why are you not taking a reference to filp or filp_tgt? This can end up
performing a comparison between a pointer to a freed struct file and a
pointer to a struct file that was allocated afterwards, right? So it can
return a false "is equal" result when the two files aren't actually the same
if one of the target tasks is running? This looks like it unnecessarily
exposes information about whether an allocation reuses the memory of
a previously freed allocation.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
  2017-05-12 22:41   ` Jann Horn
@ 2017-05-12 22:53     ` Cyrill Gorcunov
       [not found]       ` <20170512225340.GD1881-ZmlpmtaulQd+urZeOPWqwQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Cyrill Gorcunov @ 2017-05-12 22:53 UTC (permalink / raw)
  To: Jann Horn
  Cc: linux-fsdevel, kernel list, Linux API, Al Viro, akpm, avagin,
	xemul, Michael Kerrisk-manpages, avagin, jbaron, Andy Lutomirski

On Sat, May 13, 2017 at 12:41:30AM +0200, Jann Horn wrote:
> [resending as plaintext]
> 
> I realize that the existing kcmp code has the same issue, but:
> 
> Why are you not taking a reference to filp or filp_tgt? This can end up
> performing a comparison between a pointer to a freed struct file and a
> pointer to a struct file that was allocated afterwards, right? So it can
> return a false "is equal" result when the two files aren't actually the same
> if one of the target tasks is running? This looks like it unnecessarily
> exposes information about whether an allocation reuses the memory of
> a previously freed allocation.

It works with unlocked data on purpose, for speed's sake. Moreover, even
if we grab a reference it is valid _only_ during the comparison operation;
next we drop the ref and it can easily be freed by the OS. Thus it's up to
the caller to keep references to files/tasks and other resources used.

	Cyrill

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found]       ` <20170512225340.GD1881-ZmlpmtaulQd+urZeOPWqwQ@public.gmane.org>
@ 2017-05-13  1:45         ` Andrei Vagin
       [not found]           ` <20170513014508.GA21900-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Andrei Vagin @ 2017-05-13  1:45 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Jann Horn, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, kernel list,
	Linux API, Al Viro, akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	xemul-5HdwGun5lf+gSpxsJD1C4w, Michael Kerrisk-manpages,
	avagin-GEFAQzZX7r8dnm+yROfE0A, jbaron-JqFfY2XvxFXQT0dZR+AlfA,
	Andy Lutomirski

On Sat, May 13, 2017 at 01:53:40AM +0300, Cyrill Gorcunov wrote:
> On Sat, May 13, 2017 at 12:41:30AM +0200, Jann Horn wrote:
> > [resending as plaintext]
> > 
> > I realize that the existing kcmp code has the same issue, but:
> > 
> > Why are you not taking a reference to filp or filp_tgt? This can end up
> > performing a comparison between a pointer to a freed struct file and a
> > pointer to a struct file that was allocated afterwards, right? So it can
> > return a false "is equal" result when the two files aren't actually the same
> > if one of the target tasks is running? This looks like it unnecessarily
> > exposes information about whether an allocation reuses the memory of
> > a previously freed allocation.
> 
> It work with unlocked data on purpose for speed sake. Moreover even
> if we grap a reference it is valid _only_ during comparision operation,
> next we drop ref and it can be easily freed by os. Thus it's up to
> a caller to keep references to files/task and other resources used.

Looks like we can take rcu_read_lock() to guarantee that these objects
will not be freed, and rcu_read_lock() should not affect performance too much.

> 
> 	Cyrill

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found]           ` <20170513014508.GA21900-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
@ 2017-05-13  6:55             ` Cyrill Gorcunov
       [not found]               ` <20170513065514.GE1881-ZmlpmtaulQd+urZeOPWqwQ@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Cyrill Gorcunov @ 2017-05-13  6:55 UTC (permalink / raw)
  To: Andrei Vagin
  Cc: Jann Horn, linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, kernel list,
	Linux API, Al Viro, akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	xemul-5HdwGun5lf+gSpxsJD1C4w, Michael Kerrisk-manpages,
	avagin-GEFAQzZX7r8dnm+yROfE0A, jbaron-JqFfY2XvxFXQT0dZR+AlfA,
	Andy Lutomirski

On Fri, May 12, 2017 at 06:45:09PM -0700, Andrei Vagin wrote:
> On Sat, May 13, 2017 at 01:53:40AM +0300, Cyrill Gorcunov wrote:
> > On Sat, May 13, 2017 at 12:41:30AM +0200, Jann Horn wrote:
> > > [resending as plaintext]
> > > 
> > > I realize that the existing kcmp code has the same issue, but:
> > > 
> > > Why are you not taking a reference to filp or filp_tgt? This can end up
> > > performing a comparison between a pointer to a freed struct file and a
> > > pointer to a struct file that was allocated afterwards, right? So it can
> > > return a false "is equal" result when the two files aren't actually the same
> > > if one of the target tasks is running? This looks like it unnecessarily
> > > exposes information about whether an allocation reuses the memory of
> > > a previously freed allocation.
> > 
> > It work with unlocked data on purpose for speed sake. Moreover even
> > if we grap a reference it is valid _only_ during comparision operation,
> > next we drop ref and it can be easily freed by os. Thus it's up to
> > a caller to keep references to files/task and other resources used.
> 
> Looks like we can take rcu_read_lock() to guarantee that these objects
> will not be freed, and rcu_read_lock() should not affect perfomance too much.

Rather they should be get_file_rcu/fput. Still I'm not convinced we need it,
but fine will update both: plain KCMP_FILE and KCMP_EPOLL_TFD since it won't
hurt performance.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found]               ` <20170513065514.GE1881-ZmlpmtaulQd+urZeOPWqwQ@public.gmane.org>
@ 2017-05-13  7:15                 ` Cyrill Gorcunov
  0 siblings, 0 replies; 9+ messages in thread
From: Cyrill Gorcunov @ 2017-05-13  7:15 UTC (permalink / raw)
  To: Andrei Vagin, Jann Horn
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, kernel list, Linux API,
	Al Viro, akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	xemul-5HdwGun5lf+gSpxsJD1C4w, Michael Kerrisk-manpages,
	avagin-GEFAQzZX7r8dnm+yROfE0A, jbaron-JqFfY2XvxFXQT0dZR+AlfA,
	Andy Lutomirski

On Sat, May 13, 2017 at 09:55:14AM +0300, Cyrill Gorcunov wrote:
> On Fri, May 12, 2017 at 06:45:09PM -0700, Andrei Vagin wrote:
> > On Sat, May 13, 2017 at 01:53:40AM +0300, Cyrill Gorcunov wrote:
> > > On Sat, May 13, 2017 at 12:41:30AM +0200, Jann Horn wrote:
> > > > [resending as plaintext]
> > > > 
> > > > I realize that the existing kcmp code has the same issue, but:
> > > > 
> > > > Why are you not taking a reference to filp or filp_tgt? This can end up
> > > > performing a comparison between a pointer to a freed struct file and a
> > > > pointer to a struct file that was allocated afterwards, right? So it can
> > > > return a false "is equal" result when the two files aren't actually the same
> > > > if one of the target tasks is running? This looks like it unnecessarily
> > > > exposes information about whether an allocation reuses the memory of
> > > > a previously freed allocation.
> > > 
> > > It work with unlocked data on purpose for speed sake. Moreover even
> > > if we grap a reference it is valid _only_ during comparision operation,
> > > next we drop ref and it can be easily freed by os. Thus it's up to
> > > a caller to keep references to files/task and other resources used.
> > 
> > Looks like we can take rcu_read_lock() to guarantee that these objects
> > will not be freed, and rcu_read_lock() should not affect perfomance too much.
> 
> Rather they should be get_file_rcu/fput. Still I'm not convinced we need it,
> but fine will update both: plain KCMP_FILE and KCMP_EPOLL_TFD since it won't
> hurt performance.

From the manpage we wrote:

       Note the kcmp() is not protected against false positives which may occur
       if tasks are running.  One should stop tasks by sending SIGSTOP (see
       signal(7)) prior to inspection with this system call to obtain meaningful results.

So no, not going to uglify source code and add get/put files there.

	Cyrill

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [v4,resend,2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files
       [not found] ` <20170424154423.511592110-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  2017-05-12 22:00   ` Andrew Morton
  2017-05-12 22:41   ` Jann Horn
@ 2017-09-17 16:01   ` Eugene Syromiatnikov
  2 siblings, 0 replies; 9+ messages in thread
From: Eugene Syromiatnikov @ 2017-09-17 16:01 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
	akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	avagin-5HdwGun5lf+gSpxsJD1C4w, xemul-5HdwGun5lf+gSpxsJD1C4w,
	mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w,
	gorcunov-GEFAQzZX7r8dnm+yROfE0A, avagin-GEFAQzZX7r8dnm+yROfE0A,
	jbaron-JqFfY2XvxFXQT0dZR+AlfA, luto-kltTT9wpgjJwATOyAt5JVQ

On Mon, Apr 24, 2017 at 06:39:28PM +0300, Cyrill Gorcunov wrote:
> With current epoll architecture target files are addressed
> with file_struct and file descriptor number, where the last
> is not unique. Moreover files can be transferred from another
> process via unix socket, added into queue and closed then
> so we won't find this descriptor in the task fdinfo list.
> 
> Thus to checkpoint and restore such processes CRIU needs to
> find out where exactly the target file is present to add it into
> epoll queue. For this sake one can use kcmp call where
> some particular target file from the queue is compared with
> arbitrary file passed as an argument.
> 
> Because epoll target files can have same file descriptor
> number but different file_struct a caller should explicitly
> specify the offset within.
> 
> To test if some particular file is matching entry inside
> epoll one have to
> 
>  - fill kcmp_epoll_slot structure with epoll file descriptor,
>    target file number and target file offset (in case if only
>    one target is present then it should be 0)
> 
>  - call kcmp as kcmp(pid1, pid2, KCMP_EPOLL_TFD, fd, &kcmp_epoll_slot)
>     - the kernel fetch file pointer matching file descriptor @fd of pid1
>     - lookups for file struct in epoll queue of pid2 and returns traditional
>       0,1,2 result for sorting purpose
> 
> v2:
>  - Use KCMP_FILES salt for files comparision (for convenience sake,
>    since the pointers are file structs so user can lookup over previously
>    collected files tree)
>  - Make kcmp_epoll_target as a separate helper instead of opencoding
>    it with #ifdef
> 
> v3:
>  - Use less if()s in kcmp_epoll_target for readability sake (by avagin@)
>  - Use u32 for kcmp_epoll_slot::toff instead of u64, which makes the less
>    memory pressue
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> Acked-by: Andrey Vagin <avagin-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> CC: Al Viro <viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn@public.gmane.org>
> CC: Andrew Morton <akpm-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org>
> CC: Pavel Emelyanov <xemul-5HdwGun5lf+gSpxsJD1C4w@public.gmane.org>
> CC: Michael Kerrisk <mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
> CC: Jason Baron <jbaron-JqFfY2XvxFXQT0dZR+AlfA@public.gmane.org>
> CC: Andy Lutomirski <luto-kltTT9wpgjJwATOyAt5JVQ@public.gmane.org>
> ---
>  fs/eventpoll.c            |   42 +++++++++++++++++++++++++++++++++
>  include/linux/eventpoll.h |    3 ++
>  include/uapi/linux/kcmp.h |   10 ++++++++
>  kernel/kcmp.c             |   57 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 112 insertions(+)
> 
> Index: linux-ml.git/fs/eventpoll.c
> ===================================================================
> --- linux-ml.git.orig/fs/eventpoll.c
> +++ linux-ml.git/fs/eventpoll.c
> @@ -1000,6 +1000,48 @@ static struct epitem *ep_find(struct eve
>  	return epir;
>  }
>  
> +static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
> +{
> +	struct rb_node *rbp;
> +	struct epitem *epi;
> +
> +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> +		epi = rb_entry(rbp, struct epitem, rbn);
> +		if (epi->ffd.fd == tfd) {
> +			if (toff == 0)
> +				return epi;
> +			else
> +				toff--;
> +		}
> +		cond_resched();
> +	}
> +
> +	return NULL;
> +}
> +
> +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
> +				     unsigned long toff)
> +{
> +	struct file *file_raw;
> +	struct eventpoll *ep;
> +	struct epitem *epi;
> +
> +	if (!is_file_epoll(file))
> +		return ERR_PTR(-EINVAL);
> +
> +	ep = file->private_data;
> +
> +	mutex_lock(&ep->mtx);
> +	epi = ep_find_tfd(ep, tfd, toff);
> +	if (epi)
> +		file_raw = epi->ffd.file;
> +	else
> +		file_raw = ERR_PTR(-ENOENT);
> +	mutex_unlock(&ep->mtx);
> +
> +	return file_raw;
> +}
> +
>  /*
>   * This is the callback that is passed to the wait queue wakeup
>   * mechanism. It is called by the stored file descriptors when they
> Index: linux-ml.git/include/linux/eventpoll.h
> ===================================================================
> --- linux-ml.git.orig/include/linux/eventpoll.h
> +++ linux-ml.git/include/linux/eventpoll.h
> @@ -14,6 +14,7 @@
>  #define _LINUX_EVENTPOLL_H
>  
>  #include <uapi/linux/eventpoll.h>
> +#include <uapi/linux/kcmp.h>
>  
>  
>  /* Forward declarations to avoid compiler errors */
> @@ -22,6 +23,8 @@ struct file;
>  
>  #ifdef CONFIG_EPOLL
>  
> +struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
> +
>  /* Used to initialize the epoll bits inside the "struct file" */
>  static inline void eventpoll_init_file(struct file *file)
>  {
> Index: linux-ml.git/include/uapi/linux/kcmp.h
> ===================================================================
> --- linux-ml.git.orig/include/uapi/linux/kcmp.h
> +++ linux-ml.git/include/uapi/linux/kcmp.h
> @@ -1,6 +1,8 @@
>  #ifndef _UAPI_LINUX_KCMP_H
>  #define _UAPI_LINUX_KCMP_H
>  
> +#include <linux/types.h>
> +
>  /* Comparison type */
>  enum kcmp_type {
>  	KCMP_FILE,
> @@ -10,8 +12,16 @@ enum kcmp_type {
>  	KCMP_SIGHAND,
>  	KCMP_IO,
>  	KCMP_SYSVSEM,
> +	KCMP_EPOLL_TFD,
>  
>  	KCMP_TYPES,
>  };
>  
> +/* Slot for KCMP_EPOLL_TFD */
> +struct kcmp_epoll_slot {
> +	__u32 efd;		/* epoll file descriptor */
> +	__u32 tfd;		/* target file number */
> +	__u32 toff;		/* target offset within same numbered sequence */
> +};
> +
>  #endif /* _UAPI_LINUX_KCMP_H */
> Index: linux-ml.git/kernel/kcmp.c
> ===================================================================
> --- linux-ml.git.orig/kernel/kcmp.c
> +++ linux-ml.git/kernel/kcmp.c
> @@ -11,6 +11,10 @@
>  #include <linux/bug.h>
>  #include <linux/err.h>
>  #include <linux/kcmp.h>
> +#include <linux/capability.h>
> +#include <linux/list.h>
> +#include <linux/eventpoll.h>
> +#include <linux/file.h>
>  
>  #include <asm/unistd.h>
>  
> @@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, s
>  	return err;
>  }
>  
> +#ifdef CONFIG_EPOLL
> +static int kcmp_epoll_target(struct task_struct *task1,
> +			     struct task_struct *task2,
> +			     unsigned long idx1,
> +			     struct kcmp_epoll_slot __user *uslot)
> +{
> +	struct file *filp, *filp_epoll, *filp_tgt;
> +	struct kcmp_epoll_slot slot;
> +	struct files_struct *files;
> +
> +	if (copy_from_user(&slot, uslot, sizeof(slot)))
> +		return -EFAULT;
> +
> +	filp = get_file_raw_ptr(task1, idx1);
> +	if (!filp)
> +		return -EBADF;
> +
> +	files = get_files_struct(task2);
> +	if (!files)
> +		return -EBADF;
> +
> +	spin_lock(&files->file_lock);
> +	filp_epoll = fcheck_files(files, slot.efd);
> +	if (filp_epoll)
> +		get_file(filp_epoll);
> +	else
> +		filp_tgt = ERR_PTR(-EBADF);
> +	spin_unlock(&files->file_lock);
> +	put_files_struct(files);
> +
> +	if (filp_epoll) {
> +		filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
> +		fput(filp_epoll);
> +	} else
I think this "else" is unnecessary here.

> +
> +	if (IS_ERR(filp_tgt))
> +		return PTR_ERR(filp_tgt);
> +
> +	return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
> +}
> +#else
> +static int kcmp_epoll_target(struct task_struct *task1,
> +			     struct task_struct *task2,
> +			     unsigned long idx1,
> +			     struct kcmp_epoll_slot __user *uslot)
> +{
> +	return -EOPNOTSUPP;
> +}
> +#endif
> +
>  SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
>  		unsigned long, idx1, unsigned long, idx2)
>  {
> @@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t
>  		ret = -EOPNOTSUPP;
>  #endif
>  		break;
> +	case KCMP_EPOLL_TFD:
> +		ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-09-17 16:01 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-24 15:39 [patch v4 resend 2/2] kcmp: Add KCMP_EPOLL_TFD mode to compare epoll target files Cyrill Gorcunov
     [not found] ` <20170424154423.511592110-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-05-12 22:00   ` Andrew Morton
     [not found]     ` <20170512150018.b931c7f5295dd7484845fcec-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
2017-05-12 22:14       ` Cyrill Gorcunov
2017-05-12 22:41   ` Jann Horn
2017-05-12 22:53     ` Cyrill Gorcunov
     [not found]       ` <20170512225340.GD1881-ZmlpmtaulQd+urZeOPWqwQ@public.gmane.org>
2017-05-13  1:45         ` Andrei Vagin
     [not found]           ` <20170513014508.GA21900-1ViLX0X+lBJGNQ1M2rI3KwRV3xvJKrda@public.gmane.org>
2017-05-13  6:55             ` Cyrill Gorcunov
     [not found]               ` <20170513065514.GE1881-ZmlpmtaulQd+urZeOPWqwQ@public.gmane.org>
2017-05-13  7:15                 ` Cyrill Gorcunov
2017-09-17 16:01   ` [v4,resend,2/2] " Eugene Syromiatnikov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).