linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name
@ 2009-02-27 20:34 Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function Dave Hansen
                   ` (6 more replies)
  0 siblings, 7 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


The 'cr_hdr_fd_data' name is too long for me.  Adding data
at the end doesn't clarify anything in its use which makes
it a waste of space.  Kill it.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/checkpoint/ckpt_file.c         |    2 +-
 linux-2.6.git-dave/checkpoint/rstr_file.c         |    2 +-
 linux-2.6.git-dave/include/linux/checkpoint_hdr.h |    2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff -puN checkpoint/ckpt_file.c~no_data0 checkpoint/ckpt_file.c
--- linux-2.6.git/checkpoint/ckpt_file.c~no_data0	2009-02-27 12:07:37.000000000 -0800
+++ linux-2.6.git-dave/checkpoint/ckpt_file.c	2009-02-27 12:07:37.000000000 -0800
@@ -76,7 +76,7 @@ int cr_scan_fds(struct files_struct *fil
 static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
 {
 	struct cr_hdr h;
-	struct cr_hdr_fd_data *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct cr_hdr_fd *hh = cr_hbuf_get(ctx, sizeof(*hh));
 	struct dentry *dent = file->f_dentry;
 	struct inode *inode = dent->d_inode;
 	enum fd_type fd_type;
diff -puN checkpoint/rstr_file.c~no_data0 checkpoint/rstr_file.c
--- linux-2.6.git/checkpoint/rstr_file.c~no_data0	2009-02-27 12:07:37.000000000 -0800
+++ linux-2.6.git-dave/checkpoint/rstr_file.c	2009-02-27 12:07:37.000000000 -0800
@@ -71,7 +71,7 @@ static int cr_attach_get_file(struct fil
 static int
 cr_read_fd_data(struct cr_ctx *ctx, struct files_struct *files, int rparent)
 {
-	struct cr_hdr_fd_data *hh = cr_hbuf_get(ctx, sizeof(*hh));
+	struct cr_hdr_fd *hh = cr_hbuf_get(ctx, sizeof(*hh));
 	struct file *file;
 	int parent, ret;
 	int fd = 0;	/* pacify gcc warning */
diff -puN include/linux/checkpoint_hdr.h~no_data0 include/linux/checkpoint_hdr.h
--- linux-2.6.git/include/linux/checkpoint_hdr.h~no_data0	2009-02-27 12:07:37.000000000 -0800
+++ linux-2.6.git-dave/include/linux/checkpoint_hdr.h	2009-02-27 12:07:37.000000000 -0800
@@ -137,7 +137,7 @@ enum  fd_type {
 	CR_FD_DIR,
 };
 
-struct cr_hdr_fd_data {
+struct cr_hdr_fd {
 	__u16 fd_type;
 	__u16 f_mode;
 	__u32 f_flags;
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-27 20:56   ` Vegard Nossum
  2009-02-27 20:34 ` [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's Dave Hansen
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


I'll be adding to this in a moment and it is in a bad place
to do that cleanly now.

Also, increase the buffer size.  Most /proc files can
output up to a page, so use the same here.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/fs/proc/base.c |   23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff -puN fs/proc/base.c~breakout-fdinfo fs/proc/base.c
--- linux-2.6.git/fs/proc/base.c~breakout-fdinfo	2009-02-27 12:07:37.000000000 -0800
+++ linux-2.6.git-dave/fs/proc/base.c	2009-02-27 12:07:37.000000000 -0800
@@ -1632,7 +1632,18 @@ out:
 	return ~0U;
 }
 
-#define PROC_FDINFO_MAX 64
+#define PROC_FDINFO_MAX PAGE_SIZE
+
+static void proc_fd_write_info(struct file *file, char *info)
+{
+	int max = PROC_FDINFO_MAX;
+	int p = 0;
+	if (!info)
+		return;
+
+	p += snprintf(info+p, max-p, "pos:\t%lli\n", (long long) file->f_pos);
+	p += snprintf(info+p, max-p, "flags:\t0%o\n", file->f_flags);
+}
 
 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 {
@@ -1657,12 +1668,7 @@ static int proc_fd_info(struct inode *in
 				*path = file->f_path;
 				path_get(&file->f_path);
 			}
-			if (info)
-				snprintf(info, PROC_FDINFO_MAX,
-					 "pos:\t%lli\n"
-					 "flags:\t0%o\n",
-					 (long long) file->f_pos,
-					 file->f_flags);
+			proc_fd_write_info(file, info);
 			spin_unlock(&files->file_lock);
 			put_files_struct(files);
 			return 0;
@@ -1870,10 +1876,11 @@ static int proc_readfd(struct file *filp
 static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
 				      size_t len, loff_t *ppos)
 {
-	char tmp[PROC_FDINFO_MAX];
+	char *tmp = kmalloc(PROC_FDINFO_MAX, GFP_KERNEL);
 	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
 	if (!err)
 		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
+	kfree(tmp);
 	return err;
 }
 
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-27 21:16   ` Alexey Dobriyan
  2009-02-27 20:34 ` [RFC][PATCH 4/8] file c/r: expose functions to query fs support Dave Hansen
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


There are plenty of filesystems that are not supported for
c/r at this point.  Think of things like hugetlbfs which
are externally visible or pipefs which are kernel-internal.

This provides a quick way to mark the filesystems as they
get supported.  We won't mark wery many filesystems
explicitly for now.  That's because we will assume that
FS_REQUIRES_DEV implies FS_CHECKPOINTABLE for now.  This
assumption may need to get overridden in the future, but
it should be OK for now.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/fs/nfs/super.c     |   13 ++++++++-----
 linux-2.6.git-dave/include/linux/fs.h |    1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff -puN fs/nfs/super.c~FS_CHECKPOINTABLE-flag fs/nfs/super.c
--- linux-2.6.git/fs/nfs/super.c~FS_CHECKPOINTABLE-flag	2009-02-27 12:07:38.000000000 -0800
+++ linux-2.6.git-dave/fs/nfs/super.c	2009-02-27 12:07:38.000000000 -0800
@@ -233,12 +233,15 @@ static int nfs_xdev_get_sb(struct file_s
 static void nfs_kill_super(struct super_block *);
 static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
+#define NFS_FS_FLAGS FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|\
+		     FS_BINARY_MOUNTDATA|FS_CHECKPOINTABLE
+
 static struct file_system_type nfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "nfs",
 	.get_sb		= nfs_get_sb,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= NFS_FS_FLAGS,
 };
 
 struct file_system_type nfs_xdev_fs_type = {
@@ -246,7 +249,7 @@ struct file_system_type nfs_xdev_fs_type
 	.name		= "nfs",
 	.get_sb		= nfs_xdev_get_sb,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= NFS_FS_FLAGS,
 };
 
 static const struct super_operations nfs_sops = {
@@ -275,7 +278,7 @@ static struct file_system_type nfs4_fs_t
 	.name		= "nfs4",
 	.get_sb		= nfs4_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= NFS_FS_FLAGS,
 };
 
 struct file_system_type nfs4_xdev_fs_type = {
@@ -283,7 +286,7 @@ struct file_system_type nfs4_xdev_fs_typ
 	.name		= "nfs4",
 	.get_sb		= nfs4_xdev_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= NFS_FS_FLAGS,
 };
 
 struct file_system_type nfs4_referral_fs_type = {
@@ -291,7 +294,7 @@ struct file_system_type nfs4_referral_fs
 	.name		= "nfs4",
 	.get_sb		= nfs4_referral_get_sb,
 	.kill_sb	= nfs4_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+	.fs_flags	= NFS_FS_FLAGS,
 };
 
 static const struct super_operations nfs4_sops = {
diff -puN include/linux/fs.h~FS_CHECKPOINTABLE-flag include/linux/fs.h
--- linux-2.6.git/include/linux/fs.h~FS_CHECKPOINTABLE-flag	2009-02-27 12:07:38.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fs.h	2009-02-27 12:07:38.000000000 -0800
@@ -109,6 +109,7 @@ struct inodes_stat_t {
 #define FS_REQUIRES_DEV 1 
 #define FS_BINARY_MOUNTDATA 2
 #define FS_HAS_SUBTYPE 4
+#define FS_CHECKPOINTABLE 8
 #define FS_REVAL_DOT	16384	/* Check the paths ".", ".." for staleness */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move()
 					 * during rename() internally.
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 4/8] file c/r: expose functions to query fs support
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-27 21:14   ` Sukadev Bhattiprolu
  2009-02-28  1:33   ` Sukadev Bhattiprolu
  2009-02-27 20:34 ` [RFC][PATCH 5/8] add f_op for checkpointability Dave Hansen
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


This pair of functions will check to see whether a given
'struct file' can be checkpointed.  If it can't be, the
"explain" function can also give a description why.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/checkpoint/ckpt_file.c     |   46 ++++++++++++++++++++++++++
 linux-2.6.git-dave/include/linux/checkpoint.h |   18 ++++++++++
 2 files changed, 64 insertions(+)

diff -puN checkpoint/ckpt_file.c~cr-explain-unckpt-file checkpoint/ckpt_file.c
--- linux-2.6.git/checkpoint/ckpt_file.c~cr-explain-unckpt-file	2009-02-27 12:07:38.000000000 -0800
+++ linux-2.6.git-dave/checkpoint/ckpt_file.c	2009-02-27 12:07:38.000000000 -0800
@@ -72,6 +72,52 @@ int cr_scan_fds(struct files_struct *fil
 	return n;
 }
 
+int fs_is_cr_able(struct file_system_type *fs_type)
+{
+	if (fs_type->fs_flags & FS_CHECKPOINTABLE)
+		return 1;
+	/*
+	 * We assume that all block-based filesystems that
+	 * need devices work.  This covers all of the
+	 * "important" fs's by default like ext*.  If this
+	 * assumption becomes untrue, we may need a
+	 * NOT_CHECKPOINTABLE flag in the future
+	 */
+	if (fs_type->fs_flags & FS_REQUIRES_DEV)
+		return 1;
+	return 0;
+}
+
+int cr_explain_file(struct file *file, char *explain, int left)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct file_system_type *fs_type = inode->i_sb->s_type;
+
+	if (!fs_is_cr_able(fs_type))
+		return snprintf(explain, left,
+				" (%s does not support checkpoint)",
+				fs_type->name);
+
+	if (special_file(inode->i_mode))
+		return snprintf(explain, left, " (special file)");
+
+	return 0;
+}
+
+int cr_file_supported(struct file *file)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct file_system_type *fs_type = inode->i_sb->s_type;
+
+	if (fs_is_cr_able(fs_type))
+		return 0;
+
+	if (special_file(inode->i_mode))
+		return 0;
+
+	return 1;
+}
+
 /* cr_write_fd_data - dump the state of a given file pointer */
 static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
 {
diff -puN include/linux/checkpoint.h~cr-explain-unckpt-file include/linux/checkpoint.h
--- linux-2.6.git/include/linux/checkpoint.h~cr-explain-unckpt-file	2009-02-27 12:07:38.000000000 -0800
+++ linux-2.6.git-dave/include/linux/checkpoint.h	2009-02-27 12:07:38.000000000 -0800
@@ -13,6 +13,8 @@
 #include <linux/path.h>
 #include <linux/fs.h>
 
+#ifdef CONFIG_CHECKPOINT_RESTART
+
 #define CR_VERSION  2
 
 struct cr_ctx {
@@ -99,4 +101,20 @@ extern int cr_read_files(struct cr_ctx *
 
 #define pr_fmt(fmt) "[%d:c/r:%s] " fmt, task_pid_vnr(current), __func__
 
+int cr_explain_file(struct file *file, char *explain, int left);
+int cr_file_supported(struct file *file);
+
+#else /* !CONFIG_CHECKPOINT_RESTART */
+
+static inline int cr_explain_file(struct file *file, char *explain, int left)
+{
+	return 0;
+}
+
+int cr_file_supported(struct file *file)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CHECKPOINT_RESTART */
 #endif /* _CHECKPOINT_CKPT_H_ */
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
                   ` (2 preceding siblings ...)
  2009-02-27 20:34 ` [RFC][PATCH 4/8] file c/r: expose functions to query fs support Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-28  2:14   ` Sukadev Bhattiprolu
  2009-02-28 20:53   ` Christoph Hellwig
  2009-02-27 20:34 ` [RFC][PATCH 6/8] mark /dev/null and zero as checkpointable Dave Hansen
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


We have set up sane defaults for how filesystems should
be checkpointed.  However, as usual in the VFS, there
are specialized places that will always need an ability
to override these defaults.

This adds a new 'file_operations' function for
checkpointing a file.  I did this under the assumption
that we should have a dirt-simple way to make something
(un)checkpointable that fits in with current code.

As you can see in the /dev/null patch in a second, all
that we have to do to make something like /dev/null
supported is add a single "generic" f_op entry.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/checkpoint/ckpt_file.c     |   77 ++++++++++++++++----------
 linux-2.6.git-dave/include/linux/checkpoint.h |    6 ++
 linux-2.6.git-dave/include/linux/fs.h         |    4 +
 3 files changed, 60 insertions(+), 27 deletions(-)

diff -puN checkpoint/ckpt_file.c~f_op-for-checkpointability checkpoint/ckpt_file.c
--- linux-2.6.git/checkpoint/ckpt_file.c~f_op-for-checkpointability	2009-02-27 12:07:39.000000000 -0800
+++ linux-2.6.git-dave/checkpoint/ckpt_file.c	2009-02-27 12:07:39.000000000 -0800
@@ -104,56 +104,79 @@ int cr_explain_file(struct file *file, c
 	return 0;
 }
 
-int cr_file_supported(struct file *file)
+typedef int (do_checkpoint_t)(struct file *, struct cr_ctx *,
+			      struct cr_hdr_fd *);
+
+int generic_file_checkpoint(struct file *file, struct cr_ctx *ctx,
+			 struct cr_hdr_fd *hh)
+{
+	/*
+	 * A NULL hh means to make a trial run not
+	 * actually writing data.  Just determine
+	 * if the file is checkpointable.
+	 */
+	if (!hh)
+		return 0;
+
+	hh->f_flags = file->f_flags;
+	hh->f_mode = file->f_mode;
+	hh->f_pos = file->f_pos;
+	hh->f_version = file->f_version;
+	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
+
+	return 0;
+}
+
+do_checkpoint_t *cr_file_get_func(struct file *file)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct file_system_type *fs_type = inode->i_sb->s_type;
 
-	if (fs_is_cr_able(fs_type))
-		return 0;
+	if (file->f_op->checkpoint)
+		return file->f_op->checkpoint;
+
+	if (!fs_is_cr_able(fs_type))
+		return NULL;
 
 	if (special_file(inode->i_mode))
-		return 0;
+		return NULL;
 
-	return 1;
+	return generic_file_checkpoint;
+}
+
+int cr_file_supported(struct file *file)
+{
+	do_checkpoint_t *func = cr_file_get_func(file);
+
+	if (func)
+		return !func(file, NULL, NULL);
+
+	return 0;
 }
 
 /* cr_write_fd_data - dump the state of a given file pointer */
 static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
 {
+	do_checkpoint_t *ckpt_func;
 	struct cr_hdr h;
 	struct cr_hdr_fd *hh = cr_hbuf_get(ctx, sizeof(*hh));
-	struct dentry *dent = file->f_dentry;
-	struct inode *inode = dent->d_inode;
-	enum fd_type fd_type;
 	int ret;
 
 	h.type = CR_HDR_FD_DATA;
 	h.len = sizeof(*hh);
 	h.parent = parent;
 
-	hh->f_flags = file->f_flags;
-	hh->f_mode = file->f_mode;
-	hh->f_pos = file->f_pos;
-	hh->f_version = file->f_version;
-	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFREG:
-		fd_type = CR_FD_FILE;
-		break;
-	case S_IFDIR:
-		fd_type = CR_FD_DIR;
-		break;
-	default:
-		cr_hbuf_put(ctx, sizeof(*hh));
-		return -EBADF;
-	}
+	ckpt_func = cr_file_get_func(file);
+	ret = -EBADF;
+	if (!ckpt_func)
+		goto out;
 
-	/* FIX: check if the file/dir/link is unlinked */
-	hh->fd_type = fd_type;
+	ret = ckpt_func(file, ctx, hh);
+	if (ret)
+		goto out;
 
 	ret = cr_write_obj(ctx, &h, hh);
+out:
 	cr_hbuf_put(ctx, sizeof(*hh));
 	if (ret < 0)
 		return ret;
diff -puN include/linux/checkpoint.h~f_op-for-checkpointability include/linux/checkpoint.h
--- linux-2.6.git/include/linux/checkpoint.h~f_op-for-checkpointability	2009-02-27 12:07:39.000000000 -0800
+++ linux-2.6.git-dave/include/linux/checkpoint.h	2009-02-27 12:07:39.000000000 -0800
@@ -70,6 +70,7 @@ extern int cr_obj_add_ref(struct cr_ctx 
 			  unsigned short type, unsigned short flags);
 
 struct cr_hdr;
+struct cr_hdr_fd;
 
 extern int cr_write_obj(struct cr_ctx *ctx, struct cr_hdr *h, void *buf);
 extern int cr_write_buffer(struct cr_ctx *ctx, void *buf, int len);
@@ -103,9 +104,14 @@ extern int cr_read_files(struct cr_ctx *
 
 int cr_explain_file(struct file *file, char *explain, int left);
 int cr_file_supported(struct file *file);
+int generic_file_checkpoint(struct file *, struct cr_ctx *, struct cr_hdr_fd *);
 
 #else /* !CONFIG_CHECKPOINT_RESTART */
 
+struct cr_hdr_fd;
+
+#define generic_file_checkpoint NULL
+
 static inline int cr_explain_file(struct file *file, char *explain, int left)
 {
 	return 0;
diff -puN include/linux/fs.h~f_op-for-checkpointability include/linux/fs.h
--- linux-2.6.git/include/linux/fs.h~f_op-for-checkpointability	2009-02-27 12:07:39.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fs.h	2009-02-27 12:07:39.000000000 -0800
@@ -1303,6 +1303,9 @@ struct block_device_operations;
 #define HAVE_COMPAT_IOCTL 1
 #define HAVE_UNLOCKED_IOCTL 1
 
+struct cr_ctx;
+struct cr_hdr_fd;
+
 /*
  * NOTE:
  * read, write, poll, fsync, readv, writev, unlocked_ioctl and compat_ioctl
@@ -1335,6 +1338,7 @@ struct file_operations {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*setlease)(struct file *, long, struct file_lock **);
+	int (*checkpoint)(struct file *, struct cr_ctx *, struct cr_hdr_fd *);
 };
 
 struct inode_operations {
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 6/8] mark /dev/null and zero as checkpointable
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
                   ` (3 preceding siblings ...)
  2009-02-27 20:34 ` [RFC][PATCH 5/8] add f_op for checkpointability Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 7/8] add c/r info to fdinfo Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 8/8] check files for checkpointability Dave Hansen
  6 siblings, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


We currently have a special_file() check in the checkpoint
code which considers all special files as uncheckpointable.

Now that we have the f_op and a generic function, use that
to override these simple devices and make them OK to
checkpoint.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/drivers/char/mem.c |    3 +++
 1 file changed, 3 insertions(+)

diff -puN drivers/char/mem.c~make-dev-null-work drivers/char/mem.c
--- linux-2.6.git/drivers/char/mem.c~make-dev-null-work	2009-02-27 12:07:39.000000000 -0800
+++ linux-2.6.git-dave/drivers/char/mem.c	2009-02-27 12:07:39.000000000 -0800
@@ -27,6 +27,7 @@
 #include <linux/splice.h>
 #include <linux/pfn.h>
 #include <linux/smp_lock.h>
+#include <linux/checkpoint.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -824,6 +825,7 @@ static const struct file_operations null
 	.read		= read_null,
 	.write		= write_null,
 	.splice_write	= splice_write_null,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 #ifdef CONFIG_DEVPORT
@@ -840,6 +842,7 @@ static const struct file_operations zero
 	.read		= read_zero,
 	.write		= write_zero,
 	.mmap		= mmap_zero,
+	.checkpoint	= generic_file_checkpoint,
 };
 
 /*
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 7/8] add c/r info to fdinfo
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
                   ` (4 preceding siblings ...)
  2009-02-27 20:34 ` [RFC][PATCH 6/8] mark /dev/null and zero as checkpointable Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-27 20:34 ` [RFC][PATCH 8/8] check files for checkpointability Dave Hansen
  6 siblings, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


Use the new checkpoint/restart file functions to query
and report on each fd in the /proc/$$/fdinfo/X file.

This should provide an easy way to examine processes
at runtime to see what exactly is causing their inability
to checkpoint.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/fs/proc/base.c |    8 ++++++++
 1 file changed, 8 insertions(+)

diff -puN fs/proc/base.c~add-cr-part-to-fdinfo fs/proc/base.c
--- linux-2.6.git/fs/proc/base.c~add-cr-part-to-fdinfo	2009-02-27 12:07:40.000000000 -0800
+++ linux-2.6.git-dave/fs/proc/base.c	2009-02-27 12:07:40.000000000 -0800
@@ -80,6 +80,7 @@
 #include <linux/oom.h>
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
+#include <linux/checkpoint.h>
 #include "internal.h"
 
 /* NOTE:
@@ -1636,6 +1637,7 @@ out:
 
 static void proc_fd_write_info(struct file *file, char *info)
 {
+	int checkpointable;
 	int max = PROC_FDINFO_MAX;
 	int p = 0;
 	if (!info)
@@ -1643,6 +1645,12 @@ static void proc_fd_write_info(struct fi
 
 	p += snprintf(info+p, max-p, "pos:\t%lli\n", (long long) file->f_pos);
 	p += snprintf(info+p, max-p, "flags:\t0%o\n", file->f_flags);
+
+	checkpointable = cr_file_supported(file);
+	p += snprintf(info+p, max-p, "checkpointable:\t%d", checkpointable);
+	if (!checkpointable)
+		p += cr_explain_file(file, info+p, max-p);
+	p += snprintf(info+p, max-p, "\n");
 }
 
 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [RFC][PATCH 8/8] check files for checkpointability
  2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
                   ` (5 preceding siblings ...)
  2009-02-27 20:34 ` [RFC][PATCH 7/8] add c/r info to fdinfo Dave Hansen
@ 2009-02-27 20:34 ` Dave Hansen
  2009-02-28  2:57   ` Sukadev Bhattiprolu
                     ` (2 more replies)
  6 siblings, 3 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 20:34 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: containers, linux-kernel, Serge E. Hallyn, Oren Laadan,
	Alexey Dobriyan, hch, Dave Hansen


Introduce a files_struct counter to indicate whether a particular
file_struct has ever contained a file which can not be
checkpointed.  This flag is a one-way trip; once it is set, it may
not be unset.

We assume at allocation that a new files_struct is clean and may
be checkpointed.  However, as soon as it has had its files filled
from its parent's, we check it for real in __scan_files_for_cr().
At that point, we mark it if it contained any uncheckpointable
files.

We also check each 'struct file' when it is installed in a fd
slot.  This way, if anyone open()s or managed to dup() an
unsuppored file, we can catch it.

Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
---

 linux-2.6.git-dave/fs/file.c                  |   19 +++++++++++++++++++
 linux-2.6.git-dave/fs/open.c                  |    5 +++++
 linux-2.6.git-dave/include/linux/checkpoint.h |   14 ++++++++++++++
 linux-2.6.git-dave/include/linux/fdtable.h    |    3 +++
 4 files changed, 41 insertions(+)

diff -puN fs/file.c~track-files_struct-checkpointability fs/file.c
--- linux-2.6.git/fs/file.c~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
+++ linux-2.6.git-dave/fs/file.c	2009-02-27 12:07:41.000000000 -0800
@@ -15,6 +15,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/bitops.h>
+#include <linux/checkpoint.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
@@ -285,6 +286,20 @@ static int count_open_files(struct fdtab
 	return i;
 }
 
+static void __scan_files_for_cr(struct files_struct *files)
+{
+	int i;
+
+	for (i = 0; i < files->fdtab.max_fds; i++) {
+		struct file *f = fcheck_files(files, i);
+		if (!f)
+			continue;
+		if (cr_file_supported(f))
+			continue;
+		files_deny_checkpointing(files);
+	}
+}
+
 /*
  * Allocate a new files structure and copy contents from the
  * passed in files structure.
@@ -303,6 +318,9 @@ struct files_struct *dup_fd(struct files
 		goto out;
 
 	atomic_set(&newf->count, 1);
+#ifdef CONFIG_CHECKPOINT_RESTART
+	newf->may_checkpoint = 1;
+#endif
 
 	spin_lock_init(&newf->file_lock);
 	newf->next_fd = 0;
@@ -396,6 +414,7 @@ struct files_struct *dup_fd(struct files
 
 	rcu_assign_pointer(newf->fdt, new_fdt);
 
+	__scan_files_for_cr(newf);
 	return newf;
 
 out_release:
diff -puN fs/open.c~track-files_struct-checkpointability fs/open.c
--- linux-2.6.git/fs/open.c~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
+++ linux-2.6.git-dave/fs/open.c	2009-02-27 12:07:41.000000000 -0800
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/audit.h>
 #include <linux/falloc.h>
+#include <linux/checkpoint.h>
 
 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -1015,6 +1016,10 @@ void fd_install(unsigned int fd, struct 
 {
 	struct files_struct *files = current->files;
 	struct fdtable *fdt;
+
+	if (!cr_file_supported(file))
+		files_deny_checkpointing(files);
+
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
 	BUG_ON(fdt->fd[fd] != NULL);
diff -puN include/linux/checkpoint.h~track-files_struct-checkpointability include/linux/checkpoint.h
--- linux-2.6.git/include/linux/checkpoint.h~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
+++ linux-2.6.git-dave/include/linux/checkpoint.h	2009-02-27 12:07:41.000000000 -0800
@@ -12,6 +12,7 @@
 
 #include <linux/path.h>
 #include <linux/fs.h>
+#include <linux/fdtable.h>
 
 #ifdef CONFIG_CHECKPOINT_RESTART
 
@@ -102,6 +103,18 @@ extern int cr_read_files(struct cr_ctx *
 
 #define pr_fmt(fmt) "[%d:c/r:%s] " fmt, task_pid_vnr(current), __func__
 
+static inline void __files_deny_checkpointing(struct files_struct *files,
+		char *file, int line)
+{
+	if (!test_and_clear_bit(0, &files->may_checkpoint))
+		return;
+	printk(KERN_INFO "process performed an action that can not be "
+			"checkpointed at: %s:%d\n", file, line);
+	WARN_ON(1);
+}
+#define files_deny_checkpointing(f)  \
+	__files_deny_checkpointing(f, __FILE__, __LINE__)
+
 int cr_explain_file(struct file *file, char *explain, int left);
 int cr_file_supported(struct file *file);
 int generic_file_checkpoint(struct file *, struct cr_ctx *, struct cr_hdr_fd *);
@@ -112,6 +125,7 @@ struct cr_hdr_fd;
 
 #define generic_file_checkpoint NULL
 
+static inline void files_deny_checkpointing(struct files_struct *files) {}
 static inline int cr_explain_file(struct file *file, char *explain, int left)
 {
 	return 0;
diff -puN include/linux/fdtable.h~track-files_struct-checkpointability include/linux/fdtable.h
--- linux-2.6.git/include/linux/fdtable.h~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
+++ linux-2.6.git-dave/include/linux/fdtable.h	2009-02-27 12:07:41.000000000 -0800
@@ -45,6 +45,9 @@ struct files_struct {
 	atomic_t count;
 	struct fdtable *fdt;
 	struct fdtable fdtab;
+#ifdef CONFIG_CHECKPOINT_RESTART
+	unsigned long may_checkpoint;
+#endif
   /*
    * written part on a separate cache line in SMP
    */
_

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function
  2009-02-27 20:34 ` [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function Dave Hansen
@ 2009-02-27 20:56   ` Vegard Nossum
  2009-02-27 21:23     ` Dave Hansen
  0 siblings, 1 reply; 41+ messages in thread
From: Vegard Nossum @ 2009-02-27 20:56 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

2009/2/27 Dave Hansen <dave@linux.vnet.ibm.com>:
>
> I'll be adding to this in a moment and it is in a bad place
> to do that cleanly now.
>
> Also, increase the buffer size.  Most /proc files can
> output up to a page, so use the same here.
>
> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> ---
>
>  linux-2.6.git-dave/fs/proc/base.c |   23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
>
> diff -puN fs/proc/base.c~breakout-fdinfo fs/proc/base.c
> --- linux-2.6.git/fs/proc/base.c~breakout-fdinfo        2009-02-27 12:07:37.000000000 -0800
> +++ linux-2.6.git-dave/fs/proc/base.c   2009-02-27 12:07:37.000000000 -0800
> @@ -1632,7 +1632,18 @@ out:
>        return ~0U;
>  }
>
> -#define PROC_FDINFO_MAX 64
> +#define PROC_FDINFO_MAX PAGE_SIZE
> +
> +static void proc_fd_write_info(struct file *file, char *info)
> +{
> +       int max = PROC_FDINFO_MAX;
> +       int p = 0;
> +       if (!info)
> +               return;
> +
> +       p += snprintf(info+p, max-p, "pos:\t%lli\n", (long long) file->f_pos);
> +       p += snprintf(info+p, max-p, "flags:\t0%o\n", file->f_flags);

Actually, snprintf() is not the right function to use here.
scnprintf(), perhaps?


Vegard

-- 
"The animistic metaphor of the bug that maliciously sneaked in while
the programmer was not looking is intellectually dishonest as it
disguises that the error is the programmer's own creation."
	-- E. W. Dijkstra, EWD1036

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 4/8] file c/r: expose functions to query fs support
  2009-02-27 20:34 ` [RFC][PATCH 4/8] file c/r: expose functions to query fs support Dave Hansen
@ 2009-02-27 21:14   ` Sukadev Bhattiprolu
  2009-02-27 21:24     ` Dave Hansen
  2009-02-28  1:33   ` Sukadev Bhattiprolu
  1 sibling, 1 reply; 41+ messages in thread
From: Sukadev Bhattiprolu @ 2009-02-27 21:14 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

Dave Hansen [dave@linux.vnet.ibm.com] wrote:
| 
| This pair of functions will check to see whether a given
| 'struct file' can be checkpointed.  If it can't be, the
| "explain" function can also give a description why.
| 
| Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
| ---
| 
|  linux-2.6.git-dave/checkpoint/ckpt_file.c     |   46 ++++++++++++++++++++++++++
|  linux-2.6.git-dave/include/linux/checkpoint.h |   18 ++++++++++
|  2 files changed, 64 insertions(+)
| 
| diff -puN checkpoint/ckpt_file.c~cr-explain-unckpt-file checkpoint/ckpt_file.c
| --- linux-2.6.git/checkpoint/ckpt_file.c~cr-explain-unckpt-file	2009-02-27 12:07:38.000000000 -0800
| +++ linux-2.6.git-dave/checkpoint/ckpt_file.c	2009-02-27 12:07:38.000000000 -0800
| @@ -72,6 +72,52 @@ int cr_scan_fds(struct files_struct *fil
|  	return n;
|  }
| 
| +int fs_is_cr_able(struct file_system_type *fs_type)
| +{
| +	if (fs_type->fs_flags & FS_CHECKPOINTABLE)
| +		return 1;
| +	/*
| +	 * We assume that all block-based filesystems that
| +	 * need devices work.  This covers all of the
| +	 * "important" fs's by default like ext*.  If this
| +	 * assumption becomes untrue, we may need a
| +	 * NOT_CHECKPOINTABLE flag in the future
| +	 */
| +	if (fs_type->fs_flags & FS_REQUIRES_DEV)
| +		return 1;
| +	return 0;
| +}
| +
| +int cr_explain_file(struct file *file, char *explain, int left)
| +{
| +	struct inode *inode = file->f_dentry->d_inode;
| +	struct file_system_type *fs_type = inode->i_sb->s_type;
| +
| +	if (!fs_is_cr_able(fs_type))
| +		return snprintf(explain, left,
| +				" (%s does not support checkpoint)",
| +				fs_type->name);
| +
| +	if (special_file(inode->i_mode))
| +		return snprintf(explain, left, " (special file)");
| +
| +	return 0;
| +}
| +
| +int cr_file_supported(struct file *file)
| +{
| +	struct inode *inode = file->f_dentry->d_inode;
| +	struct file_system_type *fs_type = inode->i_sb->s_type;
| +
| +	if (fs_is_cr_able(fs_type))
| +		return 0;

Should this be if (!fs_is_cr_able(fs_type)) ?

| +
| +	if (special_file(inode->i_mode))
| +		return 0;
| +
| +	return 1;
| +}
| +
|  /* cr_write_fd_data - dump the state of a given file pointer */
|  static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
|  {
| diff -puN include/linux/checkpoint.h~cr-explain-unckpt-file include/linux/checkpoint.h
| --- linux-2.6.git/include/linux/checkpoint.h~cr-explain-unckpt-file	2009-02-27 12:07:38.000000000 -0800
| +++ linux-2.6.git-dave/include/linux/checkpoint.h	2009-02-27 12:07:38.000000000 -0800
| @@ -13,6 +13,8 @@
|  #include <linux/path.h>
|  #include <linux/fs.h>
| 
| +#ifdef CONFIG_CHECKPOINT_RESTART
| +
|  #define CR_VERSION  2
| 
|  struct cr_ctx {
| @@ -99,4 +101,20 @@ extern int cr_read_files(struct cr_ctx *
| 
|  #define pr_fmt(fmt) "[%d:c/r:%s] " fmt, task_pid_vnr(current), __func__
| 
| +int cr_explain_file(struct file *file, char *explain, int left);
| +int cr_file_supported(struct file *file);
| +
| +#else /* !CONFIG_CHECKPOINT_RESTART */
| +
| +static inline int cr_explain_file(struct file *file, char *explain, int left)
| +{
| +	return 0;
| +}
| +
| +int cr_file_supported(struct file *file)
| +{
| +	return 0;
| +}
| +
| +#endif /* CONFIG_CHECKPOINT_RESTART */
|  #endif /* _CHECKPOINT_CKPT_H_ */
| _
| --
| To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
| the body of a message to majordomo@vger.kernel.org
| More majordomo info at  http://vger.kernel.org/majordomo-info.html
| Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's
  2009-02-27 20:34 ` [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's Dave Hansen
@ 2009-02-27 21:16   ` Alexey Dobriyan
  2009-02-27 21:20     ` Dave Hansen
  0 siblings, 1 reply; 41+ messages in thread
From: Alexey Dobriyan @ 2009-02-27 21:16 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn, Oren Laadan, hch

On Fri, Feb 27, 2009 at 12:34:28PM -0800, Dave Hansen wrote:
> +#define FS_CHECKPOINTABLE 8

This will ban all sockets, for instance, until all of them are
C/R-ready. And every new socket type must be made C/R-ready from
the very beginning, or risk major C/R breakage.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's
  2009-02-27 21:16   ` Alexey Dobriyan
@ 2009-02-27 21:20     ` Dave Hansen
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 21:20 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn, Oren Laadan, hch

On Sat, 2009-02-28 at 00:16 +0300, Alexey Dobriyan wrote:
> On Fri, Feb 27, 2009 at 12:34:28PM -0800, Dave Hansen wrote:
> > +#define FS_CHECKPOINTABLE 8
> 
> This will ban all sockets, for instance, until all of them are
> C/R-ready. And every new socket type must be made C/R-ready from
> the very beginning, or risk major C/R breakage.

Yes, it is very important to have fine-grained control over what is
checkpointable and not.  We can't simply say "all sockets are OK" all at
once.  I meant this as a high-level things so that I didn't have to go
chase down the several f_ops for each in and every filesystem that is
supported.

Did you look farther down in the patches to see how this can be
overridden with an f_op?

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function
  2009-02-27 20:56   ` Vegard Nossum
@ 2009-02-27 21:23     ` Dave Hansen
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 21:23 UTC (permalink / raw)
  To: Vegard Nossum; +Cc: containers, linux-kernel, hch, Ingo Molnar, Alexey Dobriyan

On Fri, 2009-02-27 at 21:56 +0100, Vegard Nossum wrote:
> > +static void proc_fd_write_info(struct file *file, char *info)
> > +{
> > +       int max = PROC_FDINFO_MAX;
> > +       int p = 0;
> > +       if (!info)
> > +               return;
> > +
> > +       p += snprintf(info+p, max-p, "pos:\t%lli\n", (long long) file->f_pos);
> > +       p += snprintf(info+p, max-p, "flags:\t0%o\n", file->f_flags);
> 
> Actually, snprintf() is not the right function to use here.
> scnprintf(), perhaps?

Yes, that does look more appropriate.  I'll double-check what happens
when we overrun the buffer.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 4/8] file c/r: expose functions to query fs support
  2009-02-27 21:14   ` Sukadev Bhattiprolu
@ 2009-02-27 21:24     ` Dave Hansen
  2009-02-27 21:32       ` Dave Hansen
  0 siblings, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 21:24 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: containers, linux-kernel, hch, Ingo Molnar, Alexey Dobriyan

On Fri, 2009-02-27 at 13:14 -0800, Sukadev Bhattiprolu wrote:
> Dave Hansen [dave@linux.vnet.ibm.com] wrote:
> | +int cr_file_supported(struct file *file)
> | +{
> | +	struct inode *inode = file->f_dentry->d_inode;
> | +	struct file_system_type *fs_type = inode->i_sb->s_type;
> | +
> | +	if (fs_is_cr_able(fs_type))
> | +		return 0;
> 
> Should this be if (!fs_is_cr_able(fs_type)) ?

Yes.  I did find and fix that at some point, but it appears I didn't
manage to integrate it into what I sent out.

Thanks, Suka.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 4/8] file c/r: expose functions to query fs support
  2009-02-27 21:24     ` Dave Hansen
@ 2009-02-27 21:32       ` Dave Hansen
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-27 21:32 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: hch, containers, Ingo Molnar, linux-kernel, Alexey Dobriyan

On Fri, 2009-02-27 at 13:24 -0800, Dave Hansen wrote:
> On Fri, 2009-02-27 at 13:14 -0800, Sukadev Bhattiprolu wrote:
> > Dave Hansen [dave@linux.vnet.ibm.com] wrote:
> > | +int cr_file_supported(struct file *file)
> > | +{
> > | +   struct inode *inode = file->f_dentry->d_inode;
> > | +   struct file_system_type *fs_type = inode->i_sb->s_type;
> > | +
> > | +   if (fs_is_cr_able(fs_type))
> > | +           return 0;
> > 
> > Should this be if (!fs_is_cr_able(fs_type)) ?
> 
> Yes.  I did find and fix that at some point, but it appears I didn't
> manage to integrate it into what I sent out.

Yeah, I think I managed to accidentally dump the fix into the next patch
instead of this one.  I'll correct that in the next set.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 4/8] file c/r: expose functions to query fs support
  2009-02-27 20:34 ` [RFC][PATCH 4/8] file c/r: expose functions to query fs support Dave Hansen
  2009-02-27 21:14   ` Sukadev Bhattiprolu
@ 2009-02-28  1:33   ` Sukadev Bhattiprolu
  1 sibling, 0 replies; 41+ messages in thread
From: Sukadev Bhattiprolu @ 2009-02-28  1:33 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

Dave Hansen [dave@linux.vnet.ibm.com] wrote:
| 
| This pair of functions will check to see whether a given
| 'struct file' can be checkpointed.  If it can't be, the
| "explain" function can also give a description why.
| 
| Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
| ---
| 
|  linux-2.6.git-dave/checkpoint/ckpt_file.c     |   46 ++++++++++++++++++++++++++
|  linux-2.6.git-dave/include/linux/checkpoint.h |   18 ++++++++++
|  2 files changed, 64 insertions(+)
| 
| diff -puN checkpoint/ckpt_file.c~cr-explain-unckpt-file checkpoint/ckpt_file.c
| --- linux-2.6.git/checkpoint/ckpt_file.c~cr-explain-unckpt-file	2009-02-27 12:07:38.000000000 -0800
| +++ linux-2.6.git-dave/checkpoint/ckpt_file.c	2009-02-27 12:07:38.000000000 -0800
| @@ -72,6 +72,52 @@ int cr_scan_fds(struct files_struct *fil
|  	return n;
|  }
| 
| +int fs_is_cr_able(struct file_system_type *fs_type)

Too many variations in naming ? "*cr_able(), *supported(), *checkpointable()'
all for a boolean result.

How about calling this cr_fs_checkpointable() ?

| +{
| +	if (fs_type->fs_flags & FS_CHECKPOINTABLE)
| +		return 1;
| +	/*
| +	 * We assume that all block-based filesystems that
| +	 * need devices work.  This covers all of the
| +	 * "important" fs's by default like ext*.  If this
| +	 * assumption becomes untrue, we may need a
| +	 * NOT_CHECKPOINTABLE flag in the future
| +	 */
| +	if (fs_type->fs_flags & FS_REQUIRES_DEV)
| +		return 1;
| +	return 0;
| +}
| +
| +int cr_explain_file(struct file *file, char *explain, int left)
| +{
| +	struct inode *inode = file->f_dentry->d_inode;
| +	struct file_system_type *fs_type = inode->i_sb->s_type;
| +
| +	if (!fs_is_cr_able(fs_type))
| +		return snprintf(explain, left,
| +				" (%s does not support checkpoint)",
| +				fs_type->name);
| +
| +	if (special_file(inode->i_mode))
| +		return snprintf(explain, left, " (special file)");
| +
| +	return 0;
| +}
| +
| +int cr_file_supported(struct file *file)

and this cr_file_checkpointable() ?

Sukadev

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-27 20:34 ` [RFC][PATCH 5/8] add f_op for checkpointability Dave Hansen
@ 2009-02-28  2:14   ` Sukadev Bhattiprolu
  2009-02-28  2:51     ` Dave Hansen
  2009-02-28 20:53   ` Christoph Hellwig
  1 sibling, 1 reply; 41+ messages in thread
From: Sukadev Bhattiprolu @ 2009-02-28  2:14 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

Dave Hansen [dave@linux.vnet.ibm.com] wrote:
| 
| We have set up sane defaults for how filesystems should
| be checkpointed.  However, as usual in the VFS, there
| are specialized places that will always need an ability
| to override these defaults.
| 
| This adds a new 'file_operations' function for
| checkpointing a file.  I did this under the assumption
| that we should have a dirt-simple way to make something
| (un)checkpointable that fits in with current code.
| 
| As you can see in the /dev/null patch in a second, all
| that we have to do to make something like /dev/null
| supported is add a single "generic" f_op entry.
| 
| Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
| ---
| 
|  linux-2.6.git-dave/checkpoint/ckpt_file.c     |   77 ++++++++++++++++----------
|  linux-2.6.git-dave/include/linux/checkpoint.h |    6 ++
|  linux-2.6.git-dave/include/linux/fs.h         |    4 +
|  3 files changed, 60 insertions(+), 27 deletions(-)
| 
| diff -puN checkpoint/ckpt_file.c~f_op-for-checkpointability checkpoint/ckpt_file.c
| --- linux-2.6.git/checkpoint/ckpt_file.c~f_op-for-checkpointability	2009-02-27 12:07:39.000000000 -0800
| +++ linux-2.6.git-dave/checkpoint/ckpt_file.c	2009-02-27 12:07:39.000000000 -0800
| @@ -104,56 +104,79 @@ int cr_explain_file(struct file *file, c
|  	return 0;
|  }
| 
| -int cr_file_supported(struct file *file)
| +typedef int (do_checkpoint_t)(struct file *, struct cr_ctx *,
| +			      struct cr_hdr_fd *);
| +
| +int generic_file_checkpoint(struct file *file, struct cr_ctx *ctx,
| +			 struct cr_hdr_fd *hh)
| +{
| +	/*
| +	 * A NULL hh means to make a trial run not
| +	 * actually writing data.  Just determine
| +	 * if the file is checkpointable.
| +	 */
| +	if (!hh)
| +		return 0;
| +
| +	hh->f_flags = file->f_flags;
| +	hh->f_mode = file->f_mode;
| +	hh->f_pos = file->f_pos;
| +	hh->f_version = file->f_version;
| +	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
| +
| +	return 0;
| +}
| +
| +do_checkpoint_t *cr_file_get_func(struct file *file)
|  {

Do we really need this helper ? IOW do callers need this function pointer
itself ? Or can we have a more generic helper that callers can use both
check if checkpoint is possible (pass NULL in ctx and hh) and to actually
checkpoint. Something like:

	int cr_file_checkpoint(file, ctx, hh)
	{
		int rc = -1;

		if (!cr_fs_checkpointable(fstype))
			return rc;
		
		if (!cr_file_checkpointable(file))
			return rc;

		if (special_file(file))
			return rc;

		op = file->f_op->checkpoint;
		if (!op)
			op = generic_file_checkpoint;

		return (*op)(file, ctx, hh);
	}



|  	struct inode *inode = file->f_dentry->d_inode;
|  	struct file_system_type *fs_type = inode->i_sb->s_type;
| 
| -	if (fs_is_cr_able(fs_type))
| -		return 0;
| +	if (file->f_op->checkpoint)
| +		return file->f_op->checkpoint;
| +
| +	if (!fs_is_cr_able(fs_type))
| +		return NULL;
| 
|  	if (special_file(inode->i_mode))
| -		return 0;
| +		return NULL;
| 
| -	return 1;
| +	return generic_file_checkpoint;
| +}
| +
| +int cr_file_supported(struct file *file)
| +{
| +	do_checkpoint_t *func = cr_file_get_func(file);
| +
| +	if (func)
| +		return !func(file, NULL, NULL);
| +
| +	return 0;
|  }
| 
|  /* cr_write_fd_data - dump the state of a given file pointer */
|  static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
|  {
| +	do_checkpoint_t *ckpt_func;
|  	struct cr_hdr h;
|  	struct cr_hdr_fd *hh = cr_hbuf_get(ctx, sizeof(*hh));
| -	struct dentry *dent = file->f_dentry;
| -	struct inode *inode = dent->d_inode;
| -	enum fd_type fd_type;
|  	int ret;
| 
|  	h.type = CR_HDR_FD_DATA;
|  	h.len = sizeof(*hh);
|  	h.parent = parent;
| 
| -	hh->f_flags = file->f_flags;
| -	hh->f_mode = file->f_mode;
| -	hh->f_pos = file->f_pos;
| -	hh->f_version = file->f_version;
| -	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
| -
| -	switch (inode->i_mode & S_IFMT) {
| -	case S_IFREG:
| -		fd_type = CR_FD_FILE;
| -		break;
| -	case S_IFDIR:
| -		fd_type = CR_FD_DIR;
| -		break;
| -	default:
| -		cr_hbuf_put(ctx, sizeof(*hh));
| -		return -EBADF;
| -	}
| +	ckpt_func = cr_file_get_func(file);
| +	ret = -EBADF;
| +	if (!ckpt_func)
| +		goto out;
| 
| -	/* FIX: check if the file/dir/link is unlinked */
| -	hh->fd_type = fd_type;
| +	ret = ckpt_func(file, ctx, hh);
| +	if (ret)
| +		goto out;

So we can combine these two steps into just one ?

	ret = -EBADF;
	hh->fd_type = fd_type;
	if (cr_file_checkpoint(file, ctx, hh))
		goto out;

| 
|  	ret = cr_write_obj(ctx, &h, hh);
| +out:
|  	cr_hbuf_put(ctx, sizeof(*hh));
|  	if (ret < 0)
|  		return ret;

Sukadev

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-28  2:14   ` Sukadev Bhattiprolu
@ 2009-02-28  2:51     ` Dave Hansen
  0 siblings, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-02-28  2:51 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: containers, linux-kernel, hch, Ingo Molnar, Alexey Dobriyan

On Fri, 2009-02-27 at 18:14 -0800, Sukadev Bhattiprolu wrote:
> | -int cr_file_supported(struct file *file)
> | +typedef int (do_checkpoint_t)(struct file *, struct cr_ctx *,
> | +			      struct cr_hdr_fd *);
> | +
> | +int generic_file_checkpoint(struct file *file, struct cr_ctx *ctx,
> | +			 struct cr_hdr_fd *hh)
> | +{
> | +	/*
> | +	 * A NULL hh means to make a trial run not
> | +	 * actually writing data.  Just determine
> | +	 * if the file is checkpointable.
> | +	 */
> | +	if (!hh)
> | +		return 0;
> | +
> | +	hh->f_flags = file->f_flags;
> | +	hh->f_mode = file->f_mode;
> | +	hh->f_pos = file->f_pos;
> | +	hh->f_version = file->f_version;
> | +	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
> | +
> | +	return 0;
> | +}
> | +
> | +do_checkpoint_t *cr_file_get_func(struct file *file)
> |  {
> 
> Do we really need this helper ? IOW do callers need this function pointer
> itself ? Or can we have a more generic helper that callers can use both
> check if checkpoint is possible (pass NULL in ctx and hh) and to actually
> checkpoint. Something like:

That helper is there because I overload that f_op->checkpoint for both
the "is this file checkpointable" function and the "checkpoint this
file" operation.

> 	int cr_file_checkpoint(file, ctx, hh)
> 	{
> 		int rc = -1;
> 
> 		if (!cr_fs_checkpointable(fstype))
> 			return rc;
> 		
> 		if (!cr_file_checkpointable(file))
> 			return rc;
> 
> 		if (special_file(file))
> 			return rc;
> 
> 		op = file->f_op->checkpoint;
> 		if (!op)
> 			op = generic_file_checkpoint;
> 
> 		return (*op)(file, ctx, hh);
> 	}

First thing you have to be careful about is that the f_op should be able
to override *everything*.  So it has to be first, always.

The other part is that I'd prefer not to call check (a la !
cr_file_checkpointable()) then try to checkpoint a second later, since
we share the implementation between the two here.  

You are probably right that we should probably be able to do this:

int cr_file_checkpointable(file)
{
	return !cr_file_checkpoint(file, NULL, NULL);
}

I'll look into that and see how natural it is to implement.

> |  	struct inode *inode = file->f_dentry->d_inode;
> |  	struct file_system_type *fs_type = inode->i_sb->s_type;
> | 
> | -	if (fs_is_cr_able(fs_type))
> | -		return 0;
> | +	if (file->f_op->checkpoint)
> | +		return file->f_op->checkpoint;
> | +
> | +	if (!fs_is_cr_able(fs_type))
> | +		return NULL;
> | 
> |  	if (special_file(inode->i_mode))
> | -		return 0;
> | +		return NULL;
> | 
> | -	return 1;
> | +	return generic_file_checkpoint;
> | +}
> | +
> | +int cr_file_supported(struct file *file)
> | +{
> | +	do_checkpoint_t *func = cr_file_get_func(file);
> | +
> | +	if (func)
> | +		return !func(file, NULL, NULL);
> | +
> | +	return 0;
> |  }
> | 
> |  /* cr_write_fd_data - dump the state of a given file pointer */
> |  static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
> |  {
> | +	do_checkpoint_t *ckpt_func;
> |  	struct cr_hdr h;
> |  	struct cr_hdr_fd *hh = cr_hbuf_get(ctx, sizeof(*hh));
> | -	struct dentry *dent = file->f_dentry;
> | -	struct inode *inode = dent->d_inode;
> | -	enum fd_type fd_type;
> |  	int ret;
> | 
> |  	h.type = CR_HDR_FD_DATA;
> |  	h.len = sizeof(*hh);
> |  	h.parent = parent;
> | 
> | -	hh->f_flags = file->f_flags;
> | -	hh->f_mode = file->f_mode;
> | -	hh->f_pos = file->f_pos;
> | -	hh->f_version = file->f_version;
> | -	/* FIX: need also file->uid, file->gid, file->f_owner, etc */
> | -
> | -	switch (inode->i_mode & S_IFMT) {
> | -	case S_IFREG:
> | -		fd_type = CR_FD_FILE;
> | -		break;
> | -	case S_IFDIR:
> | -		fd_type = CR_FD_DIR;
> | -		break;
> | -	default:
> | -		cr_hbuf_put(ctx, sizeof(*hh));
> | -		return -EBADF;
> | -	}
> | +	ckpt_func = cr_file_get_func(file);
> | +	ret = -EBADF;
> | +	if (!ckpt_func)
> | +		goto out;
> | 
> | -	/* FIX: check if the file/dir/link is unlinked */
> | -	hh->fd_type = fd_type;
> | +	ret = ckpt_func(file, ctx, hh);
> | +	if (ret)
> | +		goto out;
> 
> So we can combine these two steps into just one ?
> 
> 	ret = -EBADF;
> 	hh->fd_type = fd_type;
> 	if (cr_file_checkpoint(file, ctx, hh))
> 		goto out;

We could but it would be harder to read. :)

I really don't like operational things inside if()s like that.  I think
it kinda hides important code.

Thanks for taking the time to look at this, Suka.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-02-27 20:34 ` [RFC][PATCH 8/8] check files for checkpointability Dave Hansen
@ 2009-02-28  2:57   ` Sukadev Bhattiprolu
  2009-03-01 17:00     ` Serge E. Hallyn
  2009-03-04 23:41     ` Dave Hansen
  2009-03-01 19:43   ` Serge E. Hallyn
  2009-03-02 13:37   ` Serge E. Hallyn
  2 siblings, 2 replies; 41+ messages in thread
From: Sukadev Bhattiprolu @ 2009-02-28  2:57 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

Dave Hansen [dave@linux.vnet.ibm.com] wrote:
| 
| Introduce a files_struct counter to indicate whether a particular
| file_struct has ever contained a file which can not be
| checkpointed.  This flag is a one-way trip; once it is set, it may
| not be unset.
| 
| We assume at allocation that a new files_struct is clean and may
| be checkpointed.  However, as soon as it has had its files filled
| from its parent's, we check it for real in __scan_files_for_cr().
| At that point, we mark it if it contained any uncheckpointable
| files.

Hmm. Why not just copy ->may_checkpoint setting from parent (or old)
files_struct ? If parent is not checkpointable, then child won't be
and vice-versa - no ?

| 
| We also check each 'struct file' when it is installed in a fd
| slot.  This way, if anyone open()s or managed to dup() an
| unsuppored file, we can catch it.
| 
| Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
| ---
| 
|  linux-2.6.git-dave/fs/file.c                  |   19 +++++++++++++++++++
|  linux-2.6.git-dave/fs/open.c                  |    5 +++++
|  linux-2.6.git-dave/include/linux/checkpoint.h |   14 ++++++++++++++
|  linux-2.6.git-dave/include/linux/fdtable.h    |    3 +++
|  4 files changed, 41 insertions(+)
| 
| diff -puN fs/file.c~track-files_struct-checkpointability fs/file.c
| --- linux-2.6.git/fs/file.c~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
| +++ linux-2.6.git-dave/fs/file.c	2009-02-27 12:07:41.000000000 -0800
| @@ -15,6 +15,7 @@
|  #include <linux/file.h>
|  #include <linux/fdtable.h>
|  #include <linux/bitops.h>
| +#include <linux/checkpoint.h>
|  #include <linux/interrupt.h>
|  #include <linux/spinlock.h>
|  #include <linux/rcupdate.h>
| @@ -285,6 +286,20 @@ static int count_open_files(struct fdtab
|  	return i;
|  }
| 
| +static void __scan_files_for_cr(struct files_struct *files)
| +{
| +	int i;
| +
| +	for (i = 0; i < files->fdtab.max_fds; i++) {
| +		struct file *f = fcheck_files(files, i);
| +		if (!f)
| +			continue;
| +		if (cr_file_supported(f))
| +			continue;
| +		files_deny_checkpointing(files);
| +	}
| +}
| +

A version of __scan_files_for_cr() for CONFIG_CHECKPOINT_RESTART=n or...

|  /*
|   * Allocate a new files structure and copy contents from the
|   * passed in files structure.
| @@ -303,6 +318,9 @@ struct files_struct *dup_fd(struct files
|  		goto out;
| 
|  	atomic_set(&newf->count, 1);
| +#ifdef CONFIG_CHECKPOINT_RESTART
| +	newf->may_checkpoint = 1;
| +#endif
| 
|  	spin_lock_init(&newf->file_lock);
|  	newf->next_fd = 0;
| @@ -396,6 +414,7 @@ struct files_struct *dup_fd(struct files
| 
|  	rcu_assign_pointer(newf->fdt, new_fdt);
| 
| +	__scan_files_for_cr(newf);

... #ifdef CONFIG_CHECKPOINT_RESTART here ?

Sukadev

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-27 20:34 ` [RFC][PATCH 5/8] add f_op for checkpointability Dave Hansen
  2009-02-28  2:14   ` Sukadev Bhattiprolu
@ 2009-02-28 20:53   ` Christoph Hellwig
  2009-02-28 21:37     ` Dave Hansen
  2009-03-02 17:05     ` Dave Hansen
  1 sibling, 2 replies; 41+ messages in thread
From: Christoph Hellwig @ 2009-02-28 20:53 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

On Fri, Feb 27, 2009 at 12:34:31PM -0800, Dave Hansen wrote:
> 
> We have set up sane defaults for how filesystems should
> be checkpointed.  However, as usual in the VFS, there
> are specialized places that will always need an ability
> to override these defaults.
> 
> This adds a new 'file_operations' function for
> checkpointing a file.  I did this under the assumption
> that we should have a dirt-simple way to make something
> (un)checkpointable that fits in with current code.
> 
> As you can see in the /dev/null patch in a second, all
> that we have to do to make something like /dev/null
> supported is add a single "generic" f_op entry.

Please don't do the fallback to allow checkpointing without file
operations.  We've never had luck with these fallbacks, and I'm
in the process of getting of the last default file operation (llseek,
which has a very bad default) currently.

Incidentally that should also allow you to get rid of the per-fs flag
by just checking for the presence of the operation to check if
checkpointing is allowed.

Also the double-use of the op seem not very nice to me.  Is there any
real life use case were you would have the operation on a file but
sometimes not allow checkpoiting?


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-28 20:53   ` Christoph Hellwig
@ 2009-02-28 21:37     ` Dave Hansen
  2009-03-01 15:19       ` Serge E. Hallyn
  2009-03-02 17:05     ` Dave Hansen
  1 sibling, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-02-28 21:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan

On Sat, 2009-02-28 at 15:53 -0500, Christoph Hellwig wrote:
> On Fri, Feb 27, 2009 at 12:34:31PM -0800, Dave Hansen wrote:
> > We have set up sane defaults for how filesystems should
> > be checkpointed.  However, as usual in the VFS, there
> > are specialized places that will always need an ability
> > to override these defaults.
> > 
> > This adds a new 'file_operations' function for
> > checkpointing a file.  I did this under the assumption
> > that we should have a dirt-simple way to make something
> > (un)checkpointable that fits in with current code.
> > 
> > As you can see in the /dev/null patch in a second, all
> > that we have to do to make something like /dev/null
> > supported is add a single "generic" f_op entry.
> 
> Please don't do the fallback to allow checkpointing without file
> operations.  We've never had luck with these fallbacks, and I'm
> in the process of getting of the last default file operation (llseek,
> which has a very bad default) currently.

You'll probably believe me when I tell you that I was looking at how
lseek was done when I did this. :)

Doing this will make for a much, much bigger patch, but I do understand
why you're asking for it to be done this way, so I'll give it a shot for
the next round.

> Incidentally that should also allow you to get rid of the per-fs flag
> by just checking for the presence of the operation to check if
> checkpointing is allowed.

Yup, I completely agree.  The flag was basically an indicator when it
was OK to do the fallback.

> Also the double-use of the op seem not very nice to me.  Is there any
> real life use case were you would have the operation on a file but
> sometimes not allow checkpoiting?

No, I don't have any good concrete ones.  The first thing that comes to
mind is something like a pipe.  We can checkpoint when there's no data,
but must refuse when there's data in the pipe.  In practice, pipes are
fixable, but it is the kind of situation where I expected it to get
used.

Thanks, Christoph.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-28 21:37     ` Dave Hansen
@ 2009-03-01 15:19       ` Serge E. Hallyn
  0 siblings, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-01 15:19 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Christoph Hellwig, Ingo Molnar, containers, linux-kernel,
	Oren Laadan, Alexey Dobriyan

Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> > Also the double-use of the op seem not very nice to me.  Is there any
> > real life use case were you would have the operation on a file but
> > sometimes not allow checkpoiting?
> 
> No, I don't have any good concrete ones.  The first thing that comes to
> mind is something like a pipe.  We can checkpoint when there's no data,
> but must refuse when there's data in the pipe.  In practice, pipes are
> fixable, but it is the kind of situation where I expected it to get
> used.

Hmm, but that's the kind of thing Ingo is resolutely against,
right?  If you've opened some resource that may in certain
cases not be checkpointable, then checkpointing it in certain
states is just wrong, as the app can never know for sure (without
knowing the fragile and temporary implementation details)
whether it is checkpointable.

So we either support pipes or we don't.

(Now maybe you have another use in mind for it...)

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-02-28  2:57   ` Sukadev Bhattiprolu
@ 2009-03-01 17:00     ` Serge E. Hallyn
  2009-03-04 23:41     ` Dave Hansen
  1 sibling, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-01 17:00 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: Dave Hansen, Ingo Molnar, containers, linux-kernel, Oren Laadan,
	Alexey Dobriyan, hch

Quoting Sukadev Bhattiprolu (sukadev@linux.vnet.ibm.com):
> Dave Hansen [dave@linux.vnet.ibm.com] wrote:
> | 
> | Introduce a files_struct counter to indicate whether a particular
> | file_struct has ever contained a file which can not be
> | checkpointed.  This flag is a one-way trip; once it is set, it may
> | not be unset.
> | 
> | We assume at allocation that a new files_struct is clean and may
> | be checkpointed.  However, as soon as it has had its files filled
> | from its parent's, we check it for real in __scan_files_for_cr().
> | At that point, we mark it if it contained any uncheckpointable
> | files.
> 
> Hmm. Why not just copy ->may_checkpoint setting from parent (or old)
> files_struct ? If parent is not checkpointable, then child won't be
> and vice-versa - no ?

No.  We don't clear the files_struct checkpointable flag when an
uncheckpointable file is closed.  But if the parent has closed
all uncheckpointable files before forking, then the child can
be started with a checkpointable files_struct.

Otherwise it wouldn't just be the task which has a one-way trip to
uncheckpointability, but process trees, and - assuming init does
anything uncheckpointable at all - the whole system :)

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-02-27 20:34 ` [RFC][PATCH 8/8] check files for checkpointability Dave Hansen
  2009-02-28  2:57   ` Sukadev Bhattiprolu
@ 2009-03-01 19:43   ` Serge E. Hallyn
  2009-03-02 13:37   ` Serge E. Hallyn
  2 siblings, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-01 19:43 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Oren Laadan, Alexey Dobriyan, hch

Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> 
> Introduce a files_struct counter to indicate whether a particular
> file_struct has ever contained a file which can not be
> checkpointed.  This flag is a one-way trip; once it is set, it may
> not be unset.
> 
> We assume at allocation that a new files_struct is clean and may
> be checkpointed.  However, as soon as it has had its files filled
> from its parent's, we check it for real in __scan_files_for_cr().
> At that point, we mark it if it contained any uncheckpointable
> files.
> 
> We also check each 'struct file' when it is installed in a fd
> slot.  This way, if anyone open()s or managed to dup() an
> unsuppored file, we can catch it.
> 
> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> ---
> 
>  linux-2.6.git-dave/fs/file.c                  |   19 +++++++++++++++++++
>  linux-2.6.git-dave/fs/open.c                  |    5 +++++
>  linux-2.6.git-dave/include/linux/checkpoint.h |   14 ++++++++++++++
>  linux-2.6.git-dave/include/linux/fdtable.h    |    3 +++
>  4 files changed, 41 insertions(+)
> 
> diff -puN fs/file.c~track-files_struct-checkpointability fs/file.c
> --- linux-2.6.git/fs/file.c~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
> +++ linux-2.6.git-dave/fs/file.c	2009-02-27 12:07:41.000000000 -0800
> @@ -15,6 +15,7 @@
>  #include <linux/file.h>
>  #include <linux/fdtable.h>
>  #include <linux/bitops.h>
> +#include <linux/checkpoint.h>
>  #include <linux/interrupt.h>
>  #include <linux/spinlock.h>
>  #include <linux/rcupdate.h>
> @@ -285,6 +286,20 @@ static int count_open_files(struct fdtab
>  	return i;
>  }
> 
> +static void __scan_files_for_cr(struct files_struct *files)
> +{
> +	int i;
> +
> +	for (i = 0; i < files->fdtab.max_fds; i++) {
> +		struct file *f = fcheck_files(files, i);
> +		if (!f)
> +			continue;
> +		if (cr_file_supported(f))
> +			continue;
> +		files_deny_checkpointing(files);
> +	}
> +}
> +
>  /*
>   * Allocate a new files structure and copy contents from the
>   * passed in files structure.
> @@ -303,6 +318,9 @@ struct files_struct *dup_fd(struct files
>  		goto out;
> 
>  	atomic_set(&newf->count, 1);
> +#ifdef CONFIG_CHECKPOINT_RESTART
> +	newf->may_checkpoint = 1;
> +#endif
> 
>  	spin_lock_init(&newf->file_lock);
>  	newf->next_fd = 0;
> @@ -396,6 +414,7 @@ struct files_struct *dup_fd(struct files
> 
>  	rcu_assign_pointer(newf->fdt, new_fdt);
> 
> +	__scan_files_for_cr(newf);
>  	return newf;
> 
>  out_release:
> diff -puN fs/open.c~track-files_struct-checkpointability fs/open.c
> --- linux-2.6.git/fs/open.c~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
> +++ linux-2.6.git-dave/fs/open.c	2009-02-27 12:07:41.000000000 -0800
> @@ -29,6 +29,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/audit.h>
>  #include <linux/falloc.h>
> +#include <linux/checkpoint.h>
> 
>  int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
>  {
> @@ -1015,6 +1016,10 @@ void fd_install(unsigned int fd, struct 
>  {
>  	struct files_struct *files = current->files;
>  	struct fdtable *fdt;
> +
> +	if (!cr_file_supported(file))
> +		files_deny_checkpointing(files);
> +
>  	spin_lock(&files->file_lock);
>  	fdt = files_fdtable(files);
>  	BUG_ON(fdt->fd[fd] != NULL);
> diff -puN include/linux/checkpoint.h~track-files_struct-checkpointability include/linux/checkpoint.h
> --- linux-2.6.git/include/linux/checkpoint.h~track-files_struct-checkpointability	2009-02-27 12:07:41.000000000 -0800
> +++ linux-2.6.git-dave/include/linux/checkpoint.h	2009-02-27 12:07:41.000000000 -0800
> @@ -12,6 +12,7 @@
> 
>  #include <linux/path.h>
>  #include <linux/fs.h>
> +#include <linux/fdtable.h>
> 
>  #ifdef CONFIG_CHECKPOINT_RESTART
> 
> @@ -102,6 +103,18 @@ extern int cr_read_files(struct cr_ctx *
> 
>  #define pr_fmt(fmt) "[%d:c/r:%s] " fmt, task_pid_vnr(current), __func__
> 
> +static inline void __files_deny_checkpointing(struct files_struct *files,
> +		char *file, int line)
> +{
> +	if (!test_and_clear_bit(0, &files->may_checkpoint))
> +		return;
> +	printk(KERN_INFO "process performed an action that can not be "
> +			"checkpointed at: %s:%d\n", file, line);
> +	WARN_ON(1);

This WARN_ON(1) is one step past the line into obnoxiousness on my
console :)

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-02-27 20:34 ` [RFC][PATCH 8/8] check files for checkpointability Dave Hansen
  2009-02-28  2:57   ` Sukadev Bhattiprolu
  2009-03-01 19:43   ` Serge E. Hallyn
@ 2009-03-02 13:37   ` Serge E. Hallyn
  2009-03-02 15:56     ` Dave Hansen
  2009-03-02 15:59     ` Nathan Lynch
  2 siblings, 2 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-02 13:37 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Ingo Molnar, containers, linux-kernel, Oren Laadan, Alexey Dobriyan, hch

Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> 
> Introduce a files_struct counter to indicate whether a particular
> file_struct has ever contained a file which can not be
> checkpointed.  This flag is a one-way trip; once it is set, it may
> not be unset.
> 
> We assume at allocation that a new files_struct is clean and may
> be checkpointed.  However, as soon as it has had its files filled
> from its parent's, we check it for real in __scan_files_for_cr().
> At that point, we mark it if it contained any uncheckpointable
> files.
> 
> We also check each 'struct file' when it is installed in a fd
> slot.  This way, if anyone open()s or managed to dup() an
> unsuppored file, we can catch it.
> 
> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>

So on a practical note, Ingo's scheme appears to be paying off.  In
order for any program's files_struct to be checkpointable right now,
it must be statically compiled, else ld.so (I assume) looks up
/proc/$$/status.  So since proc is not checkpointable, the result
is irreversibly non-checkpointable.

So...  does it make sense to mark proc as checkpointable?  Do we
reasonably assume that the same procfile will be available at
restart?

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 13:37   ` Serge E. Hallyn
@ 2009-03-02 15:56     ` Dave Hansen
  2009-03-02 15:59     ` Nathan Lynch
  1 sibling, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-03-02 15:56 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers, linux-kernel, hch, Ingo Molnar, Alexey Dobriyan

On Mon, 2009-03-02 at 07:37 -0600, Serge E. Hallyn wrote:
> Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> > 
> > Introduce a files_struct counter to indicate whether a particular
> > file_struct has ever contained a file which can not be
> > checkpointed.  This flag is a one-way trip; once it is set, it may
> > not be unset.
> > 
> > We assume at allocation that a new files_struct is clean and may
> > be checkpointed.  However, as soon as it has had its files filled
> > from its parent's, we check it for real in __scan_files_for_cr().
> > At that point, we mark it if it contained any uncheckpointable
> > files.
> > 
> > We also check each 'struct file' when it is installed in a fd
> > slot.  This way, if anyone open()s or managed to dup() an
> > unsuppored file, we can catch it.
> > 
> > Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> 
> So on a practical note, Ingo's scheme appears to be paying off.  In
> order for any program's files_struct to be checkpointable right now,
> it must be statically compiled, else ld.so (I assume) looks up
> /proc/$$/status.  So since proc is not checkpointable, the result
> is irreversibly non-checkpointable.
> 
> So...  does it make sense to mark proc as checkpointable?  Do we
> reasonably assume that the same procfile will be available at
> restart?

Can I kick and scream for a minute?  :)

dave@nimitz:~/lse/linux/2.5/linux-2.6.git$ grep -r 'struct file_operations.*{' fs/ | grep /proc/ | wc -l
51

I'll need to go actually look at (and mark) each of those.  But, the
upside is that I'll have to go look at each of those.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 13:37   ` Serge E. Hallyn
  2009-03-02 15:56     ` Dave Hansen
@ 2009-03-02 15:59     ` Nathan Lynch
  2009-03-02 16:27       ` Dave Hansen
  2009-03-02 16:28       ` Serge E. Hallyn
  1 sibling, 2 replies; 41+ messages in thread
From: Nathan Lynch @ 2009-03-02 15:59 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Dave Hansen, containers, linux-kernel, hch, Ingo Molnar, Alexey Dobriyan

On Mon, 2 Mar 2009 07:37:54 -0600
"Serge E. Hallyn" <serue@us.ibm.com> wrote:

> Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> > 
> > Introduce a files_struct counter to indicate whether a particular
> > file_struct has ever contained a file which can not be
> > checkpointed.  This flag is a one-way trip; once it is set, it may
> > not be unset.
> > 
> > We assume at allocation that a new files_struct is clean and may
> > be checkpointed.  However, as soon as it has had its files filled
> > from its parent's, we check it for real in __scan_files_for_cr().
> > At that point, we mark it if it contained any uncheckpointable
> > files.
> > 
> > We also check each 'struct file' when it is installed in a fd
> > slot.  This way, if anyone open()s or managed to dup() an
> > unsuppored file, we can catch it.
> > 
> > Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> 
> So on a practical note, Ingo's scheme appears to be paying off.  In
> order for any program's files_struct to be checkpointable right now,
> it must be statically compiled, else ld.so (I assume) looks up
> /proc/$$/status.  So since proc is not checkpointable, the result
> is irreversibly non-checkpointable.
> 
> So...  does it make sense to mark proc as checkpointable?  Do we
> reasonably assume that the same procfile will be available at
> restart?

With respect to /proc/$x/* where $x is the pid the restarted task wants,
is that not a chicken-and-egg problem?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 15:59     ` Nathan Lynch
@ 2009-03-02 16:27       ` Dave Hansen
  2009-03-02 17:22         ` Nathan Lynch
  2009-03-02 16:28       ` Serge E. Hallyn
  1 sibling, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-03-02 16:27 UTC (permalink / raw)
  To: Nathan Lynch
  Cc: Serge E. Hallyn, containers, linux-kernel, hch, Ingo Molnar,
	Alexey Dobriyan

On Mon, 2009-03-02 at 09:59 -0600, Nathan Lynch wrote:
> On Mon, 2 Mar 2009 07:37:54 -0600
> "Serge E. Hallyn" <serue@us.ibm.com> wrote:
> > So on a practical note, Ingo's scheme appears to be paying off.  In
> > order for any program's files_struct to be checkpointable right now,
> > it must be statically compiled, else ld.so (I assume) looks up
> > /proc/$$/status.  So since proc is not checkpointable, the result
> > is irreversibly non-checkpointable.
> > 
> > So...  does it make sense to mark proc as checkpointable?  Do we
> > reasonably assume that the same procfile will be available at
> > restart?
> 
> With respect to /proc/$x/* where $x is the pid the restarted task wants,
> is that not a chicken-and-egg problem?

Do you mean that we have to go look into /proc to figure out which task
we want before we can checkpoint it?  That makes the process *doing* the
checkpoint uncheckpointable, but no the process being examined.

Anyway, I'll fix /proc.  It is pretty important.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 15:59     ` Nathan Lynch
  2009-03-02 16:27       ` Dave Hansen
@ 2009-03-02 16:28       ` Serge E. Hallyn
  1 sibling, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-02 16:28 UTC (permalink / raw)
  To: Nathan Lynch
  Cc: Dave Hansen, containers, linux-kernel, hch, Ingo Molnar, Alexey Dobriyan

Quoting Nathan Lynch (ntl@pobox.com):
> On Mon, 2 Mar 2009 07:37:54 -0600
> "Serge E. Hallyn" <serue@us.ibm.com> wrote:
> 
> > Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> > > 
> > > Introduce a files_struct counter to indicate whether a particular
> > > file_struct has ever contained a file which can not be
> > > checkpointed.  This flag is a one-way trip; once it is set, it may
> > > not be unset.
> > > 
> > > We assume at allocation that a new files_struct is clean and may
> > > be checkpointed.  However, as soon as it has had its files filled
> > > from its parent's, we check it for real in __scan_files_for_cr().
> > > At that point, we mark it if it contained any uncheckpointable
> > > files.
> > > 
> > > We also check each 'struct file' when it is installed in a fd
> > > slot.  This way, if anyone open()s or managed to dup() an
> > > unsuppored file, we can catch it.
> > > 
> > > Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>
> > 
> > So on a practical note, Ingo's scheme appears to be paying off.  In
> > order for any program's files_struct to be checkpointable right now,
> > it must be statically compiled, else ld.so (I assume) looks up
> > /proc/$$/status.  So since proc is not checkpointable, the result
> > is irreversibly non-checkpointable.
> > 
> > So...  does it make sense to mark proc as checkpointable?  Do we
> > reasonably assume that the same procfile will be available at
> > restart?
> 
> With respect to /proc/$x/* where $x is the pid the restarted task wants,
> is that not a chicken-and-egg problem?

I don't think so... the task will get the pid back (eventually :).  So
sure it won't really be supported yet but we can ignore that for now
imo.

The question is, do we worry about the fact that the procfile contents
might be different at restart (different kernel, etc).

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-02-28 20:53   ` Christoph Hellwig
  2009-02-28 21:37     ` Dave Hansen
@ 2009-03-02 17:05     ` Dave Hansen
  2009-03-03 13:15       ` Christoph Hellwig
  1 sibling, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-03-02 17:05 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: containers, linux-kernel, Ingo Molnar, Alexey Dobriyan

On Sat, 2009-02-28 at 15:53 -0500, Christoph Hellwig wrote:
> Also the double-use of the op seem not very nice to me.  Is there any
> real life use case were you would have the operation on a file but
> sometimes not allow checkpoiting?

I'm still reaching here...

I was thinking of /proc.  Opening your own /proc/$$/* would certainly be
considered OK.  But, doing it for some other process not in your pid
namespace would not be OK and would not be checkpointable.

I know we're not quite in real-life territory here, yet, but I'm still
thinking.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 16:27       ` Dave Hansen
@ 2009-03-02 17:22         ` Nathan Lynch
  2009-03-02 17:30           ` Dave Hansen
  0 siblings, 1 reply; 41+ messages in thread
From: Nathan Lynch @ 2009-03-02 17:22 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Serge E. Hallyn, containers, linux-kernel, hch, Ingo Molnar,
	Alexey Dobriyan

On Mon, 02 Mar 2009 08:27:31 -0800
Dave Hansen <dave@linux.vnet.ibm.com> wrote:

> On Mon, 2009-03-02 at 09:59 -0600, Nathan Lynch wrote:
> > On Mon, 2 Mar 2009 07:37:54 -0600
> > "Serge E. Hallyn" <serue@us.ibm.com> wrote:
> > > So on a practical note, Ingo's scheme appears to be paying off.
> > > In order for any program's files_struct to be checkpointable
> > > right now, it must be statically compiled, else ld.so (I assume)
> > > looks up /proc/$$/status.  So since proc is not checkpointable,
> > > the result is irreversibly non-checkpointable.
> > > 
> > > So...  does it make sense to mark proc as checkpointable?  Do we
> > > reasonably assume that the same procfile will be available at
> > > restart?
> > 
> > With respect to /proc/$x/* where $x is the pid the restarted task
> > wants, is that not a chicken-and-egg problem?
> 
> Do you mean that we have to go look into /proc to figure out which
> task we want before we can checkpoint it?  That makes the process
> *doing* the checkpoint uncheckpointable, but no the process being
> examined.

No.. I mean what if a process 1234 does

f = fopen("/proc/1234/stat", "r");

and is then checkpointed.  Can that path be resolved during restart,
before pid 1234 is alive?

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 17:22         ` Nathan Lynch
@ 2009-03-02 17:30           ` Dave Hansen
  2009-03-02 17:44             ` Serge E. Hallyn
  0 siblings, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-03-02 17:30 UTC (permalink / raw)
  To: Nathan Lynch
  Cc: Serge E. Hallyn, containers, linux-kernel, hch, Ingo Molnar,
	Alexey Dobriyan

On Mon, 2009-03-02 at 11:22 -0600, Nathan Lynch wrote:
> No.. I mean what if a process 1234 does
> 
> f = fopen("/proc/1234/stat", "r");
> 
> and is then checkpointed.  Can that path be resolved during restart,
> before pid 1234 is alive?

Heh, that's a good one.

It does mean that we can't do restore like this:

	for_each_cr_task()
		restore_task_struct()
		restore_files()
		...

We have to do:

	for_each_cr_task()
		restore_task_struct()
	for_each_cr_task()
		restore_files()


-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 17:30           ` Dave Hansen
@ 2009-03-02 17:44             ` Serge E. Hallyn
  2009-03-02 17:58               ` Dave Hansen
  2009-03-02 18:13               ` Dave Hansen
  0 siblings, 2 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-02 17:44 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Nathan Lynch, containers, linux-kernel, hch, Ingo Molnar,
	Alexey Dobriyan

Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> On Mon, 2009-03-02 at 11:22 -0600, Nathan Lynch wrote:
> > No.. I mean what if a process 1234 does
> > 
> > f = fopen("/proc/1234/stat", "r");
> > 
> > and is then checkpointed.  Can that path be resolved during restart,
> > before pid 1234 is alive?
> 
> Heh, that's a good one.
> 
> It does mean that we can't do restore like this:
> 
> 	for_each_cr_task()
> 		restore_task_struct()
> 		restore_files()
> 		...
> 
> We have to do:
> 
> 	for_each_cr_task()
> 		restore_task_struct()
> 	for_each_cr_task()
> 		restore_files()
> 
> 
> -- Dave

Which is what we actually do, right?

Actually we have userspace create the tasks first, and
then each task calls sys_restart which does restore_files().

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 17:44             ` Serge E. Hallyn
@ 2009-03-02 17:58               ` Dave Hansen
  2009-03-02 18:13               ` Dave Hansen
  1 sibling, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-03-02 17:58 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Nathan Lynch, containers, linux-kernel, hch, Ingo Molnar,
	Alexey Dobriyan

On Mon, 2009-03-02 at 11:44 -0600, Serge E. Hallyn wrote:
> Which is what we actually do, right?
> 
> Actually we have userspace create the tasks first, and
> then each task calls sys_restart which does restore_files().

Quoting:

http://git.ncl.cs.columbia.edu/?p=linux-cr.git;a=commitdiff;h=3c1b1900f92ed12f5020a7b566065bffda2908d8;hp=7aec1a8f3345bb33c3f93226c895a45ec269bb59

> Restarting of multiple processes expects all restarting tasks to call
> sys_restart(). Once inside the system call, each task will restart
> itself at the same order that they were saved. The internals of the
> syscall will take care of in-kernel synchronization bewteen tasks.

I guess it is OK since everybody sleeps once they enter sys_restart()
until the container init decides it is ready to go.  

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 17:44             ` Serge E. Hallyn
  2009-03-02 17:58               ` Dave Hansen
@ 2009-03-02 18:13               ` Dave Hansen
  2009-03-02 18:35                 ` Serge E. Hallyn
  2009-03-05  8:20                 ` Cedric Le Goater
  1 sibling, 2 replies; 41+ messages in thread
From: Dave Hansen @ 2009-03-02 18:13 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: containers, linux-kernel, hch, Nathan Lynch, Ingo Molnar,
	Alexey Dobriyan

On Mon, 2009-03-02 at 11:44 -0600, Serge E. Hallyn wrote:
> Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> > On Mon, 2009-03-02 at 11:22 -0600, Nathan Lynch wrote:
> > > No.. I mean what if a process 1234 does
> > > 
> > > f = fopen("/proc/1234/stat", "r");
> > > 
> > > and is then checkpointed.  Can that path be resolved during restart,
> > > before pid 1234 is alive?
> > 
> > Heh, that's a good one.
> > 
> > It does mean that we can't do restore like this:
> > 
> > 	for_each_cr_task()
> > 		restore_task_struct()
> > 		restore_files()
> > 		...
> > 
> > We have to do:
> > 
> > 	for_each_cr_task()
> > 		restore_task_struct()
> > 	for_each_cr_task()
> > 		restore_files()
> >
> Which is what we actually do, right?

OK, I have a really evil one.

What if task 1234 does:

	open(O_RDONLY, "/proc/5678/fdinfo/44");

and task 5678 does:

	open(O_RDONLY, "/proc/5678/fdinfo/55");

There is no right order.

The only right way I can think to do it is that we have to loop on the
restore and defer files that we can't seem to find right now, hoping
that they'll show up as the restore progresses.

Basically:

	for_each_cr_task()
		deferred_files = restore_files()
retry:
	making_progress = 0
	for_each(deferred_file)
		restore(deferred_file)
	if (making_progress)
		goto retry;

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 18:13               ` Dave Hansen
@ 2009-03-02 18:35                 ` Serge E. Hallyn
  2009-03-05  8:20                 ` Cedric Le Goater
  1 sibling, 0 replies; 41+ messages in thread
From: Serge E. Hallyn @ 2009-03-02 18:35 UTC (permalink / raw)
  To: Dave Hansen
  Cc: containers, linux-kernel, hch, Nathan Lynch, Ingo Molnar,
	Alexey Dobriyan

Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> On Mon, 2009-03-02 at 11:44 -0600, Serge E. Hallyn wrote:
> > Quoting Dave Hansen (dave@linux.vnet.ibm.com):
> > > On Mon, 2009-03-02 at 11:22 -0600, Nathan Lynch wrote:
> > > > No.. I mean what if a process 1234 does
> > > > 
> > > > f = fopen("/proc/1234/stat", "r");
> > > > 
> > > > and is then checkpointed.  Can that path be resolved during restart,
> > > > before pid 1234 is alive?
> > > 
> > > Heh, that's a good one.
> > > 
> > > It does mean that we can't do restore like this:
> > > 
> > > 	for_each_cr_task()
> > > 		restore_task_struct()
> > > 		restore_files()
> > > 		...
> > > 
> > > We have to do:
> > > 
> > > 	for_each_cr_task()
> > > 		restore_task_struct()
> > > 	for_each_cr_task()
> > > 		restore_files()
> > >
> > Which is what we actually do, right?
> 
> OK, I have a really evil one.
> 
> What if task 1234 does:
> 
> 	open(O_RDONLY, "/proc/5678/fdinfo/44");
> 
> and task 5678 does:
> 
> 	open(O_RDONLY, "/proc/5678/fdinfo/55");

Nice one.  Let's make fdinfo files uncheckpointable for now :)

-serge

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-03-02 17:05     ` Dave Hansen
@ 2009-03-03 13:15       ` Christoph Hellwig
  2009-03-20 21:13         ` Dave Hansen
  0 siblings, 1 reply; 41+ messages in thread
From: Christoph Hellwig @ 2009-03-03 13:15 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Christoph Hellwig, containers, linux-kernel, Ingo Molnar,
	Alexey Dobriyan

On Mon, Mar 02, 2009 at 09:05:56AM -0800, Dave Hansen wrote:
> On Sat, 2009-02-28 at 15:53 -0500, Christoph Hellwig wrote:
> > Also the double-use of the op seem not very nice to me.  Is there any
> > real life use case were you would have the operation on a file but
> > sometimes not allow checkpoiting?
> 
> I'm still reaching here...
> 
> I was thinking of /proc.  Opening your own /proc/$$/* would certainly be
> considered OK.  But, doing it for some other process not in your pid
> namespace would not be OK and would not be checkpointable.
> 
> I know we're not quite in real-life territory here, yet, but I'm still
> thinking.

That mighr be a good enough excuse, I was just wondering what the use
case was.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-02-28  2:57   ` Sukadev Bhattiprolu
  2009-03-01 17:00     ` Serge E. Hallyn
@ 2009-03-04 23:41     ` Dave Hansen
  1 sibling, 0 replies; 41+ messages in thread
From: Dave Hansen @ 2009-03-04 23:41 UTC (permalink / raw)
  To: Sukadev Bhattiprolu
  Cc: Ingo Molnar, containers, linux-kernel, Serge E. Hallyn,
	Oren Laadan, Alexey Dobriyan, hch

On Fri, 2009-02-27 at 18:57 -0800, Sukadev Bhattiprolu wrote: 
> Dave Hansen [dave@linux.vnet.ibm.com] wrote:
> | 
> | Introduce a files_struct counter to indicate whether a particular
> | file_struct has ever contained a file which can not be
> | checkpointed.  This flag is a one-way trip; once it is set, it may
> | not be unset.
> | 
> | We assume at allocation that a new files_struct is clean and may
> | be checkpointed.  However, as soon as it has had its files filled
> | from its parent's, we check it for real in __scan_files_for_cr().
> | At that point, we mark it if it contained any uncheckpointable
> | files.
> 
> Hmm. Why not just copy ->may_checkpoint setting from parent (or old)
> files_struct ? If parent is not checkpointable, then child won't be
> and vice-versa - no ?

Because init does things that are uncheckpointable.  If we purely
inherit, we'll never be able to checkpoint.

> | +static void __scan_files_for_cr(struct files_struct *files)
> | +{
> | +	int i;
> | +
> | +	for (i = 0; i < files->fdtab.max_fds; i++) {
> | +		struct file *f = fcheck_files(files, i);
> | +		if (!f)
> | +			continue;
> | +		if (cr_file_supported(f))
> | +			continue;
> | +		files_deny_checkpointing(files);
> | +	}
> | +}
> | +
> 
> A version of __scan_files_for_cr() for CONFIG_CHECKPOINT_RESTART=n or...
> 
> |  /*
> |   * Allocate a new files structure and copy contents from the
> |   * passed in files structure.
> | @@ -303,6 +318,9 @@ struct files_struct *dup_fd(struct files
> |  		goto out;
> | 
> |  	atomic_set(&newf->count, 1);
> | +#ifdef CONFIG_CHECKPOINT_RESTART
> | +	newf->may_checkpoint = 1;
> | +#endif
> | 
> |  	spin_lock_init(&newf->file_lock);
> |  	newf->next_fd = 0;
> | @@ -396,6 +414,7 @@ struct files_struct *dup_fd(struct files
> | 
> |  	rcu_assign_pointer(newf->fdt, new_fdt);
> | 
> | +	__scan_files_for_cr(newf);
> 
> ... #ifdef CONFIG_CHECKPOINT_RESTART here ?

gcc isn't quite smart enough to figure out that this is a noop.  Please
see my new set for a new compiler helper to solve this problem.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 8/8] check files for checkpointability
  2009-03-02 18:13               ` Dave Hansen
  2009-03-02 18:35                 ` Serge E. Hallyn
@ 2009-03-05  8:20                 ` Cedric Le Goater
  1 sibling, 0 replies; 41+ messages in thread
From: Cedric Le Goater @ 2009-03-05  8:20 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Serge E. Hallyn, containers, linux-kernel, hch, Nathan Lynch,
	Ingo Molnar, Alexey Dobriyan

Dave Hansen wrote:
> On Mon, 2009-03-02 at 11:44 -0600, Serge E. Hallyn wrote:
>> Quoting Dave Hansen (dave@linux.vnet.ibm.com):
>>> On Mon, 2009-03-02 at 11:22 -0600, Nathan Lynch wrote:
>>>> No.. I mean what if a process 1234 does
>>>>
>>>> f = fopen("/proc/1234/stat", "r");
>>>>
>>>> and is then checkpointed.  Can that path be resolved during restart,
>>>> before pid 1234 is alive?
>>> Heh, that's a good one.
>>>
>>> It does mean that we can't do restore like this:
>>>
>>> 	for_each_cr_task()
>>> 		restore_task_struct()
>>> 		restore_files()
>>> 		...
>>>
>>> We have to do:
>>>
>>> 	for_each_cr_task()
>>> 		restore_task_struct()
>>> 	for_each_cr_task()
>>> 		restore_files()
>>>
>> Which is what we actually do, right?
> 
> OK, I have a really evil one.
> 
> What if task 1234 does:
> 
> 	open(O_RDONLY, "/proc/5678/fdinfo/44");
> 
> and task 5678 does:
> 
> 	open(O_RDONLY, "/proc/5678/fdinfo/55");
> 
> There is no right order.
> 
> The only right way I can think to do it is that we have to loop on the
> restore and defer files that we can't seem to find right now, hoping
> that they'll show up as the restore progresses.

or the restore algorithm should support recursion. for example, epoll, 
attached 'struct files' to af_unix socket, pipes (2 ends), fifos (idem),
connected socket (you need the listening end), etc. 

C.

> Basically:
> 
> 	for_each_cr_task()
> 		deferred_files = restore_files()
> retry:
> 	making_progress = 0
> 	for_each(deferred_file)
> 		restore(deferred_file)
> 	if (making_progress)
> 		goto retry;




 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-03-03 13:15       ` Christoph Hellwig
@ 2009-03-20 21:13         ` Dave Hansen
  2009-03-20 21:30           ` Oren Laadan
  0 siblings, 1 reply; 41+ messages in thread
From: Dave Hansen @ 2009-03-20 21:13 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: containers, Ingo Molnar, linux-kernel, Alexey Dobriyan

On Tue, 2009-03-03 at 08:15 -0500, Christoph Hellwig wrote:
> On Mon, Mar 02, 2009 at 09:05:56AM -0800, Dave Hansen wrote:
> > On Sat, 2009-02-28 at 15:53 -0500, Christoph Hellwig wrote:
> > > Also the double-use of the op seem not very nice to me.  Is there any
> > > real life use case were you would have the operation on a file but
> > > sometimes not allow checkpoiting?
> > 
> > I'm still reaching here...
> > 
> > I was thinking of /proc.  Opening your own /proc/$$/* would certainly be
> > considered OK.  But, doing it for some other process not in your pid
> > namespace would not be OK and would not be checkpointable.
> > 
> > I know we're not quite in real-life territory here, yet, but I'm still
> > thinking.
> 
> That mighr be a good enough excuse, I was just wondering what the use
> case was.

I just thought of another one: unlinked files and directories.  They're
a pain to checkpoint and won't be supported for a while.  Holding open
an unlinked file would make a process uncheckpointable for a bit.

-- Dave


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [RFC][PATCH 5/8] add f_op for checkpointability
  2009-03-20 21:13         ` Dave Hansen
@ 2009-03-20 21:30           ` Oren Laadan
  0 siblings, 0 replies; 41+ messages in thread
From: Oren Laadan @ 2009-03-20 21:30 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Christoph Hellwig, Alexey Dobriyan, containers, Ingo Molnar,
	linux-kernel



Dave Hansen wrote:
> On Tue, 2009-03-03 at 08:15 -0500, Christoph Hellwig wrote:
>> On Mon, Mar 02, 2009 at 09:05:56AM -0800, Dave Hansen wrote:
>>> On Sat, 2009-02-28 at 15:53 -0500, Christoph Hellwig wrote:
>>>> Also the double-use of the op seem not very nice to me.  Is there any
>>>> real life use case were you would have the operation on a file but
>>>> sometimes not allow checkpoiting?
>>> I'm still reaching here...
>>>
>>> I was thinking of /proc.  Opening your own /proc/$$/* would certainly be
>>> considered OK.  But, doing it for some other process not in your pid
>>> namespace would not be OK and would not be checkpointable.
>>>
>>> I know we're not quite in real-life territory here, yet, but I'm still
>>> thinking.
>> That mighr be a good enough excuse, I was just wondering what the use
>> case was.
> 
> I just thought of another one: unlinked files and directories.  They're
> a pain to checkpoint and won't be supported for a while.  Holding open
> an unlinked file would make a process uncheckpointable for a bit.

Actually, unlinked directories are the easiest to restore: you recreate it,
open it, and then immediately delete them. This works, because an unlinked
directory does not hold any data except for its existence.

For unlinked files, in physical file systems (non pseudo), for many apps
they are small in size, so a first approximation would be to save the
contents of that file with the checkpoint image, and on restart recreate
the file, put the contents, open it, and then unlink it again.

Oren.

> 
> -- Dave
> 
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2009-03-20 21:32 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-02-27 20:34 [RFC][PATCH 1/8] kill '_data' in cr_hdr_fd_data name Dave Hansen
2009-02-27 20:34 ` [RFC][PATCH 2/8] breakout fdinfo sprintf() into its own function Dave Hansen
2009-02-27 20:56   ` Vegard Nossum
2009-02-27 21:23     ` Dave Hansen
2009-02-27 20:34 ` [RFC][PATCH 3/8] create fs flags to mark c/r supported fs's Dave Hansen
2009-02-27 21:16   ` Alexey Dobriyan
2009-02-27 21:20     ` Dave Hansen
2009-02-27 20:34 ` [RFC][PATCH 4/8] file c/r: expose functions to query fs support Dave Hansen
2009-02-27 21:14   ` Sukadev Bhattiprolu
2009-02-27 21:24     ` Dave Hansen
2009-02-27 21:32       ` Dave Hansen
2009-02-28  1:33   ` Sukadev Bhattiprolu
2009-02-27 20:34 ` [RFC][PATCH 5/8] add f_op for checkpointability Dave Hansen
2009-02-28  2:14   ` Sukadev Bhattiprolu
2009-02-28  2:51     ` Dave Hansen
2009-02-28 20:53   ` Christoph Hellwig
2009-02-28 21:37     ` Dave Hansen
2009-03-01 15:19       ` Serge E. Hallyn
2009-03-02 17:05     ` Dave Hansen
2009-03-03 13:15       ` Christoph Hellwig
2009-03-20 21:13         ` Dave Hansen
2009-03-20 21:30           ` Oren Laadan
2009-02-27 20:34 ` [RFC][PATCH 6/8] mark /dev/null and zero as checkpointable Dave Hansen
2009-02-27 20:34 ` [RFC][PATCH 7/8] add c/r info to fdinfo Dave Hansen
2009-02-27 20:34 ` [RFC][PATCH 8/8] check files for checkpointability Dave Hansen
2009-02-28  2:57   ` Sukadev Bhattiprolu
2009-03-01 17:00     ` Serge E. Hallyn
2009-03-04 23:41     ` Dave Hansen
2009-03-01 19:43   ` Serge E. Hallyn
2009-03-02 13:37   ` Serge E. Hallyn
2009-03-02 15:56     ` Dave Hansen
2009-03-02 15:59     ` Nathan Lynch
2009-03-02 16:27       ` Dave Hansen
2009-03-02 17:22         ` Nathan Lynch
2009-03-02 17:30           ` Dave Hansen
2009-03-02 17:44             ` Serge E. Hallyn
2009-03-02 17:58               ` Dave Hansen
2009-03-02 18:13               ` Dave Hansen
2009-03-02 18:35                 ` Serge E. Hallyn
2009-03-05  8:20                 ` Cedric Le Goater
2009-03-02 16:28       ` Serge E. Hallyn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).