All of lore.kernel.org
 help / color / mirror / Atom feed
* Add support for connected INET sockets
@ 2009-11-10 18:47 Dan Smith
  2009-11-10 18:47 ` [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v4) Dan Smith
       [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 2 replies; 18+ messages in thread
From: Dan Smith @ 2009-11-10 18:47 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This set includes the previous INET connected sockets patch with a few
trivial changes suggested by Oren, as well as the readme update.  The
first patch fixes socket buffer checkpoint to include fragments, and
includes the changes in the previous first patch that added header mark
information.  As a result, this set now also includes a fix to the UNIX
code to cope with the change.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers
       [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-10 18:47   ` Dan Smith
       [not found]     ` <1257878856-25520-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-10 18:47   ` [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file Dan Smith
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 18+ messages in thread
From: Dan Smith @ 2009-11-10 18:47 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

The INET code often creates socket buffers by attaching fragments instead
of writing to the linear region.  This extends the skb write functions
to write out the linear and fragment regions of an skb, and adds a
function to be used by others wishing to restore an skb in the same way.
This also includes the header-mark-setting bits from a previous patch.

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 include/linux/checkpoint.h     |    1 +
 include/linux/checkpoint_hdr.h |   11 ++
 net/checkpoint.c               |  253 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 242 insertions(+), 23 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index dfcb59b..3e73e68 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -100,6 +100,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct socket *socket,
 			      struct sockaddr *loc, unsigned *loc_len,
 			      struct sockaddr *rem, unsigned *rem_len);
+struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 5d9c088..ace4139 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -561,8 +561,19 @@ struct ckpt_hdr_socket_queue {
 
 struct ckpt_hdr_socket_buffer {
 	struct ckpt_hdr h;
+	__u64 transport_header;
+	__u64 network_header;
+	__u64 mac_header;
+	__u64 lin_len; /* Length of linear data */
+	__u64 frg_len; /* Length of fragment data */
+	__u64 skb_len; /* Length of skb (adjusted) */
+	__u64 hdr_len; /* Length of skipped header */
+	__u64 mac_len;
 	__s32 sk_objref;
 	__s32 pr_objref;
+	__u16 protocol;
+	__u16 nr_frags;
+	__u8 cb[48];
 };
 
 #define CKPT_UNIX_LINKED 1
diff --git a/net/checkpoint.c b/net/checkpoint.c
index dd23efd..00365b2 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -17,9 +17,11 @@
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/fs_struct.h>
+#include <linux/highmem.h>
 
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
+#include <net/tcp.h>
 
 #include <linux/deferqueue.h>
 #include <linux/checkpoint.h>
@@ -88,6 +90,233 @@ static int sock_copy_buffers(struct sk_buff_head *from,
 	return -EAGAIN;
 }
 
+static void sock_record_header_info(struct sk_buff *skb,
+				    struct ckpt_hdr_socket_buffer *h)
+{
+
+	h->mac_len = skb->mac_len;
+	h->skb_len = skb->len;
+	h->hdr_len = skb->data - skb->head;
+	h->lin_len = (__u64)(skb->tail - skb->head);
+	h->frg_len = skb->data_len;
+
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	h->transport_header = skb->transport_hdr;
+	h->network_header = skb->network_header;
+	h->mac_header = skb->mac_header;
+#else
+	h->transport_header = skb->transport_header - skb->head;
+	h->network_header = skb->network_header - skb->head;
+	h->mac_header = skb->mac_header - skb->head;
+#endif
+
+	memcpy(h->cb, skb->cb, sizeof(skb->cb));
+	h->nr_frags = skb_shinfo(skb)->nr_frags;
+}
+
+int sock_restore_header_info(struct sk_buff *skb,
+			     struct ckpt_hdr_socket_buffer *h)
+{
+	if (h->mac_header + h->mac_len != h->network_header) {
+		ckpt_debug("skb mac_header %llu+%llu != network header %llu\n",
+			   h->mac_header, h->mac_len, h->network_header);
+		return -EINVAL;
+	}
+
+	if (h->network_header > h->lin_len) {
+		ckpt_debug("skb network header %llu > linear length %llu\n",
+			   h->network_header, h->lin_len);
+		return -EINVAL;
+	}
+
+	if (h->transport_header > h->lin_len) {
+		ckpt_debug("skb transport header %llu > linear length %llu\n",
+			   h->transport_header, h->lin_len);
+		return -EINVAL;
+	}
+
+	if (h->skb_len > SKB_MAX_ALLOC) {
+		ckpt_debug("skb total length %llu larger than max of %lu\n",
+			   h->skb_len, SKB_MAX_ALLOC);
+		return -EINVAL;
+	}
+
+	skb_set_transport_header(skb, h->transport_header);
+	skb_set_network_header(skb, h->network_header);
+	skb_set_mac_header(skb, h->mac_header);
+	skb->mac_len = h->mac_len;
+
+	/* FIXME: This should probably be sanitized per-protocol to
+	 * make sure nothing bad happens if it is hijacked.  For the
+	 * current set of protocols that we restore this way, the data
+	 * contained within is not very risky (flags and sequence
+	 * numbers) but could still be evalutated from a
+	 * could-the-user- have-set-these-flags point of view.
+	 */
+	memcpy(skb->cb, h->cb, sizeof(skb->cb));
+
+	skb->data = skb->head + skb->hdr_len;
+	skb->len = h->skb_len;
+
+	return 0;
+}
+
+static int sock_restore_skb_frag(struct ckpt_ctx *ctx,
+				 struct sk_buff *skb,
+				 int frag_idx)
+{
+	int ret = 0;
+	int fraglen;
+	struct page *page;
+	void *buf;
+
+	fraglen = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+	if (fraglen < 0)
+		return fraglen;
+
+	if (fraglen > PAGE_SIZE) {
+		ckpt_debug("skb frag size %i > PAGE_SIZE\n", fraglen);
+		return -EINVAL;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	buf = kmap(page);
+	ret = ckpt_kread(ctx, buf, fraglen);
+	kunmap(page);
+
+	if (ret) {
+		ckpt_debug("failed to read fragment: %i\n", ret);
+		ret = -EINVAL;
+		__free_page(page);
+	} else {
+		ckpt_debug("read %i for fragment %i\n", fraglen, frag_idx);
+		skb_add_rx_frag(skb, frag_idx, page, 0, fraglen);
+	}
+
+	return ret < 0 ? ret : fraglen;
+}
+
+struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	struct sk_buff *skb = NULL;
+	int i;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return (struct sk_buff *)h;
+
+	if (h->lin_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket linear buffer too big (%llu > %lu)\n",
+			   h->lin_len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	} else if (h->frg_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket frag size too big (%llu > %lu\n",
+			   h->frg_len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	skb = alloc_skb(h->lin_len, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = _ckpt_read_obj_type(ctx, skb_put(skb, h->lin_len),
+				  h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("read linear skb length %llu: %i\n", h->lin_len, ret);
+	if (ret < 0) {
+		goto out;
+	}
+
+	for (i = 0; i < h->nr_frags; i++) {
+		ret = sock_restore_skb_frag(ctx, skb, i);
+		ckpt_debug("read skb frag %i/%i: %i\n",
+			   i + 1, h->nr_frags, ret);
+		if (ret < 0)
+			goto out;
+		h->frg_len -= ret;
+	}
+
+	if (h->frg_len != 0) {
+		ckpt_debug("length %llu remaining after reading frags\n",
+			   h->frg_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sock_restore_header_info(skb, h);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0) {
+		kfree_skb(skb);
+		skb = ERR_PTR(ret);
+	}
+
+	return skb;
+}
+
+static int __sock_write_skb(struct ckpt_ctx *ctx,
+			    struct sk_buff *skb,
+			    int dst_objref)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int ret = 0;
+	int i;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (!h)
+		return -ENOMEM;
+
+	if (dst_objref > 0) {
+		BUG_ON(!skb->sk);
+		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
+		if (ret < 0)
+			goto out;
+		h->sk_objref = ret;
+		h->pr_objref = dst_objref;
+	}
+
+	sock_record_header_info(skb, h);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	ret = ckpt_write_obj_type(ctx, skb->head, h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("writing skb linear region %llu: %i\n", h->lin_len, ret);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		u8 *vaddr = kmap(frag->page);
+
+		ckpt_debug("writing buffer fragment %i/%i (%i)\n",
+			   i + 1, h->nr_frags, frag->size);
+		ret = ckpt_write_obj_type(ctx, vaddr + frag->page_offset,
+					  frag->size, CKPT_HDR_BUFFER);
+		kunmap(frag->page);
+		h->frg_len -= frag->size;
+		if (ret < 0)
+			goto out;
+	}
+
+	WARN_ON(h->frg_len != 0);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
 				int dst_objref)
@@ -95,13 +324,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 	struct sk_buff *skb;
 
 	skb_queue_walk(queue, skb) {
-		struct ckpt_hdr_socket_buffer *h;
 		int ret = 0;
 
-		/* FIXME: This could be a false positive for non-unix
-		 *        buffers, so add a type check here in the
-		 *        future
-		 */
 		if (UNIXCB(skb).fp) {
 			ckpt_write_err(ctx, "TE", "af_unix: pass fd", -EBUSY);
 			return -EBUSY;
@@ -113,25 +337,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		 * because we don't save out (or restore) the control
 		 * information contained in the skb.
 		 */
-		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
-		if (!h)
-			return -ENOMEM;
-
-		BUG_ON(!skb->sk);
-		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
-		if (ret < 0)
-			goto end;
-		h->sk_objref = ret;
-		h->pr_objref = dst_objref;
-
-		ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
-		if (ret < 0)
-			goto end;
 
-		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
-					  CKPT_HDR_BUFFER);
-	end:
-		ckpt_hdr_put(ctx, h);
+		ret = __sock_write_skb(ctx, skb, dst_objref);
 		if (ret < 0)
 			return ret;
 	}
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v4)
  2009-11-10 18:47 Add support for connected INET sockets Dan Smith
@ 2009-11-10 18:47 ` Dan Smith
  2009-11-11 20:32   ` Serge E. Hallyn
       [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 18+ messages in thread
From: Dan Smith @ 2009-11-10 18:47 UTC (permalink / raw)
  To: containers; +Cc: netdev

This patch adds basic support for C/R of open INET sockets.  I think that
all the important bits of the TCP and ICSK socket structures is saved,
but I think there is still some additional IPv6 stuff that needs to be
handled.

With this patch applied, the following script can be used to demonstrate
the functionality:

  https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html

It shows that this enables migration of a sendmail process with open
connections from one machine to another without dropping.

We probably need comments from the netdev people about the quality of
sanity checking we do on the values in the ckpt_hdr_socket_inet
structure on restart.

Note that this still doesn't address lingering sockets yet.

Changes in v4:
 - Use the new socket buffer restore functions introduced in the
   previous patch
 - Move listen_sockets list under the restart items in ckpt_ctx
 - Rename RESTART_SOCK_LISTENONLY to RESTART_CONN_RESET

Changes in v3:
 - Prevent restart from allowing a bind on a <1024 port unless the
   user is granted that capability
 - Add some sanity checking in the inet_precheck() function to make sure
   the values read from the checkpoint image are within acceptable ranges
 - Check the result of sock_restore_header_info() and fail if needed

Changes in v2:
 - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
   structure instead of saving them separately
 - Fix 'sock' naming in sock_cptrst()
 - Don't take the queue lock before skb_queue_tail() since it is
   done for us
 - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
   flag is specified on sys_restart()
 - Pull the implementation of the list of listening sockets back into
   this patch
 - Fix dangling printk
 - Add some comments around the parent/child restore logic

Cc: netdev@vger.kernel.org
Acked-by: Oren Laadan <orenl@librato.com>
Signed-off-by: Dan Smith <danms@us.ibm.com>
---
 checkpoint/sys.c                 |    4 +
 include/linux/checkpoint.h       |    5 +-
 include/linux/checkpoint_hdr.h   |   95 +++++++++
 include/linux/checkpoint_types.h |    1 +
 net/checkpoint.c                 |   27 ++--
 net/ipv4/checkpoint.c            |  391 ++++++++++++++++++++++++++++++++++----
 6 files changed, 473 insertions(+), 50 deletions(-)

diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 260a1ee..df00973 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -221,6 +221,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
 	kfree(ctx->pids_arr);
 
+	sock_listening_list_free(&ctx->listen_sockets);
+
 	kfree(ctx);
 }
 
@@ -249,6 +251,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
 	spin_lock_init(&ctx->lock);
 #endif
 
+	INIT_LIST_HEAD(&ctx->listen_sockets);
+
 	err = -EBADF;
 	ctx->file = fget(fd);
 	if (!ctx->file)
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 3e73e68..d4765f6 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -19,6 +19,7 @@
 #define RESTART_TASKSELF	0x1
 #define RESTART_FROZEN		0x2
 #define RESTART_GHOST		0x4
+#define RESTART_CONN_RESET	0x8
 
 #ifdef __KERNEL__
 #ifdef CONFIG_CHECKPOINT
@@ -48,7 +49,8 @@
 #define RESTART_USER_FLAGS  \
 	(RESTART_TASKSELF | \
 	 RESTART_FROZEN | \
-	 RESTART_GHOST)
+	 RESTART_GHOST | \
+	 RESTART_CONN_RESET)
 
 extern int walk_task_subtree(struct task_struct *task,
 			     int (*func)(struct task_struct *, void *),
@@ -101,6 +103,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct sockaddr *loc, unsigned *loc_len,
 			      struct sockaddr *rem, unsigned *rem_len);
 struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
+void sock_listening_list_free(struct list_head *head);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index ace4139..7c81f93 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -20,6 +20,7 @@
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #else
 #include <sys/socket.h>
 #include <sys/un.h>
@@ -592,6 +593,100 @@ struct ckpt_hdr_socket_unix {
 
 struct ckpt_hdr_socket_inet {
 	struct ckpt_hdr h;
+	__u32 daddr;
+	__u32 rcv_saddr;
+	__u32 saddr;
+	__u16 dport;
+	__u16 num;
+	__u16 sport;
+	__s16 uc_ttl;
+	__u16 cmsg_flags;
+
+	struct {
+		__u64 timeout;
+		__u32 ato;
+		__u32 lrcvtime;
+		__u16 last_seg_size;
+		__u16 rcv_mss;
+		__u8 pending;
+		__u8 quick;
+		__u8 pingpong;
+		__u8 blocked;
+	} icsk_ack __attribute__ ((aligned(8)));
+
+	/* FIXME: Skipped opt, tos, multicast, cork settings */
+
+	struct {
+		__u32 rcv_nxt;
+		__u32 copied_seq;
+		__u32 rcv_wup;
+		__u32 snd_nxt;
+		__u32 snd_una;
+		__u32 snd_sml;
+		__u32 rcv_tstamp;
+		__u32 lsndtime;
+
+		__u32 snd_wl1;
+		__u32 snd_wnd;
+		__u32 max_window;
+		__u32 mss_cache;
+		__u32 window_clamp;
+		__u32 rcv_ssthresh;
+		__u32 frto_highmark;
+
+		__u32 srtt;
+		__u32 mdev;
+		__u32 mdev_max;
+		__u32 rttvar;
+		__u32 rtt_seq;
+
+		__u32 packets_out;
+		__u32 retrans_out;
+
+		__u32 snd_up;
+		__u32 rcv_wnd;
+		__u32 write_seq;
+		__u32 pushed_seq;
+		__u32 lost_out;
+		__u32 sacked_out;
+		__u32 fackets_out;
+		__u32 tso_deferred;
+		__u32 bytes_acked;
+
+		__s32 lost_cnt_hint;
+		__u32 retransmit_high;
+
+		__u32 lost_retrans_low;
+
+		__u32 prior_ssthresh;
+		__u32 high_seq;
+
+		__u32 retrans_stamp;
+		__u32 undo_marker;
+		__s32 undo_retrans;
+		__u32 total_retrans;
+
+		__u32 urg_seq;
+		__u32 keepalive_time;
+		__u32 keepalive_intvl;
+
+		__u16 urg_data;
+		__u16 advmss;
+		__u8 frto_counter;
+		__u8 nonagle;
+
+		__u8 ecn_flags;
+		__u8 reordering;
+
+		__u8 keepalive_probes;
+	} tcp __attribute__ ((aligned(8)));
+
+	struct {
+		struct in6_addr saddr;
+		struct in6_addr rcv_saddr;
+		struct in6_addr daddr;
+	} inet6 __attribute__ ((aligned(8)));
+
 	__u32 laddr_len;
 	__u32 raddr_len;
 	struct sockaddr_in laddr;
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 5cc11d9..eac0d5a 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -78,6 +78,7 @@ struct ckpt_ctx {
 	wait_queue_head_t waitq;	/* waitqueue for restarting tasks */
 	wait_queue_head_t ghostq;	/* waitqueue for ghost tasks */
 	struct cred *realcred, *ecred;	/* tmp storage for cred at restart */
+	struct list_head listen_sockets;/* listening parent sockets */
 
 	struct ckpt_stats stats;	/* statistics */
 
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 00365b2..32ccaba 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -319,6 +319,7 @@ static int __sock_write_skb(struct ckpt_ctx *ctx,
 
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
+				uint16_t family,
 				int dst_objref)
 {
 	struct sk_buff *skb;
@@ -331,11 +332,11 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 			return -EBUSY;
 		}
 
-		/* The other ancillary messages are always present
-		 * unlike descriptors.  Even though we can't detect
-		 * them and fail the checkpoint, we're not at risk
-		 * because we don't save out (or restore) the control
-		 * information contained in the skb.
+		/* The other ancillary messages UNIX are always
+		 * present unlike descriptors.  Even though we can't
+		 * detect them and fail the checkpoint, we're not at
+		 * risk because we don't restore the control
+		 * information in the UNIX code.
 		 */
 
 		ret = __sock_write_skb(ctx, skb, dst_objref);
@@ -348,6 +349,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 
 static int sock_write_buffers(struct ckpt_ctx *ctx,
 			      struct sk_buff_head *queue,
+			      uint16_t family,
 			      int dst_objref)
 {
 	struct ckpt_hdr_socket_queue *h;
@@ -367,7 +369,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
 	h->skb_count = ret;
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
 	if (!ret)
-		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
+		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
 
  out:
 	ckpt_hdr_put(ctx, h);
@@ -389,12 +391,14 @@ int sock_deferred_write_buffers(void *data)
 		return dst_objref;
 	}
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write recv buffers: %i\n", ret);
 	if (ret < 0)
 		return ret;
 
-	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
+	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
+				 dq->sk->sk_family, dst_objref);
 	ckpt_debug("write send buffers: %i\n", ret);
 
 	return ret;
@@ -919,10 +923,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
 		goto err;
 
 	if ((h->sock_common.family == AF_INET) &&
-	    (h->sock.state != TCP_LISTEN)) {
-		/* Temporary hack to enable restore of TCP_LISTEN sockets
-		 * while forcing anything else to a closed state
-		 */
+	    (h->sock.state != TCP_LISTEN) &&
+	    (ctx->uflags & RESTART_CONN_RESET)) {
+		ckpt_debug("Forcing open socket closed\n");
 		sock->sk->sk_state = TCP_CLOSE;
 		sock->state = SS_UNCONNECTED;
 	}
diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
index 9cbbf5e..ee41633 100644
--- a/net/ipv4/checkpoint.c
+++ b/net/ipv4/checkpoint.c
@@ -17,6 +17,7 @@
 #include <linux/deferqueue.h>
 #include <net/tcp_states.h>
 #include <net/tcp.h>
+#include <net/ipv6.h>
 
 struct dq_sock {
 	struct ckpt_ctx *ctx;
@@ -28,6 +29,236 @@ struct dq_buffers {
 	struct sock *sk;
 };
 
+struct listen_item {
+	struct sock *sk;
+	struct list_head list;
+};
+
+void sock_listening_list_free(struct list_head *head)
+{
+	struct listen_item *item, *tmp;
+
+	list_for_each_entry_safe(item, tmp, head, list) {
+		list_del(&item->list);
+		kfree(item);
+	}
+}
+
+static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->sk = sk;
+	list_add(&item->list, &ctx->listen_sockets);
+
+	return 0;
+}
+
+static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
+{
+	struct listen_item *item;
+
+	list_for_each_entry(item, &ctx->listen_sockets, list) {
+		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
+			return item->sk;
+	}
+
+	return NULL;
+}
+
+static int sock_hash_parent(void *data)
+{
+	struct dq_sock *dq = (struct dq_sock *)data;
+	struct sock *parent;
+
+	ckpt_debug("INET post-restart hash\n");
+
+	dq->sk->sk_prot->hash(dq->sk);
+
+	/* If there is a listening socket with the same source port,
+	 * then become a child of that socket [we are the result of an
+	 * accept()].  Otherwise hash ourselves directly in [we are
+	 * the result of a connect()]
+	 */
+
+	parent = sock_get_parent(dq->ctx, dq->sk);
+	if (parent) {
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+		local_bh_disable();
+		__inet_inherit_port(parent, dq->sk);
+		local_bh_enable();
+	} else {
+		inet_sk(dq->sk)->num = 0;
+		inet_hash_connect(&tcp_death_row, dq->sk);
+		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
+	}
+
+	return 0;
+}
+
+static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
+{
+	struct dq_sock dq;
+
+	dq.sk = sock;
+	dq.ctx = ctx;
+
+	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
+			      sock_hash_parent, NULL);
+}
+
+static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
+				struct tcp_sock *sk,
+				struct ckpt_hdr_socket_inet *hh,
+				int op)
+{
+	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
+	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
+	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
+	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
+	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
+	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
+	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
+	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
+
+	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
+	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
+	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
+	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
+	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
+	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
+	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
+	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
+	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
+	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
+
+	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
+	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
+	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
+	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
+	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
+
+	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
+	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
+
+	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
+	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
+	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
+	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
+
+	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
+
+	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
+	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
+	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
+	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
+	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
+	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
+	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
+	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
+
+	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
+	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
+
+	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
+
+	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
+	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
+
+	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
+	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
+	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
+	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
+
+	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
+	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
+	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
+
+	if (!skb_queue_empty(&sk->ucopy.prequeue))
+		printk("PREQUEUE!\n");
+
+	return 0;
+}
+
+static int sock_inet_restore_addrs(struct inet_sock *inet,
+				   struct ckpt_hdr_socket_inet *hh)
+{
+	inet->daddr = hh->raddr.sin_addr.s_addr;
+	inet->saddr = hh->laddr.sin_addr.s_addr;
+	inet->rcv_saddr = inet->saddr;
+
+	inet->dport = hh->raddr.sin_port;
+	inet->sport = hh->laddr.sin_port;
+
+	return 0;
+}
+
+static int sock_inet_cptrst(struct ckpt_ctx *ctx,
+			    struct sock *sk,
+			    struct ckpt_hdr_socket_inet *hh,
+			    int op)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int ret;
+
+	if (op == CKPT_CPT) {
+		CKPT_COPY(op, hh->daddr, inet->daddr);
+		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
+		CKPT_COPY(op, hh->dport, inet->dport);
+		CKPT_COPY(op, hh->saddr, inet->saddr);
+		CKPT_COPY(op, hh->sport, inet->sport);
+	} else {
+		ret = sock_inet_restore_addrs(inet, hh);
+		if (ret)
+			return ret;
+	}
+
+	CKPT_COPY(op, hh->num, inet->num);
+	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
+	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
+
+	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
+	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
+	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
+	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
+	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
+	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
+	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
+	CKPT_COPY(op,
+		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
+	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
+
+	if (sk->sk_protocol == IPPROTO_TCP)
+		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
+	else if (sk->sk_protocol == IPPROTO_UDP)
+		ret = 0;
+	else {
+		ckpt_write_err(ctx, "T", "unknown socket protocol %d",
+			       sk->sk_protocol);
+		ret = -EINVAL;
+	}
+
+	if (sk->sk_family == AF_INET6) {
+		struct ipv6_pinfo *inet6 = inet6_sk(sk);
+		if (op == CKPT_CPT) {
+			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
+			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
+			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
+		} else {
+			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
+			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
+			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
+		}
+	}
+
+	return ret;
+}
+
 int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 {
 	struct ckpt_hdr_socket_inet *in;
@@ -43,6 +274,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
 	if (ret)
 		goto out;
 
+	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
+	if (ret < 0)
+		goto out;
+
 	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
  out:
 	ckpt_hdr_put(ctx, in);
@@ -55,51 +290,22 @@ int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
 	return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
 }
 
-static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+static int inet_read_buffer(struct ckpt_ctx *ctx,
+			    struct sk_buff_head *queue)
 {
-	struct ckpt_hdr_socket_buffer *h;
-	int len;
-	int ret;
 	struct sk_buff *skb = NULL;
 
-	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
-	if (IS_ERR(h))
-		return PTR_ERR(h);
-
-	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
-	if (len < 0) {
-		ret = len;
-		goto out;
-	} else if (len > SKB_MAX_ALLOC) {
-		ckpt_debug("Socket buffer too big (%i > %lu)",
-			   len, SKB_MAX_ALLOC);
-		ret = -ENOSPC;
-		goto out;
-	}
-
-	skb = alloc_skb(len, GFP_KERNEL);
-	if (!skb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ckpt_kread(ctx, skb_put(skb, len), len);
-	if (ret < 0)
-		goto out;
+	skb = sock_restore_skb(ctx);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
 
-	spin_lock(&queue->lock);
 	skb_queue_tail(queue, skb);
-	spin_unlock(&queue->lock);
- out:
-	ckpt_hdr_put(ctx, h);
-
-	if ((ret < 0) && skb)
-		kfree_skb(skb);
 
-	return ret;
+	return skb->len;
 }
 
-static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
+static int inet_read_buffers(struct ckpt_ctx *ctx,
+			     struct sk_buff_head *queue)
 {
 	struct ckpt_hdr_socket_queue *h;
 	int ret = 0;
@@ -162,6 +368,19 @@ static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
 
 static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
 {
+	__u8 icsk_ack_mask = ICSK_ACK_SCHED | ICSK_ACK_TIMER |
+		ICSK_ACK_PUSHED | ICSK_ACK_PUSHED2;
+	__u16 urg_mask = TCP_URG_VALID | TCP_URG_NOTYET | TCP_URG_READ;
+	__u8 nonagle_mask = TCP_NAGLE_OFF | TCP_NAGLE_CORK | TCP_NAGLE_PUSH;
+	__u8 ecn_mask = TCP_ECN_OK | TCP_ECN_QUEUE_CWR | TCP_ECN_DEMAND_CWR;
+
+	if ((htons(in->laddr.sin_port) < PROT_SOCK) &&
+	    !capable(CAP_NET_BIND_SERVICE)) {
+		ckpt_debug("unable to bind to port %hu\n",
+			   htons(in->laddr.sin_port));
+		return -EINVAL;
+	}
+
 	if (in->laddr_len > sizeof(struct sockaddr_in)) {
 		ckpt_debug("laddr_len is too big\n");
 		return -EINVAL;
@@ -172,6 +391,77 @@ static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
 		return -EINVAL;
 	}
 
+	/* Set ato to the default */
+	in->icsk_ack.ato = TCP_ATO_MIN;
+
+	/* No quick acks are scheduled after a restart */
+	in->icsk_ack.quick = 0;
+
+	if (in->icsk_ack.pending & ~icsk_ack_mask) {
+		ckpt_debug("invalid pending flags 0x%x\n",
+			   in->icsk_ack.pending & ~icsk_ack_mask);
+		return -EINVAL;
+	}
+
+	if (in->icsk_ack.pingpong > 1) {
+		ckpt_debug("invalid icsk_ack.pingpong value\n");
+		return -EINVAL;
+	}
+
+	if (in->icsk_ack.blocked > 1) {
+		ckpt_debug("invalid icsk_ack.blocked value\n");
+		return -EINVAL;
+	}
+
+	/* do_tcp_setsockopt() quietly makes this coercion */
+	if (in->tcp.window_clamp < (SOCK_MIN_RCVBUF / 2))
+		in->tcp.window_clamp = SOCK_MIN_RCVBUF / 2;
+	else if (in->tcp.window_clamp > 65535U) {
+		ckpt_debug("invalid window_clamp value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.rcv_ssthresh > (4U * in->tcp.advmss))
+		in->tcp.rcv_ssthresh = 4U * in->tcp.advmss;
+
+	/* These will all be recalculated on the next call to
+	 * tcp_rtt_estimator()
+	 */
+	in->tcp.srtt = in->tcp.mdev = in->tcp.mdev_max = 0;
+	in->tcp.rttvar = in->tcp.rtt_seq = 0;
+
+	/* Might want to set packets_out to zero ? */
+
+	if (in->tcp.rcv_wnd > MAX_TCP_WINDOW)
+		in->tcp.rcv_wnd = MAX_TCP_WINDOW;
+
+	if (in->tcp.keepalive_intvl > MAX_TCP_KEEPINTVL) {
+		ckpt_debug("keepalive_intvl %i out of range\n",
+			   in->tcp.keepalive_intvl);
+		return -EINVAL;
+	}
+
+	if (in->tcp.keepalive_probes > MAX_TCP_KEEPCNT) {
+		ckpt_debug("Invalid keepalive_probes value %i\n",
+			   in->tcp.keepalive_probes);
+		return -EINVAL;
+	}
+
+	if (in->tcp.urg_data & ~urg_mask) {
+		ckpt_debug("Invalid urg_data value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.nonagle & ~nonagle_mask) {
+		ckpt_debug("Invalid nonagle value\n");
+		return -EINVAL;
+	}
+
+	if (in->tcp.ecn_flags & ~ecn_mask) {
+		ckpt_debug("Invalid ecn_flags value\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -209,8 +499,35 @@ int inet_restore(struct ckpt_ctx *ctx,
 			ckpt_debug("inet listen: %i\n", ret);
 			if (ret < 0)
 				goto out;
+
+			/* We are a listening socket, so add ourselves
+			 * to the list of parent sockets.  This will
+			 * allow our children to find us later and
+			 * link up
+			 */
+
+			ret = sock_listening_list_add(ctx, sock->sk);
+			if (ret < 0)
+				goto out;
 		}
 	} else {
+		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
+		if (ret)
+			goto out;
+
+		if ((h->sock.state == TCP_ESTABLISHED) &&
+		    (h->sock.protocol == IPPROTO_TCP)) {
+			/* A connected socket that was spawned from an
+			 * accept() needs to be hashed with its parent
+			 * listening socket in order to receive
+			 * traffic on the original port.  Since we may
+			 * not have restarted the parent yet, we defer
+			 * this until later when we know we have all
+			 * the listening sockets accounted for.
+			 */
+			ret = sock_defer_hash(ctx, sock->sk);
+		}
+
 		if (!sock_flag(sock->sk, SOCK_DEAD))
 			ret = inet_defer_restore_buffers(ctx, sock->sk);
 	}
-- 
1.6.3.3


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-10 18:47   ` [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers Dan Smith
@ 2009-11-10 18:47   ` Dan Smith
       [not found]     ` <1257878856-25520-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-10 18:47   ` [PATCH 4/4] Add some content to the readme.txt for socket c/r Dan Smith
  2009-11-16 18:38   ` Add support for connected INET sockets Oren Laadan
  3 siblings, 1 reply; 18+ messages in thread
From: Dan Smith @ 2009-11-10 18:47 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 net/unix/checkpoint.c |   54 ++++++++++++++++++++++++++----------------------
 1 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
index 8b7cb22..bd1fc35 100644
--- a/net/unix/checkpoint.c
+++ b/net/unix/checkpoint.c
@@ -200,7 +200,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 	uint8_t peer_shutdown = 0;
 	void *buf = NULL;
 	int sndbuf;
-	int len;
 	int ret = 0;
 
 	memset(&msg, 0, sizeof(msg));
@@ -209,14 +208,30 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 	if (IS_ERR(h))
 		return PTR_ERR(h);
 
-	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
-	if (len < 0) {
-		ret = len;
+	if (h->lin_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket buffer too big (%llu > %lu)\n",
+			   h->lin_len, SKB_MAX_ALLOC);
+		ret = -EINVAL;
+		goto out;
+	} else if (h->nr_frags != 0) {
+		ckpt_debug("unix socket claims to have fragments\n");
+		ret = -EINVAL;
 		goto out;
-	} else if (len > SKB_MAX_ALLOC) {
-		ckpt_debug("Socket buffer too big (%i > %lu)",
-			   len, SKB_MAX_ALLOC);
-		ret = -ENOSPC;
+	}
+
+	buf = kmalloc(h->lin_len, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	kvec.iov_len = h->lin_len;
+	kvec.iov_base = buf;
+	ret = _ckpt_read_obj_type(ctx, kvec.iov_base,
+				  h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("read unix socket buffer %llu: %i\n", h->lin_len, ret);
+	if (ret < h->lin_len) {
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -245,18 +260,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 		}
 	}
 
-	kvec.iov_len = len;
-	buf = kmalloc(len, GFP_KERNEL);
-	kvec.iov_base = buf;
-	if (!buf) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ckpt_kread(ctx, kvec.iov_base, len);
-	if (ret < 0)
-		goto out;
-
 	msg.msg_name = addr;
 	msg.msg_namelen = addrlen;
 
@@ -272,15 +275,16 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
 
 	/* Make sure there's room in the send buffer */
 	sndbuf = sk->sk_sndbuf;
-	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
+	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
 	    capable(CAP_NET_ADMIN))
-		sk->sk_sndbuf += len;
+		sk->sk_sndbuf += h->lin_len;
 	else
 		sk->sk_sndbuf = sysctl_wmem_max;
 
-	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, len);
-	ckpt_debug("kernel_sendmsg(%i,%i): %i\n", h->sk_objref, len, ret);
-	if ((ret > 0) && (ret != len))
+	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, h->lin_len);
+	ckpt_debug("kernel_sendmsg(%i,%llu): %i\n",
+		   h->sk_objref, h->lin_len, ret);
+	if ((ret > 0) && (ret != h->lin_len))
 		ret = -ENOMEM;
 
 	sk->sk_sndbuf = sndbuf;
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/4] Add some content to the readme.txt for socket c/r
       [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-10 18:47   ` [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers Dan Smith
  2009-11-10 18:47   ` [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file Dan Smith
@ 2009-11-10 18:47   ` Dan Smith
  2009-11-16 18:38   ` Add support for connected INET sockets Oren Laadan
  3 siblings, 0 replies; 18+ messages in thread
From: Dan Smith @ 2009-11-10 18:47 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 Documentation/checkpoint/readme.txt |   21 +++++++++++++++++++++
 1 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
index 33c70d4..c98e5ba 100644
--- a/Documentation/checkpoint/readme.txt
+++ b/Documentation/checkpoint/readme.txt
@@ -341,6 +341,27 @@ we will be forced to more carefully review each of those features.
 However, this can be controlled with a sysctl-variable.
 
 
+Sockets
+=======
+
+For AF_UNIX sockets, both endpoints must be within the checkpointed
+task set to maintain a connected state after restart.  UNIX sockets
+that are in the process of passing a descriptor will cause the
+checkpoint to fail with -EBUSY indicating a transient state that
+cannot be checkpointed.  Listening sockets with an unaccepted peer
+will also cause an -EBUSY result.
+
+AF_INET sockets with endpoints outside the checkpointed task set may
+remain open if care is taken to avoid TCP timeouts and resets.
+Careful use of a virtual IP address can help avoid emission of an RST
+to the non-checkpointed endpoint.  If desired, the
+RESTART_SOCK_LISTENONLY flag may be passed to the restart syscall
+which will cause all connected AF_INET sockets to be closed during the
+restore process.  Listening sockets will still be restored to their
+original state, which makes this mode a candidate for something like
+an HTTP server.
+
+
 Kernel interfaces
 =================
 
-- 
1.6.3.3

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers
       [not found]     ` <1257878856-25520-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-11 20:02       ` Serge E. Hallyn
  2009-11-16 18:30       ` Oren Laadan
  1 sibling, 0 replies; 18+ messages in thread
From: Serge E. Hallyn @ 2009-11-11 20:02 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> The INET code often creates socket buffers by attaching fragments instead
> of writing to the linear region.  This extends the skb write functions
> to write out the linear and fragment regions of an skb, and adds a
> function to be used by others wishing to restore an skb in the same way.
> This also includes the header-mark-setting bits from a previous patch.
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

> ---
>  include/linux/checkpoint.h     |    1 +
>  include/linux/checkpoint_hdr.h |   11 ++
>  net/checkpoint.c               |  253 ++++++++++++++++++++++++++++++++++++----
>  3 files changed, 242 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index dfcb59b..3e73e68 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -100,6 +100,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
>  			      struct socket *socket,
>  			      struct sockaddr *loc, unsigned *loc_len,
>  			      struct sockaddr *rem, unsigned *rem_len);
> +struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
> 
>  /* ckpt kflags */
>  #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 5d9c088..ace4139 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -561,8 +561,19 @@ struct ckpt_hdr_socket_queue {
> 
>  struct ckpt_hdr_socket_buffer {
>  	struct ckpt_hdr h;
> +	__u64 transport_header;
> +	__u64 network_header;
> +	__u64 mac_header;
> +	__u64 lin_len; /* Length of linear data */
> +	__u64 frg_len; /* Length of fragment data */
> +	__u64 skb_len; /* Length of skb (adjusted) */
> +	__u64 hdr_len; /* Length of skipped header */
> +	__u64 mac_len;
>  	__s32 sk_objref;
>  	__s32 pr_objref;
> +	__u16 protocol;
> +	__u16 nr_frags;
> +	__u8 cb[48];
>  };
> 
>  #define CKPT_UNIX_LINKED 1
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> index dd23efd..00365b2 100644
> --- a/net/checkpoint.c
> +++ b/net/checkpoint.c
> @@ -17,9 +17,11 @@
>  #include <linux/syscalls.h>
>  #include <linux/sched.h>
>  #include <linux/fs_struct.h>
> +#include <linux/highmem.h>
> 
>  #include <net/af_unix.h>
>  #include <net/tcp_states.h>
> +#include <net/tcp.h>
> 
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
> @@ -88,6 +90,233 @@ static int sock_copy_buffers(struct sk_buff_head *from,
>  	return -EAGAIN;
>  }
> 
> +static void sock_record_header_info(struct sk_buff *skb,
> +				    struct ckpt_hdr_socket_buffer *h)
> +{
> +
> +	h->mac_len = skb->mac_len;
> +	h->skb_len = skb->len;
> +	h->hdr_len = skb->data - skb->head;
> +	h->lin_len = (__u64)(skb->tail - skb->head);
> +	h->frg_len = skb->data_len;
> +
> +#ifdef NET_SKBUFF_DATA_USES_OFFSET
> +	h->transport_header = skb->transport_hdr;
> +	h->network_header = skb->network_header;
> +	h->mac_header = skb->mac_header;
> +#else
> +	h->transport_header = skb->transport_header - skb->head;
> +	h->network_header = skb->network_header - skb->head;
> +	h->mac_header = skb->mac_header - skb->head;
> +#endif
> +
> +	memcpy(h->cb, skb->cb, sizeof(skb->cb));
> +	h->nr_frags = skb_shinfo(skb)->nr_frags;
> +}
> +
> +int sock_restore_header_info(struct sk_buff *skb,
> +			     struct ckpt_hdr_socket_buffer *h)
> +{
> +	if (h->mac_header + h->mac_len != h->network_header) {
> +		ckpt_debug("skb mac_header %llu+%llu != network header %llu\n",
> +			   h->mac_header, h->mac_len, h->network_header);
> +		return -EINVAL;
> +	}
> +
> +	if (h->network_header > h->lin_len) {
> +		ckpt_debug("skb network header %llu > linear length %llu\n",
> +			   h->network_header, h->lin_len);
> +		return -EINVAL;
> +	}
> +
> +	if (h->transport_header > h->lin_len) {
> +		ckpt_debug("skb transport header %llu > linear length %llu\n",
> +			   h->transport_header, h->lin_len);
> +		return -EINVAL;
> +	}
> +
> +	if (h->skb_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("skb total length %llu larger than max of %lu\n",
> +			   h->skb_len, SKB_MAX_ALLOC);
> +		return -EINVAL;
> +	}
> +
> +	skb_set_transport_header(skb, h->transport_header);
> +	skb_set_network_header(skb, h->network_header);
> +	skb_set_mac_header(skb, h->mac_header);
> +	skb->mac_len = h->mac_len;
> +
> +	/* FIXME: This should probably be sanitized per-protocol to
> +	 * make sure nothing bad happens if it is hijacked.  For the
> +	 * current set of protocols that we restore this way, the data
> +	 * contained within is not very risky (flags and sequence
> +	 * numbers) but could still be evalutated from a
> +	 * could-the-user- have-set-these-flags point of view.
> +	 */
> +	memcpy(skb->cb, h->cb, sizeof(skb->cb));
> +
> +	skb->data = skb->head + skb->hdr_len;
> +	skb->len = h->skb_len;
> +
> +	return 0;
> +}
> +
> +static int sock_restore_skb_frag(struct ckpt_ctx *ctx,
> +				 struct sk_buff *skb,
> +				 int frag_idx)
> +{
> +	int ret = 0;
> +	int fraglen;
> +	struct page *page;
> +	void *buf;
> +
> +	fraglen = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> +	if (fraglen < 0)
> +		return fraglen;
> +
> +	if (fraglen > PAGE_SIZE) {
> +		ckpt_debug("skb frag size %i > PAGE_SIZE\n", fraglen);
> +		return -EINVAL;
> +	}
> +
> +	page = alloc_page(GFP_KERNEL);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	buf = kmap(page);
> +	ret = ckpt_kread(ctx, buf, fraglen);
> +	kunmap(page);
> +
> +	if (ret) {
> +		ckpt_debug("failed to read fragment: %i\n", ret);
> +		ret = -EINVAL;
> +		__free_page(page);
> +	} else {
> +		ckpt_debug("read %i for fragment %i\n", fraglen, frag_idx);
> +		skb_add_rx_frag(skb, frag_idx, page, 0, fraglen);
> +	}
> +
> +	return ret < 0 ? ret : fraglen;
> +}
> +
> +struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff *skb = NULL;
> +	int i;
> +	int ret = 0;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return (struct sk_buff *)h;
> +
> +	if (h->lin_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket linear buffer too big (%llu > %lu)\n",
> +			   h->lin_len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	} else if (h->frg_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket frag size too big (%llu > %lu\n",
> +			   h->frg_len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +
> +	skb = alloc_skb(h->lin_len, GFP_KERNEL);
> +	if (!skb) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = _ckpt_read_obj_type(ctx, skb_put(skb, h->lin_len),
> +				  h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("read linear skb length %llu: %i\n", h->lin_len, ret);
> +	if (ret < 0) {
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->nr_frags; i++) {
> +		ret = sock_restore_skb_frag(ctx, skb, i);
> +		ckpt_debug("read skb frag %i/%i: %i\n",
> +			   i + 1, h->nr_frags, ret);
> +		if (ret < 0)
> +			goto out;
> +		h->frg_len -= ret;
> +	}
> +
> +	if (h->frg_len != 0) {
> +		ckpt_debug("length %llu remaining after reading frags\n",
> +			   h->frg_len);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	sock_restore_header_info(skb, h);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	if (ret < 0) {
> +		kfree_skb(skb);
> +		skb = ERR_PTR(ret);
> +	}
> +
> +	return skb;
> +}
> +
> +static int __sock_write_skb(struct ckpt_ctx *ctx,
> +			    struct sk_buff *skb,
> +			    int dst_objref)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	if (dst_objref > 0) {
> +		BUG_ON(!skb->sk);
> +		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
> +		if (ret < 0)
> +			goto out;
> +		h->sk_objref = ret;
> +		h->pr_objref = dst_objref;
> +	}
> +
> +	sock_record_header_info(skb, h);
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = ckpt_write_obj_type(ctx, skb->head, h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("writing skb linear region %llu: %i\n", h->lin_len, ret);
> +	if (ret < 0)
> +		goto out;
> +
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +		u8 *vaddr = kmap(frag->page);
> +
> +		ckpt_debug("writing buffer fragment %i/%i (%i)\n",
> +			   i + 1, h->nr_frags, frag->size);
> +		ret = ckpt_write_obj_type(ctx, vaddr + frag->page_offset,
> +					  frag->size, CKPT_HDR_BUFFER);
> +		kunmap(frag->page);
> +		h->frg_len -= frag->size;
> +		if (ret < 0)
> +			goto out;
> +	}
> +
> +	WARN_ON(h->frg_len != 0);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
>  static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  				struct sk_buff_head *queue,
>  				int dst_objref)
> @@ -95,13 +324,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  	struct sk_buff *skb;
> 
>  	skb_queue_walk(queue, skb) {
> -		struct ckpt_hdr_socket_buffer *h;
>  		int ret = 0;
> 
> -		/* FIXME: This could be a false positive for non-unix
> -		 *        buffers, so add a type check here in the
> -		 *        future
> -		 */
>  		if (UNIXCB(skb).fp) {
>  			ckpt_write_err(ctx, "TE", "af_unix: pass fd", -EBUSY);
>  			return -EBUSY;
> @@ -113,25 +337,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  		 * because we don't save out (or restore) the control
>  		 * information contained in the skb.
>  		 */
> -		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> -		if (!h)
> -			return -ENOMEM;
> -
> -		BUG_ON(!skb->sk);
> -		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
> -		if (ret < 0)
> -			goto end;
> -		h->sk_objref = ret;
> -		h->pr_objref = dst_objref;
> -
> -		ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> -		if (ret < 0)
> -			goto end;
> 
> -		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
> -					  CKPT_HDR_BUFFER);
> -	end:
> -		ckpt_hdr_put(ctx, h);
> +		ret = __sock_write_skb(ctx, skb, dst_objref);
>  		if (ret < 0)
>  			return ret;
>  	}
> -- 
> 1.6.3.3
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v4)
  2009-11-10 18:47 ` [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v4) Dan Smith
@ 2009-11-11 20:32   ` Serge E. Hallyn
  0 siblings, 0 replies; 18+ messages in thread
From: Serge E. Hallyn @ 2009-11-11 20:32 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers, netdev

Quoting Dan Smith (danms@us.ibm.com):
> This patch adds basic support for C/R of open INET sockets.  I think that
> all the important bits of the TCP and ICSK socket structures is saved,
> but I think there is still some additional IPv6 stuff that needs to be
> handled.
> 
> With this patch applied, the following script can be used to demonstrate
> the functionality:
> 
>   https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html
> 
> It shows that this enables migration of a sendmail process with open
> connections from one machine to another without dropping.
> 
> We probably need comments from the netdev people about the quality of
> sanity checking we do on the values in the ckpt_hdr_socket_inet
> structure on restart.
> 
> Note that this still doesn't address lingering sockets yet.
> 
> Changes in v4:
>  - Use the new socket buffer restore functions introduced in the
>    previous patch
>  - Move listen_sockets list under the restart items in ckpt_ctx
>  - Rename RESTART_SOCK_LISTENONLY to RESTART_CONN_RESET
> 
> Changes in v3:
>  - Prevent restart from allowing a bind on a <1024 port unless the
>    user is granted that capability
>  - Add some sanity checking in the inet_precheck() function to make sure
>    the values read from the checkpoint image are within acceptable ranges
>  - Check the result of sock_restore_header_info() and fail if needed
> 
> Changes in v2:
>  - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
>    structure instead of saving them separately
>  - Fix 'sock' naming in sock_cptrst()
>  - Don't take the queue lock before skb_queue_tail() since it is
>    done for us
>  - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
>    flag is specified on sys_restart()
>  - Pull the implementation of the list of listening sockets back into
>    this patch
>  - Fix dangling printk
>  - Add some comments around the parent/child restore logic
> 
> Cc: netdev@vger.kernel.org
> Acked-by: Oren Laadan <orenl@librato.com>
> Signed-off-by: Dan Smith <danms@us.ibm.com>

Acked-by: Serge Hallyn <serue@us.ibm.com>

There are probably still gremlins in there somewhere, but imo it's
past time to get this in the tree so we can be testing it.

-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]     ` <1257878856-25520-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-11 21:38       ` Serge E. Hallyn
       [not found]         ` <20091111213851.GE8761-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-11 21:40       ` Serge E. Hallyn
  1 sibling, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2009-11-11 21:38 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
>  	/* Make sure there's room in the send buffer */
>  	sndbuf = sk->sk_sndbuf;
> -	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
> +	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
>  	    capable(CAP_NET_ADMIN))
> -		sk->sk_sndbuf += len;
> +		sk->sk_sndbuf += h->lin_len;
>  	else
>  		sk->sk_sndbuf = sysctl_wmem_max;

Can you explain what's going on here?  'if the size of the send buffer
minus tranmit queue bytes committed is less than linear length,
then if you're privileged you add h->len to sk->sk_sndbuf, but if
either you're not privileged or the length was greater than linear
length, then you add sysctl_wmem_max.'  ?

I realize that's a question on the original code, not on this
patch...

-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]     ` <1257878856-25520-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-11 21:38       ` Serge E. Hallyn
@ 2009-11-11 21:40       ` Serge E. Hallyn
  1 sibling, 0 replies; 18+ messages in thread
From: Serge E. Hallyn @ 2009-11-11 21:40 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

Anyway the patch itself seems fine :)

Acked-by: Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

> ---
>  net/unix/checkpoint.c |   54 ++++++++++++++++++++++++++----------------------
>  1 files changed, 29 insertions(+), 25 deletions(-)
> 
> diff --git a/net/unix/checkpoint.c b/net/unix/checkpoint.c
> index 8b7cb22..bd1fc35 100644
> --- a/net/unix/checkpoint.c
> +++ b/net/unix/checkpoint.c
> @@ -200,7 +200,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
>  	uint8_t peer_shutdown = 0;
>  	void *buf = NULL;
>  	int sndbuf;
> -	int len;
>  	int ret = 0;
> 
>  	memset(&msg, 0, sizeof(msg));
> @@ -209,14 +208,30 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
>  	if (IS_ERR(h))
>  		return PTR_ERR(h);
> 
> -	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> -	if (len < 0) {
> -		ret = len;
> +	if (h->lin_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket buffer too big (%llu > %lu)\n",
> +			   h->lin_len, SKB_MAX_ALLOC);
> +		ret = -EINVAL;
> +		goto out;
> +	} else if (h->nr_frags != 0) {
> +		ckpt_debug("unix socket claims to have fragments\n");
> +		ret = -EINVAL;
>  		goto out;
> -	} else if (len > SKB_MAX_ALLOC) {
> -		ckpt_debug("Socket buffer too big (%i > %lu)",
> -			   len, SKB_MAX_ALLOC);
> -		ret = -ENOSPC;
> +	}
> +
> +	buf = kmalloc(h->lin_len, GFP_KERNEL);
> +	if (!buf) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	kvec.iov_len = h->lin_len;
> +	kvec.iov_base = buf;
> +	ret = _ckpt_read_obj_type(ctx, kvec.iov_base,
> +				  h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("read unix socket buffer %llu: %i\n", h->lin_len, ret);
> +	if (ret < h->lin_len) {
> +		ret = -EINVAL;
>  		goto out;
>  	}
> 
> @@ -245,18 +260,6 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
>  		}
>  	}
> 
> -	kvec.iov_len = len;
> -	buf = kmalloc(len, GFP_KERNEL);
> -	kvec.iov_base = buf;
> -	if (!buf) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -
> -	ret = ckpt_kread(ctx, kvec.iov_base, len);
> -	if (ret < 0)
> -		goto out;
> -
>  	msg.msg_name = addr;
>  	msg.msg_namelen = addrlen;
> 
> @@ -272,15 +275,16 @@ static int sock_read_buffer_sendmsg(struct ckpt_ctx *ctx,
> 
>  	/* Make sure there's room in the send buffer */
>  	sndbuf = sk->sk_sndbuf;
> -	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
> +	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
>  	    capable(CAP_NET_ADMIN))
> -		sk->sk_sndbuf += len;
> +		sk->sk_sndbuf += h->lin_len;
>  	else
>  		sk->sk_sndbuf = sysctl_wmem_max;
> 
> -	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, len);
> -	ckpt_debug("kernel_sendmsg(%i,%i): %i\n", h->sk_objref, len, ret);
> -	if ((ret > 0) && (ret != len))
> +	ret = kernel_sendmsg(sk->sk_socket, &msg, &kvec, 1, h->lin_len);
> +	ckpt_debug("kernel_sendmsg(%i,%llu): %i\n",
> +		   h->sk_objref, h->lin_len, ret);
> +	if ((ret > 0) && (ret != h->lin_len))
>  		ret = -ENOMEM;
> 
>  	sk->sk_sndbuf = sndbuf;
> -- 
> 1.6.3.3
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]         ` <20091111213851.GE8761-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-11 21:57           ` Dan Smith
       [not found]             ` <87vdhgvi6b.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Dan Smith @ 2009-11-11 21:57 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

>> /* Make sure there's room in the send buffer */
>> sndbuf = sk->sk_sndbuf;
>> -	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
>> +	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
>> capable(CAP_NET_ADMIN))
>> -		sk->sk_sndbuf += len;
>> +		sk->sk_sndbuf += h->lin_len;
>> else
sk-> sk_sndbuf = sysctl_wmem_max;

SH> Can you explain what's going on here?

If we're trying to restore a buffer that is larger than the remaining
space in the buffer, then one of two things can happen:

1. You're privileged and we make the space you need
2. You're not privileged so we give you the benefit of the doubt and
   set the buffer limit to the system default

In the case of 2, if that system default still isn't enough then the
sendmsg() will fail like it normally would.

The reason for this is that the application could have loaded up its
legitimate buffer with data and then set the buffer limit low.  That
doesn't purge the data it already had buffered, it just limits how
much you can add to it.  So, in order to not fail a restart of such a
legitimate situation, we assume the system default instead of the
limit set by the user.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]             ` <87vdhgvi6b.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-11-12  2:18               ` Serge E. Hallyn
       [not found]                 ` <20091112021824.GA14646-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Serge E. Hallyn @ 2009-11-12  2:18 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> >> /* Make sure there's room in the send buffer */
> >> sndbuf = sk->sk_sndbuf;
> >> -	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
> >> +	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
> >> capable(CAP_NET_ADMIN))
> >> -		sk->sk_sndbuf += len;
> >> +		sk->sk_sndbuf += h->lin_len;
> >> else
> sk-> sk_sndbuf = sysctl_wmem_max;
> 
> SH> Can you explain what's going on here?
> 
> If we're trying to restore a buffer that is larger than the remaining
> space in the buffer, then one of two things can happen:
> 
> 1. You're privileged and we make the space you need
> 2. You're not privileged so we give you the benefit of the doubt and
>    set the buffer limit to the system default
> 
> In the case of 2, if that system default still isn't enough then the
> sendmsg() will fail like it normally would.

But so should check whether h->len_len < sysctl_wmem_max before
doing the capable check?  Remember that any check for capable()
will set PF_SUPERPRIV on the task, so it's better to not call it
if it wasn't definately needed.

> The reason for this is that the application could have loaded up its
> legitimate buffer with data and then set the buffer limit low.  That
> doesn't purge the data it already had buffered, it just limits how
> much you can add to it.  So, in order to not fail a restart of such a
> legitimate situation, we assume the system default instead of the
> limit set by the user.

thanks,
-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]                 ` <20091112021824.GA14646-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-11-12 18:19                   ` Dan Smith
       [not found]                     ` <87k4xvvc5b.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-11-16 18:35                   ` Oren Laadan
  1 sibling, 1 reply; 18+ messages in thread
From: Dan Smith @ 2009-11-12 18:19 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

SH> But so should check whether h->len_len < sysctl_wmem_max before
SH> doing the capable check?  Remember that any check for capable()
SH> will set PF_SUPERPRIV on the task, so it's better to not call it
SH> if it wasn't definately needed.

Okay, sure.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]                     ` <87k4xvvc5b.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-11-12 19:43                       ` Serge E. Hallyn
  0 siblings, 0 replies; 18+ messages in thread
From: Serge E. Hallyn @ 2009-11-12 19:43 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> SH> But so should check whether h->len_len < sysctl_wmem_max before
> SH> doing the capable check?  Remember that any check for capable()
> SH> will set PF_SUPERPRIV on the task, so it's better to not call it
> SH> if it wasn't definately needed.
> 
> Okay, sure.

Ok, thanks - I figured if I was misunderstanding, then this suggestion
would sound ridiculous and you'd suitably ridicule me  :)

thanks,
-serge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers
       [not found]     ` <1257878856-25520-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-11 20:02       ` Serge E. Hallyn
@ 2009-11-16 18:30       ` Oren Laadan
       [not found]         ` <4B019A2C.9090507-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
  1 sibling, 1 reply; 18+ messages in thread
From: Oren Laadan @ 2009-11-16 18:30 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
> The INET code often creates socket buffers by attaching fragments instead
> of writing to the linear region.  This extends the skb write functions
> to write out the linear and fragment regions of an skb, and adds a
> function to be used by others wishing to restore an skb in the same way.
> This also includes the header-mark-setting bits from a previous patch.
> 
> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> ---
>  include/linux/checkpoint.h     |    1 +
>  include/linux/checkpoint_hdr.h |   11 ++
>  net/checkpoint.c               |  253 ++++++++++++++++++++++++++++++++++++----
>  3 files changed, 242 insertions(+), 23 deletions(-)
> 
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index dfcb59b..3e73e68 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -100,6 +100,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
>  			      struct socket *socket,
>  			      struct sockaddr *loc, unsigned *loc_len,
>  			      struct sockaddr *rem, unsigned *rem_len);
> +struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
>  
>  /* ckpt kflags */
>  #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 5d9c088..ace4139 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -561,8 +561,19 @@ struct ckpt_hdr_socket_queue {
>  
>  struct ckpt_hdr_socket_buffer {
>  	struct ckpt_hdr h;
> +	__u64 transport_header;
> +	__u64 network_header;
> +	__u64 mac_header;
> +	__u64 lin_len; /* Length of linear data */
> +	__u64 frg_len; /* Length of fragment data */
> +	__u64 skb_len; /* Length of skb (adjusted) */
> +	__u64 hdr_len; /* Length of skipped header */
> +	__u64 mac_len;

Can you use u32 (or even less ?) for these ?

>  	__s32 sk_objref;
>  	__s32 pr_objref;
> +	__u16 protocol;
> +	__u16 nr_frags;
> +	__u8 cb[48];

Do you it will ever be required that cb[] be aligned ?

>  };

[...]

> +static int sock_restore_skb_frag(struct ckpt_ctx *ctx,
> +				 struct sk_buff *skb,
> +				 int frag_idx)
> +{
> +	int ret = 0;
> +	int fraglen;
> +	struct page *page;
> +	void *buf;
> +
> +	fraglen = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> +	if (fraglen < 0)
> +		return fraglen;
> +
> +	if (fraglen > PAGE_SIZE) {
> +		ckpt_debug("skb frag size %i > PAGE_SIZE\n", fraglen);
> +		return -EINVAL;
> +	}
> +
> +	page = alloc_page(GFP_KERNEL);
> +	if (!page)
> +		return -ENOMEM;
> +
> +	buf = kmap(page);
> +	ret = ckpt_kread(ctx, buf, fraglen);
> +	kunmap(page);

I'm unsure how much of a performance issue this is - I sort of expected
a comment from Dave Hansen about this;  Did you consider reusing the
restore_read_page() from checkpoint/memory.c ?  (and the matching
checkpoint_dump_page() for the checkpoint).

> +
> +	if (ret) {
> +		ckpt_debug("failed to read fragment: %i\n", ret);
> +		ret = -EINVAL;
> +		__free_page(page);
> +	} else {
> +		ckpt_debug("read %i for fragment %i\n", fraglen, frag_idx);
> +		skb_add_rx_frag(skb, frag_idx, page, 0, fraglen);
> +	}
> +
> +	return ret < 0 ? ret : fraglen;
> +}
> +
> +struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff *skb = NULL;
> +	int i;
> +	int ret = 0;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return (struct sk_buff *)h;
> +
> +	if (h->lin_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket linear buffer too big (%llu > %lu)\n",
> +			   h->lin_len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	} else if (h->frg_len > SKB_MAX_ALLOC) {
> +		ckpt_debug("socket frag size too big (%llu > %lu\n",
> +			   h->frg_len, SKB_MAX_ALLOC);
> +		ret = -ENOSPC;
> +		goto out;
> +	}
> +
> +	skb = alloc_skb(h->lin_len, GFP_KERNEL);
> +	if (!skb) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = _ckpt_read_obj_type(ctx, skb_put(skb, h->lin_len),
> +				  h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("read linear skb length %llu: %i\n", h->lin_len, ret);
> +	if (ret < 0) {
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->nr_frags; i++) {

Sanitize h->nr_frags ?

> +		ret = sock_restore_skb_frag(ctx, skb, i);
> +		ckpt_debug("read skb frag %i/%i: %i\n",
> +			   i + 1, h->nr_frags, ret);
> +		if (ret < 0)
> +			goto out;
> +		h->frg_len -= ret;
> +	}
> +
> +	if (h->frg_len != 0) {
> +		ckpt_debug("length %llu remaining after reading frags\n",
> +			   h->frg_len);
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	sock_restore_header_info(skb, h);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	if (ret < 0) {
> +		kfree_skb(skb);
> +		skb = ERR_PTR(ret);
> +	}
> +
> +	return skb;
> +}
> +
> +static int __sock_write_skb(struct ckpt_ctx *ctx,
> +			    struct sk_buff *skb,
> +			    int dst_objref)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	if (dst_objref > 0) {
> +		BUG_ON(!skb->sk);
> +		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
> +		if (ret < 0)
> +			goto out;
> +		h->sk_objref = ret;
> +		h->pr_objref = dst_objref;
> +	}
> +
> +	sock_record_header_info(skb, h);
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = ckpt_write_obj_type(ctx, skb->head, h->lin_len, CKPT_HDR_BUFFER);
> +	ckpt_debug("writing skb linear region %llu: %i\n", h->lin_len, ret);
> +	if (ret < 0)
> +		goto out;
> +
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +		u8 *vaddr = kmap(frag->page);

Here, too, consider checkpoint_dump_page() to avoid kmap() ?

It makes sense to have a scratch page to be used (also) for this,
on ckpt_ctx - to avoid alloc/dealloc repeatedly.

> +
> +		ckpt_debug("writing buffer fragment %i/%i (%i)\n",
> +			   i + 1, h->nr_frags, frag->size);
> +		ret = ckpt_write_obj_type(ctx, vaddr + frag->page_offset,
> +					  frag->size, CKPT_HDR_BUFFER);
> +		kunmap(frag->page);

[...]

Oren.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file
       [not found]                 ` <20091112021824.GA14646-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-11-12 18:19                   ` Dan Smith
@ 2009-11-16 18:35                   ` Oren Laadan
  1 sibling, 0 replies; 18+ messages in thread
From: Oren Laadan @ 2009-11-16 18:35 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, Dan Smith



Serge E. Hallyn wrote:
> Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
>>>> /* Make sure there's room in the send buffer */
>>>> sndbuf = sk->sk_sndbuf;
>>>> -	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < len) &&
>>>> +	if (((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < h->lin_len) &&
>>>> capable(CAP_NET_ADMIN))
>>>> -		sk->sk_sndbuf += len;
>>>> +		sk->sk_sndbuf += h->lin_len;
>>>> else
>> sk-> sk_sndbuf = sysctl_wmem_max;
>>
>> SH> Can you explain what's going on here?
>>
>> If we're trying to restore a buffer that is larger than the remaining
>> space in the buffer, then one of two things can happen:
>>
>> 1. You're privileged and we make the space you need
>> 2. You're not privileged so we give you the benefit of the doubt and
>>    set the buffer limit to the system default
>>
>> In the case of 2, if that system default still isn't enough then the
>> sendmsg() will fail like it normally would.
> 
> But so should check whether h->len_len < sysctl_wmem_max before
> doing the capable check?  Remember that any check for capable()
> will set PF_SUPERPRIV on the task, so it's better to not call it
> if it wasn't definately needed.
> 
>> The reason for this is that the application could have loaded up its
>> legitimate buffer with data and then set the buffer limit low.  That
>> doesn't purge the data it already had buffered, it just limits how
>> much you can add to it.  So, in order to not fail a restart of such a
>> legitimate situation, we assume the system default instead of the
>> limit set by the user.

Maybe also worth beefing up the comment near that code to help
future reviewers/developers...

The patch is good.

Oren

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: Add support for connected INET sockets
       [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
                     ` (2 preceding siblings ...)
  2009-11-10 18:47   ` [PATCH 4/4] Add some content to the readme.txt for socket c/r Dan Smith
@ 2009-11-16 18:38   ` Oren Laadan
  3 siblings, 0 replies; 18+ messages in thread
From: Oren Laadan @ 2009-11-16 18:38 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg


The patches look good, see a few comments per patch.

Before I pull, please (1) use ckpt_err() instead of ckpt_write_err(),
and also use ckpt_err() in the restart path where it makes sense.

I should be pushing v19-rc1 by tomorrow, perhaps better wait.

Thanks,

Oren.

Dan Smith wrote:
> This set includes the previous INET connected sockets patch with a few
> trivial changes suggested by Oren, as well as the readme update.  The
> first patch fixes socket buffer checkpoint to include fragments, and
> includes the changes in the previous first patch that added header mark
> information.  As a result, this set now also includes a fix to the UNIX
> code to cope with the change.
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers
       [not found]         ` <4B019A2C.9090507-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
@ 2009-11-16 18:51           ` Dan Smith
       [not found]             ` <87einyuwum.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 18+ messages in thread
From: Dan Smith @ 2009-11-16 18:51 UTC (permalink / raw)
  To: Oren Laadan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

>> struct ckpt_hdr_socket_buffer {
>> struct ckpt_hdr h;
>> +	__u64 transport_header;
>> +	__u64 network_header;
>> +	__u64 mac_header;
>> +	__u64 lin_len; /* Length of linear data */
>> +	__u64 frg_len; /* Length of fragment data */
>> +	__u64 skb_len; /* Length of skb (adjusted) */
>> +	__u64 hdr_len; /* Length of skipped header */
>> +	__u64 mac_len;

OL> Can you use u32 (or even less ?) for these ?

Well, in the structure the len is an unsigned int, so u64 seemed
appropriate.

>> __s32 sk_objref;
>> __s32 pr_objref;
>> +	__u16 protocol;
>> +	__u16 nr_frags;
>> +	__u8 cb[48];

OL> Do you it will ever be required that cb[] be aligned ?

It's not aligned in the real structure and it's mostly opaque data, so
no, I wouldn't think so.

OL> I'm unsure how much of a performance issue this is - I sort of
OL> expected a comment from Dave Hansen about this; Did you consider
OL> reusing the restore_read_page() from checkpoint/memory.c ?  (and
OL> the matching checkpoint_dump_page() for the checkpoint).

Nope, I'll take a look.

>> +	for (i = 0; i < h->nr_frags; i++) {

OL> Sanitize h->nr_frags ?

To what?  I've already checked h->frg_len at this point, so I think
maybe just making sure frag_len never crosses zero would be
sufficient.

>> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
>> +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>> +		u8 *vaddr = kmap(frag->page);

OL> Here, too, consider checkpoint_dump_page() to avoid kmap() ?

OL> It makes sense to have a scratch page to be used (also) for this,
OL> on ckpt_ctx - to avoid alloc/dealloc repeatedly.

Hmm, is there an alloc/dealloc here?

I'll take a look at checkpoint_dump_page()...

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers
       [not found]             ` <87einyuwum.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-11-16 19:05               ` Oren Laadan
  0 siblings, 0 replies; 18+ messages in thread
From: Oren Laadan @ 2009-11-16 19:05 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg



Dan Smith wrote:
>>> struct ckpt_hdr_socket_buffer {
>>> struct ckpt_hdr h;
>>> +	__u64 transport_header;
>>> +	__u64 network_header;
>>> +	__u64 mac_header;
>>> +	__u64 lin_len; /* Length of linear data */
>>> +	__u64 frg_len; /* Length of fragment data */
>>> +	__u64 skb_len; /* Length of skb (adjusted) */
>>> +	__u64 hdr_len; /* Length of skipped header */
>>> +	__u64 mac_len;
> 
> OL> Can you use u32 (or even less ?) for these ?
> 
> Well, in the structure the len is an unsigned int, so u64 seemed
> appropriate.
> 
>>> __s32 sk_objref;
>>> __s32 pr_objref;
>>> +	__u16 protocol;
>>> +	__u16 nr_frags;
>>> +	__u8 cb[48];
> 
> OL> Do you it will ever be required that cb[] be aligned ?
> 
> It's not aligned in the real structure and it's mostly opaque data, so
> no, I wouldn't think so.
> 
> OL> I'm unsure how much of a performance issue this is - I sort of
> OL> expected a comment from Dave Hansen about this; Did you consider
> OL> reusing the restore_read_page() from checkpoint/memory.c ?  (and
> OL> the matching checkpoint_dump_page() for the checkpoint).
> 
> Nope, I'll take a look.
> 
>>> +	for (i = 0; i < h->nr_frags; i++) {
> 
> OL> Sanitize h->nr_frags ?
> 
> To what?  I've already checked h->frg_len at this point, so I think
> maybe just making sure frag_len never crosses zero would be
> sufficient.

MAX_SKB_FRAGS.

You use @i when calling skb_add_rx_frag() which in turn uses it
to index in the skb's fragments array without checking.

> 
>>> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
>>> +		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
>>> +		u8 *vaddr = kmap(frag->page);
> 
> OL> Here, too, consider checkpoint_dump_page() to avoid kmap() ?
> 
> OL> It makes sense to have a scratch page to be used (also) for this,
> OL> on ckpt_ctx - to avoid alloc/dealloc repeatedly.
> 
> Hmm, is there an alloc/dealloc here?

Sorry, should have been more explicit: these two functions use an
temporary page to read the data to (write the data from) without
holding kmap(); instead they use kmap_atomic(), copy the data to
the scratch page, kunmap_atomic(), and then write the data.

When dumping memory, the scratch page is allocated once for every
big group of pages and then freed Here, you will need to allocate
for each skb, which makes less sense - hence my suggestion that
we keep a scratch page for that purpose. (and it can be used by
the memory dump/restore too).

> 
> I'll take a look at checkpoint_dump_page()...
> 

Oren.

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2009-11-16 19:05 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-10 18:47 Add support for connected INET sockets Dan Smith
2009-11-10 18:47 ` [PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v4) Dan Smith
2009-11-11 20:32   ` Serge E. Hallyn
     [not found] ` <1257878856-25520-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-10 18:47   ` [PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers Dan Smith
     [not found]     ` <1257878856-25520-2-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-11 20:02       ` Serge E. Hallyn
2009-11-16 18:30       ` Oren Laadan
     [not found]         ` <4B019A2C.9090507-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
2009-11-16 18:51           ` Dan Smith
     [not found]             ` <87einyuwum.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-11-16 19:05               ` Oren Laadan
2009-11-10 18:47   ` [PATCH 3/4] Update the UNIX buffer restore code to match the new format saved in the image file Dan Smith
     [not found]     ` <1257878856-25520-4-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-11 21:38       ` Serge E. Hallyn
     [not found]         ` <20091111213851.GE8761-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-11 21:57           ` Dan Smith
     [not found]             ` <87vdhgvi6b.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-11-12  2:18               ` Serge E. Hallyn
     [not found]                 ` <20091112021824.GA14646-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-11-12 18:19                   ` Dan Smith
     [not found]                     ` <87k4xvvc5b.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-11-12 19:43                       ` Serge E. Hallyn
2009-11-16 18:35                   ` Oren Laadan
2009-11-11 21:40       ` Serge E. Hallyn
2009-11-10 18:47   ` [PATCH 4/4] Add some content to the readme.txt for socket c/r Dan Smith
2009-11-16 18:38   ` Add support for connected INET sockets Oren Laadan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.