All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
@ 2010-09-11  3:23 Changli Gao
  2010-09-12 12:13 ` Michael S. Tsirkin
  0 siblings, 1 reply; 7+ messages in thread
From: Changli Gao @ 2010-09-11  3:23 UTC (permalink / raw)
  To: David S. Miller
  Cc: Eric Dumazet, Oliver Hartkopp, Michael S. Tsirkin, netdev, Changli Gao

Since skb->destructor() is used to account socket memory, and may be called
before the skb is sent out, a corrupt skb may eventually be sent out.

A new destructor is added to struct skb_shared_info, and it won't
be called until the last reference to the data of an skb is put. af_packet
uses this destructor instead.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
v2: avoid kmalloc/kfree
 include/linux/skbuff.h |    3 ++-
 net/core/skbuff.c      |   19 ++++++++++++++-----
 net/packet/af_packet.c |   24 +++++++++++-------------
 3 files changed, 27 insertions(+), 19 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9e8085a..1a8cfa1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -191,6 +191,7 @@ struct skb_shared_info {
 	__u8		tx_flags;
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
+	void		(*destructor)(struct sk_buff *skb);
 
 	/*
 	 * Warning : all fields before dataref are cleared in __alloc_skb()
@@ -199,7 +200,7 @@ struct skb_shared_info {
 
 	/* Intermediate layers must ensure that destructor_arg
 	 * remains valid until skb destructor */
-	void *		destructor_arg;
+	void		*destructor_arg[2];
 	/* must be last field, see pskb_expand_head() */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 752c197..fef81f3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -332,10 +332,14 @@ static void skb_release_data(struct sk_buff *skb)
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
-		if (skb_shinfo(skb)->nr_frags) {
+		struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+		if (shinfo->destructor)
+			shinfo->destructor(skb);
+		if (shinfo->nr_frags) {
 			int i;
-			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
-				put_page(skb_shinfo(skb)->frags[i].page);
+			for (i = 0; i < shinfo->nr_frags; i++)
+				put_page(shinfo->frags[i].page);
 		}
 
 		if (skb_has_frag_list(skb))
@@ -497,9 +501,12 @@ bool skb_recycle_check(struct sk_buff *skb, int skb_size)
 	if (skb_shared(skb) || skb_cloned(skb))
 		return false;
 
+	shinfo = skb_shinfo(skb);
+	if (shinfo->destructor)
+		return false;
+
 	skb_release_head_state(skb);
 
-	shinfo = skb_shinfo(skb);
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
 
@@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 
 	memcpy((struct skb_shared_info *)(data + size),
 	       skb_shinfo(skb),
-	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
+	       offsetof(struct skb_shared_info,
+			frags[skb_shinfo(skb)->nr_frags]));
+	skb_shinfo(skb)->destructor = NULL;
 
 	/* Check if we can avoid taking references on fragments if we own
 	 * the last reference on skb->head. (see skb_release_data())
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 3616f27..ce81c45 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -825,19 +825,18 @@ ring_is_full:
 
 static void tpacket_destruct_skb(struct sk_buff *skb)
 {
-	struct packet_sock *po = pkt_sk(skb->sk);
-	void *ph;
-
-	BUG_ON(skb == NULL);
+	struct packet_sock *po = pkt_sk(skb_shinfo(skb)->destructor_arg[0]);
 
 	if (likely(po->tx_ring.pg_vec)) {
-		ph = skb_shinfo(skb)->destructor_arg;
+		void *ph = skb_shinfo(skb)->destructor_arg[1];
+
 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
 		atomic_dec(&po->tx_ring.pending);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
 	}
 
+	skb->sk = &po->sk;
 	sock_wfree(skb);
 }
 
@@ -862,7 +861,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	skb->dev = dev;
 	skb->priority = po->sk.sk_priority;
 	skb->mark = po->sk.sk_mark;
-	skb_shinfo(skb)->destructor_arg = ph.raw;
 
 	switch (po->tp_version) {
 	case TPACKET_V2:
@@ -884,9 +882,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	to_write = tp_len;
 
 	if (sock->type == SOCK_DGRAM) {
-		err = dev_hard_header(skb, dev, ntohs(proto), addr,
-				NULL, tp_len);
-		if (unlikely(err < 0))
+		if (unlikely(dev_hard_header(skb, dev, ntohs(proto), addr,
+					     NULL, tp_len) < 0))
 			return -EINVAL;
 	} else if (dev->hard_header_len) {
 		/* net device doesn't like empty head */
@@ -897,8 +894,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		}
 
 		skb_push(skb, dev->hard_header_len);
-		err = skb_store_bits(skb, 0, data,
-				dev->hard_header_len);
+		err = skb_store_bits(skb, 0, data, dev->hard_header_len);
 		if (unlikely(err))
 			return err;
 
@@ -906,7 +902,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 		to_write -= dev->hard_header_len;
 	}
 
-	err = -EFAULT;
 	page = virt_to_page(data);
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
@@ -1028,7 +1023,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			}
 		}
 
-		skb->destructor = tpacket_destruct_skb;
+		skb_shinfo(skb)->destructor_arg[0] = &po->sk;
+		skb_shinfo(skb)->destructor_arg[1] = ph;
+		skb->destructor = NULL;
+		skb_shinfo(skb)->destructor = tpacket_destruct_skb;
 		__packet_set_status(po, ph, TP_STATUS_SENDING);
 		atomic_inc(&po->tx_ring.pending);
 

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
  2010-09-11  3:23 [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out Changli Gao
@ 2010-09-12 12:13 ` Michael S. Tsirkin
  2010-09-15  3:20   ` David Miller
  0 siblings, 1 reply; 7+ messages in thread
From: Michael S. Tsirkin @ 2010-09-12 12:13 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, Eric Dumazet, Oliver Hartkopp, netdev

On Sat, Sep 11, 2010 at 11:23:23AM +0800, Changli Gao wrote:
> @@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>  
>  	memcpy((struct skb_shared_info *)(data + size),
>  	       skb_shinfo(skb),
> -	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
> +	       offsetof(struct skb_shared_info,
> +			frags[skb_shinfo(skb)->nr_frags]));
> +	skb_shinfo(skb)->destructor = NULL;
>  
>  	/* Check if we can avoid taking references on fragments if we own
>  	 * the last reference on skb->head. (see skb_release_data())

So it looks like pskb_expand_head will prevent the shinfo destructor
from ever being called? If so, won't this break af_packet?

-- 
MST

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
  2010-09-12 12:13 ` Michael S. Tsirkin
@ 2010-09-15  3:20   ` David Miller
  2010-09-15  5:23     ` Michael S. Tsirkin
  2010-09-22  9:24     ` Changli Gao
  0 siblings, 2 replies; 7+ messages in thread
From: David Miller @ 2010-09-15  3:20 UTC (permalink / raw)
  To: mst; +Cc: xiaosuo, eric.dumazet, socketcan, netdev

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 12 Sep 2010 14:13:49 +0200

> On Sat, Sep 11, 2010 at 11:23:23AM +0800, Changli Gao wrote:
>> @@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>>  
>>  	memcpy((struct skb_shared_info *)(data + size),
>>  	       skb_shinfo(skb),
>> -	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
>> +	       offsetof(struct skb_shared_info,
>> +			frags[skb_shinfo(skb)->nr_frags]));
>> +	skb_shinfo(skb)->destructor = NULL;
>>  
>>  	/* Check if we can avoid taking references on fragments if we own
>>  	 * the last reference on skb->head. (see skb_release_data())
> 
> So it looks like pskb_expand_head will prevent the shinfo destructor
> from ever being called? If so, won't this break af_packet?

From what I read, he is propagating it into the new SKB data blob
with expanded head area.  It would get invoked when the skb's
new data is put.

I am not sure this is correct, however.

Whoever registers the destructor only cares about the original data
area, but what constitutes "original data" is ambiguous.  In fact it
seems impossible to catch the freeing of all parts properly.

When pskb_expand_head() is invoked we get new linear part, but
non-linear part stays the same.  However, entity which registered
skb data destructor cares about old linear data lifetime, which
we will no longer track after destructor is propagated only to
the new shinfo.

So we need to do something different here.  I bet original code
overriding socket destructor semantics had a similar problem.

Changli, I have one other minor request, please name this something
like "shinfo->data_destructor" and "shinfo->data_destructor_arg".

I think that will make it easier for other humans to understand :)

Thank you.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
  2010-09-15  3:20   ` David Miller
@ 2010-09-15  5:23     ` Michael S. Tsirkin
  2010-09-22  9:35       ` Changli Gao
  2010-09-22  9:24     ` Changli Gao
  1 sibling, 1 reply; 7+ messages in thread
From: Michael S. Tsirkin @ 2010-09-15  5:23 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, eric.dumazet, socketcan, netdev

On Tue, Sep 14, 2010 at 08:20:23PM -0700, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Sun, 12 Sep 2010 14:13:49 +0200
> 
> > On Sat, Sep 11, 2010 at 11:23:23AM +0800, Changli Gao wrote:
> >> @@ -799,7 +806,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> >>  
> >>  	memcpy((struct skb_shared_info *)(data + size),
> >>  	       skb_shinfo(skb),
> >> -	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
> >> +	       offsetof(struct skb_shared_info,
> >> +			frags[skb_shinfo(skb)->nr_frags]));
> >> +	skb_shinfo(skb)->destructor = NULL;
> >>  
> >>  	/* Check if we can avoid taking references on fragments if we own
> >>  	 * the last reference on skb->head. (see skb_release_data())
> > 
> > So it looks like pskb_expand_head will prevent the shinfo destructor
> > from ever being called? If so, won't this break af_packet?
> 
> From what I read, he is propagating it into the new SKB data blob
> with expanded head area.  It would get invoked when the skb's
> new data is put.
> 
> I am not sure this is correct, however.
> 
> Destructor register only cares about original data area, but what
> constitutes "original data" is ambiguous.  In fact it seems
> impossible to catch the freeing of all parts properly.
> 
> When pskb_expand_head() is invoked we get new linear part, but
> non-linear part stays the same.  However, entity which registered
> skb data destructor cares about old linear data lifetime, which
> we will no longer track after destructor is propagated only to
> the new shinfo.
> 
> So we need to do something different here.  I bet original code
> overriding socket destructor semantics had a similar problem.
> 
> Changli, I have one other minor request, please name this something
> like "shinfo->data_destructor" and "shinfo->data_destructor_arg".
> 
> I think that will make it easier for other humans to understand :)
> 
> Thank you.

Hmm, and there's another issue I think I see here:
destructor_arg now points to a socket.
What happens if the skb gets queued on an interface for a very long time
(as can be the case with e.g. tap), and meanwhile
you try to kill the task that owns the socket, which
will try to destroy the socket?

Original code handles this by relevant devices orphaning an skb
if it's queued indefinitely.

-- 
MST

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
  2010-09-15  3:20   ` David Miller
  2010-09-15  5:23     ` Michael S. Tsirkin
@ 2010-09-22  9:24     ` Changli Gao
  1 sibling, 0 replies; 7+ messages in thread
From: Changli Gao @ 2010-09-22  9:24 UTC (permalink / raw)
  To: David Miller; +Cc: mst, eric.dumazet, socketcan, netdev

On Wed, Sep 15, 2010 at 11:20 AM, David Miller <davem@davemloft.net> wrote:
>
>
> Changli, I have one other minor request, please name this something
> like "shinfo->data_destructor" and "shinfo->data_destructor_arg".
>
> I think that will make it easier for other humans to understand :)
>

OK. Thanks.

But there is another issue when splice() is involved. If we splice
the skbs generated by an AF_PACKET socket to a pipe, the fragment pages
will be held by the pipe, but the skbs are freed, and the AF_PACKET socket
will wrongly be told that the corresponding TX ring buffers are available
for other uses.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
  2010-09-15  5:23     ` Michael S. Tsirkin
@ 2010-09-22  9:35       ` Changli Gao
  2010-09-22 10:59         ` Michael S. Tsirkin
  0 siblings, 1 reply; 7+ messages in thread
From: Changli Gao @ 2010-09-22  9:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: David Miller, eric.dumazet, socketcan, netdev

On Wed, Sep 15, 2010 at 1:23 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>
> Hmm, and there's another issue I think I see here:
> destructor_arg now points to a socket.
> What happens if the skb gets queued on an interface for a very long time
> (as can be the case with e.g. tap), and meanwhile
> you try to kill the task that owns the socket, which
> will try to destroy the socket?
>
> Original code handles this by relevant devices orphaning an skb
> if it's queued indefinitely.
>

I don't think the skb_orphan() there is used to destroy the socket in
time, but rather to notify the socket that the skbs have been sent out
and that it can send new skbs.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out
  2010-09-22  9:35       ` Changli Gao
@ 2010-09-22 10:59         ` Michael S. Tsirkin
  0 siblings, 0 replies; 7+ messages in thread
From: Michael S. Tsirkin @ 2010-09-22 10:59 UTC (permalink / raw)
  To: Changli Gao; +Cc: David Miller, eric.dumazet, socketcan, netdev

On Wed, Sep 22, 2010 at 05:35:07PM +0800, Changli Gao wrote:
> On Wed, Sep 15, 2010 at 1:23 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > Hmm, and there's another issue I think I see here:
> > destructor_arg now points to a socket.
> > What happens if the skb gets queued on an interface for a very long time
> > (as can be the case with e.g. tap), and meanwhile
> > you try to kill the task that owns the socket, which
> > will try to destroy the socket?
> >
> > Original code handles this by relevant devices orphaning an skb
> > if it's queued indefinitely.
> >
> 
> I don't think the skb_orphan() there is used to destroy the socket in
> time, but notify the socket that skbs are sent out and it can send new
> skbs.

Well, the result is that we drop a socket reference from the skb,
so it becomes possible to free the socket.

> -- 
> Regards,
> Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2010-09-22 11:05 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-11  3:23 [PATCH v2] net: af_packet: don't call tpacket_destruct_skb() until the skb is sent out Changli Gao
2010-09-12 12:13 ` Michael S. Tsirkin
2010-09-15  3:20   ` David Miller
2010-09-15  5:23     ` Michael S. Tsirkin
2010-09-22  9:35       ` Changli Gao
2010-09-22 10:59         ` Michael S. Tsirkin
2010-09-22  9:24     ` Changli Gao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.