* [PATCH] Multicast packet reassembly can fail
@ 2009-10-27 22:46 Steve Chen
  2009-10-27 23:22 ` Rick Jones
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Steve Chen @ 2009-10-27 22:46 UTC (permalink / raw)
  To: netdev

Multicast packet reassembly can fail

When multicast packets with multiple fragments are received by the same
node on more than one Ethernet port, a race condition between the fragments
from each Ethernet port can cause fragment reassembly to fail, leading to
packet drops.  This is because fragments from each Ethernet port appear
identical to the code that reassembles the IP packet.

The solution is to evaluate the Ethernet interface index in addition to all
other parameters so that every packet can be uniquely identified.  The
existing iif field in struct ipq is now used to generate the hash key, and
iif is also used for comparison in case of hash collision.

Please note that q->saddr ^ (q->iif << 5) is now being passed into
ipqhashfn to generate the hash key.  This is borrowed from the routing
code.
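
For reference, the full set of fields compared in ip4_frag_match() lives in
struct ipq, roughly as follows (an abbreviated sketch of the struct in
net/ipv4/ip_fragment.c of this era; unrelated members are omitted):

struct ipq {
	struct inet_frag_queue q;	/* common fragment-queue state */

	u32		user;		/* defrag context (IP_DEFRAG_*) */
	__be32		saddr;		/* source address of the fragments */
	__be32		daddr;		/* destination address */
	__be16		id;		/* IP identification field */
	u8		protocol;	/* L4 protocol number */
	int		iif;		/* ingress ifindex, now part of the key */
	struct inet_peer *peer;		/* used for max-dist sanity checks */
};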

Signed-off-by: Steve Chen <schen@mvista.com>
Signed-off-by: Mark Huth <mhuth@mvista.com>

---

 net/ipv4/ip_fragment.c |   24 +++++++++++++++++-------
 1 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 575f9bd..2de0035 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -90,6 +90,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 struct ip4_create_arg {
 	struct iphdr *iph;
 	u32 user;
+	int iif;
 };
 
 static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
@@ -104,7 +105,8 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q)
 	struct ipq *ipq;
 
 	ipq = container_of(q, struct ipq, q);
-	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
+	return ipqhashfn(ipq->id, ipq->saddr ^ (ipq->iif << 5), ipq->daddr,
+			 ipq->protocol);
 }
 
 static int ip4_frag_match(struct inet_frag_queue *q, void *a)
@@ -117,6 +119,7 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
 			qp->saddr == arg->iph->saddr &&
 			qp->daddr == arg->iph->daddr &&
 			qp->protocol == arg->iph->protocol &&
+			qp->iif == arg->iif &&
 			qp->user == arg->user);
 }
 
@@ -140,6 +143,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a)
 	qp->saddr = arg->iph->saddr;
 	qp->daddr = arg->iph->daddr;
 	qp->user = arg->user;
+	qp->iif = arg->iif;
 	qp->peer = sysctl_ipfrag_max_dist ?
 		inet_getpeer(arg->iph->saddr, 1) : NULL;
 }
@@ -219,7 +223,8 @@ out:
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
-static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
+static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user,
+				  int iif)
 {
 	struct inet_frag_queue *q;
 	struct ip4_create_arg arg;
@@ -227,9 +232,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
 
 	arg.iph = iph;
 	arg.user = user;
+	arg.iif = iif;
 
 	read_lock(&ip4_frags.lock);
-	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
+	hash = ipqhashfn(iph->id, iph->saddr ^ (iif << 5), iph->daddr,
+			 iph->protocol);
 
 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
 	if (q == NULL)
@@ -433,10 +440,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		qp->q.fragments = skb;
 
 	dev = skb->dev;
-	if (dev) {
-		qp->iif = dev->ifindex;
+	if (dev)
 		skb->dev = NULL;
-	}
+
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	atomic_add(skb->truesize, &qp->q.net->mem);
@@ -572,6 +578,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 {
 	struct ipq *qp;
 	struct net *net;
+	int iif = 0;
 
 	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
@@ -580,8 +587,12 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh)
 		ip_evictor(net);
 
+	if (skb->dev)
+		iif = skb->dev->ifindex;
+
 	/* Lookup (or create) queue header */
-	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
+	qp = ip_find(net, ip_hdr(skb), user, iif);
+	if (qp != NULL) {
 		int ret;
 
 		spin_lock(&qp->q.lock);




* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-27 22:46 [PATCH] Multicast packet reassembly can fail Steve Chen
@ 2009-10-27 23:22 ` Rick Jones
  2009-10-28 13:29   ` Steve Chen
  2009-10-28 16:55   ` Mark Huth
  2009-10-28 10:18 ` Eric Dumazet
  2009-10-28 20:12 ` David Stevens
  2 siblings, 2 replies; 17+ messages in thread
From: Rick Jones @ 2009-10-27 23:22 UTC (permalink / raw)
  To: Steve Chen; +Cc: netdev

Steve Chen wrote:
> Multicast packet reassembly can fail
> 
> When multicast packets with multiple fragments are received by the same
> node on more than one Ethernet port, a race condition between the fragments
> from each Ethernet port can cause fragment reassembly to fail, leading to
> packet drops.  This is because fragments from each Ethernet port appear
> identical to the code that reassembles the IP packet.
> 
> The solution is to evaluate the Ethernet interface index in addition to all
> other parameters so that every packet can be uniquely identified.  The
> existing iif field in struct ipq is now used to generate the hash key, and
> iif is also used for comparison in case of hash collision.
> 
> Please note that q->saddr ^ (q->iif << 5) is now being passed into
> ipqhashfn to generate the hash key.  This is borrowed from the routing
> code.
> 
> Signed-off-by: Steve Chen <schen@mvista.com>
> Signed-off-by: Mark Huth <mhuth@mvista.com>

It has been hours since my last good Emily Litella moment so I'll ask - isn't 
the combination of source and dest addr, protocol, IP ID and fragment offset 
supposed to take care of this?  How does the ingress interface have anything to 
do with it?

rick jones


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-27 22:46 [PATCH] Multicast packet reassembly can fail Steve Chen
  2009-10-27 23:22 ` Rick Jones
@ 2009-10-28 10:18 ` Eric Dumazet
  2009-10-28 13:32   ` Steve Chen
  2009-10-29  4:57   ` David Miller
  2009-10-28 20:12 ` David Stevens
  2 siblings, 2 replies; 17+ messages in thread
From: Eric Dumazet @ 2009-10-28 10:18 UTC (permalink / raw)
  To: Steve Chen; +Cc: netdev

Steve Chen wrote:
> Multicast packet reassembly can fail
> 
> When multicast packets with multiple fragments are received by the same
> node on more than one Ethernet port, a race condition between the fragments
> from each Ethernet port can cause fragment reassembly to fail, leading to
> packet drops.  This is because fragments from each Ethernet port appear
> identical to the code that reassembles the IP packet.
> 
> The solution is to evaluate the Ethernet interface index in addition to all
> other parameters so that every packet can be uniquely identified.  The
> existing iif field in struct ipq is now used to generate the hash key, and
> iif is also used for comparison in case of hash collision.
> 
> Please note that q->saddr ^ (q->iif << 5) is now being passed into
> ipqhashfn to generate the hash key.  This is borrowed from the routing
> code.
> 
> Signed-off-by: Steve Chen <schen@mvista.com>
> Signed-off-by: Mark Huth <mhuth@mvista.com>
> 

This makes no sense to me, but I need to check the code.

How could the interface matter in IP defragmentation?
And why is multicast part of the equation?

If defrag fails, it must be for some other reason,
and probably needs a different fix.

Check line 219 of net/ipv4/inet_fragment.c

#ifdef CONFIG_SMP
        /* With SMP race we have to recheck hash table, because
         * such entry could be created on other cpu, while we
         * promoted read lock to write lock.
         */
        hlist_for_each_entry(qp, n, &f->hash[hash], list) {
                if (qp->net == nf && f->match(qp, arg)) {
                        atomic_inc(&qp->refcnt);
                        write_unlock(&f->lock);
                        qp_in->last_in |= INET_FRAG_COMPLETE;   <<< HERE >>>
                        inet_frag_put(qp_in, f);
                        return qp;
                }
        }
#endif

I really wonder why we set INET_FRAG_COMPLETE here


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-27 23:22 ` Rick Jones
@ 2009-10-28 13:29   ` Steve Chen
  2009-10-28 16:55   ` Mark Huth
  1 sibling, 0 replies; 17+ messages in thread
From: Steve Chen @ 2009-10-28 13:29 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev

On Tue, 2009-10-27 at 16:22 -0700, Rick Jones wrote:
> Steve Chen wrote:
> > Multicast packet reassembly can fail
> > 
> > When multicast packets with multiple fragments are received by the same
> > node on more than one Ethernet port, a race condition between the fragments
> > from each Ethernet port can cause fragment reassembly to fail, leading to
> > packet drops.  This is because fragments from each Ethernet port appear
> > identical to the code that reassembles the IP packet.
> > 
> > The solution is to evaluate the Ethernet interface index in addition to all
> > other parameters so that every packet can be uniquely identified.  The
> > existing iif field in struct ipq is now used to generate the hash key, and
> > iif is also used for comparison in case of hash collision.
> > 
> > Please note that q->saddr ^ (q->iif << 5) is now being passed into
> > ipqhashfn to generate the hash key.  This is borrowed from the routing
> > code.
> > 
> > Signed-off-by: Steve Chen <schen@mvista.com>
> > Signed-off-by: Mark Huth <mhuth@mvista.com>
> 
> It has been hours since my last good Emily Litella moment so I'll ask - isn't 
> the combination of source and dest addr, protocol, IP ID and fragment offset 
> supposed to take care of this?  How does the ingress interface have anything to 
> do with it?

Here is the scenario this patch tries to address:

<src node> ---->  <switch>  ----> <eth0 dest node>
                            \--->  <eth1 dest node>

For this specific case, src/dst address, protocol, IP ID and fragment
offset are all identical.  The only difference is the ingress interface.
A good follow-up question would be: why would anyone in their right mind
multicast to the same destination?  Well, I don't know.  I cannot get
the people who reported the problem to tell me either.  Since someone
found a need to do this, perhaps others may find it useful too.
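
For reference, a minimal sender along these lines is enough to trigger the
fragmentation (a sketch only; the group address, port, and payload size are
placeholders, and error checking is omitted):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in dst;
	char buf[4000];		/* larger than a 1500-byte MTU, so the
				 * stack must send multiple fragments */

	memset(&dst, 0, sizeof(dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(9999);			/* placeholder port */
	inet_pton(AF_INET, "239.1.1.1", &dst.sin_addr);	/* placeholder group */

	memset(buf, 0xab, sizeof(buf));
	sendto(s, buf, sizeof(buf), 0, (struct sockaddr *)&dst, sizeof(dst));
	close(s);
	return 0;
}

With both receiving interfaces on the dest node joined to the group, each
datagram arrives as two fragments per interface, which is what exposes the
arrival-order dependence described above.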

Regards,

Steve



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 13:32   ` Steve Chen
@ 2009-10-28 13:30     ` Eric Dumazet
  0 siblings, 0 replies; 17+ messages in thread
From: Eric Dumazet @ 2009-10-28 13:30 UTC (permalink / raw)
  To: Steve Chen; +Cc: netdev

Steve Chen wrote:
 
> I sent the specific scenario the patch tries to address to the list in
> an earlier e-mail.  Would it be beneficial if I post the test code
> somewhere so everyone can have access?
> 

Yes please, I cannot find your previous mail in my archives.

Thanks



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 10:18 ` Eric Dumazet
@ 2009-10-28 13:32   ` Steve Chen
  2009-10-28 13:30     ` Eric Dumazet
  2009-10-29  4:57   ` David Miller
  1 sibling, 1 reply; 17+ messages in thread
From: Steve Chen @ 2009-10-28 13:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

On Wed, 2009-10-28 at 11:18 +0100, Eric Dumazet wrote:
> Steve Chen wrote:
> > Multicast packet reassembly can fail
> > 
> > When multicast packets with multiple fragments are received by the same
> > node on more than one Ethernet port, a race condition between the fragments
> > from each Ethernet port can cause fragment reassembly to fail, leading to
> > packet drops.  This is because fragments from each Ethernet port appear
> > identical to the code that reassembles the IP packet.
> > 
> > The solution is to evaluate the Ethernet interface index in addition to all
> > other parameters so that every packet can be uniquely identified.  The
> > existing iif field in struct ipq is now used to generate the hash key, and
> > iif is also used for comparison in case of hash collision.
> > 
> > Please note that q->saddr ^ (q->iif << 5) is now being passed into
> > ipqhashfn to generate the hash key.  This is borrowed from the routing
> > code.
> > 
> > Signed-off-by: Steve Chen <schen@mvista.com>
> > Signed-off-by: Mark Huth <mhuth@mvista.com>
> > 
> 
> This makes no sense to me, but I need to check the code.
> 
> How could the interface matter in IP defragmentation?
> And why is multicast part of the equation?
> 
> If defrag fails, it must be for some other reason,
> and probably needs a different fix.
> 
> Check line 219 of net/ipv4/inet_fragment.c
> 
> #ifdef CONFIG_SMP
>         /* With SMP race we have to recheck hash table, because
>          * such entry could be created on other cpu, while we
>          * promoted read lock to write lock.
>          */
>         hlist_for_each_entry(qp, n, &f->hash[hash], list) {
>                 if (qp->net == nf && f->match(qp, arg)) {
>                         atomic_inc(&qp->refcnt);
>                         write_unlock(&f->lock);
>                         qp_in->last_in |= INET_FRAG_COMPLETE;   <<< HERE >>>
>                         inet_frag_put(qp_in, f);
>                         return qp;
>                 }
>         }
> #endif
> 
> I really wonder why we set INET_FRAG_COMPLETE here

I sent the specific scenario the patch tries to address to the list in
an earlier e-mail.  Would it be beneficial if I post the test code
somewhere so everyone can have access?

Regards,

Steve



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-27 23:22 ` Rick Jones
  2009-10-28 13:29   ` Steve Chen
@ 2009-10-28 16:55   ` Mark Huth
  2009-10-28 17:18     ` Rick Jones
  1 sibling, 1 reply; 17+ messages in thread
From: Mark Huth @ 2009-10-28 16:55 UTC (permalink / raw)
  To: Rick Jones; +Cc: Steve Chen, netdev

Rick Jones wrote:
> Steve Chen wrote:
>> Multicast packet reassembly can fail
>>
>> When multicast packets with multiple fragments are received by the
>> same node on more than one Ethernet port, a race condition between the
>> fragments from each Ethernet port can cause fragment reassembly to
>> fail, leading to packet drops.  This is because fragments from each
>> Ethernet port appear identical to the code that reassembles the IP
>> packet.
>>
>> The solution is to evaluate the Ethernet interface index in addition
>> to all other parameters so that every packet can be uniquely
>> identified.  The existing iif field in struct ipq is now used to
>> generate the hash key, and iif is also used for comparison in case of
>> hash collision.
>>
>> Please note that q->saddr ^ (q->iif << 5) is now being passed into
>> ipqhashfn to generate the hash key.  This is borrowed from the routing
>> code.
>>
>> Signed-off-by: Steve Chen <schen@mvista.com>
>> Signed-off-by: Mark Huth <mhuth@mvista.com>
> 
> It has been hours since my last good Emily Litella moment so I'll ask - 
> isn't the combination of source and dest addr, protocol, IP ID and 
> fragment offset supposed to take care of this?  How does the ingress 
> interface have anything to do with it?
> 
> rick jones
The problem we've seen arises only when there are multiple interfaces 
each receiving the same multicast packets.  In that case there are 
multiple packets with the same key.  Steve was able to track down a 
packet loss due to re-assembly failure under certain arrival order 
conditions.

The proposed fix eliminated the packet loss in this case.  There might 
be a different problem in the re-assembly code that we have masked by 
separating the packets into streams from each interface.  Now that you 
mention it, the re-assembly code should be robust in the face of some 
duplicated and mis-ordered packets.  We can look more closely at that code.

Mark Huth



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 16:55   ` Mark Huth
@ 2009-10-28 17:18     ` Rick Jones
  2009-10-28 17:50       ` Steve Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Rick Jones @ 2009-10-28 17:18 UTC (permalink / raw)
  To: Mark Huth; +Cc: Steve Chen, netdev

>> It has been hours since my last good Emily Litella moment so I'll ask 
>> - isn't the combination of source and dest addr, protocol, IP ID and 
>> fragment offset supposed to take care of this?  How does the ingress 
>> interface have anything to do with it?
>>
>> rick jones
> 
> The problem we've seen arises only when there are multiple interfaces 
> each receiving the same multicast packets.  In that case there are 
> multiple packets with the same key.  Steve was able to track down a 
> packet loss due to re-assembly failure under certain arrival order 
> conditions.
> 
> The proposed fix eliminated the packet loss in this case.  There might 
> be a different problem in the re-assembly code that we have masked by 
> separating the packets into streams from each interface.  Now that you 
> mention it, the re-assembly code should be robust in the face of some 
> duplicated and mis-ordered packets.  We can look more closely at that code.

If I understand correctly, the idea here is to say that when multiple interfaces 
receive fragments of copies of the same IP datagram, both copies will 
"survive" and flow up the stack?

I'm basing that on your description, and an email from Steve that reads:

> Actually, the patch tries to prevent packet drop for this exact
> scenario.  Please consider the following scenarios:
> 1.  Fragments come into the fragment-reassembly code in the following order:
> (eth0 frag1), (eth0 frag2), (eth1 frag1), (eth1 frag2)
> The packets from both interfaces get reassembled and are processed further.
> 
> 2.  Fragments can sometimes arrive in this order (perhaps others as well):
> (eth0 frag1), (eth1 frag1), (eth0 frag2), (eth1 frag2)
> Without this patch, eth0 frags 1/2 are overwritten by eth1 frags 1/2, and
> the packet from eth1 is dropped in the routing code.

Doesn't that rather fly in the face of the weak-end-system model followed by Linux?

I can see where scenario one leads to two IP datagrams making it up the stack, 
but I would have thought that was simply an "accident" of the situation that 
cannot reasonably be prevented, not justification to cause scenario two to send 
two datagrams up the stack.

rick jones


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 17:18     ` Rick Jones
@ 2009-10-28 17:50       ` Steve Chen
  2009-10-28 18:10         ` Rick Jones
  0 siblings, 1 reply; 17+ messages in thread
From: Steve Chen @ 2009-10-28 17:50 UTC (permalink / raw)
  To: Rick Jones; +Cc: Mark Huth, netdev

On Wed, 2009-10-28 at 10:18 -0700, Rick Jones wrote:
> >> It has been hours since my last good Emily Litella moment so I'll ask 
> >> - isn't the combination of source and dest addr, protocol, IP ID and 
> >> fragment offset supposed to take care of this?  How does the ingress 
> >> interface have anything to do with it?
> >>
> >> rick jones
> > 
> > The problem we've seen arises only when there are multiple interfaces 
> > each receiving the same multicast packets.  In that case there are 
> > multiple packets with the same key.  Steve was able to track down a 
> > packet loss due to re-assembly failure under certain arrival order 
> > conditions.
> > 
> > The proposed fix eliminated the packet loss in this case.  There might 
> > be a different problem in the re-assembly code that we have masked by 
> > separating the packets into streams from each interface.  Now that you 
> > mention it, the re-assembly code should be robust in the face of some 
> > duplicated and mis-ordered packets.  We can look more closely at that code.
> 
> If I understand correctly, the idea here is to say that when multiple interfaces 
> receive fragments of copies of the same IP datagram, both copies will 
> "survive" and flow up the stack?
> 
> I'm basing that on your description, and an email from Steve that reads:
> 
> > Actually, the patch tries to prevent packet drop for this exact
> > scenario.  Please consider the following scenarios:
> > 1.  Fragments come into the fragment-reassembly code in the following order:
> > (eth0 frag1), (eth0 frag2), (eth1 frag1), (eth1 frag2)
> > The packets from both interfaces get reassembled and are processed further.
> > 
> > 2.  Fragments can sometimes arrive in this order (perhaps others as well):
> > (eth0 frag1), (eth1 frag1), (eth0 frag2), (eth1 frag2)
> > Without this patch, eth0 frags 1/2 are overwritten by eth1 frags 1/2, and
> > the packet from eth1 is dropped in the routing code.
> 
> Doesn't that rather fly in the face of the weak-end-system model followed by Linux?
> 
> I can see where scenario one leads to two IP datagrams making it up the stack, 
> but I would have thought that was simply an "accident" of the situation that 
> cannot reasonably be prevented, not justification to cause scenario two to send 
> two datagrams up the stack.

For scenario 2, the routing code drops the 2nd packet.  As a result, no
packet makes it to the application.  If someone is willing to suggest an
alternative, I can certainly rework the patch and retest.

Regards,

Steve



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 17:50       ` Steve Chen
@ 2009-10-28 18:10         ` Rick Jones
  2009-10-28 18:40           ` Steve Chen
  0 siblings, 1 reply; 17+ messages in thread
From: Rick Jones @ 2009-10-28 18:10 UTC (permalink / raw)
  To: Steve Chen; +Cc: Mark Huth, netdev

>>If I understand correctly, the idea here is to say that when multiple interfaces 
>>receive fragments of copies of the same IP datagram, both copies will 
>>"survive" and flow up the stack?
>>
>>I'm basing that on your description, and an email from Steve that reads:
>>
>>
>>>Actually, the patch tries to prevent packet drop for this exact
>>>scenario.  Please consider the following scenarios:
>>>1.  Fragments come into the fragment-reassembly code in the following order:
>>>(eth0 frag1), (eth0 frag2), (eth1 frag1), (eth1 frag2)
>>>The packets from both interfaces get reassembled and are processed further.
>>>
>>>2.  Fragments can sometimes arrive in this order (perhaps others as well):
>>>(eth0 frag1), (eth1 frag1), (eth0 frag2), (eth1 frag2)
>>>Without this patch, eth0 frags 1/2 are overwritten by eth1 frags 1/2, and
>>>the packet from eth1 is dropped in the routing code.
>>
>>Doesn't that rather fly in the face of the weak-end-system model followed by Linux?
>>
>>I can see where scenario one leads to two IP datagrams making it up the stack, 
>>but I would have thought that was simply an "accident" of the situation that 
>>cannot reasonably be prevented, not justification to cause scenario two to send 
>>two datagrams up the stack.
> 
> 
> For scenario 2, the routing code drops the 2nd packet.  As a result, no
> packet makes it to the application.  If someone is willing to suggest an
> alternative, I can certainly rework the patch and retest.

I'll ask my next potentially Emily Litella question - don't multicast IP 
applications bind to multicast IP addresses and not interfaces?  That is to say, 
doesn't the first completed datagram get delivered to all applications on the 
host which have bound to the corresponding multicast IP (and port number...)?
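
Something like the following is what I have in mind (a sketch; the group and
port are placeholders, headers <arpa/inet.h>, <netinet/in.h>, <string.h> and
<sys/socket.h> assumed) - the bind is to the group address and port, and the
interface in the join only says where the membership is added:

int s = socket(AF_INET, SOCK_DGRAM, 0);
struct sockaddr_in local;
struct ip_mreq mr;

/* Bind to the group address and port -- not to a device. */
memset(&local, 0, sizeof(local));
local.sin_family = AF_INET;
local.sin_port = htons(9999);				/* placeholder port */
inet_pton(AF_INET, "239.1.1.1", &local.sin_addr);	/* placeholder group */
bind(s, (struct sockaddr *)&local, sizeof(local));

/* Join the group; imr_interface only selects where the join is issued. */
inet_pton(AF_INET, "239.1.1.1", &mr.imr_multiaddr);
mr.imr_interface.s_addr = htonl(INADDR_ANY);	/* let the kernel choose */
setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr));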

rick jones


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 18:10         ` Rick Jones
@ 2009-10-28 18:40           ` Steve Chen
  2009-10-29 18:04             ` Herbert Xu
  0 siblings, 1 reply; 17+ messages in thread
From: Steve Chen @ 2009-10-28 18:40 UTC (permalink / raw)
  To: Rick Jones; +Cc: Mark Huth, netdev

On Wed, 2009-10-28 at 11:10 -0700, Rick Jones wrote:
> >>If I understand correctly, the idea here is to say that when multiple interfaces 
> >>receive fragments of copies of the same IP datagram, both copies will 
> >>"survive" and flow up the stack?
> >>
> >>I'm basing that on your description, and an email from Steve that reads:
> >>
> >>
> >>>Actually, the patch tries to prevent packet drop for this exact
> >>>scenario.  Please consider the following scenarios:
> >>>1.  Fragments come into the fragment-reassembly code in the following order:
> >>>(eth0 frag1), (eth0 frag2), (eth1 frag1), (eth1 frag2)
> >>>The packets from both interfaces get reassembled and are processed further.
> >>>
> >>>2.  Fragments can sometimes arrive in this order (perhaps others as well):
> >>>(eth0 frag1), (eth1 frag1), (eth0 frag2), (eth1 frag2)
> >>>Without this patch, eth0 frags 1/2 are overwritten by eth1 frags 1/2, and
> >>>the packet from eth1 is dropped in the routing code.
> >>
> >>Doesn't that rather fly in the face of the weak-end-system model followed by Linux?
> >>
> >>I can see where scenario one leads to two IP datagrams making it up the stack, 
> >>but I would have thought that was simply an "accident" of the situation that 
> >>cannot reasonably be prevented, not justification to cause scenario two to send 
> >>two datagrams up the stack.
> > 
> > 
> > For scenario 2, the routing code drops the 2nd packet.  As a result, no
> > packet makes it to the application.  If someone is willing to suggest an
> > alternative, I can certainly rework the patch and retest.
> 
> I'll ask my next potentially Emily Litella question - don't multicast IP 
> applications bind to multicast IP addresses and not interfaces?  That is to say, 
> doesn't the first completed datagram get delivered to all applications on the 
> host which have bound to the corresponding multicast IP (and port number...)?
I actually didn't know who Emily Litella was until today.  This mailing
list is great for learning more than just networking stuff :).  In the test
code I received, one of the setup steps is to configure the IP address
of the interface on which the application expects the packet.  It
appears to bind to the interface, based on that casual observation.  I'll
have to study the code in detail to be able to say for sure.

Regards,

Steve




* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-27 22:46 [PATCH] Multicast packet reassembly can fail Steve Chen
  2009-10-27 23:22 ` Rick Jones
  2009-10-28 10:18 ` Eric Dumazet
@ 2009-10-28 20:12 ` David Stevens
  2 siblings, 0 replies; 17+ messages in thread
From: David Stevens @ 2009-10-28 20:12 UTC (permalink / raw)
  To: Steve Chen; +Cc: netdev, netdev-owner

I haven't gone through the entire thread yet, but I should point
out that this appears to break regular IP fragmentation for
unicast packets. There is no restriction whatsoever that
fragments from a remote sender that are actually for
the same datagram need to be routed on the same paths
and received on the same input interface.

For the multicast case, if they are from the same datagram,
it doesn't matter how you got them. If it's a different datagram
with the same ID, which can happen anyway, the checksum
should fail (in at least (64K-1) of 64K cases). I don't see a special
case here, other than that you can tell by the interface whether it was
actually a distinct datagram with the same ID in the multicast
case (and only in multicast, and only if the different interfaces
are not in the same multicast routing domain).

NACK.

                                        +-DLS



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 10:18 ` Eric Dumazet
  2009-10-28 13:32   ` Steve Chen
@ 2009-10-29  4:57   ` David Miller
  2009-10-29  5:31     ` Eric Dumazet
  1 sibling, 1 reply; 17+ messages in thread
From: David Miller @ 2009-10-29  4:57 UTC (permalink / raw)
  To: eric.dumazet; +Cc: schen, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 28 Oct 2009 11:18:24 +0100

> Check line 219 of net/ipv4/inet_fragment.c
> 
> #ifdef CONFIG_SMP
>         /* With SMP race we have to recheck hash table, because
>          * such entry could be created on other cpu, while we
>          * promoted read lock to write lock.
>          */
>         hlist_for_each_entry(qp, n, &f->hash[hash], list) {
>                 if (qp->net == nf && f->match(qp, arg)) {
>                         atomic_inc(&qp->refcnt);
>                         write_unlock(&f->lock);
>                         qp_in->last_in |= INET_FRAG_COMPLETE;   <<< HERE >>>
>                         inet_frag_put(qp_in, f);
>                         return qp;
>                 }
>         }
> #endif
> 
> I really wonder why we set INET_FRAG_COMPLETE here

What has happened here is that another cpu created an identical
frag entry before we took the write lock.

So we're letting that other cpu's entry stand, and will release
our local one and not use it at all.

Setting INET_FRAG_COMPLETE does two things:

1) It makes sure input frag processing skips this entry if such
   code paths happen to see it for some reason.

2) INET_FRAG_COMPLETE must be set when inet_frag_destroy() gets
   called by inet_frag_put() when it drops the refcount to zero.
   There is an assertion on INET_FRAG_COMPLETE in inet_frag_destroy.

Hope that clears things up.
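
For reference, the put/destroy pair in question looks roughly like this
(paraphrased from inet_fragment.c / inet_frag.h of this era; only the parts
relevant to the assertion are shown):

static inline void inet_frag_put(struct inet_frag_queue *q,
				 struct inet_frags *f)
{
	if (atomic_dec_and_test(&q->refcnt))
		inet_frag_destroy(q, f, NULL);
}

void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
		       int *work)
{
	/* Destroying a queue that was never marked complete is a bug,
	 * hence INET_FRAG_COMPLETE must be set before the final put. */
	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* ... free the queued fragments and the queue itself ... */
}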


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-29  4:57   ` David Miller
@ 2009-10-29  5:31     ` Eric Dumazet
  0 siblings, 0 replies; 17+ messages in thread
From: Eric Dumazet @ 2009-10-29  5:31 UTC (permalink / raw)
  To: David Miller; +Cc: schen, netdev

David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 28 Oct 2009 11:18:24 +0100
> 
>> Check line 219 of net/ipv4/inet_fragment.c
>>
>> #ifdef CONFIG_SMP
>>         /* With SMP race we have to recheck hash table, because
>>          * such entry could be created on other cpu, while we
>>          * promoted read lock to write lock.
>>          */
>>         hlist_for_each_entry(qp, n, &f->hash[hash], list) {
>>                 if (qp->net == nf && f->match(qp, arg)) {
>>                         atomic_inc(&qp->refcnt);
>>                         write_unlock(&f->lock);
>>                         qp_in->last_in |= INET_FRAG_COMPLETE;   <<< HERE >>>
>>                         inet_frag_put(qp_in, f);
>>                         return qp;
>>                 }
>>         }
>> #endif
>>
>> I really wonder why we set INET_FRAG_COMPLETE here
> 
> What has happened here is that another cpu created an identical
> frag entry before we took the write lock.
> 
> So we're letting that other cpu's entry stand, and will release
> our local one and not use it at all.
> 
> Setting INET_FRAG_COMPLETE does two things:
> 
> 1) It makes sure input frag processing skips this entry if such
>    code paths happen to see it for some reason.
> 
> 2) INET_FRAG_COMPLETE must be set when inet_frag_destroy() gets
>    called by inet_frag_put() when it drops the refcount to zero.
>    There is an assertion on INET_FRAG_COMPLETE in inet_frag_destroy.
> 
> Hope that clears things up.


Yes thanks David, this is clear now.


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-28 18:40           ` Steve Chen
@ 2009-10-29 18:04             ` Herbert Xu
  2009-10-29 18:33               ` Steve Chen
  2009-11-02 18:36               ` Steve Chen
  0 siblings, 2 replies; 17+ messages in thread
From: Herbert Xu @ 2009-10-29 18:04 UTC (permalink / raw)
  To: Steve Chen; +Cc: rick.jones2, mhuth, netdev

Steve Chen <schen@mvista.com> wrote:
>
> of the interface on which the application expects the packet.  It
> appears to bind to the interface, based on that casual observation.  I'll
> have to study the code in detail to be able to say for sure.

Well if it does bind to the interface then that explains the
failure. And the fix is "if it hurts, don't do it" :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-29 18:04             ` Herbert Xu
@ 2009-10-29 18:33               ` Steve Chen
  2009-11-02 18:36               ` Steve Chen
  1 sibling, 0 replies; 17+ messages in thread
From: Steve Chen @ 2009-10-29 18:33 UTC (permalink / raw)
  To: Herbert Xu; +Cc: rick.jones2, mhuth, netdev

On Thu, 2009-10-29 at 14:04 -0400, Herbert Xu wrote:
> Steve Chen <schen@mvista.com> wrote:
> >
> > of the interface on which the application expects the packet.  It
> > appears to bind to the interface, based on that casual observation.  I'll
> > have to study the code in detail to be able to say for sure.
> 
> Well if it does bind to the interface then that explains the
> failure. And the fix is "if it hurts, don't do it" :)

I like that solution.  Maybe I can even use the first letter of every
line to send a "special" message to the customer :)

Steve



* Re: [PATCH] Multicast packet reassembly can fail
  2009-10-29 18:04             ` Herbert Xu
  2009-10-29 18:33               ` Steve Chen
@ 2009-11-02 18:36               ` Steve Chen
  1 sibling, 0 replies; 17+ messages in thread
From: Steve Chen @ 2009-11-02 18:36 UTC (permalink / raw)
  To: Herbert Xu; +Cc: rick.jones2, mhuth, David Stevens, Eric Dumazet, netdev

On Thu, 2009-10-29 at 14:04 -0400, Herbert Xu wrote:
> Steve Chen <schen@mvista.com> wrote:
> >
> > of the interface on which the application expects the packet.  It
> > appears to bind to the interface, based on that casual observation.  I'll
> > have to study the code in detail to be able to say for sure.
> 
> Well if it does bind to the interface then that explains the
> failure. And the fix is "if it hurts, don't do it" :)
> 
> Cheers,

The packet drop was tracked down to rp_filter.  All packets were received as
expected after disabling rp_filter.  Thank you all for the input.
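
For anyone who runs into the same thing, these are the sysctl knobs involved,
shown here being disabled (eth0/eth1 are just our interface names; the exact
list depends on the box):

net.ipv4.conf.all.rp_filter = 0
net.ipv4.conf.eth0.rp_filter = 0
net.ipv4.conf.eth1.rp_filter = 0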

Regards,

Steve


