* [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
@ 2018-05-19 12:07 Jon Rosen
  2018-05-20 22:51 ` Willem de Bruijn
  0 siblings, 1 reply; 14+ messages in thread
From: Jon Rosen @ 2018-05-19 12:07 UTC (permalink / raw)
  To: David S. Miller
  Cc: Jon Rosen, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
causes the ring to get corrupted by allowing multiple kernel threads
to claim ownership of the same ring entry. Track ownership in a shadow
ring structure to prevent other kernel threads from reusing the same
entry before it's fully filled in, passed to user space, and then
eventually passed back to the kernel for use with a new packet.

Signed-off-by: Jon Rosen <jrosen@cisco.com>
---

There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
it possible for multiple kernel threads to claim ownership of the same
ring entry, corrupting the ring and the corresponding packet(s).

These diffs are the second proposed solution; the previous proposal was described
in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html under the
subject [RFC PATCH] packet: mark ring entry as in-use inside spin_lock
to prevent RX ring overrun.

Those diffs would have changed the binary interface and broken certain
applications. Consensus was that such a change would be inappropriate.

These new diffs use a shadow ring in kernel space for tracking intermediate
state of an entry and prevent more than one kernel thread from simultaneously
allocating a ring entry. This avoids any impact to the binary interface
between kernel and userspace but comes at the additional cost of requiring a
second spin_lock when passing ownership of a ring entry to userspace.

Jon Rosen (1):
  packet: track ring entry use using a shadow ring to prevent RX ring
    overrun

 net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/internal.h  | 14 +++++++++++
 2 files changed, 78 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e0f3f4a..4d08c8e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2165,6 +2165,26 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	return 0;
 }
 
+static inline void *packet_rx_shadow_acquire_head(struct packet_sock *po)
+{
+	struct packet_ring_shadow_entry *entry;
+
+	entry = &po->rx_shadow.ring[po->rx_ring.head];
+	if (unlikely(entry->inuse))
+		return NULL;
+
+	entry->inuse = 1;
+	return (void *)entry;
+}
+
+static inline void packet_rx_shadow_release(void *_entry)
+{
+	struct packet_ring_shadow_entry *entry;
+
+	entry = (struct packet_ring_shadow_entry *)_entry;
+	entry->inuse = 0;
+}
+
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		       struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -2182,6 +2202,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	__u32 ts_status;
 	bool is_drop_n_account = false;
 	bool do_vnet = false;
+	void *rx_shadow_ring_entry = NULL;
 
 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 	 * We may add members to them until current aligned size without forcing
@@ -2277,7 +2298,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!h.raw)
 		goto drop_n_account;
 	if (po->tp_version <= TPACKET_V2) {
+		/* Attempt to allocate shadow ring entry.
+		 * If already inuse then the ring is full.
+		 */
+		rx_shadow_ring_entry = packet_rx_shadow_acquire_head(po);
+		if (unlikely(!rx_shadow_ring_entry))
+			goto ring_is_full;
+
 		packet_increment_rx_head(po, &po->rx_ring);
+
 	/*
 	 * LOSING will be reported till you read the stats,
 	 * because it's COR - Clear On Read.
@@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 #endif
 
 	if (po->tp_version <= TPACKET_V2) {
+		spin_lock(&sk->sk_receive_queue.lock);
 		__packet_set_status(po, h.raw, status);
+		packet_rx_shadow_release(rx_shadow_ring_entry);
+		spin_unlock(&sk->sk_receive_queue.lock);
+
 		sk->sk_data_ready(sk);
 	} else {
 		prb_clear_blk_fill_status(&po->rx_ring);
@@ -4197,6 +4230,25 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 	goto out;
 }
 
+static struct packet_ring_shadow_entry *
+		packet_rx_shadow_alloc(unsigned int tp_frame_nr)
+{
+	struct packet_ring_shadow_entry *rx_shadow_ring;
+	int ring_size;
+	int i;
+
+	ring_size = tp_frame_nr * sizeof(*rx_shadow_ring);
+	rx_shadow_ring = kmalloc(ring_size, GFP_KERNEL);
+
+	if (!rx_shadow_ring)
+		return NULL;
+
+	for (i = 0; i < tp_frame_nr; i++)
+		rx_shadow_ring[i].inuse = 0;
+
+	return rx_shadow_ring;
+}
+
 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
@@ -4209,6 +4261,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	int err = -EINVAL;
 	/* Added to avoid minimal code churn */
 	struct tpacket_req *req = &req_u->req;
+	struct packet_ring_shadow_entry *rx_shadow_ring = NULL;
 
 	lock_sock(sk);
 
@@ -4266,6 +4319,13 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			goto out;
 
 		err = -ENOMEM;
+		if (!tx_ring && po->tp_version <= TPACKET_V2) {
+			rx_shadow_ring =
+				packet_rx_shadow_alloc(req->tp_frame_nr);
+			if (!rx_shadow_ring)
+				goto out;
+		}
+
 		order = get_order(req->tp_block_size);
 		pg_vec = alloc_pg_vec(req, order);
 		if (unlikely(!pg_vec))
@@ -4319,6 +4379,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		rb->frame_max = (req->tp_frame_nr - 1);
 		rb->head = 0;
 		rb->frame_size = req->tp_frame_size;
+		if (!tx_ring)
+			swap(po->rx_shadow.ring, rx_shadow_ring);
 		spin_unlock_bh(&rb_queue->lock);
 
 		swap(rb->pg_vec_order, order);
@@ -4349,6 +4411,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
+	kfree(rx_shadow_ring);
+
 	release_sock(sk);
 	return err;
 }
diff --git a/net/packet/internal.h b/net/packet/internal.h
index a1d2b23..d1a965e 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -73,6 +73,14 @@ struct packet_ring_buffer {
 	struct tpacket_kbdq_core	prb_bdqc;
 };
 
+struct packet_ring_shadow_entry {
+	unsigned int		inuse;
+};
+
+struct packet_ring_shadow {
+	struct packet_ring_shadow_entry	*ring;
+};
+
 extern struct mutex fanout_mutex;
 #define PACKET_FANOUT_MAX	256
 
@@ -107,8 +115,14 @@ struct packet_sock {
 	struct sock		sk;
 	struct packet_fanout	*fanout;
 	union  tpacket_stats_u	stats;
+	/* Do not separate rx/tx ring structures.
+	 * They are treated as an array in af_packet.c:packet_mmap()
+	 */
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
+	/* end of rings */
+
+	struct packet_ring_shadow	rx_shadow;
 	int			copy_thresh;
 	spinlock_t		bind_lock;
 	struct mutex		pg_vec_lock;
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-19 12:07 [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun Jon Rosen
@ 2018-05-20 22:51 ` Willem de Bruijn
  2018-05-20 23:22   ` Willem de Bruijn
  0 siblings, 1 reply; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-20 22:51 UTC (permalink / raw)
  To: Jon Rosen
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Sat, May 19, 2018 at 8:07 AM, Jon Rosen <jrosen@cisco.com> wrote:
> Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
> causes the ring to get corrupted by allowing multiple kernel threads
> to claim ownership of the same ring entry. Track ownership in a shadow
> ring structure to prevent other kernel threads from reusing the same
> entry before it's fully filled in, passed to user space, and then
> eventually passed back to the kernel for use with a new packet.
>
> Signed-off-by: Jon Rosen <jrosen@cisco.com>
> ---
>
> There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
> the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
> it possible for multiple kernel threads to claim ownership of the same
> ring entry, corrupting the ring and the corresponding packet(s).
>
> These diffs are the second proposed solution, previous proposal was described
> in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html
> subject [RFC PATCH] packet: mark ring entry as in-use inside spin_lock
> to prevent RX ring overrun
>
> Those diffs would have changed the binary interface and have broken certain
> applications. Consensus was that such a change would be inappropriate.
>
> These new diffs use a shadow ring in kernel space for tracking intermediate
> state of an entry and prevent more than one kernel thread from simultaneously
> allocating a ring entry. This avoids any impact to the binary interface
> between kernel and userspace but comes at the additional cost of requiring a
> second spin_lock when passing ownership of a ring entry to userspace.
>
> Jon Rosen (1):
>   packet: track ring entry use using a shadow ring to prevent RX ring
>     overrun
>
>  net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/packet/internal.h  | 14 +++++++++++
>  2 files changed, 78 insertions(+)
>

> @@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>  #endif
>
>         if (po->tp_version <= TPACKET_V2) {
> +               spin_lock(&sk->sk_receive_queue.lock);
>                 __packet_set_status(po, h.raw, status);
> +               packet_rx_shadow_release(rx_shadow_ring_entry);
> +               spin_unlock(&sk->sk_receive_queue.lock);
> +
>                 sk->sk_data_ready(sk);

Thanks for continuing to look at this. I spent some time on it last time
around but got stuck, too.

This version takes an extra spinlock in the hot path. That will be very
expensive. Once we need to accept that, we could opt for a simpler
implementation akin to the one discussed in the previous thread:

stash a value in tp_padding or similar while tp_status remains
TP_STATUS_KERNEL to signal ownership to concurrent kernel
threads. The issue previously was that that field could not atomically
be cleared together with __packet_set_status. This is no longer
an issue when holding the queue lock.

With a field like tp_padding, unlike tp_len, it is arguably also safe to
clear it after flipping status (userspace should treat it as undefined).

With v1 tpacket_hdr, no explicit padding field is defined but due to
TPACKET_HDRLEN alignment it exists on both 32 and 64 bit
platforms.

The danger with using padding is that a process may write to it
and cause deadlock, of course. There is no logical reason for doing
so.
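
A minimal sketch of that idea, assuming the TPACKET_V2 header (which has an
explicit tp_padding field) and that both the claim and the release happen
under sk_receive_queue.lock; illustrative only, not tested:

	/* allocation path, under sk_receive_queue.lock: the slot is free
	 * only if userspace returned it (tp_status == TP_STATUS_KERNEL)
	 * and no other kernel thread has already claimed it
	 */
	if (h.h2->tp_padding[0])
		goto ring_is_full;
	h.h2->tp_padding[0] = 1;		/* claim the slot */
	packet_increment_rx_head(po, &po->rx_ring);

	/* completion path, also under sk_receive_queue.lock, so the two
	 * writes cannot be observed out of order by another producer
	 */
	__packet_set_status(po, h.raw, status);	/* hand slot to userspace */
	h.h2->tp_padding[0] = 0;		/* release the claim */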

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-20 22:51 ` Willem de Bruijn
@ 2018-05-20 23:22   ` Willem de Bruijn
  2018-05-21 12:57     ` Jon Rosen (jrosen)
  2018-05-23 11:54     ` Jon Rosen (jrosen)
  0 siblings, 2 replies; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-20 23:22 UTC (permalink / raw)
  To: Jon Rosen
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Sun, May 20, 2018 at 6:51 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Sat, May 19, 2018 at 8:07 AM, Jon Rosen <jrosen@cisco.com> wrote:
>> Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
>> causes the ring to get corrupted by allowing multiple kernel threads
>> to claim ownership of the same ring entry. Track ownership in a shadow
>> ring structure to prevent other kernel threads from reusing the same
>> entry before it's fully filled in, passed to user space, and then
>> eventually passed back to the kernel for use with a new packet.
>>
>> Signed-off-by: Jon Rosen <jrosen@cisco.com>
>> ---
>>
>> There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
>> the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
>> it possible for multiple kernel threads to claim ownership of the same
>> ring entry, corrupting the ring and the corresponding packet(s).
>>
>> These diffs are the second proposed solution, previous proposal was described
>> in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html
>> subject [RFC PATCH] packet: mark ring entry as in-use inside spin_lock
>> to prevent RX ring overrun
>>
>> Those diffs would have changed the binary interface and have broken certain
>> applications. Consensus was that such a change would be inappropriate.
>>
>> These new diffs use a shadow ring in kernel space for tracking intermediate
>> state of an entry and prevent more than one kernel thread from simultaneously
>> allocating a ring entry. This avoids any impact to the binary interface
>> between kernel and userspace but comes at the additional cost of requiring a
>> second spin_lock when passing ownership of a ring entry to userspace.
>>
>> Jon Rosen (1):
>>   packet: track ring entry use using a shadow ring to prevent RX ring
>>     overrun
>>
>>  net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  net/packet/internal.h  | 14 +++++++++++
>>  2 files changed, 78 insertions(+)
>>
>
>> @@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>>  #endif
>>
>>         if (po->tp_version <= TPACKET_V2) {
>> +               spin_lock(&sk->sk_receive_queue.lock);
>>                 __packet_set_status(po, h.raw, status);
>> +               packet_rx_shadow_release(rx_shadow_ring_entry);
>> +               spin_unlock(&sk->sk_receive_queue.lock);
>> +
>>                 sk->sk_data_ready(sk);
>
> Thanks for continuing to look at this. I spent some time on it last time
> around but got stuck, too.
>
> This version takes an extra spinlock in the hot path. That will be very
> expensive. Once we need to accept that, we could opt for a simpler
> implementation akin to the one discussed in the previous thread:
>
> stash a value in tp_padding or similar while tp_status remains
> TP_STATUS_KERNEL to signal ownership to concurrent kernel
> threads. The issue previously was that that field could not atomically
> be cleared together with __packet_set_status. This is no longer
> an issue when holding the queue lock.
>
> With a field like tp_padding, unlike tp_len, it is arguably also safe to
> clear it after flipping status (userspace should treat it as undefined).
>
> With v1 tpacket_hdr, no explicit padding field is defined but due to
> TPACKET_HDRLEN alignment it exists on both 32 and 64 bit
> platforms.
>
> The danger with using padding is that a process may write to it
> and cause deadlock, of course. There is no logical reason for doing
> so.

For the ring, there is no requirement to allocate exactly the amount
specified by the user request. Safer than relying on shared memory
and simpler than the extra allocation in this patch would be to allocate
extra shadow memory at the end of the ring (and not mmap that).

That still leaves an extra cold cacheline vs using tp_padding.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-20 23:22   ` Willem de Bruijn
@ 2018-05-21 12:57     ` Jon Rosen (jrosen)
  2018-05-21 17:07       ` Willem de Bruijn
  2018-05-23 11:54     ` Jon Rosen (jrosen)
  1 sibling, 1 reply; 14+ messages in thread
From: Jon Rosen (jrosen) @ 2018-05-21 12:57 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Sunday, May 20, 2018 7:22 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Sun, May 20, 2018 at 6:51 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Sat, May 19, 2018 at 8:07 AM, Jon Rosen <jrosen@cisco.com> wrote:
>>> Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
>>> causes the ring to get corrupted by allowing multiple kernel threads
>>> to claim ownership of the same ring entry. Track ownership in a shadow
>>> ring structure to prevent other kernel threads from reusing the same
>>> entry before it's fully filled in, passed to user space, and then
>>> eventually passed back to the kernel for use with a new packet.
>>>
>>> Signed-off-by: Jon Rosen <jrosen@cisco.com>
>>> ---
>>>
>>> There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
>>> the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
>>> it possible for multiple kernel threads to claim ownership of the same
>>> ring entry, corrupting the ring and the corresponding packet(s).
>>>
>>> These diffs are the second proposed solution, previous proposal was described
>>> in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html
>>> subject [RFC PATCH] packet: mark ring entry as in-use inside spin_lock
>>> to prevent RX ring overrun
>>>
>>> Those diffs would have changed the binary interface and have broken certain
>>> applications. Consensus was that such a change would be inappropriate.
>>>
>>> These new diffs use a shadow ring in kernel space for tracking intermediate
>>> state of an entry and prevent more than one kernel thread from simultaneously
>>> allocating a ring entry. This avoids any impact to the binary interface
>>> between kernel and userspace but comes at the additional cost of requiring a
>>> second spin_lock when passing ownership of a ring entry to userspace.
>>>
>>> Jon Rosen (1):
>>>   packet: track ring entry use using a shadow ring to prevent RX ring
>>>     overrun
>>>
>>>  net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  net/packet/internal.h  | 14 +++++++++++
>>>  2 files changed, 78 insertions(+)
>>>
>>
>>> @@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>>>  #endif
>>>
>>>         if (po->tp_version <= TPACKET_V2) {
>>> +               spin_lock(&sk->sk_receive_queue.lock);
>>>                 __packet_set_status(po, h.raw, status);
>>> +               packet_rx_shadow_release(rx_shadow_ring_entry);
>>> +               spin_unlock(&sk->sk_receive_queue.lock);
>>> +
>>>                 sk->sk_data_ready(sk);
>>
>> Thanks for continuing to look at this. I spent some time on it last time
>> around but got stuck, too.
>>
>> This version takes an extra spinlock in the hot path. That will be very
>> expensive. Once we need to accept that, we could opt for a simpler
>> implementation akin to the one discussed in the previous thread:
>>
>> stash a value in tp_padding or similar while tp_status remains
>> TP_STATUS_KERNEL to signal ownership to concurrent kernel
>> threads. The issue previously was that that field could not atomically
>> be cleared together with __packet_set_status. This is no longer
>> an issue when holding the queue lock.
>>
>> With a field like tp_padding, unlike tp_len, it is arguably also safe to
>> clear it after flipping status (userspace should treat it as undefined).
>>
>> With v1 tpacket_hdr, no explicit padding field is defined but due to
>> TPACKET_HDRLEN alignment it exists on both 32 and 64 bit
>> platforms.
>>
>> The danger with using padding is that a process may write to it
>> and cause deadlock, of course. There is no logical reason for doing
>> so.
>
> For the ring, there is no requirement to allocate exactly the amount
> specified by the user request. Safer than relying on shared memory
> and simpler than the extra allocation in this patch would be to allocate
> extra shadow memory at the end of the ring (and not mmap that).
>
> That still leaves an extra cold cacheline vs using tp_padding.

Given my lack of experience and knowledge in writing kernel code
it was easier for me to allocate the shadow ring as a separate
structure.  Of course it's not about me and my skills so if it's
more appropriate to allocate at the tail of the existing ring
then certainly I can look at doing that.

I think the bigger issues as you've pointed out are the cost of
the additional spin lock and should the additional state be
stored in-band (fewer cache lines) or out-of band (less risk of
breaking due to unpredictable application behavior).

As much as I would like to find a solution that doesn't require
the spin lock I have yet to do so. Maybe the answer is that
existing applications will need to suffer the performance impact
but a new version or option for TPACKET_V1/V2 could be added to
indicate strict adherence of the TP_STATUS_USER bit and then the
original diffs could be used.

There is another option I was considering but have yet to try
which would avoid needing a shadow ring by using counter(s) to
track maximum sequence number queued to userspace vs. the next
sequence number to be allocated in the ring.  If the difference
is greater than the size of the ring then the ring can be
considered full and the allocation would fail. Of course this may
create an additional hotspot between cores, not sure if that
would be significant or not.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-21 12:57     ` Jon Rosen (jrosen)
@ 2018-05-21 17:07       ` Willem de Bruijn
  2018-05-21 18:16         ` Jon Rosen (jrosen)
  2018-05-22 14:12         ` Jon Rosen (jrosen)
  0 siblings, 2 replies; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-21 17:07 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Mon, May 21, 2018 at 8:57 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
> On Sunday, May 20, 2018 7:22 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Sun, May 20, 2018 at 6:51 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Sat, May 19, 2018 at 8:07 AM, Jon Rosen <jrosen@cisco.com> wrote:
>>>> Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
>>>> causes the ring to get corrupted by allowing multiple kernel threads
>>>> to claim ownership of the same ring entry. Track ownership in a shadow
>>>> ring structure to prevent other kernel threads from reusing the same
>>>> entry before it's fully filled in, passed to user space, and then
>>>> eventually passed back to the kernel for use with a new packet.
>>>>
>>>> Signed-off-by: Jon Rosen <jrosen@cisco.com>
>>>> ---
>>>>
>>>> There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
>>>> the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
>>>> it possible for multiple kernel threads to claim ownership of the same
>>>> ring entry, corrupting the ring and the corresponding packet(s).
>>>>
>>>> These diffs are the second proposed solution, previous proposal was described
>>>> in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html
>>>> subject [RFC PATCH] packet: mark ring entry as in-use inside spin_lock
>>>> to prevent RX ring overrun
>>>>
>>>> Those diffs would have changed the binary interface and have broken certain
>>>> applications. Consensus was that such a change would be inappropriate.
>>>>
>>>> These new diffs use a shadow ring in kernel space for tracking intermediate
>>>> state of an entry and prevent more than one kernel thread from simultaneously
>>>> allocating a ring entry. This avoids any impact to the binary interface
>>>> between kernel and userspace but comes at the additional cost of requiring a
>>>> second spin_lock when passing ownership of a ring entry to userspace.
>>>>
>>>> Jon Rosen (1):
>>>>   packet: track ring entry use using a shadow ring to prevent RX ring
>>>>     overrun
>>>>
>>>>  net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  net/packet/internal.h  | 14 +++++++++++
>>>>  2 files changed, 78 insertions(+)
>>>>
>>>
>>>> @@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>>>>  #endif
>>>>
>>>>         if (po->tp_version <= TPACKET_V2) {
>>>> +               spin_lock(&sk->sk_receive_queue.lock);
>>>>                 __packet_set_status(po, h.raw, status);
>>>> +               packet_rx_shadow_release(rx_shadow_ring_entry);
>>>> +               spin_unlock(&sk->sk_receive_queue.lock);
>>>> +
>>>>                 sk->sk_data_ready(sk);
>>>
>>> Thanks for continuing to look at this. I spent some time on it last time
>>> around but got stuck, too.
>>>
>>> This version takes an extra spinlock in the hot path. That will be very
>>> expensive. Once we need to accept that, we could opt for a simpler
>>> implementation akin to the one discussed in the previous thread:
>>>
>>> stash a value in tp_padding or similar while tp_status remains
>>> TP_STATUS_KERNEL to signal ownership to concurrent kernel
>>> threads. The issue previously was that that field could not atomically
>>> be cleared together with __packet_set_status. This is no longer
>>> an issue when holding the queue lock.
>>>
>>> With a field like tp_padding, unlike tp_len, it is arguably also safe to
>>> clear it after flipping status (userspace should treat it as undefined).
>>>
>>> With v1 tpacket_hdr, no explicit padding field is defined but due to
>>> TPACKET_HDRLEN alignment it exists on both 32 and 64 bit
>>> platforms.
>>>
>>> The danger with using padding is that a process may write to it
>>> and cause deadlock, of course. There is no logical reason for doing
>>> so.
>>
>> For the ring, there is no requirement to allocate exactly the amount
>> specified by the user request. Safer than relying on shared memory
>> and simpler than the extra allocation in this patch would be to allocate
>> extra shadow memory at the end of the ring (and not mmap that).
>>
>> That still leaves an extra cold cacheline vs using tp_padding.
>
> Given my lack of experience and knowledge in writing kernel code
> it was easier for me to allocate the shadow ring as a separate
> structure.  Of course it's not about me and my skills so if it's
> more appropriate to allocate at the tail of the existing ring
> then certainly I can look at doing that.
>
> I think the bigger issues as you've pointed out are the cost of
> the additional spin lock and should the additional state be
> stored in-band (fewer cache lines) or out-of band (less risk of
> breaking due to unpredictable application behavior).

We don't need the spinlock if clearing the shadow byte after
setting the status to user.

Worst case, user will set it back to kernel while the shadow
byte is not cleared yet and the next producer will drop a packet.
But next producers will make progress, so there is no deadlock
or corruption.

It probably does require a shadow structure as opposed to a
padding byte to work with the long tail of (possibly broken)
applications, sadly.

A setsockopt for userspace to signal a stricter interpretation of
tp_status to elide the shadow hack could then be considered.
It's not pretty. Either way, no full new version is required.
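
For illustration, such an opt-in might look something like this from userspace
(the option name here is made up, nothing like it exists today):

	/* tell the kernel this ring strictly honors TP_STATUS_USER /
	 * TP_STATUS_KERNEL ownership, so it may skip any shadow tracking
	 */
	int one = 1;
	setsockopt(fd, SOL_PACKET, PACKET_RX_STRICT_STATUS, &one, sizeof(one));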

> As much as I would like to find a solution that doesn't require
> the spin lock I have yet to do so. Maybe the answer is that
> existing applications will need to suffer the performance impact
> but a new version or option for TPACKET_V1/V2 could be added to
> indicate strict adherence of the TP_STATUS_USER bit and then the
> original diffs could be used.
>
> There is another option I was considering but have yet to try
> which would avoid needing a shadow ring by using counter(s) to
> track maximum sequence number queued to userspace vs. the next
> sequence number to be allocated in the ring.  If the difference
> is greater than the size of the ring then the ring can be
> considered full and the allocation would fail. Of course this may
> create an additional hotspot between cores, not sure if that
> would be significant or not.

Please do have a look, but I don't think that this will work in this
case in practice. It requires tracking the producer tail. Updating
the slowest writer requires probing each subsequent slot's status
byte to find the new tail, which is a lot of (by then cold) cacheline
reads.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-21 17:07       ` Willem de Bruijn
@ 2018-05-21 18:16         ` Jon Rosen (jrosen)
  2018-05-22 15:41           ` Willem de Bruijn
  2018-05-22 14:12         ` Jon Rosen (jrosen)
  1 sibling, 1 reply; 14+ messages in thread
From: Jon Rosen (jrosen) @ 2018-05-21 18:16 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Monday, May 21, 2018 1:07 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>On Mon, May 21, 2018 at 8:57 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>> On Sunday, May 20, 2018 7:22 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Sun, May 20, 2018 at 6:51 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Sat, May 19, 2018 at 8:07 AM, Jon Rosen <jrosen@cisco.com> wrote:
>>>>> Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
>>>>> causes the ring to get corrupted by allowing multiple kernel threads
>>>>> to claim ownership of the same ring entry. Track ownership in a shadow
>>>>> ring structure to prevent other kernel threads from reusing the same
>>>>> entry before it's fully filled in, passed to user space, and then
>>>>> eventually passed back to the kernel for use with a new packet.
>>>>>
>>>>> Signed-off-by: Jon Rosen <jrosen@cisco.com>
>>>>> ---
>>>>>
>>>>> There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
>>>>> the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
>>>>> it possible for multiple kernel threads to claim ownership of the same
>>>>> ring entry, corrupting the ring and the corresponding packet(s).
>>>>>
>>>>> These diffs are the second proposed solution, previous proposal was described
>>>>> in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html
>>>>> subject [RFC PATCH] packet: mark ring entry as in-use inside spin_lock
>>>>> to prevent RX ring overrun
>>>>>
>>>>> Those diffs would have changed the binary interface and have broken certain
>>>>> applications. Consensus was that such a change would be inappropriate.
>>>>>
>>>>> These new diffs use a shadow ring in kernel space for tracking intermediate
>>>>> state of an entry and prevent more than one kernel thread from simultaneously
>>>>> allocating a ring entry. This avoids any impact to the binary interface
>>>>> between kernel and userspace but comes at the additional cost of requiring a
>>>>> second spin_lock when passing ownership of a ring entry to userspace.
>>>>>
>>>>> Jon Rosen (1):
>>>>>   packet: track ring entry use using a shadow ring to prevent RX ring
>>>>>     overrun
>>>>>
>>>>>  net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  net/packet/internal.h  | 14 +++++++++++
>>>>>  2 files changed, 78 insertions(+)
>>>>>
>>>>
>>>>> @@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>>>>>  #endif
>>>>>
>>>>>         if (po->tp_version <= TPACKET_V2) {
>>>>> +               spin_lock(&sk->sk_receive_queue.lock);
>>>>>                 __packet_set_status(po, h.raw, status);
>>>>> +               packet_rx_shadow_release(rx_shadow_ring_entry);
>>>>> +               spin_unlock(&sk->sk_receive_queue.lock);
>>>>> +
>>>>>                 sk->sk_data_ready(sk);
>>>>
>>>> Thanks for continuing to look at this. I spent some time on it last time
>>>> around but got stuck, too.
>>>>
>>>> This version takes an extra spinlock in the hot path. That will be very
>>>> expensive. Once we need to accept that, we could opt for a simpler
>>>> implementation akin to the one discussed in the previous thread:
>>>>
>>>> stash a value in tp_padding or similar while tp_status remains
>>>> TP_STATUS_KERNEL to signal ownership to concurrent kernel
>>>> threads. The issue previously was that that field could not atomically
>>>> be cleared together with __packet_set_status. This is no longer
>>>> an issue when holding the queue lock.
>>>>
>>>> With a field like tp_padding, unlike tp_len, it is arguably also safe to
>>>> clear it after flipping status (userspace should treat it as undefined).
>>>>
>>>> With v1 tpacket_hdr, no explicit padding field is defined but due to
>>>> TPACKET_HDRLEN alignment it exists on both 32 and 64 bit
>>>> platforms.
>>>>
>>>> The danger with using padding is that a process may write to it
>>>> and cause deadlock, of course. There is no logical reason for doing
>>>> so.
>>>
>>> For the ring, there is no requirement to allocate exactly the amount
>>> specified by the user request. Safer than relying on shared memory
>>> and simpler than the extra allocation in this patch would be to allocate
>>> extra shadow memory at the end of the ring (and not mmap that).
>>>
>>> That still leaves an extra cold cacheline vs using tp_padding.
>>
>> Given my lack of experience and knowledge in writing kernel code
>> it was easier for me to allocate the shadow ring as a separate
>> structure.  Of course it's not about me and my skills so if it's
>> more appropriate to allocate at the tail of the existing ring
>> then certainly I can look at doing that.
>>
>> I think the bigger issues as you've pointed out are the cost of
>> the additional spin lock and should the additional state be
>> stored in-band (fewer cache lines) or out-of band (less risk of
>> breaking due to unpredictable application behavior).
>
> We don't need the spinlock if clearing the shadow byte after
> setting the status to user.
>
> Worst case, user will set it back to kernel while the shadow
> byte is not cleared yet and the next producer will drop a packet.
> But next producers will make progress, so there is no deadlock
> or corruption.

I thought so too for a while but after spending more time than I
care to admit I realized the following sequence was occurring:

   Core A                       Core B
   ------                       ------
   - Enter spin_lock
   -   Get tp_status of head (X)
       tp_status == 0
   -   Check inuse
       inuse == 0
   -   Allocate entry X
       advance head (X+1)
       set inuse=1
   - Exit spin_lock

     <very long delay>

                                <allocate N-1 entries
                                where N = size of ring>

                                - Enter spin_lock
                                -   get tp_status of head (X+N)
                                    tp_status == 0 (but slot
                                    in use for X on core A)

   - write tp_status of         <--- trouble!
     X = TP_STATUS_USER         <--- trouble!
   - write inuse=0              <--- trouble!

                                -   Check inuse
                                    inuse == 0
                                -   Allocate entry X+N
                                    advance head (X+N+1)
                                    set inuse=1
                                - Exit spin_lock


At this point Core A just passed slot X to userspace with a
packet and Core B has just been assigned slot X+N (same slot as
X) for its new packet. Both cores A and B end up filling in that
slot.  Tracking this down was one of the reasons it took me a
while to produce these updated diffs.


>
> It probably does require a shadow structure as opposed to a
> padding byte to work with the long tail of (possibly broken)
> applications, sadly.

I agree.

>
> A setsockopt for userspace to signal a stricter interpretation of
> tp_status to elide the shadow hack could then be considered.
> It's not pretty. Either way, no full new version is required.
>
>> As much as I would like to find a solution that doesn't require
>> the spin lock I have yet to do so. Maybe the answer is that
>> existing applications will need to suffer the performance impact
>> but a new version or option for TPACKET_V1/V2 could be added to
>> indicate strict adherence of the TP_STATUS_USER bit and then the
>> original diffs could be used.
>>
>> There is another option I was considering but have yet to try
>> which would avoid needing a shadow ring by using counter(s) to
>> track maximum sequence number queued to userspace vs. the next
>> sequence number to be allocated in the ring.  If the difference
>> is greater than the size of the ring then the ring can be
>> considered full and the allocation would fail. Of course this may
>> create an additional hotspot between cores, not sure if that
>> would be significant or not.
>
> Please do have a look, but I don't think that this will work in this
> case in practice. It requires tracking the producer tail. Updating
> the slowest writer requires probing each subsequent slot's status
> byte to find the new tail, which is a lot of (by then cold) cacheline
> reads.

I've thought about it a little more and am not convinced it's
workable but I'll spend a little more time on it before giving
up.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-21 17:07       ` Willem de Bruijn
  2018-05-21 18:16         ` Jon Rosen (jrosen)
@ 2018-05-22 14:12         ` Jon Rosen (jrosen)
  2018-05-22 15:46           ` Willem de Bruijn
  1 sibling, 1 reply; 14+ messages in thread
From: Jon Rosen (jrosen) @ 2018-05-22 14:12 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Monday, May 21, 2018 2:17 PM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
> On Monday, May 21, 2018 1:07 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Mon, May 21, 2018 at 8:57 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:

...snip...

>>
>> A setsockopt for userspace to signal a stricter interpretation of
>> tp_status to elide the shadow hack could then be considered.
>> It's not pretty. Either way, no full new version is required.
>>
>>> As much as I would like to find a solution that doesn't require
>>> the spin lock I have yet to do so. Maybe the answer is that
>>> existing applications will need to suffer the performance impact
>>> but a new version or option for TPACKET_V1/V2 could be added to
>>> indicate strict adherence of the TP_STATUS_USER bit and then the
>>> original diffs could be used.

It looks like adding new socket options is pretty rare so I
wonder if a better option might be to define a new TP_STATUS_XXX
bit which would signal from a userspace application to the kernel
that it strictly interprets the TP_STATUS_USER bit to determine
ownership.

Today's applications set tp_status = TP_STATUS_KERNEL (0) for the
kernel to pick up the entry.  We could define a new value to pass
ownership as well as one to indicate to other kernel threads that
an entry is inuse:

        #define TP_STATUS_USER_TO_KERNEL        (1 << 8)
        #define TP_STATUS_INUSE                 (1 << 9)

If the kernel sees tp_status == TP_STATUS_KERNEL then it should
use the shadow method for tracking ownership. If it sees tp_status
== TP_STATUS_USER_TO_KERNEL then it can use the TP_STATUS_INUSE
method.
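
A rough sketch of how the allocation path might branch on this (only an
illustration of the proposal above, not working code):

	status = __packet_get_status(po, h.raw);
	if (status == TP_STATUS_USER_TO_KERNEL) {
		/* application opted in: mark the slot busy in-band */
		__packet_set_status(po, h.raw, TP_STATUS_INUSE);
	} else if (status == TP_STATUS_KERNEL) {
		/* legacy application: fall back to the shadow ring */
		if (unlikely(!packet_rx_shadow_acquire_head(po)))
			goto ring_is_full;
	} else {
		/* slot still owned by userspace or another kernel thread */
		goto ring_is_full;
	}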

>>>
>>> There is another option I was considering but have yet to try
>>> which would avoid needing a shadow ring by using counter(s) to
>>> track maximum sequence number queued to userspace vs. the next
>>> sequence number to be allocated in the ring.  If the difference
>>> is greater than the size of the ring then the ring can be
>>> considered full and the allocation would fail. Of course this may
>>> create an additional hotspot between cores, not sure if that
>>> would be significant or not.
>>
>> Please do have a look, but I don't think that this will work in this
>> case in practice. It requires tracking the producer tail. Updating
>> the slowest writer requires probing each subsequent slot's status
>> byte to find the new tail, which is a lot of (by then cold) cacheline
>> reads.
>
> I've thought about it a little more and am not convinced it's
> workable but I'll spend a little more time on it before giving
> up.

I've given up on this method.  Just don't see how to make it work.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-21 18:16         ` Jon Rosen (jrosen)
@ 2018-05-22 15:41           ` Willem de Bruijn
  2018-05-23 11:08             ` Jon Rosen (jrosen)
  0 siblings, 1 reply; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-22 15:41 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

>>> I think the bigger issues as you've pointed out are the cost of
>>> the additional spin lock and should the additional state be
>>> stored in-band (fewer cache lines) or out-of band (less risk of
>>> breaking due to unpredictable application behavior).
>>
>> We don't need the spinlock if clearing the shadow byte after
>> setting the status to user.
>>
>> Worst case, user will set it back to kernel while the shadow
>> byte is not cleared yet and the next producer will drop a packet.
>> But next producers will make progress, so there is no deadlock
>> or corruption.
>
> I thought so too for a while but after spending more time than I
> care to admit I realized the following sequence was occurring:
>
>    Core A                       Core B
>    ------                       ------
>    - Enter spin_lock
>    -   Get tp_status of head (X)
>        tp_status == 0
>    -   Check inuse
>        inuse == 0
>    -   Allocate entry X
>        advance head (X+1)
>        set inuse=1
>    - Exit spin_lock
>
>      <very long delay>
>
>                                 <allocate N-1 entries
>                                 where N = size of ring>
>
>                                 - Enter spin_lock
>                                 -   get tp_status of head (X+N)
>                                     tp_status == 0 (but slot
>                                     in use for X on core A)
>
>    - write tp_status of         <--- trouble!
>      X = TP_STATUS_USER         <--- trouble!
>    - write inuse=0              <--- trouble!
>
>                                 -   Check inuse
>                                     inuse == 0
>                                 -   Allocate entry X+N
>                                     advance head (X+N+1)
>                                     set inuse=1
>                                 - Exit spin_lock
>
>
> At this point Core A just passed slot X to userspace with a
> packet and Core B has just been assigned slot X+N (same slot as
> X) for its new packet. Both cores A and B end up filling in that
> slot.  Tracking this down was one of the reasons it took me a
> while to produce these updated diffs.

Is this not just an ordering issue? Since inuse is set after tp_status,
it has to be tested first (and barriers are needed to avoid reordering).
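
Roughly, with the shadow byte and no extra lock, the ordering would have to
look like this (sketch only, barrier placement untested; entry is
&po->rx_shadow.ring[...] for the slot in question):

	/* completion: publish the frame before releasing the claim */
	__packet_set_status(po, h.raw, status);		/* TP_STATUS_USER */
	smp_wmb();
	entry->inuse = 0;

	/* allocation (under sk_receive_queue.lock): test the claim before
	 * trusting tp_status
	 */
	if (entry->inuse)
		goto ring_is_full;	/* previous producer still filling the slot */
	smp_rmb();
	if (__packet_get_status(po, h.raw) != TP_STATUS_KERNEL)
		goto ring_is_full;	/* still owned by userspace */
	entry->inuse = 1;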

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-22 14:12         ` Jon Rosen (jrosen)
@ 2018-05-22 15:46           ` Willem de Bruijn
  0 siblings, 0 replies; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-22 15:46 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Tue, May 22, 2018 at 10:12 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
> On Monday, May 21, 2018 2:17 PM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>> On Monday, May 21, 2018 1:07 PM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Mon, May 21, 2018 at 8:57 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>
> ...snip...
>
>>>
>>> A setsockopt for userspace to signal a stricter interpretation of
>>> tp_status to elide the shadow hack could then be considered.
>>> It's not pretty. Either way, no full new version is required.
>>>
>>>> As much as I would like to find a solution that doesn't require
>>>> the spin lock I have yet to do so. Maybe the answer is that
>>>> existing applications will need to suffer the performance impact
>>>> but a new version or option for TPACKET_V1/V2 could be added to
>>>> indicate strict adherence of the TP_STATUS_USER bit and then the
>>>> original diffs could be used.
>
> It looks like adding new socket options is pretty rare so I
> wonder if a better option might be to define a new TP_STATUS_XXX
> bit which would signal from a userspace application to the kernel
> that it strictly interprets the TP_STATUS_USER bit to determine
> ownership.
>
> Today's applications set tp_status = TP_STATUS_KERNEL (0) for the
> kernel to pick up the entry.  We could define a new value to pass
> ownership as well as one to indicate to other kernel threads that
> an entry is inuse:
>
>         #define TP_STATUS_USER_TO_KERNEL        (1 << 8)
>         #define TP_STATUS_INUSE                 (1 << 9)
>
> If the kernel sees tp_status == TP_STATUS_KERNEL then it should
> use the shadow method for tracking ownership. If it sees tp_status
> == TP_STATUS_USER_TO_KERNEL then it can use the TP_STATUS_INUSE
> method.

Interesting idea. Userspace would have to consistently follow the
new behavior for all slots, which is hard to enforce. And as this is
checked at runtime, the kernel always has to defensively allocate
the shadow memory. I do think that a per-ring option set before
ring allocation is simpler. Either way, this optimization would be a
separate follow-on patch.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-22 15:41           ` Willem de Bruijn
@ 2018-05-23 11:08             ` Jon Rosen (jrosen)
  0 siblings, 0 replies; 14+ messages in thread
From: Jon Rosen (jrosen) @ 2018-05-23 11:08 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

> >>> I think the bigger issues as you've pointed out are the cost of
> >>> the additional spin lock and should the additional state be
> >>> stored in-band (fewer cache lines) or out-of band (less risk of
> >>> breaking due to unpredictable application behavior).
> >>
> >> We don't need the spinlock if clearing the shadow byte after
> >> setting the status to user.
> >>
> >> Worst case, user will set it back to kernel while the shadow
> >> byte is not cleared yet and the next producer will drop a packet.
> >> But next producers will make progress, so there is no deadlock
> >> or corruption.
> >
> > I thought so too for a while but after spending more time than I
> > care to admit I realized the following sequence was occurring:
> >
> >    Core A                       Core B
> >    ------                       ------
> >    - Enter spin_lock
> >    -   Get tp_status of head (X)
> >        tp_status == 0
> >    -   Check inuse
> >        inuse == 0
> >    -   Allocate entry X
> >        advance head (X+1)
> >        set inuse=1
> >    - Exit spin_lock
> >
> >      <very long delay>
> >
> >                                 <allocate N-1 entries
> >                                 where N = size of ring>
> >
> >                                 - Enter spin_lock
> >                                 -   get tp_status of head (X+N)
> >                                     tp_status == 0 (but slot
> >                                     in use for X on core A)
> >
> >    - write tp_status of         <--- trouble!
> >      X = TP_STATUS_USER         <--- trouble!
> >    - write inuse=0              <--- trouble!
> >
> >                                 -   Check inuse
> >                                     inuse == 0
> >                                 -   Allocate entry X+N
> >                                     advance head (X+N+1)
> >                                     set inuse=1
> >                                 - Exit spin_lock
> >
> >
> > At this point Core A just passed slot X to userspace with a
> > packet and Core B has just been assigned slot X+N (same slot as
> > X) for its new packet. Both cores A and B end up filling in that
> > slot.  Tracking this down was one of the reasons it took me a
> > while to produce these updated diffs.
> 
> Is this not just an ordering issue? Since inuse is set after tp_status,
> it has to be tested first (and barriers are needed to avoid reordering).

I changed the code as you suggested to do the inuse check first and
removed the extra spin_lock/unlock, and it seems to be working.
I was able to run through the night without an issue (normally I would
hit the ring corruption in 1 to 2 hours).

Thanks for pointing that out, I should have caught that myself.  Next
I'll look at your suggestion for where to put the shadow ring.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-20 23:22   ` Willem de Bruijn
  2018-05-21 12:57     ` Jon Rosen (jrosen)
@ 2018-05-23 11:54     ` Jon Rosen (jrosen)
  2018-05-23 13:37       ` Willem de Bruijn
  1 sibling, 1 reply; 14+ messages in thread
From: Jon Rosen (jrosen) @ 2018-05-23 11:54 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

> > For the ring, there is no requirement to allocate exactly the amount
> > specified by the user request. Safer than relying on shared memory
> > and simpler than the extra allocation in this patch would be to allocate
> > extra shadow memory at the end of the ring (and not mmap that).
> >
> > That still leaves an extra cold cacheline vs using tp_padding.
> 
> Given my lack of experience and knowledge in writing kernel code
> it was easier for me to allocate the shadow ring as a separate
> structure.  Of course it's not about me and my skills so if it's
> more appropriate to allocate at the tail of the existing ring
> then certainly I can look at doing that.

The memory for the ring is not one contiguous block, it's an array of
blocks of pages (or 'order' sized blocks of pages). I don't think
increasing the size of each of the blocks to provide storage would be
such a good idea as it will risk spilling over into the next order and
wasting lots of memory. I suspect it's also more complex than a single
shadow ring to do both the allocation and the access.

It could be tacked onto the end of the pg_vec[] used to store the
pointers to the blocks. The challenge with that is that a pg_vec[] is
created for each of RX and TX rings so either it would have to
allocate unnecessary storage for TX or the caller will have to say if
extra space should be allocated or not.  E.g.:

static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)

I'm not sure avoiding the extra allocation and moving it to the
pg_vec[] for the RX ring is going to get the simplification you were
hoping for.  Is there another way of storing the shadow ring which
I should consider?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-23 11:54     ` Jon Rosen (jrosen)
@ 2018-05-23 13:37       ` Willem de Bruijn
  2018-05-23 15:29         ` Jon Rosen (jrosen)
  0 siblings, 1 reply; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-23 13:37 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Wed, May 23, 2018 at 7:54 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>> > For the ring, there is no requirement to allocate exactly the amount
>> > specified by the user request. Safer than relying on shared memory
>> > and simpler than the extra allocation in this patch would be to allocate
>> > extra shadow memory at the end of the ring (and not mmap that).
>> >
>> > That still leaves an extra cold cacheline vs using tp_padding.
>>
>> Given my lack of experience and knowledge in writing kernel code
>> it was easier for me to allocate the shadow ring as a separate
>> structure.  Of course it's not about me and my skills so if it's
>> more appropriate to allocate at the tail of the existing ring
>> then certainly I can look at doing that.
>
> The memory for the ring is not one contiguous block, it's an array of
> blocks of pages (or 'order' sized blocks of pages). I don't think
> increasing the size of each of the blocks to provide storage would be
> such a good idea as it will risk spilling over into the next order and
> wasting lots of memory. I suspect it's also more complex than a single
> shadow ring to do both the allocation and the access.
>
> It could be tacked onto the end of the pg_vec[] used to store the
> pointers to the blocks. The challenge with that is that a pg_vec[] is
> created for each of RX and TX rings so either it would have to
> allocate unnecessary storage for TX or the caller will have to say if
> extra space should be allocated or not.  E.g.:
>
> static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)
>
> I'm not sure avoiding the extra allocation and moving it to the
> pg_vec[] for the RX ring is going to get the simplification you were
> hoping for.  Is there another way of storing the shadow ring which
> I should consider?

I did indeed mean attaching extra pages to pg_vec[]. It should be
simpler than a separate structure, but I may be wrong.

Either way, I still would prefer to avoid the shadow buffer completely.
It incurs complexity and cycle cost on all users because of only the
rare (non-existent?) consumer that overwrites the padding bytes.

Perhaps we can use padding yet avoid deadlock by writing a
timed value. The simplest would be jiffies >> N. Then only a
process that writes this exact value would be subject to drops and
then still only for a limited period.

Instead of depending on wall clock time, like jiffies, another option
would be to keep a percpu array of values. Each cpu has a zero
entry if it is not writing, nonzero if it is. If a writer encounters a
number in padding that is > num_cpus, then the state is garbage
from userspace. If <= num_cpus, it is adhered to only until that cpu
clears its entry, which is guaranteed to happen eventually.

Just a quick thought. This might not fly at all upon closer scrutiny.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-23 13:37       ` Willem de Bruijn
@ 2018-05-23 15:29         ` Jon Rosen (jrosen)
  2018-05-23 15:53           ` Willem de Bruijn
  0 siblings, 1 reply; 14+ messages in thread
From: Jon Rosen (jrosen) @ 2018-05-23 15:29 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list



> -----Original Message-----
> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
> Sent: Wednesday, May 23, 2018 9:37 AM
> To: Jon Rosen (jrosen) <jrosen@cisco.com>
> Cc: David S. Miller <davem@davemloft.net>; Willem de Bruijn <willemb@google.com>; Eric Dumazet
> <edumazet@google.com>; Kees Cook <keescook@chromium.org>; David Windsor <dwindsor@gmail.com>; Rosen,
> Rami <rami.rosen@intel.com>; Reshetova, Elena <elena.reshetova@intel.com>; Mike Maloney
> <maloney@google.com>; Benjamin Poirier <bpoirier@suse.com>; Thomas Gleixner <tglx@linutronix.de>; Greg
> Kroah-Hartman <gregkh@linuxfoundation.org>; open list:NETWORKING [GENERAL] <netdev@vger.kernel.org>;
> open list <linux-kernel@vger.kernel.org>
> Subject: Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
> 
> On Wed, May 23, 2018 at 7:54 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
> >> > For the ring, there is no requirement to allocate exactly the amount
> >> > specified by the user request. Safer than relying on shared memory
> >> > and simpler than the extra allocation in this patch would be to allocate
> >> > extra shadow memory at the end of the ring (and not mmap that).
> >> >
> >> > That still leaves an extra cold cacheline vs using tp_padding.
> >>
> >> Given my lack of experience and knowledge in writing kernel code
> >> it was easier for me to allocate the shadow ring as a separate
> >> structure.  Of course it's not about me and my skills so if it's
> >> more appropriate to allocate at the tail of the existing ring
> >> then certainly I can look at doing that.
> >
> > The memory for the ring is not one contiguous block, it's an array of
> > blocks of pages (or 'order' sized blocks of pages). I don't think
> > increasing the size of each of the blocks to provide storage would be
> > such a good idea as it will risk spilling over into the next order and
> > wasting lots of memory. I suspect it's also more complex than a single
> > shadow ring to do both the allocation and the access.
> >
> > It could be tacked onto the end of the pg_vec[] used to store the
> > pointers to the blocks. The challenge with that is that a pg_vec[] is
> > created for each of RX and TX rings so either it would have to
> > allocate unnecessary storage for TX or the caller will have to say if
> > extra space should be allocated or not.  E.g.:
> >
> > static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)
> >
> > I'm not sure avoiding the extra allocation and moving it to the
> > pg_vec[] for the RX ring is going to get the simplification you were
> > hoping for.  Is there another way of storing the shadow ring which
> > I should consider?
> 
> I did indeed mean attaching extra pages to pg_vec[]. It should be
> simpler than a separate structure, but I may be wrong.

I don't think it would be too bad; it may actually turn out to be
convenient to implement.

> 
> Either way, I still would prefer to avoid the shadow buffer completely.
> It incurs complexity and cycle cost on all users because of only the
> rare (non-existent?) consumer that overwrites the padding bytes.

I prefer that as well.  I'm just not sure there is a bulletproof
solution without the shadow state.  I also wish it were only a
theoretical issue, but unfortunately it is actually something our
customers have seen.
> 
> Perhaps we can use padding yet avoid deadlock by writing a
> timed value. The simplest would be jiffies >> N. Then only a
> process that writes this exact value would be subject to drops and
> then still only for a limited period.
> 
> Instead of depending on wall clock time, like jiffies, another option
> would be to keep a percpu array of values. Each cpu has a zero
> entry if it is not writing, nonzero if it is. If a writer encounters a
> number in padding that is > num_cpus, then the state is garbage
> from userspace. If <= num_cpus, it is adhered to only until that cpu
> clears its entry, which is guaranteed to happen eventually.
> 
> Just a quick thought. This might not fly at all upon closer scrutiny.

I'm not sure I understand the suggestion, but I'll think on it
some more.

Some other options that may be worth considering (in no specific order):
- test the application to see whether it will consume entries when
  tp_status is set to anything other than TP_STATUS_USER; only use the
  shadow if it doesn't strictly honor the TP_STATUS_USER bit.

- skip the shadow if we see that a new TP_STATUS_USER_TO_KERNEL flag
  is in use

- use tp_len == -1 to indicate in-use (a rough sketch of this option
  follows below)
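
For the last option, a rough sketch of what I have in mind (TPACKET_V2
shown; TP_LEN_IN_USE and the helper names are made up, this is
untested, and the claim would happen under the existing rx ring
spin_lock):

#include <linux/if_packet.h>

#define TP_LEN_IN_USE ((__u32)-1)

/* Claim the head slot; called with the rx ring spin_lock held. */
static bool v2_slot_try_claim(struct tpacket2_hdr *h)
{
        if (h->tp_status != TP_STATUS_KERNEL ||
            READ_ONCE(h->tp_len) == TP_LEN_IN_USE)
                return false;   /* owned by user space or another thread */

        WRITE_ONCE(h->tp_len, TP_LEN_IN_USE);
        return true;
}

/* Publish a filled slot to user space, overwriting the sentinel. */
static void v2_slot_publish(struct tpacket2_hdr *h, __u32 len, __u32 status)
{
        h->tp_len = len;
        smp_wmb();              /* frame data visible before the status */
        h->tp_status = status;
}

This still depends on user space leaving tp_len alone when it hands
the slot back, so it shares some of the fragility of relying on
tp_padding.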

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
  2018-05-23 15:29         ` Jon Rosen (jrosen)
@ 2018-05-23 15:53           ` Willem de Bruijn
  0 siblings, 0 replies; 14+ messages in thread
From: Willem de Bruijn @ 2018-05-23 15:53 UTC (permalink / raw)
  To: Jon Rosen (jrosen)
  Cc: David S. Miller, Willem de Bruijn, Eric Dumazet, Kees Cook,
	David Windsor, Rosen, Rami, Reshetova, Elena, Mike Maloney,
	Benjamin Poirier, Thomas Gleixner, Greg Kroah-Hartman,
	open list:NETWORKING [GENERAL],
	open list

On Wed, May 23, 2018 at 11:29 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>
>
>> -----Original Message-----
>> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com]
>> Sent: Wednesday, May 23, 2018 9:37 AM
>> To: Jon Rosen (jrosen) <jrosen@cisco.com>
>> Cc: David S. Miller <davem@davemloft.net>; Willem de Bruijn <willemb@google.com>; Eric Dumazet
>> <edumazet@google.com>; Kees Cook <keescook@chromium.org>; David Windsor <dwindsor@gmail.com>; Rosen,
>> Rami <rami.rosen@intel.com>; Reshetova, Elena <elena.reshetova@intel.com>; Mike Maloney
>> <maloney@google.com>; Benjamin Poirier <bpoirier@suse.com>; Thomas Gleixner <tglx@linutronix.de>; Greg
>> Kroah-Hartman <gregkh@linuxfoundation.org>; open list:NETWORKING [GENERAL] <netdev@vger.kernel.org>;
>> open list <linux-kernel@vger.kernel.org>
>> Subject: Re: [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun
>>
>> On Wed, May 23, 2018 at 7:54 AM, Jon Rosen (jrosen) <jrosen@cisco.com> wrote:
>> >> > For the ring, there is no requirement to allocate exactly the amount
>> >> > specified by the user request. Safer than relying on shared memory
>> >> > and simpler than the extra allocation in this patch would be to allocate
>> >> > extra shadow memory at the end of the ring (and not mmap that).
>> >> >
>> >> > That still leaves an extra cold cacheline vs using tp_padding.
>> >>
>> >> Given my lack of experience and knowledge in writing kernel code
>> >> it was easier for me to allocate the shadow ring as a separate
>> >> structure.  Of course it's not about me and my skills so if it's
>> >> more appropriate to allocate at the tail of the existing ring
>> >> then certainly I can look at doing that.
>> >
>> > The memory for the ring is not one contiguous block, it's an array of
>> > blocks of pages (or 'order' sized blocks of pages). I don't think
>> > increasing the size of each of the blocks to provide storage would be
>> > such a good idea as it will risk spilling over into the next order and
>> > wasting lots of memory. I suspect it's also more complex than a single
>> > shadow ring to do both the allocation and the access.
>> >
>> > It could be tacked onto the end of the pg_vec[] used to store the
>> > pointers to the blocks. The challenge with that is that a pg_vec[] is
>> > created for each of RX and TX rings so either it would have to
>> > allocate unnecessary storage for TX or the caller will have to say if
>> > extra space should be allocated or not.  E.g.:
>> >
>> > static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order, int scratch, void **scratch_p)
>> >
>> > I'm not sure avoiding the extra allocation and moving it to the
>> > pg_vec[] for the RX ring is going to get the simplification you were
>> > hoping for.  Is there another way of storing the shadow ring which
>> > I should consider?
>>
>> I did indeed mean attaching extra pages to pg_vec[]. It should be
>> simpler than a separate structure, but I may be wrong.
>
> I don't think it would be too bad, it may actually turn out to be
> convenient to implement.
>
>>
>> Either way, I still would prefer to avoid the shadow buffer completely.
>> It incurs complexity and cycle cost on all users because of only the
>> rare (non-existent?) consumer that overwrites the padding bytes.
>
> I prefer that as well.  I'm just not sure there is a bulletproof
> solution without the shadow state.  I also wish it were only a
> theoretical issue but unfortunately it is actually something our
> customers have seen.
>>
>> Perhaps we can use padding yet avoid deadlock by writing a
>> timed value. The simplest would be jiffies >> N. Then only a
>> process that writes this exact value would be subject to drops and
>> then still only for a limited period.
>>
>> Instead of depending on wall clock time, like jiffies, another option
>> would be to keep a percpu array of values. Each cpu has a zero
>> entry if it is not writing, nonzero if it is. If a writer encounters a
>> number in padding that is > num_cpus, then the state is garbage
>> from userspace. If <= num_cpus, it is adhered to only until that cpu
>> clears its entry, which is guaranteed to happen eventually.
>>
>> Just a quick thought. This might not fly at all upon closer scrutiny.
>
> I'm not sure I understand the suggestion, but I'll think on it
> some more.

The idea is to use tp_padding to signal state between kernel threads,
but in such a way that the worst case is not a deadlock, just a higher
drop rate.

First, write a specific value and ignore any value other than zero or
that specific value. This fixes the bug, except for the rare user that
writes to the padding bytes.

Second, invalidate the specific value at some point to ensure forward
progress.

The simplest is wall clock time, i.e. jiffies based, but this is
imprecise.

An alternative that scales with the number of cpus instead of the
number of slots is to use a percpu array. A kernel thread that reads a
non-zero padding value is reading either garbage or a cpuid. If it is
a cpuid, it can double-check that the given cpu is busy writing by
reading that cpu's entry in the percpu array.
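
A minimal sketch of that percpu scheme (invented names, no locking
shown; nr_cpu_ids being the real kernel symbol for what I called
num_cpus above):

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(u32, slot_writer);  /* nonzero while this cpu fills a slot */

static void slot_mark_busy(u32 *padding)
{
        u32 id = smp_processor_id() + 1;  /* 1..nr_cpu_ids, never 0 */

        this_cpu_write(slot_writer, id);
        WRITE_ONCE(*padding, id);
}

static void slot_mark_done(u32 *padding)
{
        WRITE_ONCE(*padding, 0);
        this_cpu_write(slot_writer, 0);
}

/* Is the slot really being filled by another kernel thread right now? */
static bool slot_busy(const u32 *padding)
{
        u32 id = READ_ONCE(*padding);

        if (!id || id > nr_cpu_ids)
                return false;   /* free, or garbage written by user space */

        return per_cpu(slot_writer, id - 1) != 0;
}

Forward progress is guaranteed because the cpu that wrote the id
clears its slot_writer entry when it finishes, even if user space
never resets the padding bytes.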

>
> Some other options maybe worth considering (in no specific order):
> - test the application to see if it will consume entries if tp_status
>   is set to anything other than TP_STATUS_USER, only use shadow if
>   it doesn't strictly honor the TP_STATUS_USER bit.
>
> - skip shadow if we see new TP_STATUS_USER_TO_KERNEL is used
>
> - use tp_len == -1 to indicate inuse

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-05-23 15:54 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-19 12:07 [PATCH v2] packet: track ring entry use using a shadow ring to prevent RX ring overrun Jon Rosen
2018-05-20 22:51 ` Willem de Bruijn
2018-05-20 23:22   ` Willem de Bruijn
2018-05-21 12:57     ` Jon Rosen (jrosen)
2018-05-21 17:07       ` Willem de Bruijn
2018-05-21 18:16         ` Jon Rosen (jrosen)
2018-05-22 15:41           ` Willem de Bruijn
2018-05-23 11:08             ` Jon Rosen (jrosen)
2018-05-22 14:12         ` Jon Rosen (jrosen)
2018-05-22 15:46           ` Willem de Bruijn
2018-05-23 11:54     ` Jon Rosen (jrosen)
2018-05-23 13:37       ` Willem de Bruijn
2018-05-23 15:29         ` Jon Rosen (jrosen)
2018-05-23 15:53           ` Willem de Bruijn
