All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
@ 2019-06-19 20:25 Neil Horman
  2019-06-20 13:41 ` Willem de Bruijn
                   ` (3 more replies)
  0 siblings, 4 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-19 20:25 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Matteo Croce, David S. Miller

When an application is run that:
a) Sets its scheduler to be SCHED_FIFO
and
b) Opens a memory mapped AF_PACKET socket, and sends frames with the
MSG_DONTWAIT flag cleared, it's possible for the application to hang
forever in the kernel.  This occurs because when waiting, the code in
tpacket_snd calls schedule, which under normal circumstances allows
other tasks to run, including ksoftirqd, which in some cases is
responsible for freeing the transmitted skb (which in AF_PACKET calls a
destructor that flips the status bit of the transmitted frame back to
available, allowing the transmitting task to complete).

However, when the calling application is SCHED_FIFO, its priority is
such that the schedule call immediately places the task back on the cpu,
preventing ksoftirqd from freeing the skb, which in turn prevents the
transmitting task from detecting that the transmission is complete.

We can fix this by converting the schedule call to a completion
mechanism.  By using a completion queue, we force the calling task, when
it detects there are no more frames to send, to schedule itself off the
cpu until such time as the last transmitted skb is freed, allowing
forward progress to be made.

Tested by myself and the reporter, with good results

Applies to the net tree

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Matteo Croce <mcroce@redhat.com>
CC: "David S. Miller" <davem@davemloft.net>
---
 net/packet/af_packet.c | 42 +++++++++++++++++++++++++++++++-----------
 net/packet/internal.h  |  2 ++
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a29d66da7394..e65f4ef18a06 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -358,7 +358,8 @@ static inline struct page * __pure pgv_to_page(void *addr)
 	return virt_to_page(addr);
 }
 
-static void __packet_set_status(struct packet_sock *po, void *frame, int status)
+static void __packet_set_status(struct packet_sock *po, void *frame, int status,
+				bool call_complete)
 {
 	union tpacket_uhdr h;
 
@@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		BUG();
 	}
 
+	if (po->wait_on_complete && call_complete)
+		complete(&po->skb_completion);
 	smp_wmb();
 }
 
@@ -1148,6 +1151,14 @@ static void *packet_previous_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, previous, status);
 }
 
+static void *packet_next_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+	unsigned int next = rb->head != rb->frame_max ? rb->head+1 : 0;
+	return packet_lookup_frame(po, rb, next, status);
+}
+
 static void packet_increment_head(struct packet_ring_buffer *buff)
 {
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
@@ -2360,7 +2371,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 #endif
 
 	if (po->tp_version <= TPACKET_V2) {
-		__packet_set_status(po, h.raw, status);
+		__packet_set_status(po, h.raw, status, false);
 		sk->sk_data_ready(sk);
 	} else {
 		prb_clear_blk_fill_status(&po->rx_ring);
@@ -2400,7 +2411,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 		packet_dec_pending(&po->tx_ring);
 
 		ts = __packet_set_timestamp(po, ph, skb);
-		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts, true);
 	}
 
 	sock_wfree(skb);
@@ -2600,6 +2611,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen, copylen = 0;
+	void *tmp;
 
 	mutex_lock(&po->pg_vec_lock);
 
@@ -2647,16 +2659,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	do {
+
+		if (po->wait_on_complete && need_wait) {
+			wait_for_completion(&po->skb_completion);
+			po->wait_on_complete = 0;
+		}
+
 		ph = packet_current_frame(po, &po->tx_ring,
 					  TP_STATUS_SEND_REQUEST);
-		if (unlikely(ph == NULL)) {
-			if (need_wait && need_resched())
-				schedule();
-			continue;
-		}
+
+		if (likely(ph == NULL))
+			break;
 
 		skb = NULL;
 		tp_len = tpacket_parse_header(po, ph, size_max, &data);
+
 		if (tp_len < 0)
 			goto tpacket_error;
 
@@ -2699,7 +2716,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 tpacket_error:
 			if (po->tp_loss) {
 				__packet_set_status(po, ph,
-						TP_STATUS_AVAILABLE);
+						TP_STATUS_AVAILABLE, false);
 				packet_increment_head(&po->tx_ring);
 				kfree_skb(skb);
 				continue;
@@ -2719,7 +2736,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		}
 
 		skb->destructor = tpacket_destruct_skb;
-		__packet_set_status(po, ph, TP_STATUS_SENDING);
+		__packet_set_status(po, ph, TP_STATUS_SENDING, false);
+		if (!packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST))
+			po->wait_on_complete = 1;
 		packet_inc_pending(&po->tx_ring);
 
 		status = TP_STATUS_SEND_REQUEST;
@@ -2753,7 +2772,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	goto out_put;
 
 out_status:
-	__packet_set_status(po, ph, status);
+	__packet_set_status(po, ph, status, false);
 	kfree_skb(skb);
 out_put:
 	dev_put(dev);
@@ -3207,6 +3226,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_init_data(sock, sk);
 
 	po = pkt_sk(sk);
+	init_completion(&po->skb_completion);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
 	po->xmit = dev_queue_xmit;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 3bb7c5fb3bff..bbb4be2c18e7 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -128,6 +128,8 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_tstamp;
+	struct completion	skb_completion;
+	unsigned int		wait_on_complete;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-19 20:25 [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET Neil Horman
@ 2019-06-20 13:41 ` Willem de Bruijn
  2019-06-20 14:01   ` Matteo Croce
  2019-06-20 14:23   ` Neil Horman
  2019-06-22 17:41 ` [PATCH v2 " Neil Horman
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-20 13:41 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> When an application is run that:
> a) Sets its scheduler to be SCHED_FIFO
> and
> b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> MSG_DONTWAIT flag cleared, its possible for the application to hang
> forever in the kernel.  This occurs because when waiting, the code in
> tpacket_snd calls schedule, which under normal circumstances allows
> other tasks to run, including ksoftirqd, which in some cases is
> responsible for freeing the transmitted skb (which in AF_PACKET calls a
> destructor that flips the status bit of the transmitted frame back to
> available, allowing the transmitting task to complete).
>
> However, when the calling application is SCHED_FIFO, its priority is
> such that the schedule call immediately places the task back on the cpu,
> preventing ksoftirqd from freeing the skb, which in turn prevents the
> transmitting task from detecting that the transmission is complete.
>
> We can fix this by converting the schedule call to a completion
> mechanism.  By using a completion queue, we force the calling task, when
> it detects there are no more frames to send, to schedule itself off the
> cpu until such time as the last transmitted skb is freed, allowing
> forward progress to be made.
>
> Tested by myself and the reporter, with good results
>
> Appies to the net tree
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> Reported-by: Matteo Croce <mcroce@redhat.com>
> CC: "David S. Miller" <davem@davemloft.net>
> ---

This is a complex change for a narrow configuration. Isn't a
SCHED_FIFO process preempting ksoftirqd a potential problem for other
networking workloads as well? And the right configuration to always
increase ksoftirqd priority when increasing another process's
priority? Also, even when ksoftirqd kicks in, isn't some progress
still made on the local_bh_enable reached from schedule()?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 13:41 ` Willem de Bruijn
@ 2019-06-20 14:01   ` Matteo Croce
  2019-06-20 14:23   ` Neil Horman
  1 sibling, 0 replies; 34+ messages in thread
From: Matteo Croce @ 2019-06-20 14:01 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Neil Horman, Network Development, David S. Miller

On Thu, Jun 20, 2019 at 3:42 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > When an application is run that:
> > a) Sets its scheduler to be SCHED_FIFO
> > and
> > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > forever in the kernel.  This occurs because when waiting, the code in
> > tpacket_snd calls schedule, which under normal circumstances allows
> > other tasks to run, including ksoftirqd, which in some cases is
> > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > destructor that flips the status bit of the transmitted frame back to
> > available, allowing the transmitting task to complete).
> >
> > However, when the calling application is SCHED_FIFO, its priority is
> > such that the schedule call immediately places the task back on the cpu,
> > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > transmitting task from detecting that the transmission is complete.
> >
> > We can fix this by converting the schedule call to a completion
> > mechanism.  By using a completion queue, we force the calling task, when
> > it detects there are no more frames to send, to schedule itself off the
> > cpu until such time as the last transmitted skb is freed, allowing
> > forward progress to be made.
> >
> > Tested by myself and the reporter, with good results
> >
> > Appies to the net tree
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > Reported-by: Matteo Croce <mcroce@redhat.com>
> > CC: "David S. Miller" <davem@davemloft.net>
> > ---
>
> This is a complex change for a narrow configuration. Isn't a
> SCHED_FIFO process preempting ksoftirqd a potential problem for other
> networking workloads as well? And the right configuration to always
> increase ksoftirqd priority when increasing another process's
> priority? Also, even when ksoftirqd kicks in, isn't some progress
> still made on the local_bh_enable reached from schedule()?

Hi Willem,

from my tests, it seems that when the application hangs, raising the
ksoftirqd priority doesn't help.
If the application priority is the same as ksoftirqd, it will loop
forever, the only workaround is to lower the application priority or
set it to SCHED_NORMAL

Regards,

-- 
Matteo Croce
per aspera ad upstream

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 13:41 ` Willem de Bruijn
  2019-06-20 14:01   ` Matteo Croce
@ 2019-06-20 14:23   ` Neil Horman
  2019-06-20 15:16     ` Willem de Bruijn
  1 sibling, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-20 14:23 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > When an application is run that:
> > a) Sets its scheduler to be SCHED_FIFO
> > and
> > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > forever in the kernel.  This occurs because when waiting, the code in
> > tpacket_snd calls schedule, which under normal circumstances allows
> > other tasks to run, including ksoftirqd, which in some cases is
> > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > destructor that flips the status bit of the transmitted frame back to
> > available, allowing the transmitting task to complete).
> >
> > However, when the calling application is SCHED_FIFO, its priority is
> > such that the schedule call immediately places the task back on the cpu,
> > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > transmitting task from detecting that the transmission is complete.
> >
> > We can fix this by converting the schedule call to a completion
> > mechanism.  By using a completion queue, we force the calling task, when
> > it detects there are no more frames to send, to schedule itself off the
> > cpu until such time as the last transmitted skb is freed, allowing
> > forward progress to be made.
> >
> > Tested by myself and the reporter, with good results
> >
> > Appies to the net tree
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > Reported-by: Matteo Croce <mcroce@redhat.com>
> > CC: "David S. Miller" <davem@davemloft.net>
> > ---
> 
> This is a complex change for a narrow configuration. Isn't a
> SCHED_FIFO process preempting ksoftirqd a potential problem for other
> networking workloads as well? And the right configuration to always
> increase ksoftirqd priority when increasing another process's
> priority? Also, even when ksoftirqd kicks in, isn't some progress
> still made on the local_bh_enable reached from schedule()?
> 

A few questions here to answer:

Regarding other protocols having this problem, that's not the case, because
non-packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a
period of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set).  We
could certainly do that, but the current implementation doesn't (opting instead
to wait indefinitely until the respective packet(s) have transmitted or errored
out), and I wanted to maintain that behavior.  If there is consensus that packet
sockets should honor SNDTIMEO, then I can certainly do that.

As for progress made by calling local_bh_enable, my read of the code doesn't
have the scheduler calling local_bh_enable at all.  Instead schedule uses
preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
which ignores pending softirqs on re-enablement.  Perhaps that needs to change,
but I'm averse to making scheduler changes for this (the aforementioned concern
about complex changes for a narrow use case).

Regarding raising the priority of ksoftirqd, that could be a solution, but the
priority would need to be raised to a high priority SCHED_FIFO parameter, and
that gets back to making complex changes for a narrow problem domain

As for the complexity of the solution, I think this is, given your
comments, the least complex and intrusive change to solve the given problem.  We
need to find a way to force the calling task off the cpu while the asynchronous
operations in the transmit path complete, and we can do that this way, or by
honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
alter the current protocol behavior without consensus on that.

Regards
Neil


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 14:23   ` Neil Horman
@ 2019-06-20 15:16     ` Willem de Bruijn
  2019-06-20 16:14       ` Neil Horman
  2019-06-21 16:41       ` Neil Horman
  0 siblings, 2 replies; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-20 15:16 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > >
> > > When an application is run that:
> > > a) Sets its scheduler to be SCHED_FIFO
> > > and
> > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > forever in the kernel.  This occurs because when waiting, the code in
> > > tpacket_snd calls schedule, which under normal circumstances allows
> > > other tasks to run, including ksoftirqd, which in some cases is
> > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > destructor that flips the status bit of the transmitted frame back to
> > > available, allowing the transmitting task to complete).
> > >
> > > However, when the calling application is SCHED_FIFO, its priority is
> > > such that the schedule call immediately places the task back on the cpu,
> > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > transmitting task from detecting that the transmission is complete.
> > >
> > > We can fix this by converting the schedule call to a completion
> > > mechanism.  By using a completion queue, we force the calling task, when
> > > it detects there are no more frames to send, to schedule itself off the
> > > cpu until such time as the last transmitted skb is freed, allowing
> > > forward progress to be made.
> > >
> > > Tested by myself and the reporter, with good results
> > >
> > > Appies to the net tree
> > >
> > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > CC: "David S. Miller" <davem@davemloft.net>
> > > ---
> >
> > This is a complex change for a narrow configuration. Isn't a
> > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > networking workloads as well? And the right configuration to always
> > increase ksoftirqd priority when increasing another process's
> > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > still made on the local_bh_enable reached from schedule()?
> >
>
> A few questions here to answer:

Thanks for the detailed explanation.

> Regarding other protocols having this problem, thats not the case, because non
> packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> certainly do that, but the current implementation doesn't (opting instead to
> wait indefinately until the respective packet(s) have transmitted or errored
> out), and I wanted to maintain that behavior.  If there is consensus that packet
> sockets should honor SNDTIMEO, then I can certainly do that.
>
> As for progress made by calling local_bh_enable, My read of the code doesn't
> have the scheduler calling local_bh_enable at all.  Instead schedule uses
> preempt_disable/preempt_enable_no_resched() to gain exlcusive access to the cpu,
> which ignores pending softirqs on re-enablement.

Ah, I'm mistaken there, then.

>  Perhaps that needs to change,
> but I'm averse to making scheduler changes for this (the aforementioned concern
> about complex changes for a narrow use case)
>
> Regarding raising the priority of ksoftirqd, that could be a solution, but the
> priority would need to be raised to a high priority SCHED_FIFO parameter, and
> that gets back to making complex changes for a narrow problem domain
>
> As for the comlexity of the of the solution, I think this is, given your
> comments the least complex and intrusive change to solve the given problem.

Could it be simpler to ensure do_softirq() gets run here? That would
allow progress for this case.

>  We
> need to find a way to force the calling task off the cpu while the asynchronous
> operations in the transmit path complete, and we can do that this way, or by
> honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> alter the current protocol behavior without consensus on that.

In general SCHED_FIFO is dangerous with regard to stalling other
progress, incl. ksoftirqd. But it does appear that this packet socket
case is special inside networking in calling schedule() directly here.

If converting that, should it convert to logic more akin to other
sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
the pros and cons of completion here yet, sorry. Didn't want to delay
responding until after I get a chance.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 15:16     ` Willem de Bruijn
@ 2019-06-20 16:14       ` Neil Horman
  2019-06-20 16:18         ` Willem de Bruijn
  2019-06-21 16:41       ` Neil Horman
  1 sibling, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-20 16:14 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > >
> > > > When an application is run that:
> > > > a) Sets its scheduler to be SCHED_FIFO
> > > > and
> > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > destructor that flips the status bit of the transmitted frame back to
> > > > available, allowing the transmitting task to complete).
> > > >
> > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > such that the schedule call immediately places the task back on the cpu,
> > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > transmitting task from detecting that the transmission is complete.
> > > >
> > > > We can fix this by converting the schedule call to a completion
> > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > it detects there are no more frames to send, to schedule itself off the
> > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > forward progress to be made.
> > > >
> > > > Tested by myself and the reporter, with good results
> > > >
> > > > Appies to the net tree
> > > >
> > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > ---
> > >
> > > This is a complex change for a narrow configuration. Isn't a
> > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > networking workloads as well? And the right configuration to always
> > > increase ksoftirqd priority when increasing another process's
> > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > still made on the local_bh_enable reached from schedule()?
> > >
> >
> > A few questions here to answer:
> 
> Thanks for the detailed explanation.
> 
Gladly.

> > Regarding other protocols having this problem, thats not the case, because non
> > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > certainly do that, but the current implementation doesn't (opting instead to
> > wait indefinately until the respective packet(s) have transmitted or errored
> > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > sockets should honor SNDTIMEO, then I can certainly do that.
> >
> > As for progress made by calling local_bh_enable, My read of the code doesn't
> > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > preempt_disable/preempt_enable_no_resched() to gain exlcusive access to the cpu,
> > which ignores pending softirqs on re-enablement.
> 
> Ah, I'm mistaken there, then.
> 
> >  Perhaps that needs to change,
> > but I'm averse to making scheduler changes for this (the aforementioned concern
> > about complex changes for a narrow use case)
> >
> > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > that gets back to making complex changes for a narrow problem domain
> >
> > As for the comlexity of the of the solution, I think this is, given your
> > comments the least complex and intrusive change to solve the given problem.
> 
> Could it be simpler to ensure do_softirq() gets run here? That would
> allow progress for this case.
> 
I'm not sure.  On the surface, we certainly could do it, but inserting a call to
do_softirq, either directly or indirectly through some other mechanism, seems
like a non-obvious fix, and may lead to confusion down the road.  I'm hesitant
to pursue such a solution without some evidence it would make a better solution.

> >  We
> > need to find a way to force the calling task off the cpu while the asynchronous
> > operations in the transmit path complete, and we can do that this way, or by
> > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > alter the current protocol behavior without consensus on that.
> 
> In general SCHED_FIFO is dangerous with regard to stalling other
> progress, incl. ksoftirqd. But it does appear that this packet socket
> case is special inside networking in calling schedule() directly here.
> 
> If converting that, should it convert to logic more akin to other
> sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> the pros and cons of completion here yet, sorry. Didn't want to delay
> responding until after I get a chance.
> 
That would be the solution described above (i.e. honoring SK_SNDTIMEO).
Basically you call sock_send_waittimeo (which returns a timeout value, or 0 if
MSG_DONTWAIT is set), then you block for that period of time waiting for
transmit completion.  I'm happy to implement that solution, but I'd like to get
some clarity as to if there is a reason we don't currently honor that socket
option now before I change the behavior that way.

Dave, do you have any insight into AF_PACKET history as to why we would ignore
the send timeout socket option here?

Best
Neil


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 16:14       ` Neil Horman
@ 2019-06-20 16:18         ` Willem de Bruijn
  2019-06-20 17:31           ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-20 16:18 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Thu, Jun 20, 2019 at 12:14 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> > On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > >
> > > > > When an application is run that:
> > > > > a) Sets its scheduler to be SCHED_FIFO
> > > > > and
> > > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > > destructor that flips the status bit of the transmitted frame back to
> > > > > available, allowing the transmitting task to complete).
> > > > >
> > > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > > such that the schedule call immediately places the task back on the cpu,
> > > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > > transmitting task from detecting that the transmission is complete.
> > > > >
> > > > > We can fix this by converting the schedule call to a completion
> > > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > > it detects there are no more frames to send, to schedule itself off the
> > > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > > forward progress to be made.
> > > > >
> > > > > Tested by myself and the reporter, with good results
> > > > >
> > > > > Appies to the net tree
> > > > >
> > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > > ---
> > > >
> > > > This is a complex change for a narrow configuration. Isn't a
> > > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > > networking workloads as well? And the right configuration to always
> > > > increase ksoftirqd priority when increasing another process's
> > > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > > still made on the local_bh_enable reached from schedule()?
> > > >
> > >
> > > A few questions here to answer:
> >
> > Thanks for the detailed explanation.
> >
> Gladly.
>
> > > Regarding other protocols having this problem, thats not the case, because non
> > > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > > certainly do that, but the current implementation doesn't (opting instead to
> > > wait indefinately until the respective packet(s) have transmitted or errored
> > > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > > sockets should honor SNDTIMEO, then I can certainly do that.
> > >
> > > As for progress made by calling local_bh_enable, My read of the code doesn't
> > > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > > preempt_disable/preempt_enable_no_resched() to gain exlcusive access to the cpu,
> > > which ignores pending softirqs on re-enablement.
> >
> > Ah, I'm mistaken there, then.
> >
> > >  Perhaps that needs to change,
> > > but I'm averse to making scheduler changes for this (the aforementioned concern
> > > about complex changes for a narrow use case)
> > >
> > > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > > that gets back to making complex changes for a narrow problem domain
> > >
> > > As for the comlexity of the of the solution, I think this is, given your
> > > comments the least complex and intrusive change to solve the given problem.
> >
> > Could it be simpler to ensure do_softirq() gets run here? That would
> > allow progress for this case.
> >
> I'm not sure.  On the surface, we certainly could do it, but inserting a call to
> do_softirq, either directly, or indirectly through some other mechanism seems
> like a non-obvious fix, and may lead to confusion down the road.  I'm hesitant
> to pursue such a soultion without some evidence it would make a better solution.
>
> > >  We
> > > need to find a way to force the calling task off the cpu while the asynchronous
> > > operations in the transmit path complete, and we can do that this way, or by
> > > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > > alter the current protocol behavior without consensus on that.
> >
> > In general SCHED_FIFO is dangerous with regard to stalling other
> > progress, incl. ksoftirqd. But it does appear that this packet socket
> > case is special inside networking in calling schedule() directly here.
> >
> > If converting that, should it convert to logic more akin to other
> > sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> > the pros and cons of completion here yet, sorry. Didn't want to delay
> > responding until after I get a chance.
> >
> That would be the solution described above (i.e. honoring SK_SNDTIMEO.
> Basically you call sock_send_waittimeo, which returns a timeout value, or 0 if
> MSG_DONTWAIT is set), then you block for that period of time waiting for
> transmit completion.

From an ABI point of view, starting to support SK_SNDTIMEO where it
currently is not implemented certainly seems fine.

>  I'm happy to implement that solution, but I'd like to get
> some clarity as to if there is a reason we don't currently honor that socket
> option now before I change the behavior that way.
>
> Dave, do you have any insight into AF_PACKET history as to why we would ignore
> the send timeout socket option here?

On the point of calling schedule(): even if that is rare, there are a lot of
other cond_resched() calls that may have the same starvation issue
with SCHED_FIFO and ksoftirqd.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 16:18         ` Willem de Bruijn
@ 2019-06-20 17:31           ` Neil Horman
  0 siblings, 0 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-20 17:31 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Thu, Jun 20, 2019 at 12:18:41PM -0400, Willem de Bruijn wrote:
> On Thu, Jun 20, 2019 at 12:14 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> > > On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > >
> > > > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > > >
> > > > > > When an application is run that:
> > > > > > a) Sets its scheduler to be SCHED_FIFO
> > > > > > and
> > > > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > > > destructor that flips the status bit of the transmitted frame back to
> > > > > > available, allowing the transmitting task to complete).
> > > > > >
> > > > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > > > such that the schedule call immediately places the task back on the cpu,
> > > > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > > > transmitting task from detecting that the transmission is complete.
> > > > > >
> > > > > > We can fix this by converting the schedule call to a completion
> > > > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > > > it detects there are no more frames to send, to schedule itself off the
> > > > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > > > forward progress to be made.
> > > > > >
> > > > > > Tested by myself and the reporter, with good results
> > > > > >
> > > > > > Applies to the net tree
> > > > > >
> > > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > > > ---
> > > > >
> > > > > This is a complex change for a narrow configuration. Isn't a
> > > > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > > > networking workloads as well? And the right configuration to always
> > > > > increase ksoftirqd priority when increasing another process's
> > > > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > > > still made on the local_bh_enable reached from schedule()?
> > > > >
> > > >
> > > > A few questions here to answer:
> > >
> > > Thanks for the detailed explanation.
> > >
> > Gladly.
> >
> > > > Regarding other protocols having this problem, thats not the case, because non
> > > > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > > > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > > > certainly do that, but the current implementation doesn't (opting instead to
> > > > wait indefinitely until the respective packet(s) have transmitted or errored
> > > > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > > > sockets should honor SNDTIMEO, then I can certainly do that.
> > > >
> > > > As for progress made by calling local_bh_enable, My read of the code doesn't
> > > > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > > > preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
> > > > which ignores pending softirqs on re-enablement.
> > >
> > > Ah, I'm mistaken there, then.
> > >
> > > >  Perhaps that needs to change,
> > > > but I'm averse to making scheduler changes for this (the aforementioned concern
> > > > about complex changes for a narrow use case)
> > > >
> > > > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > > > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > > > that gets back to making complex changes for a narrow problem domain
> > > >
> > > > As for the complexity of the solution, I think this is, given your
> > > > comments the least complex and intrusive change to solve the given problem.
> > >
> > > Could it be simpler to ensure do_softirq() gets run here? That would
> > > allow progress for this case.
> > >
> > I'm not sure.  On the surface, we certainly could do it, but inserting a call to
> > do_softirq, either directly, or indirectly through some other mechanism seems
> > like a non-obvious fix, and may lead to confusion down the road.  I'm hesitant
> > to pursue such a solution without some evidence it would make a better solution.
> >
> > > >  We
> > > > need to find a way to force the calling task off the cpu while the asynchronous
> > > > operations in the transmit path complete, and we can do that this way, or by
> > > > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > > > alter the current protocol behavior without consensus on that.
> > >
> > > In general SCHED_FIFO is dangerous with regard to stalling other
> > > progress, incl. ksoftirqd. But it does appear that this packet socket
> > > case is special inside networking in calling schedule() directly here.
> > >
> > > If converting that, should it convert to logic more akin to other
> > > sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> > > the pros and cons of completion here yet, sorry. Didn't want to delay
> > > responding until after I get a chance.
> > >
> > That would be the solution described above (i.e. honoring SK_SNDTIMEO.
> > Basically you call sock_send_waittimeo, which returns a timeout value, or 0 if
> > MSG_DONTWAIT is set), then you block for that period of time waiting for
> > transmit completion.
> 
> From an ABI point of view, starting to support SK_SNDTIMEO where it
> currently is not implemented certainly seems fine.
> 
I would agree, I was just thinking since AF_PACKET is something of a unique
protocol (i.e. no real protocol), there may have been reason to ignore that
option, but I agree, it probably makes sense, barring no counter reasoning.

> >  I'm happy to implement that solution, but I'd like to get
> > some clarity as to if there is a reason we don't currently honor that socket
> > option now before I change the behavior that way.
> >
> > Dave, do you have any insight into AF_PACKET history as to why we would ignore
> > the send timeout socket option here?
> 
> On the point of calling schedule(): even if that is rare, there are a lot of
> other cond_resched() calls that may have the same starvation issue
> with SCHED_FIFO and ksoftirqd.
> 
Agreed, but I've not found any yet
Neil


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-20 15:16     ` Willem de Bruijn
  2019-06-20 16:14       ` Neil Horman
@ 2019-06-21 16:41       ` Neil Horman
  2019-06-21 18:31         ` Willem de Bruijn
  1 sibling, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-21 16:41 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > >
> > > > When an application is run that:
> > > > a) Sets its scheduler to be SCHED_FIFO
> > > > and
> > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > destructor that flips the status bit of the transmitted frame back to
> > > > available, allowing the transmitting task to complete).
> > > >
> > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > such that the schedule call immediately places the task back on the cpu,
> > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > transmitting task from detecting that the transmission is complete.
> > > >
> > > > We can fix this by converting the schedule call to a completion
> > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > it detects there are no more frames to send, to schedule itself off the
> > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > forward progress to be made.
> > > >
> > > > Tested by myself and the reporter, with good results
> > > >
> > > > Applies to the net tree
> > > >
> > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > ---
> > >
> > > This is a complex change for a narrow configuration. Isn't a
> > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > networking workloads as well? And the right configuration to always
> > > increase ksoftirqd priority when increasing another process's
> > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > still made on the local_bh_enable reached from schedule()?
> > >
> >
> > A few questions here to answer:
> 
> Thanks for the detailed explanation.
> 
> > Regarding other protocols having this problem, thats not the case, because non
> > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > certainly do that, but the current implementation doesn't (opting instead to
> > wait indefinitely until the respective packet(s) have transmitted or errored
> > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > sockets should honor SNDTIMEO, then I can certainly do that.
> >
> > As for progress made by calling local_bh_enable, My read of the code doesn't
> > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
> > which ignores pending softirqs on re-enablement.
> 
> Ah, I'm mistaken there, then.
> 
> >  Perhaps that needs to change,
> > but I'm averse to making scheduler changes for this (the aforementioned concern
> > about complex changes for a narrow use case)
> >
> > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > that gets back to making complex changes for a narrow problem domain
> >
> > As for the complexity of the solution, I think this is, given your
> > comments the least complex and intrusive change to solve the given problem.
> 
> Could it be simpler to ensure do_softirq() gets run here? That would
> allow progress for this case.
> 
> >  We
> > need to find a way to force the calling task off the cpu while the asynchronous
> > operations in the transmit path complete, and we can do that this way, or by
> > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > alter the current protocol behavior without consensus on that.
> 
> In general SCHED_FIFO is dangerous with regard to stalling other
> progress, incl. ksoftirqd. But it does appear that this packet socket
> case is special inside networking in calling schedule() directly here.
> 
> If converting that, should it convert to logic more akin to other
> sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> the pros and cons of completion here yet, sorry. Didn't want to delay
> responding until after I get a chance.
> 
So, I started looking at implementing SOCK_SNDTIMEO for this patch, and
something occurred to me....We still need a mechanism to block in tpacket_snd.
That is to say, other protocols use SK_SNDTIMEO to wait for socket memory to
become available, and that requirement doesn't exist for memory mapped sockets
in AF_PACKET (which tpacket_snd implements the kernel side for).  We have memory
mapped frame buffers, which we marshall with an otherwise empty skb, and just
send that (i.e. no waiting on socket memory, we just produce an error if we
don't have enough ram to allocate an sk_buff).  Given that, we only ever need to
wait for a frame to complete transmission, or get freed in an error path further
down the stack.  This probably explains why SK_SNDTIMEO doesn't exist for
AF_PACKET.

Given that, is it worth implementing (as it will just further complicate this
patch, for no additional value), or can we move forward with it as it is?

Neil


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-21 16:41       ` Neil Horman
@ 2019-06-21 18:31         ` Willem de Bruijn
  2019-06-21 19:18           ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-21 18:31 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Fri, Jun 21, 2019 at 12:42 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> > On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > >
> > > > > When an application is run that:
> > > > > a) Sets its scheduler to be SCHED_FIFO
> > > > > and
> > > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > > destructor that flips the status bit of the transmitted frame back to
> > > > > available, allowing the transmitting task to complete).
> > > > >
> > > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > > such that the schedule call immediately places the task back on the cpu,
> > > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > > transmitting task from detecting that the transmission is complete.
> > > > >
> > > > > We can fix this by converting the schedule call to a completion
> > > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > > it detects there are no more frames to send, to schedule itself off the
> > > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > > forward progress to be made.
> > > > >
> > > > > Tested by myself and the reporter, with good results
> > > > >
> > > > > Applies to the net tree
> > > > >
> > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > > ---
> > > >
> > > > This is a complex change for a narrow configuration. Isn't a
> > > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > > networking workloads as well? And the right configuration to always
> > > > increase ksoftirqd priority when increasing another process's
> > > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > > still made on the local_bh_enable reached from schedule()?
> > > >
> > >
> > > A few questions here to answer:
> >
> > Thanks for the detailed explanation.
> >
> > > Regarding other protocols having this problem, thats not the case, because non
> > > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > > certainly do that, but the current implementation doesn't (opting instead to
> > > wait indefinitely until the respective packet(s) have transmitted or errored
> > > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > > sockets should honor SNDTIMEO, then I can certainly do that.
> > >
> > > As for progress made by calling local_bh_enable, My read of the code doesn't
> > > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > > preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
> > > which ignores pending softirqs on re-enablement.
> >
> > Ah, I'm mistaken there, then.
> >
> > >  Perhaps that needs to change,
> > > but I'm averse to making scheduler changes for this (the aforementioned concern
> > > about complex changes for a narrow use case)
> > >
> > > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > > that gets back to making complex changes for a narrow problem domain
> > >
> > > As for the complexity of the solution, I think this is, given your
> > > comments the least complex and intrusive change to solve the given problem.
> >
> > Could it be simpler to ensure do_softirq() gets run here? That would
> > allow progress for this case.
> >
> > >  We
> > > need to find a way to force the calling task off the cpu while the asynchronous
> > > operations in the transmit path complete, and we can do that this way, or by
> > > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > > alter the current protocol behavior without consensus on that.
> >
> > In general SCHED_FIFO is dangerous with regard to stalling other
> > progress, incl. ksoftirqd. But it does appear that this packet socket
> > case is special inside networking in calling schedule() directly here.
> >
> > If converting that, should it convert to logic more akin to other
> > sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> > the pros and cons of completion here yet, sorry. Didn't want to delay
> > responding until after I get a chance.
> >
> So, I started looking at implementing SOCK_SNDTIMEO for this patch, and
> something occurred to me....We still need a mechanism to block in tpacket_snd.
> That is to say, other protocols use SK_SNDTIMEO to wait for socket memory to
> become available, and that requirement doesn't exist for memory mapped sockets
> in AF_PACKET (which tpacket_snd implements the kernel side for).  We have memory
> mapped frame buffers, which we marshall with an otherwise empty skb, and just
> send that (i.e. no waiting on socket memory, we just produce an error if we
> don't have enough ram to allocate an sk_buff).  Given that, we only ever need to
> wait for a frame to complete transmission, or get freed in an error path further
> down the stack.  This probably explains why SK_SNDTIMEO doesn't exist for
> AF_PACKET.

SNDTIMEO behavior would still be useful: to wait for frame slot to
become available, but only up to a timeout?

To be clear, adding that is not a prerequisite for fixing this
specific issue, of course. It would just be nice if the one happens to
be fixed by adding the other.

My main question is wrt the implementation details of struct
completion. Without dynamic memory allocation,
sock_wait_for_wmem/sk_stream_write_space obviously does not make
sense. But should we still use sk_wq and more importantly does this
need the same wait semantics (TASK_INTERRUPTIBLE) and does struct
completion give those?

>
> Given that, is it worth implementing (as it will just further complicate this
> patch, for no additional value), or can we move forward with it as it is?
>
> Neil
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-21 18:31         ` Willem de Bruijn
@ 2019-06-21 19:18           ` Neil Horman
  2019-06-21 20:06             ` Willem de Bruijn
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-21 19:18 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Fri, Jun 21, 2019 at 02:31:17PM -0400, Willem de Bruijn wrote:
> On Fri, Jun 21, 2019 at 12:42 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> > > On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > >
> > > > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > > >
> > > > > > When an application is run that:
> > > > > > a) Sets its scheduler to be SCHED_FIFO
> > > > > > and
> > > > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > > > destructor that flips the status bit of the transmitted frame back to
> > > > > > available, allowing the transmitting task to complete).
> > > > > >
> > > > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > > > such that the schedule call immediately places the task back on the cpu,
> > > > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > > > transmitting task from detecting that the transmission is complete.
> > > > > >
> > > > > > We can fix this by converting the schedule call to a completion
> > > > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > > > it detects there are no more frames to send, to schedule itself off the
> > > > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > > > forward progress to be made.
> > > > > >
> > > > > > Tested by myself and the reporter, with good results
> > > > > >
> > > > > > Applies to the net tree
> > > > > >
> > > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > > > ---
> > > > >
> > > > > This is a complex change for a narrow configuration. Isn't a
> > > > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > > > networking workloads as well? And the right configuration to always
> > > > > increase ksoftirqd priority when increasing another process's
> > > > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > > > still made on the local_bh_enable reached from schedule()?
> > > > >
> > > >
> > > > A few questions here to answer:
> > >
> > > Thanks for the detailed explanation.
> > >
> > > > Regarding other protocols having this problem, thats not the case, because non
> > > > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > > > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > > > certainly do that, but the current implementation doesn't (opting instead to
> > > > wait indefinitely until the respective packet(s) have transmitted or errored
> > > > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > > > sockets should honor SNDTIMEO, then I can certainly do that.
> > > >
> > > > As for progress made by calling local_bh_enable, My read of the code doesn't
> > > > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > > > preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
> > > > which ignores pending softirqs on re-enablement.
> > >
> > > Ah, I'm mistaken there, then.
> > >
> > > >  Perhaps that needs to change,
> > > > but I'm averse to making scheduler changes for this (the aforementioned concern
> > > > about complex changes for a narrow use case)
> > > >
> > > > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > > > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > > > that gets back to making complex changes for a narrow problem domain
> > > >
> > > > As for the complexity of the solution, I think this is, given your
> > > > comments the least complex and intrusive change to solve the given problem.
> > >
> > > Could it be simpler to ensure do_softirq() gets run here? That would
> > > allow progress for this case.
> > >
> > > >  We
> > > > need to find a way to force the calling task off the cpu while the asynchronous
> > > > operations in the transmit path complete, and we can do that this way, or by
> > > > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > > > alter the current protocol behavior without consensus on that.
> > >
> > > In general SCHED_FIFO is dangerous with regard to stalling other
> > > progress, incl. ksoftirqd. But it does appear that this packet socket
> > > case is special inside networking in calling schedule() directly here.
> > >
> > > If converting that, should it convert to logic more akin to other
> > > sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> > > the pros and cons of completion here yet, sorry. Didn't want to delay
> > > responding until after I get a chance.
> > >
> > So, I started looking at implementing SOCK_SNDTIMEO for this patch, and
> > something occurred to me....We still need a mechanism to block in tpacket_snd.
> > That is to say, other protocols use SK_SNDTIMEO to wait for socket memory to
> > become available, and that requirement doesn't exist for memory mapped sockets
> > in AF_PACKET (which tpacket_snd implements the kernel side for).  We have memory
> > mapped frame buffers, which we marshall with an otherwise empty skb, and just
> > send that (i.e. no waiting on socket memory, we just produce an error if we
> > don't have enough ram to allocate an sk_buff).  Given that, we only ever need to
> > wait for a frame to complete transmission, or get freed in an error path further
> > down the stack.  This probably explains why SK_SNDTIMEO doesn't exist for
> > AF_PACKET.
> 
> SNDTIMEO behavior would still be useful: to wait for frame slot to
> become available, but only up to a timeout?
> 
Ok, thats fair.  To be clear, memory_mapped packets aren't waiting here for a
frame to become available for sending (thats done in userspace, where the
application checks a specific memory location for the TP_AVAILABLE status to be
set, so a new frame can be written).  tpacket_snd is waiting for the transmission
of a specific frame to complete the transmit action, which is a different thing.
Still, I suppose there's no reason we couldn't constrain that wait on a timeout
set by SK_SNDTIMEO

> To be clear, adding that is not a prerequisite for fixing this
> specific issue, of course. It would just be nice if the one happens to
> be fixed by adding the other.
> 
> My main question is wrt the implementation details of struct
> completion. Without dynamic memory allocation,
> sock_wait_for_wmem/sk_stream_write_space obviously does not make
> sense. But should we still use sk_wq and more importantly does this
> need the same wait semantics (TASK_INTERRUPTIBLE) and does struct
> completion give those?
> 
I suppose we could overload its use here, but I would be worried that such an
overload would be confusing.  Nominally, sk_wq is used, as you note, to block
sockets whose allocated send space is full, until such time as enough frames
have been sent to make space for the next write.  In this scenario, we already
know we have space to send a frame (by virtue of the fact that we are executing
in tpacket_snd, which is only called after userspace has written a frame to the
memory mapped buffer already allocated for frame transmission).  In this case we
are simply waiting for the last frame that we have sent to complete
transmission, at which point we can look to see if more frames are available to
send, or return from the system call.  I'm happy to take an alternate consensus
into account, but for the sake of clarity I think I would rather use the
completion queue, as it makes clear the correlation between the waiter and the
event we are waiting on.


> >
> > Given that, is it worth implementing (as it will just further complicate this
> > patch, for no additional value), or can we move forward with it as it is?
> >
> > Neil
> >
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-21 19:18           ` Neil Horman
@ 2019-06-21 20:06             ` Willem de Bruijn
  2019-06-22 11:08               ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-21 20:06 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Fri, Jun 21, 2019 at 3:18 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Fri, Jun 21, 2019 at 02:31:17PM -0400, Willem de Bruijn wrote:
> > On Fri, Jun 21, 2019 at 12:42 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > >
> > > On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> > > > On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > >
> > > > > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > > > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > > > >
> > > > > > > When an application is run that:
> > > > > > > a) Sets its scheduler to be SCHED_FIFO
> > > > > > > and
> > > > > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > > > > destructor that flips the status bit of the transmitted frame back to
> > > > > > > available, allowing the transmitting task to complete).
> > > > > > >
> > > > > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > > > > such that the schedule call immediately places the task back on the cpu,
> > > > > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > > > > transmitting task from detecting that the transmission is complete.
> > > > > > >
> > > > > > > We can fix this by converting the schedule call to a completion
> > > > > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > > > > it detects there are no more frames to send, to schedule itself off the
> > > > > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > > > > forward progress to be made.
> > > > > > >
> > > > > > > Tested by myself and the reporter, with good results
> > > > > > >
Applies to the net tree
> > > > > > >
> > > > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > > > > ---
> > > > > >
> > > > > > This is a complex change for a narrow configuration. Isn't a
> > > > > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > > > > networking workloads as well? And the right configuration to always
> > > > > > increase ksoftirqd priority when increasing another process's
> > > > > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > > > > still made on the local_bh_enable reached from schedule()?
> > > > > >
> > > > >
> > > > > A few questions here to answer:
> > > >
> > > > Thanks for the detailed explanation.
> > > >
> > > > > Regarding other protocols having this problem, thats not the case, because non
> > > > > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > > > > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > > > > certainly do that, but the current implementation doesn't (opting instead to
> > > > > wait indefinitely until the respective packet(s) have transmitted or errored
> > > > > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > > > > sockets should honor SNDTIMEO, then I can certainly do that.
> > > > >
> > > > > As for progress made by calling local_bh_enable, My read of the code doesn't
> > > > > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > > > > preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
> > > > > which ignores pending softirqs on re-enablement.
> > > >
> > > > Ah, I'm mistaken there, then.
> > > >
> > > > >  Perhaps that needs to change,
> > > > > but I'm averse to making scheduler changes for this (the aforementioned concern
> > > > > about complex changes for a narrow use case)
> > > > >
> > > > > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > > > > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > > > > that gets back to making complex changes for a narrow problem domain
> > > > >
> > > > > As for the complexity of the solution, I think this is, given your
> > > > > comments the least complex and intrusive change to solve the given problem.
> > > >
> > > > Could it be simpler to ensure do_softirq() gets run here? That would
> > > > allow progress for this case.
> > > >
> > > > >  We
> > > > > need to find a way to force the calling task off the cpu while the asynchronous
> > > > > operations in the transmit path complete, and we can do that this way, or by
> > > > > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > > > > alter the current protocol behavior without consensus on that.
> > > >
> > > > In general SCHED_FIFO is dangerous with regard to stalling other
> > > > progress, incl. ksoftirqd. But it does appear that this packet socket
> > > > case is special inside networking in calling schedule() directly here.
> > > >
> > > > If converting that, should it convert to logic more akin to other
> > > > sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> > > > the pros and cons of completion here yet, sorry. Didn't want to delay
> > > > responding until after I get a chance.
> > > >
> > > So, I started looking at implementing SOCK_SNDTIMEO for this patch, and
> > > something occurred to me....We still need a mechanism to block in tpacket_snd.
> > > That is to say, other protocols use SK_SNDTIMEO to wait for socket memory to
> > > become available, and that requirement doesn't exist for memory mapped sockets
> > > in AF_PACKET (which tpacket_snd implements the kernel side for).  We have memory
> > > mapped frame buffers, which we marshall with an otherwise empty skb, and just
> > > send that (i.e. no waiting on socket memory, we just produce an error if we
> > > don't have enough ram to allocate an sk_buff).  Given that, we only ever need to
> > > wait for a frame to complete transmission, or get freed in an error path further
> > > down the stack.  This probably explains why SK_SNDTIMEO doesn't exist for
> > > AF_PACKET.
> >
> > SNDTIMEO behavior would still be useful: to wait for frame slot to
> > become available, but only up to a timeout?
> >
> Ok, that's fair.  To be clear, memory_mapped packets aren't waiting here for a
> frame to become available for sending (thats done in userspace, where the
> application checks a specific memory location for the TP_AVAILABLE status to be
> set, so a new frame can be written).  tpacket_snd is waiting for the transmission
> of a specific frame to complete the transmit action, which is a different thing.

Right. Until this report I was actually not even aware of this
behavior without MSG_DONTWAIT.

Though the wait is not for a specific frame, right? Wait as long as
the pending_refcnt, which is incremented on every loop.

> Still, I suppose there's no reason we couldn't constrain that wait on a timeout
> set by SK_SNDTIMEO
>
> > To be clear, adding that is not a prerequisite for fixing this
> > specific issue, of course. It would just be nice if the one happens to
> > be fixed by adding the other.
> >
> > My main question is wrt the implementation details of struct
> > completion. Without dynamic memory allocation,
> > sock_wait_for_wmem/sk_stream_write_space obviously does not make
> > sense. But should we still use sk_wq and more importantly does this
> > need the same wait semantics (TASK_INTERRUPTIBLE) and does struct
> > completion give those?
> >
> I suppose we could overload its use here, but I would be worried that such an
> overload would be confusing.  Nominally, sk_wq is used, as you note, to block
> sockets whose allocated send space is full, until such time as enough frames
> have been sent to make space for the next write.  In this scenario, we already
> know we have space to send a frame (by virtue of the fact that we are executing
> > in tpacket_snd, which is only called after userspace has written a frame to the
> memory mapped buffer already allocated for frame transmission).  In this case we
> are simply waiting for the last frame that we have sent to complete
> transmission, at which point we can look to see if more frames are available to
> send, or return from the system call.  I'm happy to take an alternate consensus
> into account, but for the sake of clarity I think I would rather use the
> completion queue, as it makes clear the correlation between the waiter and the
> event we are waiting on.

That will be less likely to have unexpected side-effects. Agreed.
Thanks for the explanation.

Only last question, does it have the right wait behavior? Should this
be wait_for_completion_interruptible?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-21 20:06             ` Willem de Bruijn
@ 2019-06-22 11:08               ` Neil Horman
  0 siblings, 0 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-22 11:08 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Fri, Jun 21, 2019 at 04:06:09PM -0400, Willem de Bruijn wrote:
> On Fri, Jun 21, 2019 at 3:18 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Fri, Jun 21, 2019 at 02:31:17PM -0400, Willem de Bruijn wrote:
> > > On Fri, Jun 21, 2019 at 12:42 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > >
> > > > On Thu, Jun 20, 2019 at 11:16:13AM -0400, Willem de Bruijn wrote:
> > > > > On Thu, Jun 20, 2019 at 10:24 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > > >
> > > > > > On Thu, Jun 20, 2019 at 09:41:30AM -0400, Willem de Bruijn wrote:
> > > > > > > On Wed, Jun 19, 2019 at 4:26 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> > > > > > > >
> > > > > > > > When an application is run that:
> > > > > > > > a) Sets its scheduler to be SCHED_FIFO
> > > > > > > > and
> > > > > > > > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > > > > > > > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > > > > > > > forever in the kernel.  This occurs because when waiting, the code in
> > > > > > > > tpacket_snd calls schedule, which under normal circumstances allows
> > > > > > > > other tasks to run, including ksoftirqd, which in some cases is
> > > > > > > > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > > > > > > > destructor that flips the status bit of the transmitted frame back to
> > > > > > > > available, allowing the transmitting task to complete).
> > > > > > > >
> > > > > > > > However, when the calling application is SCHED_FIFO, its priority is
> > > > > > > > such that the schedule call immediately places the task back on the cpu,
> > > > > > > > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > > > > > > > transmitting task from detecting that the transmission is complete.
> > > > > > > >
> > > > > > > > We can fix this by converting the schedule call to a completion
> > > > > > > > mechanism.  By using a completion queue, we force the calling task, when
> > > > > > > > it detects there are no more frames to send, to schedule itself off the
> > > > > > > > cpu until such time as the last transmitted skb is freed, allowing
> > > > > > > > forward progress to be made.
> > > > > > > >
> > > > > > > > Tested by myself and the reporter, with good results
> > > > > > > >
> > > > > > > > Applies to the net tree
> > > > > > > >
> > > > > > > > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > > > > > > > Reported-by: Matteo Croce <mcroce@redhat.com>
> > > > > > > > CC: "David S. Miller" <davem@davemloft.net>
> > > > > > > > ---
> > > > > > >
> > > > > > > This is a complex change for a narrow configuration. Isn't a
> > > > > > > SCHED_FIFO process preempting ksoftirqd a potential problem for other
> > > > > > > networking workloads as well? And the right configuration to always
> > > > > > > increase ksoftirqd priority when increasing another process's
> > > > > > > priority? Also, even when ksoftirqd kicks in, isn't some progress
> > > > > > > still made on the local_bh_enable reached from schedule()?
> > > > > > >
> > > > > >
> > > > > > A few questions here to answer:
> > > > >
> > > > > Thanks for the detailed explanation.
> > > > >
> > > > > > Regarding other protocols having this problem, thats not the case, because non
> > > > > > packet sockets honor the SK_SNDTIMEO option here (i.e. they sleep for a period
> > > > > > of time specified by the SNDTIMEO option if MSG_DONTWAIT isn't set.  We could
> > > > > > certainly do that, but the current implementation doesn't (opting instead to
> > > > > > wait indefinitely until the respective packet(s) have transmitted or errored
> > > > > > out), and I wanted to maintain that behavior.  If there is consensus that packet
> > > > > > sockets should honor SNDTIMEO, then I can certainly do that.
> > > > > >
> > > > > > As for progress made by calling local_bh_enable, My read of the code doesn't
> > > > > > have the scheduler calling local_bh_enable at all.  Instead schedule uses
> > > > > > preempt_disable/preempt_enable_no_resched() to gain exclusive access to the cpu,
> > > > > > which ignores pending softirqs on re-enablement.
> > > > >
> > > > > Ah, I'm mistaken there, then.
> > > > >
> > > > > >  Perhaps that needs to change,
> > > > > > but I'm averse to making scheduler changes for this (the aforementioned concern
> > > > > > about complex changes for a narrow use case)
> > > > > >
> > > > > > Regarding raising the priority of ksoftirqd, that could be a solution, but the
> > > > > > priority would need to be raised to a high priority SCHED_FIFO parameter, and
> > > > > > that gets back to making complex changes for a narrow problem domain
> > > > > >
> > > > > > As for the complexity of the solution, I think this is, given your
> > > > > > comments the least complex and intrusive change to solve the given problem.
> > > > >
> > > > > Could it be simpler to ensure do_softirq() gets run here? That would
> > > > > allow progress for this case.
> > > > >
> > > > > >  We
> > > > > > need to find a way to force the calling task off the cpu while the asynchronous
> > > > > > operations in the transmit path complete, and we can do that this way, or by
> > > > > > honoring SK_SNDTIMEO.  I'm fine with doing the latter, but I didn't want to
> > > > > > alter the current protocol behavior without consensus on that.
> > > > >
> > > > > In general SCHED_FIFO is dangerous with regard to stalling other
> > > > > progress, incl. ksoftirqd. But it does appear that this packet socket
> > > > > case is special inside networking in calling schedule() directly here.
> > > > >
> > > > > If converting that, should it convert to logic more akin to other
> > > > > sockets, like sock_wait_for_wmem? I haven't had a chance to read up on
> > > > > the pros and cons of completion here yet, sorry. Didn't want to delay
> > > > > responding until after I get a chance.
> > > > >
> > > > So, I started looking at implementing SOCK_SNDTIMEO for this patch, and
> > > > something occurred to me....We still need a mechanism to block in tpacket_snd.
> > > > That is to say, other protocols use SK_SNDTIMEO to wait for socket memory to
> > > > become available, and that requirement doesn't exist for memory mapped sockets
> > > > in AF_PACKET (which tpacket_snd implements the kernel side for).  We have memory
> > > > mapped frame buffers, which we marshall with an otherwise empty skb, and just
> > > > send that (i.e. no waiting on socket memory, we just produce an error if we
> > > > don't have enough ram to allocate an sk_buff).  Given that, we only ever need to
> > > > wait for a frame to complete transmission, or get freed in an error path further
> > > > down the stack.  This probably explains why SK_SNDTIMEO doesn't exist for
> > > > AF_PACKET.
> > >
> > > SNDTIMEO behavior would still be useful: to wait for frame slot to
> > > become available, but only up to a timeout?
> > >
> > Ok, that's fair.  To be clear, memory_mapped packets aren't waiting here for a
> > frame to become available for sending (thats done in userspace, where the
> > application checks a specific memory location for the TP_AVAILABLE status to be
> > set, so a new frame can be written).  tpacket_snd is waiting for the transmission
> > of a specific frame to complete the transmit action, which is a different thing.
> 
> Right. Until this report I was actually not even aware of this
> behavior without MSG_DONTWAIT.
> 
> Though the wait is not for a specific frame, right? Wait as long as
> the pending_refcnt, which is incremented on every loop.
> 
> > Still, I suppose there's no reason we couldn't constrain that wait on a timeout
> > set by SK_SNDTIMEO
> >
> > > To be clear, adding that is not a prerequisite for fixing this
> > > specific issue, of course. It would just be nice if the one happens to
> > > be fixed by adding the other.
> > >
> > > My main question is wrt the implementation details of struct
> > > completion. Without dynamic memory allocation,
> > > sock_wait_for_wmem/sk_stream_write_space obviously does not make
> > > sense. But should we still use sk_wq and more importantly does this
> > > need the same wait semantics (TASK_INTERRUPTIBLE) and does struct
> > > completion give those?
> > >
> > I suppose we could overload its use here, but I would be worried that such an
> > overload would be confusing.  Nominally, sk_wq is used, as you note, to block
> > sockets whose allocated send space is full, until such time as enough frames
> > have been sent to make space for the next write.  In this scenario, we already
> > know we have space to send a frame (by virtue of the fact that we are executing
> > > in tpacket_snd, which is only called after userspace has written a frame to the
> > memory mapped buffer already allocated for frame transmission).  In this case we
> > are simply waiting for the last frame that we have sent to complete
> > transmission, at which point we can look to see if more frames are available to
> > send, or return from the system call.  I'm happy to take an alternate consensus
> > into account, but for the sake of clarity I think I would rather use the
> > completion queue, as it makes clear the correlation between the waiter and the
> > event we are waiting on.
> 
> That will be less likely to have unexpected side-effects. Agreed.
> Thanks for the explanation.
> 
> Only last question, does it have the right wait behavior? Should this
> be wait_for_completion_interruptible?
> 
That's a good point.  Other protocols use the socket timeout along with
sk_wait_event, which sets the task state TASK_INTERRUPTIBLE, so we should
probably use wait_for_completion_interruptible_timeout

Thanks!

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-19 20:25 [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET Neil Horman
  2019-06-20 13:41 ` Willem de Bruijn
@ 2019-06-22 17:41 ` Neil Horman
  2019-06-23  2:12   ` Willem de Bruijn
  2019-06-24  0:46 ` [PATCH v3 " Neil Horman
  2019-06-25 21:57 ` [PATCH v4 " Neil Horman
  3 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-22 17:41 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Matteo Croce, David S. Miller, Willem de Bruijn

When an application is run that:
a) Sets its scheduler to be SCHED_FIFO
and
b) Opens a memory mapped AF_PACKET socket, and sends frames with the
MSG_DONTWAIT flag cleared, its possible for the application to hang
forever in the kernel.  This occurs because when waiting, the code in
tpacket_snd calls schedule, which under normal circumstances allows
other tasks to run, including ksoftirqd, which in some cases is
responsible for freeing the transmitted skb (which in AF_PACKET calls a
destructor that flips the status bit of the transmitted frame back to
available, allowing the transmitting task to complete).

However, when the calling application is SCHED_FIFO, its priority is
such that the schedule call immediately places the task back on the cpu,
preventing ksoftirqd from freeing the skb, which in turn prevents the
transmitting task from detecting that the transmission is complete.

We can fix this by converting the schedule call to a completion
mechanism.  By using a completion queue, we force the calling task, when
it detects there are no more frames to send, to schedule itself off the
cpu until such time as the last transmitted skb is freed, allowing
forward progress to be made.

Tested by myself and the reporter, with good results

Applies to the net tree

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Matteo Croce <mcroce@redhat.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>

Change Notes:

V1->V2:
	Enhance the sleep logic to support being interruptible and
allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)
---
 net/packet/af_packet.c | 60 ++++++++++++++++++++++++++++++++----------
 net/packet/internal.h  |  2 ++
 2 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a29d66da7394..8ddb2f7aebb4 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -358,7 +358,8 @@ static inline struct page * __pure pgv_to_page(void *addr)
 	return virt_to_page(addr);
 }
 
-static void __packet_set_status(struct packet_sock *po, void *frame, int status)
+static void __packet_set_status(struct packet_sock *po, void *frame, int status,
+				bool call_complete)
 {
 	union tpacket_uhdr h;
 
@@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		BUG();
 	}
 
+	if (po->wait_on_complete && call_complete)
+		complete(&po->skb_completion);
 	smp_wmb();
 }
 
@@ -1148,6 +1151,14 @@ static void *packet_previous_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, previous, status);
 }
 
+static void *packet_next_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+	unsigned int next = rb->head != rb->frame_max ? rb->head+1 : 0;
+	return packet_lookup_frame(po, rb, next, status);
+}
+
 static void packet_increment_head(struct packet_ring_buffer *buff)
 {
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
@@ -2360,7 +2371,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 #endif
 
 	if (po->tp_version <= TPACKET_V2) {
-		__packet_set_status(po, h.raw, status);
+		__packet_set_status(po, h.raw, status, false);
 		sk->sk_data_ready(sk);
 	} else {
 		prb_clear_blk_fill_status(&po->rx_ring);
@@ -2400,7 +2411,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 		packet_dec_pending(&po->tx_ring);
 
 		ts = __packet_set_timestamp(po, ph, skb);
-		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts, true);
 	}
 
 	sock_wfree(skb);
@@ -2585,13 +2596,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct net_device *dev;
 	struct virtio_net_hdr *vnet_hdr = NULL;
 	struct sockcm_cookie sockc;
 	__be16 proto;
 	int err, reserve = 0;
-	void *ph;
+	void *ph = NULL;
 	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
 	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
 	unsigned char *addr = NULL;
@@ -2600,9 +2611,12 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen, copylen = 0;
+	long timeo;
 
 	mutex_lock(&po->pg_vec_lock);
 
+	timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
+
 	if (likely(saddr == NULL)) {
 		dev	= packet_cached_dev_get(po);
 		proto	= po->num;
@@ -2647,16 +2661,29 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	do {
+
+		if (po->wait_on_complete && need_wait) {
+			timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
+			po->wait_on_complete = 0;
+			if (!timeo) {
+				/* We timed out, break out and notify userspace */
+				err = -ETIMEDOUT;
+				goto out_status;
+			} else if (timeo == -ERESTARTSYS) {
+				err = -ERESTARTSYS;
+				goto out_status;
+			}
+		}
+
 		ph = packet_current_frame(po, &po->tx_ring,
 					  TP_STATUS_SEND_REQUEST);
-		if (unlikely(ph == NULL)) {
-			if (need_wait && need_resched())
-				schedule();
-			continue;
-		}
+
+		if (likely(ph == NULL))
+			break;
 
 		skb = NULL;
 		tp_len = tpacket_parse_header(po, ph, size_max, &data);
+
 		if (tp_len < 0)
 			goto tpacket_error;
 
@@ -2699,7 +2726,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 tpacket_error:
 			if (po->tp_loss) {
 				__packet_set_status(po, ph,
-						TP_STATUS_AVAILABLE);
+						TP_STATUS_AVAILABLE, false);
 				packet_increment_head(&po->tx_ring);
 				kfree_skb(skb);
 				continue;
@@ -2719,7 +2746,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		}
 
 		skb->destructor = tpacket_destruct_skb;
-		__packet_set_status(po, ph, TP_STATUS_SENDING);
+		__packet_set_status(po, ph, TP_STATUS_SENDING, false);
+		if (!packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST))
+			po->wait_on_complete = 1;
 		packet_inc_pending(&po->tx_ring);
 
 		status = TP_STATUS_SEND_REQUEST;
@@ -2753,8 +2782,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	goto out_put;
 
 out_status:
-	__packet_set_status(po, ph, status);
-	kfree_skb(skb);
+	if (ph)
+		__packet_set_status(po, ph, status, false);
+	if (skb)
+		kfree_skb(skb);
 out_put:
 	dev_put(dev);
 out:
@@ -3207,6 +3238,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_init_data(sock, sk);
 
 	po = pkt_sk(sk);
+	init_completion(&po->skb_completion);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
 	po->xmit = dev_queue_xmit;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 3bb7c5fb3bff..bbb4be2c18e7 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -128,6 +128,8 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_tstamp;
+	struct completion	skb_completion;
+	unsigned int		wait_on_complete;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-22 17:41 ` [PATCH v2 " Neil Horman
@ 2019-06-23  2:12   ` Willem de Bruijn
  2019-06-23  2:21     ` Willem de Bruijn
  2019-06-23 11:34     ` Neil Horman
  0 siblings, 2 replies; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-23  2:12 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Sat, Jun 22, 2019 at 1:42 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> When an application is run that:
> a) Sets its scheduler to be SCHED_FIFO
> and
> b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> MSG_DONTWAIT flag cleared, it's possible for the application to hang
> forever in the kernel.  This occurs because when waiting, the code in
> tpacket_snd calls schedule, which under normal circumstances allows
> other tasks to run, including ksoftirqd, which in some cases is
> responsible for freeing the transmitted skb  (which in AF_PACKET calls a
> destructor that flips the status bit of the transmitted frame back to
> available, allowing the transmitting task to complete).
>
> However, when the calling application is SCHED_FIFO, its priority is
> such that the schedule call immediately places the task back on the cpu,
> preventing ksoftirqd from freeing the skb, which in turn prevents the
> transmitting task from detecting that the transmission is complete.
>
> We can fix this by converting the schedule call to a completion
> mechanism.  By using a completion queue, we force the calling task, when
> it detects there are no more frames to send, to schedule itself off the
> cpu until such time as the last transmitted skb is freed, allowing
> forward progress to be made.
>
> Tested by myself and the reporter, with good results
>
> Applies to the net tree
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> Reported-by: Matteo Croce <mcroce@redhat.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
>
> Change Notes:
>
> V1->V2:
>         Enhance the sleep logic to support being interruptible and
> allowing for honoring of SK_SNDTIMEO (Willem de Bruijn)
> ---
>  net/packet/af_packet.c | 60 ++++++++++++++++++++++++++++++++----------
>  net/packet/internal.h  |  2 ++
>  2 files changed, 48 insertions(+), 14 deletions(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index a29d66da7394..8ddb2f7aebb4 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -358,7 +358,8 @@ static inline struct page * __pure pgv_to_page(void *addr)
>         return virt_to_page(addr);
>  }
>
> -static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> +static void __packet_set_status(struct packet_sock *po, void *frame, int status,
> +                               bool call_complete)
>  {
>         union tpacket_uhdr h;
>
> @@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
>                 BUG();
>         }
>
> +       if (po->wait_on_complete && call_complete)
> +               complete(&po->skb_completion);

This wake need not happen before the barrier. Only one caller of
__packet_set_status passes call_complete (tpacket_destruct_skb).
Moving this branch to the caller avoids a lot of code churn.

Also, multiple packets may be released before the process is awoken.
The process will block until packet_read_pending drops to zero. Can
defer the wait_on_complete to that one instance.

>         smp_wmb();
>  }
>
> @@ -1148,6 +1151,14 @@ static void *packet_previous_frame(struct packet_sock *po,
>         return packet_lookup_frame(po, rb, previous, status);
>  }
>
> +static void *packet_next_frame(struct packet_sock *po,
> +               struct packet_ring_buffer *rb,
> +               int status)
> +{
> +       unsigned int next = rb->head != rb->frame_max ? rb->head+1 : 0;
> +       return packet_lookup_frame(po, rb, next, status);
> +}
> +
>  static void packet_increment_head(struct packet_ring_buffer *buff)
>  {
>         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
> @@ -2360,7 +2371,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
>  #endif
>
>         if (po->tp_version <= TPACKET_V2) {
> -               __packet_set_status(po, h.raw, status);
> +               __packet_set_status(po, h.raw, status, false);
>                 sk->sk_data_ready(sk);
>         } else {
>                 prb_clear_blk_fill_status(&po->rx_ring);
> @@ -2400,7 +2411,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
>                 packet_dec_pending(&po->tx_ring);
>
>                 ts = __packet_set_timestamp(po, ph, skb);
> -               __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> +               __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts, true);
>         }
>
>         sock_wfree(skb);
> @@ -2585,13 +2596,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
>
>  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>  {
> -       struct sk_buff *skb;
> +       struct sk_buff *skb = NULL;
>         struct net_device *dev;
>         struct virtio_net_hdr *vnet_hdr = NULL;
>         struct sockcm_cookie sockc;
>         __be16 proto;
>         int err, reserve = 0;
> -       void *ph;
> +       void *ph = NULL;
>         DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
>         bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
>         unsigned char *addr = NULL;
> @@ -2600,9 +2611,12 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>         int len_sum = 0;
>         int status = TP_STATUS_AVAILABLE;
>         int hlen, tlen, copylen = 0;
> +       long timeo;
>
>         mutex_lock(&po->pg_vec_lock);
>
> +       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> +
>         if (likely(saddr == NULL)) {
>                 dev     = packet_cached_dev_get(po);
>                 proto   = po->num;
> @@ -2647,16 +2661,29 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>                 size_max = dev->mtu + reserve + VLAN_HLEN;
>
>         do {
> +
> +               if (po->wait_on_complete && need_wait) {
> +                       timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);

Why move the sleeping location from where it was with schedule()?
Without that change, ph and skb are both guaranteed to be NULL after
packet_current_frame, so can jump to out_put and avoid adding branches
at out_status. And no need for packet_next_frame.

Just replace schedule with a sleeping function in place (removing the
then obsolete need_resched call).

That is a much shorter patch and easier to compare for correctness
with the current code.

minor: probably preferable to first check local variable need_wait
before reading a struct member.

> +                       po->wait_on_complete = 0;
> +                       if (!timeo) {
> +                               /* We timed out, break out and notify userspace */
> +                               err = -ETIMEDOUT;
> +                               goto out_status;
> +                       } else if (timeo == -ERESTARTSYS) {
> +                               err = -ERESTARTSYS;
> +                               goto out_status;
> +                       }
> +               }
> +
>                 ph = packet_current_frame(po, &po->tx_ring,
>                                           TP_STATUS_SEND_REQUEST);
> -               if (unlikely(ph == NULL)) {
> -                       if (need_wait && need_resched())
> -                               schedule();
> -                       continue;
> -               }
> +
> +               if (likely(ph == NULL))

why switch from unlikely to likely?

> +                       break;
>
>                 skb = NULL;
>                 tp_len = tpacket_parse_header(po, ph, size_max, &data);
> +

nit: irrelevant

>                 if (tp_len < 0)
>                         goto tpacket_error;
>
> @@ -2699,7 +2726,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>  tpacket_error:
>                         if (po->tp_loss) {
>                                 __packet_set_status(po, ph,
> -                                               TP_STATUS_AVAILABLE);
> +                                               TP_STATUS_AVAILABLE, false);
>                                 packet_increment_head(&po->tx_ring);
>                                 kfree_skb(skb);
>                                 continue;
> @@ -2719,7 +2746,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>                 }
>
>                 skb->destructor = tpacket_destruct_skb;
> -               __packet_set_status(po, ph, TP_STATUS_SENDING);
> +               __packet_set_status(po, ph, TP_STATUS_SENDING, false);
> +               if (!packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST))
> +                       po->wait_on_complete = 1;
>                 packet_inc_pending(&po->tx_ring);
>
>                 status = TP_STATUS_SEND_REQUEST;
> @@ -2753,8 +2782,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>         goto out_put;
>
>  out_status:
> -       __packet_set_status(po, ph, status);
> -       kfree_skb(skb);
> +       if (ph)
> +               __packet_set_status(po, ph, status, false);
> +       if (skb)
> +               kfree_skb(skb);
>  out_put:
>         dev_put(dev);
>  out:
> @@ -3207,6 +3238,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
>         sock_init_data(sock, sk);
>
>         po = pkt_sk(sk);
> +       init_completion(&po->skb_completion);
>         sk->sk_family = PF_PACKET;
>         po->num = proto;
>         po->xmit = dev_queue_xmit;
> diff --git a/net/packet/internal.h b/net/packet/internal.h
> index 3bb7c5fb3bff..bbb4be2c18e7 100644
> --- a/net/packet/internal.h
> +++ b/net/packet/internal.h
> @@ -128,6 +128,8 @@ struct packet_sock {
>         unsigned int            tp_hdrlen;
>         unsigned int            tp_reserve;
>         unsigned int            tp_tstamp;
> +       struct completion       skb_completion;
> +       unsigned int            wait_on_complete;

Probably belong in packet_ring_buffer. Near pending_refcnt as touched
in the same code. And because protected by the ring buffer mutex.



>         struct net_device __rcu *cached_dev;
>         int                     (*xmit)(struct sk_buff *skb);
>         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
> --
> 2.20.1
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-23  2:12   ` Willem de Bruijn
@ 2019-06-23  2:21     ` Willem de Bruijn
  2019-06-23 11:40       ` Neil Horman
  2019-06-23 11:34     ` Neil Horman
  1 sibling, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-23  2:21 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

> > -static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > +static void __packet_set_status(struct packet_sock *po, void *frame, int status,
> > +                               bool call_complete)
> >  {
> >         union tpacket_uhdr h;
> >
> > @@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> >                 BUG();
> >         }
> >
> > +       if (po->wait_on_complete && call_complete)
> > +               complete(&po->skb_completion);
>
> This wake need not happen before the barrier. Only one caller of
> __packet_set_status passes call_complete (tpacket_destruct_skb).
> Moving this branch to the caller avoids a lot of code churn.
>
> Also, multiple packets may be released before the process is awoken.
> The process will block until packet_read_pending drops to zero. Can
> defer the wait_on_complete to that one instance.

Eh no. The point of having this sleep in the send loop is that
additional slots may be released for transmission (flipped to
TP_STATUS_SEND_REQUEST) from another thread while this thread is
waiting.

Else, it would have been much simpler to move the wait below the send
loop: send as many packets as possible, then wait for all of them
having been released. Much clearer control flow.

Where to set and clear the wait_on_complete boolean remains. Integer
assignment is fragile, as the compiler and processor may optimize or
move simple seemingly independent operations. As complete() takes a
spinlock, avoiding that in the DONTWAIT case is worthwhile. But probably
still preferable to set when beginning waiting and clear when calling
complete.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-23  2:12   ` Willem de Bruijn
  2019-06-23  2:21     ` Willem de Bruijn
@ 2019-06-23 11:34     ` Neil Horman
  1 sibling, 0 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-23 11:34 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Sat, Jun 22, 2019 at 10:12:46PM -0400, Willem de Bruijn wrote:
> On Sat, Jun 22, 2019 at 1:42 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > When an application is run that:
> > a) Sets its scheduler to be SCHED_FIFO
> > and
> > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > MSG_DONTWAIT flag cleared, it's possible for the application to hang
> > forever in the kernel.  This occurs because when waiting, the code in
> > tpacket_snd calls schedule, which under normal circumstances allows
> > other tasks to run, including ksoftirqd, which in some cases is
> > responsible for freeing the transmitted skb  (which in AF_PACKET calls a
> > destructor that flips the status bit of the transmitted frame back to
> > available, allowing the transmitting task to complete).
> >
> > However, when the calling application is SCHED_FIFO, its priority is
> > such that the schedule call immediately places the task back on the cpu,
> > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > transmitting task from detecting that the transmission is complete.
> >
> > We can fix this by converting the schedule call to a completion
> > mechanism.  By using a completion queue, we force the calling task, when
> > it detects there are no more frames to send, to schedule itself off the
> > cpu until such time as the last transmitted skb is freed, allowing
> > forward progress to be made.
> >
> > Tested by myself and the reporter, with good results
> >
> > Applies to the net tree
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > Reported-by: Matteo Croce <mcroce@redhat.com>
> > CC: "David S. Miller" <davem@davemloft.net>
> > CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> >
> > Change Notes:
> >
> > V1->V2:
> >         Enhance the sleep logic to support being interruptible and
> > allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)
> > ---
> >  net/packet/af_packet.c | 60 ++++++++++++++++++++++++++++++++----------
> >  net/packet/internal.h  |  2 ++
> >  2 files changed, 48 insertions(+), 14 deletions(-)
> >
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index a29d66da7394..8ddb2f7aebb4 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -358,7 +358,8 @@ static inline struct page * __pure pgv_to_page(void *addr)
> >         return virt_to_page(addr);
> >  }
> >
> > -static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > +static void __packet_set_status(struct packet_sock *po, void *frame, int status,
> > +                               bool call_complete)
> >  {
> >         union tpacket_uhdr h;
> >
> > @@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> >                 BUG();
> >         }
> >
> > +       if (po->wait_on_complete && call_complete)
> > +               complete(&po->skb_completion);
> 
> This wake need not happen before the barrier. Only one caller of
> __packet_set_status passes call_complete (tpacket_destruct_skb).
> Moving this branch to the caller avoids a lot of code churn.
> 
Yeah, thats a fair point, I'll adjust that.

> Also, multiple packets may be released before the process is awoken.
> The process will block until packet_read_pending drops to zero. Can
> defer the wait_on_complete to that one instance.
> 
Also true, we can gate the complete call on wait_on_complete and
packet_read_pending(...) == 0
> >         smp_wmb();
> >  }
> >
> > @@ -1148,6 +1151,14 @@ static void *packet_previous_frame(struct packet_sock *po,
> >         return packet_lookup_frame(po, rb, previous, status);
> >  }
> >
> > +static void *packet_next_frame(struct packet_sock *po,
> > +               struct packet_ring_buffer *rb,
> > +               int status)
> > +{
> > +       unsigned int next = rb->head != rb->frame_max ? rb->head+1 : 0;
> > +       return packet_lookup_frame(po, rb, next, status);
> > +}
> > +
> >  static void packet_increment_head(struct packet_ring_buffer *buff)
> >  {
> >         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
> > @@ -2360,7 +2371,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
> >  #endif
> >
> >         if (po->tp_version <= TPACKET_V2) {
> > -               __packet_set_status(po, h.raw, status);
> > +               __packet_set_status(po, h.raw, status, false);
> >                 sk->sk_data_ready(sk);
> >         } else {
> >                 prb_clear_blk_fill_status(&po->rx_ring);
> > @@ -2400,7 +2411,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
> >                 packet_dec_pending(&po->tx_ring);
> >
> >                 ts = __packet_set_timestamp(po, ph, skb);
> > -               __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> > +               __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts, true);
> >         }
> >
> >         sock_wfree(skb);
> > @@ -2585,13 +2596,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
> >
> >  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >  {
> > -       struct sk_buff *skb;
> > +       struct sk_buff *skb = NULL;
> >         struct net_device *dev;
> >         struct virtio_net_hdr *vnet_hdr = NULL;
> >         struct sockcm_cookie sockc;
> >         __be16 proto;
> >         int err, reserve = 0;
> > -       void *ph;
> > +       void *ph = NULL;
> >         DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
> >         bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
> >         unsigned char *addr = NULL;
> > @@ -2600,9 +2611,12 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >         int len_sum = 0;
> >         int status = TP_STATUS_AVAILABLE;
> >         int hlen, tlen, copylen = 0;
> > +       long timeo;
> >
> >         mutex_lock(&po->pg_vec_lock);
> >
> > +       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > +
> >         if (likely(saddr == NULL)) {
> >                 dev     = packet_cached_dev_get(po);
> >                 proto   = po->num;
> > @@ -2647,16 +2661,29 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >                 size_max = dev->mtu + reserve + VLAN_HLEN;
> >
> >         do {
> > +
> > +               if (po->wait_on_complete && need_wait) {
> > +                       timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> 
> Why move the sleeping location from where it was with schedule()?
> Without that change, ph and skb are both guaranteed to be NULL after
> packet_current_frame, so can jump to out_put and avoid adding branches
> at out_status. And no need for packet_next_frame.
> 
> Just replace schedule with a sleeping function in place (removing the
> then obsolete need_resched call).
> 
> That is a much shorter patch and easier to compare for correctness
> with the current code.
> 
> minor: probably preferable to first check local variable need_wait
> before reading a struct member.
> 
> > +                       po->wait_on_complete = 0;
> > +                       if (!timeo) {
> > +                               /* We timed out, break out and notify userspace */
> > +                               err = -ETIMEDOUT;
> > +                               goto out_status;
> > +                       } else if (timeo == -ERESTARTSYS) {
> > +                               err = -ERESTARTSYS;
> > +                               goto out_status;
> > +                       }
> > +               }
> > +
> >                 ph = packet_current_frame(po, &po->tx_ring,
> >                                           TP_STATUS_SEND_REQUEST);
> > -               if (unlikely(ph == NULL)) {
> > -                       if (need_wait && need_resched())
> > -                               schedule();
> > -                       continue;
> > -               }
> > +
> > +               if (likely(ph == NULL))
> 
> why switch from unlikely to likely?
> 
> > +                       break;
> >
> >                 skb = NULL;
> >                 tp_len = tpacket_parse_header(po, ph, size_max, &data);
> > +
> 
> nit: irrelevant
> 
> >                 if (tp_len < 0)
> >                         goto tpacket_error;
> >
> > @@ -2699,7 +2726,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >  tpacket_error:
> >                         if (po->tp_loss) {
> >                                 __packet_set_status(po, ph,
> > -                                               TP_STATUS_AVAILABLE);
> > +                                               TP_STATUS_AVAILABLE, false);
> >                                 packet_increment_head(&po->tx_ring);
> >                                 kfree_skb(skb);
> >                                 continue;
> > @@ -2719,7 +2746,9 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >                 }
> >
> >                 skb->destructor = tpacket_destruct_skb;
> > -               __packet_set_status(po, ph, TP_STATUS_SENDING);
> > +               __packet_set_status(po, ph, TP_STATUS_SENDING, false);
> > +               if (!packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST))
> > +                       po->wait_on_complete = 1;
> >                 packet_inc_pending(&po->tx_ring);
> >
> >                 status = TP_STATUS_SEND_REQUEST;
> > @@ -2753,8 +2782,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >         goto out_put;
> >
> >  out_status:
> > -       __packet_set_status(po, ph, status);
> > -       kfree_skb(skb);
> > +       if (ph)
> > +               __packet_set_status(po, ph, status, false);
> > +       if (skb)
> > +               kfree_skb(skb);
> >  out_put:
> >         dev_put(dev);
> >  out:
> > @@ -3207,6 +3238,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
> >         sock_init_data(sock, sk);
> >
> >         po = pkt_sk(sk);
> > +       init_completion(&po->skb_completion);
> >         sk->sk_family = PF_PACKET;
> >         po->num = proto;
> >         po->xmit = dev_queue_xmit;
> > diff --git a/net/packet/internal.h b/net/packet/internal.h
> > index 3bb7c5fb3bff..bbb4be2c18e7 100644
> > --- a/net/packet/internal.h
> > +++ b/net/packet/internal.h
> > @@ -128,6 +128,8 @@ struct packet_sock {
> >         unsigned int            tp_hdrlen;
> >         unsigned int            tp_reserve;
> >         unsigned int            tp_tstamp;
> > +       struct completion       skb_completion;
> > +       unsigned int            wait_on_complete;
> 
> Probably belong in packet_ring_buffer. Near pending_refcnt as touched
> in the same code. And because protected by the ring buffer mutex.
> 
> 
> 
> >         struct net_device __rcu *cached_dev;
> >         int                     (*xmit)(struct sk_buff *skb);
> >         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
> > --
> > 2.20.1
> >
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-23  2:21     ` Willem de Bruijn
@ 2019-06-23 11:40       ` Neil Horman
  2019-06-23 14:39         ` Willem de Bruijn
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-23 11:40 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Sat, Jun 22, 2019 at 10:21:31PM -0400, Willem de Bruijn wrote:
> > > -static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > > +static void __packet_set_status(struct packet_sock *po, void *frame, int status,
> > > +                               bool call_complete)
> > >  {
> > >         union tpacket_uhdr h;
> > >
> > > @@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > >                 BUG();
> > >         }
> > >
> > > +       if (po->wait_on_complete && call_complete)
> > > +               complete(&po->skb_completion);
> >
> > This wake need not happen before the barrier. Only one caller of
> > __packet_set_status passes call_complete (tpacket_destruct_skb).
> > Moving this branch to the caller avoids a lot of code churn.
> >
> > Also, multiple packets may be released before the process is awoken.
> > The process will block until packet_read_pending drops to zero. Can
> > defer the wait_on_complete to that one instance.
> 
> Eh no. The point of having this sleep in the send loop is that
> additional slots may be released for transmission (flipped to
> TP_STATUS_SEND_REQUEST) from another thread while this thread is
> waiting.
> 
That's incorrect.  The entirety of tpacket_snd is protected by a mutex. No other
thread can alter the state of the frames in the vector from the kernel send path
while this thread is waiting.

> Else, it would have been much simpler to move the wait below the send
> loop: send as many packets as possible, then wait for all of them
> having been released. Much clearer control flow.
> 
That's (almost) what happens now.  The only difference is that with this
implementation, the waiting thread has the opportunity to see if userspace has
queued more frames for transmission during the wait period.  We could
potentially change that, but thats outside the scope of this fix.

> Where to set and clear the wait_on_complete boolean remains. Integer
> assignment is fragile, as the compiler and processor may optimize or
> move simple seemingly independent operations. As complete() takes a
> spinlock, avoiding that in the DONTWAIT case is worthwhile. But probably
> still preferable to set when beginning waiting and clear when calling
> complete.
We avoid any call to wait_for_completion or complete already, based on the gating
of the need_wait variable in tpacket_snd.  If the transmitting thread sets
MSG_DONTWAIT in the flags of the msg structure, we will never set
wait_on_complete, and so we will never manipulate the completion queue.

Neil

> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-23 11:40       ` Neil Horman
@ 2019-06-23 14:39         ` Willem de Bruijn
  2019-06-23 19:21           ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-23 14:39 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Sun, Jun 23, 2019 at 7:40 AM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Sat, Jun 22, 2019 at 10:21:31PM -0400, Willem de Bruijn wrote:
> > > > -static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > > > +static void __packet_set_status(struct packet_sock *po, void *frame, int status,
> > > > +                               bool call_complete)
> > > >  {
> > > >         union tpacket_uhdr h;
> > > >
> > > > @@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > > >                 BUG();
> > > >         }
> > > >
> > > > +       if (po->wait_on_complete && call_complete)
> > > > +               complete(&po->skb_completion);
> > >
> > > This wake need not happen before the barrier. Only one caller of
> > > __packet_set_status passes call_complete (tpacket_destruct_skb).
> > > Moving this branch to the caller avoids a lot of code churn.
> > >
> > > Also, multiple packets may be released before the process is awoken.
> > > The process will block until packet_read_pending drops to zero. Can
> > > defer the wait_on_complete to that one instance.
> >
> > Eh no. The point of having this sleep in the send loop is that
> > additional slots may be released for transmission (flipped to
> > TP_STATUS_SEND_REQUEST) from another thread while this thread is
> > waiting.
> >
> Thats incorrect.  The entirety of tpacket_snd is protected by a mutex. No other
> thread can alter the state of the frames in the vector from the kernel send path
> while this thread is waiting.

I meant another user thread updating the memory mapped ring contents.

> > Else, it would have been much simpler to move the wait below the send
> > loop: send as many packets as possible, then wait for all of them
> > having been released. Much clearer control flow.
> >
> Thats (almost) what happens now.  The only difference is that with this
> implementation, the waiting thread has the opportunity to see if userspace has
> queued more frames for transmission during the wait period.  We could
> potentially change that, but thats outside the scope of this fix.

Agreed. I think the current, more complex, behavior was intentional.
We could still restructure to move it out of the loop and jump back.
But, yes, definitely out of scope for a fix.

> > Where to set and clear the wait_on_complete boolean remains. Integer
> > assignment is fragile, as the compiler and processor may optimize or
> > move simple seemingly independent operations. As complete() takes a
> > spinlock, avoiding that in the DONTWAIT case is worthwhile. But probably
> > still preferable to set when beginning waiting and clear when calling
> > complete.
> We avoid any call to wait_for_completion or complete already, based on the gating
> of the need_wait variable in tpacket_snd.  If the transmitting thread sets
> MSG_DONTWAIT in the flags of the msg structure, we will never set
> wait_on_complete, and so we will never manipulate the completion queue.

But we don't know the state of this at tpacket_destruct_skb time without
wait_for_completion?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v2 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-23 14:39         ` Willem de Bruijn
@ 2019-06-23 19:21           ` Neil Horman
  0 siblings, 0 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-23 19:21 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Sun, Jun 23, 2019 at 10:39:12AM -0400, Willem de Bruijn wrote:
> On Sun, Jun 23, 2019 at 7:40 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Sat, Jun 22, 2019 at 10:21:31PM -0400, Willem de Bruijn wrote:
> > > > > -static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > > > > +static void __packet_set_status(struct packet_sock *po, void *frame, int status,
> > > > > +                               bool call_complete)
> > > > >  {
> > > > >         union tpacket_uhdr h;
> > > > >
> > > > > @@ -381,6 +382,8 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> > > > >                 BUG();
> > > > >         }
> > > > >
> > > > > +       if (po->wait_on_complete && call_complete)
> > > > > +               complete(&po->skb_completion);
> > > >
> > > > This wake need not happen before the barrier. Only one caller of
> > > > __packet_set_status passes call_complete (tpacket_destruct_skb).
> > > > Moving this branch to the caller avoids a lot of code churn.
> > > >
> > > > Also, multiple packets may be released before the process is awoken.
> > > > The process will block until packet_read_pending drops to zero. Can
> > > > defer the wait_on_complete to that one instance.
> > >
> > > Eh no. The point of having this sleep in the send loop is that
> > > additional slots may be released for transmission (flipped to
> > > TP_STATUS_SEND_REQUEST) from another thread while this thread is
> > > waiting.
> > >
> > Thats incorrect.  The entirety of tpacket_snd is protected by a mutex. No other
> > thread can alter the state of the frames in the vector from the kernel send path
> > while this thread is waiting.
> 
> I meant another user thread updating the memory mapped ring contents.
> 
Yes, thats true, and if that happens, we will loop through this path again (the
do..while section, picking up the next frame for transmit)

> > > Else, it would have been much simpler to move the wait below the send
> > > loop: send as many packets as possible, then wait for all of them
> > > having been released. Much clearer control flow.
> > >
> > Thats (almost) what happens now.  The only difference is that with this
> > implementation, the waiting thread has the opportunity to see if userspace has
> > queued more frames for transmission during the wait period.  We could
> > potentially change that, but thats outside the scope of this fix.
> 
> Agreed. I think the current, more complex, behavior was intentional.
> We could still restructure to move it out of the loop and jump back.
> But, yes, definitely out of scope for a fix.
> 
Yes, it was, though based on your comments I've moved the wait_for_completion
call to the bottom of the loop, so its only checked after we are guaranteed to
have sent at least one frame.  I think that makes the code a bit more legible.

> > > Where to set and clear the wait_on_complete boolean remains. Integer
> > > assignment is fragile, as the compiler and processor may optimize or
> > > move simple seemingly independent operations. As complete() takes a
> > > spinlock, avoiding that in the DONTWAIT case is worthwhile. But probably
> > > still preferable to set when beginning waiting and clear when calling
> > > complete.
> > We avoid any call to wait_for_complete or complete already, based on the gating
> > of the need_wait variable in tpacket_snd.  If the transmitting thread sets
> > MSG_DONTWAIT in the flags of the msg structure, we will never set
> > wait_for_complete, and so we will never manipulate the completion queue.
> 
> But we don't know the state of this at tpacket_destruct_skb time without
> wait_for_completion?
> 
Sure we do, wait_for_complete is stored in the packet_sock structure, which is
available and stable at the time tpacket_destruct_skb is called.
po->wait_for_complete is set in tpacket_snd iff:
1) The MSG_DONTWAIT flag is clear
and
2) We have detected that the next frame in the memory mapped buffer does not
have its status set to TP_STATUS_SEND_REQUEST.

If those two conditions are true, we set po->wait_for_complete to 1, which
indicates that tpacket_destruct_skb should call complete, when all the frames
we've sent to the physical layer have been freed (i.e. when packet_read_pending
is zero).

If wait_for_complete is non-zero, we also can be confident that the
calling task is either:
a) Already blocking on wait_for_completion_interruptible_timeout
or
b) Will be waiting on it shortly

In case (a) the blocking/transmitting task will be woken up, and continue on its
way

In case (b) the transmitting task will call
wait_for_completion_interruptible_timeout, see that the completion has already
been called (based on the completion structs done variable being positive), and
return immediately.

I've made a slight update to the logic/comments in my next version to make that a little
more clear

Neil


^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-19 20:25 [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET Neil Horman
  2019-06-20 13:41 ` Willem de Bruijn
  2019-06-22 17:41 ` [PATCH v2 " Neil Horman
@ 2019-06-24  0:46 ` Neil Horman
  2019-06-24 18:08   ` Willem de Bruijn
  2019-06-25 21:57 ` [PATCH v4 " Neil Horman
  3 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-24  0:46 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Matteo Croce, David S. Miller, Willem de Bruijn

When an application is run that:
a) Sets its scheduler to be SCHED_FIFO
and
b) Opens a memory mapped AF_PACKET socket, and sends frames with the
MSG_DONTWAIT flag cleared, it's possible for the application to hang
forever in the kernel.  This occurs because when waiting, the code in
tpacket_snd calls schedule, which under normal circumstances allows
other tasks to run, including ksoftirqd, which in some cases is
responsible for freeing the transmitted skb (which in AF_PACKET calls a
destructor that flips the status bit of the transmitted frame back to
available, allowing the transmitting task to complete).

However, when the calling application is SCHED_FIFO, its priority is
such that the schedule call immediately places the task back on the cpu,
preventing ksoftirqd from freeing the skb, which in turn prevents the
transmitting task from detecting that the transmission is complete.

We can fix this by converting the schedule call to a completion
mechanism.  By using a completion queue, we force the calling task, when
it detects there are no more frames to send, to schedule itself off the
cpu until such time as the last transmitted skb is freed, allowing
forward progress to be made.

Tested by myself and the reporter, with good results

Applies to the net tree

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Matteo Croce <mcroce@redhat.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>

Change Notes:

V1->V2:
	Enhance the sleep logic to support being interruptible and
allowing for honoring SK_SNDTIMEO (Willem de Bruijn)

V2->V3:
	Rearrange the point at which we wait for the completion queue, to
avoid needing to check for ph/skb being null at the end of the loop.
Also move the complete call to the skb destructor to avoid needing to
modify __packet_set_status.  Also gate calling complete on
packet_read_pending returning zero to avoid multiple calls to complete.
(Willem de Bruijn)

	Move timeo computation within loop, to re-fetch the socket
timeout since we also use the timeo variable to record the return code
from the wait_for_complete call (Neil Horman)
---
 net/packet/af_packet.c | 59 +++++++++++++++++++++++++++++++++++++-----
 net/packet/internal.h  |  2 ++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a29d66da7394..5c48bb7a4fa5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -380,7 +380,6 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 	}
-
 	smp_wmb();
 }
 
@@ -1148,6 +1147,14 @@ static void *packet_previous_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, previous, status);
 }
 
+static void *packet_next_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status)
+{
+	unsigned int next = rb->head != rb->frame_max ? rb->head+1 : 0;
+	return packet_lookup_frame(po, rb, next, status);
+}
+
 static void packet_increment_head(struct packet_ring_buffer *buff)
 {
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
@@ -2401,6 +2408,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 
 		ts = __packet_set_timestamp(po, ph, skb);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+
+		if (po->wait_on_complete && !packet_read_pending(&po->tx_ring))
+			complete(&po->skb_completion);
 	}
 
 	sock_wfree(skb);
@@ -2600,9 +2610,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen, copylen = 0;
+	long timeo = 0;
 
 	mutex_lock(&po->pg_vec_lock);
 
+
 	if (likely(saddr == NULL)) {
 		dev	= packet_cached_dev_get(po);
 		proto	= po->num;
@@ -2647,16 +2659,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
 	do {
+
 		ph = packet_current_frame(po, &po->tx_ring,
 					  TP_STATUS_SEND_REQUEST);
-		if (unlikely(ph == NULL)) {
-			if (need_wait && need_resched())
-				schedule();
-			continue;
-		}
+
+		if (unlikely(ph == NULL))
+			break;
 
 		skb = NULL;
 		tp_len = tpacket_parse_header(po, ph, size_max, &data);
+
 		if (tp_len < 0)
 			goto tpacket_error;
 
@@ -2720,6 +2732,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 		skb->destructor = tpacket_destruct_skb;
 		__packet_set_status(po, ph, TP_STATUS_SENDING);
+
+		/*
+		 * If we need to wait and we've sent the last frame pending
+		 * transmission in the mmaped buffer, flag that we need to wait
+		 * on those frames to get freed via tpacket_destruct_skb.  This
+		 * flag indicates that tpacket_destruct_skb should call complete
+		 * when the packet_pending count reaches zero, and that we need
+		 * to call wait_on_complete_interruptible_timeout below, to make
+		 * sure we pick up the result of that completion
+		 */
+		if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
+			po->wait_on_complete = 1;
+			timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
+		}
+
 		packet_inc_pending(&po->tx_ring);
 
 		status = TP_STATUS_SEND_REQUEST;
@@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 			err = net_xmit_errno(err);
 			if (err && __packet_get_status(po, ph) ==
 				   TP_STATUS_AVAILABLE) {
+				/* re-init completion queue to avoid subsequent fallthrough
+				 * on a future thread calling wait_on_complete_interruptible_timeout
+				 */
+				po->wait_on_complete = 0;
+				init_completion(&po->skb_completion);
 				/* skb was destructed already */
 				skb = NULL;
 				goto out_status;
@@ -2740,6 +2772,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		}
 		packet_increment_head(&po->tx_ring);
 		len_sum += tp_len;
+
+		if (po->wait_on_complete) {
+			timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
+			po->wait_on_complete = 0;
+			if (!timeo) {
+				/* We timed out, break out and notify userspace */
+				err = -ETIMEDOUT;
+				goto out_status;
+			} else if (timeo == -ERESTARTSYS) {
+				err = -ERESTARTSYS;
+				goto out_status;
+			}
+		}
+
 	} while (likely((ph != NULL) ||
 		/* Note: packet_read_pending() might be slow if we have
 		 * to call it as it's per_cpu variable, but in fast-path
@@ -3207,6 +3253,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_init_data(sock, sk);
 
 	po = pkt_sk(sk);
+	init_completion(&po->skb_completion);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
 	po->xmit = dev_queue_xmit;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 3bb7c5fb3bff..bbb4be2c18e7 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -128,6 +128,8 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_tstamp;
+	struct completion	skb_completion;
+	unsigned int		wait_on_complete;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-24  0:46 ` [PATCH v3 " Neil Horman
@ 2019-06-24 18:08   ` Willem de Bruijn
  2019-06-24 21:51     ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-24 18:08 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Sun, Jun 23, 2019 at 8:46 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> When an application is run that:
> a) Sets its scheduler to be SCHED_FIFO
> and
> b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> MSG_DONTWAIT flag cleared, its possible for the application to hang
> forever in the kernel.  This occurs because when waiting, the code in
> tpacket_snd calls schedule, which under normal circumstances allows
> other tasks to run, including ksoftirqd, which in some cases is
> responsible for freeing the transmitted skb (which in AF_PACKET calls a
> destructor that flips the status bit of the transmitted frame back to
> available, allowing the transmitting task to complete).
>
> However, when the calling application is SCHED_FIFO, its priority is
> such that the schedule call immediately places the task back on the cpu,
> preventing ksoftirqd from freeing the skb, which in turn prevents the
> transmitting task from detecting that the transmission is complete.
>
> We can fix this by converting the schedule call to a completion
> mechanism.  By using a completion queue, we force the calling task, when
> it detects there are no more frames to send, to schedule itself off the
> cpu until such time as the last transmitted skb is freed, allowing
> forward progress to be made.
>
> Tested by myself and the reporter, with good results
>
> Appies to the net tree
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> Reported-by: Matteo Croce <mcroce@redhat.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
>
> Change Notes:
>
> V1->V2:
>         Enhance the sleep logic to support being interruptible and
> allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)
>
> V2->V3:
>         Rearrage the point at which we wait for the completion queue, to
> avoid needing to check for ph/skb being null at the end of the loop.
> Also move the complete call to the skb destructor to avoid needing to
> modify __packet_set_status.  Also gate calling complete on
> packet_read_pending returning zero to avoid multiple calls to complete.
> (Willem de Bruijn)
>
>         Move timeo computation within loop, to re-fetch the socket
> timeout since we also use the timeo variable to record the return code
> from the wait_for_complete call (Neil Horman)
> ---
>  net/packet/af_packet.c | 59 +++++++++++++++++++++++++++++++++++++-----
>  net/packet/internal.h  |  2 ++
>  2 files changed, 55 insertions(+), 6 deletions(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index a29d66da7394..5c48bb7a4fa5 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -380,7 +380,6 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
>                 WARN(1, "TPACKET version not supported.\n");
>                 BUG();
>         }
> -

Unrelated to this feature

>         smp_wmb();
>  }
>
> @@ -1148,6 +1147,14 @@ static void *packet_previous_frame(struct packet_sock *po,
>         return packet_lookup_frame(po, rb, previous, status);
>  }
>
> +static void *packet_next_frame(struct packet_sock *po,
> +               struct packet_ring_buffer *rb,
> +               int status)
> +{
> +       unsigned int next = rb->head != rb->frame_max ? rb->head+1 : 0;
> +       return packet_lookup_frame(po, rb, next, status);
> +}
> +
>  static void packet_increment_head(struct packet_ring_buffer *buff)
>  {
>         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
> @@ -2401,6 +2408,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
>
>                 ts = __packet_set_timestamp(po, ph, skb);
>                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> +
> +               if (po->wait_on_complete && !packet_read_pending(&po->tx_ring))
> +                       complete(&po->skb_completion);
>         }
>
>         sock_wfree(skb);
> @@ -2600,9 +2610,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>         int len_sum = 0;
>         int status = TP_STATUS_AVAILABLE;
>         int hlen, tlen, copylen = 0;
> +       long timeo = 0;
>
>         mutex_lock(&po->pg_vec_lock);
>
> +

Same

>         if (likely(saddr == NULL)) {
>                 dev     = packet_cached_dev_get(po);
>                 proto   = po->num;
> @@ -2647,16 +2659,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>                 size_max = dev->mtu + reserve + VLAN_HLEN;
>
>         do {
> +

Same

>                 ph = packet_current_frame(po, &po->tx_ring,
>                                           TP_STATUS_SEND_REQUEST);
> -               if (unlikely(ph == NULL)) {
> -                       if (need_wait && need_resched())
> -                               schedule();
> -                       continue;

Why not keep the test whether the process needs to wait exactly here (A)?

Then no need for packet_next_frame.

> -               }
> +
> +               if (unlikely(ph == NULL))
> +                       break;
>
>                 skb = NULL;
>                 tp_len = tpacket_parse_header(po, ph, size_max, &data);
> +

Again

>                 if (tp_len < 0)
>                         goto tpacket_error;
>
> @@ -2720,6 +2732,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>
>                 skb->destructor = tpacket_destruct_skb;
>                 __packet_set_status(po, ph, TP_STATUS_SENDING);
> +
> +               /*
> +                * If we need to wait and we've sent the last frame pending
> +                * transmission in the mmaped buffer, flag that we need to wait
> +                * on those frames to get freed via tpacket_destruct_skb.  This
> +                * flag indicates that tpacket_destruct_skb should call complete
> +                * when the packet_pending count reaches zero, and that we need
> +                * to call wait_on_complete_interruptible_timeout below, to make
> +                * sure we pick up the result of that completion
> +                */
> +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> +                       po->wait_on_complete = 1;
> +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);

This resets the timeout on every loop iteration; it should only be set once, above the loop.

Also, please limit the comments in the code (also below). If every
patch would add this many lines of comment, the file would be
enormous. OTOH, it's great to be this explanatory in the git commit,
which is easily reached for any line with git blame.

> +               }
> +
>                 packet_inc_pending(&po->tx_ring);
>
>                 status = TP_STATUS_SEND_REQUEST;
> @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>                         err = net_xmit_errno(err);
>                         if (err && __packet_get_status(po, ph) ==
>                                    TP_STATUS_AVAILABLE) {
> +                               /* re-init completion queue to avoid subsequent fallthrough
> +                                * on a future thread calling wait_on_complete_interruptible_timeout
> +                                */
> +                               po->wait_on_complete = 0;

If the flag is set where the sleeping occurs, there is no need to reset it when
a failure happens between those blocks.

> +                               init_completion(&po->skb_completion);

no need to reinit between each use?

>                                 /* skb was destructed already */
>                                 skb = NULL;
>                                 goto out_status;
> @@ -2740,6 +2772,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>                 }
>                 packet_increment_head(&po->tx_ring);
>                 len_sum += tp_len;
> +
> +               if (po->wait_on_complete) {
> +                       timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> +                       po->wait_on_complete = 0;

I was going to argue for clearing in tpacket_destruct_skb. But then we
would have to separate clear on timeout instead of signal, too.

  po->wait_on_complete = 1;
  timeo = wait_for_completion...
  po->wait_on_complete = 0;

as the previous version had is fine, as long as the compiler does not
"optimize" away an assignment. The function call will avoid reordering
by the cpu, at least. Probably requires WRITE_ONCE/READ_ONCE.

> +                       if (!timeo) {
> +                               /* We timed out, break out and notify userspace */
> +                               err = -ETIMEDOUT;
> +                               goto out_status;

goto out_put, there is no active ph or skb here

> +                       } else if (timeo == -ERESTARTSYS) {
> +                               err = -ERESTARTSYS;
> +                               goto out_status;
> +                       }
> +               }
> +
>         } while (likely((ph != NULL) ||
>                 /* Note: packet_read_pending() might be slow if we have
>                  * to call it as it's per_cpu variable, but in fast-path
> @@ -3207,6 +3253,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
>         sock_init_data(sock, sk);
>
>         po = pkt_sk(sk);
> +       init_completion(&po->skb_completion);
>         sk->sk_family = PF_PACKET;
>         po->num = proto;
>         po->xmit = dev_queue_xmit;

This is basically replacing a busy-wait with schedule() with sleeping
using wait_for_completion_interruptible_timeout. My main question is
does this really need to move control flow around and add
packet_next_frame? If not, especially for net, the shortest, simplest
change is preferable.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-24 18:08   ` Willem de Bruijn
@ 2019-06-24 21:51     ` Neil Horman
  2019-06-24 22:15       ` Willem de Bruijn
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-24 21:51 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Mon, Jun 24, 2019 at 02:08:43PM -0400, Willem de Bruijn wrote:
> On Sun, Jun 23, 2019 at 8:46 PM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > When an application is run that:
> > a) Sets its scheduler to be SCHED_FIFO
> > and
> > b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> > MSG_DONTWAIT flag cleared, its possible for the application to hang
> > forever in the kernel.  This occurs because when waiting, the code in
> > tpacket_snd calls schedule, which under normal circumstances allows
> > other tasks to run, including ksoftirqd, which in some cases is
> > responsible for freeing the transmitted skb (which in AF_PACKET calls a
> > destructor that flips the status bit of the transmitted frame back to
> > available, allowing the transmitting task to complete).
> >
> > However, when the calling application is SCHED_FIFO, its priority is
> > such that the schedule call immediately places the task back on the cpu,
> > preventing ksoftirqd from freeing the skb, which in turn prevents the
> > transmitting task from detecting that the transmission is complete.
> >
> > We can fix this by converting the schedule call to a completion
> > mechanism.  By using a completion queue, we force the calling task, when
> > it detects there are no more frames to send, to schedule itself off the
> > cpu until such time as the last transmitted skb is freed, allowing
> > forward progress to be made.
> >
> > Tested by myself and the reporter, with good results
> >
> > Appies to the net tree
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > Reported-by: Matteo Croce <mcroce@redhat.com>
> > CC: "David S. Miller" <davem@davemloft.net>
> > CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> >
> > Change Notes:
> >
> > V1->V2:
> >         Enhance the sleep logic to support being interruptible and
> > allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)
> >
> > V2->V3:
> >         Rearrage the point at which we wait for the completion queue, to
> > avoid needing to check for ph/skb being null at the end of the loop.
> > Also move the complete call to the skb destructor to avoid needing to
> > modify __packet_set_status.  Also gate calling complete on
> > packet_read_pending returning zero to avoid multiple calls to complete.
> > (Willem de Bruijn)
> >
> >         Move timeo computation within loop, to re-fetch the socket
> > timeout since we also use the timeo variable to record the return code
> > from the wait_for_complete call (Neil Horman)
> > ---
> >  net/packet/af_packet.c | 59 +++++++++++++++++++++++++++++++++++++-----
> >  net/packet/internal.h  |  2 ++
> >  2 files changed, 55 insertions(+), 6 deletions(-)
> >
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index a29d66da7394..5c48bb7a4fa5 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -380,7 +380,6 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
> >                 WARN(1, "TPACKET version not supported.\n");
> >                 BUG();
> >         }
> > -
> 
> Unrelated to this feature
> 
Agreed.

> >         smp_wmb();
><snip>

> 
> >                 ph = packet_current_frame(po, &po->tx_ring,
> >                                           TP_STATUS_SEND_REQUEST);
> > -               if (unlikely(ph == NULL)) {
> > -                       if (need_wait && need_resched())
> > -                               schedule();
> > -                       continue;
> 
> Why not keep the test whether the process needs to wait exactly here (A)?
> 
As I said in the changelog, I think it makes the code more readable, to
understand that you are waiting for an event to complete after you send the
frame.

> Then no need for packet_next_frame.
> 
Thats fair.  I still think waiting at the bottom of the loop is more clear, but
it does save a function, so I'll agree to this.

> > -               }
> > +
> > +               if (unlikely(ph == NULL))
> > +                       break;
> >
> >                 skb = NULL;
> >                 tp_len = tpacket_parse_header(po, ph, size_max, &data);
> > +
> 
> Again
> 
> >                 if (tp_len < 0)
> >                         goto tpacket_error;
> >
> > @@ -2720,6 +2732,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >
> >                 skb->destructor = tpacket_destruct_skb;
> >                 __packet_set_status(po, ph, TP_STATUS_SENDING);
> > +
> > +               /*
> > +                * If we need to wait and we've sent the last frame pending
> > +                * transmission in the mmaped buffer, flag that we need to wait
> > +                * on those frames to get freed via tpacket_destruct_skb.  This
> > +                * flag indicates that tpacket_destruct_skb should call complete
> > +                * when the packet_pending count reaches zero, and that we need
> > +                * to call wait_on_complete_interruptible_timeout below, to make
> > +                * sure we pick up the result of that completion
> > +                */
> > +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> > +                       po->wait_on_complete = 1;
> > +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> 
> This resets timeout on every loop. should only set above the loop once.
> 
I explained exactly why I did that in the change log.  Its because I reuse the
timeout variable to get the return value of the wait_for_complete call.
Otherwise I need to add additional data to the stack, which I don't want to do.
Sock_sndtimeo is an inline function and really doesn't add any overhead to this
path, so I see no reason not to reuse the variable.

> Also, please limit the comments in the code (also below). If every
> patch would add this many lines of comment, the file would be
> enormous. OTOH, it's great to be this explanatory in the git commit,
> which is easily reached for any line with git blame.
> 
> > +               }
> > +
> >                 packet_inc_pending(&po->tx_ring);
> >
> >                 status = TP_STATUS_SEND_REQUEST;
> > @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >                         err = net_xmit_errno(err);
> >                         if (err && __packet_get_status(po, ph) ==
> >                                    TP_STATUS_AVAILABLE) {
> > +                               /* re-init completion queue to avoid subsequent fallthrough
> > +                                * on a future thread calling wait_on_complete_interruptible_timeout
> > +                                */
> > +                               po->wait_on_complete = 0;
> 
> If setting where sleeping, no need for resetting if a failure happens
> between those blocks.
> 
> > +                               init_completion(&po->skb_completion);
> 
> no need to reinit between each use?
> 
I explained exactly why I did this in the comment above.  We have to set
wait_for_complete prior to calling transmit, so as to ensure that we call
wait_for_completion before we exit the loop. However, in this error case, we
exit the loop prior to calling wait_for_complete, so we need to reset the
completion variable and the wait_for_complete flag.  Otherwise we will be in a
case where, on the next entrance to this loop we will have a completion variable
with completion->done > 0, meaning the next wait will be a fall through case,
which we don't want.

> >                                 /* skb was destructed already */
> >                                 skb = NULL;
> >                                 goto out_status;
> > @@ -2740,6 +2772,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >                 }
> >                 packet_increment_head(&po->tx_ring);
> >                 len_sum += tp_len;
> > +
> > +               if (po->wait_on_complete) {
> > +                       timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> > +                       po->wait_on_complete = 0;
> 
> I was going to argue for clearing in tpacket_destruct_skb. But then we
> would have to separate clear on timeout instead of signal, too.
> 
>   po->wait_on_complete = 1;
>   timeo = wait_for_completion...
>   po->wait_on_complete = 0;
> 
Also, we would have a race condition, since the destructor may be called from
softirq context (the first cause of the bug I'm fixing here), and so if the
packet is freed prior to us checking wait_for_complete in tpacket_snd, we will
be in the above situation again, exiting the loop with a completion variable in
an improper state.

> as the previous version had is fine, as long as the compiler does not
> "optimize" away an assignment. The function call will avoid reordering
> by the cpu, at least. Probably requires WRITE_ONCE/READ_ONCE.
> 
> > +                       if (!timeo) {
> > +                               /* We timed out, break out and notify userspace */
> > +                               err = -ETIMEDOUT;
> > +                               goto out_status;
> 
> goto out_put, there is no active ph or skb here
> 
Yes, good catch.

> > +                       } else if (timeo == -ERESTARTSYS) {
> > +                               err = -ERESTARTSYS;
> > +                               goto out_status;
> > +                       }
> > +               }
> > +
> >         } while (likely((ph != NULL) ||
> >                 /* Note: packet_read_pending() might be slow if we have
> >                  * to call it as it's per_cpu variable, but in fast-path
> > @@ -3207,6 +3253,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
> >         sock_init_data(sock, sk);
> >
> >         po = pkt_sk(sk);
> > +       init_completion(&po->skb_completion);
> >         sk->sk_family = PF_PACKET;
> >         po->num = proto;
> >         po->xmit = dev_queue_xmit;
> 
> This is basically replacing a busy-wait with schedule() with sleeping
> using wait_for_completion_interruptible_timeout. My main question is
> does this really need to move control flow around and add
> packet_next_frame? If not, especially for net, the shortest, simplest
> change is preferable.
> 
It's not replacing a busy wait at all, it's replacing a non-blocking schedule with
a blocking schedule (via completion queues).  As for control flow, I'm not sure
why you are bound to the existing control flow, and given that we already have
packet_previous_frame, I didn't see anything egregious about adding
packet_next_frame as well, but since you've seen a way to eliminate it, I'm ok
with it.

Neil
 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-24 21:51     ` Neil Horman
@ 2019-06-24 22:15       ` Willem de Bruijn
  2019-06-25 11:02         ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-24 22:15 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

> > > +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> > > +                       po->wait_on_complete = 1;
> > > +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> >
> > This resets timeout on every loop. should only set above the loop once.
> >
> I explained exactly why I did that in the change log.  Its because I reuse the
> timeout variable to get the return value of the wait_for_complete call.
> Otherwise I need to add additional data to the stack, which I don't want to do.
> Sock_sndtimeo is an inline function and really doesn't add any overhead to this
> path, so I see no reason not to reuse the variable.

The issue isn't the reuse. It is that timeo is reset to sk_sndtimeo
each time. Whereas wait_for_common and its variants return the
number of jiffies left in case the loop needs to sleep again later.

Reading sock_sndtimeo once and passing it to wait_.. repeatedly is a
common pattern across the stack.

> > > @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > >                         err = net_xmit_errno(err);
> > >                         if (err && __packet_get_status(po, ph) ==
> > >                                    TP_STATUS_AVAILABLE) {
> > > +                               /* re-init completion queue to avoid subsequent fallthrough
> > > +                                * on a future thread calling wait_on_complete_interruptible_timeout
> > > +                                */
> > > +                               po->wait_on_complete = 0;
> >
> > If setting where sleeping, no need for resetting if a failure happens
> > between those blocks.
> >
> > > +                               init_completion(&po->skb_completion);
> >
> > no need to reinit between each use?
> >
> I explained exactly why I did this in the comment above.  We have to set
> wait_for_complete prior to calling transmit, so as to ensure that we call
> wait_for_completion before we exit the loop. However, in this error case, we
> exit the loop prior to calling wait_for_complete, so we need to reset the
> completion variable and the wait_for_complete flag.  Otherwise we will be in a
> case where, on the next entrace to this loop we will have a completion variable
> with completion->done > 0, meaning the next wait will be a fall through case,
> which we don't want.

By moving back to the point where schedule() is called, hopefully this
complexity automatically goes away. Same as my comment to the line
immediately above.

> > > @@ -2740,6 +2772,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > >                 }
> > >                 packet_increment_head(&po->tx_ring);
> > >                 len_sum += tp_len;
> > > +
> > > +               if (po->wait_on_complete) {
> > > +                       timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> > > +                       po->wait_on_complete = 0;
> >
> > I was going to argue for clearing in tpacket_destruct_skb. But then we
> > would have to separate clear on timeout instead of signal, too.
> >
> >   po->wait_on_complete = 1;
> >   timeo = wait_for_completion...
> >   po->wait_on_complete = 0;
> >
> Also, we would have a race condition, since the destructor may be called from
> softirq context (the first cause of the bug I'm fixing here), and so if the
> packet is freed prior to us checking wait_for_complete in tpacket_snd, we will
> be in the above situation again, exiting the loop with a completion variable in
> an improper state.

Good point.

The common pattern is to clear in tpacket_destruct_skb. Then
we do need to handle the case where the wait is interrupted or
times out and reset it in those cases.

> > This is basically replacing a busy-wait with schedule() with sleeping
> > using wait_for_completion_interruptible_timeout. My main question is
> > does this really need to move control flow around and add
> > packet_next_frame? If not, especially for net, the shortest, simplest
> > change is preferable.
> >
> Its not replacing a busy wait at all, its replacing a non-blocking schedule with
> a blocking schedule (via completion queues).  As for control flow, Im not sure I
> why you are bound to the existing control flow, and given that we already have
> packet_previous_frame, I didn't see anything egregious about adding
> packet_next_frame as well, but since you've seen a way to eliminate it, I'm ok
> with it.

The benefit of keeping to the existing control flow is that that is a
smaller change, so easier to verify.

I understand the benefit of moving the wait outside the loop. Before
this report, I was not even aware of that behavior on !MSG_DONTWAIT,
because it is so co-located.

But moving it elsewhere in the loop does not have the same benefit,
imho. Either way, I think we better leave any such code improvements
to net-next and focus on the minimal , least risky, patch for net.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-24 22:15       ` Willem de Bruijn
@ 2019-06-25 11:02         ` Neil Horman
  2019-06-25 13:37           ` Willem de Bruijn
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-25 11:02 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Mon, Jun 24, 2019 at 06:15:29PM -0400, Willem de Bruijn wrote:
> > > > +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> > > > +                       po->wait_on_complete = 1;
> > > > +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > >
> > > This resets timeout on every loop. should only set above the loop once.
> > >
> > I explained exactly why I did that in the change log.  Its because I reuse the
> > timeout variable to get the return value of the wait_for_complete call.
> > Otherwise I need to add additional data to the stack, which I don't want to do.
> > Sock_sndtimeo is an inline function and really doesn't add any overhead to this
> > path, so I see no reason not to reuse the variable.
> 
> The issue isn't the reuse. It is that timeo is reset to sk_sndtimeo
> each time. Whereas wait_for_common and its variants return the
> number of jiffies left in case the loop needs to sleep again later.
> 
> Reading sock_sndtimeo once and passing it to wait_.. repeatedly is a
> common pattern across the stack.
> 
But those patterns are unique to those situations.  For instance, in
tcp_sendmsg_locked, we acquire the value of the socket timeout, and use that to
wait for the entire message send operation to complete, which consists of
potentially several blocking operations (waiting for the tcp connection to be
established, waiting for socket memory, etc).  In that situation we want to wait
for all of those operations to complete to send a single message, and fail if
they exceed the timeout in aggregate.  The semantics are different with
AF_PACKET.  In this use case, the message is in effect empty, and just used to
pass some control information.  tpacket_snd, sends as many frames from the
memory mapped buffer as possible, and on each iteration we want to wait for the
specified timeout for those frames to complete sending.  I think resetting the
timeout on each wait instance is the right way to go here.

> > > > @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > >                         err = net_xmit_errno(err);
> > > >                         if (err && __packet_get_status(po, ph) ==
> > > >                                    TP_STATUS_AVAILABLE) {
> > > > +                               /* re-init completion queue to avoid subsequent fallthrough
> > > > +                                * on a future thread calling wait_on_complete_interruptible_timeout
> > > > +                                */
> > > > +                               po->wait_on_complete = 0;
> > >
> > > If setting where sleeping, no need for resetting if a failure happens
> > > between those blocks.
> > >
> > > > +                               init_completion(&po->skb_completion);
> > >
> > > no need to reinit between each use?
> > >
> > I explained exactly why I did this in the comment above.  We have to set
> > wait_for_complete prior to calling transmit, so as to ensure that we call
> > wait_for_completion before we exit the loop. However, in this error case, we
> > exit the loop prior to calling wait_for_complete, so we need to reset the
> > completion variable and the wait_for_complete flag.  Otherwise we will be in a
> > case where, on the next entrace to this loop we will have a completion variable
> > with completion->done > 0, meaning the next wait will be a fall through case,
> > which we don't want.
> 
> By moving back to the point where schedule() is called, hopefully this
> complexity automatically goes away. Same as my comment to the line
> immediately above.
> 
It's going to change what the complexity is, actually.  I was looking at this
last night, and I realized that your assertion that we could remove
packet_next_frame came at a cost.  This is because we need to determine if we
have to wait before we call po->xmit, but we need to actually do the wait after
po->xmit (to ensure that wait_on_complete is set properly when the destructor is
called).  By moving the wait to the top of the loop and getting rid of
packet_next_frame we now have a race condition in which we might call
tpacket_destruct_skb with wait_on_complete set to false, causing us to wait for
the maximum timeout erroneously.  So I'm going to have to find a new way to
signal the need to call complete, which I think will introduce a different level
of complexity.

> > > > @@ -2740,6 +2772,20 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > >                 }
> > > >                 packet_increment_head(&po->tx_ring);
> > > >                 len_sum += tp_len;
> > > > +
> > > > +               if (po->wait_on_complete) {
> > > > +                       timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> > > > +                       po->wait_on_complete = 0;
> > >
> > > I was going to argue for clearing in tpacket_destruct_skb. But then we
> > > would have to separate clear on timeout instead of signal, too.
> > >
> > >   po->wait_on_complete = 1;
> > >   timeo = wait_for_completion...
> > >   po->wait_on_complete = 0;
> > >
> > Also, we would have a race condition, since the destructor may be called from
> > softirq context (the first cause of the bug I'm fixing here), and so if the
> > packet is freed prior to us checking wait_for_complete in tpacket_snd, we will
> > be in the above situation again, exiting the loop with a completion variable in
> > an improper state.
> 
> Good point.
> 
> The common pattern is to clear in tpacket_destruct_skb. Then
> we do need to handle the case where the wait is interrupted or
> times out and reset it in those cases.
> 
As noted above, if we restore the original control flow, the wait_on_complete
flag is not usable, as its state needs to be determined prior to actually
sending the frame to the driver via po->xmit.

I'm going to try some logic in which both tpacket_snd and tpacket_destruct_skb
key off of packet_read_pending.  I think this will require a re-initialization of
the completion queue on each entry to tpacket_snd, but perhaps that's more
palatable.

> > > This is basically replacing a busy-wait with schedule() with sleeping
> > > using wait_for_completion_interruptible_timeout. My main question is
> > > does this really need to move control flow around and add
> > > packet_next_frame? If not, especially for net, the shortest, simplest
> > > change is preferable.
> > >
> > Its not replacing a busy wait at all, its replacing a non-blocking schedule with
> > a blocking schedule (via completion queues).  As for control flow, Im not sure I
> > why you are bound to the existing control flow, and given that we already have
> > packet_previous_frame, I didn't see anything egregious about adding
> > packet_next_frame as well, but since you've seen a way to eliminate it, I'm ok
> > with it.
> 
> The benefit of keeping to the existing control flow is that that is a
> smaller change, so easier to verify.
> 
I don't think it will really, because it's going to introduce different
complexities, but we'll see for certain when I finish getting it rewritten
again.

> I understand the benefit of moving the wait outside the loop. Before
> this report, I was not even aware of that behavior on !MSG_DONTWAIT,
> because it is so co-located.
> 
> But moving it elsewhere in the loop does not have the same benefit,
> imho. Either way, I think we better leave any such code improvements
> to net-next and focus on the minimal , least risky, patch for net.
> 
Ok.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-25 11:02         ` Neil Horman
@ 2019-06-25 13:37           ` Willem de Bruijn
  2019-06-25 16:20             ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-25 13:37 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Tue, Jun 25, 2019 at 7:03 AM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Mon, Jun 24, 2019 at 06:15:29PM -0400, Willem de Bruijn wrote:
> > > > > +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> > > > > +                       po->wait_on_complete = 1;
> > > > > +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > > >
> > > > This resets timeout on every loop. should only set above the loop once.
> > > >
> > > I explained exactly why I did that in the change log.  Its because I reuse the
> > > timeout variable to get the return value of the wait_for_complete call.
> > > Otherwise I need to add additional data to the stack, which I don't want to do.
> > > Sock_sndtimeo is an inline function and really doesn't add any overhead to this
> > > path, so I see no reason not to reuse the variable.
> >
> > The issue isn't the reuse. It is that timeo is reset to sk_sndtimeo
> > each time. Whereas wait_for_common and its variants return the
> > number of jiffies left in case the loop needs to sleep again later.
> >
> > Reading sock_sndtimeo once and passing it to wait_.. repeatedly is a
> > common pattern across the stack.
> >
> But those patterns are unique to those situations.  For instance, in
> tcp_sendmsg_locked, we aquire the value of the socket timeout, and use that to
> wait for the entire message send operation to complete, which consists of
> potentially several blocking operations (waiting for the tcp connection to be
> established, waiting for socket memory, etc).  In that situation we want to wait
> for all of those operations to complete to send a single message, and fail if
> they exceed the timeout in aggregate.  The semantics are different with
> AF_PACKET.  In this use case, the message is in effect empty, and just used to
> pass some control information.  tpacket_snd, sends as many frames from the
> memory mapped buffer as possible, and on each iteration we want to wait for the
> specified timeout for those frames to complete sending.  I think resetting the
> timeout on each wait instance is the right way to go here.

I disagree. If a SO_SNDTIMEO of a given time is set, the thread should
not wait beyond that. Else what is the point of passing a specific
duration in the syscall?

Btw, we can always drop the timeo and go back to unconditional (bar
signals) waiting.

>
> > > > > @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > > >                         err = net_xmit_errno(err);
> > > > >                         if (err && __packet_get_status(po, ph) ==
> > > > >                                    TP_STATUS_AVAILABLE) {
> > > > > +                               /* re-init completion queue to avoid subsequent fallthrough
> > > > > +                                * on a future thread calling wait_on_complete_interruptible_timeout
> > > > > +                                */
> > > > > +                               po->wait_on_complete = 0;
> > > >
> > > > If setting where sleeping, no need for resetting if a failure happens
> > > > between those blocks.
> > > >
> > > > > +                               init_completion(&po->skb_completion);
> > > >
> > > > no need to reinit between each use?
> > > >
> > > I explained exactly why I did this in the comment above.  We have to set
> > > wait_for_complete prior to calling transmit, so as to ensure that we call
> > > wait_for_completion before we exit the loop. However, in this error case, we
> > > exit the loop prior to calling wait_for_complete, so we need to reset the
> > > completion variable and the wait_for_complete flag.  Otherwise we will be in a
> > > case where, on the next entrace to this loop we will have a completion variable
> > > with completion->done > 0, meaning the next wait will be a fall through case,
> > > which we don't want.
> >
> > By moving back to the point where schedule() is called, hopefully this
> > complexity automatically goes away. Same as my comment to the line
> > immediately above.
> >
> Its going to change what the complexity is, actually.  I was looking at this
> last night, and I realized that your assertion that we could remove
> packet_next_frame came at a cost.  This is because we need to determine if we
> have to wait before we call po->xmit, but we need to actually do the wait after
> po->xmit

Why? The existing method using schedule() doesn't.

Can we not just loop while sending and sleep immediately when
__packet_get_status returns !TP_STATUS_AVAILABLE?

I don't understand the need to probe the next packet to send instead
of the current.

This seems to be the crux of the disagreement. My guess is that it has
to do with setting wait_on_complete, but I don't see the problem. It
can be set immediately before going to sleep.

I don't meant to draw this out, btw, or add to your workload. If you
prefer, I can take a stab at my (admittedly hand-wavy) suggestion.

> (to ensure that wait_on_complete is set properly when the desructor is
> called).  By moving the wait to the top of the loop and getting rid of
> packet_next_frame we now have a race condition in which we might call
> tpacket_destruct_skb with wait_on_complete set to false, causing us to wait for
> the maximum timeout erroneously.  So I'm going to have to find a new way to
> signal the need to call complete, which I think will introduce a different level
> of complexity.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-25 13:37           ` Willem de Bruijn
@ 2019-06-25 16:20             ` Neil Horman
  2019-06-25 21:59               ` Willem de Bruijn
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-25 16:20 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Tue, Jun 25, 2019 at 09:37:17AM -0400, Willem de Bruijn wrote:
> On Tue, Jun 25, 2019 at 7:03 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Mon, Jun 24, 2019 at 06:15:29PM -0400, Willem de Bruijn wrote:
> > > > > > +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> > > > > > +                       po->wait_on_complete = 1;
> > > > > > +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > > > >
> > > > > This resets timeout on every loop. should only set above the loop once.
> > > > >
> > > > I explained exactly why I did that in the change log.  Its because I reuse the
> > > > timeout variable to get the return value of the wait_for_complete call.
> > > > Otherwise I need to add additional data to the stack, which I don't want to do.
> > > > Sock_sndtimeo is an inline function and really doesn't add any overhead to this
> > > > path, so I see no reason not to reuse the variable.
> > >
> > > The issue isn't the reuse. It is that timeo is reset to sk_sndtimeo
> > > each time. Whereas wait_for_common and its variants return the
> > > number of jiffies left in case the loop needs to sleep again later.
> > >
> > > Reading sock_sndtimeo once and passing it to wait_.. repeatedly is a
> > > common pattern across the stack.
> > >
> > But those patterns are unique to those situations.  For instance, in
> > tcp_sendmsg_locked, we aquire the value of the socket timeout, and use that to
> > wait for the entire message send operation to complete, which consists of
> > potentially several blocking operations (waiting for the tcp connection to be
> > established, waiting for socket memory, etc).  In that situation we want to wait
> > for all of those operations to complete to send a single message, and fail if
> > they exceed the timeout in aggregate.  The semantics are different with
> > AF_PACKET.  In this use case, the message is in effect empty, and just used to
> > pass some control information.  tpacket_snd, sends as many frames from the
> > memory mapped buffer as possible, and on each iteration we want to wait for the
> > specified timeout for those frames to complete sending.  I think resetting the
> > timeout on each wait instance is the right way to go here.
> 
> I disagree. If a SO_SNDTIMEO of a given time is set, the thread should
> not wait beyond that. Else what is the point of passing a specific
> duration in the syscall?
> 
I can appreciate that, but you said yourself that you wanted to minimize this
change.  The current behavior of AF_PACKET is to:
a) check for there being no more packets ready to send
b) if (a) is false we schedule() (nominally allowing other tasks to run)
c) when (b) is complete, check for additional frames to send, and exit if there
are not, otherwise, continue sending and waiting

In that model, a single task calling sendmsg can run in the kernel indefinitely,
if userspace continues to fill the memory mapped buffer

Given that model, resetting the timeout on each call to wait_for_completion
keeps that behavior.  In fact, if we allow the timeout value to get continuously
decremented, it will be possible for a call to sendmsg in this protocol to
_always_ return -ETIMEDOUT (if we keep the buffer full in userspace long enough,
then the sending task will eventually decrement timeo to zero, and force an
-ETIMEDOUT call).

I'm convinced that resetting the timeout here is the right way to go.

> Btw, we can always drop the timeo and go back to unconditional (bar
> signals) waiting.
> 
We could, but it would be nice to implement the timeout feature here if
possible.

> >
> > > > > > @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > > > >                         err = net_xmit_errno(err);
> > > > > >                         if (err && __packet_get_status(po, ph) ==
> > > > > >                                    TP_STATUS_AVAILABLE) {
> > > > > > +                               /* re-init completion queue to avoid subsequent fallthrough
> > > > > > +                                * on a future thread calling wait_on_complete_interruptible_timeout
> > > > > > +                                */
> > > > > > +                               po->wait_on_complete = 0;
> > > > >
> > > > > If setting where sleeping, no need for resetting if a failure happens
> > > > > between those blocks.
> > > > >
> > > > > > +                               init_completion(&po->skb_completion);
> > > > >
> > > > > no need to reinit between each use?
> > > > >
> > > > I explained exactly why I did this in the comment above.  We have to set
> > > > wait_for_complete prior to calling transmit, so as to ensure that we call
> > > > wait_for_completion before we exit the loop. However, in this error case, we
> > > > exit the loop prior to calling wait_for_complete, so we need to reset the
> > > > completion variable and the wait_for_complete flag.  Otherwise we will be in a
> > > > case where, on the next entrace to this loop we will have a completion variable
> > > > with completion->done > 0, meaning the next wait will be a fall through case,
> > > > which we don't want.
> > >
> > > By moving back to the point where schedule() is called, hopefully this
> > > complexity automatically goes away. Same as my comment to the line
> > > immediately above.
> > >
> > Its going to change what the complexity is, actually.  I was looking at this
> > last night, and I realized that your assertion that we could remove
> > packet_next_frame came at a cost.  This is because we need to determine if we
> > have to wait before we call po->xmit, but we need to actually do the wait after
> > po->xmit
> 
> Why? The existing method using schedule() doesn't.
> 
Because the existing method using schedule doesn't have to rely on an
asynchronous event to wake the sending task back up.  Specifically, we need to
set some state that indicates tpacket_destruct_skb should call complete, and
since tpacket_destruct_skb is called asynchronously in a separate execution
context, we need to ensure there is no race condition whereby we execute
tpacket_destruct_skb before we set the state that we also use to determine if we
should call wait_for_complete.  ie:
1) tpacket_send_skb calls po->xmit
2) tpacket_send_skb loops around and checks to see if there are more packets to
send.  If not and if need_wait is set we call wait_for_complete

If tpacket_destruct_skb is called after (2), we're fine.  But if it's called
between (1) and (2), then tpacket_destruct_skb won't call complete (because
wait_on_complete isn't set yet), causing a hang.

Because you wanted to remove packet_next_frame, we have no way to determine if
we're done sending frames prior to calling po->xmit, which is the point past
which tpacket_destruct_skb might be called, so now I have to find an alternate
state condition to key off of.
 

> Can we not just loop while sending and sleep immediately when
> __packet_get_status returns !TP_STATUS_AVAILABLE?
> 
> I don't understand the need to probe the next packet to send instead
> of the current.
> 
See above, we can definitely check the return of ph at the top of the loop, and
sleep if it's NULL, but in so doing we can't use po->wait_on_complete as a gating
variable because we've already called po->xmit.  Once we call that function, if
we haven't already set po->wait_on_complete to true, it's possible
tpacket_destruct_skb will already have been called before we get to the point
where we check wait_for_completion.  That means we will wait on a completion
variable that never gets completed, so I need to find a new way to track whether
or not we are waiting.  It's entirely doable, I'm sure, it's just not as
straightforward as you're making it out to be.

> This seems to be the crux of the disagreement. My guess is that it has
> to do with setting wait_on_complete, but I don't see the problem. It
> can be set immediately before going to sleep.
> 
The reason has to do with the fact that
tpacket_destruct_skb can be called from ksoftirq context (i.e. it will be called
asynchronously, not from the thread running in tpacket_snd).  Consider this
flow, assuming we do as you suggest, and just set po->wait_on_complete=1
immediately prior to calling wait_for_completion_interruptible_timeout:

1) tpacket_snd gets a frame, builds the skb for transmission
2) tpacket_snd calls po->xmit(skb), sending the frame to the driver
3) tpacket_snd, iterates through back to the top of the loop, and determines
that we need to wait for the previous packet to complete
4) The driver gets a tx completion interrupt, interrupting the cpu on which we
are executing, and calls kfree_skb_irq on the skb we just transmitted
5) the ksoftirq runs on the cpu, calling kfree_skb on our skb
6) (5) calls skb->destruct (which is tpacket_skb_destruct)
7) tpacket_skb_destruct executes, sees that po->wait_on_completion is zero, and
skips calling complete()
8) the thread running tpacket_snd gets scheduled back on the cpu and now sets
po->wait_on_complete, then calls wait_for_completion_interruptible_timeout, but
since the skb we are waiting to complete has already been freed, we will never
get the completion notification, and so we will wait for the maximal timeout,
which is absolutely not what we want.

Interestingly (ironically), that control flow will never happen in the use case
I'm trying to fix here, because it's SCHED_FIFO, and will run until such time as
we call wait_for_completion_interruptible_timeout, so in this case we're safe.
But in the nominal case, where the sending task is actually SCHED_OTHER, the
above can absolutely happen, and that's very broken.

> I don't meant to draw this out, btw, or add to your workload. If you
> prefer, I can take a stab at my (admittedly hand-wavy) suggestion.
> 
No, I have another method in flight that I'm testing now, it removes the
po->wait_on_complete variable entirely, checking instead to make sure that
we've:
a) sent at least one frame
and
b) that we have a positive pending count
and
c) that we need to wait
and
d) that we have no more frames to send

This is why I was saying that your idea can be done, but it trades one form of
complexity for another.

Neil

> > (to ensure that wait_on_complete is set properly when the desructor is
> > called).  By moving the wait to the top of the loop and getting rid of
> > packet_next_frame we now have a race condition in which we might call
> > tpacket_destruct_skb with wait_on_complete set to false, causing us to wait for
> > the maximum timeout erroneously.  So I'm going to have to find a new way to
> > signal the need to call complete, which I think will introduce a different level
> > of complexity.
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH v4 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-19 20:25 [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET Neil Horman
                   ` (2 preceding siblings ...)
  2019-06-24  0:46 ` [PATCH v3 " Neil Horman
@ 2019-06-25 21:57 ` Neil Horman
  2019-06-25 22:30   ` Willem de Bruijn
  2019-06-27  2:38   ` David Miller
  3 siblings, 2 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-25 21:57 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Matteo Croce, David S. Miller, Willem de Bruijn

When an application is run that:
a) Sets its scheduler to be SCHED_FIFO
and
b) Opens a memory mapped AF_PACKET socket, and sends frames with the
MSG_DONTWAIT flag cleared, it's possible for the application to hang
forever in the kernel.  This occurs because when waiting, the code in
tpacket_snd calls schedule, which under normal circumstances allows
other tasks to run, including ksoftirqd, which in some cases is
responsible for freeing the transmitted skb (which in AF_PACKET calls a
destructor that flips the status bit of the transmitted frame back to
available, allowing the transmitting task to complete).

However, when the calling application is SCHED_FIFO, its priority is
such that the schedule call immediately places the task back on the cpu,
preventing ksoftirqd from freeing the skb, which in turn prevents the
transmitting task from detecting that the transmission is complete.

We can fix this by converting the schedule call to a completion
mechanism.  By using a completion queue, we force the calling task, when
it detects there are no more frames to send, to schedule itself off the
cpu until such time as the last transmitted skb is freed, allowing
forward progress to be made.

Tested by myself and the reporter, with good results

Applies to the net tree

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Matteo Croce <mcroce@redhat.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>

Change Notes:

V1->V2:
	Enhance the sleep logic to support being interruptible and
allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)

V2->V3:
	Rearrange the point at which we wait for the completion queue, to
avoid needing to check for ph/skb being null at the end of the loop.
Also move the complete call to the skb destructor to avoid needing to
modify __packet_set_status.  Also gate calling complete on
packet_read_pending returning zero to avoid multiple calls to complete.
(Willem de Bruijn)

	Move timeo computation within loop, to re-fetch the socket
timeout since we also use the timeo variable to record the return code
from the wait_for_complete call (Neil Horman)

V3->V4:
	Willem has requested that the control flow be restored to the
previous state.  Doing so lets us eliminate the need for the
po->wait_on_complete flag variable, and lets us get rid of the
packet_next_frame function, but introduces another complexity.
Specifically, by using the packet pending count, if an application
calls sendmsg multiple times with MSG_DONTWAIT set, each set of
transmitted frames, when complete, will cause tpacket_destruct_skb
to issue a complete call, for which there will
never be a wait_on_completion call.  This imbalance will lead to any
future call to wait_for_completion here to return early, when the frames
they sent may not have completed.  To correct this, we need to re-init
the completion queue on every call to tpacket_snd before we enter the
loop so as to ensure we wait properly for the frames we send in this
iteration.

	Change the timeout and interrupted gotos to out_put rather than
out_status so that we don't try to free a non-existent skb
	Clean up some extra newlines (Willem de Bruijn)
---
 net/packet/af_packet.c | 20 +++++++++++++++++---
 net/packet/internal.h  |  1 +
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a29d66da7394..a7ca6a003ebe 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 
 		ts = __packet_set_timestamp(po, ph, skb);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
+
+		if (!packet_read_pending(&po->tx_ring))
+			complete(&po->skb_completion);
 	}
 
 	sock_wfree(skb);
@@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 {
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	struct net_device *dev;
 	struct virtio_net_hdr *vnet_hdr = NULL;
 	struct sockcm_cookie sockc;
@@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen, copylen = 0;
+	long timeo = 0;
 
 	mutex_lock(&po->pg_vec_lock);
 
@@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
+	reinit_completion(&po->skb_completion);
+
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
 					  TP_STATUS_SEND_REQUEST);
 		if (unlikely(ph == NULL)) {
-			if (need_wait && need_resched())
-				schedule();
+			if (need_wait && skb) {
+				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
+				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
+				if (timeo <= 0) {
+					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
+					goto out_put;
+				}
+			}
+			/* check for additional frames */
 			continue;
 		}
 
@@ -3207,6 +3220,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_init_data(sock, sk);
 
 	po = pkt_sk(sk);
+	init_completion(&po->skb_completion);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
 	po->xmit = dev_queue_xmit;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 3bb7c5fb3bff..c70a2794456f 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -128,6 +128,7 @@ struct packet_sock {
 	unsigned int		tp_hdrlen;
 	unsigned int		tp_reserve;
 	unsigned int		tp_tstamp;
+	struct completion	skb_completion;
 	struct net_device __rcu	*cached_dev;
 	int			(*xmit)(struct sk_buff *skb);
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH v3 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-25 16:20             ` Neil Horman
@ 2019-06-25 21:59               ` Willem de Bruijn
  0 siblings, 0 replies; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-25 21:59 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Tue, Jun 25, 2019 at 12:20 PM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Tue, Jun 25, 2019 at 09:37:17AM -0400, Willem de Bruijn wrote:
> > On Tue, Jun 25, 2019 at 7:03 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > >
> > > On Mon, Jun 24, 2019 at 06:15:29PM -0400, Willem de Bruijn wrote:
> > > > > > > +               if (need_wait && !packet_next_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST)) {
> > > > > > > +                       po->wait_on_complete = 1;
> > > > > > > +                       timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > > > > >
> > > > > > This resets timeout on every loop. should only set above the loop once.
> > > > > >
> > > > > I explained exactly why I did that in the change log.  Its because I reuse the
> > > > > timeout variable to get the return value of the wait_for_complete call.
> > > > > Otherwise I need to add additional data to the stack, which I don't want to do.
> > > > > Sock_sndtimeo is an inline function and really doesn't add any overhead to this
> > > > > path, so I see no reason not to reuse the variable.
> > > >
> > > > The issue isn't the reuse. It is that timeo is reset to sk_sndtimeo
> > > > each time. Whereas wait_for_common and its variants return the
> > > > number of jiffies left in case the loop needs to sleep again later.
> > > >
> > > > Reading sock_sndtimeo once and passing it to wait_.. repeatedly is a
> > > > common pattern across the stack.
> > > >
> > > But those patterns are unique to those situations.  For instance, in
> > > tcp_sendmsg_locked, we acquire the value of the socket timeout, and use that to
> > > wait for the entire message send operation to complete, which consists of
> > > potentially several blocking operations (waiting for the tcp connection to be
> > > established, waiting for socket memory, etc).  In that situation we want to wait
> > > for all of those operations to complete to send a single message, and fail if
> > > they exceed the timeout in aggregate.  The semantics are different with
> > > AF_PACKET.  In this use case, the message is in effect empty, and just used to
> > > pass some control information.  tpacket_snd, sends as many frames from the
> > > memory mapped buffer as possible, and on each iteration we want to wait for the
> > > specified timeout for those frames to complete sending.  I think resetting the
> > > timeout on each wait instance is the right way to go here.
> >
> > I disagree. If a SO_SNDTIMEO of a given time is set, the thread should
> > not wait beyond that. Else what is the point of passing a specific
> > duration in the syscall?
> >
> I can appreciate that, but you said yourself that you wanted to minimize this
> change.  The current behavior of AF_PACKET is to:
> a) check for their being no more packets ready to send
> b) if (a) is false we schedule() (nominally allowing other tasks to run)
> c) when (b) is complete, check for additional frames to send, and exit if there
> are not, otherwise, continue sending and waiting
>
> In that model, a single task calling sendmsg can run in the kernel indefinitely,
> if userspace continues to fill the memory mapped buffer
>
> Given that model, resetting the timeout on each call to wait_for_completion
> keeps that behavior.  In fact, if we allow the timeout value to get continuously
> decremented, it will be possible for a call to sendmsg in this protocol to
> _always_ return -ETIMEDOUT (if we keep the buffer full in userspace long enough,
> then the sending task will eventually decrement timeo to zero, and force an
> -ETIMEDOUT call).
>
> I'm convinced that resetting the timeout here is the right way to go.

It upholds the contract, but extends when userspace appends to the
ring. Okay, yes, that makes sense.

> > Btw, we can always drop the timeo and go back to unconditional (bar
> > signals) waiting.
> >
> We could, but it would be nice to implement the timeout feature here if
> possible.
>
> > >
> > > > > > > @@ -2728,6 +2755,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > > > > >                         err = net_xmit_errno(err);
> > > > > > >                         if (err && __packet_get_status(po, ph) ==
> > > > > > >                                    TP_STATUS_AVAILABLE) {
> > > > > > > +                               /* re-init completion queue to avoid subsequent fallthrough
> > > > > > > +                                * on a future thread calling wait_on_complete_interruptible_timeout
> > > > > > > +                                */
> > > > > > > +                               po->wait_on_complete = 0;
> > > > > >
> > > > > > If setting where sleeping, no need for resetting if a failure happens
> > > > > > between those blocks.
> > > > > >
> > > > > > > +                               init_completion(&po->skb_completion);
> > > > > >
> > > > > > no need to reinit between each use?
> > > > > >
> > > > > I explained exactly why I did this in the comment above.  We have to set
> > > > > wait_for_complete prior to calling transmit, so as to ensure that we call
> > > > > wait_for_completion before we exit the loop. However, in this error case, we
> > > > > exit the loop prior to calling wait_for_complete, so we need to reset the
> > > > > completion variable and the wait_for_complete flag.  Otherwise we will be in a
> > > case where, on the next entrance to this loop we will have a completion variable
> > > > > with completion->done > 0, meaning the next wait will be a fall through case,
> > > > > which we don't want.
> > > >
> > > > By moving back to the point where schedule() is called, hopefully this
> > > > complexity automatically goes away. Same as my comment to the line
> > > > immediately above.
> > > >
> > > Its going to change what the complexity is, actually.  I was looking at this
> > > last night, and I realized that your assertion that we could remove
> > > packet_next_frame came at a cost.  This is because we need to determine if we
> > > have to wait before we call po->xmit, but we need to actually do the wait after
> > > po->xmit
> >
> > Why? The existing method using schedule() doesn't.
> >
> Because the existing method using schedule doesn't have to rely on an
> asynchronous event to wake the sending task back up.  Specifically, we need to
> set some state that indicates tpacket_destruct_skb should call complete, and
> since tpacket_destruct_skb is called asynchronously in a separate execution
> context, we need to ensure there is no race condition whereby we execute
> tpacket_destruct_skb before we set the state that we also use to determine if we
> should call wait_for_complete.  ie:
> 1) tpacket_send_skb calls po->xmit
> 2) tpacket_send_skb loops around and checks to see if there are more packets to
> send.  If not and if need_wait is set we call wait_for_complete
>
> If tpacket_destruct_skb is called after (2), we're fine.  But if its called
> between (1) and (2), then tpacket_destruct_skb won't call complete (because
> wait_on_complete isn't set yet), causing a hang.
>
> Because you wanted to remove packet_next_frame, we have no way to determine if
> we're done sending frames prior to calling po->xmit, which is the point past
> which tpacket_destruct_skb might be called, so now I have to find an alternate
> state condition to key off of.
>
>
> > Can we not just loop while sending and sleep immediately when
> > __packet_get_status returns !TP_STATUS_AVAILABLE?
> >
> > I don't understand the need to probe the next packet to send instead
> > of the current.
> >
> See above, we can definitely check the return of ph at the top of the loop, and
> sleep if it's NULL, but in so doing we can't use po->wait_on_complete as a gating
> variable because we've already called po->xmit.  Once we call that function, if
> we haven't already set po->wait_on_complete to true, its possible
> tpacket_destruct_skb will already have been called before we get to the point
> where we check wait_for_completion.  That means we will wait on a completion
> variable that never gets completed, so I need to find a new way to track whether
> or not we are waiting.  It's entirely doable, I'm sure, it's just not as straight
> forward as your making it out to be.
>
> > This seems to be the crux of the disagreement. My guess is that it has
> > to do with setting wait_on_complete, but I don't see the problem. It
> > can be set immediately before going to sleep.
> >
> The reason has to do with the fact that
> tpacket_destruct_skb can be called from ksoftirq context (i.e. it will be called
> asynchronously, not from the thread running in tpacket_snd).  Consider this
> flow, assuming we do as you suggest, and just set po->wait_on_complete=1
> immediately prior to calling wait_for_complete_interruptible_timeout:
>
> 1) tpacket_snd gets a frame, builds the skb for transmission
> 2) tpacket_snd calls po->xmit(skb), sending the frame to the driver
> 3) tpacket_snd, iterates through back to the top of the loop, and determines
> that we need to wait for the previous packet to complete
> 4) The driver gets a tx completion interrupt, interrupting the cpu on which we
> are executing, and calls kfree_skb_irq on the skb we just transmitted
> 5) the ksoftirq runs on the cpu, calling kfree_skb on our skb
> 6) (5) calls skb->destruct (which is tpacket_skb_destruct)
> 7) tpacket_skb_destruct executes, sees that po->wait_on_completion is zero, and
> skips calling complete()
> 8) the thread running tpacket_snd gets scheduled back on the cpu and now sets
> po->wait_on_complete, then calls wait_for_completion_interruptible_timeout, but
> since the skb we are waiting to complete has already been freed, we will never
> get the completion notification, and so we will wait for the maximal timeout,
> which is absolutely not what we want.
>
> Interestingly (Ironically), that control flow will never happen in the use case
> I'm trying to fix here, because it's SCHED_FIFO, and will run until such time as
> we call wait_for_completion_interruptible_timeout, so in this case we're safe.
> But in the nominal case, where the sending task is actually SCHED_OTHER, the
> above can absolutely happen, and that's very broken.
>
> > I don't meant to draw this out, btw, or add to your workload. If you
> > prefer, I can take a stab at my (admittedly hand-wavy) suggestion.
> >
> No, I have another method in flight that I'm testing now, it removes the
> po->wait_on_complete variable entirely, checking instead to make sure that
> we've:
> a) sent at least one frame
> and
> b) that we have a positive pending count
> and
> c) that we need to wait
> and
> d) that we have no more frames to send
>
> This is why I was saying that your idea can be done, but it trades one form of
> complexity for another.

Thanks for the detailed explanation of the races.

I was thinking this through, but just see V4. Will take a look at that.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-25 21:57 ` [PATCH v4 " Neil Horman
@ 2019-06-25 22:30   ` Willem de Bruijn
  2019-06-26 10:54     ` Neil Horman
  2019-06-27  2:38   ` David Miller
  1 sibling, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-25 22:30 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index a29d66da7394..a7ca6a003ebe 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
>
>                 ts = __packet_set_timestamp(po, ph, skb);
>                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> +
> +               if (!packet_read_pending(&po->tx_ring))
> +                       complete(&po->skb_completion);
>         }
>
>         sock_wfree(skb);
> @@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
>
>  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>  {
> -       struct sk_buff *skb;
> +       struct sk_buff *skb = NULL;
>         struct net_device *dev;
>         struct virtio_net_hdr *vnet_hdr = NULL;
>         struct sockcm_cookie sockc;
> @@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>         int len_sum = 0;
>         int status = TP_STATUS_AVAILABLE;
>         int hlen, tlen, copylen = 0;
> +       long timeo = 0;
>
>         mutex_lock(&po->pg_vec_lock);
>
> @@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
>         if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
>                 size_max = dev->mtu + reserve + VLAN_HLEN;
>
> +       reinit_completion(&po->skb_completion);
> +
>         do {
>                 ph = packet_current_frame(po, &po->tx_ring,
>                                           TP_STATUS_SEND_REQUEST);
>                 if (unlikely(ph == NULL)) {
> -                       if (need_wait && need_resched())
> -                               schedule();
> +                       if (need_wait && skb) {
> +                               timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> +                               timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);

This looks really nice.

But isn't it still susceptible to the race where tpacket_destruct_skb
is called in between po->xmit and this
wait_for_completion_interruptible_timeout?

The test for skb is shorthand for packet_read_pending  != 0, right?

> +                               if (timeo <= 0) {
> +                                       err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
> +                                       goto out_put;
> +                               }
> +                       }
> +                       /* check for additional frames */
>                         continue;
>                 }
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-25 22:30   ` Willem de Bruijn
@ 2019-06-26 10:54     ` Neil Horman
  2019-06-26 15:05       ` Willem de Bruijn
  0 siblings, 1 reply; 34+ messages in thread
From: Neil Horman @ 2019-06-26 10:54 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Tue, Jun 25, 2019 at 06:30:08PM -0400, Willem de Bruijn wrote:
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index a29d66da7394..a7ca6a003ebe 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
> >
> >                 ts = __packet_set_timestamp(po, ph, skb);
> >                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> > +
> > +               if (!packet_read_pending(&po->tx_ring))
> > +                       complete(&po->skb_completion);
> >         }
> >
> >         sock_wfree(skb);
> > @@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
> >
> >  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >  {
> > -       struct sk_buff *skb;
> > +       struct sk_buff *skb = NULL;
> >         struct net_device *dev;
> >         struct virtio_net_hdr *vnet_hdr = NULL;
> >         struct sockcm_cookie sockc;
> > @@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >         int len_sum = 0;
> >         int status = TP_STATUS_AVAILABLE;
> >         int hlen, tlen, copylen = 0;
> > +       long timeo = 0;
> >
> >         mutex_lock(&po->pg_vec_lock);
> >
> > @@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> >         if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
> >                 size_max = dev->mtu + reserve + VLAN_HLEN;
> >
> > +       reinit_completion(&po->skb_completion);
> > +
> >         do {
> >                 ph = packet_current_frame(po, &po->tx_ring,
> >                                           TP_STATUS_SEND_REQUEST);
> >                 if (unlikely(ph == NULL)) {
> > -                       if (need_wait && need_resched())
> > -                               schedule();
> > +                       if (need_wait && skb) {
> > +                               timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > +                               timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> 
> This looks really nice.
> 
> But isn't it still susceptible to the race where tpacket_destruct_skb
> is called in between po->xmit and this
> wait_for_completion_interruptible_timeout?
> 
That's not an issue, since the complete is only gated on packet_read_pending
reaching 0 in tpacket_destruct_skb.  Previously it was gated on my
wait_on_complete flag being non-zero, so we had to set that prior to calling
po->xmit, or the complete call might never get made, resulting in a hang.  Now,
we will always call complete, and the completion api allows for arbitrary
ordering of complete/wait_for_complete (since its internal done variable gets
incremented), making a call to wait_for_complete effectively a fall through if
complete gets called first.

There is an odd path here though.  If an application calls sendmsg on a packet
socket here with MSG_DONTWAIT set, then need_wait will be zero, and we will
eventually exit this loop without ever having called wait_for_complete, but
tpacket_destruct_skb will still have called complete when all the frames
complete transmission.  In and of itself, that's fine, but it leaves the
completion structure in a state where its done variable will have been
incremented at least once (specifically it will be set to N, where N is the
number of frames transmitted during the call where MSG_DONTWAIT is set).  If the
application then calls sendmsg on this socket with MSG_DONTWAIT clear, we will
call wait_for_complete, but immediately return from it (due to the previously
made calls to complete).  I've corrected this however, but adding that call to
reinit_completion prior to the loop entry, so that we are always guaranteed to
have the completion variable set properly to wait for only the frames being sent
in this particular instance of the sendmsg call.

> The test for skb is shorthand for packet_read_pending  != 0, right?
> 
Sort of.  gating on skb guarantees for us that we have sent at least one frame
in this call to tpacket_snd.  If we didn't do that, then it would be possible
for an application to call sendmsg without setting any frames in the buffer to
TP_STATUS_SEND_REQUEST, which would cause us to wait for a completion without
having sent any frames, meaning we would block waiting for an event
(tpacket_destruct_skb), that will never happen.  The check for skb ensures that
tpacket_snd_skb will get called, and that we will get a wakeup from a call to
wait_for_complete.  It does suggest that packet_read_pending != 0, but that's not
guaranteed, because tpacket_destruct_skb may already have been called (see the
above explanation regarding ordering of complete/wait_for_complete).

Neil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-26 10:54     ` Neil Horman
@ 2019-06-26 15:05       ` Willem de Bruijn
  2019-06-26 17:14         ` Neil Horman
  0 siblings, 1 reply; 34+ messages in thread
From: Willem de Bruijn @ 2019-06-26 15:05 UTC (permalink / raw)
  To: Neil Horman; +Cc: Network Development, Matteo Croce, David S. Miller

On Wed, Jun 26, 2019 at 6:54 AM Neil Horman <nhorman@tuxdriver.com> wrote:
>
> On Tue, Jun 25, 2019 at 06:30:08PM -0400, Willem de Bruijn wrote:
> > > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > > index a29d66da7394..a7ca6a003ebe 100644
> > > --- a/net/packet/af_packet.c
> > > +++ b/net/packet/af_packet.c
> > > @@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
> > >
> > >                 ts = __packet_set_timestamp(po, ph, skb);
> > >                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> > > +
> > > +               if (!packet_read_pending(&po->tx_ring))
> > > +                       complete(&po->skb_completion);
> > >         }
> > >
> > >         sock_wfree(skb);
> > > @@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
> > >
> > >  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > >  {
> > > -       struct sk_buff *skb;
> > > +       struct sk_buff *skb = NULL;
> > >         struct net_device *dev;
> > >         struct virtio_net_hdr *vnet_hdr = NULL;
> > >         struct sockcm_cookie sockc;
> > > @@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > >         int len_sum = 0;
> > >         int status = TP_STATUS_AVAILABLE;
> > >         int hlen, tlen, copylen = 0;
> > > +       long timeo = 0;
> > >
> > >         mutex_lock(&po->pg_vec_lock);
> > >
> > > @@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > >         if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
> > >                 size_max = dev->mtu + reserve + VLAN_HLEN;
> > >
> > > +       reinit_completion(&po->skb_completion);
> > > +
> > >         do {
> > >                 ph = packet_current_frame(po, &po->tx_ring,
> > >                                           TP_STATUS_SEND_REQUEST);
> > >                 if (unlikely(ph == NULL)) {
> > > -                       if (need_wait && need_resched())
> > > -                               schedule();
> > > +                       if (need_wait && skb) {
> > > +                               timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > > +                               timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> >
> > This looks really nice.
> >
> > But isn't it still susceptible to the race where tpacket_destruct_skb
> > is called in between po->xmit and this
> > wait_for_completion_interruptible_timeout?
> >
> That's not an issue, since the complete is only gated on packet_read_pending
> reaching 0 in tpacket_destruct_skb.  Previously it was gated on my
> wait_on_complete flag being non-zero, so we had to set that prior to calling
> po->xmit, or the complete call might never get made, resulting in a hang.  Now,
> we will always call complete, and the completion api allows for arbitrary
> ordering of complete/wait_for_complete (since its internal done variable gets
> incremented), making a call to wait_for_complete effectively a fall through if
> complete gets called first.

Perfect! I was not aware that it handles that internally. Hadn't read
do_wait_for_common closely before.

> There is an odd path here though.  If an application calls sendmsg on a packet
> socket here with MSG_DONTWAIT set, then need_wait will be zero, and we will
> eventually exit this loop without ever having called wait_for_complete, but
> tpacket_destruct_skb will still have called complete when all the frames
> complete transmission.  In and of itself, that's fine, but it leaves the
> completion structure in a state where its done variable will have been
> incremented at least once (specifically it will be set to N, where N is the
> number of frames transmitted during the call where MSG_DONTWAIT is set).  If the
> application then calls sendmsg on this socket with MSG_DONTWAIT clear, we will
> call wait_for_complete, but immediately return from it (due to the previously
> made calls to complete).  I've corrected this however, but adding that call to
> reinit_completion prior to the loop entry, so that we are always guaranteed to
> have the completion variable set properly to wait for only the frames being sent
> in this particular instance of the sendmsg call.

Yep, understood.

>
> > The test for skb is shorthand for packet_read_pending  != 0, right?
> >
> Sort of.  gating on skb guarantees for us that we have sent at least one frame
> in this call to tpacket_snd.  If we didn't do that, then it would be possible
> for an application to call sendmsg without setting any frames in the buffer to
> TP_STATUS_SEND_REQUEST, which would cause us to wait for a completion without
> having sent any frames, meaning we would block waiting for an event
> (tpacket_destruct_skb), that will never happen.  The check for skb ensures that
> tpacket_snd_skb will get called, and that we will get a wakeup from a call to
> wait_for_complete.  It does suggest that packet_read_pending != 0, but that's not
> guaranteed, because tpacket_destruct_skb may already have been called (see the
> above explanation regarding ordering of complete/wait_for_complete).

But the inverse is true: if gating sleeping on packet_read_pending,
the process only ever waits if a packet is still to be acknowledged.
Then both the wait and wake clearly depend on the same state.

Either way works, I think. So this is definitely fine.

One possible refinement would be to keep po->wait_on_complete (but
rename as po->wake_on_complete), set it before entering the loop and
clear it before function return (both within the pg_vec_lock critical
section). And test that in tpacket_destruct_skb to avoid calling
complete if MSG_DONTWAIT. But I don't think it's worth the complexity.

One rare edge case is a MSG_DONTWAIT send followed by a !MSG_DONTWAIT.
It is then possible for a tpacket_destruct_skb to be run as a result
from the first call, during the second call, after the call to
reinit_completion. That would cause the next wait to return before
*its* packets have been sent. But due to the packet_read_pending test
in the while () condition it will loop again and return to wait. So that's fine.

Thanks for bearing with me.

Reviewed-by: Willem de Bruijn <willemb@google.com>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-26 15:05       ` Willem de Bruijn
@ 2019-06-26 17:14         ` Neil Horman
  0 siblings, 0 replies; 34+ messages in thread
From: Neil Horman @ 2019-06-26 17:14 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Matteo Croce, David S. Miller

On Wed, Jun 26, 2019 at 11:05:39AM -0400, Willem de Bruijn wrote:
> On Wed, Jun 26, 2019 at 6:54 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> >
> > On Tue, Jun 25, 2019 at 06:30:08PM -0400, Willem de Bruijn wrote:
> > > > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > > > index a29d66da7394..a7ca6a003ebe 100644
> > > > --- a/net/packet/af_packet.c
> > > > +++ b/net/packet/af_packet.c
> > > > @@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
> > > >
> > > >                 ts = __packet_set_timestamp(po, ph, skb);
> > > >                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
> > > > +
> > > > +               if (!packet_read_pending(&po->tx_ring))
> > > > +                       complete(&po->skb_completion);
> > > >         }
> > > >
> > > >         sock_wfree(skb);
> > > > @@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
> > > >
> > > >  static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > >  {
> > > > -       struct sk_buff *skb;
> > > > +       struct sk_buff *skb = NULL;
> > > >         struct net_device *dev;
> > > >         struct virtio_net_hdr *vnet_hdr = NULL;
> > > >         struct sockcm_cookie sockc;
> > > > @@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > >         int len_sum = 0;
> > > >         int status = TP_STATUS_AVAILABLE;
> > > >         int hlen, tlen, copylen = 0;
> > > > +       long timeo = 0;
> > > >
> > > >         mutex_lock(&po->pg_vec_lock);
> > > >
> > > > @@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
> > > >         if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
> > > >                 size_max = dev->mtu + reserve + VLAN_HLEN;
> > > >
> > > > +       reinit_completion(&po->skb_completion);
> > > > +
> > > >         do {
> > > >                 ph = packet_current_frame(po, &po->tx_ring,
> > > >                                           TP_STATUS_SEND_REQUEST);
> > > >                 if (unlikely(ph == NULL)) {
> > > > -                       if (need_wait && need_resched())
> > > > -                               schedule();
> > > > +                       if (need_wait && skb) {
> > > > +                               timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
> > > > +                               timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
> > >
> > > This looks really nice.
> > >
> > > But isn't it still susceptible to the race where tpacket_destruct_skb
> > > is called in between po->xmit and this
> > > wait_for_completion_interruptible_timeout?
> > >
> > Thats not an issue, since the complete is only gated on packet_read_pending
> > reaching 0 in tpacket_destuct_skb.  Previously it was gated on my
> > wait_on_complete flag being non-zero, so we had to set that prior to calling
> > po->xmit, or the complete call might never get made, resulting in a hang.  Now,
> > we will always call complete, and the completion api allows for arbitrary
> > ordering of complete/wait_for_complete (since its internal done variable gets
> > incremented), making a call to wait_for_complete effectively a fall through if
> > complete gets called first.
> 
> Perfect! I was not aware that it handles that internally. Hadn't read
> do_wait_for_common closely before.
> 
> > There is an odd path here though.  If an application calls sendmsg on a packet
> > socket here with MSG_DONTWAIT set, then need_wait will be zero, and we will
> > eventually exit this loop without ever having called wait_for_complete, but
> > tpacket_destruct_skb will still have called complete when all the frames
> > complete transmission.  In and of itself, thats fine, but it leave the
> > completion structure in a state where its done variable will have been
> > incremented at least once (specifically it will be set to N, where N is the
> > number of frames transmitted during the call where MSG_DONTWAIT is set).  If the
> > application then calls sendmsg on this socket with MSG_DONTWAIT clear, we will
> > call wait_for_complete, but immediately return from it (due to the previously
> > made calls to complete).  I've corrected this, however, by adding that call to
> > reinit_completion prior to the loop entry, so that we are always guaranteed to
> > have the completion variable set properly to wait for only the frames being sent
> > in this particular instance of the sendmsg call.
> 
> Yep, understood.
> 
> >
> > > The test for skb is shorthand for packet_read_pending  != 0, right?
> > >
> > Sort of.  gating on skb guarantees for us that we have sent at least one frame
> > in this call to tpacket_snd.  If we didn't do that, then it would be possible
> > for an application to call sendmsg without setting any frames in the buffer to
> > TP_STATUS_SEND_REQUEST, which would cause us to wait for a completion without
> > having sent any frames, meaning we would block waiting for an event
> > (tpacket_destruct_skb), that will never happen.  The check for skb ensures that
> > tpacket_snd_skb will get called, and that we will get a wakeup from a call to
> > wait_for_complete.  It does suggest that packet_read_pending != 0, but thats not
> > guaranteed, because tpacket_destruct_skb may already have been called (see the
> > above explanation regarding ordering of complete/wait_for_complete).
> 
> But the inverse is true: if gating sleeping on packet_read_pending,
> the process only ever waits if a packet is still to be acknowledged.
> Then both the wait and wake clearly depend on the same state.
> 
> Either way works, I think. So this is definitely fine.
> 
Yeah, we could do that.  Its basically a pick your poison situation, in the case
you stipulate, we could gate the wait_on_complete on read_pending being
non-zero, but if a frame frees quickly and decrements the pending count, we
still leave the loop with a completion struct that needs to be reset.  Either
way we have to re-init the completion.  And as for calling wait_on_complete when
we don't have to, your proposal does solve that prolbem but requires that we
call packet_read_pending an extra time for iteration on every loop.
Packet_read_pending accumulates the sum of all the per-cpu pointer pending
counts (which is a separate problem, I'm not sure why we're using per-cpu
counters there).  Regardless, I looked at that and (anecdotally), decided that
periodically calling wait_for_complete which takes a spin lock would be more
performant than accessing a per-cpu variable on each available cpu every
iteration of the loop (based on the comments at the bottom of the loop).

> One possible refinement would be to keep po->wait_on_complete (but
> rename as po->wake_on_complete), set it before entering the loop and
> clear it before function return (both within the pg_vec_lock critical
> section). And test that in tpacket_destruct_skb to avoid calling
> complete if MSG_DONTWAIT. But I don't think it's worth the complexity.
> 
I agree, we could use a socket variable to communicate to tpacket_destruct_skb
that we need to call complete, in conjunction with the pending count, but I
don't think the added complexity buys us anything.

> One rare edge case is a MSG_DONTWAIT send followed by a !MSG_DONTWAIT.
> It is then possible for a tpacket_destruct_skb to be run as a result
> from the first call, during the second call, after the call to
> reinit_completion. That would cause the next wait to return before
> *its* packets have been sent. But due to the packet_read_pending test
> in the while () condition it will loop again and return to wait. So that's fine.
> 
yup, exactly.

> Thanks for bearing with me.
> 
> Reviewed-by: Willem de Bruijn <willemb@google.com>
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH v4 net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET
  2019-06-25 21:57 ` [PATCH v4 " Neil Horman
  2019-06-25 22:30   ` Willem de Bruijn
@ 2019-06-27  2:38   ` David Miller
  1 sibling, 0 replies; 34+ messages in thread
From: David Miller @ 2019-06-27  2:38 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, mcroce, willemdebruijn.kernel

From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 25 Jun 2019 17:57:49 -0400

> When an application is run that:
> a) Sets its scheduler to be SCHED_FIFO
> and
> b) Opens a memory mapped AF_PACKET socket, and sends frames with the
> MSG_DONTWAIT flag cleared, its possible for the application to hang
> forever in the kernel.  This occurs because when waiting, the code in
> tpacket_snd calls schedule, which under normal circumstances allows
> other tasks to run, including ksoftirqd, which in some cases is
> responsible for freeing the transmitted skb (which in AF_PACKET calls a
> destructor that flips the status bit of the transmitted frame back to
> available, allowing the transmitting task to complete).
> 
> However, when the calling application is SCHED_FIFO, its priority is
> such that the schedule call immediately places the task back on the cpu,
> preventing ksoftirqd from freeing the skb, which in turn prevents the
> transmitting task from detecting that the transmission is complete.
> 
> We can fix this by converting the schedule call to a completion
> mechanism.  By using a completion queue, we force the calling task, when
> it detects there are no more frames to send, to schedule itself off the
> cpu until such time as the last transmitted skb is freed, allowing
> forward progress to be made.
> 
> Tested by myself and the reporter, with good results
> 
> Applies to the net tree
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> Reported-by: Matteo Croce <mcroce@redhat.com>
> CC: "David S. Miller" <davem@davemloft.net>
> CC: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
 ...

Applied and queued up for -stable.

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2019-06-27  2:38 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-19 20:25 [PATCH net] af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET Neil Horman
2019-06-20 13:41 ` Willem de Bruijn
2019-06-20 14:01   ` Matteo Croce
2019-06-20 14:23   ` Neil Horman
2019-06-20 15:16     ` Willem de Bruijn
2019-06-20 16:14       ` Neil Horman
2019-06-20 16:18         ` Willem de Bruijn
2019-06-20 17:31           ` Neil Horman
2019-06-21 16:41       ` Neil Horman
2019-06-21 18:31         ` Willem de Bruijn
2019-06-21 19:18           ` Neil Horman
2019-06-21 20:06             ` Willem de Bruijn
2019-06-22 11:08               ` Neil Horman
2019-06-22 17:41 ` [PATCH v2 " Neil Horman
2019-06-23  2:12   ` Willem de Bruijn
2019-06-23  2:21     ` Willem de Bruijn
2019-06-23 11:40       ` Neil Horman
2019-06-23 14:39         ` Willem de Bruijn
2019-06-23 19:21           ` Neil Horman
2019-06-23 11:34     ` Neil Horman
2019-06-24  0:46 ` [PATCH v3 " Neil Horman
2019-06-24 18:08   ` Willem de Bruijn
2019-06-24 21:51     ` Neil Horman
2019-06-24 22:15       ` Willem de Bruijn
2019-06-25 11:02         ` Neil Horman
2019-06-25 13:37           ` Willem de Bruijn
2019-06-25 16:20             ` Neil Horman
2019-06-25 21:59               ` Willem de Bruijn
2019-06-25 21:57 ` [PATCH v4 " Neil Horman
2019-06-25 22:30   ` Willem de Bruijn
2019-06-26 10:54     ` Neil Horman
2019-06-26 15:05       ` Willem de Bruijn
2019-06-26 17:14         ` Neil Horman
2019-06-27  2:38   ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.