netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* UDP implementation and the MSG_MORE flag
@ 2021-01-26 14:12 Oliver Graute
  2021-01-26 21:54 ` Willem de Bruijn
  2021-02-03 22:21 ` michi1
  0 siblings, 2 replies; 8+ messages in thread
From: Oliver Graute @ 2021-01-26 14:12 UTC (permalink / raw)
  To: kernelnewbies, netdev; +Cc: kuznet, yoshfuji, jakub, pabeni

Hello,

we observe some unexpected behavior in the UDP implementation of the
linux kernel.

Some UDP packets send via the loopback interface are dropped in the
kernel on the receive side when using sendto with the MSG_MORE flag.
Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
example code to reproduce it is appended below.

In the code we tracked it down to this code section. ( Even a little
further but its unclear to me wy the csum() is wrong in the bad case)

udpv6_recvmsg()
...
if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
		if (udp_skb_is_linear(skb))
			err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
		else
			err = skb_copy_datagram_msg(skb, off, msg, copied);
	} else {
		err = skb_copy_and_csum_datagram_msg(skb, off, msg);
		if (err == -EINVAL) {
			goto csum_copy_err;
		}
	}
...


Perhaps someone with deeper knowledge can comment on this and can explain
us the reason of this behavior.

Best regards,

Oliver


udp-send.c

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>

#define BUFFSIZE 512*1024

int main(int argc, char** argv)
{
    int fd = 0;
    int port = 0;
    char *buffer;
    struct sockaddr_in addr;
    ssize_t addrlen = 0;

    if(argc == 2)
    {
        port = atoi(argv[1]);
    }
    else
    {
        port = 4711;
    }

    fd = socket(PF_INET, SOCK_DGRAM, 0);
    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    addr.sin_addr.s_addr = inet_addr("127.0.0.1");
    addrlen = sizeof(addr);

    buffer = malloc(BUFFSIZE);
    if (!buffer) {
        return 0;
    }

    printf("\nsending BROKEN segmented testdata on local port %i \n", port);
    snprintf(buffer, BUFFSIZE, "start-data {\n");
    sendto(fd, buffer, strlen(buffer), MSG_MORE, (struct sockaddr *) &addr, addrlen);
    snprintf(buffer, BUFFSIZE, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n");
    sendto(fd, buffer, strlen(buffer), MSG_MORE, (struct sockaddr *) &addr, addrlen);
    snprintf(buffer, BUFFSIZE, "}\n");
    sendto(fd, buffer, strlen(buffer), 0, (struct sockaddr *) &addr, addrlen);

    printf("\nsending VALID segmented testdata on local port %i \n", port);
    snprintf(buffer, BUFFSIZE, "start-data {\n");
    sendto(fd, buffer, strlen(buffer), MSG_MORE, (struct sockaddr *) &addr, addrlen);
    snprintf(buffer, BUFFSIZE, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n");
    sendto(fd, buffer, strlen(buffer), MSG_MORE, (struct sockaddr *) &addr, addrlen);
    snprintf(buffer, BUFFSIZE, "}\n");
    sendto(fd, buffer, strlen(buffer), 0, (struct sockaddr *) &addr, addrlen);

    printf("\nsending VALID unsegmented testdata on local port %i \n", port);
    snprintf(buffer, BUFFSIZE, "start-data {\n");
    snprintf(buffer + strlen(buffer), BUFFSIZE - strlen(buffer), "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n");
    snprintf(buffer+ strlen(buffer), BUFFSIZE - strlen(buffer), "}\n");
    sendto(fd, buffer, strlen(buffer), 0, (struct sockaddr *) &addr, addrlen);

    free(buffer);
    return 0;
}
-------

udp-receive.c 

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>

int main(int argc, char** argv)
{
    int fd = 0;
    int arg = 0;
    int ret = 0;
    struct sockaddr_in addr;
    ssize_t addrlen = 0;
    int port = 0;
    char *buffer;
    char *printbuffer;
    int recvlen = 0;

    if(argc == 2)
    {
        port = atoi(argv[1]);
    }
    else
    {
        port = 4711;
    }

    fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    addr.sin_addr.s_addr = inet_addr("0.0.0.0");
    addrlen = sizeof(addr);

    buffer = malloc(65536);
    if (!buffer) {
        return 0;
    }

    printbuffer = malloc(65537);
    if (!printbuffer) {
        return 0;
    }

    if(fd)
    {
        printf("\nbinding to local port %i \n", port);
        //bind
        ret = bind(fd, (struct sockaddr *)&addr, addrlen);
        printf("result error %i, errno %i\n", ret, errno);

        do {
            recvlen = recvfrom(fd, buffer, 65536, 0, NULL, NULL);

            if (recvlen >0) {
                printf("\nreceived %i bytes of data:\n", recvlen);

                memset(printbuffer, 0, 65537);
                memcpy(printbuffer, buffer, recvlen);

                printf("%s\n", printbuffer);
            }
            else if(recvlen < 0) {
                printf("\n receive error %i, errno %i\n", recvlen, errno);
            }
        } while(1);

        close(fd);
    }
    else
    {
        printf("\nerror creating socket\n");
    }
    
    free(buffer);
    free(printbuffer);
    return 0;
}


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-26 14:12 UDP implementation and the MSG_MORE flag Oliver Graute
@ 2021-01-26 21:54 ` Willem de Bruijn
  2021-01-26 22:00   ` Willem de Bruijn
  2021-02-03 22:21 ` michi1
  1 sibling, 1 reply; 8+ messages in thread
From: Willem de Bruijn @ 2021-01-26 21:54 UTC (permalink / raw)
  To: kernelnewbies, Network Development, Alexey Kuznetsov,
	Hideaki YOSHIFUJI, Jakub Sitnicki, Paolo Abeni

On Tue, Jan 26, 2021 at 9:58 AM Oliver Graute <oliver.graute@gmail.com> wrote:
>
> Hello,
>
> we observe some unexpected behavior in the UDP implementation of the
> linux kernel.
>
> Some UDP packets send via the loopback interface are dropped in the
> kernel on the receive side when using sendto with the MSG_MORE flag.
> Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> example code to reproduce it is appended below.
>
> In the code we tracked it down to this code section. ( Even a little
> further but its unclear to me wy the csum() is wrong in the bad case)
>
> udpv6_recvmsg()
> ...
> if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
>                 if (udp_skb_is_linear(skb))
>                         err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
>                 else
>                         err = skb_copy_datagram_msg(skb, off, msg, copied);
>         } else {
>                 err = skb_copy_and_csum_datagram_msg(skb, off, msg);
>                 if (err == -EINVAL) {
>                         goto csum_copy_err;
>                 }
>         }
> ...
>

Thanks for the report with a full reproducer.

I don't have a full answer yet, but can reproduce this easily.

The third program, without MSG_MORE, builds an skb with
CHECKSUM_PARTIAL in __ip_append_data. When looped to the receive path
that ip_summed means no additional validation is needed. As encoded in
skb_csum_unnecessary.

The first and second programs are essentially the same, bar for a
slight difference in length. In both cases packet length is very short
compared to the loopback device MTU. Because of MSG_MORE, these
packets have CHECKSUM_NONE.

On receive in

  __udp4_lib_rcv()
    udp4_csum_init()
      err = skb_checksum_init_zero_check()

The second program validates and sets ip_summed = CHECKSUM_COMPLETE
and csum_valid = 1.
The first does not, though err == 0.

This appears to succeed consistently for packets <= 68B of payload,
fail consistently otherwise. It is not clear to me yet what causes
this distinction.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-26 21:54 ` Willem de Bruijn
@ 2021-01-26 22:00   ` Willem de Bruijn
  2021-01-27  3:25     ` Willem de Bruijn
  0 siblings, 1 reply; 8+ messages in thread
From: Willem de Bruijn @ 2021-01-26 22:00 UTC (permalink / raw)
  To: oliver.graute
  Cc: Network Development, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Sitnicki, Paolo Abeni

On Tue, Jan 26, 2021 at 4:54 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Tue, Jan 26, 2021 at 9:58 AM Oliver Graute <oliver.graute@gmail.com> wrote:
> >
> > Hello,
> >
> > we observe some unexpected behavior in the UDP implementation of the
> > linux kernel.
> >
> > Some UDP packets send via the loopback interface are dropped in the
> > kernel on the receive side when using sendto with the MSG_MORE flag.
> > Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> > example code to reproduce it is appended below.
> >
> > In the code we tracked it down to this code section. ( Even a little
> > further but its unclear to me wy the csum() is wrong in the bad case)
> >
> > udpv6_recvmsg()
> > ...
> > if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
> >                 if (udp_skb_is_linear(skb))
> >                         err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
> >                 else
> >                         err = skb_copy_datagram_msg(skb, off, msg, copied);
> >         } else {
> >                 err = skb_copy_and_csum_datagram_msg(skb, off, msg);
> >                 if (err == -EINVAL) {
> >                         goto csum_copy_err;
> >                 }
> >         }
> > ...
> >
>
> Thanks for the report with a full reproducer.
>
> I don't have a full answer yet, but can reproduce this easily.
>
> The third program, without MSG_MORE, builds an skb with
> CHECKSUM_PARTIAL in __ip_append_data. When looped to the receive path
> that ip_summed means no additional validation is needed. As encoded in
> skb_csum_unnecessary.
>
> The first and second programs are essentially the same, bar for a
> slight difference in length. In both cases packet length is very short
> compared to the loopback device MTU. Because of MSG_MORE, these
> packets have CHECKSUM_NONE.
>
> On receive in
>
>   __udp4_lib_rcv()
>     udp4_csum_init()
>       err = skb_checksum_init_zero_check()
>
> The second program validates and sets ip_summed = CHECKSUM_COMPLETE
> and csum_valid = 1.
> The first does not, though err == 0.
>
> This appears to succeed consistently for packets <= 68B of payload,
> fail consistently otherwise. It is not clear to me yet what causes
> this distinction.

This is from

"
/* For small packets <= CHECKSUM_BREAK perform checksum complete directly
 * in checksum_init.
 */
#define CHECKSUM_BREAK 76
"

So the small packet gets checksummed immediately in
__skb_checksum_validate_complete, but the larger one does not.

Question is why the copy_and_checksum you pointed to seems to fail checksum.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-26 22:00   ` Willem de Bruijn
@ 2021-01-27  3:25     ` Willem de Bruijn
  2021-01-28  2:53       ` Willem de Bruijn
  0 siblings, 1 reply; 8+ messages in thread
From: Willem de Bruijn @ 2021-01-27  3:25 UTC (permalink / raw)
  To: oliver.graute
  Cc: Network Development, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Sitnicki, Paolo Abeni, sagi, Christoph Hellwig

On Tue, Jan 26, 2021 at 5:00 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Tue, Jan 26, 2021 at 4:54 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Tue, Jan 26, 2021 at 9:58 AM Oliver Graute <oliver.graute@gmail.com> wrote:
> > >
> > > Hello,
> > >
> > > we observe some unexpected behavior in the UDP implementation of the
> > > linux kernel.
> > >
> > > Some UDP packets send via the loopback interface are dropped in the
> > > kernel on the receive side when using sendto with the MSG_MORE flag.
> > > Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> > > example code to reproduce it is appended below.
> > >
> > > In the code we tracked it down to this code section. ( Even a little
> > > further but its unclear to me wy the csum() is wrong in the bad case)
> > >
> > > udpv6_recvmsg()
> > > ...
> > > if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
> > >                 if (udp_skb_is_linear(skb))
> > >                         err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
> > >                 else
> > >                         err = skb_copy_datagram_msg(skb, off, msg, copied);
> > >         } else {
> > >                 err = skb_copy_and_csum_datagram_msg(skb, off, msg);
> > >                 if (err == -EINVAL) {
> > >                         goto csum_copy_err;
> > >                 }
> > >         }
> > > ...
> > >
> >
> > Thanks for the report with a full reproducer.
> >
> > I don't have a full answer yet, but can reproduce this easily.
> >
> > The third program, without MSG_MORE, builds an skb with
> > CHECKSUM_PARTIAL in __ip_append_data. When looped to the receive path
> > that ip_summed means no additional validation is needed. As encoded in
> > skb_csum_unnecessary.
> >
> > The first and second programs are essentially the same, bar for a
> > slight difference in length. In both cases packet length is very short
> > compared to the loopback device MTU. Because of MSG_MORE, these
> > packets have CHECKSUM_NONE.
> >
> > On receive in
> >
> >   __udp4_lib_rcv()
> >     udp4_csum_init()
> >       err = skb_checksum_init_zero_check()
> >
> > The second program validates and sets ip_summed = CHECKSUM_COMPLETE
> > and csum_valid = 1.
> > The first does not, though err == 0.
> >
> > This appears to succeed consistently for packets <= 68B of payload,
> > fail consistently otherwise. It is not clear to me yet what causes
> > this distinction.
>
> This is from
>
> "
> /* For small packets <= CHECKSUM_BREAK perform checksum complete directly
>  * in checksum_init.
>  */
> #define CHECKSUM_BREAK 76
> "
>
> So the small packet gets checksummed immediately in
> __skb_checksum_validate_complete, but the larger one does not.
>
> Question is why the copy_and_checksum you pointed to seems to fail checksum.

Manually calling __skb_checksum_complete(skb) in
skb_copy_and_csum_datagram_msg succeeds, so it is the
skb_copy_and_csum_datagram that returns an incorrect csum.

Bisection shows that this is a regression in 5.0, between

65d69e2505bb datagram: introduce skb_copy_and_hash_datagram_iter helper (fail)
d05f443554b3 iov_iter: introduce hash_and_copy_to_iter helper
950fcaecd5cc datagram: consolidate datagram copy to iter helpers
cb002d074dab iov_iter: pass void csum pointer to csum_and_copy_to_iter (pass)

That's a significant amount of code change. I'll take a closer look,
but checkpointing state for now..

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-27  3:25     ` Willem de Bruijn
@ 2021-01-28  2:53       ` Willem de Bruijn
  2021-01-28  3:10         ` Willem de Bruijn
  0 siblings, 1 reply; 8+ messages in thread
From: Willem de Bruijn @ 2021-01-28  2:53 UTC (permalink / raw)
  To: oliver.graute
  Cc: Network Development, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Sitnicki, Paolo Abeni, sagi, Christoph Hellwig, sagi

On Tue, Jan 26, 2021 at 10:25 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Tue, Jan 26, 2021 at 5:00 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Tue, Jan 26, 2021 at 4:54 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > On Tue, Jan 26, 2021 at 9:58 AM Oliver Graute <oliver.graute@gmail.com> wrote:
> > > >
> > > > Hello,
> > > >
> > > > we observe some unexpected behavior in the UDP implementation of the
> > > > linux kernel.
> > > >
> > > > Some UDP packets send via the loopback interface are dropped in the
> > > > kernel on the receive side when using sendto with the MSG_MORE flag.
> > > > Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> > > > example code to reproduce it is appended below.
> > > >
> > > > In the code we tracked it down to this code section. ( Even a little
> > > > further but its unclear to me wy the csum() is wrong in the bad case)
> > > >
> > > > udpv6_recvmsg()
> > > > ...
> > > > if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
> > > >                 if (udp_skb_is_linear(skb))
> > > >                         err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
> > > >                 else
> > > >                         err = skb_copy_datagram_msg(skb, off, msg, copied);
> > > >         } else {
> > > >                 err = skb_copy_and_csum_datagram_msg(skb, off, msg);
> > > >                 if (err == -EINVAL) {
> > > >                         goto csum_copy_err;
> > > >                 }
> > > >         }
> > > > ...
> > > >
> > >
> > > Thanks for the report with a full reproducer.
> > >
> > > I don't have a full answer yet, but can reproduce this easily.
> > >
> > > The third program, without MSG_MORE, builds an skb with
> > > CHECKSUM_PARTIAL in __ip_append_data. When looped to the receive path
> > > that ip_summed means no additional validation is needed. As encoded in
> > > skb_csum_unnecessary.
> > >
> > > The first and second programs are essentially the same, bar for a
> > > slight difference in length. In both cases packet length is very short
> > > compared to the loopback device MTU. Because of MSG_MORE, these
> > > packets have CHECKSUM_NONE.
> > >
> > > On receive in
> > >
> > >   __udp4_lib_rcv()
> > >     udp4_csum_init()
> > >       err = skb_checksum_init_zero_check()
> > >
> > > The second program validates and sets ip_summed = CHECKSUM_COMPLETE
> > > and csum_valid = 1.
> > > The first does not, though err == 0.
> > >
> > > This appears to succeed consistently for packets <= 68B of payload,
> > > fail consistently otherwise. It is not clear to me yet what causes
> > > this distinction.
> >
> > This is from
> >
> > "
> > /* For small packets <= CHECKSUM_BREAK perform checksum complete directly
> >  * in checksum_init.
> >  */
> > #define CHECKSUM_BREAK 76
> > "
> >
> > So the small packet gets checksummed immediately in
> > __skb_checksum_validate_complete, but the larger one does not.
> >
> > Question is why the copy_and_checksum you pointed to seems to fail checksum.
>
> Manually calling __skb_checksum_complete(skb) in
> skb_copy_and_csum_datagram_msg succeeds, so it is the
> skb_copy_and_csum_datagram that returns an incorrect csum.
>
> Bisection shows that this is a regression in 5.0, between
>
> 65d69e2505bb datagram: introduce skb_copy_and_hash_datagram_iter helper (fail)
> d05f443554b3 iov_iter: introduce hash_and_copy_to_iter helper
> 950fcaecd5cc datagram: consolidate datagram copy to iter helpers
> cb002d074dab iov_iter: pass void csum pointer to csum_and_copy_to_iter (pass)
>
> That's a significant amount of code change. I'll take a closer look,
> but checkpointing state for now..

Key difference is the csum_block_add when handling frags, and the
removal of temporary csum2.

In the reproducer, there is one 13B csum_and_copy_to_iter from
skb->data + offset, followed by a 73B csum_and_copy_to_iter from the
first frag. So the second one passes pos 13 to csum_block_add.

The original implementation of skb_copy_and_csum_datagram similarly
fails the test, if we fail to account for the position

-                       *csump = csum_block_add(*csump, csum2, pos);
+                       *csump = csum_block_add(*csump, csum2, 0);

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-28  2:53       ` Willem de Bruijn
@ 2021-01-28  3:10         ` Willem de Bruijn
  2021-01-28 15:23           ` Oliver Graute
  0 siblings, 1 reply; 8+ messages in thread
From: Willem de Bruijn @ 2021-01-28  3:10 UTC (permalink / raw)
  To: oliver.graute
  Cc: Network Development, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Sitnicki, Paolo Abeni, sagi, Christoph Hellwig, sagi,
	Al Viro

On Wed, Jan 27, 2021 at 9:53 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Tue, Jan 26, 2021 at 10:25 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Tue, Jan 26, 2021 at 5:00 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > On Tue, Jan 26, 2021 at 4:54 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > On Tue, Jan 26, 2021 at 9:58 AM Oliver Graute <oliver.graute@gmail.com> wrote:
> > > > >
> > > > > Hello,
> > > > >
> > > > > we observe some unexpected behavior in the UDP implementation of the
> > > > > linux kernel.
> > > > >
> > > > > Some UDP packets send via the loopback interface are dropped in the
> > > > > kernel on the receive side when using sendto with the MSG_MORE flag.
> > > > > Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> > > > > example code to reproduce it is appended below.
> > > > >
> > > > > In the code we tracked it down to this code section. ( Even a little
> > > > > further but its unclear to me wy the csum() is wrong in the bad case)
> > > > >
> > > > > udpv6_recvmsg()
> > > > > ...
> > > > > if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
> > > > >                 if (udp_skb_is_linear(skb))
> > > > >                         err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
> > > > >                 else
> > > > >                         err = skb_copy_datagram_msg(skb, off, msg, copied);
> > > > >         } else {
> > > > >                 err = skb_copy_and_csum_datagram_msg(skb, off, msg);
> > > > >                 if (err == -EINVAL) {
> > > > >                         goto csum_copy_err;
> > > > >                 }
> > > > >         }
> > > > > ...
> > > > >
> > > >
> > > > Thanks for the report with a full reproducer.
> > > >
> > > > I don't have a full answer yet, but can reproduce this easily.
> > > >
> > > > The third program, without MSG_MORE, builds an skb with
> > > > CHECKSUM_PARTIAL in __ip_append_data. When looped to the receive path
> > > > that ip_summed means no additional validation is needed. As encoded in
> > > > skb_csum_unnecessary.
> > > >
> > > > The first and second programs are essentially the same, bar for a
> > > > slight difference in length. In both cases packet length is very short
> > > > compared to the loopback device MTU. Because of MSG_MORE, these
> > > > packets have CHECKSUM_NONE.
> > > >
> > > > On receive in
> > > >
> > > >   __udp4_lib_rcv()
> > > >     udp4_csum_init()
> > > >       err = skb_checksum_init_zero_check()
> > > >
> > > > The second program validates and sets ip_summed = CHECKSUM_COMPLETE
> > > > and csum_valid = 1.
> > > > The first does not, though err == 0.
> > > >
> > > > This appears to succeed consistently for packets <= 68B of payload,
> > > > fail consistently otherwise. It is not clear to me yet what causes
> > > > this distinction.
> > >
> > > This is from
> > >
> > > "
> > > /* For small packets <= CHECKSUM_BREAK perform checksum complete directly
> > >  * in checksum_init.
> > >  */
> > > #define CHECKSUM_BREAK 76
> > > "
> > >
> > > So the small packet gets checksummed immediately in
> > > __skb_checksum_validate_complete, but the larger one does not.
> > >
> > > Question is why the copy_and_checksum you pointed to seems to fail checksum.
> >
> > Manually calling __skb_checksum_complete(skb) in
> > skb_copy_and_csum_datagram_msg succeeds, so it is the
> > skb_copy_and_csum_datagram that returns an incorrect csum.
> >
> > Bisection shows that this is a regression in 5.0, between
> >
> > 65d69e2505bb datagram: introduce skb_copy_and_hash_datagram_iter helper (fail)
> > d05f443554b3 iov_iter: introduce hash_and_copy_to_iter helper
> > 950fcaecd5cc datagram: consolidate datagram copy to iter helpers
> > cb002d074dab iov_iter: pass void csum pointer to csum_and_copy_to_iter (pass)
> >
> > That's a significant amount of code change. I'll take a closer look,
> > but checkpointing state for now..
>
> Key difference is the csum_block_add when handling frags, and the
> removal of temporary csum2.
>
> In the reproducer, there is one 13B csum_and_copy_to_iter from
> skb->data + offset, followed by a 73B csum_and_copy_to_iter from the
> first frag. So the second one passes pos 13 to csum_block_add.
>
> The original implementation of skb_copy_and_csum_datagram similarly
> fails the test, if we fail to account for the position
>
> -                       *csump = csum_block_add(*csump, csum2, pos);
> +                       *csump = csum_block_add(*csump, csum2, 0);

One possible approach:

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 81809fa735a7..56bd0c32fa65 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -721,8 +721,10 @@ static int skb_copy_and_csum_datagram(const
struct sk_buff *skb, int offset,
                                      struct iov_iter *to, int len,
                                      __wsum *csump)
 {
+       struct csum_iter csdata = { .csump = csump };
+
        return __skb_datagram_iter(skb, offset, to, len, true,
-                       csum_and_copy_to_iter, csump);
+                       csum_and_copy_to_iter, &csdata);
 }
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 72d88566694e..7ce4f403a03f 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -260,7 +260,12 @@ static inline void iov_iter_reexpand(struct
iov_iter *i, size_t count)
 {
        i->count = count;
 }
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void
*csump, struct iov_iter *i);
+
+struct csum_iter {
+       void *csump;
+       int off;
+};
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void
*csdata, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum
*csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum
*csum, struct iov_iter *i);
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index a21e6a5792c5..2e6e24f7dfe1 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1522,13 +1522,13 @@ bool csum_and_copy_from_iter_full(void *addr,
size_t bytes, __wsum *csum,
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter_full);

-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csdata,
                             struct iov_iter *i)
 {
+       struct csum_iter *csdata = _csdata;
        const char *from = addr;
-       __wsum *csum = csump;
+       __wsum *csum = csdata->csump;
        __wsum sum, next;
-       size_t off = 0;

        if (unlikely(iov_iter_is_pipe(i)))
                return csum_and_copy_to_pipe_iter(addr, bytes, csum, i);
@@ -1543,22 +1543,22 @@ size_t csum_and_copy_to_iter(const void *addr,
size_t bytes, void *csump,
                                             v.iov_base,
                                             v.iov_len);
                if (next) {
-                       sum = csum_block_add(sum, next, off);
-                       off += v.iov_len;
+                       sum = csum_block_add(sum, next, csdata->off);
+                       csdata->off += v.iov_len;
                }
                next ? 0 : v.iov_len;
        }), ({
                char *p = kmap_atomic(v.bv_page);
                sum = csum_and_memcpy(p + v.bv_offset,
                                      (from += v.bv_len) - v.bv_len,
-                                     v.bv_len, sum, off);
+                                     v.bv_len, sum, csdata->off);
                kunmap_atomic(p);
-               off += v.bv_len;
+               csdata->off += v.bv_len;
        }),({
                sum = csum_and_memcpy(v.iov_base,
                                     (from += v.iov_len) - v.iov_len,
-                                    v.iov_len, sum, off);
-               off += v.iov_len;
+                                    v.iov_len, sum, csdata->off);
+               csdata->off += v.iov_len;
        })
        )
        *csum = sum;

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-28  3:10         ` Willem de Bruijn
@ 2021-01-28 15:23           ` Oliver Graute
  0 siblings, 0 replies; 8+ messages in thread
From: Oliver Graute @ 2021-01-28 15:23 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Network Development, Alexey Kuznetsov, Hideaki YOSHIFUJI,
	Jakub Sitnicki, Paolo Abeni, sagi, Christoph Hellwig, sagi,
	Al Viro

On 27/01/21, Willem de Bruijn wrote:
> On Wed, Jan 27, 2021 at 9:53 PM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Tue, Jan 26, 2021 at 10:25 PM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > On Tue, Jan 26, 2021 at 5:00 PM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > On Tue, Jan 26, 2021 at 4:54 PM Willem de Bruijn
> > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jan 26, 2021 at 9:58 AM Oliver Graute <oliver.graute@gmail.com> wrote:
> > > > > >
> > > > > > Hello,
> > > > > >
> > > > > > we observe some unexpected behavior in the UDP implementation of the
> > > > > > linux kernel.
> > > > > >
> > > > > > Some UDP packets send via the loopback interface are dropped in the
> > > > > > kernel on the receive side when using sendto with the MSG_MORE flag.
> > > > > > Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> > > > > > example code to reproduce it is appended below.
> > > > > >
> > > > > > In the code we tracked it down to this code section. ( Even a little
> > > > > > further but its unclear to me wy the csum() is wrong in the bad case)
> > > > > >
> > > > > > udpv6_recvmsg()
> > > > > > ...
> > > > > > if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
> > > > > >                 if (udp_skb_is_linear(skb))
> > > > > >                         err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
> > > > > >                 else
> > > > > >                         err = skb_copy_datagram_msg(skb, off, msg, copied);
> > > > > >         } else {
> > > > > >                 err = skb_copy_and_csum_datagram_msg(skb, off, msg);
> > > > > >                 if (err == -EINVAL) {
> > > > > >                         goto csum_copy_err;
> > > > > >                 }
> > > > > >         }
> > > > > > ...
> > > > > >
> > > > >
> > > > > Thanks for the report with a full reproducer.
> > > > >
> > > > > I don't have a full answer yet, but can reproduce this easily.
> > > > >
> > > > > The third program, without MSG_MORE, builds an skb with
> > > > > CHECKSUM_PARTIAL in __ip_append_data. When looped to the receive path
> > > > > that ip_summed means no additional validation is needed. As encoded in
> > > > > skb_csum_unnecessary.
> > > > >
> > > > > The first and second programs are essentially the same, bar for a
> > > > > slight difference in length. In both cases packet length is very short
> > > > > compared to the loopback device MTU. Because of MSG_MORE, these
> > > > > packets have CHECKSUM_NONE.
> > > > >
> > > > > On receive in
> > > > >
> > > > >   __udp4_lib_rcv()
> > > > >     udp4_csum_init()
> > > > >       err = skb_checksum_init_zero_check()
> > > > >
> > > > > The second program validates and sets ip_summed = CHECKSUM_COMPLETE
> > > > > and csum_valid = 1.
> > > > > The first does not, though err == 0.
> > > > >
> > > > > This appears to succeed consistently for packets <= 68B of payload,
> > > > > fail consistently otherwise. It is not clear to me yet what causes
> > > > > this distinction.
> > > >
> > > > This is from
> > > >
> > > > "
> > > > /* For small packets <= CHECKSUM_BREAK perform checksum complete directly
> > > >  * in checksum_init.
> > > >  */
> > > > #define CHECKSUM_BREAK 76
> > > > "
> > > >
> > > > So the small packet gets checksummed immediately in
> > > > __skb_checksum_validate_complete, but the larger one does not.
> > > >
> > > > Question is why the copy_and_checksum you pointed to seems to fail checksum.
> > >
> > > Manually calling __skb_checksum_complete(skb) in
> > > skb_copy_and_csum_datagram_msg succeeds, so it is the
> > > skb_copy_and_csum_datagram that returns an incorrect csum.
> > >
> > > Bisection shows that this is a regression in 5.0, between
> > >
> > > 65d69e2505bb datagram: introduce skb_copy_and_hash_datagram_iter helper (fail)
> > > d05f443554b3 iov_iter: introduce hash_and_copy_to_iter helper
> > > 950fcaecd5cc datagram: consolidate datagram copy to iter helpers
> > > cb002d074dab iov_iter: pass void csum pointer to csum_and_copy_to_iter (pass)
> > >
> > > That's a significant amount of code change. I'll take a closer look,
> > > but checkpointing state for now..
> >
> > Key difference is the csum_block_add when handling frags, and the
> > removal of temporary csum2.
> >
> > In the reproducer, there is one 13B csum_and_copy_to_iter from
> > skb->data + offset, followed by a 73B csum_and_copy_to_iter from the
> > first frag. So the second one passes pos 13 to csum_block_add.
> >
> > The original implementation of skb_copy_and_csum_datagram similarly
> > fails the test, if we fail to account for the position
> >
> > -                       *csump = csum_block_add(*csump, csum2, pos);
> > +                       *csump = csum_block_add(*csump, csum2, 0);
> 
> One possible approach:

very thx for your analysis and your patch proposal. After a first quick
test this patch proposal solves the problem.

Best regards,

Oliver

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: UDP implementation and the MSG_MORE flag
  2021-01-26 14:12 UDP implementation and the MSG_MORE flag Oliver Graute
  2021-01-26 21:54 ` Willem de Bruijn
@ 2021-02-03 22:21 ` michi1
  1 sibling, 0 replies; 8+ messages in thread
From: michi1 @ 2021-02-03 22:21 UTC (permalink / raw)
  To: kernelnewbies, netdev, kuznet, yoshfuji, jakub, pabeni

Hi!

On 15:12 Tue 26 Jan     , Oliver Graute wrote:
> Some UDP packets send via the loopback interface are dropped in the
> kernel on the receive side when using sendto with the MSG_MORE flag.
> Every drop increases the InCsumErrors in /proc/self/net/snmp. Some
> example code to reproduce it is appended below.
> 
> In the code we tracked it down to this code section. ( Even a little
> further but its unclear to me wy the csum() is wrong in the bad case)

I think it is unlikely that the error is on the receiving side. I recommend to
capture the data that is being sent with a sniffer and see if there is
anything wrong. You may also want to try disabling hardware acceleration on
your network devices, because the checksum is sometimes done in hardware.

	-Michi
-- 
programing a layer 3+4 network protocol for mesh networks
see http://michaelblizek.twilightparadox.com

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-02-03 22:37 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-26 14:12 UDP implementation and the MSG_MORE flag Oliver Graute
2021-01-26 21:54 ` Willem de Bruijn
2021-01-26 22:00   ` Willem de Bruijn
2021-01-27  3:25     ` Willem de Bruijn
2021-01-28  2:53       ` Willem de Bruijn
2021-01-28  3:10         ` Willem de Bruijn
2021-01-28 15:23           ` Oliver Graute
2021-02-03 22:21 ` michi1

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).