* ppoll() stuck on POLLIN while TCP peer is sending
@ 2012-12-28  1:45 Eric Wong
  2012-12-28  7:06 ` Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread
From: Eric Wong @ 2012-12-28  1:45 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang

I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
local TCP socket.  The isolated code below can reproduce the issue
after many minutes (<1 hour).  It might be easier to reproduce on
a busy system while disk I/O is happening.

This may also be related to an epoll-related issue reported
by Andreas Voellmy:
http://thread.gmane.org/gmane.linux.kernel/1408782/

My example involves a 3 thread data flow between two pairs
of (4) sockets:

	 send_loop ->   recv_loop(recv_send)   -> recv_loop(recv_only)
	 pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]

At least 3.7 and 3.7.1 are affected.

I have tcp_low_latency=1 set; I will try 0 later.

The last progress message I got was after receiving 2942052597760
bytes on fd=7 (out of 64-bit ULONG_MAX / 2)

strace:

3644  sendto(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 0, NULL, 0 <unfinished ...>
3643  sendto(6, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 0, NULL, 0 <unfinished ...>
3642  ppoll([{fd=7, events=POLLIN}], 1, NULL, NULL, 8 <unfinished ...>
3641  futex(0x7f23ed8129d0, FUTEX_WAIT, 3644, NULL <unfinished ...>

The first and last lines of the strace are expected:

+ 3644	sendto(4) is blocked because 3643 is blocked on sendto(fd=6)
  and not able to call recv().
+ 3641 is the main thread calling pthread_join

What is unexpected is the tid=3643 and tid=3642 interaction.  As confirmed
by lsof below, fd=6 is sending to wake up fd=7, but ppoll(fd=7) seems
to not be waking up.

lsof:
toosleepy 3641   ew    4u  IPv4  12405      0t0     TCP localhost:55904->localhost:33249 (ESTABLISHED)
toosleepy 3641   ew    5u  IPv4  12406      0t0     TCP localhost:33249->localhost:55904 (ESTABLISHED)
toosleepy 3641   ew    6u  IPv4  12408      0t0     TCP localhost:48777->localhost:33348 (ESTABLISHED)
toosleepy 3641   ew    7u  IPv4  12409      0t0     TCP localhost:33348->localhost:48777 (ESTABLISHED)

System info: Linux 3.7.1 x86_64 SMP PREEMPT
AMD Phenom(tm) II X4 945 Processor (4 cores)
Nothing interesting in dmesg, iptables rules are empty.

I have not yet been able to reproduce the issue using UNIX sockets,
only TCP, but you can run:

  ./toosleepy unix

...to test with UNIX sockets instead of TCP.

The following code is also available via git://bogomips.org/toosleepy
gcc -o toosleepy -O2 -Wall -lpthread toosleepy.c
-------------------------------- 8< ------------------------------------
#define _GNU_SOURCE
#include <poll.h>
#include <sys/ioctl.h>
#include <pthread.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <assert.h>
#include <limits.h>

struct receiver {
	int rfd;
	int sfd;
};

/* blocking sender */
static void * send_loop(void *fdp)
{
	int fd = *(int *)fdp;
	char buf[16384];
	ssize_t s;
	size_t sent = 0;
	size_t max = (size_t)ULONG_MAX / 2;

	while (sent < max) {
		s = send(fd, buf, sizeof(buf), 0);
		if (s > 0)
			sent += s;
		if (s == -1)
			assert(errno == EINTR);
	}
	dprintf(2, "%d done sending: %zu\n", fd, sent);
	close(fd);
	return NULL;
}

/* non-blocking receiver, using ppoll */
static void * recv_loop(void *p)
{
	const struct receiver *rcvr = p;
	char buf[16384];
	nfds_t nfds = 1;
	struct pollfd fds;
	int rc;
	ssize_t r, s;
	size_t received = 0;
	size_t sent = 0;

	for (;;) {
		r = recv(rcvr->rfd, buf, sizeof(buf), 0);
		if (r == 0) {
			break;
		} else if (r == -1) {
			assert(errno == EAGAIN);

			fds.fd = rcvr->rfd;
			fds.events = POLLIN;
			errno = 0;
			rc = ppoll(&fds, nfds, NULL, NULL);
			assert(rc == 1);
		} else {
			assert(r > 0);
			received += r;
			if (rcvr->sfd >= 0) {
				s = send(rcvr->sfd, buf, sizeof(buf), 0);
				if (s > 0)
					sent += s;
				if (s == -1)
					assert(errno == EINTR);
			} else {
				/* just burn some cycles */
				write(-1, buf, sizeof(buf));
			}
		}
		if ((received % (sizeof(buf) * sizeof(buf) * 16) == 0))
			dprintf(2, " %d progress: %zu\n",
			        rcvr->rfd, received);
	}
	dprintf(2, "%d got: %zu\n", rcvr->rfd, received);
	if (rcvr->sfd >= 0) {
		dprintf(2, "%d sent: %zu\n", rcvr->sfd, sent);
		close(rcvr->sfd);
	}

	return NULL;
}

static void tcp_socketpair(int sv[2], int accept_flags)
{
	struct sockaddr_in addr;
	socklen_t addrlen = sizeof(addr);
	int l = socket(PF_INET, SOCK_STREAM, 0);
	int c = socket(PF_INET, SOCK_STREAM, 0);
	int a;

	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = INADDR_ANY;
	addr.sin_port = 0;
	assert(0 == bind(l, (struct sockaddr*)&addr, addrlen));
	assert(0 == listen(l, 1024));
	assert(0 == getsockname(l, (struct sockaddr *)&addr, &addrlen));
	assert(0 == connect(c, (struct sockaddr *)&addr, addrlen));
	a = accept4(l, NULL, NULL, accept_flags);
	assert(a >= 0);
	close(l);
	sv[0] = a;
	sv[1] = c;
}

int main(int argc, char *argv[])
{
	int pair_a[2];
	int pair_b[2];
	pthread_t s, rs, r;
	struct receiver recv_only;
	struct receiver recv_send;

	if (argc == 2 && strcmp(argv[1], "unix") == 0) {
		int val;
		assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, pair_a));
		assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, pair_b));
		/* only make the receiver non-blocking */
		val = 1;
		assert(0 == ioctl(pair_a[0], FIONBIO, &val));
		val = 1;
		assert(0 == ioctl(pair_b[0], FIONBIO, &val));
	} else {
		tcp_socketpair(pair_a, SOCK_NONBLOCK);
		tcp_socketpair(pair_b, SOCK_NONBLOCK);
	}

	recv_send.rfd = pair_a[0];
	recv_send.sfd = pair_b[1];
	recv_only.rfd = pair_b[0];
	recv_only.sfd = -1;

	/*
	 * data flow:
	 * send_loop ->   recv_loop(recv_send)   -> recv_loop(recv_only)
	 * pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]
	 */
	assert(0 == pthread_create(&r, NULL, recv_loop, &recv_only));
	assert(0 == pthread_create(&rs, NULL, recv_loop, &recv_send));
	assert(0 == pthread_create(&s, NULL, send_loop, &pair_a[1]));
	assert(0 == pthread_join(s, NULL));
	assert(0 == pthread_join(rs, NULL));
	assert(0 == pthread_join(r, NULL));

	return 0;
}
-------------------------------- 8< ------------------------------------
Any help/suggestions/test patches would be greatly appreciated.
Thanks for reading!

-- 
Eric Wong

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
@ 2012-12-28  7:06 ` Eric Wong
  2012-12-29 11:34   ` Eric Wong
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2012-12-28  7:06 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang

Eric Wong <normalperson@yhbt.net> wrote:
> I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> local TCP socket.  The isolated code below can reproduce the issue
> after many minutes (<1 hour).  It might be easier to reproduce on
> a busy system while disk I/O is happening.

Ugh, I can't seem to reproduce this anymore...  Will try something
else tomorrow.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2012-12-28  7:06 ` Eric Wong
@ 2012-12-29 11:34   ` Eric Wong
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2012-12-29 11:34 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: Andreas Voellmy, viro, linux-fsdevel, Junchang(Jason) Wang

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Wong <normalperson@yhbt.net> wrote:
> > I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> > local TCP socket.  The isolated code below can reproduce the issue
> > after many minutes (<1 hour).  It might be easier to reproduce on
> > a busy system while disk I/O is happening.
> 
> Ugh, I can't seem to reproduce this anymore...  Will try something
> else tomorrow.

The good news is I'm not imagining this...

The bad news is the issue is real and took a long time to reproduce
again.  This issue happens even without preempt and without
tcp_low_latency on 3.7.1.

While running `toosleepy', I also needed to run heavy (not loopback)
network and disk activity (several USB, SATA, and eSATA drives
simultaneously) for many hours before hitting this.

Hopefully this report is helpful in solving the issue.  Looking at
the various pieces in the net and select/poll paths, there are several
references to race conditions in the comments, so this is hopefully
familiar territory to someone here...

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2012-12-28  7:06 ` Eric Wong
@ 2012-12-31 13:21 ` Eric Wong
  2012-12-31 23:24   ` Eric Wong
  2013-01-01 18:42   ` Eric Dumazet
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2 siblings, 2 replies; 53+ messages in thread
From: Eric Wong @ 2012-12-31 13:21 UTC (permalink / raw)
  To: linux-kernel
  Cc: Eric Wong, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Eric Dumazet, Andrew Morton, Linus Torvalds,
	Andreas Voellmy, Junchang(Jason) Wang, netdev, linux-fsdevel

This patch seems to fix my issue with ppoll() being stuck on my
SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414

The change to sock_poll_wait() in
commit 626cf236608505d376e4799adb4f7eb00a8594af
  (poll: add poll_requested_events() and poll_does_not_wait() functions)
seems to have allowed additional cases where the SMP memory barrier
is not issued before checking for readiness.

In my case, this affects the select()-family of functions
which register descriptors once and set _qproc to NULL before
checking events again (after poll_schedule_timeout() returns).
The set_mb() barrier in poll_schedule_timeout() appears to be
insufficient on my SMP x86-64 machine (as it's only an xchg()).

This may also be related to the epoll issue described by
Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/

Signed-off-by: Eric Wong <normalperson@yhbt.net>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Voellmy <andreas.voellmy@yale.edu>
Cc: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
Cc: netdev@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 If this patch is correct, I think we can just drop the
 poll_does_not_wait() function entirely since poll_wait()
 does the same check anyways...
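
 As background, a minimal userspace analogue of the sleeper/waker
 pairing that the wq_has_sleeper() comments describe (a sketch using
 C11 atomics; the names here are illustrative, not kernel API).  The
 poller-side fence is the one sock_poll_wait() must still issue even
 when _qproc is NULL:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool data_ready;		/* "socket has data queued" */
static atomic_bool have_sleeper;	/* "waitqueue is active" */

/* waker side: the sender/softirq role (think wq_has_sleeper + wake_up) */
static void waker(void)
{
	atomic_store_explicit(&data_ready, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst); /* pairs with fence below */
	if (atomic_load_explicit(&have_sleeper, memory_order_relaxed)) {
		/* wake the sleeper here */
	}
}

/* poller side: the poll/select role (think sock_poll_wait) */
static bool poller_should_sleep(void)
{
	atomic_store_explicit(&have_sleeper, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst); /* pairs with fence above */
	/* if either side skips its fence, the waker can see no sleeper
	 * while the poller sees no data, and the wakeup is lost */
	return !atomic_load_explicit(&data_ready, memory_order_relaxed);
}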

 include/net/sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..1923e48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1925,8 +1925,9 @@ static inline bool wq_has_sleeper(struct socket_wq *wq)
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
 {
-	if (!poll_does_not_wait(p) && wait_address) {
-		poll_wait(filp, wait_address, p);
+	if (wait_address) {
+		if (!poll_does_not_wait(p))
+			poll_wait(filp, wait_address, p);
 		/* We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-- 
Eric Wong

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
@ 2012-12-31 23:24   ` Eric Wong
  2013-01-01 16:58     ` Junchang(Jason) Wang
  2013-01-01 18:42   ` Eric Dumazet
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Wong @ 2012-12-31 23:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Eric Dumazet, Andrew Morton, Linus Torvalds,
	Andreas Voellmy, Junchang(Jason) Wang, netdev, linux-fsdevel

Eric Wong <normalperson@yhbt.net> wrote:
> This patch seems to fix my issue with ppoll() being stuck on my
> SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414

OK, it doesn't fix my issue, but it seems to make it harder-to-hit...

> The change to sock_poll_wait() in
> commit 626cf236608505d376e4799adb4f7eb00a8594af
>   (poll: add poll_requested_events() and poll_does_not_wait() functions)
> seems to have allowed additional cases where the SMP memory barrier
> is not issued before checking for readiness.
> 
> In my case, this affects the select()-family of functions
> which register descriptors once and set _qproc to NULL before
> checking events again (after poll_schedule_timeout() returns).
> The set_mb() barrier in poll_schedule_timeout() appears to be
> insufficient on my SMP x86-64 machine (as it's only an xchg()).
> 
> This may also be related to the epoll issue described by
> Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/

However, I believe my patch will still fix Andreas' issue with epoll
due to how ep_modify() uses a NULL qproc when calling ->poll().

(I've never been able to reproduce Andreas' issue on my 4-core system,
 but he's been hitting it since 3.4 (at least))
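
(For readers following along: a simplified sketch of why a NULL qproc
 makes ->poll() skip registration.  This is close to, but not
 guaranteed identical to, the kernel's poll_wait() in
 include/linux/poll.h:)

static inline void poll_wait(struct file *filp,
			     wait_queue_head_t *wait_address, poll_table *p)
{
	if (p && p->_qproc && wait_address)
		p->_qproc(filp, wait_address, p); /* register on the waitqueue */
	/* with a NULL p or a NULL p->_qproc this is a no-op, so the
	 * caller gets the current event bits without re-registering */
}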

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-31 23:24   ` Eric Wong
@ 2013-01-01 16:58     ` Junchang(Jason) Wang
  0 siblings, 0 replies; 53+ messages in thread
From: Junchang(Jason) Wang @ 2013-01-01 16:58 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Eric Dumazet, Andrew Morton, Linus Torvalds,
	Andreas Voellmy, netdev, linux-fsdevel

Hi Eric and list,

Thanks a lot. The patch solves our (Andreas's and my) issue using
epoll. Here's our test program:
https://github.com/AndreasVoellmy/epollbug/blob/master/epollbug.c
We are using Linux 3.7.1 and a server with 80 cores.

Cheers!

--Jason

On Mon, Dec 31, 2012 at 6:24 PM, Eric Wong <normalperson@yhbt.net> wrote:
>
> Eric Wong <normalperson@yhbt.net> wrote:
> > This patch seems to fix my issue with ppoll() being stuck on my
> > SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414
>
> OK, it doesn't fix my issue, but it seems to make it harder-to-hit...
>
> > The change to sock_poll_wait() in
> > commit 626cf236608505d376e4799adb4f7eb00a8594af
> >   (poll: add poll_requested_events() and poll_does_not_wait() functions)
> > seems to have allowed additional cases where the SMP memory barrier
> > is not issued before checking for readiness.
> >
> > In my case, this affects the select()-family of functions
> > which register descriptors once and set _qproc to NULL before
> > checking events again (after poll_schedule_timeout() returns).
> > The set_mb() barrier in poll_schedule_timeout() appears to be
> > insufficient on my SMP x86-64 machine (as it's only an xchg()).
> >
> > This may also be related to the epoll issue described by
> > Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/
>
> However, I believe my patch will still fix Andreas' issue with epoll
> due to how ep_modify() uses a NULL qproc when calling ->poll().
>
> (I've never been able to reproduce Andreas' issue on my 4-core system,
>  but he's been hitting it since 3.4 (at least))

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
  2012-12-31 23:24   ` Eric Wong
@ 2013-01-01 18:42   ` Eric Dumazet
  2013-01-01 21:00     ` Eric Wong
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-01 18:42 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

On Mon, 2012-12-31 at 13:21 +0000, Eric Wong wrote:
> This patch seems to fix my issue with ppoll() being stuck on my
> SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414
> 
> The change to sock_poll_wait() in
> commit 626cf236608505d376e4799adb4f7eb00a8594af
>   (poll: add poll_requested_events() and poll_does_not_wait() functions)
> seems to have allowed additional cases where the SMP memory barrier
> is not issued before checking for readiness.
> 
> In my case, this affects the select()-family of functions
> which register descriptors once and set _qproc to NULL before
> checking events again (after poll_schedule_timeout() returns).
> The set_mb() barrier in poll_schedule_timeout() appears to be
> insufficient on my SMP x86-64 machine (as it's only an xchg()).
> 
> This may also be related to the epoll issue described by
> Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/

Hmm, the change seems not very logical to me.

If it helps, I would like to understand the real issue.

commit 626cf236608505d376e4799adb4f7eb00a8594af should not have this
side effect, at least for the poll()/select() functions.  I am not yet
very confident about the epoll() changes.

I suspect a race already existed before this commit; it would be nice
to track it properly.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 18:42   ` Eric Dumazet
@ 2013-01-01 21:00     ` Eric Wong
  2013-01-01 21:17       ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-01 21:00 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Mon, 2012-12-31 at 13:21 +0000, Eric Wong wrote:
> > This patch seems to fix my issue with ppoll() being stuck on my
> > SMP machine: http://article.gmane.org/gmane.linux.file-systems/70414
> > 
> > The change to sock_poll_wait() in
> > commit 626cf236608505d376e4799adb4f7eb00a8594af
> >   (poll: add poll_requested_events() and poll_does_not_wait() functions)
> > seems to have allowed additional cases where the SMP memory barrier
> > is not issued before checking for readiness.
> > 
> > In my case, this affects the select()-family of functions
> > which register descriptors once and set _qproc to NULL before
> > checking events again (after poll_schedule_timeout() returns).
> > The set_mb() barrier in poll_schedule_timeout() appears to be
> > insufficient on my SMP x86-64 machine (as it's only an xchg()).
> > 
> > This may also be related to the epoll issue described by
> > Andreas Voellmy in http://thread.gmane.org/gmane.linux.kernel/1408782/
> 
> Hmm, the change seems not very logical to me.

My original description was not complete and I'm still bisecting
my problem (ppoll + send stuck).  However, my patch does solve the
issue Andreas encountered and I now understand why.

> If it helps, I would like to understand the real issue.
>
> commit 626cf236608505d376e4799adb4f7eb00a8594af should not have this
> side effect, at least for the poll()/select() functions.  I am not yet
> very confident about the epoll() changes.

I have a better explanation of the epoll problem below.

An alternate version (limited to epoll) would be:

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649..ca5f3d0 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
+	smp_mb();
 	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*

> I suspect a race already existed before this commit; it would be nice
> to track it properly.

I don't believe this race existed before that change.

Updated commit message below:

From 87bca82bc39a941d9b8d5b8bc08b39a071a9884f Mon Sep 17 00:00:00 2001
From: Eric Wong <normalperson@yhbt.net>
Date: Mon, 31 Dec 2012 13:20:23 +0000
Subject: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD

ep_modify() works on files that are already registered with a wait queue
(and thus should not reregister).  For sockets, this means sk_sleep()
will return a non-NULL wait address.

ep_modify() must check for events that were received and ignored
_before_ ep_modify() was called.  So it must call f_op->poll() to
fish for events _after_ changing epi->event.events.

When f_op->poll() calls tcp_poll() (and thus sock_poll_wait()),
wait_address is non-NULL because the socket was already registered by
epoll.  Thus, ep_modify() passes a NULL pt to prevent re-registration.

When ep_modify() is called, sock_poll_wait() will see a wait_address
but a NULL pt, and this causes the memory barrier to be skipped and
events to be missed (this memory barrier is described in the
documentation for wq_has_sleeper).

This regression appeared with the change to sock_poll_wait() in
commit 626cf236608505d376e4799adb4f7eb00a8594af
  (poll: add poll_requested_events() and poll_does_not_wait() functions)

This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang:
http://thread.gmane.org/gmane.linux.kernel/1408782/

Signed-off-by: Eric Wong <normalperson@yhbt.net>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Andreas Voellmy <andreas.voellmy@yale.edu>
Tested-by: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
Cc: netdev@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 include/net/sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c945fba..1923e48 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1925,8 +1925,9 @@ static inline bool wq_has_sleeper(struct socket_wq *wq)
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
 {
-	if (!poll_does_not_wait(p) && wait_address) {
-		poll_wait(filp, wait_address, p);
+	if (wait_address) {
+		if (!poll_does_not_wait(p))
+			poll_wait(filp, wait_address, p);
 		/* We need to be sure we are in sync with the
 		 * socket flags modification.
 		 *
-- 
Eric Wong

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 21:00     ` Eric Wong
@ 2013-01-01 21:17       ` Eric Wong
  2013-01-01 22:53         ` Linus Torvalds
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-01 21:17 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: linux-kernel, Hans Verkuil, Jiri Olsa, Jonathan Corbet, Al Viro,
	Davide Libenzi, Hans de Goede, Mauro Carvalho Chehab,
	David Miller, Andrew Morton, Linus Torvalds, Andreas Voellmy,
	Junchang(Jason) Wang, netdev, linux-fsdevel

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > commit 626cf236608505d376e4799adb4f7eb00a8594af should not have this
> > side effect, at least for the poll()/select() functions.  I am not yet
> > very confident about the epoll() changes.
> 
> I have a better explanation of the epoll problem below.
> 
> An alternate version (limited to epoll) would be:
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index cd96649..ca5f3d0 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>  	 * Get current event bits. We can safely use the file* here because
>  	 * its usage count has been increased by the caller of this function.
>  	 */
> +	smp_mb();
>  	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
>  
>  	/*
> 
> > I suspect a race already existed before this commit; it would be nice to
> > track it properly.
> 
> I don't believe this race existed before that change.

I was wrong, rereading 626cf236608505d376e4799adb4f7eb00a8594af,
I think this race existed before.

Perhaps my alternate patch above is a better fix.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 21:17       ` Eric Wong
@ 2013-01-01 22:53         ` Linus Torvalds
  2013-01-01 23:21           ` Junchang(Jason) Wang
  2013-01-01 23:56           ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
  0 siblings, 2 replies; 53+ messages in thread
From: Linus Torvalds @ 2013-01-01 22:53 UTC (permalink / raw)
  To: Eric Wong
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Junchang(Jason) Wang, Network Development,
	linux-fsdevel

On Tue, Jan 1, 2013 at 1:17 PM, Eric Wong <normalperson@yhbt.net> wrote:
>>
>> An alternate version (limited to epoll) would be:
>>
>> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
>> index cd96649..ca5f3d0 100644
>> --- a/fs/eventpoll.c
>> +++ b/fs/eventpoll.c
>> @@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>>        * Get current event bits. We can safely use the file* here because
>>        * its usage count has been increased by the caller of this function.
>>        */
>> +     smp_mb();
>>       revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
>>
>>       /*
>
> I was wrong, rereading 626cf236608505d376e4799adb4f7eb00a8594af,
> I think this race existed before.
>
> Perhaps my alternate patch above is a better fix.

Please document the barrier that this mb() pairs with, and then give
an explanation for the fix in the commit message, and I'll happily
take it. Even if it's just duplicating the comments above the
wq_has_sleeper() function, except modified for the ep_modify() case.

Of course, it would be good to get verification from Jason and Andreas
that the alternate patch also works for them.

               Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] poll: prevent missed events if _qproc is NULL
  2013-01-01 22:53         ` Linus Torvalds
@ 2013-01-01 23:21           ` Junchang(Jason) Wang
  2013-01-01 23:56           ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
  1 sibling, 0 replies; 53+ messages in thread
From: Junchang(Jason) Wang @ 2013-01-01 23:21 UTC (permalink / raw)
  To: Linus Torvalds, Eric Wong
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Network Development, linux-fsdevel

Hi all,

The alternate patch from Eric works well too. Even though I didn't see
a performance boost compared with the old version, this one is clearer
to me. Thanks, guys.


Cheers!

--Jason


On Tue, Jan 1, 2013 at 5:53 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> On Tue, Jan 1, 2013 at 1:17 PM, Eric Wong <normalperson@yhbt.net> wrote:
>>>
>>> An alternate version (limited to epoll) would be:
>>>
>>> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
>>> index cd96649..ca5f3d0 100644
>>> --- a/fs/eventpoll.c
>>> +++ b/fs/eventpoll.c
>>> @@ -1299,6 +1299,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>>>        * Get current event bits. We can safely use the file* here because
>>>        * its usage count has been increased by the caller of this function.
>>>        */
>>> +     smp_mb();
>>>       revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
>>>
>>>       /*
>>
>> I was wrong, rereading 626cf236608505d376e4799adb4f7eb00a8594af,
>> I think this race existed before.
>>
>> Perhaps my alternate patch above is a better fix.
>
> Please document the barrier that this mb() pairs with, and then give
> an explanation for the fix in the commit message, and I'll happily
> take it. Even if it's just duplicating the comments above the
> wq_has_sleeper() function, except modified for the ep_modify() case.
>
> Of course, it would be good to get verification from Jason and Andreas
> that the alternate patch also works for them.
>
>                Linus

^ permalink raw reply	[flat|nested] 53+ messages in thread

* [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-01 22:53         ` Linus Torvalds
  2013-01-01 23:21           ` Junchang(Jason) Wang
@ 2013-01-01 23:56           ` Eric Wong
  2013-01-02 17:45             ` Eric Dumazet
  2013-01-02 21:16             ` Eric Wong
  1 sibling, 2 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-01 23:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Junchang(Jason) Wang, Network Development,
	linux-fsdevel

Linus Torvalds <torvalds@linux-foundation.org> wrote:
> Please document the barrier that this mb() pairs with, and then give
> an explanation for the fix in the commit message, and I'll happily
> take it. Even if it's just duplicating the comments above the
> wq_has_sleeper() function, except modified for the ep_modify() case.

Hopefully my explanation below is correct and makes sense;
I think both effects of the barrier are needed.

> Of course, it would be good to get verification from Jason and Andreas
> that the alternate patch also works for them.

Jason just confirmed it.
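
For context, the user-visible pattern that can lose events without
this fix is roughly the following (a hedged sketch: error handling is
omitted, and the epfd/fd setup and the drain() helper are assumed,
not real API beyond epoll itself):

	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT };

	ev.data.fd = fd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
	for (;;) {
		struct epoll_event out;

		if (epoll_wait(epfd, &out, 1, -1) != 1)
			continue;
		drain(out.data.fd);	/* hypothetical: read until EAGAIN */
		/* re-arm the oneshot descriptor; without the barrier in
		 * ep_modify, data arriving around this EPOLL_CTL_MOD can
		 * be missed and the next epoll_wait() sleeps forever */
		ev.events = EPOLLIN | EPOLLONESHOT;
		epoll_ctl(epfd, EPOLL_CTL_MOD, out.data.fd, &ev);
	}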

------------------------------- 8< ----------------------------
From 02f43757d04bb6f2786e79eecf1cfa82e6574379 Mon Sep 17 00:00:00 2001
From: Eric Wong <normalperson@yhbt.net>
Date: Tue, 1 Jan 2013 21:20:27 +0000
Subject: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD

EPOLL_CTL_MOD sets the interest mask before calling f_op->poll() to
ensure events are not missed.  Since the modifications to the interest
mask are not protected by the same lock as ep_poll_callback, we need to
ensure the change is visible to other CPUs calling ep_poll_callback.

We also need to ensure f_op->poll() has an up-to-date view of past
events which occurred before we modified the interest mask.  So this
barrier also pairs with the barrier in wq_has_sleeper().

This should guarantee either ep_poll_callback or f_op->poll() (or both)
will notice the readiness of a recently-ready/modified item.

This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang in:
http://thread.gmane.org/gmane.linux.kernel/1408782/

Signed-off-by: Eric Wong <normalperson@yhbt.net>
Cc: Hans Verkuil <hans.verkuil@cisco.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Hans de Goede <hdegoede@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Voellmy <andreas.voellmy@yale.edu>
Tested-by: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
Cc: netdev@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
---
 fs/eventpoll.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cd96649..39573ee 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1285,7 +1285,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * otherwise we might miss an event that happens between the
 	 * f_op->poll() call and the new event set registering.
 	 */
-	epi->event.events = event->events;
+	epi->event.events = event->events; /* need barrier below */
 	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 	if (epi->event.events & EPOLLWAKEUP) {
@@ -1296,6 +1296,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	}
 
 	/*
+	 * The following barrier has two effects:
+	 *
+	 * 1) Flush epi changes above to other CPUs.  This ensures
+	 *    we do not miss events from ep_poll_callback if an
+	 *    event occurs immediately after we call f_op->poll().
+	 *    We need this because we did not take ep->lock while
+	 *    changing epi above (but ep_poll_callback does take
+	 *    ep->lock).
+	 *
+	 * 2) We also need to ensure we do not miss _past_ events
+	 *    when calling f_op->poll().  This barrier also
+	 *    pairs with the barrier in wq_has_sleeper (see
+	 *    comments for wq_has_sleeper).
+	 *
+	 * This barrier will now guarantee ep_poll_callback or f_op->poll
+	 * (or both) will notice the readiness of an item.
+	 */
+	smp_mb();
+
+	/*
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-- 
Eric Wong

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-01 23:56           ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
@ 2013-01-02 17:45             ` Eric Dumazet
  2013-01-02 18:40               ` Eric Wong
  2013-01-02 21:16             ` Eric Wong
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-02 17:45 UTC (permalink / raw)
  To: Eric Wong
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

On Tue, 2013-01-01 at 23:56 +0000, Eric Wong wrote:
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> > Please document the barrier that this mb() pairs with, and then give
> > an explanation for the fix in the commit message, and I'll happily
> > take it. Even if it's just duplicating the comments above the
> > wq_has_sleeper() function, except modified for the ep_modify() case.
> 
> Hopefully my explanation below is correct and makes sense;
> I think both effects of the barrier are needed.
> 
> > Of course, it would be good to get verification from Jason and Andreas
> > that the alternate patch also works for them.
> 
> Jason just confirmed it.
> 
> ------------------------------- 8< ----------------------------
> From 02f43757d04bb6f2786e79eecf1cfa82e6574379 Mon Sep 17 00:00:00 2001
> From: Eric Wong <normalperson@yhbt.net>
> Date: Tue, 1 Jan 2013 21:20:27 +0000
> Subject: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
> 
> EPOLL_CTL_MOD sets the interest mask before calling f_op->poll() to
> ensure events are not missed.  Since the modifications to the interest
> mask are not protected by the same lock as ep_poll_callback, we need to
> ensure the change is visible to other CPUs calling ep_poll_callback.
> 
> We also need to ensure f_op->poll() has an up-to-date view of past
> events which occurred before we modified the interest mask.  So this
> barrier also pairs with the barrier in wq_has_sleeper().
> 
> This should guarantee either ep_poll_callback or f_op->poll() (or both)
> will notice the readiness of a recently-ready/modified item.
> 
> This issue was encountered by Andreas Voellmy and Junchang(Jason) Wang in:
> http://thread.gmane.org/gmane.linux.kernel/1408782/
> 
> Signed-off-by: Eric Wong <normalperson@yhbt.net>
> Cc: Hans Verkuil <hans.verkuil@cisco.com>
> Cc: Jiri Olsa <jolsa@redhat.com>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Davide Libenzi <davidel@xmailserver.org>
> Cc: Hans de Goede <hdegoede@redhat.com>
> Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
> Cc: David Miller <davem@davemloft.net>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Andreas Voellmy <andreas.voellmy@yale.edu>
> Tested-by: "Junchang(Jason) Wang" <junchang.wang@yale.edu>
> Cc: netdev@vger.kernel.org
> Cc: linux-fsdevel@vger.kernel.org
> ---
>  fs/eventpoll.c | 22 +++++++++++++++++++++-
>  1 file changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index cd96649..39573ee 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1285,7 +1285,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>  	 * otherwise we might miss an event that happens between the
>  	 * f_op->poll() call and the new event set registering.
>  	 */
> -	epi->event.events = event->events;
> +	epi->event.events = event->events; /* need barrier below */
>  	pt._key = event->events;
>  	epi->event.data = event->data; /* protected by mtx */
>  	if (epi->event.events & EPOLLWAKEUP) {
> @@ -1296,6 +1296,26 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>  	}
>  
>  	/*
> +	 * The following barrier has two effects:
> +	 *
> +	 * 1) Flush epi changes above to other CPUs.  This ensures
> +	 *    we do not miss events from ep_poll_callback if an
> +	 *    event occurs immediately after we call f_op->poll().
> +	 *    We need this because we did not take ep->lock while
> +	 *    changing epi above (but ep_poll_callback does take
> +	 *    ep->lock).
> +	 *
> +	 * 2) We also need to ensure we do not miss _past_ events
> +	 *    when calling f_op->poll().  This barrier also
> +	 *    pairs with the barrier in wq_has_sleeper (see
> +	 *    comments for wq_has_sleeper).
> +	 *
> +	 * This barrier will now guarantee ep_poll_callback or f_op->poll
> +	 * (or both) will notice the readiness of an item.
> +	 */
> +	smp_mb();
> +
> +	/*
>  	 * Get current event bits. We can safely use the file* here because
>  	 * its usage count has been increased by the caller of this function.
>  	 */
> -- 
> Eric Wong

First, thanks for working on this issue.

It seems the real problem is the epi->event.events = event->events;
which is done without taking ep->lock

While a smp_mb() could reduce the race window, I believe there is still
a race, and the following patch would close it.

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index be56b21..25e5c53 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1313,7 +1313,10 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * otherwise we might miss an event that happens between the
 	 * f_op->poll() call and the new event set registering.
 	 */
+	spin_lock_irq(&ep->lock);
 	epi->event.events = event->events;
+	spin_unlock_irq(&ep->lock);
+
 	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 	if (epi->event.events & EPOLLWAKEUP) {






^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 17:45             ` Eric Dumazet
@ 2013-01-02 18:40               ` Eric Wong
  2013-01-02 19:03                 ` Eric Dumazet
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-02 18:40 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> First, thanks for working on this issue.

No problem!

> It seems the real problem is the epi->event.events = event->events;
> which is done without taking ep->lock

Yes.  I am hoping it is possible to do it without a lock there,
but your change is more obviously correct.

> While a smp_mb() could reduce the race window, I believe there is still
> a race, and the following patch would close it.

I'm not an experienced kernel hacker; can you describe where the race
would be?

> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index be56b21..25e5c53 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -1313,7 +1313,10 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
>  	 * otherwise we might miss an event that happens between the
>  	 * f_op->poll() call and the new event set registering.
>  	 */
> +	spin_lock_irq(&ep->lock);
>  	epi->event.events = event->events;
> +	spin_unlock_irq(&ep->lock);
> +
>  	pt._key = event->events;
>  	epi->event.data = event->data; /* protected by mtx */
>  	if (epi->event.events & EPOLLWAKEUP) {

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 18:40               ` Eric Wong
@ 2013-01-02 19:03                 ` Eric Dumazet
  2013-01-02 19:32                   ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-02 19:03 UTC (permalink / raw)
  To: Eric Wong
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

On Wed, 2013-01-02 at 18:40 +0000, Eric Wong wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > First, thanks for working on this issue.
> 
> No problem!
> 
> > It seems the real problem is the epi->event.events = event->events;
> > which is done without taking ep->lock
> 
> Yes.  I am hoping it is possible to do it without a lock there,
> but your change is more obviously correct.
> 
> > While a smp_mb() could reduce the race window, I believe there is still
> > a race, and the following patch would close it.
> 
> I'm not an experienced kernel hacker; can you describe where the race
> would be?

It would be, for example, in ep_send_events_proc() doing:

if (epi->event.events & EPOLLONESHOT)
    epi->event.events &= EP_PRIVATE_BITS;

And this could happen at the same time.




^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 19:03                 ` Eric Dumazet
@ 2013-01-02 19:32                   ` Eric Wong
  2013-01-02 22:08                     ` Eric Dumazet
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-02 19:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2013-01-02 at 18:40 +0000, Eric Wong wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > It seems the real problem is the epi->event.events = event->events;
> > > which is done without taking ep->lock
> > 
> > Yes.  I am hoping it is possible to do it without a lock there,
> > but your change is more obviously correct.
> > 
> > > While a smp_mb() could reduce the race window, I believe there is still
> > > a race, and the following patch would close it.
> > 
> > I'm not an experienced kernel hacker; can you describe where the race
> > would be?
> 
> It would be, for example, in ep_send_events_proc() doing:
> 
> if (epi->event.events & EPOLLONESHOT)
>     epi->event.events &= EP_PRIVATE_BITS;
> 
> And this could happen at the same time.

That modification in ep_send_events_proc() is protected by ep->mtx
(as is ep_modify()), though.  Maybe there are other places, but I
don't see it.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2012-12-28  7:06 ` Eric Wong
  2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
@ 2013-01-02 20:08 ` Eric Wong
  2013-01-02 20:47   ` Eric Wong
  2013-01-04 16:01   ` Mel Gorman
  2 siblings, 2 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-02 20:08 UTC (permalink / raw)
  To: Mel Gorman, linux-mm
  Cc: netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton,
	Linus Torvalds

(changing Cc:)

Eric Wong <normalperson@yhbt.net> wrote:
> I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> local TCP socket.  The isolated code below can reproduce the issue
> after many minutes (<1 hour).  It might be easier to reproduce on
> a busy system while disk I/O is happening.

s/might be/is/

Strangely, I've bisected this seemingly networking-related issue down to
the following commit:

  commit 1fb3f8ca0e9222535a39b884cb67a34628411b9f
  Author: Mel Gorman <mgorman@suse.de>
  Date:   Mon Oct 8 16:29:12 2012 -0700

      mm: compaction: capture a suitable high-order page immediately when it is made available

That commit doesn't revert cleanly on v3.7.1, and I don't feel
comfortable touching that code myself.

Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
reproduce the issue without THP+compaction.

As I mention in http://mid.gmane.org/20121229113434.GA13336@dcvr.yhbt.net
I run my below test (`toosleepy') with heavy network and disk activity
for a long time before hitting this.

My disk activity involves copying large files around to different local
drives over loopback[1], so perhaps the duplicate pages get compacted
away?  toosleepy also reuses the same 16K junk data all around.


[1] my full setup is very strange.

    Other than the FUSE component I forgot to mention, little depends on
    the kernel.  With all this, the standalone toosleepy can get stuck.
    I'll try to reproduce it with less...

    (possibly relevant info, I don't expect you to duplicate my setup
     as it requires many, many patched userspace components :x):

    fusedav (with many bugfixes[2]) -> (FUSE device)
    zbatery (Ruby 1.9.3-p362) -> omgdav (in zbatery process) -> (TCP)
    MogileFS (patched[3]) -> (TCP)
    cmogstored

    The (zbatery -> omgdav -> MogileFS -> cmogstored) path is all userspace.
    cmogstored uses sendfile and may talk to itself via MogileFS replication:
    MogileFS(replicate) -> HTTP GET from cmogstored -> HTTP PUT to cmogstored

    (MFS was designed for clusters, but I only have one machine right
    now.)  MogileFS replication does not use splice between sockets,
    just read/write; cmogstored does not use splice (yet) either.

    The stuck ppoll() I noticed is from Ruby (zbatery/omgdav) while the
    send() was from fusedav (using neon).

[2] my patches on http://bugs.debian.org/fusedav and
    git clone git://bogomips.org/fusedav.git home
[3] git clone git://bogomips.org/MogileFS-Server.git testing

> This may also be related to an epoll-related issue reported
> by Andreas Voellmy:
> http://thread.gmane.org/gmane.linux.kernel/1408782/

(That epoll issue was unrelated and fixed while I was hunting this bug)

> My example involves a 3 thread data flow between two pairs
> of (4) sockets:
> 
> 	 send_loop ->   recv_loop(recv_send)   -> recv_loop(recv_only)
> 	 pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]
> 
> At least 3.7 and 3.7.1 are affected.
> 
> I have tcp_low_latency=1 set; I will try 0 later.
> 
> The last progress message I got was after receiving 2942052597760
> bytes on fd=7 (out of 64-bit ULONG_MAX / 2)
> 
> strace:
> 
> 3644  sendto(4, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 0, NULL, 0 <unfinished ...>
> 3643  sendto(6, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 16384, 0, NULL, 0 <unfinished ...>
> 3642  ppoll([{fd=7, events=POLLIN}], 1, NULL, NULL, 8 <unfinished ...>
> 3641  futex(0x7f23ed8129d0, FUTEX_WAIT, 3644, NULL <unfinished ...>
> 
> The first and last lines of the strace are expected:
> 
> + 3644	sendto(4) is blocked because 3643 is blocked on sendto(fd=6)
>   and not able to call recv().
> + 3641 is the main thread calling pthread_join
> 
> What is unexpected is the tid=3643 and tid=3642 interaction.  As confirmed
> by lsof below, fd=6 is sending to wake up fd=7, but ppoll(fd=7) seems
> to not be waking up.
> 
> lsof:
> toosleepy 3641   ew    4u  IPv4  12405      0t0     TCP localhost:55904->localhost:33249 (ESTABLISHED)
> toosleepy 3641   ew    5u  IPv4  12406      0t0     TCP localhost:33249->localhost:55904 (ESTABLISHED)
> toosleepy 3641   ew    6u  IPv4  12408      0t0     TCP localhost:48777->localhost:33348 (ESTABLISHED)
> toosleepy 3641   ew    7u  IPv4  12409      0t0     TCP localhost:33348->localhost:48777 (ESTABLISHED)
> 
> System info: Linux 3.7.1 x86_64 SMP PREEMPT
> AMD Phenom(tm) II X4 945 Processor (4 cores)
> Nothing interesting in dmesg, iptables rules are empty.
> 
> I have not yet been able to reproduce the issue using UNIX sockets,
> only TCP, but you can run:
> 
>   ./toosleepy unix
> 
> ...to test with UNIX sockets instead of TCP.
> 
> The following code is also available via git://bogomips.org/toosleepy
> gcc -o toosleepy -O2 -Wall -lpthread toosleepy.c
> -------------------------------- 8< ------------------------------------
> #define _GNU_SOURCE
> #include <poll.h>
> #include <sys/ioctl.h>
> #include <pthread.h>
> #include <sys/types.h>
> #include <sys/socket.h>
> #include <arpa/inet.h>
> #include <netinet/tcp.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <errno.h>
> #include <string.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <assert.h>
> #include <limits.h>
> 
> struct receiver {
> 	int rfd;
> 	int sfd;
> };
> 
> /* blocking sender */
> static void * send_loop(void *fdp)
> {
> 	int fd = *(int *)fdp;
> 	char buf[16384];
> 	ssize_t s;
> 	size_t sent = 0;
> 	size_t max = (size_t)ULONG_MAX / 2;
> 
> 	while (sent < max) {
> 		s = send(fd, buf, sizeof(buf), 0);
> 		if (s > 0)
> 			sent += s;
> 		if (s == -1)
> 			assert(errno == EINTR);
> 	}
> 	dprintf(2, "%d done sending: %zu\n", fd, sent);
> 	close(fd);
> 	return NULL;
> }
> 
> /* non-blocking receiver, using ppoll */
> static void * recv_loop(void *p)
> {
> 	const struct receiver *rcvr = p;
> 	char buf[16384];
> 	nfds_t nfds = 1;
> 	struct pollfd fds;
> 	int rc;
> 	ssize_t r, s;
> 	size_t received = 0;
> 	size_t sent = 0;
> 
> 	for (;;) {
> 		r = recv(rcvr->rfd, buf, sizeof(buf), 0);
> 		if (r == 0) {
> 			break;
> 		} else if (r == -1) {
> 			assert(errno == EAGAIN);
> 
> 			fds.fd = rcvr->rfd;
> 			fds.events = POLLIN;
> 			errno = 0;
> 			rc = ppoll(&fds, nfds, NULL, NULL);
> 			assert(rc == 1);
> 		} else {
> 			assert(r > 0);
> 			received += r;
> 			if (rcvr->sfd >= 0) {
> 				s = send(rcvr->sfd, buf, sizeof(buf), 0);
> 				if (s > 0)
> 					sent += s;
> 				if (s == -1)
> 					assert(errno == EINTR);
> 			} else {
> 				/* just burn some cycles */
> 				write(-1, buf, sizeof(buf));
> 			}
> 		}
> 		if ((received % (sizeof(buf) * sizeof(buf) * 16) == 0))
> 			dprintf(2, " %d progress: %zu\n",
> 			        rcvr->rfd, received);
> 	}
> 	dprintf(2, "%d got: %zu\n", rcvr->rfd, received);
> 	if (rcvr->sfd >= 0) {
> 		dprintf(2, "%d sent: %zu\n", rcvr->sfd, sent);
> 		close(rcvr->sfd);
> 	}
> 
> 	return NULL;
> }
> 
> static void tcp_socketpair(int sv[2], int accept_flags)
> {
> 	struct sockaddr_in addr;
> 	socklen_t addrlen = sizeof(addr);
> 	int l = socket(PF_INET, SOCK_STREAM, 0);
> 	int c = socket(PF_INET, SOCK_STREAM, 0);
> 	int a;
> 
> 	addr.sin_family = AF_INET;
> 	addr.sin_addr.s_addr = INADDR_ANY;
> 	addr.sin_port = 0;
> 	assert(0 == bind(l, (struct sockaddr*)&addr, addrlen));
> 	assert(0 == listen(l, 1024));
> 	assert(0 == getsockname(l, (struct sockaddr *)&addr, &addrlen));
> 	assert(0 == connect(c, (struct sockaddr *)&addr, addrlen));
> 	a = accept4(l, NULL, NULL, accept_flags);
> 	assert(a >= 0);
> 	close(l);
> 	sv[0] = a;
> 	sv[1] = c;
> }
> 
> int main(int argc, char *argv[])
> {
> 	int pair_a[2];
> 	int pair_b[2];
> 	pthread_t s, rs, r;
> 	struct receiver recv_only;
> 	struct receiver recv_send;
> 
> 	if (argc == 2 && strcmp(argv[1], "unix") == 0) {
> 		int val;
> 		assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, pair_a));
> 		assert(0 == socketpair(AF_UNIX, SOCK_STREAM, 0, pair_b));
> 		/* only make the receiver non-blocking */
> 		val = 1;
> 		assert(0 == ioctl(pair_a[0], FIONBIO, &val));
> 		val = 1;
> 		assert(0 == ioctl(pair_b[0], FIONBIO, &val));
> 	} else {
> 		tcp_socketpair(pair_a, SOCK_NONBLOCK);
> 		tcp_socketpair(pair_b, SOCK_NONBLOCK);
> 	}
> 
> 	recv_send.rfd = pair_a[0];
> 	recv_send.sfd = pair_b[1];
> 	recv_only.rfd = pair_b[0];
> 	recv_only.sfd = -1;
> 
> 	/*
> 	 * data flow:
> 	 * send_loop ->   recv_loop(recv_send)   -> recv_loop(recv_only)
> 	 * pair_a[1] -> (pair_a[0] -> pair_b[1]) -> pair_b[0]
> 	 */
> 	assert(0 == pthread_create(&r, NULL, recv_loop, &recv_only));
> 	assert(0 == pthread_create(&rs, NULL, recv_loop, &recv_send));
> 	assert(0 == pthread_create(&s, NULL, send_loop, &pair_a[1]));
> 	assert(0 == pthread_join(s, NULL));
> 	assert(0 == pthread_join(rs, NULL));
> 	assert(0 == pthread_join(r, NULL));
> 
> 	return 0;
> }
> -------------------------------- 8< ------------------------------------
> Any help/suggestions/test patches would be greatly appreciated.
> Thanks for reading!
> 
> -- 
> Eric Wong


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
@ 2013-01-02 20:47   ` Eric Wong
  2013-01-03 13:41     ` Eric Dumazet
  2013-01-04 16:01   ` Mel Gorman
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-02 20:47 UTC (permalink / raw)
  To: Mel Gorman, linux-mm
  Cc: netdev, linux-kernel, Rik van Riel, Minchan Kim, Andrew Morton,
	Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> [1] my full setup is very strange.
> 
>     Other than the FUSE component I forgot to mention, little depends on
>     the kernel.  With all this, the standalone toosleepy can get stuck.
>     I'll try to reproduce it with less...

I just confirmed my toosleepy processes will get stuck while just
doing "rsync -a" between local disks.  So this does not depend on
sendfile or FUSE to reproduce.


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-01 23:56           ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
  2013-01-02 17:45             ` Eric Dumazet
@ 2013-01-02 21:16             ` Eric Wong
  1 sibling, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-02 21:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eric Dumazet, Linux Kernel Mailing List, Hans Verkuil, Jiri Olsa,
	Jonathan Corbet, Al Viro, Davide Libenzi, Hans de Goede,
	Mauro Carvalho Chehab, David Miller, Andrew Morton,
	Andreas Voellmy, Junchang(Jason) Wang, Network Development,
	linux-fsdevel

Eric Wong <normalperson@yhbt.net> wrote:
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> > Please document the barrier that this mb() pairs with, and then give
> > an explanation for the fix in the commit message, and I'll happily
> > take it. Even if it's just duplicating the comments above the
> > wq_has_sleeper() function, except modified for the ep_modify() case.
> 
> Hopefully my explanation is correct and makes sense below,
> I think both effects of the barrier are needed

I noticed Linus accepted this already.  This should probably go to
stable, right?

From ancient git history[1], it seems this bug exists for all
2.6 kernels:

	commit 424980a87e226d63af46579b2af16ec1b8d17e52
	Author: Davide Libenzi <davidel@xmailserver.org>
	Date:   Thu Nov 14 16:17:23 2002 -0800

	    [PATCH] epoll bits 0.46 ...

[1] - git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD
  2013-01-02 19:32                   ` Eric Wong
@ 2013-01-02 22:08                     ` Eric Dumazet
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Dumazet @ 2013-01-02 22:08 UTC (permalink / raw)
  To: Eric Wong
  Cc: Linus Torvalds, Linux Kernel Mailing List, Hans Verkuil,
	Jiri Olsa, Jonathan Corbet, Al Viro, Davide Libenzi,
	Hans de Goede, Mauro Carvalho Chehab, David Miller,
	Andrew Morton, Andreas Voellmy, Junchang(Jason) Wang,
	Network Development, linux-fsdevel

On Wed, 2013-01-02 at 19:32 +0000, Eric Wong wrote:

> That modification in ep_send_events_proc() is protected by ep->mtx
> (as is ep_modify()), though.  Maybe there are other places, but I
> don't see it.

Yes, and using a mutex to protect this field while it's read from
interrupt context (so without the mutex's synchronization help) is why
there were races.

Some users rely on the barriers included in spin_lock/spin_unlock,
others on explicit barriers, and, before your patch, some relied on
pure luck.
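
For readers following along, the pattern under discussion pairs a
barrier on the waker side with one on the sleeper side.  A userspace
sketch of the idea (illustrative only; this is not the epoll code, and
the names are made up):

#include <stdatomic.h>
#include <pthread.h>

static atomic_int flag;      /* the condition (event mask, etc.)  */
static atomic_int sleepers;  /* analogue of waitqueue_active()    */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

static void waker(void)
{
	atomic_store(&flag, 1);                    /* 1: publish event   */
	atomic_thread_fence(memory_order_seq_cst); /* pairs with sleeper */
	if (atomic_load(&sleepers)) {              /* 2: then check      */
		pthread_mutex_lock(&lock);
		pthread_cond_broadcast(&cond);
		pthread_mutex_unlock(&lock);
	}
}

static void sleeper(void)
{
	atomic_fetch_add(&sleepers, 1);            /* 1: register        */
	atomic_thread_fence(memory_order_seq_cst); /* pairs with waker   */
	pthread_mutex_lock(&lock);
	while (!atomic_load(&flag))                /* 2: then re-check   */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	atomic_fetch_sub(&sleepers, 1);
}

If either side does its check before its publish/register step, the
waker can see no sleepers while the sleeper sees no event, and the
wakeup is lost; that is the shape of the bug the barrier closes.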

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-02 20:47   ` Eric Wong
@ 2013-01-03 13:41     ` Eric Dumazet
  2013-01-03 18:32       ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-03 13:41 UTC (permalink / raw)
  To: Eric Wong
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Wed, 2013-01-02 at 20:47 +0000, Eric Wong wrote:
> Eric Wong <normalperson@yhbt.net> wrote:
> > [1] my full setup is very strange.
> > 
> >     Other than the FUSE component I forgot to mention, little depends on
> >     the kernel.  With all this, the standalone toosleepy can get stuck.
> >     I'll try to reproduce it with less...
> 
> I just confirmed my toosleepy processes will get stuck while just
> doing "rsync -a" between local disks.  So this does not depend on
> sendfile or FUSE to reproduce.

How do you tell your 'toosleepy' is stuck?

If you are judging by its output, you should change its logic: there is
no guarantee that recv() will deliver exactly 16384 bytes each round.
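
The usual way to account for that is to sum whatever recv() returns
instead of assuming full buffers.  A minimal sketch (illustrative, not
toosleepy's actual loop):

#include <sys/types.h>
#include <sys/socket.h>

static size_t drain(int fd)
{
	char buf[16384];
	size_t received = 0;
	ssize_t r;

	/* r is rarely exactly sizeof(buf); a progress check of the
	 * form "received % N == 0" may therefore never fire */
	while ((r = recv(fd, buf, sizeof(buf), 0)) > 0)
		received += r;
	return received;
}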

With the following patch, I can't reproduce the apparent hang:

diff --git a/toosleepy.c b/toosleepy.c
index e64b7cd..df3610f 100644
--- a/toosleepy.c
+++ b/toosleepy.c
@@ -15,6 +15,7 @@
 #include <fcntl.h>
 #include <assert.h>
 #include <limits.h>
+#include <time.h>
 
 struct receiver {
 	int rfd;
@@ -53,6 +54,7 @@ static void * recv_loop(void *p)
 	ssize_t r, s;
 	size_t received = 0;
 	size_t sent = 0;
+	time_t t0 = time(NULL), t1;
 
 	for (;;) {
 		r = recv(rcvr->rfd, buf, sizeof(buf), 0);
@@ -80,9 +82,12 @@ static void * recv_loop(void *p)
 				write(-1, buf, sizeof(buf));
 			}
 		}
-		if ((received % (sizeof(buf) * sizeof(buf) * 16) == 0))
+		t1 = time(NULL);
+		if (t1 != t0) {
 			dprintf(2, " %d progress: %zu\n",
 			        rcvr->rfd, received);
+			t0 = t1;
+		}
 	}
 	dprintf(2, "%d got: %zu\n", rcvr->rfd, received);
 	if (rcvr->sfd >= 0) {

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-03 13:41     ` Eric Dumazet
@ 2013-01-03 18:32       ` Eric Wong
  2013-01-03 23:45         ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-03 18:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2013-01-02 at 20:47 +0000, Eric Wong wrote:
> > Eric Wong <normalperson@yhbt.net> wrote:
> > > [1] my full setup is very strange.
> > > 
> > >     Other than the FUSE component I forgot to mention, little depends on
> > >     the kernel.  With all this, the standalone toosleepy can get stuck.
> > >     I'll try to reproduce it with less...
> > 
> > I just confirmed my toosleepy processes will get stuck while just
> > doing "rsync -a" between local disks.  So this does not depend on
> > sendfile or FUSE to reproduce.
> > --
> 
> How do you tell your 'toosleepy' is stuck?

My original post showed it stuck with strace (in ppoll + send).
I only strace after seeing it's not using any CPU in top.

http://mid.gmane.org/20121228014503.GA5017@dcvr.yhbt.net
(lsof also confirmed the ppoll/send sockets were peers)

> If you are judging by its output, you should change its logic: there is
> no guarantee that recv() will deliver exactly 16384 bytes each round.
> 
> With the following patch, I can't reproduce the apparent hang:

Right, the output is just an approximation and the logic there
was bogus.

Thanks for looking at this.


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-03 18:32       ` Eric Wong
@ 2013-01-03 23:45         ` Eric Wong
  2013-01-04  0:26           ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-03 23:45 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > With the following patch, I can't reproduce the apparent hang:
> 
> Right, the output is just an approximation and the logic there
> was bogus.
> 
> Thanks for looking at this.

I'm still able to reproduce the issue under v3.8-rc2 with your patch
for toosleepy.

As expected when blocked, TCP send() will eventually return ETIMEDOUT
when I forget to check for it (and toosleepy will abort as a result).

I think reproducing this requires frequent dirtying/cycling of pages
(from copying large files around) to interact with compaction.
I'll see if I can reproduce the issue with read-only FS activity.

With 3.7.1 and compaction/THP disabled, I was able to run ~21 hours
and copy a few TB around without anything getting stuck.


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-03 23:45         ` Eric Wong
@ 2013-01-04  0:26           ` Eric Wong
  2013-01-04  3:52             ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-04  0:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> I think reproducing this requires frequent dirtying/cycling of pages
> (from copying large files around) to interact with compaction.
> I'll see if I can reproduce the issue with read-only FS activity.

Still successfully running the read-only test on my main machine, will
provide another update in a few hours or so if it's still successful
(it usually takes <1 hour to hit).

I also fired up a VM on my laptop (still running v3.7) and was able to
get stuck with only 2 cores and 512M on the VM (x86_64).  On the small
VM with little disk space, it doesn't need much dirty data to trigger.
I just did this:

    find $45G_NFS_MOUNT -type f -print0 | \
       xargs -0 -n1 -P4 sh -c 'cat "$1" >> tmp; > tmp' --

...while running two instances of toosleepy (one got stuck and aborted).


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04  0:26           ` Eric Wong
@ 2013-01-04  3:52             ` Eric Wong
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-04  3:52 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Wong <normalperson@yhbt.net> wrote:
> > I think reproducing this requires frequent dirtying/cycling of pages
> > (from copying large files around) to interact with compaction.
> > I'll see if I can reproduce the issue with read-only FS activity.
> 
> Still successfully running the read-only test on my main machine, will
> provide another update in a few hours or so if it's still successful
> (it usually takes <1 hour to hit).

The read-only test is still going on my main machine.
I think writes/dirty data is required to reproduce the issue...

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
  2013-01-02 20:47   ` Eric Wong
@ 2013-01-04 16:01   ` Mel Gorman
  2013-01-04 17:15     ` Eric Dumazet
                       ` (3 more replies)
  1 sibling, 4 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-04 16:01 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

On Wed, Jan 02, 2013 at 08:08:48PM +0000, Eric Wong wrote:
> (changing Cc:)
> 
> Eric Wong <normalperson@yhbt.net> wrote:
> > I'm finding ppoll() unexpectedly stuck when waiting for POLLIN on a
> > local TCP socket.  The isolated code below can reproduce the issue
> > after many minutes (<1 hour).  It might be easier to reproduce on
> > a busy system while disk I/O is happening.
> 
> s/might be/is/
> 
> Strangely, I've bisected this seemingly networking-related issue down to
> the following commit:
> 
>   commit 1fb3f8ca0e9222535a39b884cb67a34628411b9f
>   Author: Mel Gorman <mgorman@suse.de>
>   Date:   Mon Oct 8 16:29:12 2012 -0700
> 
>       mm: compaction: capture a suitable high-order page immediately when it is made available
> 
> That commit doesn't revert cleanly on v3.7.1, and I don't feel
> comfortable touching that code myself.
> 

That patch introduced an accounting bug that was corrected by ef6c5be6
(fix incorrect NR_FREE_PAGES accounting (appears like memory leak)). In
some cases that could look like a hang and potentially confuse a
bisection.

That said, I see that you report that 3.7.1 and 3.8-rc2, which both
include that fix, are affected, and the finger is pointed at compaction,
so something is wrong.

> Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
> reproduce the issue without THP+compaction.
> 

Implying that it's stuck in compaction somewhere. It could be the case
that compaction alters timing enough to trigger another bug. You say it
tests differently depending on whether TCP or unix sockets are used,
which might indicate multiple problems. However, let's try and see if
compaction is the primary problem or not.

> As I mention in http://mid.gmane.org/20121229113434.GA13336@dcvr.yhbt.net
> I run my below test (`toosleepy') with heavy network and disk activity
> for a long time before hitting this.
> 

Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then
answer the following questions please?

1. What are the contents of /proc/vmstat at the time it is stuck?

2. What are the contents of /proc/PID/stack for every toosleepy
   process when they are stuck?

3. Can you do a sysrq+m and post the resulting dmesg?

What I'm looking for is a throttling bug (if pgscan_direct_throttle is
elevated), an isolated page accounting bug (nr_isolated_* is elevated
and process is stuck in congestion_wait in a too_many_isolated() loop)
or a free page accounting bug (big difference between nr_free_pages and
buddy list figures).
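
If it helps to automate the capture, here is a quick sketch that pulls
just those counters out of /proc/vmstat (a standalone snippet, not part
of toosleepy; field names as they appear on 3.7/3.8):

#include <stdio.h>
#include <string.h>

int main(void)
{
	static const char *keys[] = {
		"pgscan_direct_throttle", "nr_isolated_anon",
		"nr_isolated_file", "nr_free_pages",
	};
	char line[128];
	unsigned i;
	FILE *fp = fopen("/proc/vmstat", "r");

	if (!fp)
		return 1;
	/* every /proc/vmstat line is "name value" */
	while (fgets(line, sizeof(line), fp))
		for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
			if (!strncmp(line, keys[i], strlen(keys[i])))
				fputs(line, stdout);
	fclose(fp);
	return 0;
}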

I'll try reproducing this early next week if none of that shows an
obvious candidate.

Thanks.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04 16:01   ` Mel Gorman
@ 2013-01-04 17:15     ` Eric Dumazet
  2013-01-04 17:59     ` Eric Wong
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 53+ messages in thread
From: Eric Dumazet @ 2013-01-04 17:15 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Fri, 2013-01-04 at 16:01 +0000, Mel Gorman wrote:

> Implying that it's stuck in compaction somewhere. It could be the case
> that compaction alters timing enough to trigger another bug. You say it
> tests differently depending on whether TCP or unix sockets are used,
> which might indicate multiple problems. However, let's try and see if
> compaction is the primary problem or not.

One difference between TCP and Unix sockets is that:

Unix sockets try hard to limit the order of allocations.

For a 16KB (+ skb overhead) send(), we will probably use one order-2
page and one order-0 page as a frag (data_len not being 0):

vi +1484 net/unix/af_unix.c

       if (len > SKB_MAX_ALLOC)
                data_len = min_t(size_t,
                                 len - SKB_MAX_ALLOC,
                                 MAX_SKB_FRAGS * PAGE_SIZE);

        skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
                                   msg->msg_flags & MSG_DONTWAIT, &err);

TCP, on the other hand, could use order-3 pages if available.

Eric, you could try changing SKB_FRAG_PAGE_ORDER in net/core/sock.c to
lower values (16384, 8192, 4096 bytes) and check whether the hang
disappears or not.
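
Going from memory of the 3.7 source (verify against your tree before
patching), the order is derived from a 32KB target size, so the
experiment would be something like:

--- a/net/core/sock.c
+++ b/net/core/sock.c
-#define SKB_FRAG_PAGE_ORDER	get_order(32768)
+#define SKB_FRAG_PAGE_ORDER	get_order(16384)	/* then 8192, 4096 */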

Alternatively (no kernel patching needed), you could try to hang AF_UNIX
using buffers of 90KB, to force order-3 allocations as well (one 32KB
allocation plus 16 * 4KB frags).
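
In toosleepy terms, that would just mean bumping the buffer size in the
send/recv loops, something like (a sketch, untested; assuming both
send_loop() and recv_loop() keep the stock 16KB buffer):

--- a/toosleepy.c
+++ b/toosleepy.c
-	char buf[16384];
+	char buf[90 * 1024];	/* force order-3 allocations on AF_UNIX */

(the same one-line change in both functions, then run ./toosleepy unix)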

Thanks

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04 16:01   ` Mel Gorman
  2013-01-04 17:15     ` Eric Dumazet
@ 2013-01-04 17:59     ` Eric Wong
  2013-01-05  1:07     ` Eric Wong
  2013-01-06 12:07     ` Eric Wong
  3 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-04 17:59 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> On Wed, Jan 02, 2013 at 08:08:48PM +0000, Eric Wong wrote:
> > Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
> > reproduce the issue without THP+compaction.
> > 
> 
> Implying that it's stuck in compaction somewhere. It could be the case
> that compaction alters timing enough to trigger another bug. You say it
> tests differently depending on whether TCP or unix sockets are used,
> which might indicate multiple problems. However, let's try and see if
> compaction is the primary problem or not.

I haven't managed to reproduce the issue on Unix sockets yet, just TCP.
I'm trying Unix with 90KB buffers as Eric Dumazet suggested.

I'll get the info you need from /proc soon.
Thank you for looking at this!


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04 16:01   ` Mel Gorman
  2013-01-04 17:15     ` Eric Dumazet
  2013-01-04 17:59     ` Eric Wong
@ 2013-01-05  1:07     ` Eric Wong
  2013-01-06 12:07     ` Eric Wong
  3 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-05  1:07 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> On Wed, Jan 02, 2013 at 08:08:48PM +0000, Eric Wong wrote:
> > Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
> > reproduce the issue without THP+compaction.
> > 
> 
> Implying that it's stuck in compaction somewhere. It could be the case
> that compaction alters timing enough to trigger another bug. You say it
> tests differently depending on whether TCP or unix sockets are used,
> which might indicate multiple problems. However, let's try and see if
> compaction is the primary problem or not.

I've only managed to encounter this issue with TCP sockets.

No luck reproducing the issue with Unix sockets, not even with 90K
buffers as suggested by Eric Dumazet.  This seems unique to TCP.

Fwiw, I also tried going back to a 16K MTU on loopback a few days ago,
but was still able to reproduce the issue, so
commit 0cf833aefaa85bbfce3ff70485e5534e09254773 doesn't seem
to be a culprit, either.

> > As I mention in http://mid.gmane.org/20121229113434.GA13336@dcvr.yhbt.net
> > I run my below test (`toosleepy') with heavy network and disk activity
> > for a long time before hitting this.
> > 
> 
> Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then
> answer the following questions please?

OK, I'm on 3.8-rc2.

> 1. What are the contents of /proc/vmstat at the time it is stuck?

nr_free_pages 1998
nr_inactive_anon 3401
nr_active_anon 3349
nr_inactive_file 94361
nr_active_file 10929
nr_unevictable 0
nr_mlock 0
nr_anon_pages 6643
nr_mapped 2255
nr_file_pages 105400
nr_dirty 44
nr_writeback 0
nr_slab_reclaimable 0
nr_slab_unreclaimable 0
nr_page_table_pages 697
nr_kernel_stack 161
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 114
nr_dirtied 1076168
nr_written 46330
nr_anon_transparent_hugepages 0
nr_free_cma 0
nr_dirty_threshold 22495
nr_dirty_background_threshold 11247
pgpgin 4398164
pgpgout 188556
pswpin 0
pswpout 0
pgalloc_dma 369887
pgalloc_dma32 28406230
pgalloc_normal 0
pgalloc_movable 0
pgfree 28779104
pgactivate 18160
pgdeactivate 17404
pgfault 34862559
pgmajfault 358
pgrefill_dma 14076
pgrefill_dma32 3328
pgrefill_normal 0
pgrefill_movable 0
pgsteal_kswapd_dma 12708
pgsteal_kswapd_dma32 917837
pgsteal_kswapd_normal 0
pgsteal_kswapd_movable 0
pgsteal_direct_dma 73
pgsteal_direct_dma32 4085
pgsteal_direct_normal 0
pgsteal_direct_movable 0
pgscan_kswapd_dma 12708
pgscan_kswapd_dma32 918789
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 73
pgscan_direct_dma32 4115
pgscan_direct_normal 0
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 0
slabs_scanned 257024
kswapd_inodesteal 69910
kswapd_low_wmark_hit_quickly 2165
kswapd_high_wmark_hit_quickly 275
kswapd_skip_congestion_wait 0
pageoutrun 13412
allocstall 73
pgrotated 3
pgmigrate_success 448
pgmigrate_fail 0
compact_migrate_scanned 14860
compact_free_scanned 219867
compact_isolated 1652
compact_stall 33
compact_fail 10
compact_success 23
unevictable_pgs_culled 1058
unevictable_pgs_scanned 0
unevictable_pgs_rescued 1671
unevictable_pgs_mlocked 1671
unevictable_pgs_munlocked 1671
unevictable_pgs_cleared 0
unevictable_pgs_stranded 0
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0

> 2. What are the contents of /proc/PID/stack for every toosleepy
>    process when they are stuck?

Oops, I needed a rebuild with CONFIG_STACKTRACE=y (it took some effort
to get the right combination of options).

I probably enabled a few more debugging options than I needed and it
seems to have taken longer to reproduce the issue.  Unfortunately I was
distracted when toosleepy got stuck and missed the chance to inspect
before hitting ETIMEDOUT :x

Attempting to reproduce the issue while I'm looking.

> 3. Can you do a sysrq+m and post the resulting dmesg?

SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 144
CPU    1: hi:  186, btch:  31 usd: 160
active_anon:3358 inactive_anon:3379 isolated_anon:0
 active_file:10615 inactive_file:92319 isolated_file:0
 unevictable:0 dirty:3 writeback:0 unstable:0
 free:2240 slab_reclaimable:0 slab_unreclaimable:0
 mapped:2333 shmem:114 pagetables:697 bounce:0
 free_cma:0
DMA free:2408kB min:84kB low:104kB high:124kB active_anon:8kB inactive_anon:44kB active_file:824kB inactive_file:11512kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15676kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:16kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:112kB pagetables:20kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 489 489 489
DMA32 free:6552kB min:2784kB low:3480kB high:4176kB active_anon:13424kB inactive_anon:13472kB active_file:41636kB inactive_file:357764kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:500952kB managed:491396kB mlocked:0kB dirty:12kB writeback:0kB mapped:9316kB shmem:456kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:1160kB pagetables:2768kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 52*4kB (UMR) 13*8kB (UMR) 4*16kB (R) 2*32kB (R) 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2424kB
DMA32: 1608*4kB (UM) 15*8kB (M) 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 6552kB
103053 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3477 pages reserved
283467 pages shared
111732 pages non-shared

> What I'm looking for is a throttling bug (if pgscan_direct_throttle is
> elevated), an isolated page accounting bug (nr_isolated_* is elevated
> and process is stuck in congestion_wait in a too_many_isolated() loop)
> or a free page accounting bug (big difference between nr_free_pages and
> buddy list figures).
> 
> I'll try reproducing this early next week if none of that shows an
> obvious candidate.

Thanks!  I'll try to get you more information as soon as possible.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-04 16:01   ` Mel Gorman
                       ` (2 preceding siblings ...)
  2013-01-05  1:07     ` Eric Wong
@ 2013-01-06 12:07     ` Eric Wong
  2013-01-07 12:25       ` Mel Gorman
  3 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-06 12:07 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then
> answer the following questions please?

This is on my main machine running 3.8-rc2

> 1. What are the contents of /proc/vmstat at the time it is stuck?

===> /proc/vmstat <===
nr_free_pages 40305
nr_inactive_anon 25023
nr_active_anon 85684
nr_inactive_file 2614786
nr_active_file 209440
nr_unevictable 0
nr_mlock 0
nr_anon_pages 73510
nr_mapped 6017
nr_file_pages 2843997
nr_dirty 695934
nr_writeback 629239
nr_slab_reclaimable 68414
nr_slab_unreclaimable 14178
nr_page_table_pages 3136
nr_kernel_stack 314
nr_unstable 0
nr_bounce 0
nr_vmscan_write 12220042
nr_vmscan_immediate_reclaim 31213310
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 24101
nr_dirtied 534655274
nr_written 281872191
nr_anon_transparent_hugepages 24
nr_free_cma 0
nr_dirty_threshold 2790220
nr_dirty_background_threshold 29370
pgpgin 6961109514
pgpgout 1124854772
pswpin 3940
pswpout 127109
pgalloc_dma 6
pgalloc_dma32 7750674038
pgalloc_normal 78295989795
pgalloc_movable 0
pgfree 86049272519
pgactivate 21397174
pgdeactivate 423853
pgfault 473074235
pgmajfault 20093
pgrefill_dma 0
pgrefill_dma32 158720
pgrefill_normal 233024
pgrefill_movable 0
pgsteal_kswapd_dma 0
pgsteal_kswapd_dma32 450844931
pgsteal_kswapd_normal 1288388818
pgsteal_kswapd_movable 0
pgsteal_direct_dma 0
pgsteal_direct_dma32 71774371
pgsteal_direct_normal 197326432
pgsteal_direct_movable 0
pgscan_kswapd_dma 0
pgscan_kswapd_dma32 459780161
pgscan_kswapd_normal 1334016908
pgscan_kswapd_movable 0
pgscan_direct_dma 0
pgscan_direct_dma32 75632525
pgscan_direct_normal 222990090
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 228906
slabs_scanned 4077568
kswapd_inodesteal 2591027
kswapd_low_wmark_hit_quickly 674289
kswapd_high_wmark_hit_quickly 39642
kswapd_skip_congestion_wait 506
pageoutrun 2908071
allocstall 431220
pgrotated 15736438
pgmigrate_success 865182
pgmigrate_fail 78157
compact_migrate_scanned 17276417
compact_free_scanned 204979571
compact_isolated 3463801
compact_stall 349792
compact_fail 160801
compact_success 188991
htlb_buddy_alloc_success 0
htlb_buddy_alloc_fail 0
unevictable_pgs_culled 0
unevictable_pgs_scanned 0
unevictable_pgs_rescued 0
unevictable_pgs_mlocked 0
unevictable_pgs_munlocked 0
unevictable_pgs_cleared 0
unevictable_pgs_stranded 0
thp_fault_alloc 720
thp_fault_fallback 1719
thp_collapse_alloc 8631
thp_collapse_alloc_failed 4110
thp_split 700
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0

> 2. What are the contents of /proc/PID/stack for every toosleepy
>    process when they are stuck?

pid and tid stack info, 28018 is the thread I used to automate
reporting (pushed to git://bogomips.org/toosleepy.git)

===> 28014[28014]/stack <===
[<ffffffff8105a97b>] futex_wait_queue_me+0xb7/0xd2
[<ffffffff8105b7fc>] futex_wait+0xf6/0x1f6
[<ffffffff811bb3af>] cpumask_next_and+0x2b/0x37
[<ffffffff8104ebfa>] select_task_rq_fair+0x518/0x59a
[<ffffffff8105c8f1>] do_futex+0xa9/0x88f
[<ffffffff810509a4>] check_preempt_wakeup+0x10d/0x1a7
[<ffffffff8104757d>] check_preempt_curr+0x25/0x62
[<ffffffff8104d4cc>] wake_up_new_task+0x96/0xc2
[<ffffffff8105d1e9>] sys_futex+0x112/0x14d
[<ffffffff81322a49>] stub_clone+0x69/0x90
[<ffffffff81322769>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 28014[28015]/stack <===
[<ffffffff812ae316>] dev_hard_start_xmit+0x281/0x3f1
[<ffffffff81041010>] add_wait_queue+0x14/0x40
[<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d
[<ffffffff810deb46>] do_sys_poll+0x314/0x39b
[<ffffffff810de220>] pollwake+0x0/0x4e
[<ffffffff8129fc1d>] release_sock+0xe5/0x11b
[<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846
[<ffffffff812f432c>] inet_recvmsg+0x64/0x75
[<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e
[<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e
[<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
[<ffffffff8129c18b>] sys_recvfrom+0x110/0x128
[<ffffffff81000e34>] __switch_to+0x235/0x3c5
[<ffffffff810ca402>] kmem_cache_free+0x32/0xb9
[<ffffffff810b809d>] remove_vma+0x44/0x4c
[<ffffffff810df0a5>] sys_ppoll+0xaf/0x123
[<ffffffff81322769>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 28014[28016]/stack <===
[<ffffffff812ae7ad>] dev_queue_xmit+0x327/0x336
[<ffffffff8102cb9f>] _local_bh_enable_ip+0x7a/0x8b
[<ffffffff81041010>] add_wait_queue+0x14/0x40
[<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d
[<ffffffff810deb46>] do_sys_poll+0x314/0x39b
[<ffffffff810de220>] pollwake+0x0/0x4e
[<ffffffff8129fc1d>] release_sock+0xe5/0x11b
[<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846
[<ffffffff812f432c>] inet_recvmsg+0x64/0x75
[<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e
[<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e
[<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
[<ffffffff8129c18b>] sys_recvfrom+0x110/0x128
[<ffffffff81000e34>] __switch_to+0x235/0x3c5
[<ffffffff810df0a5>] sys_ppoll+0xaf/0x123
[<ffffffff81322769>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 28014[28017]/stack <===
[<ffffffff8129fc1d>] release_sock+0xe5/0x11b
[<ffffffff812a642c>] sk_stream_wait_memory+0x1f7/0x1fc
[<ffffffff81040d5e>] autoremove_wake_function+0x0/0x2a
[<ffffffff812d8fc3>] tcp_sendmsg+0x710/0x86d
[<ffffffff8129a33e>] sock_sendmsg+0x7b/0x93
[<ffffffff8129a642>] sys_sendto+0xee/0x145
[<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
[<ffffffff8129a668>] sys_sendto+0x114/0x145
[<ffffffff81000e34>] __switch_to+0x235/0x3c5
[<ffffffff81322769>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 28014[28018]/stack <===
[<ffffffff8102b23e>] do_wait+0x1a6/0x21a
[<ffffffff8104757d>] check_preempt_curr+0x25/0x62
[<ffffffff8102b34a>] sys_wait4+0x98/0xb5
[<ffffffff81026321>] do_fork+0x12c/0x1a7
[<ffffffff810297b0>] child_wait_callback+0x0/0x48
[<ffffffff8131c688>] page_fault+0x28/0x30
[<ffffffff81322769>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff

> 3. Can you do a sysrq+m and post the resulting dmesg?

SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
CPU    2: hi:    0, btch:   1 usd:   0
CPU    3: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd:   4
CPU    1: hi:  186, btch:  31 usd: 181
CPU    2: hi:  186, btch:  31 usd:  46
CPU    3: hi:  186, btch:  31 usd:  13
Normal per-cpu:
CPU    0: hi:  186, btch:  31 usd: 106
CPU    1: hi:  186, btch:  31 usd: 183
CPU    2: hi:  186, btch:  31 usd:  20
CPU    3: hi:  186, btch:  31 usd:  76
active_anon:85782 inactive_anon:25023 isolated_anon:0
 active_file:209440 inactive_file:2610279 isolated_file:0
 unevictable:0 dirty:696664 writeback:629020 unstable:0
 free:44152 slab_reclaimable:68414 slab_unreclaimable:14178
 mapped:6017 shmem:24101 pagetables:3136 bounce:0
 free_cma:0
DMA free:15872kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15640kB managed:15896kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:24kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
lowmem_reserve[]: 0 3132 12078 12078
DMA32 free:85264kB min:17504kB low:21880kB high:26256kB active_anon:46808kB inactive_anon:21212kB active_file:122040kB inactive_file:2833064kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3208020kB managed:3185856kB mlocked:0kB dirty:92120kB writeback:225356kB mapped:356kB shmem:6776kB slab_reclaimable:67156kB slab_unreclaimable:7412kB kernel_stack:80kB pagetables:816kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 8946 8946
Normal free:75472kB min:49988kB low:62484kB high:74980kB active_anon:296320kB inactive_anon:78880kB active_file:715720kB inactive_file:7608052kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:9160704kB managed:9084264kB mlocked:0kB dirty:2694536kB writeback:2290724kB mapped:23712kB shmem:89628kB slab_reclaimable:206500kB slab_unreclaimable:49276kB kernel_stack:2432kB pagetables:11728kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 0*4kB 0*8kB 0*16kB 0*32kB 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (R) 3*4096kB (M) = 15872kB
DMA32: 1681*4kB (UEM) 3196*8kB (UEM) 3063*16kB (UEM) 63*32kB (UEM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 85364kB
Normal: 8874*4kB (UEM) 1885*8kB (UEM) 581*16kB (UEM) 412*32kB (UM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 75104kB
2839464 total pagecache pages
891 pages in swap cache
Swap cache stats: add 131049, delete 130158, find 1103447/1103954
Free swap  = 4152384kB
Total swap = 4194300kB
3145712 pages RAM
73642 pages reserved
3313060 pages shared
1432170 pages non-shared


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-06 12:07     ` Eric Wong
@ 2013-01-07 12:25       ` Mel Gorman
  2013-01-07 22:38         ` Eric Dumazet
  2013-01-07 22:38         ` Eric Wong
  0 siblings, 2 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-07 12:25 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

On Sun, Jan 06, 2013 at 12:07:00PM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then
> > answer the following questions please?
> 
> This is on my main machine running 3.8-rc2
> 
> > 1. What are the contents of /proc/vmstat at the time it is stuck?
> 
> ===> /proc/vmstat <===

According to this, THP is barely being used -- only 24 THP pages at the time
and the LRU lists are dominated by file pages. The isolated and throttled
counters look fine. There is a lot of memory currently under writeback and
a large number of dirty pages are reaching the end of the LRU list which
is inefficient but does not account for the reported bug.

> > 2. What are the contents of /proc/PID/stack for every toosleepy
> >    process when they are stuck?
> 
> pid and tid stack info, 28018 is the thread I used to automate
> reporting (pushed to git://bogomips.org/toosleepy.git)
> 
> ===> 28014[28014]/stack <===
> [<ffffffff8105a97b>] futex_wait_queue_me+0xb7/0xd2
> [<ffffffff8105b7fc>] futex_wait+0xf6/0x1f6
> [<ffffffff811bb3af>] cpumask_next_and+0x2b/0x37
> [<ffffffff8104ebfa>] select_task_rq_fair+0x518/0x59a
> [<ffffffff8105c8f1>] do_futex+0xa9/0x88f
> [<ffffffff810509a4>] check_preempt_wakeup+0x10d/0x1a7
> [<ffffffff8104757d>] check_preempt_curr+0x25/0x62
> [<ffffffff8104d4cc>] wake_up_new_task+0x96/0xc2
> [<ffffffff8105d1e9>] sys_futex+0x112/0x14d
> [<ffffffff81322a49>] stub_clone+0x69/0x90
> [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff

Looks ok.

> ===> 28014[28015]/stack <===
> [<ffffffff812ae316>] dev_hard_start_xmit+0x281/0x3f1
> [<ffffffff81041010>] add_wait_queue+0x14/0x40
> [<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d
> [<ffffffff810deb46>] do_sys_poll+0x314/0x39b
> [<ffffffff810de220>] pollwake+0x0/0x4e
> [<ffffffff8129fc1d>] release_sock+0xe5/0x11b
> [<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846
> [<ffffffff812f432c>] inet_recvmsg+0x64/0x75
> [<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e
> [<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e
> [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
> [<ffffffff8129c18b>] sys_recvfrom+0x110/0x128
> [<ffffffff81000e34>] __switch_to+0x235/0x3c5
> [<ffffffff810ca402>] kmem_cache_free+0x32/0xb9
> [<ffffffff810b809d>] remove_vma+0x44/0x4c
> [<ffffffff810df0a5>] sys_ppoll+0xaf/0x123
> [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff

Polling waiting for data, looks ok to me.

> ===> 28014[28016]/stack <===
> [<ffffffff812ae7ad>] dev_queue_xmit+0x327/0x336
> [<ffffffff8102cb9f>] _local_bh_enable_ip+0x7a/0x8b
> [<ffffffff81041010>] add_wait_queue+0x14/0x40
> [<ffffffff810de0bc>] poll_schedule_timeout+0x43/0x5d
> [<ffffffff810deb46>] do_sys_poll+0x314/0x39b
> [<ffffffff810de220>] pollwake+0x0/0x4e
> [<ffffffff8129fc1d>] release_sock+0xe5/0x11b
> [<ffffffff812d7f61>] tcp_recvmsg+0x713/0x846
> [<ffffffff812f432c>] inet_recvmsg+0x64/0x75
> [<ffffffff8129a26b>] sock_recvmsg+0x86/0x9e
> [<ffffffff8100541c>] emulate_vsyscall+0x1e6/0x28e
> [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
> [<ffffffff8129c18b>] sys_recvfrom+0x110/0x128
> [<ffffffff81000e34>] __switch_to+0x235/0x3c5
> [<ffffffff810df0a5>] sys_ppoll+0xaf/0x123
> [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff

Waiting on receive again.

> ===> 28014[28017]/stack <===
> [<ffffffff8129fc1d>] release_sock+0xe5/0x11b
> [<ffffffff812a642c>] sk_stream_wait_memory+0x1f7/0x1fc
> [<ffffffff81040d5e>] autoremove_wake_function+0x0/0x2a
> [<ffffffff812d8fc3>] tcp_sendmsg+0x710/0x86d
> [<ffffffff8129a33e>] sock_sendmsg+0x7b/0x93
> [<ffffffff8129a642>] sys_sendto+0xee/0x145
> [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
> [<ffffffff8129a668>] sys_sendto+0x114/0x145
> [<ffffffff81000e34>] __switch_to+0x235/0x3c5
> [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff

This seems to be the guy that's stuck. It's waiting for more memory for
the socket but who or what is allocating that memory? There are a few other
bugs from over the weekend that I want to take a look at so I did not dig
further or try to reproduce this bug yet. I'm adding Eric Dumazet back to
the cc in case he has the quick answer.

> ===> 28014[28018]/stack <===
> [<ffffffff8102b23e>] do_wait+0x1a6/0x21a
> [<ffffffff8104757d>] check_preempt_curr+0x25/0x62
> [<ffffffff8102b34a>] sys_wait4+0x98/0xb5
> [<ffffffff81026321>] do_fork+0x12c/0x1a7
> [<ffffffff810297b0>] child_wait_callback+0x0/0x48
> [<ffffffff8131c688>] page_fault+0x28/0x30
> [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff
> 
> > 3. Can you do a sysrq+m and post the resulting dmesg?
> 
> SysRq : Show Memory
> Mem-Info:
> DMA per-cpu:
> CPU    0: hi:    0, btch:   1 usd:   0
> CPU    1: hi:    0, btch:   1 usd:   0
> CPU    2: hi:    0, btch:   1 usd:   0
> CPU    3: hi:    0, btch:   1 usd:   0
> DMA32 per-cpu:
> CPU    0: hi:  186, btch:  31 usd:   4
> CPU    1: hi:  186, btch:  31 usd: 181
> CPU    2: hi:  186, btch:  31 usd:  46
> CPU    3: hi:  186, btch:  31 usd:  13
> Normal per-cpu:
> CPU    0: hi:  186, btch:  31 usd: 106
> CPU    1: hi:  186, btch:  31 usd: 183
> CPU    2: hi:  186, btch:  31 usd:  20
> CPU    3: hi:  186, btch:  31 usd:  76
> active_anon:85782 inactive_anon:25023 isolated_anon:0
>  active_file:209440 inactive_file:2610279 isolated_file:0
>  unevictable:0 dirty:696664 writeback:629020 unstable:0
>  free:44152 slab_reclaimable:68414 slab_unreclaimable:14178
>  mapped:6017 shmem:24101 pagetables:3136 bounce:0
>  free_cma:0
> DMA free:15872kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15640kB managed:15896kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:24kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
> lowmem_reserve[]: 0 3132 12078 12078
> DMA32 free:85264kB min:17504kB low:21880kB high:26256kB active_anon:46808kB inactive_anon:21212kB active_file:122040kB inactive_file:2833064kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:3208020kB managed:3185856kB mlocked:0kB dirty:92120kB writeback:225356kB mapped:356kB shmem:6776kB slab_reclaimable:67156kB slab_unreclaimable:7412kB kernel_stack:80kB pagetables:816kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
> lowmem_reserve[]: 0 0 8946 8946
> Normal free:75472kB min:49988kB low:62484kB high:74980kB active_anon:296320kB inactive_anon:78880kB active_file:715720kB inactive_file:7608052kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:9160704kB managed:9084264kB mlocked:0kB dirty:2694536kB writeback:2290724kB mapped:23712kB shmem:89628kB slab_reclaimable:206500kB slab_unreclaimable:49276kB kernel_stack:2432kB pagetables:11728kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
> lowmem_reserve[]: 0 0 0 0
> DMA: 0*4kB 0*8kB 0*16kB 0*32kB 2*64kB (U) 1*128kB (U) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (R) 3*4096kB (M) = 15872kB
> DMA32: 1681*4kB (UEM) 3196*8kB (UEM) 3063*16kB (UEM) 63*32kB (UEM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 85364kB
> Normal: 8874*4kB (UEM) 1885*8kB (UEM) 581*16kB (UEM) 412*32kB (UM) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 1*2048kB (R) 0*4096kB = 75104kB

Nothing wrong there that I can see. The free list contents roughly match
up with the NR_FREE_PAGES counter so it doesn't look like an accounting
bug. However, an accounting bug could have broken the bisection and
found a different bug.

When taking pages straight off the buddy list like this patch does,
there is a danger that the watermarks will be broken, resulting in a
livelock, but the watermarks are checked properly and the free pages are
over the min watermark above.

There is this patch https://lkml.org/lkml/2013/1/6/219 but it is
unlikely that it has anything to do with your workload as it does not
use splice().

Right now it's difficult to see how the capture could be the source of
this bug, but I'm not ruling it out either, so try the following
(untested but should be OK) patch.  It's not a proper revert; it just
disables the capture page logic to see if it's at fault.

diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e4..81a637d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1054,9 +1054,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..85d3f9d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2179,11 +2179,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 						contended_compaction, &page);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
+	/* capture page is disabled, this should be impossible */
+	BUG_ON(page);
 
 	if (*did_some_progress != COMPACT_SKIPPED) {
 		/* Page migration frees to the PCP lists but we want merging */
@@ -2195,7 +2192,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 12:25       ` Mel Gorman
@ 2013-01-07 22:38         ` Eric Dumazet
  2013-01-08  0:21           ` Eric Wong
  2013-01-07 22:38         ` Eric Wong
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-07 22:38 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Mon, 2013-01-07 at 12:25 +0000, Mel Gorman wrote:

> 
> > ===> 28014[28017]/stack <===
> > [<ffffffff8129fc1d>] release_sock+0xe5/0x11b
> > [<ffffffff812a642c>] sk_stream_wait_memory+0x1f7/0x1fc
> > [<ffffffff81040d5e>] autoremove_wake_function+0x0/0x2a
> > [<ffffffff812d8fc3>] tcp_sendmsg+0x710/0x86d
> > [<ffffffff8129a33e>] sock_sendmsg+0x7b/0x93
> > [<ffffffff8129a642>] sys_sendto+0xee/0x145
> > [<ffffffff8129a3bc>] sockfd_lookup_light+0x1a/0x50
> > [<ffffffff8129a668>] sys_sendto+0x114/0x145
> > [<ffffffff81000e34>] __switch_to+0x235/0x3c5
> > [<ffffffff81322769>] system_call_fastpath+0x16/0x1b
> > [<ffffffffffffffff>] 0xffffffffffffffff
> 
> This seems to be the guy that's stuck. It's waiting for more memory for
> the socket but who or what is allocating that memory? There are a few other
> bugs from over the weekend that I want to take a look at so I did not dig
> further or try to reproduce this bug yet. I'm adding Eric Dumazet back to
> the cc in case he has the quick answer.

Thanks Mel

It would not surprise me if sk_stream_wait_memory() has plain bug(s) or
race(s).

In 2010, in commit 482964e56e132, Nagendra Tomar fixed a pretty severe
long-standing bug.

This path is not taken very often on most machines.

I would try the following patch:

diff --git a/net/core/stream.c b/net/core/stream.c
index f5df85d..6f09979 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -126,6 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 
 	while (1) {
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
@@ -139,7 +140,6 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		if (sk_stream_memory_free(sk) && !vm_wait)
 			break;
 
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk->sk_write_pending++;
 		sk_wait_event(sk, &current_timeo, sk->sk_err ||
 						  (sk->sk_shutdown & SEND_SHUTDOWN) ||
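
If I have the race right: SOCK_NOSPACE is what the TCP write-space path
(tcp_check_space()) tests before waking writers, so it needs to be set
before the final sk_stream_memory_free() check.  Setting it only
afterwards leaves a window where an incoming ACK frees memory, the
wakeup is skipped because the flag is not yet visible, and the sender
sleeps until it times out.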





^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 12:25       ` Mel Gorman
  2013-01-07 22:38         ` Eric Dumazet
@ 2013-01-07 22:38         ` Eric Wong
  2013-01-08 20:14           ` Eric Wong
  2013-01-08 22:43           ` Mel Gorman
  1 sibling, 2 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-07 22:38 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> Right now it's difficult to see how the capture could be the source of
> this bug, but I'm not ruling it out either, so try the following
> (untested but should be OK) patch.  It's not a proper revert; it just
> disables the capture page logic to see if it's at fault.

Things look good so far with your change.
It's been running 2 hours on a VM and 1 hour on my regular machine.
Will update again in a few hours (or sooner if it's stuck again).


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 22:38         ` Eric Dumazet
@ 2013-01-08  0:21           ` Eric Wong
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-08  0:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> It would not surprise me if sk_stream_wait_memory() have plain bug(s) or
> race(s).
> 
> In 2010, in commit 482964e56e132 Nagendra Tomar fixed a pretty severe
> long standing bug.
> 
> This path is not taken very often on most machines.
> 
> I would try the following patch :

With your patch alone (on top of 3.8-rc2) running on my VM,
I was able to get toosleepy stuck within a few minutes.

===> /proc/vmstat <===
nr_free_pages 3251
nr_inactive_anon 3974
nr_active_anon 3638
nr_inactive_file 100973
nr_active_file 4669
nr_unevictable 0
nr_mlock 0
nr_anon_pages 7515
nr_mapped 2328
nr_file_pages 105739
nr_dirty 6
nr_writeback 0
nr_slab_reclaimable 1703
nr_slab_unreclaimable 5465
nr_page_table_pages 735
nr_kernel_stack 161
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 17
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 115
nr_dirtied 1575304
nr_written 95797
nr_anon_transparent_hugepages 0
nr_free_cma 0
nr_dirty_threshold 22988
nr_dirty_background_threshold 11494
pgpgin 61164
pgpgout 385372
pswpin 0
pswpout 0
pgalloc_dma 123943
pgalloc_dma32 15694247
pgalloc_normal 0
pgalloc_movable 0
pgfree 15823064
pgactivate 6119
pgdeactivate 5134
pgfault 1439865
pgmajfault 495
pgrefill_dma 1230
pgrefill_dma32 3904
pgrefill_normal 0
pgrefill_movable 0
pgsteal_kswapd_dma 22875
pgsteal_kswapd_dma32 1272136
pgsteal_kswapd_normal 0
pgsteal_kswapd_movable 0
pgsteal_direct_dma 3351
pgsteal_direct_dma32 187467
pgsteal_direct_normal 0
pgsteal_direct_movable 0
pgscan_kswapd_dma 22879
pgscan_kswapd_dma32 1273059
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 3351
pgscan_direct_dma32 187566
pgscan_direct_normal 0
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 8
slabs_scanned 14336
kswapd_inodesteal 91
kswapd_low_wmark_hit_quickly 2797
kswapd_high_wmark_hit_quickly 65
kswapd_skip_congestion_wait 3
pageoutrun 18900
allocstall 3350
pgrotated 10
pgmigrate_success 278
pgmigrate_fail 0
compact_migrate_scanned 68864
compact_free_scanned 118486
compact_isolated 6958
compact_stall 306
compact_fail 128
compact_success 178
unevictable_pgs_culled 1063
unevictable_pgs_scanned 0
unevictable_pgs_rescued 1669
unevictable_pgs_mlocked 1669
unevictable_pgs_munlocked 1666
unevictable_pgs_cleared 3
unevictable_pgs_stranded 0
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
===> 6018[6018]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 6018[6019]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 6018[6020]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 6018[6021]/stack <===
[<ffffffff81310058>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134be67>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370cce>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300a77>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a59>] sys_sendto+0x119/0x160
[<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 6018[6022]/stack <===
[<ffffffff810400b8>] do_wait+0x1f8/0x220
[<ffffffff81040ea0>] sys_wait4+0x70/0xf0
[<ffffffff813b06a9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 156
CPU    1: hi:  186, btch:  31 usd: 158
active_anon:3546 inactive_anon:3645 isolated_anon:0
 active_file:4327 inactive_file:101560 isolated_file:0
 unevictable:0 dirty:1 writeback:0 unstable:0
 free:3057 slab_reclaimable:1435 slab_unreclaimable:5441
 mapped:2308 shmem:115 pagetables:798 bounce:0
 free_cma:0
DMA free:2080kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:13428kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:32kB slab_unreclaimable:84kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:10148kB min:2784kB low:3480kB high:4176kB active_anon:14184kB inactive_anon:14580kB active_file:17308kB inactive_file:392812kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:4kB writeback:0kB mapped:9232kB shmem:460kB slab_reclaimable:5708kB slab_unreclaimable:21680kB kernel_stack:1352kB pagetables:3192kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 10*4kB (UR) 0*8kB 2*16kB (U) 3*32kB (R) 2*64kB (R) 0*128kB 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2088kB
DMA32: 370*4kB (UEM) 194*8kB (UM) 72*16kB (UM) 33*32kB (UM) 25*64kB (UEM) 18*128kB (EM) 4*256kB (M) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 10168kB
105998 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
276280 pages shared
118656 pages non-shared
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 158
CPU    1: hi:  186, btch:  31 usd: 176
active_anon:3376 inactive_anon:3666 isolated_anon:0
 active_file:4331 inactive_file:101207 isolated_file:0
 unevictable:0 dirty:0 writeback:38 unstable:0
 free:3683 slab_reclaimable:1460 slab_unreclaimable:5398
 mapped:2306 shmem:115 pagetables:762 bounce:0
 free_cma:0
DMA free:2168kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:16kB active_file:0kB inactive_file:13348kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:40kB slab_unreclaimable:76kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:12564kB min:2784kB low:3480kB high:4176kB active_anon:13504kB inactive_anon:14648kB active_file:17324kB inactive_file:391480kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:0kB writeback:152kB mapped:9224kB shmem:460kB slab_reclaimable:5800kB slab_unreclaimable:21516kB kernel_stack:1368kB pagetables:3048kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 25*4kB (MR) 1*8kB (R) 3*16kB (R) 3*32kB (R) 2*64kB (R) 0*128kB 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2172kB
DMA32: 427*4kB (UM) 336*8kB (UEM) 126*16kB (UEM) 49*32kB (UEM) 40*64kB (UEM) 10*128kB (UM) 3*256kB (M) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 12588kB
105658 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
275229 pages shared
118788 pages non-shared
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 108
CPU    1: hi:  186, btch:  31 usd: 166
active_anon:3022 inactive_anon:3664 isolated_anon:0
 active_file:4405 inactive_file:69838 isolated_file:0
 unevictable:0 dirty:5 writeback:4813 unstable:0
 free:34429 slab_reclaimable:1723 slab_unreclaimable:5522
 mapped:2322 shmem:115 pagetables:748 bounce:0
 free_cma:0
DMA free:3616kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:20kB active_file:0kB inactive_file:10480kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:560kB mapped:0kB shmem:0kB slab_reclaimable:108kB slab_unreclaimable:328kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:134100kB min:2784kB low:3480kB high:4176kB active_anon:12088kB inactive_anon:14636kB active_file:17620kB inactive_file:268872kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:20kB writeback:18692kB mapped:9288kB shmem:460kB slab_reclaimable:6784kB slab_unreclaimable:21760kB kernel_stack:1328kB pagetables:2992kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 136*4kB (UEMR) 89*8kB (UMR) 42*16kB (UMR) 7*32kB (UMR) 3*64kB (R) 0*128kB 1*256kB (R) 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 3624kB
DMA32: 4839*4kB (UEM) 4648*8kB (UEM) 2853*16kB (UEM) 868*32kB (UEM) 65*64kB (UEM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 134124kB
74344 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
285651 pages shared
82395 pages non-shared
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 166
CPU    1: hi:  186, btch:  31 usd:  28
active_anon:3729 inactive_anon:3974 isolated_anon:0
 active_file:4669 inactive_file:101127 isolated_file:0
 unevictable:0 dirty:6 writeback:0 unstable:0
 free:3244 slab_reclaimable:1703 slab_unreclaimable:5465
 mapped:2328 shmem:115 pagetables:754 bounce:0
 free_cma:0
DMA free:2360kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:4kB active_file:20kB inactive_file:9756kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:12kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:368kB slab_unreclaimable:324kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:10616kB min:2784kB low:3480kB high:4176kB active_anon:14916kB inactive_anon:15892kB active_file:18656kB inactive_file:394752kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:12kB writeback:0kB mapped:9312kB shmem:460kB slab_reclaimable:6444kB slab_unreclaimable:21536kB kernel_stack:1288kB pagetables:3016kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 43*4kB (UMR) 16*8kB (UMR) 19*16kB (MR) 23*32kB (UR) 8*64kB (R) 2*128kB (R) 1*256kB (R) 0*512kB 0*1024kB 0*2048kB 0*4096kB = 2364kB
DMA32: 634*4kB (UEM) 615*8kB (UEM) 199*16kB (UEM) 1*32kB (M) 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 10672kB
105892 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
274934 pages shared
119600 pages non-shared


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 22:38         ` Eric Wong
@ 2013-01-08 20:14           ` Eric Wong
  2013-01-08 22:43           ` Mel Gorman
  1 sibling, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-08 20:14 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > Right now it's difficult to see how the capture could be the source of
> > this bug but I'm not ruling it out either so try the following (untested
> > but should be ok) patch.  It's not a proper revert, it just disables the
> > capture page logic to see if it's at fault.
> 
> Things look good so far with your change.
> It's been running 2 hours on a VM and 1 hour on my regular machine.
> Will update again in a few hours (or sooner if it's stuck again).

Things still seem good on my regular machine.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-07 22:38         ` Eric Wong
  2013-01-08 20:14           ` Eric Wong
@ 2013-01-08 22:43           ` Mel Gorman
  2013-01-08 23:23             ` Eric Wong
  2013-01-09 21:29             ` Eric Wong
  1 sibling, 2 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-08 22:43 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

On Mon, Jan 07, 2013 at 10:38:50PM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > Right now it's difficult to see how the capture could be the source of
> > this bug but I'm not ruling it out either so try the following (untested
> > but should be ok) patch.  It's not a proper revert, it just disables the
> > capture page logic to see if it's at fault.
> 
> Things look good so far with your change.

Ok, so minimally reverting is an option once 2e30abd1 is preserved. The
original motivation for the patch was to improve allocation success rates
under load, but due to a bug in the patch the likely source of the
improvement was compacting more for THP allocations.

> It's been running 2 hours on a VM and 1 hour on my regular machine.
> Will update again in a few hours (or sooner if it's stuck again).

When I looked at it for long enough I found a number of problems. Most
affect timing, but two serious issues are in there. One affects how long
kswapd spends compacting versus reclaiming, and the other increases lock
contention, meaning that async compaction can abort early. Either could
explain why a driver would fail high-order allocations.

Please try the following patch. However, even if it works the benefit of
capture may be so marginal that partially reverting it and simplifying
compaction.c is the better decision.

diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e4..03c82c0 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -857,7 +857,8 @@ static int compact_finished(struct zone *zone,
 	} else {
 		unsigned int order;
 		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
+			struct free_area *area = &zone->free_area[order];
+
 			/* Job done if page is free of the right migratetype */
 			if (!list_empty(&area->free_list[cc->migratetype]))
 				return COMPACT_PARTIAL;
@@ -929,6 +930,11 @@ static void compact_capture_page(struct compact_control *cc)
 	if (!cc->page || *cc->page)
 		return;
 
+	/* Check that watermarks are satisfied before acquiring locks */
+	if (!zone_watermark_ok(cc->zone, cc->order, low_wmark_pages(cc->zone),
+									0, 0))
+		return;
+
 	/*
 	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
 	 * regardless of the migratetype of the freelist it is captured from.
@@ -941,7 +947,7 @@ static void compact_capture_page(struct compact_control *cc)
 	 */
 	if (cc->migratetype == MIGRATE_MOVABLE) {
 		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
+		mtype_high = MIGRATE_PCPTYPES + 1;
 	} else {
 		mtype_low = cc->migratetype;
 		mtype_high = cc->migratetype + 1;
@@ -1118,7 +1124,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
-	int alloc_flags = 0;
 
 	/* Check if the GFP flags allow compaction */
 	if (!order || !may_enter_fs || !may_perform_io)
@@ -1126,10 +1131,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 
 	count_compact_event(COMPACTSTALL);
 
-#ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-		alloc_flags |= ALLOC_CMA;
-#endif
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
@@ -1139,9 +1140,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 						contended, page);
 		rc = max(status, rc);
 
-		/* If a normal allocation would succeed, stop compacting */
-		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
-				      alloc_flags))
+		/* If a page was captured, stop compacting */
+		if (*page)
 			break;
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..9d20c13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2180,10 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags &= ~PF_MEMALLOC;
 
 	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
+	if (page && !prep_new_page(page, order, gfp_mask))
 		goto got_page;
-	}
 
 	if (*did_some_progress != COMPACT_SKIPPED) {
 		/* Page migration frees to the PCP lists but we want merging */

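To spell out what the first hunk fixes: the loop walks the orders upwards,
but the array was still indexed with the starting order, so only one
free_area slot was ever examined.  A minimal standalone sketch of that bug
shape (the free_area naming mirrors the kernel code, but the userspace
harness is entirely made up):

#include <stdio.h>

#define MAX_ORDER 11

struct free_area { int nr_free; };

int main(void)
{
	struct free_area free_area[MAX_ORDER] = { { 0 } };
	int requested_order = 3;
	int order;

	free_area[5].nr_free = 1;	/* pretend a block is free at order 5 */

	for (order = requested_order; order < MAX_ORDER; order++) {
		/* buggy: re-checks the requested order on every pass */
		struct free_area *buggy = &free_area[requested_order];
		/* fixed: checks the order actually under inspection */
		struct free_area *fixed = &free_area[order];

		printf("order %2d: buggy sees %d free, fixed sees %d free\n",
		       order, buggy->nr_free, fixed->nr_free);
	}
	return 0;
}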

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-08 22:43           ` Mel Gorman
@ 2013-01-08 23:23             ` Eric Wong
  2013-01-09  2:14               ` Eric Dumazet
  2013-01-09 13:37               ` Mel Gorman
  2013-01-09 21:29             ` Eric Wong
  1 sibling, 2 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-08 23:23 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> Please try the following patch. However, even if it works the benefit of
> capture may be so marginal that partially reverting it and simplifying
> compaction.c is the better decision.

I already got my VM stuck on this one.  I had two toosleepy instances;
2774 was the one that got stuck (also confirmed by watching top).

Btw, have you been able to reproduce this on your end?

I think the easiest reproduction on my 2-core VM is by running 2
toosleepy processes and doing the following to dirty a lot of pages:

  while time find $LARGISH_NFS_MOUNT -type f -print0 | \
    xargs -0 -n1 -P4 sh -c 'cat "$1" >> /tmp/z; > /tmp/z' --; do date; done

I've updated git://bogomips.org/toosleepy.git to automate the reporting
for me.

===> /proc/vmstat <===
nr_free_pages 2035
nr_inactive_anon 4044
nr_active_anon 3913
nr_inactive_file 98877
nr_active_file 4373
nr_unevictable 0
nr_mlock 0
nr_anon_pages 7839
nr_mapped 2350
nr_file_pages 103382
nr_dirty 512
nr_writeback 0
nr_slab_reclaimable 1578
nr_slab_unreclaimable 5642
nr_page_table_pages 800
nr_kernel_stack 170
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 115
nr_dirtied 889731
nr_written 25225
nr_anon_transparent_hugepages 0
nr_free_cma 0
nr_dirty_threshold 22336
nr_dirty_background_threshold 11168
pgpgin 45284
pgpgout 101948
pswpin 0
pswpout 0
pgalloc_dma 299007
pgalloc_dma32 24235925
pgalloc_normal 0
pgalloc_movable 0
pgfree 24539843
pgactivate 5440
pgdeactivate 4476
pgfault 1072378
pgmajfault 338
pgrefill_dma 508
pgrefill_dma32 3968
pgrefill_normal 0
pgrefill_movable 0
pgsteal_kswapd_dma 22463
pgsteal_kswapd_dma32 553340
pgsteal_kswapd_normal 0
pgsteal_kswapd_movable 0
pgsteal_direct_dma 3956
pgsteal_direct_dma32 220354
pgsteal_direct_normal 0
pgsteal_direct_movable 0
pgscan_kswapd_dma 22463
pgscan_kswapd_dma32 554313
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 3956
pgscan_direct_dma32 220397
pgscan_direct_normal 0
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 0
slabs_scanned 4096
kswapd_inodesteal 0
kswapd_low_wmark_hit_quickly 1726
kswapd_high_wmark_hit_quickly 21
kswapd_skip_congestion_wait 0
pageoutrun 9065
allocstall 4004
pgrotated 0
pgmigrate_success 1242
pgmigrate_fail 0
compact_migrate_scanned 141232
compact_free_scanned 181666
compact_isolated 52638
compact_stall 2024
compact_fail 1450
compact_success 574
unevictable_pgs_culled 1063
unevictable_pgs_scanned 0
unevictable_pgs_rescued 1653
unevictable_pgs_mlocked 1653
unevictable_pgs_munlocked 1652
unevictable_pgs_cleared 1
unevictable_pgs_stranded 0
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
===> 2724[2724]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2725]/stack <===
[<ffffffff810f5944>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d94>] do_sys_poll+0x374/0x4b0
[<ffffffff810f71ce>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2726]/stack <===
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2727]/stack <===
[<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a99>] sys_sendto+0x119/0x160
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2728]/stack <===
[<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150
[<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2774[2774]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2774[2775]/stack <===
[<ffffffff810f5944>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d94>] do_sys_poll+0x374/0x4b0
[<ffffffff810f71ce>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2774[2776]/stack <===
[<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a99>] sys_sendto+0x119/0x160
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2774[2777]/stack <===
[<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a99>] sys_sendto+0x119/0x160
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2774[2778]/stack <===
[<ffffffff810400b8>] do_wait+0x1f8/0x220
[<ffffffff81040ea0>] sys_wait4+0x70/0xf0
[<ffffffff813b0729>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 108
CPU    1: hi:  186, btch:  31 usd: 162
active_anon:3990 inactive_anon:4042 isolated_anon:0
 active_file:4362 inactive_file:98536 isolated_file:0
 unevictable:0 dirty:513 writeback:0 unstable:0
 free:1896 slab_reclaimable:1530 slab_unreclaimable:5661
 mapped:2342 shmem:115 pagetables:784 bounce:0
 free_cma:0
DMA free:2080kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:12168kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:192kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:5568kB min:2784kB low:3480kB high:4176kB active_anon:15960kB inactive_anon:16168kB active_file:17448kB inactive_file:381976kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:2052kB writeback:0kB mapped:9368kB shmem:460kB slab_reclaimable:6104kB slab_unreclaimable:22452kB kernel_stack:1416kB pagetables:3136kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 6*4kB (UR) 5*8kB (UR) 2*16kB (U) 0*32kB 11*64kB (R) 2*128kB (R) 0*256kB 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 2080kB
DMA32: 280*4kB (UEM) 66*8kB (UEM) 99*16kB (U) 40*32kB (UM) 16*64kB (M) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 5536kB
103002 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
411919 pages shared
116133 pages non-shared

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-08 23:23             ` Eric Wong
@ 2013-01-09  2:14               ` Eric Dumazet
  2013-01-09  2:32                 ` Eric Dumazet
  2013-01-09 13:37               ` Mel Gorman
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-09  2:14 UTC (permalink / raw)
  To: Eric Wong
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > Please try the following patch. However, even if it works the benefit of
> > capture may be so marginal that partially reverting it and simplifying
> > compaction.c is the better decision.
> 
> I already got my VM stuck on this one.  I had two toosleepy instances;
> 2774 was the one that got stuck (also confirmed by watching top).
> 
> Btw, have you been able to reproduce this on your end?
> 
> I think the easiest reproduction on my 2-core VM is by running 2
> toosleepy processes and doing the following to dirty a lot of pages:

Given the persistent sk_stream_wait_memory() traces, I suspect a plain
TCP bug, triggered by some extra wait somewhere.

Please don't spend too much time on this right now, mm guys; I'll try to
reproduce the problem.

Don't be confused by the sk_stream_wait_memory() name.
A thread is stuck here because the TCP stack is failing to wake it.
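
For anyone unfamiliar with it: sk_stream_wait_memory() just parks the
sending thread on the socket's wait queue until write space appears, so a
lost wakeup leaves it there forever.  A heavily abridged sketch of the
shape of that wait, not the real function (the real one also handles
signals, timeouts and socket errors):

/* Abridged sketch of the sender-side wait */
static void wait_for_write_space(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	while (!sk_stream_memory_free(sk)) {
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		/* Sleeps until sk->sk_write_space() is invoked, normally
		 * from ACK processing once queued data is freed.  If the
		 * peer never ACKs -- say the skb is dropped on every
		 * retransmit -- this wakeup never arrives. */
		timeo = schedule_timeout(timeo);
		finish_wait(sk_sleep(sk), &wait);
	}
}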




^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09  2:14               ` Eric Dumazet
@ 2013-01-09  2:32                 ` Eric Dumazet
  2013-01-09  2:54                   ` Eric Dumazet
  2013-01-09 13:42                   ` Mel Gorman
  0 siblings, 2 replies; 53+ messages in thread
From: Eric Dumazet @ 2013-01-09  2:32 UTC (permalink / raw)
  To: Eric Wong
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Tue, 2013-01-08 at 18:14 -0800, Eric Dumazet wrote:
> On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote:
> > Mel Gorman <mgorman@suse.de> wrote:
> > > Please try the following patch. However, even if it works the benefit of
> > > capture may be so marginal that partially reverting it and simplifying
> > > compaction.c is the better decision.
> > 
> > I already got my VM stuck on this one.  I had two toosleepy instances;
> > 2774 was the one that got stuck (also confirmed by watching top).
> > 
> > Btw, have you been able to reproduce this on your end?
> > 
> > I think the easiest reproduction on my 2-core VM is by running 2
> > toosleepy processes and doing the following to dirty a lot of pages:
> 
> Given the persistent sk_stream_wait_memory() traces, I suspect a plain
> TCP bug, triggered by some extra wait somewhere.
> 
> Please don't spend too much time on this right now, mm guys; I'll try to
> reproduce the problem.
> 
> Don't be confused by the sk_stream_wait_memory() name.
> A thread is stuck here because the TCP stack is failing to wake it.
> 

Hmm, it seems sk_filter() can return -ENOMEM because the skb has
pfmemalloc set.

It seems nobody really tested this stuff under memory stress.

Mel, it looks like you are the guy who could fix this, after all ;)

One TCP socket keeps retransmitting an SKB via loopback, and the TCP
stack drops the packet again and again.
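
The check in question sits at the very top of sk_filter(); paraphrasing
the net/core/filter.c code of this era (treat the details as approximate):

/* Paraphrase of the sk_filter() entry check: an skb built from
 * pfmemalloc reserves may only be delivered to sockets that are
 * themselves helping memory reclaim (SOCK_MEMALLOC). */
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;		/* caller drops the packet */

	/* ... normal BPF socket-filter processing follows ... */
	return 0;
}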


commit c93bdd0e03e848555d144eb44a1f275b871a8dd5
Author: Mel Gorman <mgorman@suse.de>
Date:   Tue Jul 31 16:44:19 2012 -0700

    netvm: allow skb allocation to use PFMEMALLOC reserves
    
    Change the skb allocation API to indicate RX usage and use this to fall
    back to the PFMEMALLOC reserve when needed.  SKBs allocated from the
    reserve are tagged in skb->pfmemalloc.  If an SKB is allocated from the
    reserve and the socket is later found to be unrelated to page reclaim, the
    packet is dropped so that the memory remains available for page reclaim.
    Network protocols are expected to recover from this packet loss.
    
    [a.p.zijlstra@chello.nl: Ideas taken from various patches]
    [davem@davemloft.net: Use static branches, coding style corrections]
    [sebastian@breakpoint.cc: Avoid unnecessary cast, fix !CONFIG_NET build]
    Signed-off-by: Mel Gorman <mgorman@suse.de>
    Acked-by: David S. Miller <davem@davemloft.net>
    Cc: Neil Brown <neilb@suse.de>
    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
    Cc: Mike Christie <michaelc@cs.wisc.edu>
    Cc: Eric B Munson <emunson@mgebm.net>
    Cc: Eric Dumazet <eric.dumazet@gmail.com>
    Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
    Cc: Mel Gorman <mgorman@suse.de>
    Cc: Christoph Lameter <cl@linux.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

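The tagging described in that changelog is opt-in per socket: only sockets
doing reclaim work (the swap-over-network transports) declare themselves
SOCK_MEMALLOC; everything else is expected to shed pfmemalloc skbs.
Roughly (the call site below is hypothetical, modelled on the
swap-over-NFS usage):

	/* Hypothetical opt-in site: a socket that writes out swap pages
	 * declares itself part of memory reclaim. */
	sk_set_memalloc(sk);	/* sets SOCK_MEMALLOC + __GFP_MEMALLOC, so
				 * sk_filter() will accept pfmemalloc skbs */
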


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09  2:32                 ` Eric Dumazet
@ 2013-01-09  2:54                   ` Eric Dumazet
  2013-01-09  3:55                     ` Eric Wong
  2013-01-09 13:42                   ` Mel Gorman
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Dumazet @ 2013-01-09  2:54 UTC (permalink / raw)
  To: Eric Wong, Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Andrew Morton, Linus Torvalds

On Tue, 2013-01-08 at 18:32 -0800, Eric Dumazet wrote:

> 
> Hmm, it seems sk_filter() can return -ENOMEM because the skb has
> pfmemalloc set.

> 
> One TCP socket keeps retransmitting an SKB via loopback, and the TCP
> stack drops the packet again and again.

sock_init_data() sets sk->sk_allocation to GFP_KERNEL

Shouldn't it use (GFP_KERNEL | __GFP_NOMEMALLOC) instead?



diff --git a/net/core/sock.c b/net/core/sock.c
index bc131d4..76c4b39 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -286,6 +286,7 @@ void sk_set_memalloc(struct sock *sk)
 {
 	sock_set_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation |= __GFP_MEMALLOC;
+	sk->sk_allocation &= ~__GFP_NOMEMALLOC;
 	static_key_slow_inc(&memalloc_socks);
 }
 EXPORT_SYMBOL_GPL(sk_set_memalloc);
@@ -294,6 +295,7 @@ void sk_clear_memalloc(struct sock *sk)
 {
 	sock_reset_flag(sk, SOCK_MEMALLOC);
 	sk->sk_allocation &= ~__GFP_MEMALLOC;
+	sk->sk_allocation |= __GFP_NOMEMALLOC;
 	static_key_slow_dec(&memalloc_socks);
 
 	/*
@@ -2230,7 +2232,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 	init_timer(&sk->sk_timer);
 
-	sk->sk_allocation	=	GFP_KERNEL;
+	sk->sk_allocation	=	GFP_KERNEL | __GFP_NOMEMALLOC;
 	sk->sk_rcvbuf		=	sysctl_rmem_default;
 	sk->sk_sndbuf		=	sysctl_wmem_default;
 	sk->sk_state		=	TCP_CLOSE;

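For reference, __GFP_NOMEMALLOC vetoes the reserve access that
__GFP_MEMALLOC or PF_MEMALLOC would otherwise grant, which is what makes
it a plausible default for ordinary sockets.  An approximate paraphrase of
the gfp_to_alloc_flags() logic in mm/page_alloc.c of this era (simplified):

	/* Approximate paraphrase of gfp_to_alloc_flags() */
	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
		if (gfp_mask & __GFP_MEMALLOC)
			alloc_flags |= ALLOC_NO_WATERMARKS;
		else if (!in_interrupt() && (current->flags & PF_MEMALLOC))
			alloc_flags |= ALLOC_NO_WATERMARKS;
	}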


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09  2:54                   ` Eric Dumazet
@ 2013-01-09  3:55                     ` Eric Wong
  2013-01-09  8:42                       ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-09  3:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Dumazet <erdnetdev@gmail.com> wrote:
> On Tue, 2013-01-08 at 18:32 -0800, Eric Dumazet wrote:
> > Hmm, it seems sk_filter() can return -ENOMEM because the skb has
> > pfmemalloc set.
> 
> > 
> > One TCP socket keeps retransmitting an SKB via loopback, and the TCP
> > stack drops the packet again and again.
> 
> sock_init_data() sets sk->sk_allocation to GFP_KERNEL
> 
> Shouldn't it use (GFP_KERNEL | __GFP_NOMEMALLOC) instead?

Thanks, things are running well after ~35 minutes so far.
Will report back if things break (hopefully I don't run out
of laptop battery power :x).

I'm now getting allocation failure warnings (which I don't believe
happened before, but which should be expected, I think...)

kworker/1:1: page allocation failure: order:0, mode:0x20
Pid: 236, comm: kworker/1:1 Not tainted 3.8.0-rc2w5+ #76
Call Trace:
 <IRQ>  [<ffffffff810a2411>] warn_alloc_failed+0xe1/0x130
 [<ffffffff810a5779>] __alloc_pages_nodemask+0x5e9/0x840
 [<ffffffff8133df8d>] ? ip_rcv+0x24d/0x340
 [<ffffffff811f35b3>] ? sg_init_table+0x23/0x50
 [<ffffffffa002162a>] get_a_page.isra.25+0x3a/0x40 [virtio_net]
 [<ffffffffa0022258>] try_fill_recv+0x318/0x4a0 [virtio_net]
 [<ffffffffa00227bd>] virtnet_poll+0x3dd/0x610 [virtio_net]
 [<ffffffff8131767d>] net_rx_action+0x9d/0x1a0
 [<ffffffff8104284a>] __do_softirq+0xba/0x170
 [<ffffffff813b199c>] call_softirq+0x1c/0x30
 <EOI>  [<ffffffff8100c61d>] do_softirq+0x6d/0xa0
 [<ffffffff81042424>] local_bh_enable+0x94/0xa0
 [<ffffffff813aed45>] __cond_resched_softirq+0x35/0x50
 [<ffffffff81305e7c>] release_sock+0x9c/0x150
 [<ffffffff8134b90e>] tcp_sendmsg+0x11e/0xd80
 [<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0
 [<ffffffff81300a77>] sock_sendmsg+0x87/0xa0
 [<ffffffff810a48c9>] ? __free_memcg_kmem_pages+0x9/0x10
 [<ffffffff81067819>] ? select_task_rq_fair+0x699/0x6b0
 [<ffffffff81300acb>] kernel_sendmsg+0x3b/0x50
 [<ffffffffa0052dc9>] xs_send_kvec+0x89/0xa0 [sunrpc]
 [<ffffffffa00534bf>] xs_sendpages+0x5f/0x1e0 [sunrpc]
 [<ffffffff81047d63>] ? lock_timer_base.isra.32+0x33/0x60
 [<ffffffffa00548e7>] xs_tcp_send_request+0x57/0x110 [sunrpc]
 [<ffffffffa0051c0d>] xprt_transmit+0x6d/0x260 [sunrpc]
 [<ffffffffa004f108>] call_transmit+0x1a8/0x240 [sunrpc]
 [<ffffffffa0056316>] __rpc_execute+0x56/0x250 [sunrpc]
 [<ffffffffa0056535>] rpc_async_schedule+0x25/0x40 [sunrpc]
 [<ffffffff810515cc>] process_one_work+0x12c/0x480
 [<ffffffffa0056510>] ? __rpc_execute+0x250/0x250 [sunrpc]
 [<ffffffff810538ad>] worker_thread+0x15d/0x460
 [<ffffffff81053750>] ? flush_delayed_work+0x60/0x60
 [<ffffffff8105865b>] kthread+0xbb/0xc0
 [<ffffffff810585a0>] ? kthread_create_on_node+0x120/0x120
 [<ffffffff813b063c>] ret_from_fork+0x7c/0xb0
 [<ffffffff810585a0>] ? kthread_create_on_node+0x120/0x120
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd:   0
CPU    1: hi:  186, btch:  31 usd:   0
active_anon:3620 inactive_anon:3624 isolated_anon:0
 active_file:4290 inactive_file:101218 isolated_file:0
 unevictable:0 dirty:2306 writeback:0 unstable:0
 free:1711 slab_reclaimable:1529 slab_unreclaimable:5796
 mapped:2325 shmem:66 pagetables:759 bounce:0
 free_cma:0
DMA free:2012kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:4kB inactive_file:13624kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:244kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:80kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:4832kB min:2784kB low:3480kB high:4176kB active_anon:14480kB inactive_anon:14496kB active_file:17156kB inactive_file:391248kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:8980kB writeback:0kB mapped:9300kB shmem:264kB slab_reclaimable:6100kB slab_unreclaimable:23104kB kernel_stack:1336kB pagetables:3036kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 2*4kB (U) 1*8kB (R) 1*16kB (U) 0*32kB 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2016kB
DMA32: 207*4kB (UEM) 116*8kB (UEM) 32*16kB (UM) 58*32kB (UM) 13*64kB (UM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 4956kB
108890 total pagecache pages
3302 pages in swap cache
Swap cache stats: add 4086, delete 784, find 494/535
Free swap  = 378980kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
541743 pages shared
117221 pages non-shared
cat: page allocation failure: order:0, mode:0x20
Pid: 23684, comm: cat Not tainted 3.8.0-rc2w5+ #76
Call Trace:
 <IRQ>  [<ffffffff810a2411>] warn_alloc_failed+0xe1/0x130
 [<ffffffff810a5779>] __alloc_pages_nodemask+0x5e9/0x840
 [<ffffffff8133df8d>] ? ip_rcv+0x24d/0x340
 [<ffffffff811f35b3>] ? sg_init_table+0x23/0x50
 [<ffffffffa002162a>] get_a_page.isra.25+0x3a/0x40 [virtio_net]
 [<ffffffffa0022258>] try_fill_recv+0x318/0x4a0 [virtio_net]
 [<ffffffffa00227bd>] virtnet_poll+0x3dd/0x610 [virtio_net]
 [<ffffffff8131767d>] net_rx_action+0x9d/0x1a0
 [<ffffffff8104284a>] __do_softirq+0xba/0x170
 [<ffffffff813b199c>] call_softirq+0x1c/0x30
 [<ffffffff8100c61d>] do_softirq+0x6d/0xa0
 [<ffffffff81042a75>] irq_exit+0xa5/0xb0
 [<ffffffff8100c25e>] do_IRQ+0x5e/0xd0
 [<ffffffff813afe2d>] common_interrupt+0x6d/0x6d
 <EOI>  [<ffffffff813af82c>] ? _raw_spin_unlock_irqrestore+0xc/0x20
 [<ffffffff810a91b6>] pagevec_lru_move_fn+0xb6/0xe0
 [<ffffffff810a8780>] ? compound_unlock_irqrestore+0x20/0x20
 [<ffffffffa00acd30>] ? nfs_read_completion+0x190/0x190 [nfs]
 [<ffffffff810a91f7>] __pagevec_lru_add+0x17/0x20
 [<ffffffff810a95c8>] __lru_cache_add+0x68/0x90
 [<ffffffff8109e869>] add_to_page_cache_lru+0x29/0x40
 [<ffffffff810a80cc>] read_cache_pages+0x6c/0x100
 [<ffffffffa00ad4dc>] nfs_readpages+0xcc/0x160 [nfs]
 [<ffffffff810a7f57>] __do_page_cache_readahead+0x1c7/0x280
 [<ffffffff810a827c>] ra_submit+0x1c/0x20
 [<ffffffff810a83ad>] ondemand_readahead+0x12d/0x250
 [<ffffffff8109f37d>] ? __generic_file_aio_write+0x1bd/0x3c0
 [<ffffffff810a8550>] page_cache_async_readahead+0x80/0xa0
 [<ffffffff8109e0b8>] ? find_get_page+0x28/0xd0
 [<ffffffff8109fb73>] generic_file_aio_read+0x503/0x6c0
 [<ffffffffa00a4231>] nfs_file_read+0x91/0xb0 [nfs]
 [<ffffffff810e4477>] do_sync_read+0xa7/0xe0
 [<ffffffff810e4b50>] vfs_read+0xa0/0x160
 [<ffffffff810e4c5d>] sys_read+0x4d/0x90
 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd:  86
CPU    1: hi:  186, btch:  31 usd: 167
active_anon:2376 inactive_anon:2431 isolated_anon:0
 active_file:3712 inactive_file:103686 isolated_file:17
 unevictable:0 dirty:646 writeback:0 unstable:0
 free:807 slab_reclaimable:1485 slab_unreclaimable:5873
 mapped:2343 shmem:66 pagetables:791 bounce:0
 free_cma:0
DMA free:2032kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:24kB inactive_file:12916kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:12kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:124kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:1196kB min:2784kB low:3480kB high:4176kB active_anon:9504kB inactive_anon:9724kB active_file:14824kB inactive_file:401828kB unevictable:0kB isolated(anon):0kB isolated(file):68kB present:499960kB managed:491256kB mlocked:0kB dirty:2588kB writeback:0kB mapped:9360kB shmem:264kB slab_reclaimable:5924kB slab_unreclaimable:23368kB kernel_stack:1336kB pagetables:3164kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 11*4kB (MR) 1*8kB (R) 0*16kB 2*32kB (R) 4*64kB (R) 3*128kB (R) 1*256kB (R) 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 2036kB
DMA32: 40*4kB (UEM) 28*8kB (UM) 22*16kB (UEM) 5*32kB (UM) 5*64kB (UM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 1216kB
108199 total pagecache pages
714 pages in swap cache
Swap cache stats: add 4829, delete 4115, find 583/626
Free swap  = 376280kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
542491 pages shared
117305 pages non-shared
cat: page allocation failure: order:0, mode:0x20
Pid: 24171, comm: cat Not tainted 3.8.0-rc2w5+ #76
Call Trace:
 <IRQ>  [<ffffffff810a2411>] warn_alloc_failed+0xe1/0x130
 [<ffffffff8137871e>] ? fib_table_lookup+0x26e/0x2d0
 [<ffffffff810a5779>] __alloc_pages_nodemask+0x5e9/0x840
 [<ffffffff8130a6c0>] __netdev_alloc_frag+0xa0/0x150
 [<ffffffff8130d9d2>] __netdev_alloc_skb+0x82/0xe0
 [<ffffffffa00225b7>] virtnet_poll+0x1d7/0x610 [virtio_net]
 [<ffffffff8131767d>] net_rx_action+0x9d/0x1a0
 [<ffffffff8104284a>] __do_softirq+0xba/0x170
 [<ffffffff813b199c>] call_softirq+0x1c/0x30
 [<ffffffff8100c61d>] do_softirq+0x6d/0xa0
 [<ffffffff81042a75>] irq_exit+0xa5/0xb0
 [<ffffffff8100c25e>] do_IRQ+0x5e/0xd0
 [<ffffffff813afe2d>] common_interrupt+0x6d/0x6d
 <EOI>  [<ffffffff813aeeb5>] ? io_schedule+0xa5/0xd0
 [<ffffffff811ef000>] ? copy_user_generic_string+0x30/0x40
 [<ffffffff8109db32>] ? __lock_page_killable+0x62/0x70
 [<ffffffff8109da85>] ? file_read_actor+0x135/0x180
 [<ffffffff8109f950>] generic_file_aio_read+0x2e0/0x6c0
 [<ffffffffa00a4231>] nfs_file_read+0x91/0xb0 [nfs]
 [<ffffffff810e4477>] do_sync_read+0xa7/0xe0
 [<ffffffff810e4b50>] vfs_read+0xa0/0x160
 [<ffffffff810e4c5d>] sys_read+0x4d/0x90
 [<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd:  50
CPU    1: hi:  186, btch:  31 usd:  30
active_anon:2367 inactive_anon:2431 isolated_anon:0
 active_file:3719 inactive_file:103732 isolated_file:0
 unevictable:0 dirty:1302 writeback:0 unstable:0
 free:754 slab_reclaimable:1589 slab_unreclaimable:5896
 mapped:2343 shmem:66 pagetables:781 bounce:0
 free_cma:0
DMA free:1980kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB active_file:24kB inactive_file:9704kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:84kB writeback:0kB mapped:12kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:180kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:1036kB min:2784kB low:3480kB high:4176kB active_anon:9468kB inactive_anon:9724kB active_file:14852kB inactive_file:405224kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:5124kB writeback:0kB mapped:9360kB shmem:264kB slab_reclaimable:6340kB slab_unreclaimable:23404kB kernel_stack:1368kB pagetables:3124kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 34*4kB (UR) 23*8kB (UMR) 40*16kB (UMR) 2*32kB (UM) 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 0*1024kB 0*2048kB 0*4096kB = 1984kB
DMA32: 1*4kB (U) 33*8kB (UEM) 8*16kB (UM) 4*32kB (UM) 8*64kB (UM) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 1036kB
108257 total pagecache pages
712 pages in swap cache
Swap cache stats: add 4829, delete 4117, find 585/628
Free swap  = 376288kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
280574 pages shared
116800 pages non-shared
-- 
Eric Wong


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09  3:55                     ` Eric Wong
@ 2013-01-09  8:42                       ` Eric Wong
  2013-01-09  8:51                         ` Eric Wong
  0 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-09  8:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> Eric Dumazet <erdnetdev@gmail.com> wrote:
> > On Tue, 2013-01-08 at 18:32 -0800, Eric Dumazet wrote:
> > > Hmm, it seems sk_filter() can return -ENOMEM because the skb has
> > > pfmemalloc set.
> > 
> > > 
> > > One TCP socket keeps retransmitting an SKB via loopback, and the TCP
> > > stack drops the packet again and again.
> > 
> > sock_init_data() sets sk->sk_allocation to GFP_KERNEL
> > 
> > Shouldn't it use (GFP_KERNEL | __GFP_NOMEMALLOC) instead?
> 
> Thanks, things are running well after ~35 minutes so far.
> Will report back if things break (hopefully I don't run out
> of laptop battery power :x).

Oops, I had to restart my test :x.  However, I was able to reproduce the
issue very quickly again with your patch.  I've double-checked I'm
booting into the correct kernel, but I do have more load on this
laptop host now, so maybe that made it happen more quickly...


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09  8:42                       ` Eric Wong
@ 2013-01-09  8:51                         ` Eric Wong
  0 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-09  8:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mel Gorman, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

Eric Wong <normalperson@yhbt.net> wrote:
> Oops, I had to restart my test :x.  However, I was able to reproduce the
> issue very quickly again with your patch.  I've double-checked I'm
> booting into the correct kernel, but I do have more load on this
> laptop host now, so maybe that made it happen more quickly...

Oops, I forgot to include the debugging output.
(Is this information still useful to you guys?)

2724 process stuck!
===> /proc/vmstat <===
nr_free_pages 2401
nr_inactive_anon 3242
nr_active_anon 3044
nr_inactive_file 103091
nr_active_file 4305
nr_unevictable 0
nr_mlock 0
nr_anon_pages 6204
nr_mapped 2332
nr_file_pages 107533
nr_dirty 144
nr_writeback 0
nr_slab_reclaimable 1440
nr_slab_unreclaimable 5202
nr_page_table_pages 773
nr_kernel_stack 167
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 115
nr_dirtied 340718
nr_written 4979
nr_anon_transparent_hugepages 0
nr_free_cma 0
nr_dirty_threshold 22904
nr_dirty_background_threshold 11452
pgpgin 43068
pgpgout 20484
pswpin 0
pswpout 0
pgalloc_dma 57018
pgalloc_dma32 9428296
pgalloc_normal 0
pgalloc_movable 0
pgfree 9488417
pgactivate 5151
pgdeactivate 3251
pgfault 751069
pgmajfault 272
pgrefill_dma 115
pgrefill_dma32 3136
pgrefill_normal 0
pgrefill_movable 0
pgsteal_kswapd_dma 2865
pgsteal_kswapd_dma32 209744
pgsteal_kswapd_normal 0
pgsteal_kswapd_movable 0
pgsteal_direct_dma 568
pgsteal_direct_dma32 31692
pgsteal_direct_normal 0
pgsteal_direct_movable 0
pgscan_kswapd_dma 2865
pgscan_kswapd_dma32 210678
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 568
pgscan_direct_dma32 31760
pgscan_direct_normal 0
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 0
slabs_scanned 0
kswapd_inodesteal 0
kswapd_low_wmark_hit_quickly 666
kswapd_high_wmark_hit_quickly 2
kswapd_skip_congestion_wait 0
pageoutrun 3135
allocstall 566
pgrotated 2
pgmigrate_success 348
pgmigrate_fail 0
compact_migrate_scanned 335538
compact_free_scanned 144705
compact_isolated 11328
compact_stall 451
compact_fail 279
compact_success 172
unevictable_pgs_culled 1064
unevictable_pgs_scanned 0
unevictable_pgs_rescued 1632
unevictable_pgs_mlocked 1632
unevictable_pgs_munlocked 1632
unevictable_pgs_cleared 0
unevictable_pgs_stranded 0
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
===> 2724[2724]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2725]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2726]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2727]/stack <===
[<ffffffff81310078>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134be87>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300a77>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a59>] sys_sendto+0x119/0x160
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2728]/stack <===
[<ffffffff810400b8>] do_wait+0x1f8/0x220
[<ffffffff81040ea0>] sys_wait4+0x70/0xf0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2773]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2774]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2775]/stack <===
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2776]/stack <===
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2777]/stack <===
[<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150
[<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd:  72
CPU    1: hi:  186, btch:  31 usd: 100
active_anon:3130 inactive_anon:3283 isolated_anon:0
 active_file:4305 inactive_file:101390 isolated_file:0
 unevictable:0 dirty:103 writeback:0 unstable:0
 free:3675 slab_reclaimable:1453 slab_unreclaimable:5186
 mapped:2332 shmem:115 pagetables:754 bounce:0
 free_cma:0
DMA free:2116kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:16kB active_file:0kB inactive_file:13692kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:4kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:12584kB min:2784kB low:3480kB high:4176kB active_anon:12520kB inactive_anon:13116kB active_file:17220kB inactive_file:391868kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:420kB writeback:0kB mapped:9328kB shmem:460kB slab_reclaimable:5812kB slab_unreclaimable:20740kB kernel_stack:1336kB pagetables:3016kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 4*4kB (UMR) 1*8kB (M) 1*16kB (M) 3*32kB (MR) 1*64kB (R) 1*128kB (R) 1*256kB (R) 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2120kB
DMA32: 411*4kB (UEM) 359*8kB (UEM) 207*16kB (UM) 84*32kB (UM) 25*64kB (UM) 4*128kB (M) 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 12628kB
105835 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
276721 pages shared
117464 pages non-shared

2773 process stuck!
===> /proc/vmstat <===
nr_free_pages 1579
nr_inactive_anon 3302
nr_active_anon 3078
nr_inactive_file 103991
nr_active_file 4357
nr_unevictable 0
nr_mlock 0
nr_anon_pages 6260
nr_mapped 2319
nr_file_pages 108478
nr_dirty 648
nr_writeback 0
nr_slab_reclaimable 1603
nr_slab_unreclaimable 5380
nr_page_table_pages 748
nr_kernel_stack 171
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 115
nr_dirtied 841467
nr_written 15931
nr_anon_transparent_hugepages 0
nr_free_cma 0
nr_dirty_threshold 22949
nr_dirty_background_threshold 11474
pgpgin 43832
pgpgout 64464
pswpin 0
pswpout 0
pgalloc_dma 105241
pgalloc_dma32 12655633
pgalloc_normal 0
pgalloc_movable 0
pgfree 12763390
pgactivate 5358
pgdeactivate 3607
pgfault 1011343
pgmajfault 302
pgrefill_dma 407
pgrefill_dma32 3200
pgrefill_normal 0
pgrefill_movable 0
pgsteal_kswapd_dma 10785
pgsteal_kswapd_dma32 612431
pgsteal_kswapd_normal 0
pgsteal_kswapd_movable 0
pgsteal_direct_dma 2159
pgsteal_direct_dma32 120797
pgsteal_direct_normal 0
pgsteal_direct_movable 0
pgscan_kswapd_dma 10785
pgscan_kswapd_dma32 613376
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 2159
pgscan_direct_dma32 120866
pgscan_direct_normal 0
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 0
slabs_scanned 3072
kswapd_inodesteal 0
kswapd_low_wmark_hit_quickly 1810
kswapd_high_wmark_hit_quickly 13
kswapd_skip_congestion_wait 0
pageoutrun 9178
allocstall 2157
pgrotated 2
pgmigrate_success 509
pgmigrate_fail 0
compact_migrate_scanned 818935
compact_free_scanned 214217
compact_isolated 27006
compact_stall 1014
compact_fail 674
compact_success 340
unevictable_pgs_culled 1064
unevictable_pgs_scanned 0
unevictable_pgs_rescued 1632
unevictable_pgs_mlocked 1632
unevictable_pgs_munlocked 1632
unevictable_pgs_cleared 0
unevictable_pgs_stranded 0
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0
===> 2724[2724]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2725]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2726]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2727]/stack <===
[<ffffffff81310078>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134be87>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300a77>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a59>] sys_sendto+0x119/0x160
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2724[2728]/stack <===
[<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150
[<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2773]/stack <===
[<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0
[<ffffffff81077a9d>] futex_wait+0x17d/0x280
[<ffffffff8107988c>] do_futex+0x11c/0xae0
[<ffffffff8107a2d8>] sys_futex+0x88/0x180
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2774]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2775]/stack <===
[<ffffffff810f5904>] poll_schedule_timeout+0x44/0x60
[<ffffffff810f6d54>] do_sys_poll+0x374/0x4b0
[<ffffffff810f718e>] sys_ppoll+0x19e/0x1b0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2776]/stack <===
[<ffffffff81310078>] sk_stream_wait_memory+0x1b8/0x250
[<ffffffff8134be87>] tcp_sendmsg+0x697/0xd80
[<ffffffff81370cee>] inet_sendmsg+0x5e/0xa0
[<ffffffff81300a77>] sock_sendmsg+0x87/0xa0
[<ffffffff81303a59>] sys_sendto+0x119/0x160
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
===> 2773[2777]/stack <===
[<ffffffff810400b8>] do_wait+0x1f8/0x220
[<ffffffff81040ea0>] sys_wait4+0x70/0xf0
[<ffffffff813b06e9>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
<redundant "SysRq : Show Memory" from previous process omitted>
SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
CPU    1: hi:    0, btch:   1 usd:   0
DMA32 per-cpu:
CPU    0: hi:  186, btch:  31 usd: 164
CPU    1: hi:  186, btch:  31 usd: 117
active_anon:3016 inactive_anon:3281 isolated_anon:0
 active_file:4357 inactive_file:104163 isolated_file:0
 unevictable:0 dirty:142 writeback:0 unstable:0
 free:1582 slab_reclaimable:1598 slab_unreclaimable:5380
 mapped:2316 shmem:115 pagetables:773 bounce:0
 free_cma:0
DMA free:2332kB min:84kB low:104kB high:124kB active_anon:8kB inactive_anon:8kB active_file:0kB inactive_file:13476kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:12kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:12kB kernel_stack:0kB pagetables:8kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 488 488 488
DMA32 free:3996kB min:2784kB low:3480kB high:4176kB active_anon:12056kB inactive_anon:13116kB active_file:17428kB inactive_file:403176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:556kB writeback:0kB mapped:9264kB shmem:460kB slab_reclaimable:6392kB slab_unreclaimable:21508kB kernel_stack:1360kB pagetables:3084kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0 0
DMA: 19*4kB (UER) 20*8kB (U) 7*16kB (U) 2*32kB (R) 0*64kB 3*128kB (R) 0*256kB 1*512kB (R) 1*1024kB (R) 0*2048kB 0*4096kB = 2332kB
DMA32: 151*4kB (UEM) 210*8kB (UE) 108*16kB (U) 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 4012kB
108629 total pagecache pages
0 pages in swap cache
Swap cache stats: add 0, delete 0, find 0/0
Free swap  = 392188kB
Total swap = 392188kB
131054 pages RAM
3820 pages reserved
275952 pages shared
119896 pages non-shared


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-08 23:23             ` Eric Wong
  2013-01-09  2:14               ` Eric Dumazet
@ 2013-01-09 13:37               ` Mel Gorman
  2013-01-09 13:50                 ` Mel Gorman
  2013-01-10  9:25                 ` Eric Wong
  1 sibling, 2 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-09 13:37 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

On Tue, Jan 08, 2013 at 11:23:25PM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > Please try the following patch. However, even if it works the benefit of
> > capture may be so marginal that partially reverting it and simplifying
> > compaction.c is the better decision.
> 
> I already got my VM stuck on this one.  I had two toosleepy instances;
> 2774 was the one that got stuck (also confirmed by watching top).
> 

page->pfmemalloc can be left set for captured pages, so try this; but as
capture is rarely used, I'm strongly favouring a partial revert even if
this works for you. I haven't reproduced this using your workload yet,
but I have found that high-order allocation stress tests for 3.8-rc2 are
completely screwed: 71% success rates at rest in 3.7 versus 6% in 3.8-rc2,
so I have to chase that down too.

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d20c13..c242d21 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2180,8 +2180,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags &= ~PF_MEMALLOC;
 
 	/* If compaction captured a page, prep and use it */
-	if (page && !prep_new_page(page, order, gfp_mask))
+	if (page && !prep_new_page(page, order, gfp_mask)) {
+		page->pfmemalloc = false;
 		goto got_page;
+	}
 
 	if (*did_some_progress != COMPACT_SKIPPED) {
 		/* Page migration frees to the PCP lists but we want merging */

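To tie this back to the networking symptom: a stale page->pfmemalloc
propagates into skb->pfmemalloc when the page is attached to a receive
skb, and from there into the sk_filter() -ENOMEM discussed earlier.  A
simplified sketch of the propagation (the real plumbing goes through
__skb_fill_page_desc() and friends, so treat this as approximate):

/* Simplified: how a leaked page->pfmemalloc becomes a dropped packet */
static void attach_page_to_skb(struct sk_buff *skb, struct page *page)
{
	if (page->pfmemalloc)		/* stale if capture forgot to clear it */
		skb->pfmemalloc = true;	/* sk_filter() will then return -ENOMEM
					 * for any non-SOCK_MEMALLOC socket */
}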

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09  2:32                 ` Eric Dumazet
  2013-01-09  2:54                   ` Eric Dumazet
@ 2013-01-09 13:42                   ` Mel Gorman
  1 sibling, 0 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-09 13:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Tue, Jan 08, 2013 at 06:32:29PM -0800, Eric Dumazet wrote:
> On Tue, 2013-01-08 at 18:14 -0800, Eric Dumazet wrote:
> > On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote:
> > > Mel Gorman <mgorman@suse.de> wrote:
> > > > Please try the following patch. However, even if it works the benefit of
> > > > capture may be so marginal that partially reverting it and simplifying
> > > > compaction.c is the better decision.
> > > 
> > > I already got my VM stuck on this one.  I had two toosleepy instances;
> > > 2774 was the one that got stuck (also confirmed by watching top).
> > > 
> > > Btw, have you been able to reproduce this on your end?
> > > 
> > > I think the easiest reproduction on my 2-core VM is by running 2
> > toosleepy processes and doing the following to dirty a lot of pages:
> > 
> > Given the persistent sk_stream_wait_memory() traces, I suspect a plain
> > TCP bug, triggered by some extra wait somewhere.
> > 
> > Please don't spend too much time on this right now, mm guys; I'll try to
> > reproduce the problem.
> > 
> > Don't be confused by the sk_stream_wait_memory() name.
> > A thread is stuck here because the TCP stack is failing to wake it.
> > 
> 
> Hmm, it seems sk_filter() can return -ENOMEM because the skb has
> pfmemalloc set.
> 

The skb should not have pfmemalloc set in most cases, particularly after
cfd19c5a (mm: only set page->pfmemalloc when ALLOC_NO_WATERMARKS was used),
but the capture patch also failed to clear pfmemalloc properly, so it could
be set in error.

-- 
Mel Gorman
SUSE Labs

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09 13:37               ` Mel Gorman
@ 2013-01-09 13:50                 ` Mel Gorman
  2013-01-10  9:25                 ` Eric Wong
  1 sibling, 0 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-09 13:50 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

On Wed, Jan 09, 2013 at 01:37:46PM +0000, Mel Gorman wrote:
> On Tue, Jan 08, 2013 at 11:23:25PM +0000, Eric Wong wrote:
> > Mel Gorman <mgorman@suse.de> wrote:
> > > Please try the following patch. However, even if it works the benefit of
> > > capture may be so marginal that partially reverting it and simplifying
> > > compaction.c is the better decision.
> > 
> > I already got my VM stuck on this one.  I had two toosleepy instances,
> > 2774 was the one that got stuck (also confirmed by watching top).
> > 
> 
> page->pfmemalloc can be left set for captured pages so try this but as
> capture is rarely used I'm strongly favouring a partial revert even if
> this works for you.

Partial revert looks like this

---8<---

---
 include/linux/compaction.h |    4 +-
 include/linux/mm.h         |    1 -
 mm/compaction.c            |   91 ++++++--------------------------------------
 mm/internal.h              |    1 -
 mm/page_alloc.c            |   38 +++++-------------
 5 files changed, 23 insertions(+), 112 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 6ecb6dc..cc7bdde 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended, struct page **page);
+			bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6320407..66e2f7c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
-int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function.  Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e4..8bc3066 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -850,22 +851,15 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[cc->order];
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
 			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
-
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -921,60 +915,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	return COMPACT_CONTINUE;
 }
 
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-									flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
-}
-
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
@@ -1054,9 +994,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
@@ -1069,8 +1006,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1080,7 +1016,6 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1045,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1071,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1127,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1137,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index d597f94..9ba2110 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended */
-	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..ebf7fd8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1389,22 +1389,14 @@ void split_page(struct page *page, unsigned int order)
 		set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
 
 	BUG_ON(!PageBuddy(page));
-
 	zone = page_zone(page);
-	order = page_order(page);
 	mt = get_pageblock_migratetype(page);
 
 	if (mt != MIGRATE_ISOLATE) {
@@ -1413,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 			return 0;
 
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
 	/* Remove page from free list */
@@ -1421,11 +1413,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	if (alloc_order != order)
-		expand(zone, page, alloc_order, order,
-			&zone->free_area[order], migratetype);
-
-	/* Set the pageblock if the captured page is at least a pageblock */
+	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
@@ -1436,7 +1424,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		}
 	}
 
-	return 1UL << alloc_order;
+	return 1UL << order;
 }
 
 /*
@@ -1451,13 +1439,12 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
  */
 int split_free_page(struct page *page)
 {
-	unsigned int order;
+	unsigned int order = page_order(page);
 	int nr_pages;
 
-	BUG_ON(!PageBuddy(page));
 	order = page_order(page);
 
-	nr_pages = capture_free_page(page, order, 0);
+	nr_pages = __isolate_free_page(page, order);
 	if (!nr_pages)
 		return 0;
 
@@ -2163,8 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page = NULL;
-
 	if (!order)
 		return NULL;
 
@@ -2176,16 +2161,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction, &page);
+						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
-
 	if (*did_some_progress != COMPACT_SKIPPED) {
+		struct page *page;
+
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
@@ -2195,7 +2176,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;


^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-08 22:43           ` Mel Gorman
  2013-01-08 23:23             ` Eric Wong
@ 2013-01-09 21:29             ` Eric Wong
  1 sibling, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-09 21:29 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> When I looked at it for long enough I found a number of problems. Most
> affect timing but two serious issues are in there. One affects how long
> kswapd spends compacting versus reclaiming and the other increases lock
> contention meaning that async compaction can abort early. Both are serious
> and could explain why a driver would fail high-order allocations.
> 
> Please try the following patch. However, even if it works the benefit of
> capture may be so marginal that partially reverting it and simplifying
> compaction.c is the better decision.

Btw, I'm still testing this patch with the "page->pfmemalloc = false"
change on top of it.

> diff --git a/mm/compaction.c b/mm/compaction.c
> index 6b807e4..03c82c0 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -857,7 +857,8 @@ static int compact_finished(struct zone *zone,
>  	} else {
>  		unsigned int order;
>  		for (order = cc->order; order < MAX_ORDER; order++) {
> -			struct free_area *area = &zone->free_area[cc->order];
> +			struct free_area *area = &zone->free_area[order];

I noticed something like this hunk wasn't in your latest partial revert
(<20130109135010.GB13475@suse.de>).  I admit I don't understand this
code, but this jumped out at me.
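
If I'm reading the hunk right: with free_area indexed by cc->order,
every iteration of the loop re-examines the same order instead of
walking up to MAX_ORDER.  With the fix applied, the loop reads:

	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Job done if allocation would set block type */
		if (cc->order >= pageblock_order && area->nr_free)
			return COMPACT_PARTIAL;
	}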


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-09 13:37               ` Mel Gorman
  2013-01-09 13:50                 ` Mel Gorman
@ 2013-01-10  9:25                 ` Eric Wong
  2013-01-10 19:42                   ` Mel Gorman
  1 sibling, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-10  9:25 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> page->pfmemalloc can be left set for captured pages so try this but as
> capture is rarely used I'm strongly favouring a partial revert even if
> this works for you. I haven't reproduced this using your workload yet
> but I have found that high-order allocation stress tests for 3.8-rc2 are
> completely screwed. 71% success rates at rest in 3.7 and 6% in 3.8-rc2 so
> I have to chase that down too.
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 9d20c13..c242d21 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2180,8 +2180,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
>  	current->flags &= ~PF_MEMALLOC;
>  
>  	/* If compaction captured a page, prep and use it */
> -	if (page && !prep_new_page(page, order, gfp_mask))
> +	if (page && !prep_new_page(page, order, gfp_mask)) {
> +		page->pfmemalloc = false;
>  		goto got_page;
> +	}
>  
>  	if (*did_some_progress != COMPACT_SKIPPED) {
>  		/* Page migration frees to the PCP lists but we want merging */

This (on top of your previous patch) seems to work great after several
hours of testing on both my VM and real machine.  I haven't tried your
partial revert, yet.  Will try that in a bit on the VM.


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10  9:25                 ` Eric Wong
@ 2013-01-10 19:42                   ` Mel Gorman
  2013-01-10 20:03                     ` Eric Wong
                                       ` (2 more replies)
  0 siblings, 3 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-10 19:42 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

On Thu, Jan 10, 2013 at 09:25:11AM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > page->pfmemalloc can be left set for captured pages so try this but as
> > capture is rarely used I'm strongly favouring a partial revert even if
> > this works for you. I haven't reproduced this using your workload yet
> > but I have found that high-order allocation stress tests for 3.8-rc2 are
> > completely screwed. 71% success rates at rest in 3.7 and 6% in 3.8-rc2 so
> > I have to chase that down too.
> > 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 9d20c13..c242d21 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -2180,8 +2180,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
> >  	current->flags &= ~PF_MEMALLOC;
> >  
> >  	/* If compaction captured a page, prep and use it */
> > -	if (page && !prep_new_page(page, order, gfp_mask))
> > +	if (page && !prep_new_page(page, order, gfp_mask)) {
> > +		page->pfmemalloc = false;
> >  		goto got_page;
> > +	}
> >  
> >  	if (*did_some_progress != COMPACT_SKIPPED) {
> >  		/* Page migration frees to the PCP lists but we want merging */
> 
> This (on top of your previous patch) seems to work great after several
> hours of testing on both my VM and real machine.  I haven't tried your
> partial revert, yet.  Will try that in a bit on the VM.

Thanks Eric, it's much appreciated. However, I'm still very much in favour
of a partial revert as in retrospect the implementation of capture took the
wrong approach. Could you confirm the following patch works for you?
It should functionally have the same effect as the first revert, and
there are only minor changes from the last revert prototype I sent you
but there is no harm in being sure.

---8<---
mm: compaction: Partially revert capture of suitable high-order page

Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting
for POLLIN on a local TCP socket. It was easier to trigger if there were disk
IO and dirty pages at the same time, and he bisected it to commit 1fb3f8ca
"mm: compaction: capture a suitable high-order page immediately when it
is made available".

The intention of that patch was to improve high-order allocations under
memory pressure after changes made to reclaim in 3.6 drastically hurt
THP allocations but the approach was flawed. For Eric, the problem was
that page->pfmemalloc was not being cleared for captured pages leading to
a poor interaction with swap-over-NFS support causing the packets to be
dropped. However, I identified a few more problems with the patch including
the fact that it can increase contention on zone->lock in some cases which
could result in async direct compaction being aborted early.

In retrospect the capture patch took the wrong approach. What it should
have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
was allocating for THP and avoided races that way. While the patch was
shown to improve allocation success rates at the time, the benefit is
marginal given the relative complexity and it should be revisited from
scratch in the context of the other reclaim-related changes that have taken
place since the patch was first written and tested. This patch partially
reverts commit 1fb3f8ca "mm: compaction: capture a suitable high-order
page immediately when it is made available".

Reported-by: Eric Wong <normalperson@yhbt.net>
Cc: stable@vger.kernel.org
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 include/linux/compaction.h |    4 +-
 include/linux/mm.h         |    1 -
 mm/compaction.c            |   92 +++++++-------------------------------------
 mm/internal.h              |    1 -
 mm/page_alloc.c            |   35 ++++-------------
 5 files changed, 23 insertions(+), 110 deletions(-)

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 6ecb6dc..cc7bdde 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended, struct page **page);
+			bool sync, bool *contended);
 extern int compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -75,7 +75,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6320407..66e2f7c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -455,7 +455,6 @@ void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
 int split_free_page(struct page *page);
-int capture_free_page(struct page *page, int alloc_order, int migratetype);
 
 /*
  * Compound pages have a destructor function.  Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index 6b807e4..2c57043 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -816,6 +816,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
+	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
@@ -850,22 +851,16 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	if (cc->page) {
-		/* Was a suitable page captured? */
-		if (*cc->page)
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &zone->free_area[order];
+
+		/* Job done if page is free of the right migratetype */
+		if (!list_empty(&area->free_list[cc->migratetype]))
+			return COMPACT_PARTIAL;
+
+		/* Job done if allocation would set block type */
+		if (cc->order >= pageblock_order && area->nr_free)
 			return COMPACT_PARTIAL;
-	} else {
-		unsigned int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct free_area *area = &zone->free_area[cc->order];
-			/* Job done if page is free of the right migratetype */
-			if (!list_empty(&area->free_list[cc->migratetype]))
-				return COMPACT_PARTIAL;
-
-			/* Job done if allocation would set block type */
-			if (cc->order >= pageblock_order && area->nr_free)
-				return COMPACT_PARTIAL;
-		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -921,60 +916,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	return COMPACT_CONTINUE;
 }
 
-static void compact_capture_page(struct compact_control *cc)
-{
-	unsigned long flags;
-	int mtype, mtype_low, mtype_high;
-
-	if (!cc->page || *cc->page)
-		return;
-
-	/*
-	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
-	 * regardless of the migratetype of the freelist is is captured from.
-	 * This is fine because the order for a high-order MIGRATE_MOVABLE
-	 * allocation is typically at least a pageblock size and overall
-	 * fragmentation is not impaired. Other allocation types must
-	 * capture pages from their own migratelist because otherwise they
-	 * could pollute other pageblocks like MIGRATE_MOVABLE with
-	 * difficult to move pages and making fragmentation worse overall.
-	 */
-	if (cc->migratetype == MIGRATE_MOVABLE) {
-		mtype_low = 0;
-		mtype_high = MIGRATE_PCPTYPES;
-	} else {
-		mtype_low = cc->migratetype;
-		mtype_high = cc->migratetype + 1;
-	}
-
-	/* Speculatively examine the free lists without zone lock */
-	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
-		int order;
-		for (order = cc->order; order < MAX_ORDER; order++) {
-			struct page *page;
-			struct free_area *area;
-			area = &(cc->zone->free_area[order]);
-			if (list_empty(&area->free_list[mtype]))
-				continue;
-
-			/* Take the lock and attempt capture of the page */
-			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
-				return;
-			if (!list_empty(&area->free_list[mtype])) {
-				page = list_entry(area->free_list[mtype].next,
-							struct page, lru);
-				if (capture_free_page(page, cc->order, mtype)) {
-					spin_unlock_irqrestore(&cc->zone->lock,
-									flags);
-					*cc->page = page;
-					return;
-				}
-			}
-			spin_unlock_irqrestore(&cc->zone->lock, flags);
-		}
-	}
-}
-
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
@@ -1054,9 +995,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
-
-		/* Capture a page now if it is a suitable size */
-		compact_capture_page(cc);
 	}
 
 out:
@@ -1069,8 +1007,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
-				 bool sync, bool *contended,
-				 struct page **page)
+				 bool sync, bool *contended)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1080,7 +1017,6 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -1110,7 +1046,7 @@ int sysctl_extfrag_threshold = 500;
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended, struct page **page)
+			bool sync, bool *contended)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1136,7 +1072,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended, page);
+						contended);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
@@ -1192,7 +1128,6 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -1203,7 +1138,6 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
-		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/internal.h b/mm/internal.h
index d597f94..9ba2110 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,7 +135,6 @@ struct compact_control {
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
 	bool contended;			/* True if a lock was contended */
-	struct page **page;		/* Page captured of requested size */
 };
 
 unsigned long
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ba5e37..7e4ae85 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1389,14 +1389,8 @@ void split_page(struct page *page, unsigned int order)
 		set_page_refcounted(page + i);
 }
 
-/*
- * Similar to the split_page family of functions except that the page
- * required at the given order and being isolated now to prevent races
- * with parallel allocators
- */
-int capture_free_page(struct page *page, int alloc_order, int migratetype)
+static int __isolate_free_page(struct page *page, unsigned int order)
 {
-	unsigned int order;
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
@@ -1404,7 +1398,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	BUG_ON(!PageBuddy(page));
 
 	zone = page_zone(page);
-	order = page_order(page);
 	mt = get_pageblock_migratetype(page);
 
 	if (mt != MIGRATE_ISOLATE) {
@@ -1413,7 +1406,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
 			return 0;
 
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
 	/* Remove page from free list */
@@ -1421,11 +1414,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	if (alloc_order != order)
-		expand(zone, page, alloc_order, order,
-			&zone->free_area[order], migratetype);
-
-	/* Set the pageblock if the captured page is at least a pageblock */
+	/* Set the pageblock if the isolated page is at least a pageblock */
 	if (order >= pageblock_order - 1) {
 		struct page *endpage = page + (1 << order) - 1;
 		for (; page < endpage; page += pageblock_nr_pages) {
@@ -1436,7 +1425,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 		}
 	}
 
-	return 1UL << alloc_order;
+	return 1UL << order;
 }
 
 /*
@@ -1454,10 +1443,9 @@ int split_free_page(struct page *page)
 	unsigned int order;
 	int nr_pages;
 
-	BUG_ON(!PageBuddy(page));
 	order = page_order(page);
 
-	nr_pages = capture_free_page(page, order, 0);
+	nr_pages = __isolate_free_page(page, order);
 	if (!nr_pages)
 		return 0;
 
@@ -2163,8 +2151,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	bool *contended_compaction, bool *deferred_compaction,
 	unsigned long *did_some_progress)
 {
-	struct page *page = NULL;
-
 	if (!order)
 		return NULL;
 
@@ -2176,16 +2162,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration,
-						contended_compaction, &page);
+						contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	/* If compaction captured a page, prep and use it */
-	if (page) {
-		prep_new_page(page, order, gfp_mask);
-		goto got_page;
-	}
-
 	if (*did_some_progress != COMPACT_SKIPPED) {
+		struct page *page;
+
 		/* Page migration frees to the PCP lists but we want merging */
 		drain_pages(get_cpu());
 		put_cpu();
@@ -2195,7 +2177,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS,
 				preferred_zone, migratetype);
 		if (page) {
-got_page:
 			preferred_zone->compact_blockskip_flush = false;
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;

^ permalink raw reply related	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 19:42                   ` Mel Gorman
@ 2013-01-10 20:03                     ` Eric Wong
  2013-01-10 20:58                     ` Eric Dumazet
  2013-01-11  0:51                     ` Eric Wong
  2 siblings, 0 replies; 53+ messages in thread
From: Eric Wong @ 2013-01-10 20:03 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> Thanks Eric, it's much appreciated. However, I'm still very much in favour
> of a partial revert as in retrospect the implementation of capture took the
> wrong approach. Could you confirm the following patch works for you?
> It should functionally have the same effect as the first revert, and
> there are only minor changes from the last revert prototype I sent you
> but there is no harm in being sure.

Thanks, I was just about to report back on the last partial revert
being successful :)  Will start testing this one now.

^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 19:42                   ` Mel Gorman
  2013-01-10 20:03                     ` Eric Wong
@ 2013-01-10 20:58                     ` Eric Dumazet
  2013-01-11  0:51                     ` Eric Wong
  2 siblings, 0 replies; 53+ messages in thread
From: Eric Dumazet @ 2013-01-10 20:58 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Eric Wong, linux-mm, netdev, linux-kernel, Rik van Riel,
	Minchan Kim, Andrew Morton, Linus Torvalds

On Thu, 2013-01-10 at 19:42 +0000, Mel Gorman wrote:

> Thanks Eric, it's much appreciated. However, I'm still very much in favour
> of a partial revert as in retrospect the implementation of capture took the
> wrong approach. Could you confirm the following patch works for you?
> It should functionally have the same effect as the first revert, and
> there are only minor changes from the last revert prototype I sent you
> but there is no harm in being sure.
> 
> ---8<---
> mm: compaction: Partially revert capture of suitable high-order page
> 
> Eric Wong reported on 3.7 and 3.8-rc2 that ppoll() got stuck when waiting
> for POLLIN on a local TCP socket. It was easier to trigger if there were disk
> IO and dirty pages at the same time, and he bisected it to commit 1fb3f8ca
> "mm: compaction: capture a suitable high-order page immediately when it
> is made available".
> 
> The intention of that patch was to improve high-order allocations under
> memory pressure after changes made to reclaim in 3.6 drastically hurt
> THP allocations but the approach was flawed. For Eric, the problem was
> that page->pfmemalloc was not being cleared for captured pages leading to
> a poor interaction with swap-over-NFS support causing the packets to be
> dropped. However, I identified a few more problems with the patch including
> the fact that it can increase contention on zone->lock in some cases which
> could result in async direct compaction being aborted early.
> 
> In retrospect the capture patch took the wrong approach. What it should
> have done is mark the pageblock being migrated as MIGRATE_ISOLATE if it
> was allocating for THP and avoided races that way. While the patch was
> shown to improve allocation success rates at the time, the benefit is
> marginal given the relative complexity and it should be revisited from
> scratch in the context of the other reclaim-related changes that have taken
> place since the patch was first written and tested. This patch partially
> reverts commit 1fb3f8ca "mm: compaction: capture a suitable high-order
> page immediately when it is made available".
> 
> Reported-by: Eric Wong <normalperson@yhbt.net>
> Cc: stable@vger.kernel.org
> Signed-off-by: Mel Gorman <mgorman@suse.de>
> ---

It seems to solve the problem on my kvm testbed

(512 MB of ram, 2 vcpus)

Tested-by: Eric Dumazet <edumazet@google.com>



^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-10 19:42                   ` Mel Gorman
  2013-01-10 20:03                     ` Eric Wong
  2013-01-10 20:58                     ` Eric Dumazet
@ 2013-01-11  0:51                     ` Eric Wong
  2013-01-11  9:30                       ` Mel Gorman
  2 siblings, 1 reply; 53+ messages in thread
From: Eric Wong @ 2013-01-11  0:51 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

Mel Gorman <mgorman@suse.de> wrote:
> mm: compaction: Partially revert capture of suitable high-order page

<snip>
 
> Reported-by: Eric Wong <normalperson@yhbt.net>
> Cc: stable@vger.kernel.org
> Signed-off-by: Mel Gorman <mgorman@suse.de>

Thanks, my original use case and test works great after several hours!

Tested-by: Eric Wong <normalperson@yhbt.net>


Unfortunately, I also hit a new bug in 3.8 (not in 3.7.x).  Based on Eric
Dumazet's observations, sk_stream_wait_memory() may be to blame.
Fortunately this is easier to reproduce (I've cc-ed participants
on this thread already): <20130111004915.GA15415@dcvr.yhbt.net>


^ permalink raw reply	[flat|nested] 53+ messages in thread

* Re: ppoll() stuck on POLLIN while TCP peer is sending
  2013-01-11  0:51                     ` Eric Wong
@ 2013-01-11  9:30                       ` Mel Gorman
  0 siblings, 0 replies; 53+ messages in thread
From: Mel Gorman @ 2013-01-11  9:30 UTC (permalink / raw)
  To: Eric Wong
  Cc: linux-mm, netdev, linux-kernel, Rik van Riel, Minchan Kim,
	Eric Dumazet, Andrew Morton, Linus Torvalds

On Fri, Jan 11, 2013 at 12:51:05AM +0000, Eric Wong wrote:
> Mel Gorman <mgorman@suse.de> wrote:
> > mm: compaction: Partially revert capture of suitable high-order page
> 
> <snip>
>  
> > Reported-by: Eric Wong <normalperson@yhbt.net>
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Mel Gorman <mgorman@suse.de>
> 
> Thanks, my original use case and test works great after several hours!
> 
> Tested-by: Eric Wong <normalperson@yhbt.net>
> 

Thanks very much Eric. I've resent the patch to Andrew so it should make
its way to mainline. It'll fail to apply to 3.7-stable but I should get
a notification from Greg when that happens and fix it up.

> Unfortunately, I also hit a new bug in 3.8 (not in 3.7.x).  Based on Eric
> Dumazet's observations, sk_stream_wait_memory() may be to blame.
> Fortunately this is easier to reproduce (I've cc-ed participants
> on this thread already): <20130111004915.GA15415@dcvr.yhbt.net>

It looks like the relevant fix for this has already been written by Eric
Dumazet and picked up by David Miller.

-- 
Mel Gorman
SUSE Labs


^ permalink raw reply	[flat|nested] 53+ messages in thread

end of thread, other threads:[~2013-01-11  9:30 UTC | newest]

Thread overview: 53+ messages
2012-12-28  1:45 ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
2012-12-28  7:06 ` Eric Wong
2012-12-29 11:34   ` Eric Wong
2012-12-31 13:21 ` [PATCH] poll: prevent missed events if _qproc is NULL Eric Wong
2012-12-31 23:24   ` Eric Wong
2013-01-01 16:58     ` Junchang(Jason) Wang
2013-01-01 18:42   ` Eric Dumazet
2013-01-01 21:00     ` Eric Wong
2013-01-01 21:17       ` Eric Wong
2013-01-01 22:53         ` Linus Torvalds
2013-01-01 23:21           ` Junchang(Jason) Wang
2013-01-01 23:56           ` [PATCH] epoll: prevent missed events on EPOLL_CTL_MOD Eric Wong
2013-01-02 17:45             ` Eric Dumazet
2013-01-02 18:40               ` Eric Wong
2013-01-02 19:03                 ` Eric Dumazet
2013-01-02 19:32                   ` Eric Wong
2013-01-02 22:08                     ` Eric Dumazet
2013-01-02 21:16             ` Eric Wong
2013-01-02 20:08 ` ppoll() stuck on POLLIN while TCP peer is sending Eric Wong
2013-01-02 20:47   ` Eric Wong
2013-01-03 13:41     ` Eric Dumazet
2013-01-03 18:32       ` Eric Wong
2013-01-03 23:45         ` Eric Wong
2013-01-04  0:26           ` Eric Wong
2013-01-04  3:52             ` Eric Wong
2013-01-04 16:01   ` Mel Gorman
2013-01-04 17:15     ` Eric Dumazet
2013-01-04 17:59     ` Eric Wong
2013-01-05  1:07     ` Eric Wong
2013-01-06 12:07     ` Eric Wong
2013-01-07 12:25       ` Mel Gorman
2013-01-07 22:38         ` Eric Dumazet
2013-01-08  0:21           ` Eric Wong
2013-01-07 22:38         ` Eric Wong
2013-01-08 20:14           ` Eric Wong
2013-01-08 22:43           ` Mel Gorman
2013-01-08 23:23             ` Eric Wong
2013-01-09  2:14               ` Eric Dumazet
2013-01-09  2:32                 ` Eric Dumazet
2013-01-09  2:54                   ` Eric Dumazet
2013-01-09  3:55                     ` Eric Wong
2013-01-09  8:42                       ` Eric Wong
2013-01-09  8:51                         ` Eric Wong
2013-01-09 13:42                   ` Mel Gorman
2013-01-09 13:37               ` Mel Gorman
2013-01-09 13:50                 ` Mel Gorman
2013-01-10  9:25                 ` Eric Wong
2013-01-10 19:42                   ` Mel Gorman
2013-01-10 20:03                     ` Eric Wong
2013-01-10 20:58                     ` Eric Dumazet
2013-01-11  0:51                     ` Eric Wong
2013-01-11  9:30                       ` Mel Gorman
2013-01-09 21:29             ` Eric Wong
