All of lore.kernel.org
 help / color / mirror / Atom feed
* Lots of connections led oxenstored stuck
@ 2014-08-08  7:01 Joe Jin
  2014-08-08  8:35 ` Liuqiming (John)
  0 siblings, 1 reply; 12+ messages in thread
From: Joe Jin @ 2014-08-08  7:01 UTC (permalink / raw)
  To: David Scott, Luis R. Rodriguez, Ian Jackson; +Cc: xen-devel

Hi,

During internal testing on Xen-4.3-stable we found that sometimes, after
Xen was restarted, oxenstored got stuck and did not respond to any
request; xenstored.log filled up with the following:
[20140702T21:00:41.564Z|error|xenstored] caught exception Unix.Unix_error(15, "accept", "")

I created a reproducer which creates 2000 connections to oxenstored. After
running the reproducer, "xm list --long" gets stuck and oxenstored does not
respond anymore; the same test case passes when using (C) xenstored. Any
input would be appreciated!

/* 
 * This program used to test oxenstored connections stuck issue.
 * please compile by below command:
 *	gcc -o client client.c -lpthread
 */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <errno.h>


/*
 * Worker thread: open one UNIX-domain stream connection to oxenstored
 * and hold it open forever (the reproducer relies on the connections
 * staying established until the process exits).
 *
 * arg: heap-allocated int holding the connection index; this thread
 * takes ownership and frees it once the index has been copied out.
 * Returns NULL (only reachable on the failure paths).
 */
void *main_thread(void *arg)
{
	struct sockaddr_un address;
	int socket_fd;
	int i;

	memcpy(&i, arg, sizeof(i));
	free(arg);	/* ownership transferred from main(); copied, so release now */

	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
	if (socket_fd < 0) {
		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
		return NULL;
	}
	fprintf(stderr, "socket() %dth ok!\n", i);

	/* start with a clean address structure */
	memset(&address, 0, sizeof(struct sockaddr_un));

	address.sun_family = AF_UNIX;
	/* bound by the real sun_path size (~108 bytes), not a fictitious 1024 */
	snprintf(address.sun_path, sizeof(address.sun_path),
		 "%s", "/var/run/xenstored/socket");

	if (connect(socket_fd,
		    (struct sockaddr *) &address,
		    sizeof(struct sockaddr_un)) != 0) {
		fprintf(stderr, "connect() %dth failed, errno=%d\n", i, errno);
		close(socket_fd);	/* don't leak the descriptor on failure */
		return NULL;
	}
	fprintf(stderr, "connect() %dth ok!\n", i);

	/* Hold the connection open until the whole process exits. */
	while (1)
		sleep(1);

	return NULL;	/* not reached */
}

/*
 * Spawn 2000 threads, each opening one connection to oxenstored, to
 * exercise the >1024-fd select() limit. Exiting main() after the grace
 * period tears down every thread and its connection.
 */
int main(void)
{
	int i;

	for (i = 0; i < 2000; i++) {
		pthread_t thread;
		int *arg = malloc(sizeof(*arg));

		if (arg == NULL) {
			perror("malloc");
			break;
		}
		*arg = i;	/* worker frees arg once it has copied the index */
		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
			perror("pthread_create");
			free(arg);	/* thread never started; reclaim here */
			break;
		}
	}
	/* Give the workers time to connect before process exit kills them. */
	sleep(3);
	return 0;
}
/* end */

Thanks,
Joe

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-08  7:01 Lots of connections led oxenstored stuck Joe Jin
@ 2014-08-08  8:35 ` Liuqiming (John)
  2014-08-08  9:37   ` Dave Scott
  0 siblings, 1 reply; 12+ messages in thread
From: Liuqiming (John) @ 2014-08-08  8:35 UTC (permalink / raw)
  To: Joe Jin, David Scott, Luis R. Rodriguez, Ian Jackson
  Cc: Fanhenglong, Luonengjun, xen-devel

In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections. 

> -----Original Message-----
> From: xen-devel-bounces@lists.xen.org
> [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of Joe Jin
> Sent: Friday, August 08, 2014 3:01 PM
> To: David Scott; Luis R. Rodriguez; Ian Jackson
> Cc: xen-devel
> Subject: [Xen-devel] Lots of connections led oxenstored stuck
> 
> Hi,
> 
> During internal test on Xen-4.3-stable we found sometime when restarted
> Xen, it stuck and does not response any request, xenstored.log filled
> out below stuff:
> [20140702T21:00:41.564Z|error|xenstored] caught exception
> Unix.Unix_error(15, "accept", "")
> 
> I created reproducer which will create 2000 connections to oxenstored,
> after
> ran the reproducer, "xm list --long" will stuck, oxenstored does not
> response anymore, same test case passed when use xenstored, any input
> will appreciate!
> 
> /*
>  * This program used to test oxenstored connections stuck issue.
>  * please compile by below command:
>  *	gcc -o client client.c -lpthread
>  */
> #include <stdio.h>
> #include <sys/socket.h>
> #include <sys/un.h>
> #include <unistd.h>
> #include <string.h>
> #include <pthread.h>
> #include <stdlib.h>
> #include <errno.h>
> 
> 
> void *main_thread(void *arg)
> {
> 	struct sockaddr_un address;
> 	int socket_fd, nbytes;
> 	char buffer[256];
> 	int i;
> 	extern int errno;
> 
> 	memcpy(&i, arg, sizeof(i));
> 	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
> 	if (socket_fd < 0) {
> 		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
> 		return;
> 	}
> 	fprintf(stderr, "socket() %dth ok!\n", i);
> 
> 	/* start with a clean address structure */
> 	memset(&address, 0, sizeof(struct sockaddr_un));
> 
> 	address.sun_family = AF_UNIX;
> 	snprintf(address.sun_path, 1024, "/var/run/xenstored/socket");
> 
> 	if (connect(socket_fd,
> 		    (struct sockaddr *) &address,
> 		    sizeof(struct sockaddr_un)) != 0) {
> 		fprintf(stderr, "connect() %d failed, error=%d", i, errno);
> 		return;
> 	}
> 	fprintf(stderr, "connec() %dth ok!\n", i);
> 
> 	while (1)
> 		sleep(1);
> 	if (arg) {
> 		free(arg);
> 		arg = NULL;
> 	}
> 
> 	return;
> }
> 
> int main(void)
> {
> 	int i;
> 	for (i = 0; i < 2000; i++) {
> 		void *arg = malloc(sizeof(i));
> 		memset(arg, 0, sizeof(i));
> 		memcpy(arg, &i, sizeof(i));
> 		pthread_t thread;
> 		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
> 			perror("pthread_create:");
> 			break;
> 		}
> 	}
> 	/* Wait all children exit */
> 	sleep(3);
> 	return 0;
> }
> /* end */
> 
> Thanks,
> Joe
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-08  8:35 ` Liuqiming (John)
@ 2014-08-08  9:37   ` Dave Scott
  2014-08-11  0:35     ` Joe Jin
  2014-08-11 16:58     ` Zheng Li
  0 siblings, 2 replies; 12+ messages in thread
From: Dave Scott @ 2014-08-08  9:37 UTC (permalink / raw)
  To: Liuqiming (John)
  Cc: Dave Scott, Luonengjun, Luis R. Rodriguez, Joe Jin, Zheng Li,
	xen-devel, Fanhenglong, Ian Jackson


On 8 Aug 2014, at 09:35, Liuqiming (John) <john.liuqiming@huawei.com> wrote:

> In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections. 

That’s true.

In the long term I’d like to use Lwt which internally uses libev and has a more scalable event loop.

In the short term I think Zheng Li (cc:d) may have a prototype patch to work around this issue. Is this right, Zheng?

Cheers,
Dave

> 
>> -----Original Message-----
>> From: xen-devel-bounces@lists.xen.org
>> [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of Joe Jin
>> Sent: Friday, August 08, 2014 3:01 PM
>> To: David Scott; Luis R. Rodriguez; Ian Jackson
>> Cc: xen-devel
>> Subject: [Xen-devel] Lots of connections led oxenstored stuck
>> 
>> Hi,
>> 
>> During internal test on Xen-4.3-stable we found sometime when restarted
>> Xen, it stuck and does not response any request, xenstored.log filled
>> out below stuff:
>> [20140702T21:00:41.564Z|error|xenstored] caught exception
>> Unix.Unix_error(15, "accept", "")
>> 
>> I created reproducer which will create 2000 connections to oxenstored,
>> after
>> ran the reproducer, "xm list --long" will stuck, oxenstored does not
>> response anymore, same test case passed when use xenstored, any input
>> will appreciate!
>> 
>> /*
>> * This program used to test oxenstored connections stuck issue.
>> * please compile by below command:
>> *	gcc -o client client.c -lpthread
>> */
>> #include <stdio.h>
>> #include <sys/socket.h>
>> #include <sys/un.h>
>> #include <unistd.h>
>> #include <string.h>
>> #include <pthread.h>
>> #include <stdlib.h>
>> #include <errno.h>
>> 
>> 
>> void *main_thread(void *arg)
>> {
>> 	struct sockaddr_un address;
>> 	int socket_fd, nbytes;
>> 	char buffer[256];
>> 	int i;
>> 	extern int errno;
>> 
>> 	memcpy(&i, arg, sizeof(i));
>> 	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
>> 	if (socket_fd < 0) {
>> 		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
>> 		return;
>> 	}
>> 	fprintf(stderr, "socket() %dth ok!\n", i);
>> 
>> 	/* start with a clean address structure */
>> 	memset(&address, 0, sizeof(struct sockaddr_un));
>> 
>> 	address.sun_family = AF_UNIX;
>> 	snprintf(address.sun_path, 1024, "/var/run/xenstored/socket");
>> 
>> 	if (connect(socket_fd,
>> 		    (struct sockaddr *) &address,
>> 		    sizeof(struct sockaddr_un)) != 0) {
>> 		fprintf(stderr, "connect() %d failed, error=%d", i, errno);
>> 		return;
>> 	}
>> 	fprintf(stderr, "connec() %dth ok!\n", i);
>> 
>> 	while (1)
>> 		sleep(1);
>> 	if (arg) {
>> 		free(arg);
>> 		arg = NULL;
>> 	}
>> 
>> 	return;
>> }
>> 
>> int main(void)
>> {
>> 	int i;
>> 	for (i = 0; i < 2000; i++) {
>> 		void *arg = malloc(sizeof(i));
>> 		memset(arg, 0, sizeof(i));
>> 		memcpy(arg, &i, sizeof(i));
>> 		pthread_t thread;
>> 		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
>> 			perror("pthread_create:");
>> 			break;
>> 		}
>> 	}
>> 	/* Wait all children exit */
>> 	sleep(3);
>> 	return 0;
>> }
>> /* end */
>> 
>> Thanks,
>> Joe
>> 
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-08  9:37   ` Dave Scott
@ 2014-08-11  0:35     ` Joe Jin
  2014-08-11  9:41       ` Dave Scott
  2014-08-11 16:58     ` Zheng Li
  1 sibling, 1 reply; 12+ messages in thread
From: Joe Jin @ 2014-08-11  0:35 UTC (permalink / raw)
  To: Dave Scott, Liuqiming (John)
  Cc: Zheng Li, Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Ian Jackson

On 08/08/14 17:37, Dave Scott wrote:
> 
> On 8 Aug 2014, at 09:35, Liuqiming (John) <john.liuqiming@huawei.com> wrote:
> 
>> In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections. 
> 
> That’s true.

The problem is when oxenstored does not respond any request anymore even all
thread exited, with my reproducer, when you executed it and all threads exited,
"xm list -l" will stuck.

Thanks,
Joe
> 
> In the long term I’d like to use Lwt which internally uses libev and has a more scalable event loop.
> 
> In the short term I think Zheng Li (cc:d) may have a prototype patch to work around this issue. Is this right, Zheng?
> 
> Cheers,
> Dave
> 
>>
>>> -----Original Message-----
>>> From: xen-devel-bounces@lists.xen.org
>>> [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of Joe Jin
>>> Sent: Friday, August 08, 2014 3:01 PM
>>> To: David Scott; Luis R. Rodriguez; Ian Jackson
>>> Cc: xen-devel
>>> Subject: [Xen-devel] Lots of connections led oxenstored stuck
>>>
>>> Hi,
>>>
>>> During internal test on Xen-4.3-stable we found sometime when restarted
>>> Xen, it stuck and does not response any request, xenstored.log filled
>>> out below stuff:
>>> [20140702T21:00:41.564Z|error|xenstored] caught exception
>>> Unix.Unix_error(15, "accept", "")
>>>
>>> I created reproducer which will create 2000 connections to oxenstored,
>>> after
>>> ran the reproducer, "xm list --long" will stuck, oxenstored does not
>>> response anymore, same test case passed when use xenstored, any input
>>> will appreciate!
>>>
>>> /*
>>> * This program used to test oxenstored connections stuck issue.
>>> * please compile by below command:
>>> *	gcc -o client client.c -lpthread
>>> */
>>> #include <stdio.h>
>>> #include <sys/socket.h>
>>> #include <sys/un.h>
>>> #include <unistd.h>
>>> #include <string.h>
>>> #include <pthread.h>
>>> #include <stdlib.h>
>>> #include <errno.h>
>>>
>>>
>>> void *main_thread(void *arg)
>>> {
>>> 	struct sockaddr_un address;
>>> 	int socket_fd, nbytes;
>>> 	char buffer[256];
>>> 	int i;
>>> 	extern int errno;
>>>
>>> 	memcpy(&i, arg, sizeof(i));
>>> 	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
>>> 	if (socket_fd < 0) {
>>> 		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
>>> 		return;
>>> 	}
>>> 	fprintf(stderr, "socket() %dth ok!\n", i);
>>>
>>> 	/* start with a clean address structure */
>>> 	memset(&address, 0, sizeof(struct sockaddr_un));
>>>
>>> 	address.sun_family = AF_UNIX;
>>> 	snprintf(address.sun_path, 1024, "/var/run/xenstored/socket");
>>>
>>> 	if (connect(socket_fd,
>>> 		    (struct sockaddr *) &address,
>>> 		    sizeof(struct sockaddr_un)) != 0) {
>>> 		fprintf(stderr, "connect() %d failed, error=%d", i, errno);
>>> 		return;
>>> 	}
>>> 	fprintf(stderr, "connec() %dth ok!\n", i);
>>>
>>> 	while (1)
>>> 		sleep(1);
>>> 	if (arg) {
>>> 		free(arg);
>>> 		arg = NULL;
>>> 	}
>>>
>>> 	return;
>>> }
>>>
>>> int main(void)
>>> {
>>> 	int i;
>>> 	for (i = 0; i < 2000; i++) {
>>> 		void *arg = malloc(sizeof(i));
>>> 		memset(arg, 0, sizeof(i));
>>> 		memcpy(arg, &i, sizeof(i));
>>> 		pthread_t thread;
>>> 		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
>>> 			perror("pthread_create:");
>>> 			break;
>>> 		}
>>> 	}
>>> 	/* Wait all children exit */
>>> 	sleep(3);
>>> 	return 0;
>>> }
>>> /* end */
>>>
>>> Thanks,
>>> Joe
>>>
>>> _______________________________________________
>>> Xen-devel mailing list
>>> Xen-devel@lists.xen.org
>>> http://lists.xen.org/xen-devel
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-11  0:35     ` Joe Jin
@ 2014-08-11  9:41       ` Dave Scott
  2014-08-12  0:19         ` Joe Jin
  0 siblings, 1 reply; 12+ messages in thread
From: Dave Scott @ 2014-08-11  9:41 UTC (permalink / raw)
  To: Joe Jin
  Cc: Zheng Li, Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Ian Jackson, Liuqiming (John)


On 11 Aug 2014, at 01:35, Joe Jin <joe.jin@oracle.com> wrote:

> On 08/08/14 17:37, Dave Scott wrote:
>> 
>> On 8 Aug 2014, at 09:35, Liuqiming (John) <john.liuqiming@huawei.com> wrote:
>> 
>>> In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections. 
>> 
>> That’s true.
> 
> The problem is when oxenstored does not respond any request anymore even all
> thread exited, with my reproducer, when you executed it and all threads exited,
> "xm list -l" will stuck.

OK so is this the behaviour you expect:

* root in dom0 opens many connections, until oxenstored is out of resources (where the most limited resource is currently file descriptors)
* root in dom0 closes the connections
* oxenstored recovers, and ‘xm list -l’ works again

Instead, you’re seeing oxenstored getting into a stuck state causing ‘xm list -l’ to block — is this accurate?

Could you share your reproducer program?

Thanks,
Dave

> 
> Thanks,
> Joe
>> 
>> In the long term I’d like to use Lwt which internally uses libev and has a more scalable event loop.
>> 
>> In the short term I think Zheng Li (cc:d) may have a prototype patch to work around this issue. Is this right, Zheng?
>> 
>> Cheers,
>> Dave
>> 
>>> 
>>>> -----Original Message-----
>>>> From: xen-devel-bounces@lists.xen.org
>>>> [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of Joe Jin
>>>> Sent: Friday, August 08, 2014 3:01 PM
>>>> To: David Scott; Luis R. Rodriguez; Ian Jackson
>>>> Cc: xen-devel
>>>> Subject: [Xen-devel] Lots of connections led oxenstored stuck
>>>> 
>>>> Hi,
>>>> 
>>>> During internal test on Xen-4.3-stable we found sometime when restarted
>>>> Xen, it stuck and does not response any request, xenstored.log filled
>>>> out below stuff:
>>>> [20140702T21:00:41.564Z|error|xenstored] caught exception
>>>> Unix.Unix_error(15, "accept", "")
>>>> 
>>>> I created reproducer which will create 2000 connections to oxenstored,
>>>> after
>>>> ran the reproducer, "xm list --long" will stuck, oxenstored does not
>>>> response anymore, same test case passed when use xenstored, any input
>>>> will appreciate!
>>>> 
>>>> /*
>>>> * This program used to test oxenstored connections stuck issue.
>>>> * please compile by below command:
>>>> *	gcc -o client client.c -lpthread
>>>> */
>>>> #include <stdio.h>
>>>> #include <sys/socket.h>
>>>> #include <sys/un.h>
>>>> #include <unistd.h>
>>>> #include <string.h>
>>>> #include <pthread.h>
>>>> #include <stdlib.h>
>>>> #include <errno.h>
>>>> 
>>>> 
>>>> void *main_thread(void *arg)
>>>> {
>>>> 	struct sockaddr_un address;
>>>> 	int socket_fd, nbytes;
>>>> 	char buffer[256];
>>>> 	int i;
>>>> 	extern int errno;
>>>> 
>>>> 	memcpy(&i, arg, sizeof(i));
>>>> 	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
>>>> 	if (socket_fd < 0) {
>>>> 		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
>>>> 		return;
>>>> 	}
>>>> 	fprintf(stderr, "socket() %dth ok!\n", i);
>>>> 
>>>> 	/* start with a clean address structure */
>>>> 	memset(&address, 0, sizeof(struct sockaddr_un));
>>>> 
>>>> 	address.sun_family = AF_UNIX;
>>>> 	snprintf(address.sun_path, 1024, "/var/run/xenstored/socket");
>>>> 
>>>> 	if (connect(socket_fd,
>>>> 		    (struct sockaddr *) &address,
>>>> 		    sizeof(struct sockaddr_un)) != 0) {
>>>> 		fprintf(stderr, "connect() %d failed, error=%d", i, errno);
>>>> 		return;
>>>> 	}
>>>> 	fprintf(stderr, "connec() %dth ok!\n", i);
>>>> 
>>>> 	while (1)
>>>> 		sleep(1);
>>>> 	if (arg) {
>>>> 		free(arg);
>>>> 		arg = NULL;
>>>> 	}
>>>> 
>>>> 	return;
>>>> }
>>>> 
>>>> int main(void)
>>>> {
>>>> 	int i;
>>>> 	for (i = 0; i < 2000; i++) {
>>>> 		void *arg = malloc(sizeof(i));
>>>> 		memset(arg, 0, sizeof(i));
>>>> 		memcpy(arg, &i, sizeof(i));
>>>> 		pthread_t thread;
>>>> 		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
>>>> 			perror("pthread_create:");
>>>> 			break;
>>>> 		}
>>>> 	}
>>>> 	/* Wait all children exit */
>>>> 	sleep(3);
>>>> 	return 0;
>>>> }
>>>> /* end */
>>>> 
>>>> Thanks,
>>>> Joe
>>>> 
>>>> _______________________________________________
>>>> Xen-devel mailing list
>>>> Xen-devel@lists.xen.org
>>>> http://lists.xen.org/xen-devel
>> 
>> 
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-08  9:37   ` Dave Scott
  2014-08-11  0:35     ` Joe Jin
@ 2014-08-11 16:58     ` Zheng Li
  1 sibling, 0 replies; 12+ messages in thread
From: Zheng Li @ 2014-08-11 16:58 UTC (permalink / raw)
  To: Dave Scott, Liuqiming (John)
  Cc: Joe Jin, Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Ian Jackson

On 08/08/2014 10:37, Dave Scott wrote:
>
> On 8 Aug 2014, at 09:35, Liuqiming (John) <john.liuqiming@huawei.com> wrote:
>
>> In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections.
>
> That’s true.
>
> In the long term I’d like to use Lwt which internally uses libev and has a more scalable event loop.
>
> In the short term I think Zheng Li (cc:d) may have a prototype patch to work around this issue. Is this right, Zheng?
>

Yes, I created a workaround patch using poll a few days ago. We're currently perf testing it under a VM bootstorm setting. I'm going to circulate the patch when the tests have been done.

Cheers,
Zheng


>>> -----Original Message-----
>>> From: xen-devel-bounces@lists.xen.org
>>> [mailto:xen-devel-bounces@lists.xen.org] On Behalf Of Joe Jin
>>> Sent: Friday, August 08, 2014 3:01 PM
>>> To: David Scott; Luis R. Rodriguez; Ian Jackson
>>> Cc: xen-devel
>>> Subject: [Xen-devel] Lots of connections led oxenstored stuck
>>>
>>> Hi,
>>>
>>> During internal test on Xen-4.3-stable we found sometime when restarted
>>> Xen, it stuck and does not response any request, xenstored.log filled
>>> out below stuff:
>>> [20140702T21:00:41.564Z|error|xenstored] caught exception
>>> Unix.Unix_error(15, "accept", "")
>>>
>>> I created reproducer which will create 2000 connections to oxenstored,
>>> after
>>> ran the reproducer, "xm list --long" will stuck, oxenstored does not
>>> response anymore, same test case passed when use xenstored, any input
>>> will appreciate!
>>>
>>> /*
>>> * This program used to test oxenstored connections stuck issue.
>>> * please compile by below command:
>>> *	gcc -o client client.c -lpthread
>>> */
>>> #include <stdio.h>
>>> #include <sys/socket.h>
>>> #include <sys/un.h>
>>> #include <unistd.h>
>>> #include <string.h>
>>> #include <pthread.h>
>>> #include <stdlib.h>
>>> #include <errno.h>
>>>
>>>
>>> void *main_thread(void *arg)
>>> {
>>> 	struct sockaddr_un address;
>>> 	int socket_fd, nbytes;
>>> 	char buffer[256];
>>> 	int i;
>>> 	extern int errno;
>>>
>>> 	memcpy(&i, arg, sizeof(i));
>>> 	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
>>> 	if (socket_fd < 0) {
>>> 		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
>>> 		return;
>>> 	}
>>> 	fprintf(stderr, "socket() %dth ok!\n", i);
>>>
>>> 	/* start with a clean address structure */
>>> 	memset(&address, 0, sizeof(struct sockaddr_un));
>>>
>>> 	address.sun_family = AF_UNIX;
>>> 	snprintf(address.sun_path, 1024, "/var/run/xenstored/socket");
>>>
>>> 	if (connect(socket_fd,
>>> 		    (struct sockaddr *) &address,
>>> 		    sizeof(struct sockaddr_un)) != 0) {
>>> 		fprintf(stderr, "connect() %d failed, error=%d", i, errno);
>>> 		return;
>>> 	}
>>> 	fprintf(stderr, "connec() %dth ok!\n", i);
>>>
>>> 	while (1)
>>> 		sleep(1);
>>> 	if (arg) {
>>> 		free(arg);
>>> 		arg = NULL;
>>> 	}
>>>
>>> 	return;
>>> }
>>>
>>> int main(void)
>>> {
>>> 	int i;
>>> 	for (i = 0; i < 2000; i++) {
>>> 		void *arg = malloc(sizeof(i));
>>> 		memset(arg, 0, sizeof(i));
>>> 		memcpy(arg, &i, sizeof(i));
>>> 		pthread_t thread;
>>> 		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
>>> 			perror("pthread_create:");
>>> 			break;
>>> 		}
>>> 	}
>>> 	/* Wait all children exit */
>>> 	sleep(3);
>>> 	return 0;
>>> }
>>> /* end */
>>>
>>> Thanks,
>>> Joe
>>>
>>> _______________________________________________
>>> Xen-devel mailing list
>>> Xen-devel@lists.xen.org
>>> http://lists.xen.org/xen-devel
>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-11  9:41       ` Dave Scott
@ 2014-08-12  0:19         ` Joe Jin
  2014-08-14  8:33           ` Joe Jin
  2014-08-26  8:15           ` Joe Jin
  0 siblings, 2 replies; 12+ messages in thread
From: Joe Jin @ 2014-08-12  0:19 UTC (permalink / raw)
  To: Dave Scott
  Cc: Zheng Li, Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Liuqiming (John),
	Ian Jackson

On 08/11/14 17:41, Dave Scott wrote:
> 
> On 11 Aug 2014, at 01:35, Joe Jin <joe.jin@oracle.com> wrote:
> 
>> On 08/08/14 17:37, Dave Scott wrote:
>>>
>>> On 8 Aug 2014, at 09:35, Liuqiming (John) <john.liuqiming@huawei.com> wrote:
>>>
>>>> In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections. 
>>>
>>> That’s true.
>>
>> The problem is when oxenstored does not respond any request anymore even all
>> thread exited, with my reproducer, when you executed it and all threads exited,
>> "xm list -l" will stuck.
> 
> OK so is this the behaviour you expect:
> 
> * root in dom0 opens many connections, until oxenstored is out of resources (where the most limited resource is currently file descriptors)
> * root in dom0 closes the connections
> * oxenstored recovers, and ‘xm list -l’ works again
> 
> Instead, you’re seeing oxenstored getting into a stuck state causing ‘xm list -l’ to block — is this accurate?

Yes that's it.

> 
> Could you share your reproducer program?

/* 
 * This program used to test oxenstored connections stuck issue.
 * please compile by below command:
 *	gcc -o client client.c -lpthread
 */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <errno.h>


/*
 * Worker thread: open one UNIX-domain stream connection to oxenstored
 * and hold it open forever (the reproducer relies on the connections
 * staying established until the process exits).
 *
 * arg: heap-allocated int holding the connection index; this thread
 * takes ownership and frees it once the index has been copied out.
 * Returns NULL (only reachable on the failure paths).
 */
void *main_thread(void *arg)
{
	struct sockaddr_un address;
	int socket_fd;
	int i;

	memcpy(&i, arg, sizeof(i));
	free(arg);	/* ownership transferred from main(); copied, so release now */

	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
	if (socket_fd < 0) {
		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
		return NULL;
	}
	fprintf(stderr, "socket() %dth ok!\n", i);

	/* start with a clean address structure */
	memset(&address, 0, sizeof(struct sockaddr_un));

	address.sun_family = AF_UNIX;
	/* bound by the real sun_path size (~108 bytes), not a fictitious 1024 */
	snprintf(address.sun_path, sizeof(address.sun_path),
		 "%s", "/var/run/xenstored/socket");

	if (connect(socket_fd,
		    (struct sockaddr *) &address,
		    sizeof(struct sockaddr_un)) != 0) {
		fprintf(stderr, "connect() %dth failed, errno=%d\n", i, errno);
		close(socket_fd);	/* don't leak the descriptor on failure */
		return NULL;
	}
	fprintf(stderr, "connect() %dth ok!\n", i);

	/* Hold the connection open until the whole process exits. */
	while (1)
		sleep(1);

	return NULL;	/* not reached */
}

/*
 * Spawn 2000 threads, each opening one connection to oxenstored, to
 * exercise the >1024-fd select() limit. Exiting main() after the grace
 * period tears down every thread and its connection.
 */
int main(void)
{
	int i;

	for (i = 0; i < 2000; i++) {
		pthread_t thread;
		int *arg = malloc(sizeof(*arg));

		if (arg == NULL) {
			perror("malloc");
			break;
		}
		*arg = i;	/* worker frees arg once it has copied the index */
		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
			perror("pthread_create");
			free(arg);	/* thread never started; reclaim here */
			break;
		}
	}
	/* Give the workers time to connect before process exit kills them. */
	sleep(3);
	return 0;
}
/* end */

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-12  0:19         ` Joe Jin
@ 2014-08-14  8:33           ` Joe Jin
  2014-08-26  8:15           ` Joe Jin
  1 sibling, 0 replies; 12+ messages in thread
From: Joe Jin @ 2014-08-14  8:33 UTC (permalink / raw)
  To: Dave Scott
  Cc: Zheng Li, Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Liuqiming (John),
	Ian Jackson

On 08/12/14 08:19, Joe Jin wrote:
>> > * root in dom0 opens many connections, until oxenstored is out of resources (where the most limited resource is currently file descriptors)
>> > * root in dom0 closes the connections
>> > * oxenstored recovers, and ‘xm list -l’ works again
>> > 
>> > Instead, you’re seeing oxenstored getting into a stuck state causing ‘xm list -l’ to block — is this accurate?
> Yes that's it.
> 
>> > 
>> > Could you share your reproducer program?

Dave, can you reproduce this issue with the reproducer or no?
Anything else can I help for test/reproduce?

Thanks,
Joe

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-12  0:19         ` Joe Jin
  2014-08-14  8:33           ` Joe Jin
@ 2014-08-26  8:15           ` Joe Jin
  2014-08-26  9:02             ` Zheng Li
  1 sibling, 1 reply; 12+ messages in thread
From: Joe Jin @ 2014-08-26  8:15 UTC (permalink / raw)
  To: Dave Scott
  Cc: Zheng Li, Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Ian Jackson, Liuqiming (John)

This bug is caused by the way oxenstored handles incoming requests: when
lots of connections arrive at the same time, it gets no chance to delete
closed sockets.

I created a patch for this, please review:

Thanks,
Joe

[PATCH] oxenstored: check and delete closed socket before accept incoming connections

When more than SYSCONF.OPEN_MAX connections arrive at the same time and
the connections are closed later, oxenstored gets no chance to delete the
closed sockets; this leaves oxenstored stuck and unable to handle any
incoming requests any more. This patch makes oxenstored check and process
closed sockets before handling incoming connections to avoid the hang.

Cc: David Scott <dave.scott@eu.citrix.com>
Cc: Zheng Li <dev@zheng.li>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Ian Jackson <Ian.Jackson@citrix.com>
Signed-off-by: Joe Jin <joe.jin@oracle.com>
---
 tools/ocaml/xenstored/xenstored.ml |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
index 1c02f2f..b142952 100644
--- a/tools/ocaml/xenstored/xenstored.ml
+++ b/tools/ocaml/xenstored/xenstored.ml
@@ -373,10 +373,10 @@ let _ =
 			[], [], [] in
 		let sfds, cfds =
 			List.partition (fun fd -> List.mem fd spec_fds) rset in
-		if List.length sfds > 0 then
-			process_special_fds sfds;
 		if List.length cfds > 0 || List.length wset > 0 then
 			process_connection_fds store cons domains cfds wset;
+		if List.length sfds > 0 then
+			process_special_fds sfds;
 		process_domains store cons domains
 		in
 
-- 
1.7.1

On 08/12/14 08:19, Joe Jin wrote:
> On 08/11/14 17:41, Dave Scott wrote:
>>
>> On 11 Aug 2014, at 01:35, Joe Jin <joe.jin@oracle.com> wrote:
>>
>>> On 08/08/14 17:37, Dave Scott wrote:
>>>>
>>>> On 8 Aug 2014, at 09:35, Liuqiming (John) <john.liuqiming@huawei.com> wrote:
>>>>
>>>>> In oxenstored it use "select" for incoming socket, so I don't think it can handle more than 1024 socket connections. 
>>>>
>>>> That’s true.
>>>
>>> The problem is when oxenstored does not respond any request anymore even all
>>> thread exited, with my reproducer, when you executed it and all threads exited,
>>> "xm list -l" will stuck.
>>
>> OK so is this the behaviour you expect:
>>
>> * root in dom0 opens many connections, until oxenstored is out of resources (where the most limited resource is currently file descriptors)
>> * root in dom0 closes the connections
>> * oxenstored recovers, and ‘xm list -l’ works again
>>
>> Instead, you’re seeing oxenstored getting into a stuck state causing ‘xm list -l’ to block — is this accurate?
> 
> Yes that's it.
> 
>>
>> Could you share your reproducer program?
> 
> /* 
>  * This program used to test oxenstored connections stuck issue.
>  * please compile by below command:
>  *	gcc -o client client.c -lpthread
>  */
> #include <stdio.h>
> #include <sys/socket.h>
> #include <sys/un.h>
> #include <unistd.h>
> #include <string.h>
> #include <pthread.h>
> #include <stdlib.h>
> #include <errno.h>
> 
> 
> void *main_thread(void *arg)
> {
> 	struct sockaddr_un address;
> 	int socket_fd, nbytes;
> 	char buffer[256];
> 	int i;
> 	extern int errno;
> 
> 	memcpy(&i, arg, sizeof(i));
> 	socket_fd = socket(PF_UNIX, SOCK_STREAM, 0);
> 	if (socket_fd < 0) {
> 		fprintf(stderr, "socket() %dth failed, errno=%d\n", i, errno);
> 		return;
> 	}
> 	fprintf(stderr, "socket() %dth ok!\n", i);
> 
> 	/* start with a clean address structure */
> 	memset(&address, 0, sizeof(struct sockaddr_un));
> 
> 	address.sun_family = AF_UNIX;
> 	snprintf(address.sun_path, 1024, "/var/run/xenstored/socket");
> 
> 	if (connect(socket_fd,
> 		    (struct sockaddr *) &address,
> 		    sizeof(struct sockaddr_un)) != 0) {
> 		fprintf(stderr, "connect() %d failed, error=%d", i, errno);
> 		return;
> 	}
> 	fprintf(stderr, "connec() %dth ok!\n", i);
> 
> 	while (1)
> 		sleep(1);
> 	if (arg) {
> 		free(arg);
> 		arg = NULL;
> 	}
> 
> 	return;
> }
> 
> int main(void)
> {
> 	int i;
> 	for (i = 0; i < 2000; i++) {
> 		void *arg = malloc(sizeof(i));
> 		memset(arg, 0, sizeof(i));
> 		memcpy(arg, &i, sizeof(i));
> 		pthread_t thread;
> 		if (pthread_create(&thread, NULL, main_thread, arg) != 0) {
> 			perror("pthread_create:");
> 			break;
> 		}
> 	}
> 	/* Wait all children exit */
> 	sleep(3);
> 	return 0;
> }
> /* end */
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
> 

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-26  8:15           ` Joe Jin
@ 2014-08-26  9:02             ` Zheng Li
  2014-08-27  1:59               ` Joe Jin
  0 siblings, 1 reply; 12+ messages in thread
From: Zheng Li @ 2014-08-26  9:02 UTC (permalink / raw)
  To: Joe Jin, Dave Scott
  Cc: Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Ian Jackson, Liuqiming (John)

Hi Joe,

I read your patch and understand the basic idea behind it. It can mitigate the situation when bad things happen, but it doesn't solve the double limits imposed by both select and NR_OPEN. E.g.

   * When the number of fds is beyond NR_OPEN, is there any strict order for which fds are chosen to be closed? If not, then the special fds might get closed as well, in which case xenstored might still be stuck.

   * When select is given 1024 fds (which can still happen even with your patch), the behavior is _undefined_. IIRC, some bits in the bitmap might be reused (wrongly), so that the output (fds reported as ready for read/write) might be wrong for some fds, so that the following read/write might be blocked on them.

   * Also, we generally prefer to handle special fds first, as the eventchn fd represents all the domain connections.

I previously mentioned I've got patches for these. I'm currently testing with 1,000 Windows 7 VMs on a single host (each consume at least 2 persistent xenstored socket connections). Besides the two limits just mentioned, I've also fixed several bugs and bottlenecks along the way.

I'm going to upstream these patches very soon, just a bit clean up and documentation are needed. However if you (or anyone) need them urgently or eager to have a test, please send me an private email separately. I'm happy to send you the patch in its current form --- a single non-disaggregated patch for multiple issues, not very well commented, but should just work.

Cheers,
Zheng

On 26/08/2014 09:15, Joe Jin wrote:
> This bug is caused by how oxenstored handles incoming requests: when lots of
> connections come in at the same time it has no chance to delete closed sockets.
>
> I created a patch for this, please review:
>
> Thanks,
> Joe
>
> [PATCH] oxenstored: check and delete closed socket before accept incoming connections
>
> When more than SYSCONF.OPEN_MAX connections come in at the same time and
> the connections are closed later, oxenstored has no chance to delete the closed
> sockets; this leaves oxenstored stuck and unable to handle any incoming
> requests any more. This patch lets oxenstored check and process closed
> sockets before handling incoming connections to avoid the stall.
>
> Cc: David Scott <dave.scott@eu.citrix.com>
> Cc: Zheng Li <dev@zheng.li>
> Cc: Luis R. Rodriguez <mcgrof@suse.com>
> Cc: Ian Jackson <Ian.Jackson@citrix.com>
> Signed-off-by: Joe Jin <joe.jin@oracle.com>
> ---
>   tools/ocaml/xenstored/xenstored.ml |    4 ++--
>   1 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
> index 1c02f2f..b142952 100644
> --- a/tools/ocaml/xenstored/xenstored.ml
> +++ b/tools/ocaml/xenstored/xenstored.ml
> @@ -373,10 +373,10 @@ let _ =
>   			[], [], [] in
>   		let sfds, cfds =
>   			List.partition (fun fd -> List.mem fd spec_fds) rset in
> -		if List.length sfds > 0 then
> -			process_special_fds sfds;
>   		if List.length cfds > 0 || List.length wset > 0 then
>   			process_connection_fds store cons domains cfds wset;
> +		if List.length sfds > 0 then
> +			process_special_fds sfds;
>   		process_domains store cons domains
>   		in
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-26  9:02             ` Zheng Li
@ 2014-08-27  1:59               ` Joe Jin
  2014-08-27 10:16                 ` Zheng Li
  0 siblings, 1 reply; 12+ messages in thread
From: Joe Jin @ 2014-08-27  1:59 UTC (permalink / raw)
  To: Zheng Li, Dave Scott
  Cc: Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Liuqiming (John),
	Ian Jackson

On 08/26/14 17:02, Zheng Li wrote:
> Hi Joe,
> 
> I read your patch and understand the basic idea behind it. It can mitigate the situation when bad things happen, but it doesn't solve the double limits imposed by both select and NR_OPEN. E.g.

No, this patch does not intend to fix the NR_OPEN limit (this is on the system side, and needs
to be raised before starting the daemon) or the select limitations.
We did not hit the bug because of those limitations; the original issue is that when many
connect requests (i.e. 2000) come in at the same time, accept() fails because
open fds > SYSCONF.OPEN_MAX, which is as expected. The thing is that when the clients exited,
oxenstored should have closed the sockets as well, but during our test it did not, and
oxenstored kept reporting accept failures, so any new request hung as well.
So my change lets oxenstored check and delete closed fds, after which oxenstored is able to
ack new requests.
During our testing, when the issue happened with xenstored logging enabled, xenstored.log
was full of the error below and kept being rotated:
[20140827T15:48:25.399Z|error|xenstored] caught exception Unix.Unix_error(15, "accept", "")

> 
>   * When the number of fds is beyond NR_OPEN, is there any strict order for which fds being chosen to close? If no, then the special fds might get closed as well, in which case the xenstored might stuck still.

My change will not delete fds that are not closed. Also, I do not think the special fds will be
removed, since no errors come from them.

> 
>   * When select is given 1024 fds (which can still happen even with your patch), the behavior is _undefined_. IIRC, some bits in the bitmap might be reused (wrongly), so that the output (fds reported as ready for read/write) might be wrong for some fds, so that the following read/write might be blocked on them.

> 
>   * Also, we generally prefer to handle special fds first, as the eventchn fd represents all the domain connections.

Removing closed fds first may reduce system resource usage?

> 
> I previously mentioned I've got patches for these. I'm currently testing with 1,000 Windows 7 VMs on a single host (each consume at least 2 persistent xenstored socket connections). Besides the two limits just mentioned, I've also fixed several bugs and bottlenecks along the way.
> 
> I'm going to upstream these patches very soon, just a bit clean up and documentation are needed. However if you (or anyone) need them urgently or eager to have a test, please send me an private email separately. I'm happy to send you the patch in its current form --- a single non-disaggregated patch for multiple issues, not very well commented, but should just work.

Can you please send a copy of your patch? I'd like to test when connections more than @nfds of
poll, what happened.

Thanks,
Joe

> 
> Cheers,
> Zheng
> 
> On 26/08/2014 09:15, Joe Jin wrote:
>> This bug caused by oxenstored handle incoming requests, when lots of
>> connections came at same time it has not chance to delete closed sockets.
>>
>> I created a patch for this, please review:
>>
>> Thanks,
>> Joe
>>
>> [PATCH] oxenstored: check and delete closed socket before accept incoming connections
>>
>> When more than SYSCONF.OPEN_MAX connections came at the same time and
>> connecitons been closed later, oxenstored has not change to delete closed
>> socket, this led oxenstored stuck and unable to handle any incoming
>> requests any more. This patch let oxenstored check and process closed
>> socket before handle incoming connections to avoid the stuck.
>>
>> Cc: David Scott <dave.scott@eu.citrix.com>
>> Cc: Zheng Li <dev@zheng.li>
>> Cc: Luis R. Rodriguez <mcgrof@suse.com>
>> Cc: Ian Jackson <Ian.Jackson@citrix.com>
>> Signed-off-by: Joe Jin <joe.jin@oracle.com>
>> ---
>>   tools/ocaml/xenstored/xenstored.ml |    4 ++--
>>   1 files changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
>> index 1c02f2f..b142952 100644
>> --- a/tools/ocaml/xenstored/xenstored.ml
>> +++ b/tools/ocaml/xenstored/xenstored.ml
>> @@ -373,10 +373,10 @@ let _ =
>>               [], [], [] in
>>           let sfds, cfds =
>>               List.partition (fun fd -> List.mem fd spec_fds) rset in
>> -        if List.length sfds > 0 then
>> -            process_special_fds sfds;
>>           if List.length cfds > 0 || List.length wset > 0 then
>>               process_connection_fds store cons domains cfds wset;
>> +        if List.length sfds > 0 then
>> +            process_special_fds sfds;
>>           process_domains store cons domains
>>           in
>>
>>
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


-- 
Oracle <http://www.oracle.com>
Joe Jin | Software Development Senior Manager | +8610.6106.5624
ORACLE | Linux and Virtualization
No. 24 Zhongguancun Software Park, Haidian District | 100193 Beijing 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: Lots of connections led oxenstored stuck
  2014-08-27  1:59               ` Joe Jin
@ 2014-08-27 10:16                 ` Zheng Li
  0 siblings, 0 replies; 12+ messages in thread
From: Zheng Li @ 2014-08-27 10:16 UTC (permalink / raw)
  To: Joe Jin, Dave Scott
  Cc: Luis R. Rodriguez, Luonengjun, xen-devel, Fanhenglong,
	Liuqiming (John),
	Ian Jackson

Hi Joe,

On 27/08/2014 02:59, Joe Jin wrote:
> No this patch does not intend to fix NR_OPEN(this is from system side, need to
> unlimited before start the daemon) and select limitations.
> We met the bug not because of the limitations, the original issue is when more
> connect(i.e 2000) request coming at the same time, accept() failed because of
> open fds > SYSCONF.OPEN_MAX, this is as expected. The thing is when client exited,
> oxenstored should close the sockets as well, but during our test, it did not, and
> oxenstored keeping reported accept failed, any new request hang as well.
> So my changes let oxenstored check and delete closed fds, then oxenstored able to
> ack new requests.
> During our testing, when issue happened also xenstored log enabled, xenstored.log
> full of below error and xenstored.log keeping be rotated:
> [20140827T15:48:25.399Z|error|xenstored] caught exception Unix.Unix_error(15, "accept", "")

Thanks for the explanation, I now understand the intention of your patch better. Still, I think the change can only mitigate/delay the issue to a certain extent. If I'm wrong about this, please help me understand it better.

Here is an example walking through: Suppose your xenstored process have the default max open fd setting of 1024 and it currently has 1024 fds open already. Now 2000 more connections are coming:

   * Without your change, all 2000 connections fail with EMFILE one by one (via Unix.accept in process_special_fds); you'll see 2000 lines of Unix_error(15) errors in xenstored.log. But since there is always a finite number of connections coming, eventually xenstored should recover and continue the normal logic after all of them go through. And the recovery can happen even before that, if there are small time gaps among your incoming connections (which basically give process_connection_fds a chance to run).

   * With your change, xenstored will handle existing fds first. So if there is any of the fds being closed already, it basically gives its slot away to the coming connections, so the issue won't happen at the very beginning. But IIUIC, this all depends on how many persistent connections currently exist. Suppose among the 1024 fds, 800 of them are from relatively persistent connections (e.g. qemu connections for long running VMs) , then only the first small batch in your 2000 incoming connections will succeed and the rest will face exactly the same situation.

So, IMHO, the change might help a great deal in an artificial testcase where a lot of temporary connections come and go, but less so in a real-life system where a large number of the connections are persistent.

Please let me know if my understanding is correct. I have no objection to the change itself, mitigation is positive change too. Hence,

Acked-by: Zheng Li <dev@zheng.li>

A better way is probably wrapping try-with logic around each of process_special_fds, process_connection_fds and process_domains instead of the single top-level try-with, so that a failure in one aspect will not block the others. We can leave this as future work.

>>    * When select is given 1024 fds (which can still happen even with your patch), the behavior is _undefined_. IIRC, some bits in the bitmap might be reused (wrongly), so that the output (fds reported as ready for read/write) might be wrong for some fds, so that the following read/write might be blocked on them.

The max open fd setting of a process normally defaults to 1024 for a reason. If, as you said, your tests raise the ulimit setting beforehand but still use select internally, then there are potential issues here.

>>    * Also, we generally prefer to handle special fds first, as the eventchn fd represents all the domain connections.
>
> Remove closed firstly may reduce system resource usage?

My initial guess was the original authors of oxenstored might be handling special fds first for performance consideration. But after more thinking about it, I don't think that could make a great difference. We can also separate the eventchn fd from the other two special sockets and give it higher priority if it turns out to be a problem afterwards.

>> I previously mentioned I've got patches for these. I'm currently testing with 1,000 Windows 7 VMs on a single host (each consume at least 2 persistent xenstored socket connections). Besides the two limits just mentioned, I've also fixed several bugs and bottlenecks along the way.
>>
>> I'm going to upstream these patches very soon, just a bit clean up and documentation are needed. However if you (or anyone) need them urgently or eager to have a test, please send me an private email separately. I'm happy to send you the patch in its current form --- a single non-disaggregated patch for multiple issues, not very well commented, but should just work.
>
> Can you please send a copy of your patch? I'd like to test when connections more than @nfds of
> poll, what happened.

Some logic in the patch changes the xenstored process's fd limit to the NR_OPEN of system max (usually 1024x1024), though you obviously won't be able to reach that exact number as other processes will have open fds too. But I guess that's a big enough number for real life cases.

I'll send out the patch to those on the threads (except xen-devel) late today.

Thanks,
Zheng


>> On 26/08/2014 09:15, Joe Jin wrote:
>>> This bug caused by oxenstored handle incoming requests, when lots of
>>> connections came at same time it has not chance to delete closed sockets.
>>>
>>> I created a patch for this, please review:
>>>
>>> Thanks,
>>> Joe
>>>
>>> [PATCH] oxenstored: check and delete closed socket before accept incoming connections
>>>
>>> When more than SYSCONF.OPEN_MAX connections came at the same time and
>>> connecitons been closed later, oxenstored has not change to delete closed
>>> socket, this led oxenstored stuck and unable to handle any incoming
>>> requests any more. This patch let oxenstored check and process closed
>>> socket before handle incoming connections to avoid the stuck.
>>>
>>> Cc: David Scott <dave.scott@eu.citrix.com>
>>> Cc: Zheng Li <dev@zheng.li>
>>> Cc: Luis R. Rodriguez <mcgrof@suse.com>
>>> Cc: Ian Jackson <Ian.Jackson@citrix.com>
>>> Signed-off-by: Joe Jin <joe.jin@oracle.com>
>>> ---
>>>    tools/ocaml/xenstored/xenstored.ml |    4 ++--
>>>    1 files changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
>>> index 1c02f2f..b142952 100644
>>> --- a/tools/ocaml/xenstored/xenstored.ml
>>> +++ b/tools/ocaml/xenstored/xenstored.ml
>>> @@ -373,10 +373,10 @@ let _ =
>>>                [], [], [] in
>>>            let sfds, cfds =
>>>                List.partition (fun fd -> List.mem fd spec_fds) rset in
>>> -        if List.length sfds > 0 then
>>> -            process_special_fds sfds;
>>>            if List.length cfds > 0 || List.length wset > 0 then
>>>                process_connection_fds store cons domains cfds wset;
>>> +        if List.length sfds > 0 then
>>> +            process_special_fds sfds;
>>>            process_domains store cons domains
>>>            in
>>>
>>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
>
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2014-08-27 10:16 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-08  7:01 Lots of connections led oxenstored stuck Joe Jin
2014-08-08  8:35 ` Liuqiming (John)
2014-08-08  9:37   ` Dave Scott
2014-08-11  0:35     ` Joe Jin
2014-08-11  9:41       ` Dave Scott
2014-08-12  0:19         ` Joe Jin
2014-08-14  8:33           ` Joe Jin
2014-08-26  8:15           ` Joe Jin
2014-08-26  9:02             ` Zheng Li
2014-08-27  1:59               ` Joe Jin
2014-08-27 10:16                 ` Zheng Li
2014-08-11 16:58     ` Zheng Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.