netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: Al Viro <viro@ZenIV.linux.org.uk>
Cc: David Miller <davem@davemloft.net>,
	stephen@networkplumber.org, netdev@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	dhowells@redhat.com, linux-fsdevel@vger.kernel.org
Subject: Re: [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect for sockets in accept(3)
Date: Thu, 29 Oct 2015 05:35:19 -0700	[thread overview]
Message-ID: <1446122119.7476.138.camel@edumazet-glaptop2.roam.corp.google.com> (raw)
In-Reply-To: <20151029041611.GF22011@ZenIV.linux.org.uk>

On Thu, 2015-10-29 at 04:16 +0000, Al Viro wrote:

> Have you tried to experiment with that in userland?  I mean, emulate that
> thing in normal userland code, count the cacheline accesses and drive it
> with the use patterns collected from actual applications.

Sure.

> 
> I can sit down and play with math expectations, but I suspect that it's
> easier to experiment.  It's nothing but an intuition (I hadn't seriously
> done probability theory in quite a while, and my mathematical tastes run
> more to geometry and topology anyway), but... I would expect it to degrade
> badly when the bitmap is reasonably dense.
> 
> Note, BTW, that vmalloc'ed memory gets populated as you read it, and it's
> not cheap - it's done via #PF triggered in kernel mode, with handler
> noticing that the faulting address is in vmalloc range and doing the
> right thing.  IOW, if your bitmap is very sparse, the price of page faults
> needs to be taken into account.

This vmalloc PF is pure noise.
This only matters for the very first allocations.

We target programs opening zillions of fd in their lifetime ;)

Not having to expand a 4,000,000 slots fd array while fully loaded also
removes a latency spike that is very often not desirable.


> 
> AFAICS, the only benefit of that thing is keeping dirtied cachelines far
> from each other.  Which might be a win overall, but I'm not convinced that
> the rest won't offset the effect of that...

Well, I already tested the O_FD_FASTALLOC thing, and I can tell you
find_next_zero_bit() is nowhere to be found in kernel profiles anymore.
It also lowers time we hold the fd array spinlock while doing fd alloc.

User land test program I wrote few months back

Current kernel :

    64.98%  [kernel]          [k] queued_spin_lock_slowpath    
    14.88%  opensock          [.] memset    // this part simulates user land actual work ;)                   
    11.15%  [kernel]          [k] _find_next_bit.part.0        
     0.69%  [kernel]          [k] _raw_spin_lock               
     0.46%  [kernel]          [k] memset_erms                  
     0.38%  [kernel]          [k] sk_alloc                     
     0.37%  [kernel]          [k] kmem_cache_alloc             
     0.33%  [kernel]          [k] get_empty_filp               
     0.31%  [kernel]          [k] kmem_cache_free              
     0.26%  [kernel]          [k] __alloc_fd                   
     0.26%  opensock          [.] child_function               
     0.18%  [kernel]          [k] inode_init_always            
     0.17%  opensock          [.] __random_r                   


/*
 * test for b/9072743 : fd scaling on gigantic process (with ~ 10,000,000 TCP sockets)
 * - Size fd arrays in kernel to avoid resizings that kill latencies.
 * - Then launch xx threads doing
 *    populate the fd array of the process, opening 'max' files.
 *    
 *    - Loop : close(randomfd()), socket(AF_INET, SOCK_STREAM, 0);
 *
 * Usage : opensock [ -n fds_count ] [ -t threads_count] [-f]
 */

#include <pthread.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>

unsigned int count;
int skflags;

#define NBTHREADS_MAX 4096
pthread_t tid[NBTHREADS_MAX];
int nbthreads;
int nbthreads_req = 24;
int stop_all;

#ifndef O_FD_FASTALLOC
#define O_FD_FASTALLOC 0x40000000
#endif

#ifndef SOCK_FD_FASTALLOC
#define SOCK_FD_FASTALLOC O_FD_FASTALLOC
#endif

/* expand kernel fd array for optimal perf.
 * This could be done by doing a loop on dup(),
 * or can be done using dup2()
 */
int expand_fd_array(int max)
{
	int target, res;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd == -1) {
		perror("socket()");
		return -1;
	}
	for (;;) {
		count = max;
		target = count;
		if (skflags & SOCK_FD_FASTALLOC)
			target += count/10;
		res = dup2(fd, target);
		if (res != -1) {
			close(res);
			break;
		}
		max -= max/10;
	}
	printf("count=%u (check/increase ulimit -n)\n", count);
	return 0;
}

static char state[32] = {
	0, 1, 2, 3, 4, 5, 6, 7,
	8, 9, 10, 11, 12, 13, 14, 15,
	16, 17, 18, 19, 20, 21, 22, 23,
	24, 25, 26, 27, 28, 29, 30, 31
};

/* each thread is using ~400 KB of data per unit of work */
#define WORKING_SET_SIZE 400000

static void *child_function(void *arg)
{
	unsigned int max = count / nbthreads_req;
	struct random_data buf;
	unsigned int idx;
	int *tab;
	unsigned long iter = 0;
	unsigned long *work_set = malloc(WORKING_SET_SIZE);
	int i;

	if (!work_set)
		return NULL;
	tab = malloc(max * sizeof(int));
	if (!tab) {
		free(work_set);
		return NULL;
	}
	memset(tab, 255, max * sizeof(int));
	
	initstate_r(getpid(), state, sizeof(state), &buf);

	tab[0] = socket(AF_INET, SOCK_STREAM | skflags, 0);
	for (i = 1; i < max; i++)
		tab[i] = dup(tab[0]);

	while (!stop_all) {
		random_r(&buf, &idx);
		idx = idx % max;
		close(tab[idx]);

		/* user space needs typically to use a bit of the memory. */
		memset(work_set, idx, WORKING_SET_SIZE);

		tab[idx] = socket(AF_INET, SOCK_STREAM | skflags, 0);
		if (tab[idx] == -1) {
			perror("socket");
			break;
		}
		iter++;
	}
	for (i = 0; i < max; i++)
		close(tab[idx]);
	free(tab);
	free(work_set);
	printf("%lu\n", iter);
	return NULL;
}

static int launch_threads(void)
{
	int i, err;

	for (i = 0; i < nbthreads_req; i++) {
		err = pthread_create(&tid[i], NULL, child_function, NULL);
		if (err)
			return err;
		nbthreads++;
	}
	return 0;
}

static void wait_end(void)
{
	int i;
	for (i = 0; i < nbthreads; i++)
		pthread_join(tid[i], NULL);
}

static void usage(int code)
{
	fprintf(stderr, "Usage : opensock [ -n fds_count ] [ -t threads_count] [-f]\n");
	exit(code);
}

int main(int argc, char *argv[])
{
	int c;
	int max = 1000000;
	int duration = 10;

	while ((c = getopt(argc, argv, "fn:t:l:")) != -1) {
		switch (c) {
		case 'f':
			skflags = SOCK_FD_FASTALLOC;
			break;
		case 'n':
			max = atoi(optarg);
			break;
		case 't':
			nbthreads_req = atoi(optarg);
			if (nbthreads_req > NBTHREADS_MAX)
				usage(1);
			break;
		case 'l':
			duration = atoi(optarg);
			break;
		default:
			usage(1);
		}
	}
	system("sysctl -w fs.file-max=8000000");
	expand_fd_array(max);
	launch_threads();
	sleep(duration);
	stop_all = 1;
	wait_end();
}



  reply	other threads:[~2015-10-29 12:35 UTC|newest]

Thread overview: 138+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-10-19 16:59 Fw: [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect for sockets in accept(3) Stephen Hemminger
2015-10-19 23:33 ` Eric Dumazet
2015-10-20  1:12   ` Alan Burlison
2015-10-20  1:45     ` Eric Dumazet
2015-10-20  9:59       ` Alan Burlison
2015-10-20 11:24         ` David Miller
2015-10-20 11:39           ` Alan Burlison
2015-10-20 13:19         ` Fw: " Eric Dumazet
2015-10-20 13:45           ` Alan Burlison
2015-10-20 15:30             ` Eric Dumazet
2015-10-20 18:31               ` Alan Burlison
2015-10-20 18:42                 ` Eric Dumazet
2015-10-21 10:25                 ` David Laight
2015-10-21 10:49                   ` Alan Burlison
2015-10-21 11:28                     ` Eric Dumazet
2015-10-21 13:03                       ` Alan Burlison
2015-10-21 13:29                         ` Eric Dumazet
2015-10-21  3:49       ` Al Viro
2015-10-21 14:38         ` Alan Burlison
2015-10-21 15:30           ` David Miller
2015-10-21 16:04             ` Casper.Dik
2015-10-21 21:18               ` Eric Dumazet
2015-10-21 21:28                 ` Al Viro
2015-10-21 16:32           ` Fw: " Eric Dumazet
2015-10-21 18:51           ` Al Viro
2015-10-21 20:33             ` Casper.Dik
2015-10-22  4:21               ` Al Viro
2015-10-22 10:55                 ` Alan Burlison
2015-10-22 18:16                   ` Al Viro
2015-10-22 20:15                     ` Alan Burlison
2015-11-02 10:03               ` David Laight
2015-11-02 10:29                 ` Al Viro
2015-10-21 22:28             ` Alan Burlison
2015-10-22  1:29             ` David Miller
2015-10-22  4:17               ` Alan Burlison
2015-10-22  4:44                 ` Al Viro
2015-10-22  6:03                   ` Al Viro
2015-10-22  6:34                     ` Casper.Dik
2015-10-22 17:21                       ` Al Viro
2015-10-22 18:24                         ` Casper.Dik
2015-10-22 19:07                           ` Al Viro
2015-10-22 19:51                             ` Casper.Dik
2015-10-22 21:57                               ` Al Viro
2015-10-23  9:52                                 ` Casper.Dik
2015-10-23 13:02                                   ` Eric Dumazet
2015-10-23 13:20                                     ` Casper.Dik
2015-10-23 13:48                                       ` Eric Dumazet
2015-10-23 14:13                                       ` Eric Dumazet
2015-10-23 13:35                                     ` Alan Burlison
2015-10-23 14:21                                       ` Eric Dumazet
2015-10-23 15:46                                         ` Alan Burlison
2015-10-23 16:00                                           ` Eric Dumazet
2015-10-23 16:07                                             ` Alan Burlison
2015-10-23 16:19                                             ` Eric Dumazet
2015-10-23 16:40                                               ` Alan Burlison
2015-10-23 17:47                                                 ` Eric Dumazet
2015-10-23 17:59                                                   ` [PATCH net-next] af_unix: do not report POLLOUT on listeners Eric Dumazet
2015-10-25 13:45                                                     ` David Miller
2015-10-24  2:30                                   ` [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect for sockets in accept(3) Al Viro
2015-10-27  9:08                                     ` Casper.Dik
2015-10-27 10:52                                       ` Alan Burlison
2015-10-27 12:01                                         ` Eric Dumazet
2015-10-27 12:27                                           ` Alan Burlison
2015-10-27 12:44                                             ` Eric Dumazet
2015-10-27 13:42                                         ` David Miller
2015-10-27 13:37                                           ` Alan Burlison
2015-10-27 13:59                                             ` David Miller
2015-10-27 14:13                                               ` Alan Burlison
2015-10-27 14:39                                                 ` David Miller
2015-10-27 14:39                                                   ` Alan Burlison
2015-10-27 15:04                                                     ` David Miller
2015-10-27 15:53                                                       ` Alan Burlison
2015-10-27 23:17                                         ` Al Viro
2015-10-28  0:13                                           ` Eric Dumazet
2015-10-28 12:35                                             ` Al Viro
2015-10-28 13:24                                               ` Eric Dumazet
2015-10-28 14:47                                                 ` Eric Dumazet
2015-10-28 21:13                                                   ` Al Viro
2015-10-28 21:44                                                     ` Eric Dumazet
2015-10-28 22:33                                                       ` Al Viro
2015-10-28 23:08                                                         ` Eric Dumazet
2015-10-29  0:15                                                           ` Al Viro
2015-10-29  3:29                                                             ` Eric Dumazet
2015-10-29  4:16                                                               ` Al Viro
2015-10-29 12:35                                                                 ` Eric Dumazet [this message]
2015-10-29 13:48                                                                   ` Eric Dumazet
2015-10-30 17:18                                                                   ` Linus Torvalds
2015-10-30 21:02                                                                     ` Al Viro
2015-10-30 21:23                                                                       ` Linus Torvalds
2015-10-30 21:50                                                                         ` Linus Torvalds
2015-10-30 22:33                                                                           ` Al Viro
2015-10-30 23:52                                                                             ` Linus Torvalds
2015-10-31  0:09                                                                               ` Al Viro
2015-10-31 15:59                                                                               ` Eric Dumazet
2015-10-31 19:34                                                                               ` Al Viro
2015-10-31 19:54                                                                                 ` Linus Torvalds
2015-10-31 20:29                                                                                   ` Al Viro
2015-11-02  0:24                                                                                     ` Al Viro
2015-11-02  0:59                                                                                       ` Linus Torvalds
2015-11-02  2:14                                                                                       ` Eric Dumazet
2015-11-02  6:22                                                                                         ` Al Viro
2015-10-31 20:45                                                                                   ` Eric Dumazet
2015-10-31 21:23                                                                                     ` Linus Torvalds
2015-10-31 21:51                                                                                       ` Al Viro
2015-10-31 22:34                                                                                       ` Eric Dumazet
2015-10-31  1:07                                                                           ` Eric Dumazet
2015-10-28 16:04                                           ` Alan Burlison
2015-10-29 14:58                                         ` David Holland
2015-10-29 15:18                                           ` Alan Burlison
2015-10-29 16:01                                             ` David Holland
2015-10-29 16:15                                               ` Alan Burlison
2015-10-29 17:07                                                 ` Al Viro
2015-10-29 17:12                                                   ` Alan Burlison
2015-10-30  1:54                                                     ` David Miller
2015-10-30  1:55                                                   ` David Miller
2015-10-30  5:44                                                 ` David Holland
2015-10-30 17:43                                           ` David Laight
2015-10-30 21:09                                             ` Al Viro
2015-11-04 15:54                                               ` David Laight
2015-11-04 16:27                                                 ` Al Viro
2015-11-06 15:07                                                   ` David Laight
2015-11-06 19:31                                                     ` Al Viro
2015-10-22  6:51                   ` Casper.Dik
2015-10-22 11:18                     ` Alan Burlison
2015-10-22 11:15                   ` Alan Burlison
2015-10-22  6:15                 ` Casper.Dik
2015-10-22 11:30                   ` Eric Dumazet
2015-10-22 11:58                     ` Alan Burlison
2015-10-22 12:10                       ` Eric Dumazet
2015-10-22 13:12                         ` David Miller
2015-10-22 13:14                         ` Alan Burlison
2015-10-22 17:05                           ` Al Viro
2015-10-22 17:39                             ` Alan Burlison
2015-10-22 18:56                               ` Al Viro
2015-10-22 19:50                                 ` Casper.Dik
2015-10-23 17:09                                   ` Al Viro
2015-10-23 18:30           ` Fw: " David Holland
2015-10-23 19:51             ` Al Viro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1446122119.7476.138.camel@edumazet-glaptop2.roam.corp.google.com \
    --to=eric.dumazet@gmail.com \
    --cc=davem@davemloft.net \
    --cc=dhowells@redhat.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=stephen@networkplumber.org \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@ZenIV.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).