Re: [PATCH] io_uring: reduce/pack size of io_ring_ctx

From: Jackie Liu <liuyun01@kylinos.cn>
To: Jens Axboe <axboe@kernel.dk>
Cc: io-uring@vger.kernel.org
Subject: Re: [PATCH] io_uring: reduce/pack size of io_ring_ctx
Date: Fri, 8 Nov 2019 08:00:05 +0800	[thread overview]
Message-ID: <EB274748-0796-4D09-A568-D7A16A0C22D7@kylinos.cn> (raw)
In-Reply-To: <1031c163-abd1-f42c-370d-8801f5fd2440@kernel.dk>

> 2019年11月8日 04:23，Jens Axboe <axboe@kernel.dk> 写道：
> 
> With the recent flurry of additions and changes to io_uring, the
> layout of io_ring_ctx has become a bit stale. We're right now at
> 704 bytes in size on my x86-64 build, or 11 cachelines. This
> patch does two things:
> 
> - We have to completion structs embedded, that we only use for
>  quiesce of the ctx (or shutdown) and for sqthread init cases.
>  That 2x32 bytes right there, let's dynamically allocate them.
> 
> - Reorder the struct a bit with an eye on cachelines, use cases,
>  and holes.
> 
> With this patch, we're down to 512 bytes, or 8 cachelines.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> --
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f8344f95817e..2dbc108fa27b 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -212,25 +212,14 @@ struct io_ring_ctx {
> 		wait_queue_head_t	inflight_wait;
> 	} ____cacheline_aligned_in_smp;
> 
> +	struct io_rings	*rings;
> +
> 	/* IO offload */
> 	struct io_wq		*io_wq;
> 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
> 	struct mm_struct	*sqo_mm;
> 	wait_queue_head_t	sqo_wait;
> -	struct completion	sqo_thread_started;
> -
> -	struct {
> -		unsigned		cached_cq_tail;
> -		atomic_t		cached_cq_overflow;
> -		unsigned		cq_entries;
> -		unsigned		cq_mask;
> -		struct wait_queue_head	cq_wait;
> -		struct fasync_struct	*cq_fasync;
> -		struct eventfd_ctx	*cq_ev_fd;
> -		atomic_t		cq_timeouts;
> -	} ____cacheline_aligned_in_smp;
> -
> -	struct io_rings	*rings;
> +	struct completion	*sqo_done;
> 
> 	/*
> 	 * If used, fixed file set. Writers must ensure that ->refs is dead,
> @@ -246,7 +235,22 @@ struct io_ring_ctx {
> 
> 	struct user_struct	*user;
> 
> -	struct completion	ctx_done;
> +	struct completion	*ctx_done;
> +
> +#if defined(CONFIG_UNIX)
> +	struct socket		*ring_sock;
> +#endif
> +
> +	struct {
> +		unsigned		cached_cq_tail;
> +		atomic_t		cached_cq_overflow;
> +		unsigned		cq_entries;
> +		unsigned		cq_mask;
> +		struct wait_queue_head	cq_wait;
> +		struct fasync_struct	*cq_fasync;
> +		struct eventfd_ctx	*cq_ev_fd;
> +		atomic_t		cq_timeouts;
> +	} ____cacheline_aligned_in_smp;
> 
> 	struct {
> 		struct mutex		uring_lock;
> @@ -268,10 +272,6 @@ struct io_ring_ctx {
> 		spinlock_t		inflight_lock;
> 		struct list_head	inflight_list;
> 	} ____cacheline_aligned_in_smp;
> -
> -#if defined(CONFIG_UNIX)
> -	struct socket		*ring_sock;
> -#endif
> };
> 
> struct sqe_submit {
> @@ -396,7 +396,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
> {
> 	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
> 
> -	complete(&ctx->ctx_done);
> +	complete(ctx->ctx_done);
> }
> 
> static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> @@ -407,17 +407,20 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> 	if (!ctx)
> 		return NULL;
> 
> +	ctx->ctx_done = kmalloc(sizeof(struct completion), GFP_KERNEL);
> +	ctx->sqo_done = kmalloc(sizeof(struct completion), GFP_KERNEL);
> +	if (!ctx->ctx_done || !ctx->sqo_done)
> +		goto err;
> +
> 	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
> -			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
> -		kfree(ctx);
> -		return NULL;
> -	}
> +			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
> +		goto err;
> 
> 	ctx->flags = p->flags;
> 	init_waitqueue_head(&ctx->cq_wait);
> 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
> -	init_completion(&ctx->ctx_done);
> -	init_completion(&ctx->sqo_thread_started);
> +	init_completion(ctx->ctx_done);
> +	init_completion(ctx->sqo_done);
> 	mutex_init(&ctx->uring_lock);
> 	init_waitqueue_head(&ctx->wait);
> 	spin_lock_init(&ctx->completion_lock);
> @@ -429,6 +432,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
> 	spin_lock_init(&ctx->inflight_lock);
> 	INIT_LIST_HEAD(&ctx->inflight_list);
> 	return ctx;
> +err:
> +	kfree(ctx->ctx_done);
> +	kfree(ctx->sqo_done);
> +	kfree(ctx);
> +	return NULL;
> }
> 
> static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
> @@ -3037,7 +3045,7 @@ static int io_sq_thread(void *data)
> 	unsigned inflight;
> 	unsigned long timeout;
> 
> -	complete(&ctx->sqo_thread_started);
> +	complete(ctx->sqo_done);
> 
> 	old_fs = get_fs();
> 	set_fs(USER_DS);
> @@ -3276,7 +3284,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
> static void io_sq_thread_stop(struct io_ring_ctx *ctx)
> {
> 	if (ctx->sqo_thread) {
> -		wait_for_completion(&ctx->sqo_thread_started);
> +		wait_for_completion(ctx->sqo_done);
> 		/*
> 		 * The park is a bit of a work-around, without it we get
> 		 * warning spews on shutdown with SQPOLL set and affinity
> @@ -4098,6 +4106,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
> 		io_unaccount_mem(ctx->user,
> 				ring_pages(ctx->sq_entries, ctx->cq_entries));
> 	free_uid(ctx->user);
> +	kfree(ctx->ctx_done);
> +	kfree(ctx->sqo_done);
> 	kfree(ctx);
> }
> 
> @@ -4141,7 +4151,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
> 		io_wq_cancel_all(ctx->io_wq);
> 
> 	io_iopoll_reap_events(ctx);
> -	wait_for_completion(&ctx->ctx_done);
> +	wait_for_completion(ctx->ctx_done);
> 	io_ring_ctx_free(ctx);
> }
> 
> @@ -4545,7 +4555,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> 	 * no new references will come in after we've killed the percpu ref.
> 	 */
> 	mutex_unlock(&ctx->uring_lock);
> -	wait_for_completion(&ctx->ctx_done);
> +	wait_for_completion(ctx->ctx_done);
> 	mutex_lock(&ctx->uring_lock);
> 
> 	switch (opcode) {
> @@ -4588,7 +4598,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
> 	}
> 
> 	/* bring the ctx back to life */
> -	reinit_completion(&ctx->ctx_done);
> +	reinit_completion(ctx->ctx_done);
> 	percpu_ref_reinit(&ctx->refs);
> 	return ret;
> }

This patch looks good, but I prefer sqo_thread_started instead of sqo_done,
because we are marking the thread started, not the end of the thread.

Anyway, Reviewed-by: Jackie Liu <liuyun01@kylinos.cn>

--
BR, Jackie Liu