Re: [PATCH v3 1/1] virtio-blk: avoid preallocating big SGL for data

From: Max Gurtovoy <mgurtovoy@nvidia.com>
To: "Michael S. Tsirkin" <mst@redhat.com>
Cc: <hch@infradead.org>, <virtualization@lists.linux-foundation.org>,
	<kvm@vger.kernel.org>, <stefanha@redhat.com>,
	<israelr@nvidia.com>, <nitzanc@nvidia.com>, <oren@nvidia.com>,
	<linux-block@vger.kernel.org>, <axboe@kernel.dk>
Subject: Re: [PATCH v3 1/1] virtio-blk: avoid preallocating big SGL for data
Date: Wed, 1 Sep 2021 17:58:31 +0300	[thread overview]
Message-ID: <89d6dc30-a876-b1b0-4ff4-605415113611@nvidia.com> (raw)
In-Reply-To: <20210901102623-mutt-send-email-mst@kernel.org>

On 9/1/2021 5:50 PM, Michael S. Tsirkin wrote:
> On Wed, Sep 01, 2021 at 04:14:34PM +0300, Max Gurtovoy wrote:
>> No need to pre-allocate a big buffer for the IO SGL anymore. If a device
>> has lots of deep queues, preallocation for the sg list can consume
>> substantial amounts of memory. For HW virtio-blk device, nr_hw_queues
>> can be 64 or 128 and each queue's depth might be 128. This means the
>> resulting preallocation for the data SGLs is big.
>>
>> Switch to runtime allocation for SGL for lists longer than 2 entries.
>> This is the approach used by NVMe drivers so it should be reasonable for
>> virtio block as well. Runtime SGL allocation has always been the case
>> for the legacy I/O path so this is nothing new.
>>
>> The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't
>> support SG_CHAIN, use only runtime allocation for the SGL.
>>
>> Re-organize the setup of the IO request to fit the new sg chain
>> mechanism.
>>
>> No performance degradation was seen (fio libaio engine with 16 jobs and
>> 128 iodepth):
>>
>> IO size      IOPs Rand Read (before/after)         IOPs Rand Write (before/after)
>> --------     ---------------------------------    ----------------------------------
>> 512B          318K/316K                                    329K/325K
>>
>> 4KB           323K/321K                                    353K/349K
>>
>> 16KB          199K/208K                                    250K/275K
>>
>> 128KB         36K/36.1K                                    39.2K/41.7K
>>
>> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com>
>> Reviewed-by: Israel Rukshin <israelr@nvidia.com>
> Could you use something to give confidence intervals maybe?
> As it is it looks like a 1-2% regression for 512B and 4KB.

A 1%-2% difference is not a regression. It's device/env/test variance.

These are the results of just one test run. I ran it many times and saw 
a difference of +/- 2%-3% between runs for each side.

Even if I run the same driver without changes, I get a 2%-3% difference 
between runs.

If you have a perf test suite for virtio-blk, it would be great if you 
could run it, or maybe Feng Li has one.

>
>
>
>> ---
>>
>> changes from V2:
>>   - initialize vbr->out_hdr.sector during virtblk_setup_cmd
>>
>> changes from V1:
>>   - Kconfig update (from Christoph)
>>   - Re-order cmd setup (from Christoph)
>>   - use flexible sg pointer in the cmd (from Christoph)
>>   - added perf numbers to commit msg (from Feng Li)
>>
>> ---
>>   drivers/block/Kconfig      |   1 +
>>   drivers/block/virtio_blk.c | 155 +++++++++++++++++++++++--------------
>>   2 files changed, 100 insertions(+), 56 deletions(-)
>>
>> diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
>> index 63056cfd4b62..ca25a122b8ee 100644
>> --- a/drivers/block/Kconfig
>> +++ b/drivers/block/Kconfig
>> @@ -395,6 +395,7 @@ config XEN_BLKDEV_BACKEND
>>   config VIRTIO_BLK
>>   	tristate "Virtio block driver"
>>   	depends on VIRTIO
>> +	select SG_POOL
>>   	help
>>   	  This is the virtual block driver for virtio.  It can be used with
>>             QEMU based VMMs (like KVM or Xen).  Say Y or M.
>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
>> index 9332fc4e9b31..bdd6d415bd20 100644
>> --- a/drivers/block/virtio_blk.c
>> +++ b/drivers/block/virtio_blk.c
>> @@ -24,6 +24,12 @@
>>   /* The maximum number of sg elements that fit into a virtqueue */
>>   #define VIRTIO_BLK_MAX_SG_ELEMS 32768
>>   
>> +#ifdef CONFIG_ARCH_NO_SG_CHAIN
>> +#define VIRTIO_BLK_INLINE_SG_CNT	0
>> +#else
>> +#define VIRTIO_BLK_INLINE_SG_CNT	2
>> +#endif
>> +
>>   static int virtblk_queue_count_set(const char *val,
>>   		const struct kernel_param *kp)
>>   {
>> @@ -93,6 +99,7 @@ struct virtio_blk {
>>   struct virtblk_req {
>>   	struct virtio_blk_outhdr out_hdr;
>>   	u8 status;
>> +	struct sg_table sg_table;
>>   	struct scatterlist sg[];
>>   };
>>   
>> @@ -178,15 +185,94 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
>>   	return 0;
>>   }
>>   
>> -static inline void virtblk_request_done(struct request *req)
>> +static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
>>   {
>> -	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
>> +	if (blk_rq_nr_phys_segments(req))
>> +		sg_free_table_chained(&vbr->sg_table,
>> +				      VIRTIO_BLK_INLINE_SG_CNT);
>> +}
>> +
>> +static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
>> +		struct virtblk_req *vbr)
>> +{
>> +	int err;
>> +
>> +	if (!blk_rq_nr_phys_segments(req))
>> +		return 0;
>> +
>> +	vbr->sg_table.sgl = vbr->sg;
>> +	err = sg_alloc_table_chained(&vbr->sg_table,
>> +				     blk_rq_nr_phys_segments(req),
>> +				     vbr->sg_table.sgl,
>> +				     VIRTIO_BLK_INLINE_SG_CNT);
>> +	if (unlikely(err))
>> +		return -ENOMEM;
>>   
>> +	return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
>> +}
>> +
>> +static void virtblk_cleanup_cmd(struct request *req)
>> +{
>>   	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
>>   		kfree(page_address(req->special_vec.bv_page) +
>>   		      req->special_vec.bv_offset);
>>   	}
>> +}
>> +
>> +static int virtblk_setup_cmd(struct virtio_device *vdev, struct request *req,
>> +		struct virtblk_req *vbr)
>> +{
>> +	bool unmap = false;
>> +	u32 type;
>> +
>> +	vbr->out_hdr.sector = 0;
>> +
>> +	switch (req_op(req)) {
>> +	case REQ_OP_READ:
>> +		type = VIRTIO_BLK_T_IN;
>> +		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
>> +						      blk_rq_pos(req));
>> +		break;
>> +	case REQ_OP_WRITE:
>> +		type = VIRTIO_BLK_T_OUT;
>> +		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
>> +						      blk_rq_pos(req));
>> +		break;
>> +	case REQ_OP_FLUSH:
>> +		type = VIRTIO_BLK_T_FLUSH;
>> +		break;
>> +	case REQ_OP_DISCARD:
>> +		type = VIRTIO_BLK_T_DISCARD;
>> +		break;
>> +	case REQ_OP_WRITE_ZEROES:
>> +		type = VIRTIO_BLK_T_WRITE_ZEROES;
>> +		unmap = !(req->cmd_flags & REQ_NOUNMAP);
>> +		break;
>> +	case REQ_OP_DRV_IN:
>> +		type = VIRTIO_BLK_T_GET_ID;
>> +		break;
>> +	default:
>> +		WARN_ON_ONCE(1);
>> +		return BLK_STS_IOERR;
>> +	}
>>   
>> +	vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
>> +	vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
>> +
>> +	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
>> +		if (virtblk_setup_discard_write_zeroes(req, unmap))
>> +			return BLK_STS_RESOURCE;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static inline void virtblk_request_done(struct request *req)
>> +{
>> +	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
>> +
>> +	virtblk_unmap_data(req, vbr);
>> +	virtblk_cleanup_cmd(req);
>>   	blk_mq_end_request(req, virtblk_result(vbr));
>>   }
>>   
>> @@ -244,57 +330,23 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
>>   	int qid = hctx->queue_num;
>>   	int err;
>>   	bool notify = false;
>> -	bool unmap = false;
>> -	u32 type;
>>   
>>   	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
>>   
>> -	switch (req_op(req)) {
>> -	case REQ_OP_READ:
>> -	case REQ_OP_WRITE:
>> -		type = 0;
>> -		break;
>> -	case REQ_OP_FLUSH:
>> -		type = VIRTIO_BLK_T_FLUSH;
>> -		break;
>> -	case REQ_OP_DISCARD:
>> -		type = VIRTIO_BLK_T_DISCARD;
>> -		break;
>> -	case REQ_OP_WRITE_ZEROES:
>> -		type = VIRTIO_BLK_T_WRITE_ZEROES;
>> -		unmap = !(req->cmd_flags & REQ_NOUNMAP);
>> -		break;
>> -	case REQ_OP_DRV_IN:
>> -		type = VIRTIO_BLK_T_GET_ID;
>> -		break;
>> -	default:
>> -		WARN_ON_ONCE(1);
>> -		return BLK_STS_IOERR;
>> -	}
>> -
>> -	vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, type);
>> -	vbr->out_hdr.sector = type ?
>> -		0 : cpu_to_virtio64(vblk->vdev, blk_rq_pos(req));
>> -	vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(req));
>> +	err = virtblk_setup_cmd(vblk->vdev, req, vbr);
>> +	if (unlikely(err))
>> +		return err;
>>   
>>   	blk_mq_start_request(req);
>>   
>> -	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) {
>> -		err = virtblk_setup_discard_write_zeroes(req, unmap);
>> -		if (err)
>> -			return BLK_STS_RESOURCE;
>> -	}
>> -
>> -	num = blk_rq_map_sg(hctx->queue, req, vbr->sg);
>> -	if (num) {
>> -		if (rq_data_dir(req) == WRITE)
>> -			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
>> -		else
>> -			vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
>> +	num = virtblk_map_data(hctx, req, vbr);
>> +	if (unlikely(num < 0)) {
>> +		virtblk_cleanup_cmd(req);
>> +		return BLK_STS_RESOURCE;
>>   	}
>>   
>>   	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
>> -	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
>> +	err = virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg_table.sgl, num);
>>   	if (err) {
>>   		virtqueue_kick(vblk->vqs[qid].vq);
>>   		/* Don't stop the queue if -ENOMEM: we may have failed to
>> @@ -303,6 +355,8 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
>>   		if (err == -ENOSPC)
>>   			blk_mq_stop_hw_queue(hctx);
>>   		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
>> +		virtblk_unmap_data(req, vbr);
>> +		virtblk_cleanup_cmd(req);
>>   		switch (err) {
>>   		case -ENOSPC:
>>   			return BLK_STS_DEV_RESOURCE;
>> @@ -681,16 +735,6 @@ static const struct attribute_group *virtblk_attr_groups[] = {
>>   	NULL,
>>   };
>>   
>> -static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq,
>> -		unsigned int hctx_idx, unsigned int numa_node)
>> -{
>> -	struct virtio_blk *vblk = set->driver_data;
>> -	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
>> -
>> -	sg_init_table(vbr->sg, vblk->sg_elems);
>> -	return 0;
>> -}
>> -
>>   static int virtblk_map_queues(struct blk_mq_tag_set *set)
>>   {
>>   	struct virtio_blk *vblk = set->driver_data;
>> @@ -703,7 +747,6 @@ static const struct blk_mq_ops virtio_mq_ops = {
>>   	.queue_rq	= virtio_queue_rq,
>>   	.commit_rqs	= virtio_commit_rqs,
>>   	.complete	= virtblk_request_done,
>> -	.init_request	= virtblk_init_request,
>>   	.map_queues	= virtblk_map_queues,
>>   };
>>   
>> @@ -783,7 +826,7 @@ static int virtblk_probe(struct virtio_device *vdev)
>>   	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>>   	vblk->tag_set.cmd_size =
>>   		sizeof(struct virtblk_req) +
>> -		sizeof(struct scatterlist) * sg_elems;
>> +		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
>>   	vblk->tag_set.driver_data = vblk;
>>   	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
>>   
>> -- 
>> 2.18.1