Re: [PATCH 3/5] virtio: use indirect ring elements

From: Stephen Hemminger <stephen@networkplumber.org>
To: "Xie, Huawei" <huawei.xie@intel.com>
Cc: "dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [PATCH 3/5] virtio: use indirect ring elements
Date: Mon, 19 Oct 2015 08:47:15 -0700	[thread overview]
Message-ID: <20151019084715.33d89e50@xeon-e3> (raw)
In-Reply-To: <C37D651A908B024F974696C65296B57B4B12C213@SHSMSX101.ccr.corp.intel.com>

On Mon, 19 Oct 2015 13:19:50 +0000
"Xie, Huawei" <huawei.xie@intel.com> wrote:

> >  static int
> > -virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
> > +virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie,
> > +		       int use_indirect)
> >  {
> >  	struct vq_desc_extra *dxp;
> >  	struct vring_desc *start_dp;
> >  	uint16_t seg_num = cookie->nb_segs;
> > -	uint16_t needed = 1 + seg_num;
> > +	uint16_t needed = use_indirect ? 1 : 1 + seg_num;
> >  	uint16_t head_idx, idx;
> > -	uint16_t head_size = txvq->hw->vtnet_hdr_size;
> > +	unsigned long offs;
> >  
> >  	if (unlikely(txvq->vq_free_cnt == 0))
> >  		return -ENOSPC;
> > @@ -220,12 +221,29 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
> >  	dxp = &txvq->vq_descx[idx];
> >  	dxp->cookie = (void *)cookie;
> >  	dxp->ndescs = needed;
> > -
> >  	start_dp = txvq->vq_ring.desc;
> > -	start_dp[idx].addr =
> > -		txvq->virtio_net_hdr_mem + idx * head_size;
> > -	start_dp[idx].len = (uint32_t)head_size;
> > -	start_dp[idx].flags = VRING_DESC_F_NEXT;
> > +
> > +	if (use_indirect) {
> > +		struct virtio_tx_region *txr
> > +			= txvq->virtio_net_hdr_mz->addr;
> > +
> > +		offs = idx * sizeof(struct virtio_tx_region)
> > +			+ offsetof(struct virtio_tx_region, tx_indir);
> > +
> > +		start_dp[idx].addr  = txvq->virtio_net_hdr_mem + offs;
> > +		start_dp[idx].len   = (seg_num + 1) * sizeof(struct vring_desc);  
> a. In indirect mode, as we always use one descriptor, could we allocate
> one fixed descriptor as what i did in RX/TX ring layout optimization? :).

The code can not assume all packets will be in indirect mode. If using
any_layout, then some packets will use that. Also if you give a packet
where nb_segs is very large, then it falls back to old mode.
Also some hosts (like vhost) don't support indirect.

> b. If not a, we could cache the descriptor, avoid update unless the
> fields are different. In current implementation of free desc list, we
> could make them always use the same tx desc for the same ring slot. I am
> to submit a patch for normal rx path.

See above

> c. Could we initialize the length of all tx descriptors to be
> VIRTIO_MAX_INDIRECT * sizeof(struct vring_desc)? Is maximum length ok
> here? Does the spec require that the length field reflects the length of
> real used descs, as we already have the next field to indicate the last
> descriptor.

The largest VIRTIO_MAX_INDIRECT possible is very large 4K

> 
> > +		start_dp[idx].flags = VRING_DESC_F_INDIRECT;
> > +
> > +		start_dp = txr[idx].tx_indir;
> > +		idx = 0;
> > +	} else {
> > +		offs = idx * sizeof(struct virtio_tx_region)
> > +			+ offsetof(struct virtio_tx_region, tx_hdr);
> > +
> > +		start_dp[idx].addr  = txvq->virtio_net_hdr_mem + offs;
> > +		start_dp[idx].len   = txvq->hw->vtnet_hdr_size;
> > +		start_dp[idx].flags = VRING_DESC_F_NEXT;
> > +	}
> >  
> >  	for (; ((seg_num > 0) && (cookie != NULL)); seg_num--) {
> >  		idx = start_dp[idx].next;  
> This looks weird to me. Why skip the first user provided descriptor?
> idx = 0
> idx = start_dp[idx].next
> start_dp[idx].addr = ...

The first descriptor (0) is initialized once to point to the static
all zeros tx header. Then code skips to second entry to initailize the
first data block.

> 
> > @@ -236,7 +254,12 @@ virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie)
> >  	}
> >  
> >  	start_dp[idx].flags &= ~VRING_DESC_F_NEXT;
> > -	idx = start_dp[idx].next;
> > +
> > +	if (use_indirect)
> > +		idx = txvq->vq_ring.desc[head_idx].next;
> > +	else
> > +		idx = start_dp[idx].next;
> > +
> >  	txvq->vq_desc_head_idx = idx;
> >  	if (txvq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)
> >  		txvq->vq_desc_tail_idx = idx;
> > @@ -261,7 +284,7 @@ static void
> >  virtio_dev_vring_start(struct virtqueue *vq, int queue_type)
> >  {
> >  	struct rte_mbuf *m;
> > -	int i, nbufs, error, size = vq->vq_nentries;
> > +	int nbufs, error, size = vq->vq_nentries;
> >  	struct vring *vr = &vq->vq_ring;
> >  	uint8_t *ring_mem = vq->vq_ring_virt_mem;
> >  
> > @@ -279,10 +302,7 @@ virtio_dev_vring_start(struct virtqueue *vq, int queue_type)
> >  	vq->vq_free_cnt = vq->vq_nentries;
> >  	memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries);
> >  
> > -	/* Chain all the descriptors in the ring with an END */
> > -	for (i = 0; i < size - 1; i++)
> > -		vr->desc[i].next = (uint16_t)(i + 1);
> > -	vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
> > +	vring_desc_init(vr->desc, size);
> >  
> >  	/*
> >  	 * Disable device(host) interrupting guest
> > @@ -760,7 +780,15 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> >  
> >  	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> >  		struct rte_mbuf *txm = tx_pkts[nb_tx];
> > -		int need = txm->nb_segs - txvq->vq_free_cnt + 1;
> > +		int use_indirect, slots, need;
> > +
> > +		use_indirect = vtpci_with_feature(txvq->hw,
> > +						  VIRTIO_RING_F_INDIRECT_DESC)
> > +			&& (txm->nb_segs < VIRTIO_MAX_TX_INDIRECT);
> > +
> > +		/* How many ring entries are needed to this Tx? */  
> "how many descs" is more accurate , at least to me, ring entries/slots
> means entries/slots in avail ring.
> If it is OK, s/slots/descs/ as well.

The virtio spec doesn't use the words descriptors. that is more an Intel
driver terminolgy.

> 
> > +		slots = use_indirect ? 1 : 1 + txm->nb_segs;
> > +		need = slots - txvq->vq_free_cnt;
> >  
> >  		/* Positive value indicates it need free vring descriptors */
> >  		if (need > 0) {
> > @@ -769,7 +797,7 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> >  			need = RTE_MIN(need, (int)nb_used);
> >  
> >  			virtio_xmit_cleanup(txvq, need);
> > -			need = txm->nb_segs - txvq->vq_free_cnt + 1;
> > +			need = slots - txvq->vq_free_cnt;
> >  			if (unlikely(need > 0)) {
> >  				PMD_TX_LOG(ERR,
> >  					   "No free tx descriptors to transmit");
> > @@ -787,7 +815,7 @@ virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> >  		}
> >  
> >  		/* Enqueue Packet buffers */
> > -		error = virtqueue_enqueue_xmit(txvq, txm);
> > +		error = virtqueue_enqueue_xmit(txvq, txm, use_indirect);
> >  		if (unlikely(error)) {
> >  			if (error == ENOSPC)
> >  				PMD_TX_LOG(ERR, "virtqueue_enqueue Free count = 0");
> > diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
> > index 7789411..fe3fa66 100644
> > --- a/drivers/net/virtio/virtqueue.h
> > +++ b/drivers/net/virtio/virtqueue.h
> > @@ -237,6 +237,25 @@ struct virtio_net_hdr_mrg_rxbuf {
> >  	uint16_t num_buffers; /**< Number of merged rx buffers */
> >  };
> >  
> > +/* Region reserved to allow for transmit header and indirect ring */
> > +#define VIRTIO_MAX_TX_INDIRECT 8
> > +struct virtio_tx_region {
> > +	struct virtio_net_hdr_mrg_rxbuf tx_hdr;  
> Any reason to use merge-able header here?
> > +	struct vring_desc tx_indir[VIRTIO_MAX_TX_INDIRECT]
> > +			   __attribute__((__aligned__(16)));  
> WARNING: __aligned(size) is preferred over __attribute__((aligned(size)))
> [...]