On 05.09.2019 19:14, Marvin Liu wrote: > Burst enqueue function will first check whether descriptors are cache > aligned. It will also check prerequisites in the beginning. Burst > enqueue function not support chained mbufs, single packet enqueue > function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> Hi. Can we rely on loop unrolling by compiler instead of repeating each command 4 times? For example: uint64_t len[PACKED_DESCS_BURST]; for (i = 0; i < PACKED_DESCS_BURST; i++) len[i] = descs[avail_idx + i].len; For 'if's: res = false; for (i = 0; i < PACKED_DESCS_BURST; i++) res |= pkts[i]->next != NULL; if (unlikely(res)) return -1; or just for (i = 0; i < PACKED_DESCS_BURST; i++) if (unlikely(pkts[i]->next != NULL)) return -1; Since PACKED_DESCS_BURST is a fairly small constant, loops should be unrolled by compiler producing almost same code. This will significantly reduce code size and will also allow to play with PACKED_DESCS_BURST value without massive code changes. Same is applicable to other patches in the series. What do you think? Best regards, Ilya Maximets. > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 884befa85..ed8b4aabf 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,8 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_DESCS_BURST 4 > +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) > /** > * Structure contains buffer address, length and descriptor index > * from vring to do scatter RX. 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 5ad0a8175..51ed20543 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -896,6 +896,106 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return pkt_idx; > } > > +static __rte_unused uint16_t > +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > + struct rte_mbuf **pkts) > +{ > + bool wrap_counter = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + uint16_t avail_idx = vq->last_avail_idx; > + uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3; > + uint64_t len, len1, len2, len3; > + struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3; > + uint32_t buf_offset = dev->vhost_hlen; > + > + if (unlikely(avail_idx & PACKED_BURST_MASK)) > + return -1; > + > + if (unlikely((pkts[0]->next != NULL) | > + (pkts[1]->next != NULL) | > + (pkts[2]->next != NULL) | > + (pkts[3]->next != NULL))) > + return -1; > + > + if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) | > + unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) | > + unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) | > + unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter))) > + return 1; > + > + rte_smp_rmb(); > + > + len = descs[avail_idx].len; > + len1 = descs[avail_idx + 1].len; > + len2 = descs[avail_idx + 2].len; > + len3 = descs[avail_idx + 3].len; > + > + if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) | > + (pkts[1]->pkt_len > (len1 - buf_offset)) | > + (pkts[2]->pkt_len > (len2 - buf_offset)) | > + (pkts[3]->pkt_len > (len3 - buf_offset)))) > + return -1; > + > + desc_addr = vhost_iova_to_vva(dev, vq, descs[avail_idx].addr, &len, > + VHOST_ACCESS_RW); > + > + desc_addr1 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 1].addr, > + &len1, VHOST_ACCESS_RW); > + > + desc_addr2 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 2].addr, > + &len2, 
VHOST_ACCESS_RW); > + > + desc_addr3 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 3].addr, > + &len3, VHOST_ACCESS_RW); > + > + if (unlikely((len != descs[avail_idx].len) | > + (len1 != descs[avail_idx + 1].len) | > + (len2 != descs[avail_idx + 2].len) | > + (len3 != descs[avail_idx + 3].len))) > + return -1; > + > + rte_prefetch0((void *)(uintptr_t)desc_addr); > + rte_prefetch0((void *)(uintptr_t)desc_addr1); > + rte_prefetch0((void *)(uintptr_t)desc_addr2); > + rte_prefetch0((void *)(uintptr_t)desc_addr3); > + > + hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr; > + hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1; > + hdr2 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr2; > + hdr3 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr3; > + > + virtio_enqueue_offload(pkts[0], &hdr->hdr); > + virtio_enqueue_offload(pkts[1], &hdr1->hdr); > + virtio_enqueue_offload(pkts[2], &hdr2->hdr); > + virtio_enqueue_offload(pkts[3], &hdr3->hdr); > + > + len = pkts[0]->pkt_len + dev->vhost_hlen; > + len1 = pkts[1]->pkt_len + dev->vhost_hlen; > + len2 = pkts[2]->pkt_len + dev->vhost_hlen; > + len3 = pkts[3]->pkt_len + dev->vhost_hlen; > + > + vq->last_avail_idx += PACKED_DESCS_BURST; > + if (vq->last_avail_idx >= vq->size) { > + vq->last_avail_idx -= vq->size; > + vq->avail_wrap_counter ^= 1; > + } > + > + rte_memcpy((void *)(uintptr_t)(desc_addr + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[0], void *, 0), > + pkts[0]->pkt_len); > + rte_memcpy((void *)(uintptr_t)(desc_addr1 + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[1], void *, 0), > + pkts[1]->pkt_len); > + rte_memcpy((void *)(uintptr_t)(desc_addr2 + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[2], void *, 0), > + pkts[2]->pkt_len); > + rte_memcpy((void *)(uintptr_t)(desc_addr3 + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[3], void *, 0), > + pkts[3]->pkt_len); > + > + return 0; > +} > + > static __rte_unused int16_t > virtio_dev_rx_single_packed(struct virtio_net *dev, 
struct vhost_virtqueue *vq, > struct rte_mbuf *pkt) >
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proved in the virtio user driver; on a normal E5 Xeon CPU, single core performance can rise 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However, vhost performance with packed ring was decreased. Through analysis, most of the extra cost was from the calculation of each descriptor flag, which depended on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors, which will cause cache contention. Especially when doing the vhost enqueue function, the virtio refill packed ring function may write the same cache line while vhost is doing the enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, the vhost enqueue and dequeue functions will be split into fast and normal paths. Several methods will be taken in the fast path: Unroll the burst loop function into more pieces. Handle descriptors in one cache line simultaneously. Prerequisite check whether I/O space can be copied directly into mbuf space and vice versa. Prerequisite check whether descriptor mapping is successful. Distinguish the vhost descriptor update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. Cache the memory region structure for fast conversion. Disable software prefetch if hardware can do better. After all these methods are done, single core vhost PvP performance with 64B packets on Xeon 8180 can boost 40%. 
Marvin Liu (14): vhost: add single packet enqueue function vhost: add burst enqueue function for packed ring vhost: add single packet dequeue function vhost: add burst dequeue function vhost: rename flush shadow used ring functions vhost: flush vhost enqueue shadow ring by burst vhost: add flush function for burst enqueue vhost: buffer vhost dequeue shadow ring vhost: split enqueue and dequeue flush functions vhost: optimize Rx function of packed ring vhost: add burst and single zero dequeue functions vhost: optimize Tx function of packed ring vhost: cache address translation result vhost: check whether disable software pre-fetch lib/librte_vhost/Makefile | 6 + lib/librte_vhost/rte_vhost.h | 27 + lib/librte_vhost/vhost.h | 13 + lib/librte_vhost/virtio_net.c | 1094 +++++++++++++++++++++++++++------ 4 files changed, 944 insertions(+), 196 deletions(-) -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5b85b832d..5ad0a8175 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -774,6 +774,71 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +/* + * Returns -1 on fail, 0 on success + */ +static inline int +vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + + uint16_t avail_idx; + uint16_t max_tries, tries = 0; + + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + avail_idx = vq->last_avail_idx; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + uint16_t num_buffers = 0; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) { + return -1; + } + + len = RTE_MIN(len, size); + + size -= len; + + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + + *nr_descs += desc_count; + num_buffers += 1; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, + buf_vec, nr_vec, + num_buffers) < 0) { + return 0; + } + + return 0; +} + + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -831,6 +896,35 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return 0; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 1; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Burst enqueue function will first check whether descriptors are cache aligned. It will also check prerequisites in the beginning. Burst enqueue function not support chained mbufs, single packet enqueue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 884befa85..ed8b4aabf 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,8 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESCS_BURST 4 +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5ad0a8175..51ed20543 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -896,6 +896,106 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused uint16_t +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3; + uint64_t len, len1, len2, len3; + struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3; + uint32_t buf_offset = dev->vhost_hlen; + + if (unlikely(avail_idx & PACKED_BURST_MASK)) + return -1; + + if (unlikely((pkts[0]->next != NULL) | + (pkts[1]->next != NULL) | + (pkts[2]->next != NULL) | + (pkts[3]->next != NULL))) + return -1; + + if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) | + unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) | + unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) | + unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter))) + return 1; + + rte_smp_rmb(); + + len = descs[avail_idx].len; + len1 = descs[avail_idx + 1].len; + 
len2 = descs[avail_idx + 2].len; + len3 = descs[avail_idx + 3].len; + + if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) | + (pkts[1]->pkt_len > (len1 - buf_offset)) | + (pkts[2]->pkt_len > (len2 - buf_offset)) | + (pkts[3]->pkt_len > (len3 - buf_offset)))) + return -1; + + desc_addr = vhost_iova_to_vva(dev, vq, descs[avail_idx].addr, &len, + VHOST_ACCESS_RW); + + desc_addr1 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 1].addr, + &len1, VHOST_ACCESS_RW); + + desc_addr2 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 2].addr, + &len2, VHOST_ACCESS_RW); + + desc_addr3 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 3].addr, + &len3, VHOST_ACCESS_RW); + + if (unlikely((len != descs[avail_idx].len) | + (len1 != descs[avail_idx + 1].len) | + (len2 != descs[avail_idx + 2].len) | + (len3 != descs[avail_idx + 3].len))) + return -1; + + rte_prefetch0((void *)(uintptr_t)desc_addr); + rte_prefetch0((void *)(uintptr_t)desc_addr1); + rte_prefetch0((void *)(uintptr_t)desc_addr2); + rte_prefetch0((void *)(uintptr_t)desc_addr3); + + hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr; + hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1; + hdr2 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr2; + hdr3 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr3; + + virtio_enqueue_offload(pkts[0], &hdr->hdr); + virtio_enqueue_offload(pkts[1], &hdr1->hdr); + virtio_enqueue_offload(pkts[2], &hdr2->hdr); + virtio_enqueue_offload(pkts[3], &hdr3->hdr); + + len = pkts[0]->pkt_len + dev->vhost_hlen; + len1 = pkts[1]->pkt_len + dev->vhost_hlen; + len2 = pkts[2]->pkt_len + dev->vhost_hlen; + len3 = pkts[3]->pkt_len + dev->vhost_hlen; + + vq->last_avail_idx += PACKED_DESCS_BURST; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + rte_memcpy((void *)(uintptr_t)(desc_addr + buf_offset), + rte_pktmbuf_mtod_offset(pkts[0], void *, 0), + pkts[0]->pkt_len); + rte_memcpy((void *)(uintptr_t)(desc_addr1 + 
buf_offset), + rte_pktmbuf_mtod_offset(pkts[1], void *, 0), + pkts[1]->pkt_len); + rte_memcpy((void *)(uintptr_t)(desc_addr2 + buf_offset), + rte_pktmbuf_mtod_offset(pkts[2], void *, 0), + pkts[2]->pkt_len); + rte_memcpy((void *)(uintptr_t)(desc_addr3 + buf_offset), + rte_pktmbuf_mtod_offset(pkts[3], void *, 0), + pkts[3]->pkt_len); + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile leave space for shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 51ed20543..454e8b33e 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1603,6 +1603,62 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t dummy_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &dummy_len, + VHOST_ACCESS_RO) < 0)) { + return -1; + } + + *pkts = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add burst dequeue function like enqueue function for packed ring, burst dequeue function will not support chained descritpors, single packet dequeue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index ed8b4aabf..b33f29ba0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -41,6 +41,8 @@ #define PACKED_DESCS_BURST 4 #define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) +#define DESC_SINGLE_DEQUEUE (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT) + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 454e8b33e..f34df3733 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1603,6 +1603,150 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t avail_idx, uintptr_t *desc_addr, uint16_t *ids) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint64_t len, len1, len2, len3; + uint64_t buf_len, buf_len1, buf_len2, buf_len3; + uint32_t buf_offset = dev->vhost_hlen; + + // check whether desc is burst aligned + if (unlikely(avail_idx & PACKED_BURST_MASK)) + return -1; + + if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) | + unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) | + unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) | + unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter))) + return 1; + + if (unlikely((descs[avail_idx].flags & DESC_SINGLE_DEQUEUE) | + (descs[avail_idx + 1].flags & DESC_SINGLE_DEQUEUE) | + (descs[avail_idx + 2].flags & DESC_SINGLE_DEQUEUE) | + (descs[avail_idx + 3].flags & 
DESC_SINGLE_DEQUEUE))) + return -1; + + rte_smp_rmb(); + + len = descs[avail_idx].len; + len1 = descs[avail_idx + 1].len; + len2 = descs[avail_idx + 2].len; + len3 = descs[avail_idx + 3].len; + + desc_addr[0] = vhost_iova_to_vva(dev, vq, descs[avail_idx].addr, &len, + VHOST_ACCESS_RW); + + desc_addr[1] = vhost_iova_to_vva(dev, vq, descs[avail_idx + 1].addr, + &len1, VHOST_ACCESS_RW); + + desc_addr[2] = vhost_iova_to_vva(dev, vq, descs[avail_idx + 2].addr, + &len2, VHOST_ACCESS_RW); + + desc_addr[3] = vhost_iova_to_vva(dev, vq, descs[avail_idx + 3].addr, + &len3, VHOST_ACCESS_RW); + + if (unlikely((len != descs[avail_idx].len) | + (len1 != descs[avail_idx + 1].len) | + (len2 != descs[avail_idx + 2].len) | + (len3 != descs[avail_idx + 3].len))) { + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_DESCS_BURST)) + return -1; + + buf_len = pkts[0]->buf_len - pkts[0]->data_off; + buf_len1 = pkts[1]->buf_len - pkts[1]->data_off; + buf_len2 = pkts[2]->buf_len - pkts[2]->data_off; + buf_len3 = pkts[3]->buf_len - pkts[3]->data_off; + + if (unlikely((buf_len < (len - buf_offset)) | + (buf_len1 < (len1 - buf_offset)) | + (buf_len2 < (len2 - buf_offset)) | + (buf_len3 < (len3 - buf_offset)))) { + rte_pktmbuf_free(pkts[0]); + rte_pktmbuf_free(pkts[1]); + rte_pktmbuf_free(pkts[2]); + rte_pktmbuf_free(pkts[3]); + return -1; + } + + pkts[0]->pkt_len = descs[avail_idx].len - buf_offset; + pkts[1]->pkt_len = descs[avail_idx + 1].len - buf_offset; + pkts[2]->pkt_len = descs[avail_idx + 2].len - buf_offset; + pkts[3]->pkt_len = descs[avail_idx + 3].len - buf_offset; + + pkts[0]->data_len = pkts[0]->pkt_len; + pkts[1]->data_len = pkts[1]->pkt_len; + pkts[2]->data_len = pkts[2]->pkt_len; + pkts[3]->data_len = pkts[3]->pkt_len; + + ids[0] = descs[avail_idx].id; + ids[1] = descs[avail_idx + 1].id; + ids[2] = descs[avail_idx + 2].id; + ids[3] = descs[avail_idx + 3].id; + + return 0; +} + +static __rte_unused int +virtio_dev_tx_burst_packed(struct virtio_net *dev, struct 
vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addr[4]; + uint16_t ids[4]; + int ret; + struct virtio_net_hdr *hdr, *hdr1, *hdr2, *hdr3; + + ret = vhost_dequeue_burst_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addr, ids); + + if (ret) + return ret; + + rte_prefetch0((void *)(uintptr_t)desc_addr[0]); + rte_prefetch0((void *)(uintptr_t)desc_addr[1]); + rte_prefetch0((void *)(uintptr_t)desc_addr[2]); + rte_prefetch0((void *)(uintptr_t)desc_addr[3]); + + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[0], void *, 0), + (void *)(uintptr_t)(desc_addr[0] + buf_offset), + pkts[0]->pkt_len); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[1], void *, 0), + (void *)(uintptr_t)(desc_addr[1] + buf_offset), + pkts[1]->pkt_len); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[2], void *, 0), + (void *)(uintptr_t)(desc_addr[2] + buf_offset), + pkts[2]->pkt_len); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[3], void *, 0), + (void *)(uintptr_t)(desc_addr[3] + buf_offset), + pkts[3]->pkt_len); + + if (virtio_net_with_host_offload(dev)) { + hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr[0]); + hdr1 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[1]); + hdr2 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[2]); + hdr3 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[3]); + vhost_dequeue_offload(hdr, pkts[0]); + vhost_dequeue_offload(hdr1, pkts[1]); + vhost_dequeue_offload(hdr2, pkts[2]); + vhost_dequeue_offload(hdr3, pkts[3]); + } + + vq->last_avail_idx += PACKED_DESCS_BURST; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, -- 2.17.1
Simplify flush shadow used ring function names as all shadow rings are reflect to used rings. No need to emphasize ring type. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index f34df3733..7116c389d 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -38,7 +38,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) } static __rte_always_inline void -do_flush_shadow_used_ring_split(struct virtio_net *dev, +do_flush_shadow_split(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t to, uint16_t from, uint16_t size) { @@ -51,22 +51,22 @@ do_flush_shadow_used_ring_split(struct virtio_net *dev, } static __rte_always_inline void -flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) +flush_shadow_split(struct virtio_net *dev, struct vhost_virtqueue *vq) { uint16_t used_idx = vq->last_used_idx & (vq->size - 1); if (used_idx + vq->shadow_used_idx <= vq->size) { - do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, + do_flush_shadow_split(dev, vq, used_idx, 0, vq->shadow_used_idx); } else { uint16_t size; /* update used ring interval [used_idx, vq->size] */ size = vq->size - used_idx; - do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size); + do_flush_shadow_split(dev, vq, used_idx, 0, size); /* update the left half used ring interval [0, left_size] */ - do_flush_shadow_used_ring_split(dev, vq, 0, size, + do_flush_shadow_split(dev, vq, 0, size, vq->shadow_used_idx - size); } vq->last_used_idx += vq->shadow_used_idx; @@ -82,7 +82,7 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) } static __rte_always_inline void -update_shadow_used_ring_split(struct vhost_virtqueue *vq, +update_shadow_split(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len) { uint16_t i = vq->shadow_used_idx++; @@ -92,8 +92,7 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, } static 
__rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) +flush_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { int i; uint16_t used_idx = vq->last_used_idx; @@ -159,7 +158,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, } static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, +update_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) { uint16_t i = vq->shadow_used_idx++; @@ -421,7 +420,7 @@ reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, VHOST_ACCESS_RW) < 0)) return -1; len = RTE_MIN(len, size); - update_shadow_used_ring_split(vq, head_idx, len); + update_shadow_split(vq, head_idx, len); size -= len; cur_idx++; @@ -597,7 +596,7 @@ reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return -1; len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); + update_shadow_packed(vq, buf_id, len, desc_count); size -= len; avail_idx += desc_count; @@ -889,7 +888,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_split(dev, vq); + flush_shadow_split(dev, vq); vhost_vring_call_split(dev, vq); } @@ -1069,7 +1068,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } @@ -1498,8 +1497,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, next = TAILQ_NEXT(zmbuf, next); if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_split(vq, - zmbuf->desc_idx, 0); + update_shadow_split(vq, zmbuf->desc_idx, 0); TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); restore_mbuf(zmbuf->mbuf); 
rte_pktmbuf_free(zmbuf->mbuf); @@ -1509,7 +1507,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_split(dev, vq); + flush_shadow_split(dev, vq); vhost_vring_call_split(dev, vq); } } @@ -1549,7 +1547,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, break; if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_split(vq, head_idx, 0); + update_shadow_split(vq, head_idx, 0); pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { @@ -1595,7 +1593,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_split(dev, vq); + flush_shadow_split(dev, vq); vhost_vring_call_split(dev, vq); } } @@ -1817,10 +1815,8 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, next = TAILQ_NEXT(zmbuf, next); if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + update_shadow_packed(vq, zmbuf->desc_idx, 0, + zmbuf->desc_count); TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); restore_mbuf(zmbuf->mbuf); @@ -1831,7 +1827,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } @@ -1857,8 +1853,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + update_shadow_packed(vq, buf_id, 0, desc_count); pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { @@ -1910,7 +1905,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if 
(likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } -- 2.17.1
Buffer vhost enqueue shadow ring update, flush shadow ring until buffered descriptors number exceed one burst. Thus virtio can receive packets at a faster frequency. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index b33f29ba0..86552cbeb 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -143,6 +143,7 @@ struct vhost_virtqueue { struct vring_used_elem_packed *shadow_used_packed; }; uint16_t shadow_used_idx; + uint16_t enqueue_shadow_count; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 7116c389d..dffd466d5 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -157,6 +157,24 @@ flush_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, + uint32_t len, uint16_t count) +{ + /* enqueue shadow flush action aligned with burst num */ + if (!vq->shadow_used_idx) + vq->enqueue_shadow_count = vq->last_used_idx & + PACKED_BURST_MASK; + + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_packed[i].id = desc_idx; + vq->shadow_used_packed[i].len = len; + vq->shadow_used_packed[i].count = count; + + vq->enqueue_shadow_count += count; +} + static __rte_always_inline void update_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -197,6 +215,22 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +flush_enqueue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint32_t len[], uint16_t id[], + uint16_t count[], uint16_t num_buffers) +{ + int i; + for (i = 0; i < num_buffers; i++) { + update_enqueue_shadow_packed(vq, id[i], len[i], count[i]); + + if (vq->enqueue_shadow_count >= 
PACKED_DESCS_BURST) { + do_data_copy_enqueue(dev, vq); + flush_shadow_packed(dev, vq); + } + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -798,6 +832,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, max_tries = 1; uint16_t num_buffers = 0; + uint32_t buffer_len[max_tries]; + uint16_t buffer_buf_id[max_tries]; + uint16_t buffer_desc_count[max_tries]; while (size > 0) { /* @@ -820,6 +857,10 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; + avail_idx += desc_count; if (avail_idx >= vq->size) avail_idx -= vq->size; @@ -834,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } + flush_enqueue_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } -- 2.17.1
Flush used flags when burst enqueue function is finished. Descriptor's flags are pre-calculated as them will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 86552cbeb..5471acaf7 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define VIRTIO_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED \ + | VRING_DESC_F_WRITE) +#define VIRTIO_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) #define PACKED_DESCS_BURST 4 #define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) #define DESC_SINGLE_DEQUEUE (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index dffd466d5..ce255dd82 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -157,6 +157,60 @@ flush_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t len, uint64_t len1, uint64_t len2, uint64_t len3, uint16_t id, + uint16_t id1, uint16_t id2, uint16_t id3, uint16_t flags) +{ + vq->desc_packed[vq->last_used_idx].id = id; + vq->desc_packed[vq->last_used_idx].len = len; + vq->desc_packed[vq->last_used_idx + 1].id = id1; + vq->desc_packed[vq->last_used_idx + 1].len = len1; + + vq->desc_packed[vq->last_used_idx + 2].id = id2; + vq->desc_packed[vq->last_used_idx + 2].len = len2; + + vq->desc_packed[vq->last_used_idx + 3].id = id3; + vq->desc_packed[vq->last_used_idx + 3].len = len3; + + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx].flags = flags; + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + 1].flags = flags; + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + 2].flags = flags; + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + 3].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx 
* + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_DESCS_BURST); + vhost_log_cache_sync(dev, vq); + + vq->last_used_idx += PACKED_DESCS_BURST; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static __rte_always_inline void +flush_enqueue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t len, uint64_t len1, uint64_t len2, uint64_t len3, uint16_t id, + uint16_t id1, uint16_t id2, uint16_t id3) +{ + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = VIRTIO_RX_USED_FLAG; + else + flags = VIRTIO_RX_USED_WRAP_FLAG; + + flush_burst_packed(dev, vq, len, len1, len2, len3, id, id1, id2, id3, + flags); +} + static __rte_always_inline void update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -950,6 +1004,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t len, len1, len2, len3; struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3; uint32_t buf_offset = dev->vhost_hlen; + uint16_t id, id1, id2, id3; if (unlikely(avail_idx & PACKED_BURST_MASK)) return -1; @@ -1036,6 +1091,14 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, rte_pktmbuf_mtod_offset(pkts[3], void *, 0), pkts[3]->pkt_len); + id = descs[avail_idx].id; + id1 = descs[avail_idx + 1].id; + id2 = descs[avail_idx + 2].id; + id3 = descs[avail_idx + 3].id; + + flush_enqueue_burst_packed(dev, vq, len, len1, len2, len3, id, id1, + id2, id3); + return 0; } -- 2.17.1
Buffer used ring updates as many as possible in vhost dequeue function for coordinating with virtio driver. For supporting buffer, shadow used ring element should contain descriptor index and its wrap counter. First shadowed ring index is recorded for calculating buffered number. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 5471acaf7..b161082ca 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -42,6 +42,8 @@ #define VIRTIO_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED \ | VRING_DESC_F_WRITE) #define VIRTIO_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) +#define VIRTIO_TX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) +#define VIRTIO_TX_USED_WRAP_FLAG (0x0) #define PACKED_DESCS_BURST 4 #define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) #define DESC_SINGLE_DEQUEUE (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT) @@ -90,9 +92,11 @@ struct log_cache_entry { }; struct vring_used_elem_packed { + uint16_t used_idx; uint16_t id; uint32_t len; uint32_t count; + uint16_t used_wrap_counter; }; /** @@ -147,6 +151,7 @@ struct vhost_virtqueue { }; uint16_t shadow_used_idx; uint16_t enqueue_shadow_count; + uint16_t dequeue_shadow_head; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index ce255dd82..f8ad54e18 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -240,6 +240,42 @@ update_shadow_packed(struct vhost_virtqueue *vq, vq->shadow_used_packed[i].count = count; } +static __rte_always_inline void +update_dequeue_shadow_packed(struct vhost_virtqueue *vq, uint16_t buf_id, + uint16_t count) +{ + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].used_idx = 
vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + + if (vq->used_wrap_counter) + vq->desc_packed[vq->last_used_idx].flags = + VIRTIO_TX_USED_FLAG; + else + vq->desc_packed[vq->last_used_idx].flags = + VIRTIO_TX_USED_WRAP_FLAG; + + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1898,6 +1934,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; + update_dequeue_shadow_packed(vq, buf_id, desc_count); + vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { vq->last_avail_idx -= vq->size; -- 2.17.1
Vhost enqueue descriptors are updated by burst number, while vhost dequeue descriptors are buffered. Meanwhile in dequeue function only first descriptor is buffered. Due to these differences, split vhost enqueue and dequeue flush functions. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index f8ad54e18..8d09e1611 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -92,7 +92,7 @@ update_shadow_split(struct vhost_virtqueue *vq, } static __rte_always_inline void -flush_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +flush_enqueue_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { int i; uint16_t used_idx = vq->last_used_idx; @@ -157,6 +157,33 @@ flush_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +flush_dequeue_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t head_idx = vq->dequeue_shadow_head; + uint16_t head_flags; + + if (vq->shadow_used_packed[0].used_wrap_counter) + head_flags = VIRTIO_TX_USED_FLAG; + else + head_flags = VIRTIO_TX_USED_WRAP_FLAG; + + if (vq->shadow_used_packed[0].len) + head_flags |= VRING_DESC_F_WRITE; + + vq->desc_packed[head_idx].id = vq->shadow_used_packed[0].id; + + rte_smp_wmb(); + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t len, uint64_t len1, uint64_t len2, uint64_t len3, uint16_t id, @@ -195,6 +222,52 @@ flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue 
*vq, + uint16_t id, uint16_t id1, uint16_t id2, uint16_t id3) +{ + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = VIRTIO_TX_USED_FLAG; + else + flags = VIRTIO_TX_USED_WRAP_FLAG; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].id = id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->desc_packed[vq->last_used_idx + 1].id = id1; + vq->desc_packed[vq->last_used_idx + 2].id = id2; + vq->desc_packed[vq->last_used_idx + 3].id = id3; + + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + 1].flags = flags; + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + 2].flags = flags; + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + 3].flags = flags; + + vq->shadow_used_idx = 1; + + vq->last_used_idx += PACKED_DESCS_BURST; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } else { + + flush_burst_packed(dev, vq, 0, 0, 0, 0, id, id1, id2, id3, + flags); + } +} + static __rte_always_inline void flush_enqueue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t len, uint64_t len1, uint64_t len2, uint64_t len3, uint16_t id, @@ -316,11 +389,29 @@ flush_enqueue_packed(struct virtio_net *dev, if (vq->enqueue_shadow_count >= PACKED_DESCS_BURST) { do_data_copy_enqueue(dev, vq); - flush_shadow_packed(dev, vq); + flush_enqueue_shadow_packed(dev, vq); } } } +static __rte_unused void +flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (!vq->shadow_used_idx) + return; + + int16_t shadow_count = vq->last_used_idx - vq->dequeue_shadow_head; + if (shadow_count <= 0) + shadow_count += vq->size; + + /* buffer used descs as many as possible when doing dequeue */ + if ((uint16_t)shadow_count >= (vq->size >> 1)) { + do_data_copy_dequeue(vq); + 
flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1211,7 +1302,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_packed(dev, vq); + flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } @@ -1869,6 +1960,8 @@ virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, (void *)(uintptr_t)(desc_addr[3] + buf_offset), pkts[3]->pkt_len); + update_dequeue_burst_packed(dev, vq, ids[0], ids[1], ids[2], ids[3]); + if (virtio_net_with_host_offload(dev)) { hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr[0]); hdr1 = (struct virtio_net_hdr *)((uintptr_t)desc_addr[1]); @@ -1972,7 +2065,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_packed(dev, vq); + flush_dequeue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } @@ -2050,7 +2143,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_packed(dev, vq); + flush_dequeue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } -- 2.17.1
Optimize vhost device rx function by separate descriptors, no-chained and direct descriptors will be handled by burst and other will be handled one by one as before. Pre-fetch descriptors in next two cache lines as hardware will load two cache line data automatically. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 8d09e1611..269ec8a43 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -769,64 +769,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *nr_vec, uint16_t *num_buffers, - uint16_t *nr_descs) -{ - uint16_t avail_idx; - uint16_t vec_idx = 0; - uint16_t max_tries, tries = 0; - - uint16_t buf_id = 0; - uint32_t len = 0; - uint16_t desc_count; - - *num_buffers = 0; - avail_idx = vq->last_avail_idx; - - if (rxvq_is_mergeable(dev)) - max_tries = vq->size - 1; - else - max_tries = 1; - - while (size > 0) { - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1120,7 +1062,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused uint16_t +static __rte_always_inline uint16_t virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) { @@ -1229,7 +1171,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) { @@ -1263,46 +1205,45 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t pkt_num; + uint32_t remained = count; + uint16_t fetch_idx; + int ret; + struct vring_packed_desc *descs = vq->desc_packed; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + if ((vq->last_avail_idx & 0x7) == 0) { + fetch_idx = vq->last_avail_idx + 8; + rte_prefetch0((void *)(uintptr_t)&descs[fetch_idx]); + } - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) 
failed to get enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_DESCS_BURST) { + ret = virtio_dev_rx_burst_packed(dev, vq, pkts); + + if (!ret) { + pkt_num = PACKED_DESCS_BURST; + pkt_idx += pkt_num; + remained -= pkt_num; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); + pkt_num = virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]); - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (pkt_num == 0) break; - } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } + pkt_idx += pkt_num; + remained -= pkt_num; - do_data_copy_enqueue(dev, vq); + } while (pkt_idx < count); + + if (pkt_idx) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); + flush_enqueue_shadow_packed(dev, vq); + } - if (likely(vq->shadow_used_idx)) { - flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Optimize vhost zero copy dequeue path like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 269ec8a43..8032229a0 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1979,6 +1979,108 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } +static __rte_unused int +virtio_dev_tx_burst_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbuf, *zmbuf1, *zmbuf2, *zmbuf3; + int ret; + uintptr_t desc_addr[4]; + uint16_t ids[4]; + + uint16_t avail_idx = vq->last_avail_idx; + + ret = vhost_dequeue_burst_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addr, ids); + + if (ret) + return ret; + + zmbuf = get_zmbuf(vq); + zmbuf1 = get_zmbuf(vq); + zmbuf2 = get_zmbuf(vq); + zmbuf3 = get_zmbuf(vq); + + if (!zmbuf || !zmbuf1 || !zmbuf2 || !zmbuf3) { + rte_pktmbuf_free(pkts[0]); + rte_pktmbuf_free(pkts[1]); + rte_pktmbuf_free(pkts[2]); + rte_pktmbuf_free(pkts[3]); + return -1; + } + + zmbuf->mbuf = pkts[0]; + zmbuf->desc_idx = avail_idx; + zmbuf->desc_count = 1; + + zmbuf1->mbuf = pkts[1]; + zmbuf1->desc_idx = avail_idx + 1; + zmbuf1->desc_count = 1; + + zmbuf2->mbuf = pkts[2]; + zmbuf2->desc_idx = avail_idx + 2; + zmbuf2->desc_count = 1; + + zmbuf3->mbuf = pkts[3]; + zmbuf3->desc_idx = avail_idx + 3; + zmbuf3->desc_count = 1; + + rte_mbuf_refcnt_update(pkts[0], 1); + rte_mbuf_refcnt_update(pkts[1], 1); + rte_mbuf_refcnt_update(pkts[2], 1); + rte_mbuf_refcnt_update(pkts[3], 1); + + vq->nr_zmbuf += PACKED_DESCS_BURST; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf1, next); + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf2, next); + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf3, next); + + vq->last_avail_idx += PACKED_DESCS_BURST; + if (vq->last_avail_idx >= vq->size) { + 
vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -2092,6 +2194,47 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_unused void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = VIRTIO_TX_USED_FLAG; + else + flags = VIRTIO_TX_USED_WRAP_FLAG; + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq->last_used_idx += zmbuf->desc_count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + 
vq->nr_zmbuf -= 1; + } + } +} + uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device tx function like rx function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 8032229a0..554617292 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -302,17 +302,6 @@ update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, vq->enqueue_shadow_count += count; } -static __rte_always_inline void -update_shadow_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static __rte_always_inline void update_dequeue_shadow_packed(struct vhost_virtqueue *vq, uint16_t buf_id, uint16_t count) @@ -394,7 +383,7 @@ flush_enqueue_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { if (!vq->shadow_used_idx) @@ -1866,7 +1855,7 @@ vhost_dequeue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) { @@ -1957,7 +1946,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) { @@ -1979,7 +1968,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_burst_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2047,7 +2036,7 @@ 
virtio_dev_tx_burst_packed_zmbuf(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) @@ -2082,119 +2071,7 @@ virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, return 0; } -static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -{ - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); - - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_packed(vq, zmbuf->desc_idx, 0, - zmbuf->desc_count); - - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; - } - } - - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); - - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t dummy_len; - uint16_t desc_count, nr_vec = 0; - int err; - - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &dummy_len, - VHOST_ACCESS_RO) < 0)) - break; - - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_packed(vq, buf_id, 0, desc_count); - - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; - } - - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); 
- if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; - } - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; - - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; - - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. - */ - rte_mbuf_refcnt_update(pkts[i], 1); - - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } - - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } - - if (likely(dev->dequeue_zero_copy == 0)) { - do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - return i; -} - -static __rte_unused void +static __rte_always_inline void free_zmbuf(struct vhost_virtqueue *vq) { struct zcopy_mbuf *next = NULL; @@ -2235,6 +2112,105 @@ free_zmbuf(struct vhost_virtqueue *vq) } } +static __rte_noinline uint16_t +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t pkt_num; + uint32_t remained = count; + int ret; + + free_zmbuf(vq); + + do { + if (remained >= PACKED_DESCS_BURST) { + ret = virtio_dev_tx_burst_packed_zmbuf(dev, vq, + mbuf_pool, + &pkts[pkt_idx]); + + if (!ret) { + pkt_num = PACKED_DESCS_BURST; + pkt_idx += pkt_num; + remained -= pkt_num; + continue; + } + } + + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + + pkt_num = 1; + pkt_idx += pkt_num; + remained -= pkt_num; + } while (remained); + + if (pkt_idx) + 
vhost_vring_call_packed(dev, vq); + + return pkt_idx; +} + +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t pkt_num; + uint32_t remained = count; + uint16_t fetch_idx; + int ret; + struct vring_packed_desc *descs = vq->desc_packed; + + do { + if ((vq->last_avail_idx & 0x7) == 0) { + fetch_idx = vq->last_avail_idx + 8; + rte_prefetch0((void *)(uintptr_t)&descs[fetch_idx]); + } + + if (remained >= PACKED_DESCS_BURST) { + ret = virtio_dev_tx_burst_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx]); + + if (!ret) { + pkt_num = PACKED_DESCS_BURST; + flush_dequeue_packed(dev, vq); + pkt_idx += pkt_num; + remained -= pkt_num; + continue; + } + } + + /* + * If remained descs can't bundled into one burst, just skip to + * next round. + */ + if (((vq->last_avail_idx & PACKED_BURST_MASK) + remained) < + PACKED_DESCS_BURST) + break; + + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + + pkt_num = 1; + pkt_idx += pkt_num; + remained -= pkt_num; + flush_dequeue_packed(dev, vq); + + } while (remained); + + if (pkt_idx) { + if (vq->shadow_used_idx) + do_data_copy_dequeue(vq); + + vhost_vring_call_packed(dev, vq); + } + + return pkt_idx; +} + uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) @@ -2308,9 +2284,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
Cache address translation result and use it in next translation. Due to limited regions are supported, buffers are most likely in same region when doing data transmission. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index 7fb172912..d90235cd6 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -91,10 +91,18 @@ struct rte_vhost_mem_region { int fd; }; +struct rte_vhost_mem_region_cache { + uint64_t guest_phys_addr; + uint64_t guest_phys_addr_end; + int64_t host_user_addr_offset; + uint64_t size; +}; + /** * Memory structure includes region and mapping information. */ struct rte_vhost_memory { + struct rte_vhost_mem_region_cache cache_region; uint32_t nregions; struct rte_vhost_mem_region regions[]; }; @@ -232,11 +240,30 @@ rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, struct rte_vhost_mem_region *r; uint32_t i; + struct rte_vhost_mem_region_cache *r_cache; + /* check with cached region */ + r_cache = &mem->cache_region; + if (likely(gpa >= r_cache->guest_phys_addr && gpa < + r_cache->guest_phys_addr_end)) { + if (unlikely(*len > r_cache->guest_phys_addr_end - gpa)) + *len = r_cache->guest_phys_addr_end - gpa; + + return gpa - r_cache->host_user_addr_offset; + } + + for (i = 0; i < mem->nregions; i++) { r = &mem->regions[i]; if (gpa >= r->guest_phys_addr && gpa < r->guest_phys_addr + r->size) { + r_cache->guest_phys_addr = r->guest_phys_addr; + r_cache->guest_phys_addr_end = r->guest_phys_addr + + r->size; + r_cache->size = r->size; + r_cache->host_user_addr_offset = r->guest_phys_addr - + r->host_user_addr; + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) *len = r->guest_phys_addr + r->size - gpa; -- 2.17.1
Disable software pre-fetch actions on Skylake and Cascadelake platforms. Hardware can fetch needed data for vhost, additional software pre-fetch will have impact on performance. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..1d1423bdc 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep AVX512F) + +ifneq ($(AVX512_SUPPORT),) +CFLAGS += -DDISABLE_SWPREFETCH +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 554617292..7d9a8eda0 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -489,7 +489,9 @@ map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(!desc_addr)) return -1; +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addr); +#endif buf_vec[vec_id].buf_iova = desc_iova; buf_vec[vec_id].buf_addr = desc_addr; @@ -1011,7 +1013,9 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, */ rte_smp_rmb(); +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); +#endif for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; @@ -1110,10 +1114,12 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, (len3 != descs[avail_idx + 3].len))) return -1; +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addr); rte_prefetch0((void *)(uintptr_t)desc_addr1); rte_prefetch0((void *)(uintptr_t)desc_addr2); rte_prefetch0((void *)(uintptr_t)desc_addr3); +#endif hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr; hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1; @@ -1687,7 
+1693,9 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, */ rte_smp_rmb(); +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); +#endif VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); @@ -1872,10 +1880,12 @@ virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (ret) return ret; +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addr[0]); rte_prefetch0((void *)(uintptr_t)desc_addr[1]); rte_prefetch0((void *)(uintptr_t)desc_addr[2]); rte_prefetch0((void *)(uintptr_t)desc_addr[3]); +#endif rte_memcpy(rte_pktmbuf_mtod_offset(pkts[0], void *, 0), (void *)(uintptr_t)(desc_addr[0] + buf_offset), -- 2.17.1
> -----Original Message----- > From: Ilya Maximets [mailto:i.maximets@samsung.com] > Sent: Thursday, September 05, 2019 6:31 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; > maxime.coquelin@redhat.com; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function > for packed ring > > On 05.09.2019 19:14, Marvin Liu wrote: > > Burst enqueue function will first check whether descriptors are cache > > aligned. It will also check prerequisites in the beginning. Burst > > enqueue function not support chained mbufs, single packet enqueue > > function will handle it. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > Hi. > > Can we rely on loop unrolling by compiler instead of repeating each > command 4 times? > > For example: > > uint64_t len[PACKED_DESCS_BURST]; > > for (i = 0; i < PACKED_DESCS_BURST; i++) > len[i] = descs[avail_idx + i].len; > > > For 'if's: > > res = false; > for (i = 0; i < PACKED_DESCS_BURST; i++) > res |= pkts[i]->next != NULL; > if (unlikely(res)) > return -1; > > or just > > for (i = 0; i < PACKED_DESCS_BURST; i++) > if (unlikely(pkts[i]->next != NULL)) > return -1; > > Since PACKED_DESCS_BURST is a fairly small constant, loops should be > unrolled by compiler producing almost same code. > > This will significantly reduce code size and will also allow to > play with PACKED_DESCS_BURST value without massive code changes. > > Same is applicable to other patches in the series. > > What do you think? > Hi Ilya, I did some test with the unroll availability of various compilers before. All listed compilers will cause loopback performance drop compared to repeating code version, especially GCC7.4 and ICC. Newer compilers will have much less impact (around 3%) on the throughput. If we can accept that, repeating code can be replaced with small loop function. 
|----------------|---------------|-------------|------| | Compiler | Auto unrolled | Fixed batch | Gap | |----------------|---------------|-------------|------| | Clang6.0.0 | 13.1M | 13.5M | 0.4M | |----------------|---------------|-------------|------| | GCC 8.3.0 | 13.9M | 14.4M | 0.5M | |----------------|---------------|-------------|------| | GCC 7.4.0 | 12.6M | 13.5M | 0.9M | |----------------|---------------|-------------|------| | ICC 19.0.4.243 | 11.0M | 12.3M | 1.3M | |----------------|---------------|-------------|------| Thanks, Marvin > Best regards, Ilya Maximets. > > >
Hi,
After checking gcc 9.0.2, a pragma is needed to tell the compiler to unroll the loop.
The code will look like the below after adding a compile-time macro for the pragma.
If this format is fine, I will send out the patch set with this update later.
#ifdef SUPPORT_GCC_UNROLL_PRAGMA
#define UNROLL_PRAGMA _Pragma("GCC unroll 4")
#endif
#ifdef SUPPORT_CLANG_UNROLL_PRAGMA
#define UNROLL_PRAGMA _Pragma("unroll 4")
#endif
#ifdef SUPPORT_ICC_UNROLL_PRAGMA
#define UNROLL_PRAGMA _Pragma("unroll (4)")
#endif
#ifndef UNROLL_PRAGMA
#define UNROLL_PRAGMA
#endif
UNROLL_PRAGMA
for (i = 0; i < PACKED_DESCS_BURST; i++) {
if (unlikely(pkts[i]->next != NULL))
return -1;
}
Also checked with compiler clang 6.0.0; the performance of the small loop with the pragma is the same as the repeated code.
Regards,
Marvin
> -----Original Message-----
> From: Liu, Yong
> Sent: Friday, September 06, 2019 9:43 AM
> To: Ilya Maximets <i.maximets@samsung.com>; Bie, Tiwei
> <tiwei.bie@intel.com>; maxime.coquelin@redhat.com
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function
> for packed ring
>
>
>
> > -----Original Message-----
> > From: Ilya Maximets [mailto:i.maximets@samsung.com]
> > Sent: Thursday, September 05, 2019 6:31 PM
> > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> > maxime.coquelin@redhat.com; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue
> function
> > for packed ring
> >
> > On 05.09.2019 19:14, Marvin Liu wrote:
> > > Burst enqueue function will first check whether descriptors are cache
> > > aligned. It will also check prerequisites in the beginning. Burst
> > > enqueue function not support chained mbufs, single packet enqueue
> > > function will handle it.
> > >
> > > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >
> > Hi.
> >
> > Can we rely on loop unrolling by compiler instead of repeating each
> > command 4 times?
> >
> > For example:
> >
> > uint64_t len[PACKED_DESCS_BURST];
> >
> > for (i = 0; i < PACKED_DESCS_BURST; i++)
> > len[i] = descs[avail_idx + i].len;
> >
> >
> > For 'if's:
> >
> > res = false;
> > for (i = 0; i < PACKED_DESCS_BURST; i++)
> > res |= pkts[i]->next != NULL;
> > if (unlikely(res))
> > return -1;
> >
> > or just
> >
> > for (i = 0; i < PACKED_DESCS_BURST; i++)
> > if (unlikely(pkts[i]->next != NULL))
> > return -1;
> >
> > Since PACKED_DESCS_BURST is a fairly small constant, loops should be
> > unrolled by compiler producing almost same code.
> >
> > This will significantly reduce code size and will also allow to
> > play with PACKED_DESCS_BURST value without massive code changes.
> >
> > Same is applicable to other patches in the series.
> >
> > What do you think?
> >
>
> Hi Ilya,
> I did some test with the unroll availability of various compilers before.
> All listed compilers will cause loopback performance drop compared to
> repeating code version, especially GCC7.4 and ICC.
> Newer compilers will have much less impact (around 3%) on the throughput.
> If we can accept that, repeating code can be replaced with small loop
> function.
>
> |----------------|---------------|-------------|------|
> | Compiler | Auto unrolled | Fixed batch | Gap |
> |----------------|---------------|-------------|------|
> | Clang6.0.0 | 13.1M | 13.5M | 0.4M |
> |----------------|---------------|-------------|------|
> | GCC 8.3.0 | 13.9M | 14.4M | 0.5M |
> |----------------|---------------|-------------|------|
> | GCC 7.4.0 | 12.6M | 13.5M | 0.9M |
> |----------------|---------------|-------------|------|
> | ICC 19.0.4.243 | 11.0M | 12.3M | 1.3M |
> |----------------|---------------|-------------|------|
>
> Thanks,
> Marvin
>
> > Best regards, Ilya Maximets.
> >
> > >
On Fri, Sep 06, 2019 at 01:42:44AM +0000, Liu, Yong wrote:
>
>
> > -----Original Message-----
> > From: Ilya Maximets [mailto:i.maximets@samsung.com]
> > Sent: Thursday, September 05, 2019 6:31 PM
> > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> > maxime.coquelin@redhat.com; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function
> > for packed ring
> >
> > On 05.09.2019 19:14, Marvin Liu wrote:
> > > Burst enqueue function will first check whether descriptors are cache
> > > aligned. It will also check prerequisites in the beginning. Burst
> > > enqueue function not support chained mbufs, single packet enqueue
> > > function will handle it.
> > >
> > > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >
> > Hi.
> >
> > Can we rely on loop unrolling by compiler instead of repeating each
> > command 4 times?
> >
> > For example:
> >
> > uint64_t len[PACKED_DESCS_BURST];
> >
> > for (i = 0; i < PACKED_DESCS_BURST; i++)
> > len[i] = descs[avail_idx + i].len;
> >
> >
> > For 'if's:
> >
> > res = false;
> > for (i = 0; i < PACKED_DESCS_BURST; i++)
> > res |= pkts[i]->next != NULL;
> > if (unlikely(res))
> > return -1;
> >
> > or just
> >
> > for (i = 0; i < PACKED_DESCS_BURST; i++)
> > if (unlikely(pkts[i]->next != NULL))
> > return -1;
> >
> > Since PACKED_DESCS_BURST is a fairly small constant, loops should be
> > unrolled by compiler producing almost same code.
> >
> > This will significantly reduce code size and will also allow to
> > play with PACKED_DESCS_BURST value without massive code changes.
> >
> > Same is applicable to other patches in the series.
> >
> > What do you think?
> >
>
> Hi Ilya,
> I did some test with the unroll availability of various compilers before.
> All listed compilers will cause loopback performance drop compared to repeating code version, especially GCC7.4 and ICC.
> Newer compilers will have much less impact (around 3%) on the throughput.
> If we can accept that, repeating code can be replaced with small loop function.
>
> |----------------|---------------|-------------|------|
> | Compiler | Auto unrolled | Fixed batch | Gap |
> |----------------|---------------|-------------|------|
> | Clang6.0.0 | 13.1M | 13.5M | 0.4M |
> |----------------|---------------|-------------|------|
> | GCC 8.3.0 | 13.9M | 14.4M | 0.5M |
> |----------------|---------------|-------------|------|
> | GCC 7.4.0 | 12.6M | 13.5M | 0.9M |
> |----------------|---------------|-------------|------|
> | ICC 19.0.4.243 | 11.0M | 12.3M | 1.3M |
> |----------------|---------------|-------------|------|
>
> Thanks,
> Marvin
>
Did you verify that the compiler was actually unrolling the loops? You may
need to put __attribute__((optimize("unroll-loops"))) in the function
definition.
/Bruce
> -----Original Message----- > From: Bruce Richardson [mailto:bruce.richardson@intel.com] > Sent: Friday, September 06, 2019 5:12 PM > To: Liu, Yong <yong.liu@intel.com> > Cc: Ilya Maximets <i.maximets@samsung.com>; Bie, Tiwei > <tiwei.bie@intel.com>; maxime.coquelin@redhat.com; dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function > for packed ring > > On Fri, Sep 06, 2019 at 01:42:44AM +0000, Liu, Yong wrote: > > > > > > > -----Original Message----- > > > From: Ilya Maximets [mailto:i.maximets@samsung.com] > > > Sent: Thursday, September 05, 2019 6:31 PM > > > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; > > > maxime.coquelin@redhat.com; dev@dpdk.org > > > Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue > function > > > for packed ring > > > > > > On 05.09.2019 19:14, Marvin Liu wrote: > > > > Burst enqueue function will first check whether descriptors are cache > > > > aligned. It will also check prerequisites in the beginning. Burst > > > > enqueue function not support chained mbufs, single packet enqueue > > > > function will handle it. > > > > > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > > > Hi. > > > > > > Can we rely on loop unrolling by compiler instead of repeating each > > > command 4 times? > > > > > > For example: > > > > > > uint64_t len[PACKED_DESCS_BURST]; > > > > > > for (i = 0; i < PACKED_DESCS_BURST; i++) > > > len[i] = descs[avail_idx + i].len; > > > > > > > > > For 'if's: > > > > > > res = false; > > > for (i = 0; i < PACKED_DESCS_BURST; i++) > > > res |= pkts[i]->next != NULL; > > > if (unlikely(res)) > > > return -1; > > > > > > or just > > > > > > for (i = 0; i < PACKED_DESCS_BURST; i++) > > > if (unlikely(pkts[i]->next != NULL)) > > > return -1; > > > > > > Since PACKED_DESCS_BURST is a fairly small constant, loops should be > > > unrolled by compiler producing almost same code. 
> > > > > > This will significantly reduce code size and will also allow to > > > play with PACKED_DESCS_BURST value without massive code changes. > > > > > > Same is applicable to other patches in the series. > > > > > > What do you think? > > > > > > > Hi Ilya, > > I did some test with the unroll availability of various compilers before. > > All listed compilers will cause loopback performance drop compared to > repeating code version, especially GCC7.4 and ICC. > > Newer compilers will have much less impact (around 3%) on the throughput. > > If we can accept that, repeating code can be replaced with small loop > function. > > > > |----------------|---------------|-------------|------| > > | Compiler | Auto unrolled | Fixed batch | Gap | > > |----------------|---------------|-------------|------| > > | Clang6.0.0 | 13.1M | 13.5M | 0.4M | > > |----------------|---------------|-------------|------| > > | GCC 8.3.0 | 13.9M | 14.4M | 0.5M | > > |----------------|---------------|-------------|------| > > | GCC 7.4.0 | 12.6M | 13.5M | 0.9M | > > |----------------|---------------|-------------|------| > > | ICC 19.0.4.243 | 11.0M | 12.3M | 1.3M | > > |----------------|---------------|-------------|------| > > > > Thanks, > > Marvin > > > Did you verify that the compiler was actually unrolling the loops? You may > need to put __attribute__((optimize("unroll-loops"))) in the function > definition. Thanks for note, Bruce. I only checked GCC compiled binaries, loop have been unrolled. Will double check clang and ICC compiling result. Regards, Marvin > > /Bruce
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proven in the virtio user driver; on a normal E5 Xeon cpu, single core performance can raise 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However, vhost performance with packed ring was decreased. Through analysis, most of the extra cost was from the calculating of each descriptor flag which depended on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors which will cause cache contention. Especially when doing the vhost enqueue function, the virtio refill packed ring function may write the same cache line when vhost is doing the enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, the vhost enqueue and dequeue functions will be split into fast and normal paths. Several methods will be taken in the fast path: Unroll burst loop function into more pieces. Handle descriptors in one cache line simultaneously. Prerequisite check that whether I/O space can copy directly into mbuf space and vice versa. Prerequisite check that whether descriptor mapping is successful. Distinguish vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. Cache memory region structure for fast conversion. Disable software prefetch if hardware can do better. After all these methods are done, single core vhost PvP performance with 64B packets on Xeon 8180 can boost 40%.
v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (16): vhost: add single packet enqueue function vhost: unify unroll pragma parameter vhost: add burst enqueue function for packed ring vhost: add single packet dequeue function vhost: add burst dequeue function vhost: rename flush shadow used ring functions vhost: flush vhost enqueue shadow ring by burst vhost: add flush function for burst enqueue vhost: buffer vhost dequeue shadow ring vhost: split enqueue and dequeue flush functions vhost: optimize enqueue function of packed ring vhost: add burst and single zero dequeue functions vhost: optimize dequeue function of packed ring vhost: cache address translation result vhost: check whether disable software pre-fetch vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 24 + lib/librte_vhost/rte_vhost.h | 27 + lib/librte_vhost/vhost.h | 33 + lib/librte_vhost/virtio_net.c | 1071 +++++++++++++++++++++++++++------ 4 files changed, 960 insertions(+), 195 deletions(-) -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5b85b832d..2b5c47145 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -774,6 +774,70 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +/* + * Returns -1 on fail, 0 on success + */ +static __rte_always_inline int +vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + + uint16_t avail_idx; + uint16_t max_tries, tries = 0; + + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + avail_idx = vq->last_avail_idx; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + uint16_t num_buffers = 0; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) { + return -1; + } + + len = RTE_MIN(len, size); + + size -= len; + + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + + *nr_descs += desc_count; + num_buffers += 1; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, + buf_vec, nr_vec, + num_buffers) < 0) { + return 0; + } + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -831,6 +895,35 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Add macro for unifying Clang/ICC/GCC unroll pragma format. Burst functions were contained of several small loops which optimized by compiler’s loop unrolling pragma. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..30839a001 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +ifeq ($(RTE_TOOLCHAIN), gcc) +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), clang) +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), icc) +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA +endif +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 884befa85..5074226f0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,24 @@ #define VHOST_LOG_CACHE_NR 32 +#ifdef SUPPORT_GCC_UNROLL_PRAGMA +#define PRAGMA_PARAM "GCC unroll 4" +#endif + +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA +#define PRAGMA_PARAM "unroll 4" +#endif + +#ifdef SUPPORT_ICC_UNROLL_PRAGMA +#define PRAGMA_PARAM "unroll (4)" +#endif + +#ifdef PRAGMA_PARAM +#define UNROLL_PRAGMA(param) _Pragma(param) +#else +#define UNROLL_PRAGMA(param) do {} while(0); +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
Burst enqueue function will first check whether descriptors are cache aligned. It will also check prerequisites in the beginning. Burst enqueue function not support chained mbufs, single packet enqueue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 5074226f0..67889c80a 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ + sizeof(struct vring_packed_desc)) + #ifdef SUPPORT_GCC_UNROLL_PRAGMA #define PRAGMA_PARAM "GCC unroll 4" #endif @@ -57,6 +60,8 @@ #define UNROLL_PRAGMA(param) do {} while(0); #endif +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 2b5c47145..c664b27c5 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -895,6 +895,84 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused __rte_always_inline int +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + + uint64_t desc_addrs[PACKED_DESCS_BURST]; + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; + uint32_t buf_offset = dev->vhost_hlen; + uint64_t lens[PACKED_DESCS_BURST]; + + uint16_t i; + + if (unlikely(avail_idx & PACKED_BURST_MASK)) + return -1; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (unlikely(pkts[i]->next != NULL)) + return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + } + + rte_smp_rmb(); + + UNROLL_PRAGMA(PRAGMA_PARAM) + for 
(i = 0; i < PACKED_DESCS_BURST; i++) + lens[i] = descs[avail_idx + i].len; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) + return -1; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], + VHOST_ACCESS_RW); + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq->last_avail_idx += PACKED_DESCS_BURST; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile left space for shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index c664b27c5..047fa7dc8 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1580,6 +1580,61 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t dummy_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &dummy_len, + VHOST_ACCESS_RO) < 0)) { + return -1; + } + + *pkts = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add burst dequeue function like enqueue function for packed ring, burst dequeue function will not support chained descritpors, single packet dequeue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 67889c80a..9fa3c8adf 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -61,6 +61,7 @@ #endif #define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) +#define DESC_SINGLE_DEQUEUE (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT) /** * Structure contains buffer address, length and descriptor index diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 047fa7dc8..23c0f4685 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1580,6 +1580,121 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t avail_idx, uintptr_t *desc_addrs, uint16_t *ids) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint64_t lens[PACKED_DESCS_BURST]; + uint64_t buf_lens[PACKED_DESCS_BURST]; + uint32_t buf_offset = dev->vhost_hlen; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BURST_MASK)) + return -1; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + if (unlikely(descs[avail_idx + i].flags & + DESC_SINGLE_DEQUEUE)) + return -1; + } + + rte_smp_rmb(); + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + lens[i] = descs[avail_idx + i].len; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + 
UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_DESCS_BURST)) + return -1; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + return 0; + +free_buf: + for (i = 0; i < PACKED_DESCS_BURST; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_DESCS_BURST]; + uint16_t ids[PACKED_DESCS_BURST]; + int ret; + struct virtio_net_hdr *hdr; + uint16_t i; + + ret = vhost_dequeue_burst_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addrs, ids); + + if (ret) + return ret; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + } + + if (virtio_net_with_host_offload(dev)) { + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + vq->last_avail_idx += PACKED_DESCS_BURST; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + return 0; +} + 
static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, -- 2.17.1
Simplify the flush shadow used ring function names, as all shadow rings are reflected to used rings. No need to emphasize the ring type.
__rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, +flush_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { int i; @@ -159,7 +159,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, } static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, +update_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) { uint16_t i = vq->shadow_used_idx++; @@ -421,7 +421,7 @@ reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, VHOST_ACCESS_RW) < 0)) return -1; len = RTE_MIN(len, size); - update_shadow_used_ring_split(vq, head_idx, len); + update_shadow_split(vq, head_idx, len); size -= len; cur_idx++; @@ -597,7 +597,7 @@ reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return -1; len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); + update_shadow_packed(vq, buf_id, len, desc_count); size -= len; avail_idx += desc_count; @@ -888,7 +888,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_split(dev, vq); + flush_shadow_split(dev, vq); vhost_vring_call_split(dev, vq); } @@ -1046,7 +1046,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } @@ -1475,8 +1475,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, next = TAILQ_NEXT(zmbuf, next); if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_split(vq, - zmbuf->desc_idx, 0); + update_shadow_split(vq, zmbuf->desc_idx, 0); TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); restore_mbuf(zmbuf->mbuf); rte_pktmbuf_free(zmbuf->mbuf); @@ -1486,7 +1485,7 @@ virtio_dev_tx_split(struct virtio_net 
*dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_split(dev, vq); + flush_shadow_split(dev, vq); vhost_vring_call_split(dev, vq); } } @@ -1526,7 +1525,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, break; if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_split(vq, head_idx, 0); + update_shadow_split(vq, head_idx, 0); pkts[i] = rte_pktmbuf_alloc(mbuf_pool); if (unlikely(pkts[i] == NULL)) { @@ -1572,7 +1571,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_split(dev, vq); + flush_shadow_split(dev, vq); vhost_vring_call_split(dev, vq); } } @@ -1764,7 +1763,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, next = TAILQ_NEXT(zmbuf, next); if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, + update_shadow_packed(vq, zmbuf->desc_idx, 0, zmbuf->desc_count); @@ -1778,7 +1777,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } @@ -1804,7 +1803,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, + update_shadow_packed(vq, buf_id, 0, desc_count); pkts[i] = rte_pktmbuf_alloc(mbuf_pool); @@ -1857,7 +1856,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } -- 2.17.1
Buffer vhost enqueue shadow ring updates, and flush the shadow ring only when the number of buffered descriptors exceeds one burst. Thus virtio can receive packets at a faster frequency.
flush_shadow_packed(dev, vq); + } + } +} /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -799,6 +832,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, max_tries = 1; uint16_t num_buffers = 0; + uint32_t buffer_len[max_tries]; + uint16_t buffer_buf_id[max_tries]; + uint16_t buffer_desc_count[max_tries]; while (size > 0) { /* @@ -821,6 +857,10 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; + avail_idx += desc_count; if (avail_idx >= vq->size) avail_idx -= vq->size; @@ -835,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } + flush_enqueue_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } -- 2.17.1
Flush used flags when the burst enqueue function is finished. Descriptors' flags are pre-calculated, as they will be reset by vhost.
VIRTIO_RX_USED_FLAG; + else + flags = VIRTIO_RX_USED_WRAP_FLAG; + + flush_burst_packed(dev, vq, lens, ids, flags); +} + static __rte_always_inline void update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -950,6 +995,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; uint32_t buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_DESCS_BURST]; + uint16_t ids[PACKED_DESCS_BURST]; uint16_t i; @@ -1013,6 +1059,12 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + ids[i] = descs[avail_idx + i].id; + + flush_enqueue_burst_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer as many used ring updates as possible in the vhost dequeue function to coordinate with the virtio driver. To support buffering, the shadow used ring element should contain the descriptor index and its wrap counter. The first shadowed ring index is recorded for calculating the number of buffered descriptors.
vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + + if (vq->used_wrap_counter) + vq->desc_packed[vq->last_used_idx].flags = + VIRTIO_TX_USED_FLAG; + else + vq->desc_packed[vq->last_used_idx].flags = + VIRTIO_TX_USED_WRAP_FLAG; + + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1835,6 +1872,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; + update_dequeue_shadow_packed(vq, buf_id, desc_count); + vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { vq->last_avail_idx -= vq->size; -- 2.17.1
Vhost enqueue descriptors are updated by burst number, while vhost dequeue descriptors are buffered. Meanwhile in dequeue function only first descriptor is buffered. Due to these differences, split vhost enqueue and dequeue flush functions. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 94c1b8dc7..c5c86c219 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -92,8 +92,7 @@ update_shadow_split(struct vhost_virtqueue *vq, } static __rte_always_inline void -flush_shadow_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) +flush_enqueue_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { int i; uint16_t used_idx = vq->last_used_idx; @@ -158,6 +157,31 @@ flush_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +flush_dequeue_shadow_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + uint16_t head_idx = vq->dequeue_shadow_head; + uint16_t head_flags; + struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0]; + + if (used_elem->used_wrap_counter) + head_flags = VIRTIO_TX_USED_FLAG; + else + head_flags = VIRTIO_TX_USED_WRAP_FLAG; + + vq->desc_packed[head_idx].id = used_elem->id; + + rte_smp_wmb(); + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void update_shadow_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -200,6 +224,51 @@ flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t *ids) +{ + uint16_t flags = 0; + uint16_t i; + + if (vq->used_wrap_counter) + flags = 
VIRTIO_TX_USED_FLAG; + else + flags = VIRTIO_TX_USED_WRAP_FLAG; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 1; i < PACKED_DESCS_BURST; i++) + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 1; i < PACKED_DESCS_BURST; i++) { + rte_smp_wmb(); + vq->desc_packed[vq->last_used_idx + i].flags = flags; + } + + vq->shadow_used_idx = 1; + + vq->last_used_idx += PACKED_DESCS_BURST; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } else { + uint64_t lens[PACKED_DESCS_BURST] = {0}; + flush_burst_packed(dev, vq, lens, ids, flags); + } +} + static __rte_always_inline void flush_enqueue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t *lens, uint16_t *ids) @@ -309,10 +378,29 @@ flush_enqueue_packed(struct virtio_net *dev, if (vq->enqueue_shadow_count >= PACKED_DESCS_BURST) { do_data_copy_enqueue(dev, vq); - flush_shadow_packed(dev, vq); + flush_enqueue_shadow_packed(dev, vq); } } } + +static __rte_unused __rte_always_inline void +flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (!vq->shadow_used_idx) + return; + + int16_t shadow_count = vq->last_used_idx - vq->dequeue_shadow_head; + if (shadow_count <= 0) + shadow_count += vq->size; + + /* buffer used descs as many as possible when doing dequeue */ + if ((uint16_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != 
(val)) \ @@ -1178,7 +1266,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_packed(dev, vq); + flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } @@ -1810,6 +1898,7 @@ virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } + update_dequeue_burst_packed(dev, vq, ids); if (virtio_net_with_host_offload(dev)) { UNROLL_PRAGMA(PRAGMA_PARAM) for (i = 0; i < PACKED_DESCS_BURST; i++) { @@ -1911,7 +2000,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_packed(dev, vq); + flush_dequeue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } @@ -1990,7 +2079,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_packed(dev, vq); + flush_dequeue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } -- 2.17.1
Optimize the vhost device Tx datapath with separate functions. Packets that fit into one descriptor will be handled in bursts, and the others will be handled one by one as before.
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1108,7 +1050,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused __rte_always_inline int +static __rte_always_inline int virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) { @@ -1193,7 +1135,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) { @@ -1227,46 +1169,41 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + uint32_t remained = count; + uint16_t fetch_idx; + int ret; + struct vring_packed_desc *descs = vq->desc_packed; - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + do { + if ((vq->last_avail_idx & 0x7) == 0) { + 
fetch_idx = vq->last_avail_idx + 8; + rte_prefetch0((void *)(uintptr_t)&descs[fetch_idx]); } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); + if (remained >= PACKED_DESCS_BURST) { + ret = virtio_dev_rx_burst_packed(dev, vq, pkts); - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; - break; + if (!ret) { + pkt_idx += PACKED_DESCS_BURST; + remained -= PACKED_DESCS_BURST; + continue; + } } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) + break; - do_data_copy_enqueue(dev, vq); + pkt_idx++; + remained--; + + } while (pkt_idx < count); + + if (pkt_idx) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); + flush_enqueue_shadow_packed(dev, vq); + } - if (likely(vq->shadow_used_idx)) { - flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Optimize vhost zero copy dequeue path like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 2418b4e45..a8df74f87 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1909,6 +1909,144 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } +static __rte_unused __rte_always_inline int +virtio_dev_tx_burst_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_DESCS_BURST]; + uintptr_t desc_addrs[PACKED_DESCS_BURST]; + uint16_t ids[PACKED_DESCS_BURST]; + int ret; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + ret = vhost_dequeue_burst_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addrs, ids); + + if (ret) + return ret; + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + zmbufs[i] = get_zmbuf(vq); + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + if (!zmbufs[i]) + goto free_pkt; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + rte_mbuf_refcnt_update(pkts[i], 1); + + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_DESCS_BURST; + vq->last_avail_idx += PACKED_DESCS_BURST; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; + +free_pkt: + UNROLL_PRAGMA(PRAGMA_PARAM) + for (i = 0; i < PACKED_DESCS_BURST; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct 
vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + +static __rte_unused void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = VIRTIO_TX_USED_FLAG; + else + flags = VIRTIO_TX_USED_WRAP_FLAG; + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq->last_used_idx += zmbuf->desc_count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize the vhost device Rx datapath with separate functions. Non-chained, direct descriptors will be handled in bursts, and the others will be handled one by one as before.
__rte_always_inline int +static __rte_always_inline int virtio_dev_tx_burst_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1971,7 +1960,7 @@ virtio_dev_tx_burst_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) @@ -2006,7 +1995,7 @@ virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, return 0; } -static __rte_unused void +static __rte_always_inline void free_zmbuf(struct vhost_virtqueue *vq) { struct zcopy_mbuf *next = NULL; @@ -2048,120 +2037,97 @@ free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; + uint32_t pkt_idx = 0; + uint32_t remained = count; + int ret; - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + free_zmbuf(vq); - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + do { + if (remained >= PACKED_DESCS_BURST) { + ret = virtio_dev_tx_burst_packed_zmbuf(dev, vq, + mbuf_pool, + &pkts[pkt_idx]); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + if (!ret) { + pkt_idx += PACKED_DESCS_BURST; + remained -= PACKED_DESCS_BURST; + continue; } } - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - 
VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t dummy_len; - uint16_t desc_count, nr_vec = 0; - int err; + pkt_idx++; + remained--; + } while (remained); - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &dummy_len, - VHOST_ACCESS_RO) < 0)) - break; + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_packed(vq, buf_id, 0, - desc_count); + return pkt_idx; +} - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; - } +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; + uint16_t fetch_idx; + int ret; + struct vring_packed_desc *descs = vq->desc_packed; - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; + do { + if ((vq->last_avail_idx & 0x7) == 0) { + fetch_idx = vq->last_avail_idx + 8; + rte_prefetch0((void *)(uintptr_t)&descs[fetch_idx]); } - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; + if (remained >= PACKED_DESCS_BURST) { + ret = virtio_dev_tx_burst_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx]); - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; + if (!ret) { + flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_DESCS_BURST; + remained -= PACKED_DESCS_BURST; + continue; } - zmbuf->mbuf = pkts[i]; - 
zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; + } - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. - */ - rte_mbuf_refcnt_update(pkts[i], 1); + /* + * If remained descs can't bundled into one burst, just skip to + * next round. + */ + if (((vq->last_avail_idx & PACKED_BURST_MASK) + remained) < + PACKED_DESCS_BURST) + break; - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } + pkt_idx++; + remained--; + flush_dequeue_packed(dev, vq); - if (likely(dev->dequeue_zero_copy == 0)) { - do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } + } while (remained); + + if (pkt_idx) { + if (vq->shadow_used_idx) + do_data_copy_dequeue(vq); } - return i; + return pkt_idx; } + uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) @@ -2235,9 +2201,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
Cache address translation result and use it in next translation. Due to limited regions are supported, buffers are most likely in same region when doing data transmission. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index 7fb172912..d90235cd6 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -91,10 +91,18 @@ struct rte_vhost_mem_region { int fd; }; +struct rte_vhost_mem_region_cache { + uint64_t guest_phys_addr; + uint64_t guest_phys_addr_end; + int64_t host_user_addr_offset; + uint64_t size; +}; + /** * Memory structure includes region and mapping information. */ struct rte_vhost_memory { + struct rte_vhost_mem_region_cache cache_region; uint32_t nregions; struct rte_vhost_mem_region regions[]; }; @@ -232,11 +240,30 @@ rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, struct rte_vhost_mem_region *r; uint32_t i; + struct rte_vhost_mem_region_cache *r_cache; + /* check with cached region */ + r_cache = &mem->cache_region; + if (likely(gpa >= r_cache->guest_phys_addr && gpa < + r_cache->guest_phys_addr_end)) { + if (unlikely(*len > r_cache->guest_phys_addr_end - gpa)) + *len = r_cache->guest_phys_addr_end - gpa; + + return gpa - r_cache->host_user_addr_offset; + } + + for (i = 0; i < mem->nregions; i++) { r = &mem->regions[i]; if (gpa >= r->guest_phys_addr && gpa < r->guest_phys_addr + r->size) { + r_cache->guest_phys_addr = r->guest_phys_addr; + r_cache->guest_phys_addr_end = r->guest_phys_addr + + r->size; + r_cache->size = r->size; + r_cache->host_user_addr_offset = r->guest_phys_addr - + r->host_user_addr; + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) *len = r->guest_phys_addr + r->size - gpa; -- 2.17.1
Disable software pre-fetch actions on Skylake and Cascadelake platforms. Hardware can fetch needed data for vhost, additional software pre-fetch will have impact on performance. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 30839a001..5f3b42e56 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep AVX512F) + +ifneq ($(AVX512_SUPPORT),) +CFLAGS += -DDISABLE_SWPREFETCH +endif + ifeq ($(RTE_TOOLCHAIN), gcc) ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 066514e43..357517cdd 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -478,7 +478,9 @@ map_one_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(!desc_addr)) return -1; +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addr); +#endif buf_vec[vec_id].buf_iova = desc_iova; buf_vec[vec_id].buf_addr = desc_addr; @@ -999,7 +1001,9 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, */ rte_smp_rmb(); +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); +#endif for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; @@ -1093,7 +1097,9 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, UNROLL_PRAGMA(PRAGMA_PARAM) for (i = 0; i < PACKED_DESCS_BURST; i++) { +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); +#endif hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)desc_addrs[i]; lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; } @@ -1647,7 +1653,9 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue 
*vq, */ rte_smp_rmb(); +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); +#endif VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); @@ -1818,7 +1826,9 @@ virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, UNROLL_PRAGMA(PRAGMA_PARAM) for (i = 0; i < PACKED_DESCS_BURST; i++) { +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); +#endif rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 357517cdd..a7bb4ec79 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -213,6 +219,30 @@ flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_burst_packed_inorder(struct vhost_virtqueue *vq, uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + + } + + vq->last_used_idx += PACKED_DESCS_BURST; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static __rte_always_inline void update_dequeue_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t *ids) @@ -315,7 +345,6 @@ update_dequeue_shadow_packed(struct vhost_virtqueue *vq, uint16_t buf_id, else vq->desc_packed[vq->last_used_idx].flags = VIRTIO_TX_USED_WRAP_FLAG; - } vq->last_used_idx += count; @@ -326,6 +355,31 @@ update_dequeue_shadow_packed(struct vhost_virtqueue *vq, uint16_t buf_id, } } +static __rte_always_inline void +update_dequeue_shadow_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, uint16_t count) +{ + 
vq->shadow_used_packed[0].id = buf_id; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -1834,7 +1888,12 @@ virtio_dev_tx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } - update_dequeue_burst_packed(dev, vq, ids); + if (virtio_net_is_inorder(dev)) + update_dequeue_burst_packed_inorder(vq, + ids[PACKED_BURST_MASK]); + else + update_dequeue_burst_packed(dev, vq, ids); + if (virtio_net_with_host_offload(dev)) { UNROLL_PRAGMA(PRAGMA_PARAM) for (i = 0; i < PACKED_DESCS_BURST; i++) { @@ -1897,7 +1956,10 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; - update_dequeue_shadow_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + update_dequeue_shadow_packed_inorder(vq, buf_id, desc_count); + else + update_dequeue_shadow_packed(vq, buf_id, desc_count); vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { -- 2.17.1
On Fri, Sep 20, 2019 at 12:36:29AM +0800, Marvin Liu wrote: > Add macro for unifying Clang/ICC/GCC unroll pragma format. Burst > functions were contained of several small loops which optimized by > compiler’s loop unrolling pragma. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > index 8623e91c0..30839a001 100644 > --- a/lib/librte_vhost/Makefile > +++ b/lib/librte_vhost/Makefile > @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user > CFLAGS += -fno-strict-aliasing > LDLIBS += -lpthread > > +ifeq ($(RTE_TOOLCHAIN), gcc) > +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) > +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > +endif > +endif > + > +ifeq ($(RTE_TOOLCHAIN), clang) > +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) > +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA > +endif > +endif > + > +ifeq ($(RTE_TOOLCHAIN), icc) > +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) > +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA > +endif > +endif > + > ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) > LDLIBS += -lnuma > endif You need to add meson support as well. > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 884befa85..5074226f0 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,24 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#ifdef SUPPORT_GCC_UNROLL_PRAGMA > +#define PRAGMA_PARAM "GCC unroll 4" The name "PRAGMA_PARAM" is too generic. > +#endif > + > +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA > +#define PRAGMA_PARAM "unroll 4" > +#endif > + > +#ifdef SUPPORT_ICC_UNROLL_PRAGMA > +#define PRAGMA_PARAM "unroll (4)" > +#endif > + > +#ifdef PRAGMA_PARAM > +#define UNROLL_PRAGMA(param) _Pragma(param) > +#else > +#define UNROLL_PRAGMA(param) do {} while(0); > +#endif > + > /** > * Structure contains buffer address, length and descriptor index > * from vring to do scatter RX. > -- > 2.17.1 >
On Fri, Sep 20, 2019 at 12:36:30AM +0800, Marvin Liu wrote: > Burst enqueue function will first check whether descriptors are cache > aligned. It will also check prerequisites in the beginning. Burst > enqueue function not support chained mbufs, single packet enqueue > function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 5074226f0..67889c80a 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > + sizeof(struct vring_packed_desc)) > + > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > #define PRAGMA_PARAM "GCC unroll 4" > #endif > @@ -57,6 +60,8 @@ > #define UNROLL_PRAGMA(param) do {} while(0); > #endif > > +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) It's better to have consistent names, e.g. something like: PACKED_BATCH_SIZE PACKED_BATCH_MASK instead of PACKED_DESCS_BURST PACKED_BURST_MASK Besides, please also put above two defines together. > + > /** > * Structure contains buffer address, length and descriptor index > * from vring to do scatter RX. 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 2b5c47145..c664b27c5 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -895,6 +895,84 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return pkt_idx; > } > > +static __rte_unused __rte_always_inline int > +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > + struct rte_mbuf **pkts) > +{ > + bool wrap_counter = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + uint16_t avail_idx = vq->last_avail_idx; > + > + uint64_t desc_addrs[PACKED_DESCS_BURST]; > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint64_t lens[PACKED_DESCS_BURST]; > + > + uint16_t i; > + > + if (unlikely(avail_idx & PACKED_BURST_MASK)) > + return -1; We also need to check (avail_idx + PACKED_DESCS_BURST) <= vq->size before accessing descs[avail_idx + i] in the following code. 
> + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + if (unlikely(pkts[i]->next != NULL)) > + return -1; > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > + wrap_counter))) > + return -1; > + } > + > + rte_smp_rmb(); > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + lens[i] = descs[avail_idx + i].len; > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > + return -1; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], > + VHOST_ACCESS_RW); > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > + return -1; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)desc_addrs[i]; > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > + > + vq->last_avail_idx += PACKED_DESCS_BURST; > + if (vq->last_avail_idx >= vq->size) { > + vq->last_avail_idx -= vq->size; > + vq->avail_wrap_counter ^= 1; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + pkts[i]->pkt_len); > + } > + > + return 0; > +} > + > static __rte_unused int16_t > virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > struct rte_mbuf *pkt) > -- > 2.17.1 >
On Fri, Sep 20, 2019 at 12:36:33AM +0800, Marvin Liu wrote: > Simplify flush shadow used ring function names as all shadow rings are > reflect to used rings. No need to emphasize ring type. I think the old name "flush_shadow_used_ring" is more readable than the new name "flush_shadow". In the new name, it's not clear what the "shadow" is. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 23c0f4685..ebd6c175d 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -38,7 +38,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) > } > > static __rte_always_inline void > -do_flush_shadow_used_ring_split(struct virtio_net *dev, > +do_flush_shadow_split(struct virtio_net *dev, > struct vhost_virtqueue *vq, > uint16_t to, uint16_t from, uint16_t size) > { > @@ -51,22 +51,22 @@ do_flush_shadow_used_ring_split(struct virtio_net *dev, > } > > static __rte_always_inline void > -flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) > +flush_shadow_split(struct virtio_net *dev, struct vhost_virtqueue *vq) > { > uint16_t used_idx = vq->last_used_idx & (vq->size - 1); > > if (used_idx + vq->shadow_used_idx <= vq->size) { > - do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, > + do_flush_shadow_split(dev, vq, used_idx, 0, > vq->shadow_used_idx); > } else { > uint16_t size; > > /* update used ring interval [used_idx, vq->size] */ > size = vq->size - used_idx; > - do_flush_shadow_used_ring_split(dev, vq, used_idx, 0, size); > + do_flush_shadow_split(dev, vq, used_idx, 0, size); > > /* update the left half used ring interval [0, left_size] */ > - do_flush_shadow_used_ring_split(dev, vq, 0, size, > + do_flush_shadow_split(dev, vq, 0, size, > vq->shadow_used_idx - size); > } > vq->last_used_idx += vq->shadow_used_idx; > @@ -82,7 +82,7 @@ flush_shadow_used_ring_split(struct virtio_net *dev, struct vhost_virtqueue *vq) > 
} > > static __rte_always_inline void > -update_shadow_used_ring_split(struct vhost_virtqueue *vq, > +update_shadow_split(struct vhost_virtqueue *vq, > uint16_t desc_idx, uint32_t len) > { > uint16_t i = vq->shadow_used_idx++; > @@ -92,7 +92,7 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, > } > > static __rte_always_inline void > -flush_shadow_used_ring_packed(struct virtio_net *dev, > +flush_shadow_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq) > { > int i; > @@ -159,7 +159,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, > } > > static __rte_always_inline void > -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, > +update_shadow_packed(struct vhost_virtqueue *vq, > uint16_t desc_idx, uint32_t len, uint16_t count) > { > uint16_t i = vq->shadow_used_idx++; > @@ -421,7 +421,7 @@ reserve_avail_buf_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > VHOST_ACCESS_RW) < 0)) > return -1; > len = RTE_MIN(len, size); > - update_shadow_used_ring_split(vq, head_idx, len); > + update_shadow_split(vq, head_idx, len); > size -= len; > > cur_idx++; > @@ -597,7 +597,7 @@ reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > return -1; > > len = RTE_MIN(len, size); > - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); > + update_shadow_packed(vq, buf_id, len, desc_count); > size -= len; > > avail_idx += desc_count; > @@ -888,7 +888,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > do_data_copy_enqueue(dev, vq); > > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_split(dev, vq); > + flush_shadow_split(dev, vq); > vhost_vring_call_split(dev, vq); > } > > @@ -1046,7 +1046,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > do_data_copy_enqueue(dev, vq); > > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_packed(dev, vq); > + flush_shadow_packed(dev, vq); > vhost_vring_call_packed(dev, vq); > } > > @@ -1475,8 
+1475,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > next = TAILQ_NEXT(zmbuf, next); > > if (mbuf_is_consumed(zmbuf->mbuf)) { > - update_shadow_used_ring_split(vq, > - zmbuf->desc_idx, 0); > + update_shadow_split(vq, zmbuf->desc_idx, 0); > TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); > restore_mbuf(zmbuf->mbuf); > rte_pktmbuf_free(zmbuf->mbuf); > @@ -1486,7 +1485,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > } > > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_split(dev, vq); > + flush_shadow_split(dev, vq); > vhost_vring_call_split(dev, vq); > } > } > @@ -1526,7 +1525,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > break; > > if (likely(dev->dequeue_zero_copy == 0)) > - update_shadow_used_ring_split(vq, head_idx, 0); > + update_shadow_split(vq, head_idx, 0); > > pkts[i] = rte_pktmbuf_alloc(mbuf_pool); > if (unlikely(pkts[i] == NULL)) { > @@ -1572,7 +1571,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > if (unlikely(i < count)) > vq->shadow_used_idx = i; > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_split(dev, vq); > + flush_shadow_split(dev, vq); > vhost_vring_call_split(dev, vq); > } > } > @@ -1764,7 +1763,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > next = TAILQ_NEXT(zmbuf, next); > > if (mbuf_is_consumed(zmbuf->mbuf)) { > - update_shadow_used_ring_packed(vq, > + update_shadow_packed(vq, > zmbuf->desc_idx, > 0, > zmbuf->desc_count); > @@ -1778,7 +1777,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > } > > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_packed(dev, vq); > + flush_shadow_packed(dev, vq); > vhost_vring_call_packed(dev, vq); > } > } > @@ -1804,7 +1803,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > break; > > if (likely(dev->dequeue_zero_copy == 0)) > - 
update_shadow_used_ring_packed(vq, buf_id, 0, > + update_shadow_packed(vq, buf_id, 0, > desc_count); > > pkts[i] = rte_pktmbuf_alloc(mbuf_pool); > @@ -1857,7 +1856,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > if (unlikely(i < count)) > vq->shadow_used_idx = i; > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_packed(dev, vq); > + flush_shadow_packed(dev, vq); > vhost_vring_call_packed(dev, vq); > } > } > -- > 2.17.1 >
Hi Marvin,
A general comment for the series, could you mark V1 Superseded?
/Gavin
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu
> Sent: Friday, September 20, 2019 12:36 AM
> To: maxime.coquelin@redhat.com; tiwei.bie@intel.com;
> zhihong.wang@intel.com
> Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com>
> Subject: [dpdk-dev] [PATCH v2 00/16] vhost packed ring performance
> optimization
>
> Packed ring has more compact ring format and thus can significantly
> reduce the number of cache miss. It can lead to better performance.
> This has been proven in the virtio user driver; on a normal E5 Xeon CPU,
> single-core performance can rise by 12%.
>
> http://mails.dpdk.org/archives/dev/2018-April/095470.html
>
> However vhost performance with packed ring performance was decreased.
> Through analysis, mostly extra cost was from the calculating of each
> descriptor flag which depended on ring wrap counter. Moreover, both
> frontend and backend need to write same descriptors which will cause
> cache contention. Especially when doing vhost enqueue function, virtio
> refill packed ring function may write same cache line when vhost doing
> enqueue function. This kind of extra cache cost will reduce the benefit
> of reducing cache misses.
>
> For optimizing vhost packed ring performance, vhost enqueue and dequeue
> function will be split into fast and normal paths.
>
> Several methods will be taken in fast path:
> Unroll burst loop function into more pieces.
> Handle descriptors in one cache line simultaneously.
> Prerequisite check whether I/O space can copy directly into mbuf
> space and vice versa.
> Prerequisite check whether descriptor mapping is successful.
> Distinguish vhost used ring update function by enqueue and dequeue
> function.
> Buffer dequeue used descriptors as many as possible.
> Update enqueue used descriptors by cache line.
> Cache memory region structure for fast conversion.
> Disable software prefetch if hardware can do better.
>
> After all these methods done, single core vhost PvP performance with 64B
> packet on Xeon 8180 can boost 40%.
>
> v2:
> - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc
> - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST)
> - Optimize dequeue used ring update when in_order negotiated
>
> Marvin Liu (16):
> vhost: add single packet enqueue function
> vhost: unify unroll pragma parameter
> vhost: add burst enqueue function for packed ring
> vhost: add single packet dequeue function
> vhost: add burst dequeue function
> vhost: rename flush shadow used ring functions
> vhost: flush vhost enqueue shadow ring by burst
> vhost: add flush function for burst enqueue
> vhost: buffer vhost dequeue shadow ring
> vhost: split enqueue and dequeue flush functions
> vhost: optimize enqueue function of packed ring
> vhost: add burst and single zero dequeue functions
> vhost: optimize dequeue function of packed ring
> vhost: cache address translation result
> vhost: check whether disable software pre-fetch
> vhost: optimize packed ring dequeue when in-order
>
> lib/librte_vhost/Makefile | 24 +
> lib/librte_vhost/rte_vhost.h | 27 +
> lib/librte_vhost/vhost.h | 33 +
> lib/librte_vhost/virtio_net.c | 1071 +++++++++++++++++++++++++++------
> 4 files changed, 960 insertions(+), 195 deletions(-)
>
> --
> 2.17.1
IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Hi Marvin, > -----Original Message----- > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > Sent: Friday, September 20, 2019 12:36 AM > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > Subject: [dpdk-dev] [PATCH v2 01/16] vhost: add single packet enqueue > function > > Add vhost enqueue function for single packet and meanwhile left space > for flush used ring function. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 5b85b832d..2b5c47145 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -774,6 +774,70 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct > vhost_virtqueue *vq, > return error; > } > > +/* > + * Returns -1 on fail, 0 on success > + */ > +static __rte_always_inline int > +vhost_enqueue_single_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t > *nr_descs) > +{ > + uint16_t nr_vec = 0; > + > + uint16_t avail_idx; > + uint16_t max_tries, tries = 0; > + > + uint16_t buf_id = 0; > + uint32_t len = 0; > + uint16_t desc_count; > + > + uint32_t size = pkt->pkt_len + dev->vhost_hlen; > + avail_idx = vq->last_avail_idx; > + > + if (rxvq_is_mergeable(dev)) > + max_tries = vq->size - 1; > + else > + max_tries = 1; > + > + uint16_t num_buffers = 0; > + > + while (size > 0) { > + /* > + * if we tried all available ring items, and still > + * can't get enough buf, it means something abnormal > + * happened. 
> + */ > + if (unlikely(++tries > max_tries)) > + return -1; > + > + if (unlikely(fill_vec_buf_packed(dev, vq, > + avail_idx, &desc_count, > + buf_vec, &nr_vec, > + &buf_id, &len, > + VHOST_ACCESS_RW) < 0)) { > + return -1; > + } > + > + len = RTE_MIN(len, size); > + > + size -= len; > + > + avail_idx += desc_count; > + if (avail_idx >= vq->size) > + avail_idx -= vq->size; > + > + *nr_descs += desc_count; > + num_buffers += 1; > + } > + > + if (copy_mbuf_to_desc(dev, vq, pkt, > + buf_vec, nr_vec, > + num_buffers) < 0) { > + return 0; Why return 0, which means success, while "copy_mbuf_to_desc" encounters some problems and failed? /Gavin > + } > + > + return 0; > +} > + > static __rte_noinline uint32_t > virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > struct rte_mbuf **pkts, uint32_t count) > @@ -831,6 +895,35 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > vhost_virtqueue *vq, > return pkt_idx; > } > > +static __rte_unused int16_t > +virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue > *vq, > + struct rte_mbuf *pkt) > +{ > + struct buf_vector buf_vec[BUF_VECTOR_MAX]; > + uint16_t nr_descs = 0; > + > + rte_smp_rmb(); > + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, > + &nr_descs) < 0)) { > + VHOST_LOG_DEBUG(VHOST_DATA, > + "(%d) failed to get enough desc from > vring\n", > + dev->vid); > + return -1; > + } > + > + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end > index %d\n", > + dev->vid, vq->last_avail_idx, > + vq->last_avail_idx + nr_descs); > + > + vq->last_avail_idx += nr_descs; > + if (vq->last_avail_idx >= vq->size) { > + vq->last_avail_idx -= vq->size; > + vq->avail_wrap_counter ^= 1; > + } > + > + return 0; > +} > + > static __rte_noinline uint32_t > virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > struct rte_mbuf **pkts, uint32_t count) > -- > 2.17.1 IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be 
privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
> -----Original Message----- > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > Sent: Monday, September 23, 2019 5:09 PM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > Cc: dev@dpdk.org > Subject: RE: [dpdk-dev] [PATCH v2 01/16] vhost: add single packet enqueue > function > > Hi Marvin, > > > -----Original Message----- > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > Sent: Friday, September 20, 2019 12:36 AM > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > zhihong.wang@intel.com > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > Subject: [dpdk-dev] [PATCH v2 01/16] vhost: add single packet enqueue > > function > > > > Add vhost enqueue function for single packet and meanwhile left space > > for flush used ring function. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index 5b85b832d..2b5c47145 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -774,6 +774,70 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct > > vhost_virtqueue *vq, > > return error; > > } > > > > +/* > > + * Returns -1 on fail, 0 on success > > + */ > > +static __rte_always_inline int > > +vhost_enqueue_single_packed(struct virtio_net *dev, struct > > vhost_virtqueue *vq, > > + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t > > *nr_descs) > > +{ > > + uint16_t nr_vec = 0; > > + > > + uint16_t avail_idx; > > + uint16_t max_tries, tries = 0; > > + > > + uint16_t buf_id = 0; > > + uint32_t len = 0; > > + uint16_t desc_count; > > + > > + uint32_t size = pkt->pkt_len + dev->vhost_hlen; > > + avail_idx = vq->last_avail_idx; > > + > > + if (rxvq_is_mergeable(dev)) > > + max_tries = vq->size - 1; > > + else > > + max_tries = 1; > > + > > + uint16_t num_buffers = 0; > > + > > + while (size > 0) { > > + /* > > + * 
if we tried all available ring items, and still > > + * can't get enough buf, it means something abnormal > > + * happened. > > + */ > > + if (unlikely(++tries > max_tries)) > > + return -1; > > + > > + if (unlikely(fill_vec_buf_packed(dev, vq, > > + avail_idx, &desc_count, > > + buf_vec, &nr_vec, > > + &buf_id, &len, > > + VHOST_ACCESS_RW) < 0)) { > > + return -1; > > + } > > + > > + len = RTE_MIN(len, size); > > + > > + size -= len; > > + > > + avail_idx += desc_count; > > + if (avail_idx >= vq->size) > > + avail_idx -= vq->size; > > + > > + *nr_descs += desc_count; > > + num_buffers += 1; > > + } > > + > > + if (copy_mbuf_to_desc(dev, vq, pkt, > > + buf_vec, nr_vec, > > + num_buffers) < 0) { > > + return 0; > Why return 0, which means success, while "copy_mbuf_to_desc" encounters > some problems and failed? > /Gavin Gavin, Thanks for notice this typo issue. Here should return negative value -1. Regards, Marvin > > + } > > + > > + return 0; > > +} > > + > > static __rte_noinline uint32_t > > virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > > struct rte_mbuf **pkts, uint32_t count) > > @@ -831,6 +895,35 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > > vhost_virtqueue *vq, > > return pkt_idx; > > } > > > > +static __rte_unused int16_t > > +virtio_dev_rx_single_packed(struct virtio_net *dev, struct > vhost_virtqueue > > *vq, > > + struct rte_mbuf *pkt) > > +{ > > + struct buf_vector buf_vec[BUF_VECTOR_MAX]; > > + uint16_t nr_descs = 0; > > + > > + rte_smp_rmb(); > > + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, > > + &nr_descs) < 0)) { > > + VHOST_LOG_DEBUG(VHOST_DATA, > > + "(%d) failed to get enough desc from > > vring\n", > > + dev->vid); > > + return -1; > > + } > > + > > + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end > > index %d\n", > > + dev->vid, vq->last_avail_idx, > > + vq->last_avail_idx + nr_descs); > > + > > + vq->last_avail_idx += nr_descs; > > + if (vq->last_avail_idx >= vq->size) { > > 
+ vq->last_avail_idx -= vq->size; > > + vq->avail_wrap_counter ^= 1; > > + } > > + > > + return 0; > > +} > > + > > static __rte_noinline uint32_t > > virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > > struct rte_mbuf **pkts, uint32_t count) > > -- > > 2.17.1 > > IMPORTANT NOTICE: The contents of this email and any attachments are > confidential and may also be privileged. If you are not the intended > recipient, please notify the sender immediately and do not disclose the > contents to any other person, use it for any purpose, or store or copy the > information in any medium. Thank you.
Sure, have changed state of V1.
> -----Original Message-----
> From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com]
> Sent: Monday, September 23, 2019 5:05 PM
> To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei
> <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 00/16] vhost packed ring performance
> optimization
>
> Hi Marvin,
>
> A general comment for the series, could you mark V1 Superseded?
>
> /Gavin
>
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu
> > Sent: Friday, September 20, 2019 12:36 AM
> > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com;
> > zhihong.wang@intel.com
> > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com>
> > Subject: [dpdk-dev] [PATCH v2 00/16] vhost packed ring performance
> > optimization
> >
> > Packed ring has more compact ring format and thus can significantly
> > reduce the number of cache miss. It can lead to better performance.
> > This has been proven in virtio user driver, on normal E5 Xeon cpu
> > single core performance can raise 12%.
> >
> > http://mails.dpdk.org/archives/dev/2018-April/095470.html
> >
> > However vhost performance with packed ring performance was decreased.
> > Through analysis, mostly extra cost was from the calculating of each
> > descriptor flag which depended on ring wrap counter. Moreover, both
> > frontend and backend need to write same descriptors which will cause
> > cache contention. Especially when doing vhost enqueue function, virtio
> > refill packed ring function may write same cache line when vhost doing
> > enqueue function. This kind of extra cache cost will reduce the benefit
> > of reducing cache misses.
> >
> > For optimizing vhost packed ring performance, vhost enqueue and dequeue
> > function will be split into fast and normal paths.
> >
> > Several methods will be taken in fast path:
> > Unroll burst loop function into more pieces.
> > Handle descriptors in one cache line simultaneously.
> > Prerequisite check that whether I/O space can copy directly into mbuf
> > space and vice versa.
> > Prerequisite check that whether descriptor mapping is successful.
> > Distinguish vhost used ring update function by enqueue and dequeue
> > function.
> > Buffer dequeue used descriptors as many as possible.
> > Update enqueue used descriptors by cache line.
> > Cache memory region structure for fast conversion.
> > Disable software prefetch if hardware can do better.
> >
> > After all these methods done, single core vhost PvP performance with 64B
> > packet on Xeon 8180 can boost 40%.
> >
> > v2:
> > - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc
> > - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST)
> > - Optimize dequeue used ring update when in_order negotiated
> >
> > Marvin Liu (16):
> > vhost: add single packet enqueue function
> > vhost: unify unroll pragma parameter
> > vhost: add burst enqueue function for packed ring
> > vhost: add single packet dequeue function
> > vhost: add burst dequeue function
> > vhost: rename flush shadow used ring functions
> > vhost: flush vhost enqueue shadow ring by burst
> > vhost: add flush function for burst enqueue
> > vhost: buffer vhost dequeue shadow ring
> > vhost: split enqueue and dequeue flush functions
> > vhost: optimize enqueue function of packed ring
> > vhost: add burst and single zero dequeue functions
> > vhost: optimize dequeue function of packed ring
> > vhost: cache address translation result
> > vhost: check whether disable software pre-fetch
> > vhost: optimize packed ring dequeue when in-order
> >
> > lib/librte_vhost/Makefile | 24 +
> > lib/librte_vhost/rte_vhost.h | 27 +
> > lib/librte_vhost/vhost.h | 33 +
> > lib/librte_vhost/virtio_net.c | 1071 +++++++++++++++++++++++++++------
> > 4 files changed, 960 insertions(+), 195 deletions(-)
> >
> > --
> > 2.17.1
>
> IMPORTANT NOTICE: The contents of this email and any attachments are
> confidential and may also be privileged. If you are not the intended
> recipient, please notify the sender immediately and do not disclose the
> contents to any other person, use it for any purpose, or store or copy the
> information in any medium. Thank you.
Hi Marvin, One general comment and other comments inline: 1. Meson build should also be supported as Makefile is phasing out and Meson is the future in DPDK. /Gavin > -----Original Message----- > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > Sent: Friday, September 20, 2019 12:36 AM > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > Subject: [dpdk-dev] [PATCH v2 02/16] vhost: unify unroll pragma parameter > > Add macro for unifying Clang/ICC/GCC unroll pragma format. Burst > functions were contained of several small loops which optimized by > compiler’s loop unrolling pragma. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > index 8623e91c0..30839a001 100644 > --- a/lib/librte_vhost/Makefile > +++ b/lib/librte_vhost/Makefile > @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user > CFLAGS += -fno-strict-aliasing > LDLIBS += -lpthread > > +ifeq ($(RTE_TOOLCHAIN), gcc) > +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) It is better to move this toolchain version related definition to eg: mk/toolchain/icc/rte.toolchain-compat.mk. There are a lot of similar stuff over there. Although "CFLAGS" was added to sth under this subfolder, it still applies globally to other components. /Gavin > +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > +endif > +endif > + > +ifeq ($(RTE_TOOLCHAIN), clang) > +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) - > ge 37 && echo 1), 1) > +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA Why not combine all the three "-DSUPPORT_*_UNROLL_PRAGMA" into one "-DSUPPORT_ UNROLL_PRAGMA" for simplicity? Any differences for the support by different compilers? 
/Gavin > +endif > +endif > + > +ifeq ($(RTE_TOOLCHAIN), icc) > +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) > +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA > +endif > +endif > + > ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) > LDLIBS += -lnuma > endif > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 884befa85..5074226f0 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,24 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#ifdef SUPPORT_GCC_UNROLL_PRAGMA > +#define PRAGMA_PARAM "GCC unroll 4" > +#endif > + > +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA > +#define PRAGMA_PARAM "unroll 4" > +#endif > + > +#ifdef SUPPORT_ICC_UNROLL_PRAGMA > +#define PRAGMA_PARAM "unroll (4)" > +#endif > + > +#ifdef PRAGMA_PARAM > +#define UNROLL_PRAGMA(param) _Pragma(param) > +#else > +#define UNROLL_PRAGMA(param) do {} while(0); > +#endif > + > /** > * Structure contains buffer address, length and descriptor index > * from vring to do scatter RX. > -- > 2.17.1 IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
Hi Marvin, Is it possible to vectorize the processing? Other comments inline: /Gavin > -----Original Message----- > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > Sent: Friday, September 20, 2019 12:37 AM > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > Subject: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue function for > packed ring > > Burst enqueue function will first check whether descriptors are cache > aligned. It will also check prerequisites in the beginning. Burst > enqueue function not support chained mbufs, single packet enqueue > function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 5074226f0..67889c80a 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > + sizeof(struct vring_packed_desc)) > + > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > #define PRAGMA_PARAM "GCC unroll 4" > #endif > @@ -57,6 +60,8 @@ > #define UNROLL_PRAGMA(param) do {} while(0); > #endif > > +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) > + > /** > * Structure contains buffer address, length and descriptor index > * from vring to do scatter RX. > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 2b5c47145..c664b27c5 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -895,6 +895,84 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > vhost_virtqueue *vq, > return pkt_idx; > } > > +static __rte_unused __rte_always_inline int I remember "__rte_always_inline" should start at the first and separate line, otherwise you will get a style issue. 
/Gavin > +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue > *vq, > + struct rte_mbuf **pkts) > +{ > + bool wrap_counter = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + uint16_t avail_idx = vq->last_avail_idx; > + > + uint64_t desc_addrs[PACKED_DESCS_BURST]; > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint64_t lens[PACKED_DESCS_BURST]; > + > + uint16_t i; > + > + if (unlikely(avail_idx & PACKED_BURST_MASK)) > + return -1; > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + if (unlikely(pkts[i]->next != NULL)) > + return -1; > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > + wrap_counter))) > + return -1; > + } > + > + rte_smp_rmb(); > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + lens[i] = descs[avail_idx + i].len; Looks like the code is a strong candidate for vectorization. > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > + return -1; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], > + VHOST_ACCESS_RW); > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > + return -1; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)desc_addrs[i]; > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > + A store barrier here is missing, last_avail_idx may be observed before the above enqueue 
completion on weak memory order architectures. For x86, a compiler barrier is also required. > + vq->last_avail_idx += PACKED_DESCS_BURST; > + if (vq->last_avail_idx >= vq->size) { > + vq->last_avail_idx -= vq->size; > + vq->avail_wrap_counter ^= 1; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + pkts[i]->pkt_len); > + } > + > + return 0; > +} > + > static __rte_unused int16_t > virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue > *vq, > struct rte_mbuf *pkt) > -- > 2.17.1
This code could be more concise. > +/* > + * Returns -1 on fail, 0 on success > + */ That is standard convention, and probably doesn't need a comment. > +static inline int > +vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t *nr_descs) > +{ > + uint16_t nr_vec = 0; > + > + uint16_t avail_idx; > + uint16_t max_tries, tries = 0; > + > + uint16_t buf_id = 0; > + uint32_t len = 0; > + uint16_t desc_count; You don't need extra blank lines in declarations. > + > + uint32_t size = pkt->pkt_len + dev->vhost_hlen; > + avail_idx = vq->last_avail_idx; > + > + if (rxvq_is_mergeable(dev)) > + max_tries = vq->size - 1; > + else > + max_tries = 1; > + > + uint16_t num_buffers = 0; > + > + while (size > 0) { > + /* > + * if we tried all available ring items, and still > + * can't get enough buf, it means something abnormal > + * happened. > + */ > + if (unlikely(++tries > max_tries)) > + return -1; > + > + if (unlikely(fill_vec_buf_packed(dev, vq, > + avail_idx, &desc_count, > + buf_vec, &nr_vec, > + &buf_id, &len, > + VHOST_ACCESS_RW) < 0)) { > + return -1; > + } Brackets are not necessary on single statement return. > + len = RTE_MIN(len, size); > + > + size -= len; No need for blank line in between. > + > + avail_idx += desc_count; > + if (avail_idx >= vq->size) > + avail_idx -= vq->size; > + > + *nr_descs += desc_count; > + num_buffers += 1; > + } > + > + if (copy_mbuf_to_desc(dev, vq, pkt, > + buf_vec, nr_vec, > + num_buffers) < 0) { > + return 0; > + } > + > + return 0; > +} > + > +
> -----Original Message----- > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > Sent: Monday, September 23, 2019 6:09 PM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Richardson, > Bruce <bruce.richardson@intel.com> > Cc: dev@dpdk.org > Subject: RE: [dpdk-dev] [PATCH v2 02/16] vhost: unify unroll pragma > parameter > > Hi Marvin, > > One general comment and other comments inline: > 1. Meson build should also be supported as Makefile is phasing out and > Meson is the future in DPDK. > > /Gavin > Thanks, Gavin. Will update meson build file in next release. > > -----Original Message----- > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > Sent: Friday, September 20, 2019 12:36 AM > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > zhihong.wang@intel.com > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > Subject: [dpdk-dev] [PATCH v2 02/16] vhost: unify unroll pragma parameter > > > > Add macro for unifying Clang/ICC/GCC unroll pragma format. Burst > > functions were contained of several small loops which optimized by > > compiler’s loop unrolling pragma. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > > index 8623e91c0..30839a001 100644 > > --- a/lib/librte_vhost/Makefile > > +++ b/lib/librte_vhost/Makefile > > @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user > > CFLAGS += -fno-strict-aliasing > > LDLIBS += -lpthread > > > > +ifeq ($(RTE_TOOLCHAIN), gcc) > > +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) > It is better to move this toolchain version related definition to eg: > mk/toolchain/icc/rte.toolchain-compat.mk. > There are a lot of similar stuff over there. > Although "CFLAGS" was added to sth under this subfolder, it still applies > globally to other components. 
> /Gavin > > +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > > +endif > > +endif > > + > > +ifeq ($(RTE_TOOLCHAIN), clang) > > +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) - > > ge 37 && echo 1), 1) > > +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA > Why not combine all the three "-DSUPPORT_*_UNROLL_PRAGMA" into one "- > DSUPPORT_ UNROLL_PRAGMA" for simplicity? > Any differences for the support by different compilers? > /Gavin Gavin, This is due to parameter format of pragmas are different between compilers. So here created several macros for each compiler. > > +endif > > +endif > > + > > +ifeq ($(RTE_TOOLCHAIN), icc) > > +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) > > +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA > > +endif > > +endif > > + > > ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) > > LDLIBS += -lnuma > > endif > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index 884befa85..5074226f0 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,24 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#ifdef SUPPORT_GCC_UNROLL_PRAGMA > > +#define PRAGMA_PARAM "GCC unroll 4" > > +#endif > > + > > +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA > > +#define PRAGMA_PARAM "unroll 4" > > +#endif > > + > > +#ifdef SUPPORT_ICC_UNROLL_PRAGMA > > +#define PRAGMA_PARAM "unroll (4)" > > +#endif > > + > > +#ifdef PRAGMA_PARAM > > +#define UNROLL_PRAGMA(param) _Pragma(param) > > +#else > > +#define UNROLL_PRAGMA(param) do {} while(0); > > +#endif > > + > > /** > > * Structure contains buffer address, length and descriptor index > > * from vring to do scatter RX. > > -- > > 2.17.1 > > IMPORTANT NOTICE: The contents of this email and any attachments are > confidential and may also be privileged. If you are not the intended > recipient, please notify the sender immediately and do not disclose the > contents to any other person, use it for any purpose, or store or copy the > information in any medium. Thank you.
Thanks, Gavin. My comments are inline. > -----Original Message----- > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > Sent: Monday, September 23, 2019 7:09 PM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > Cc: dev@dpdk.org; nd <nd@arm.com> > Subject: RE: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue function > for packed ring > > Hi Marvin, > > Is it possible to vectorize the processing? > Other comments inline: > /Gavin Gavin, According to our experiment, only vectorize some parts in [ed]nqueue function can't benefit performance. Functions like vhost_iova_to_vva and virtio_enqueue_offload can't be easily vectorized as they are full of judgment conditions. Thanks, Marvin > > -----Original Message----- > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > Sent: Friday, September 20, 2019 12:37 AM > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > zhihong.wang@intel.com > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > Subject: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue function > for > > packed ring > > > > Burst enqueue function will first check whether descriptors are cache > > aligned. It will also check prerequisites in the beginning. Burst > > enqueue function not support chained mbufs, single packet enqueue > > function will handle it. 
> > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index 5074226f0..67889c80a 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,9 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > > + sizeof(struct vring_packed_desc)) > > + > > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > > #define PRAGMA_PARAM "GCC unroll 4" > > #endif > > @@ -57,6 +60,8 @@ > > #define UNROLL_PRAGMA(param) do {} while(0); > > #endif > > > > +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) > > + > > /** > > * Structure contains buffer address, length and descriptor index > > * from vring to do scatter RX. > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index 2b5c47145..c664b27c5 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -895,6 +895,84 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > > vhost_virtqueue *vq, > > return pkt_idx; > > } > > > > +static __rte_unused __rte_always_inline int > I remember "__rte_always_inline" should start at the first and separate > line, otherwise you will get a style issue. 
> /Gavin > > +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct > vhost_virtqueue > > *vq, > > + struct rte_mbuf **pkts) > > +{ > > + bool wrap_counter = vq->avail_wrap_counter; > > + struct vring_packed_desc *descs = vq->desc_packed; > > + uint16_t avail_idx = vq->last_avail_idx; > > + > > + uint64_t desc_addrs[PACKED_DESCS_BURST]; > > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > > + uint32_t buf_offset = dev->vhost_hlen; > > + uint64_t lens[PACKED_DESCS_BURST]; > > + > > + uint16_t i; > > + > > + if (unlikely(avail_idx & PACKED_BURST_MASK)) > > + return -1; > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + if (unlikely(pkts[i]->next != NULL)) > > + return -1; > > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > > + wrap_counter))) > > + return -1; > > + } > > + > > + rte_smp_rmb(); > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > + lens[i] = descs[avail_idx + i].len; > Looks like the code is a strong candidate for vectorization. 
> > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > > + return -1; > > + } > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > + descs[avail_idx + i].addr, > > + &lens[i], > > + VHOST_ACCESS_RW); > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > > + return -1; > > + } > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)desc_addrs[i]; > > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > > + } > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > > + > A store barrier here is missing, last_avail_idx may be observed before the > above enqueue completion on weak memory order architectures. > For x86, a compiler barrier is also required. > Thanks a lot for point out. I guess your mention is that need to add barrier between memcpy and enqueue. last_avail_idx is just local variable, no barrier is need to protect it. > > + vq->last_avail_idx += PACKED_DESCS_BURST; > > + if (vq->last_avail_idx >= vq->size) { > > + vq->last_avail_idx -= vq->size; > > + vq->avail_wrap_counter ^= 1; > > + } > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > + pkts[i]->pkt_len); > > + } > > + > > + return 0; > > +} > > + > > static __rte_unused int16_t > > virtio_dev_rx_single_packed(struct virtio_net *dev, struct > vhost_virtqueue > > *vq, > > struct rte_mbuf *pkt) > > -- > > 2.17.1
Hi Marvin, One typo and one comment about the barrier. /Gavin > -----Original Message----- > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > Sent: Friday, September 20, 2019 12:37 AM > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > Subject: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for burst > enqueue > > Flush used flags when burst enqueue function is finished. Descriptor's > flags are pre-calculated as them will be reset by vhost. s/them/they > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 000648dd4..9c42c7db0 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define VIRTIO_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | > VRING_DESC_F_USED \ > + | VRING_DESC_F_WRITE) > +#define VIRTIO_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) > #define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > sizeof(struct vring_packed_desc)) > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index e2787b72e..8e4036204 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -169,6 +169,51 @@ update_shadow_packed(struct vhost_virtqueue > *vq, > vq->shadow_used_packed[i].count = count; > } > > +static __rte_always_inline void > +flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > + uint64_t *lens, uint16_t *ids, uint16_t flags) > +{ > + uint16_t i; > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; > + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; > + } > + > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > + rte_smp_wmb(); Should this rte_smp_wmb() be moved above the loop? 
It guarantees the orderings of updates of id, len happens before the flags, But all the flags of different descriptors should not be ordered. > + vq->desc_packed[vq->last_used_idx + i].flags = flags; > + } > + > + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * > + sizeof(struct vring_packed_desc), > + sizeof(struct vring_packed_desc) * > + PACKED_DESCS_BURST); > + vhost_log_cache_sync(dev, vq); > + > + vq->last_used_idx += PACKED_DESCS_BURST; > + if (vq->last_used_idx >= vq->size) { > + vq->used_wrap_counter ^= 1; > + vq->last_used_idx -= vq->size; > + } > +} > + > +static __rte_always_inline void > +flush_enqueue_burst_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > + uint64_t *lens, uint16_t *ids) > +{ > + uint16_t flags = 0; > + > + if (vq->used_wrap_counter) > + flags = VIRTIO_RX_USED_FLAG; > + else > + flags = VIRTIO_RX_USED_WRAP_FLAG; > + > + flush_burst_packed(dev, vq, lens, ids, flags); > +} > + > static __rte_always_inline void > update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t > desc_idx, > uint32_t len, uint16_t count) > @@ -950,6 +995,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, > struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > uint32_t buf_offset = dev->vhost_hlen; > uint64_t lens[PACKED_DESCS_BURST]; > + uint16_t ids[PACKED_DESCS_BURST]; > > uint16_t i; > > @@ -1013,6 +1059,12 @@ virtio_dev_rx_burst_packed(struct virtio_net > *dev, struct vhost_virtqueue *vq, > pkts[i]->pkt_len); > } > > + UNROLL_PRAGMA(PRAGMA_PARAM) > + for (i = 0; i < PACKED_DESCS_BURST; i++) > + ids[i] = descs[avail_idx + i].id; > + > + flush_enqueue_burst_packed(dev, vq, lens, ids); > + > return 0; > } > > -- > 2.17.1
Similarly the rte_smp_wmb can be out of and right above the loop.
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu
> Sent: Friday, September 20, 2019 12:37 AM
> To: maxime.coquelin@redhat.com; tiwei.bie@intel.com;
> zhihong.wang@intel.com
> Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com>
> Subject: [dpdk-dev] [PATCH v2 10/16] vhost: split enqueue and dequeue
> flush functions
>
> Vhost enqueue descriptors are updated by burst number, while vhost
> dequeue descriptors are buffered. Meanwhile in dequeue function only
> first descriptor is buffered. Due to these differences, split vhost
> enqueue and dequeue flush functions.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
> -----Original Message----- > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > Sent: Wednesday, September 25, 2019 11:38 AM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > Cc: dev@dpdk.org; nd <nd@arm.com> > Subject: RE: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > burst enqueue > > Hi Marvin, > > One typo and one comment about the barrier. > > /Gavin > > > -----Original Message----- > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > Sent: Friday, September 20, 2019 12:37 AM > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > zhihong.wang@intel.com > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > Subject: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for burst > > enqueue > > > > Flush used flags when burst enqueue function is finished. Descriptor's > > flags are pre-calculated as them will be reset by vhost. > s/them/they > Thanks. 
> > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index 000648dd4..9c42c7db0 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,9 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#define VIRTIO_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | > > VRING_DESC_F_USED \ > > + | VRING_DESC_F_WRITE) > > +#define VIRTIO_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) > > #define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > > sizeof(struct vring_packed_desc)) > > > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index e2787b72e..8e4036204 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -169,6 +169,51 @@ update_shadow_packed(struct vhost_virtqueue > > *vq, > > vq->shadow_used_packed[i].count = count; > > } > > > > +static __rte_always_inline void > > +flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > > + uint64_t *lens, uint16_t *ids, uint16_t flags) > > +{ > > + uint16_t i; > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; > > + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; > > + } > > + > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > + rte_smp_wmb(); > Should this rte_smp_wmb() be moved above the loop? It guarantees the > orderings of updates of id, len happens before the flags, > But all the flags of different descriptors should not be ordered. > Hi Gavin, For each descriptor, virtio driver will first check flags and then check read barrier, at the last driver will read id and length. So wmb here is to guarantee that id and length are updated before flags. And afterwards wmb is to guarantee the sequence. 
Thanks, Marvin > > + vq->desc_packed[vq->last_used_idx + i].flags = flags; > > + } > > + > > + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * > > + sizeof(struct vring_packed_desc), > > + sizeof(struct vring_packed_desc) * > > + PACKED_DESCS_BURST); > > + vhost_log_cache_sync(dev, vq); > > + > > + vq->last_used_idx += PACKED_DESCS_BURST; > > + if (vq->last_used_idx >= vq->size) { > > + vq->used_wrap_counter ^= 1; > > + vq->last_used_idx -= vq->size; > > + } > > +} > > + > > +static __rte_always_inline void > > +flush_enqueue_burst_packed(struct virtio_net *dev, struct > > vhost_virtqueue *vq, > > + uint64_t *lens, uint16_t *ids) > > +{ > > + uint16_t flags = 0; > > + > > + if (vq->used_wrap_counter) > > + flags = VIRTIO_RX_USED_FLAG; > > + else > > + flags = VIRTIO_RX_USED_WRAP_FLAG; > > + > > + flush_burst_packed(dev, vq, lens, ids, flags); > > +} > > + > > static __rte_always_inline void > > update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t > > desc_idx, > > uint32_t len, uint16_t count) > > @@ -950,6 +995,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, > > struct vhost_virtqueue *vq, > > struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > > uint32_t buf_offset = dev->vhost_hlen; > > uint64_t lens[PACKED_DESCS_BURST]; > > + uint16_t ids[PACKED_DESCS_BURST]; > > > > uint16_t i; > > > > @@ -1013,6 +1059,12 @@ virtio_dev_rx_burst_packed(struct virtio_net > > *dev, struct vhost_virtqueue *vq, > > pkts[i]->pkt_len); > > } > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > + ids[i] = descs[avail_idx + i].id; > > + > > + flush_enqueue_burst_packed(dev, vq, lens, ids); > > + > > return 0; > > } > > > > -- > > 2.17.1
> -----Original Message----- > From: Liu, Yong > Sent: Wednesday, September 25, 2019 1:38 PM > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; > maxime.coquelin@redhat.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, Zhihong > <zhihong.wang@intel.com> > Cc: dev@dpdk.org; nd <nd@arm.com> > Subject: RE: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > burst enqueue > > > > > -----Original Message----- > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > > Sent: Wednesday, September 25, 2019 11:38 AM > > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, > Tiwei > > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > > Cc: dev@dpdk.org; nd <nd@arm.com> > > Subject: RE: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > > burst enqueue > > > > Hi Marvin, > > > > One typo and one comment about the barrier. > > > > /Gavin > > > > > -----Original Message----- > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > > Sent: Friday, September 20, 2019 12:37 AM > > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > > zhihong.wang@intel.com > > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > > Subject: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > burst > > > enqueue > > > > > > Flush used flags when burst enqueue function is finished. Descriptor's > > > flags are pre-calculated as them will be reset by vhost. > > s/them/they > > > > Thanks. 
> > > > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > > index 000648dd4..9c42c7db0 100644 > > > --- a/lib/librte_vhost/vhost.h > > > +++ b/lib/librte_vhost/vhost.h > > > @@ -39,6 +39,9 @@ > > > > > > #define VHOST_LOG_CACHE_NR 32 > > > > > > +#define VIRTIO_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | > > > VRING_DESC_F_USED \ > > > + | VRING_DESC_F_WRITE) > > > +#define VIRTIO_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) > > > #define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > > > sizeof(struct vring_packed_desc)) > > > > > > diff --git a/lib/librte_vhost/virtio_net.c > > b/lib/librte_vhost/virtio_net.c > > > index e2787b72e..8e4036204 100644 > > > --- a/lib/librte_vhost/virtio_net.c > > > +++ b/lib/librte_vhost/virtio_net.c > > > @@ -169,6 +169,51 @@ update_shadow_packed(struct vhost_virtqueue > > > *vq, > > > vq->shadow_used_packed[i].count = count; > > > } > > > > > > +static __rte_always_inline void > > > +flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > > > + uint64_t *lens, uint16_t *ids, uint16_t flags) > > > +{ > > > + uint16_t i; > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; > > > + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; > > > + } > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + rte_smp_wmb(); > > Should this rte_smp_wmb() be moved above the loop? It guarantees the > > orderings of updates of id, len happens before the flags, > > But all the flags of different descriptors should not be ordered. > > > Hi Gavin, > For each descriptor, virtio driver will first check flags and then check > read barrier, at the last driver will read id and length. > So wmb here is to guarantee that id and length are updated before flags. > And afterwards wmb is to guarantee the sequence. 
> Gavin, Checked with master branch, flags store sequence is not needed. But in my environment, performance will be a little better if the flags stores are ordered. I think it may be harmless to place wmb here. How about your idea? > Thanks, > Marvin > > > > + vq->desc_packed[vq->last_used_idx + i].flags = flags; > > > + } > > > + > > > + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * > > > + sizeof(struct vring_packed_desc), > > > + sizeof(struct vring_packed_desc) * > > > + PACKED_DESCS_BURST); > > > + vhost_log_cache_sync(dev, vq); > > > + > > > + vq->last_used_idx += PACKED_DESCS_BURST; > > > + if (vq->last_used_idx >= vq->size) { > > > + vq->used_wrap_counter ^= 1; > > > + vq->last_used_idx -= vq->size; > > > + } > > > +} > > > + > > > +static __rte_always_inline void > > > +flush_enqueue_burst_packed(struct virtio_net *dev, struct > > > vhost_virtqueue *vq, > > > + uint64_t *lens, uint16_t *ids) > > > +{ > > > + uint16_t flags = 0; > > > + > > > + if (vq->used_wrap_counter) > > > + flags = VIRTIO_RX_USED_FLAG; > > > + else > > > + flags = VIRTIO_RX_USED_WRAP_FLAG; > > > + > > > + flush_burst_packed(dev, vq, lens, ids, flags); > > > +} > > > + > > > static __rte_always_inline void > > > update_enqueue_shadow_packed(struct vhost_virtqueue *vq, uint16_t > > > desc_idx, > > > uint32_t len, uint16_t count) > > > @@ -950,6 +995,7 @@ virtio_dev_rx_burst_packed(struct virtio_net *dev, > > > struct vhost_virtqueue *vq, > > > struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > > > uint32_t buf_offset = dev->vhost_hlen; > > > uint64_t lens[PACKED_DESCS_BURST]; > > > + uint16_t ids[PACKED_DESCS_BURST]; > > > > > > uint16_t i; > > > > > > @@ -1013,6 +1059,12 @@ virtio_dev_rx_burst_packed(struct virtio_net > > > *dev, struct vhost_virtqueue *vq, > > > pkts[i]->pkt_len); > > > } > > > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > + ids[i] = descs[avail_idx + i].id; > > > + > > > + flush_enqueue_burst_packed(dev, vq, 
lens, ids); > > > + > > > return 0; > > > } > > > > > > -- > > > 2.17.1
Hi Marvin, > -----Original Message----- > From: Liu, Yong <yong.liu@intel.com> > Sent: Wednesday, September 25, 2019 2:52 PM > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; > 'maxime.coquelin@redhat.com' <maxime.coquelin@redhat.com>; Bie, > Tiwei <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > Cc: 'dev@dpdk.org' <dev@dpdk.org>; nd <nd@arm.com> > Subject: RE: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for burst > enqueue > > > -----Original Message----- > > From: Liu, Yong > > Sent: Wednesday, September 25, 2019 1:38 PM > > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; > > maxime.coquelin@redhat.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong > > <zhihong.wang@intel.com> > > Cc: dev@dpdk.org; nd <nd@arm.com> > > Subject: RE: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > > burst enqueue > > > > > > > > > -----Original Message----- > > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > > > Sent: Wednesday, September 25, 2019 11:38 AM > > > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, > > Tiwei > > > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > > > Cc: dev@dpdk.org; nd <nd@arm.com> > > > Subject: RE: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > > > burst enqueue > > > > > > Hi Marvin, > > > > > > One typo and one comment about the barrier. > > > > > > /Gavin > > > > > > > -----Original Message----- > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > > > Sent: Friday, September 20, 2019 12:37 AM > > > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > > > zhihong.wang@intel.com > > > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > > > Subject: [dpdk-dev] [PATCH v2 08/16] vhost: add flush function for > > burst > > > > enqueue > > > > > > > > Flush used flags when burst enqueue function is finished. Descriptor's > > > > flags are pre-calculated as them will be reset by vhost. 
> > > s/them/they > > > > > > > Thanks. > > > > > > > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > > > index 000648dd4..9c42c7db0 100644 > > > > --- a/lib/librte_vhost/vhost.h > > > > +++ b/lib/librte_vhost/vhost.h > > > > @@ -39,6 +39,9 @@ > > > > > > > > #define VHOST_LOG_CACHE_NR 32 > > > > > > > > +#define VIRTIO_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | > > > > VRING_DESC_F_USED \ > > > > + | VRING_DESC_F_WRITE) > > > > +#define VIRTIO_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) > > > > #define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > > > > sizeof(struct vring_packed_desc)) > > > > > > > > diff --git a/lib/librte_vhost/virtio_net.c > > > b/lib/librte_vhost/virtio_net.c > > > > index e2787b72e..8e4036204 100644 > > > > --- a/lib/librte_vhost/virtio_net.c > > > > +++ b/lib/librte_vhost/virtio_net.c > > > > @@ -169,6 +169,51 @@ update_shadow_packed(struct > vhost_virtqueue > > > > *vq, > > > > vq->shadow_used_packed[i].count = count; > > > > } > > > > > > > > +static __rte_always_inline void > > > > +flush_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > > > > + uint64_t *lens, uint16_t *ids, uint16_t flags) > > > > +{ > > > > + uint16_t i; > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; > > > > + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; > > > > + } > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + rte_smp_wmb(); > > > Should this rte_smp_wmb() be moved above the loop? It guarantees the > > > orderings of updates of id, len happens before the flags, > > > But all the flags of different descriptors should not be ordered. 
> > > > > Hi Gavin, > > For each descriptor, virtio driver will first check flags and then check > > read barrier, at the last driver will read id and length. > > So wmb here is to guarantee that id and length are updated before flags. > > And afterwards wmb is to guarantee the sequence. > > > Gavin, > Checked with master branch, flags store sequence is not needed. > But in my environment, performance will be a litter better if ordered flags > store. > I think it may be harmless to place wmb here. How about your idea? The smp barrier on x86 is a compiler barrier only, it ensure data consistency, it will not help performance, The slight better performance should come from run-to-run variances or system noise or sth else. The barrier will dampen the performance on weak memory ordered platforms, like aarch64. /Gavin > > > Thanks, > > Marvin > > > > > > + vq->desc_packed[vq->last_used_idx + i].flags = flags; > > > > + } > > > > + > > > > + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * > > > > + sizeof(struct vring_packed_desc), > > > > + sizeof(struct vring_packed_desc) * > > > > + PACKED_DESCS_BURST); > > > > + vhost_log_cache_sync(dev, vq); > > > > + > > > > + vq->last_used_idx += PACKED_DESCS_BURST; > > > > + if (vq->last_used_idx >= vq->size) { > > > > + vq->used_wrap_counter ^= 1; > > > > + vq->last_used_idx -= vq->size; > > > > + } > > > > +} > > > > + > > > > +static __rte_always_inline void > > > > +flush_enqueue_burst_packed(struct virtio_net *dev, struct > > > > vhost_virtqueue *vq, > > > > + uint64_t *lens, uint16_t *ids) > > > > +{ > > > > + uint16_t flags = 0; > > > > + > > > > + if (vq->used_wrap_counter) > > > > + flags = VIRTIO_RX_USED_FLAG; > > > > + else > > > > + flags = VIRTIO_RX_USED_WRAP_FLAG; > > > > + > > > > + flush_burst_packed(dev, vq, lens, ids, flags); > > > > +} > > > > + > > > > static __rte_always_inline void > > > > update_enqueue_shadow_packed(struct vhost_virtqueue *vq, > uint16_t > > > > desc_idx, > > > > uint32_t len, 
uint16_t count) > > > > @@ -950,6 +995,7 @@ virtio_dev_rx_burst_packed(struct virtio_net > *dev, > > > > struct vhost_virtqueue *vq, > > > > struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > > > > uint32_t buf_offset = dev->vhost_hlen; > > > > uint64_t lens[PACKED_DESCS_BURST]; > > > > + uint16_t ids[PACKED_DESCS_BURST]; > > > > > > > > uint16_t i; > > > > > > > > @@ -1013,6 +1059,12 @@ virtio_dev_rx_burst_packed(struct > virtio_net > > > > *dev, struct vhost_virtqueue *vq, > > > > pkts[i]->pkt_len); > > > > } > > > > > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > > + ids[i] = descs[avail_idx + i].id; > > > > + > > > > + flush_enqueue_burst_packed(dev, vq, lens, ids); > > > > + > > > > return 0; > > > > } > > > > > > > > -- > > > > 2.17.1
Hi Marvin, > -----Original Message----- > From: Liu, Yong <yong.liu@intel.com> > Sent: Tuesday, September 24, 2019 11:31 AM > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; > maxime.coquelin@redhat.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com> > Cc: dev@dpdk.org; nd <nd@arm.com> > Subject: RE: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue function > for packed ring > > Thanks, Gavin. My comments are inline. > > > -----Original Message----- > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > > Sent: Monday, September 23, 2019 7:09 PM > > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, > Tiwei > > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > > Cc: dev@dpdk.org; nd <nd@arm.com> > > Subject: RE: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue > function > > for packed ring > > > > Hi Marvin, > > > > Is it possible to vectorize the processing? > > Other comments inline: > > /Gavin > > Gavin, > According to our experiment, only vectorize some parts in [ed]nqueue > function can't benefit performance. > Functions like vhost_iova_to_vva and virtio_enqueue_offload can't be > easily vectorized as they are full of judgment conditions. > > Thanks, > Marvin > > > > -----Original Message----- > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > > Sent: Friday, September 20, 2019 12:37 AM > > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > > zhihong.wang@intel.com > > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > > Subject: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue function > > for > > > packed ring > > > > > > Burst enqueue function will first check whether descriptors are cache > > > aligned. It will also check prerequisites in the beginning. Burst > > > enqueue function not support chained mbufs, single packet enqueue > > > function will handle it. 
> > > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > > index 5074226f0..67889c80a 100644 > > > --- a/lib/librte_vhost/vhost.h > > > +++ b/lib/librte_vhost/vhost.h > > > @@ -39,6 +39,9 @@ > > > > > > #define VHOST_LOG_CACHE_NR 32 > > > > > > +#define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > > > + sizeof(struct vring_packed_desc)) > > > + > > > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > > > #define PRAGMA_PARAM "GCC unroll 4" > > > #endif > > > @@ -57,6 +60,8 @@ > > > #define UNROLL_PRAGMA(param) do {} while(0); > > > #endif > > > > > > +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) > > > + > > > /** > > > * Structure contains buffer address, length and descriptor index > > > * from vring to do scatter RX. > > > diff --git a/lib/librte_vhost/virtio_net.c > > b/lib/librte_vhost/virtio_net.c > > > index 2b5c47145..c664b27c5 100644 > > > --- a/lib/librte_vhost/virtio_net.c > > > +++ b/lib/librte_vhost/virtio_net.c > > > @@ -895,6 +895,84 @@ virtio_dev_rx_split(struct virtio_net *dev, > struct > > > vhost_virtqueue *vq, > > > return pkt_idx; > > > } > > > > > > +static __rte_unused __rte_always_inline int > > I remember "__rte_always_inline" should start at the first and separate > > line, otherwise you will get a style issue. 
> > /Gavin > > > +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct > > vhost_virtqueue > > > *vq, > > > + struct rte_mbuf **pkts) > > > +{ > > > + bool wrap_counter = vq->avail_wrap_counter; > > > + struct vring_packed_desc *descs = vq->desc_packed; > > > + uint16_t avail_idx = vq->last_avail_idx; > > > + > > > + uint64_t desc_addrs[PACKED_DESCS_BURST]; > > > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > > > + uint32_t buf_offset = dev->vhost_hlen; > > > + uint64_t lens[PACKED_DESCS_BURST]; > > > + > > > + uint16_t i; > > > + > > > + if (unlikely(avail_idx & PACKED_BURST_MASK)) > > > + return -1; > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + if (unlikely(pkts[i]->next != NULL)) > > > + return -1; > > > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > > > + wrap_counter))) > > > + return -1; > > > + } > > > + > > > + rte_smp_rmb(); > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > + lens[i] = descs[avail_idx + i].len; > > Looks like the code is a strong candidate for vectorization. 
> > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > > > + return -1; > > > + } > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > > + descs[avail_idx + i].addr, > > > + &lens[i], > > > + VHOST_ACCESS_RW); > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > > > + return -1; > > > + } > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)desc_addrs[i]; > > > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > > > + } > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > > > + > > A store barrier here is missing, last_avail_idx may be observed before the > > above enqueue completion on weak memory order architectures. > > For x86, a compiler barrier is also required. > > > Thanks a lot for point out. I guess your mention is that need to add barrier > between memcpy and enqueue. > last_avail_idx is just local variable, no barrier is need to protect it. Sorry I was wrong, yes, last_avail_idx is a local variable(or we may call it meta data). Copying the headers and payload does not need to be ordered, we just need to ensure all these happen before updating the idx, which is the single synchronization point. In one word, no barriers are required here. 
/Gavin > > > > + vq->last_avail_idx += PACKED_DESCS_BURST; > > > + if (vq->last_avail_idx >= vq->size) { > > > + vq->last_avail_idx -= vq->size; > > > + vq->avail_wrap_counter ^= 1; > > > + } > > > + > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > > + pkts[i]->pkt_len); > > > + } > > > + > > > + return 0; > > > +} > > > + > > > static __rte_unused int16_t > > > virtio_dev_rx_single_packed(struct virtio_net *dev, struct > > vhost_virtqueue > > > *vq, > > > struct rte_mbuf *pkt) > > > -- > > > 2.17.1
> -----Original Message----- > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > Sent: Wednesday, September 25, 2019 5:29 PM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > Cc: dev@dpdk.org; nd <nd@arm.com>; nd <nd@arm.com> > Subject: RE: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue function > for packed ring > > Hi Marvin, > > > -----Original Message----- > > From: Liu, Yong <yong.liu@intel.com> > > Sent: Tuesday, September 24, 2019 11:31 AM > > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; > > maxime.coquelin@redhat.com; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > > Zhihong <zhihong.wang@intel.com> > > Cc: dev@dpdk.org; nd <nd@arm.com> > > Subject: RE: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue > function > > for packed ring > > > > Thanks, Gavin. My comments are inline. > > > > > -----Original Message----- > > > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > > > Sent: Monday, September 23, 2019 7:09 PM > > > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, > > Tiwei > > > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com> > > > Cc: dev@dpdk.org; nd <nd@arm.com> > > > Subject: RE: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue > > function > > > for packed ring > > > > > > Hi Marvin, > > > > > > Is it possible to vectorize the processing? > > > Other comments inline: > > > /Gavin > > > > Gavin, > > According to our experiment, only vectorize some parts in [ed]nqueue > > function can't benefit performance. > > Functions like vhost_iova_to_vva and virtio_enqueue_offload can't be > > easily vectorized as they are full of judgment conditions. 
> > > > Thanks, > > Marvin > > > > > > -----Original Message----- > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Marvin Liu > > > > Sent: Friday, September 20, 2019 12:37 AM > > > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > > > zhihong.wang@intel.com > > > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > > > Subject: [dpdk-dev] [PATCH v2 03/16] vhost: add burst enqueue > function > > > for > > > > packed ring > > > > > > > > Burst enqueue function will first check whether descriptors are cache > > > > aligned. It will also check prerequisites in the beginning. Burst > > > > enqueue function not support chained mbufs, single packet enqueue > > > > function will handle it. > > > > > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > > > index 5074226f0..67889c80a 100644 > > > > --- a/lib/librte_vhost/vhost.h > > > > +++ b/lib/librte_vhost/vhost.h > > > > @@ -39,6 +39,9 @@ > > > > > > > > #define VHOST_LOG_CACHE_NR 32 > > > > > > > > +#define PACKED_DESCS_BURST (RTE_CACHE_LINE_SIZE / \ > > > > + sizeof(struct vring_packed_desc)) > > > > + > > > > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > > > > #define PRAGMA_PARAM "GCC unroll 4" > > > > #endif > > > > @@ -57,6 +60,8 @@ > > > > #define UNROLL_PRAGMA(param) do {} while(0); > > > > #endif > > > > > > > > +#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1) > > > > + > > > > /** > > > > * Structure contains buffer address, length and descriptor index > > > > * from vring to do scatter RX. 
> > > > diff --git a/lib/librte_vhost/virtio_net.c > > > b/lib/librte_vhost/virtio_net.c > > > > index 2b5c47145..c664b27c5 100644 > > > > --- a/lib/librte_vhost/virtio_net.c > > > > +++ b/lib/librte_vhost/virtio_net.c > > > > @@ -895,6 +895,84 @@ virtio_dev_rx_split(struct virtio_net *dev, > > struct > > > > vhost_virtqueue *vq, > > > > return pkt_idx; > > > > } > > > > > > > > +static __rte_unused __rte_always_inline int > > > I remember "__rte_always_inline" should start at the first and separate > > > line, otherwise you will get a style issue. > > > /Gavin > > > > +virtio_dev_rx_burst_packed(struct virtio_net *dev, struct > > > vhost_virtqueue > > > > *vq, > > > > + struct rte_mbuf **pkts) > > > > +{ > > > > + bool wrap_counter = vq->avail_wrap_counter; > > > > + struct vring_packed_desc *descs = vq->desc_packed; > > > > + uint16_t avail_idx = vq->last_avail_idx; > > > > + > > > > + uint64_t desc_addrs[PACKED_DESCS_BURST]; > > > > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_DESCS_BURST]; > > > > + uint32_t buf_offset = dev->vhost_hlen; > > > > + uint64_t lens[PACKED_DESCS_BURST]; > > > > + > > > > + uint16_t i; > > > > + > > > > + if (unlikely(avail_idx & PACKED_BURST_MASK)) > > > > + return -1; > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + if (unlikely(pkts[i]->next != NULL)) > > > > + return -1; > > > > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > > > > + wrap_counter))) > > > > + return -1; > > > > + } > > > > + > > > > + rte_smp_rmb(); > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > > + lens[i] = descs[avail_idx + i].len; > > > Looks like the code is a strong candidate for vectorization. 
> > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > > > > + return -1; > > > > + } > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > > > + descs[avail_idx + i].addr, > > > > + &lens[i], > > > > + VHOST_ACCESS_RW); > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > > > > + return -1; > > > > + } > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > > > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf > *)desc_addrs[i]; > > > > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > > > > + } > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) > > > > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > > > > + > > > A store barrier here is missing, last_avail_idx may be observed before > the > > > above enqueue completion on weak memory order architectures. > > > For x86, a compiler barrier is also required. > > > > > Thanks a lot for point out. I guess your mention is that need to add > barrier > > between memcpy and enqueue. > > last_avail_idx is just local variable, no barrier is need to protect it. > > Sorry I was wrong, yes, last_avail_idx is a local variable(or we may call > it meta data). > Copying the headers and payload does not need to be ordered, we just need > to ensure all these happen before updating the idx, which is the single > synchronization point. > In one word, no barriers are required here. > /Gavin > > NP:) Nothing changed here in V3 patch. Thanks for kindly reviewing. 
> > > > + vq->last_avail_idx += PACKED_DESCS_BURST; > > > > + if (vq->last_avail_idx >= vq->size) { > > > > + vq->last_avail_idx -= vq->size; > > > > + vq->avail_wrap_counter ^= 1; > > > > + } > > > > + > > > > + UNROLL_PRAGMA(PRAGMA_PARAM) > > > > + for (i = 0; i < PACKED_DESCS_BURST; i++) { > > > > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + > buf_offset), > > > > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > > > + pkts[i]->pkt_len); > > > > + } > > > > + > > > > + return 0; > > > > +} > > > > + > > > > static __rte_unused int16_t > > > > virtio_dev_rx_single_packed(struct virtio_net *dev, struct > > > vhost_virtqueue > > > > *vq, > > > > struct rte_mbuf *pkt) > > > > -- > > > > 2.17.1
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proven in the virtio user driver; on a normal E5 Xeon cpu, single core performance can be raised by 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However vhost performance with packed ring was decreased. Through analysis, most of the extra cost was from the calculation of each descriptor flag, which depended on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors, which will cause cache contention. Especially when doing the vhost enqueue function, the virtio refill packed ring function may write the same cache line while vhost is doing the enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, the vhost enqueue and dequeue functions will be split into fast and normal paths. Several methods will be taken in the fast path: Handle descriptors in one cache line by batch. Split loop function into more pieces and unroll them. Prerequisite check whether I/O space can be copied directly into mbuf space and vice versa. Prerequisite check whether descriptor mapping is successful. Distinguish vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. Cache memory region structure for fast conversion. Disable software prefetch if hardware can do better. After all these methods are done, single core vhost PvP performance with 64B packets on Xeon 8180 can be boosted by 40%. 
v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (15): vhost: add single packet enqueue function vhost: unify unroll pragma parameter vhost: add batch enqueue function for packed ring vhost: add single packet dequeue function vhost: add batch dequeue function vhost: flush vhost enqueue shadow ring by batch vhost: add flush function for batch enqueue vhost: buffer vhost dequeue shadow ring vhost: split enqueue and dequeue flush functions vhost: optimize enqueue function of packed ring vhost: add batch and single zero dequeue functions vhost: optimize dequeue function of packed ring vhost: cache address translation result vhost: check whether disable software pre-fetch vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 24 + lib/librte_vhost/rte_vhost.h | 27 + lib/librte_vhost/vhost.h | 33 ++ lib/librte_vhost/virtio_net.c | 994 +++++++++++++++++++++++++++------- 4 files changed, 895 insertions(+), 183 deletions(-) -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5b85b832d..520c4c6a8 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -774,6 +774,58 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +static __rte_always_inline int +vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + uint16_t avail_idx = vq->last_avail_idx; + uint16_t max_tries, tries = 0; + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + uint16_t num_buffers = 0; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -831,6 +883,36 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Add macro for unifying Clang/ICC/GCC unroll pragma format. Batch functions contain several small loops which are optimized by the compiler's loop unrolling pragma. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..30839a001 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +ifeq ($(RTE_TOOLCHAIN), gcc) +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), clang) +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), icc) +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA +endif +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 884befa85..4cba8c5ef 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,24 @@ #define VHOST_LOG_CACHE_NR 32 +#ifdef SUPPORT_GCC_UNROLL_PRAGMA +#define UNROLL_PRAGMA_PARAM "GCC unroll 4" +#endif + +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA +#define UNROLL_PRAGMA_PARAM "unroll 4" +#endif + +#ifdef SUPPORT_ICC_UNROLL_PRAGMA +#define UNROLL_PRAGMA_PARAM "unroll (4)" +#endif + +#ifdef UNROLL_PRAGMA_PARAM +#define UNROLL_PRAGMA(param) _Pragma(param) +#else +#define UNROLL_PRAGMA(param) do {} while (0); +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
Batch enqueue function will first check whether descriptors are cache aligned. It will also check prerequisites in the beginning. Batch enqueue function not support chained mbufs, single packet enqueue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 4cba8c5ef..e241436c7 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,10 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ + sizeof(struct vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + #ifdef SUPPORT_GCC_UNROLL_PRAGMA #define UNROLL_PRAGMA_PARAM "GCC unroll 4" #endif diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 520c4c6a8..5e08f7d9b 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -883,6 +883,86 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int +virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + uint64_t desc_addrs[PACKED_BATCH_SIZE]; + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(pkts[i]->next != NULL)) + return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + } + + rte_smp_rmb(); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + lens[i] = descs[avail_idx + i].len; + + 
UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) + return -1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], + VHOST_ACCESS_RW); + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq->last_avail_idx += PACKED_BATCH_SIZE; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile left space for shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5e08f7d9b..17aabe8eb 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1571,6 +1571,60 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t dummy_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &dummy_len, + VHOST_ACCESS_RO) < 0)) + return -1; + + *pkts = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add batch dequeue function like enqueue function for packed ring, batch dequeue function will not support chained descriptors, single packet dequeue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index e241436c7..e50e137ca 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -61,6 +61,8 @@ #define UNROLL_PRAGMA(param) do {} while (0); #endif +#define PACKED_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT) + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 17aabe8eb..2ff7329b2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1571,6 +1571,119 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t avail_idx, uintptr_t *desc_addrs, uint16_t *ids) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint64_t lens[PACKED_BATCH_SIZE]; + uint64_t buf_lens[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + if (unlikely(descs[avail_idx + i].flags & + PACKED_SINGLE_DEQUEUE_FLAG)) + return -1; + } + + rte_smp_rmb(); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + lens[i] = descs[avail_idx + i].len; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) 
{ + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + return 0; +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + struct virtio_net_hdr *hdr; + uint16_t i; + + if (vhost_dequeue_batch_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addrs, ids)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + } + + if (virtio_net_with_host_offload(dev)) { + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + vq->last_avail_idx += PACKED_BATCH_SIZE; + if 
(vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, -- 2.17.1
Buffer vhost enqueue shadow ring update, flush shadow ring until buffered descriptors number exceed one batch. Thus virtio can receive packets at a faster frequency. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index e50e137ca..18a207fc6 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -163,6 +163,7 @@ struct vhost_virtqueue { struct vring_used_elem_packed *shadow_used_packed; }; uint16_t shadow_used_idx; + uint16_t enqueue_shadow_count; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 2ff7329b2..f85619dc2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -169,6 +169,24 @@ update_shadow_used_ring_packed(struct vhost_virtqueue *vq, vq->shadow_used_packed[i].count = count; } +static __rte_always_inline void +update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint32_t len, uint16_t count) +{ + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->enqueue_shadow_count = vq->last_used_idx & + PACKED_BATCH_MASK; + + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_packed[i].id = desc_idx; + vq->shadow_used_packed[i].len = len; + vq->shadow_used_packed[i].count = count; + + vq->enqueue_shadow_count += count; +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -198,6 +216,23 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +flush_enqueue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint32_t len[], uint16_t id[], + uint16_t count[], uint16_t num_buffers) +{ + int i; + for (i = 0; i < num_buffers; i++) { + update_enqueue_shadow_used_ring_packed(vq, id[i], len[i], + count[i]); + + if (vq->enqueue_shadow_count >= PACKED_BATCH_SIZE) { + 
do_data_copy_enqueue(dev, vq); + flush_shadow_used_ring_packed(dev, vq); + } + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -786,6 +821,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers = 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -811,6 +849,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -822,6 +863,8 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + flush_enqueue_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); return 0; } -- 2.17.1
Flush used flags when batched enqueue function is finished. Descriptor's flags are pre-calculated as they will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 18a207fc6..7bf9ff9b7 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED \ + | VRING_DESC_F_WRITE) +#define PACKED_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index f85619dc2..a629e66d4 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -169,6 +169,49 @@ update_shadow_used_ring_packed(struct vhost_virtqueue *vq, vq->shadow_used_packed[i].count = count; } +static __rte_always_inline void +flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t *lens, uint16_t *ids, uint16_t flags) +{ + uint16_t i; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; + } + + rte_smp_wmb(); + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq->last_used_idx += PACKED_BATCH_SIZE; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static __rte_always_inline void +flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t *lens, uint16_t *ids) 
+{ + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = PACKED_RX_USED_FLAG; + else + flags = PACKED_RX_USED_WRAP_FLAG; + flush_used_batch_packed(dev, vq, lens, ids, flags); +} + static __rte_always_inline void update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -937,6 +980,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; uint32_t buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1003,6 +1047,12 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + ids[i] = descs[avail_idx + i].id; + + flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer used ring updates as many as possible in vhost dequeue function for coordinating with virtio driver. For supporting buffer, shadow used ring element should contain descriptor index and its wrap counter. First shadowed ring index is recorded for calculating buffered number. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 7bf9ff9b7..f62e9ec3f 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -42,6 +42,8 @@ #define PACKED_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED \ | VRING_DESC_F_WRITE) #define PACKED_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) +#define PACKED_TX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) +#define PACKED_TX_USED_WRAP_FLAG (0x0) #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) @@ -110,9 +112,11 @@ struct log_cache_entry { }; struct vring_used_elem_packed { + uint16_t used_idx; uint16_t id; uint32_t len; uint32_t count; + uint16_t used_wrap_counter; }; /** @@ -167,6 +171,7 @@ struct vhost_virtqueue { }; uint16_t shadow_used_idx; uint16_t enqueue_shadow_count; + uint16_t dequeue_shadow_head; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index a629e66d4..8f7209f83 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -230,6 +230,41 @@ update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, vq->enqueue_shadow_count += count; } +static __rte_always_inline void +update_dequeue_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, uint16_t count) +{ + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].used_idx = 
vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + + if (vq->used_wrap_counter) + vq->desc_packed[vq->last_used_idx].flags = + PACKED_TX_USED_FLAG; + else + vq->desc_packed[vq->last_used_idx].flags = + PACKED_TX_USED_WRAP_FLAG; + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1822,6 +1857,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; + update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count); + vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { vq->last_avail_idx -= vq->size; -- 2.17.1
Vhost enqueue descriptors are updated by batch number, while vhost dequeue descriptors are buffered. Meanwhile in dequeue function only first descriptor is buffered. Due to these differences, split vhost enqueue and dequeue flush functions. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 8f7209f83..1b0fa2c64 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -92,8 +92,8 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, } static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) +flush_enqueue_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) { int i; uint16_t used_idx = vq->last_used_idx; @@ -158,6 +158,32 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +flush_dequeue_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + uint16_t head_idx = vq->dequeue_shadow_head; + uint16_t head_flags; + struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0]; + + if (used_elem->used_wrap_counter) + head_flags = PACKED_TX_USED_FLAG; + else + head_flags = PACKED_TX_USED_WRAP_FLAG; + + vq->desc_packed[head_idx].id = used_elem->id; + + rte_smp_wmb(); + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -199,6 +225,47 @@ flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t *ids) +{ + 
uint16_t flags = 0; + uint16_t i; + + if (vq->used_wrap_counter) + flags = PACKED_TX_USED_FLAG; + else + flags = PACKED_TX_USED_WRAP_FLAG; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 1; i < PACKED_BATCH_SIZE; i++) + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + rte_smp_wmb(); + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 1; i < PACKED_BATCH_SIZE; i++) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vq->shadow_used_idx = 1; + vq->last_used_idx += PACKED_BATCH_SIZE; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } else { + uint64_t lens[PACKED_BATCH_SIZE] = {0}; + flush_used_batch_packed(dev, vq, lens, ids, flags); + } +} + static __rte_always_inline void flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t *lens, uint16_t *ids) @@ -306,11 +373,29 @@ flush_enqueue_packed(struct virtio_net *dev, if (vq->enqueue_shadow_count >= PACKED_BATCH_SIZE) { do_data_copy_enqueue(dev, vq); - flush_shadow_used_ring_packed(dev, vq); + flush_enqueue_shadow_used_ring_packed(dev, vq); } } } +static __rte_unused void +flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (!vq->shadow_used_idx) + return; + + int16_t shadow_count = vq->last_used_idx - vq->dequeue_shadow_head; + if (shadow_count <= 0) + shadow_count += vq->size; + + /* buffer used descs as many as possible when doing dequeue */ + if ((uint16_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + flush_dequeue_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to 
lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1165,7 +1250,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_enqueue_shadow_used_ring_packed(dev, vq); vhost_vring_call_packed(dev, vq); } @@ -1796,6 +1881,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } + update_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_with_host_offload(dev)) { UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) for (i = 0; i < PACKED_BATCH_SIZE; i++) { @@ -1896,7 +1983,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_dequeue_shadow_used_ring_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } @@ -1975,7 +2062,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_dequeue_shadow_used_ring_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } -- 2.17.1
Optimize vhost device Rx datapath by separate functions. Packets that can be filled into one descriptor will be handled by batch and others will be handled one by one as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 1b0fa2c64..c485e7f49 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -753,64 +753,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *nr_vec, uint16_t *num_buffers, - uint16_t *nr_descs) -{ - uint16_t avail_idx; - uint16_t vec_idx = 0; - uint16_t max_tries, tries = 0; - - uint16_t buf_id = 0; - uint32_t len = 0; - uint16_t desc_count; - - *num_buffers = 0; - avail_idx = vq->last_avail_idx; - - if (rxvq_is_mergeable(dev)) - max_tries = vq->size - 1; - else - max_tries = 1; - - while (size > 0) { - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1089,7 +1031,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) { @@ -1176,7 +1118,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) { @@ -1205,52 +1147,36 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } - static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; - - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + uint32_t remained = count; + + do { + 
rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & + (vq->size - 1)]); + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } - - do_data_copy_enqueue(dev, vq); + pkt_idx++; + remained--; + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { - flush_enqueue_shadow_used_ring_packed(dev, vq); + if (pkt_idx) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); + flush_enqueue_shadow_used_ring_packed(dev, vq); + } vhost_vring_call_packed(dev, vq); } -- 2.17.1
Optimize vhost zero copy dequeue path like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index c485e7f49..9ab95763a 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1881,6 +1881,141 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } +static __rte_unused int +virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE]; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + if (vhost_dequeue_batch_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addrs, ids)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + zmbufs[i] = get_zmbuf(vq); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (!zmbufs[i]) + goto free_pkt; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_mbuf_refcnt_update(pkts[i], 1); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_BATCH_SIZE; + vq->last_avail_idx += PACKED_BATCH_SIZE; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; + +free_pkt: + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue 
*vq, struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + +static __rte_always_inline void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = PACKED_TX_USED_FLAG; + else + flags = PACKED_TX_USED_WRAP_FLAG; + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq->last_used_idx += zmbuf->desc_count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device Rx datapath by separate functions. No-chained and direct descriptors will be handled by batch and other will be handled one by one as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 9ab95763a..20624efdc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -184,17 +184,6 @@ flush_dequeue_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static __rte_always_inline void flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t *lens, uint16_t *ids, uint16_t flags) @@ -378,7 +367,7 @@ flush_enqueue_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { if (!vq->shadow_used_idx) @@ -1784,7 +1773,7 @@ vhost_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) { @@ -1859,7 +1848,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) { @@ -1881,7 +1870,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int 
virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1940,7 +1929,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) @@ -2017,118 +2006,74 @@ free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + free_zmbuf(vq); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, + &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + pkt_idx++; + remained--; + } while (remained); - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u 
buffers\n", - dev->vid, count); + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t dummy_len; - uint16_t desc_count, nr_vec = 0; - int err; + return pkt_idx; +} - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &dummy_len, - VHOST_ACCESS_RO) < 0)) - break; +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & + (vq->size - 1)]); - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; - } - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; - - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); - - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } - - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } + pkt_idx++; + remained--; + flush_dequeue_packed(dev, vq); + } while (remained); - if (likely(dev->dequeue_zero_copy == 0)) { - do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } + if (pkt_idx) { + if (vq->shadow_used_idx) + do_data_copy_dequeue(vq); } - return i; + return pkt_idx; } uint16_t @@ -2204,9 +2149,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
Cache address translation result and use it in next translation. Due to limited regions are supported, buffers are most likely in same region when doing data transmission. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h index 7fb172912..d90235cd6 100644 --- a/lib/librte_vhost/rte_vhost.h +++ b/lib/librte_vhost/rte_vhost.h @@ -91,10 +91,18 @@ struct rte_vhost_mem_region { int fd; }; +struct rte_vhost_mem_region_cache { + uint64_t guest_phys_addr; + uint64_t guest_phys_addr_end; + int64_t host_user_addr_offset; + uint64_t size; +}; + /** * Memory structure includes region and mapping information. */ struct rte_vhost_memory { + struct rte_vhost_mem_region_cache cache_region; uint32_t nregions; struct rte_vhost_mem_region regions[]; }; @@ -232,11 +240,30 @@ rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, struct rte_vhost_mem_region *r; uint32_t i; + struct rte_vhost_mem_region_cache *r_cache; + /* check with cached region */ + r_cache = &mem->cache_region; + if (likely(gpa >= r_cache->guest_phys_addr && gpa < + r_cache->guest_phys_addr_end)) { + if (unlikely(*len > r_cache->guest_phys_addr_end - gpa)) + *len = r_cache->guest_phys_addr_end - gpa; + + return gpa - r_cache->host_user_addr_offset; + } + + for (i = 0; i < mem->nregions; i++) { r = &mem->regions[i]; if (gpa >= r->guest_phys_addr && gpa < r->guest_phys_addr + r->size) { + r_cache->guest_phys_addr = r->guest_phys_addr; + r_cache->guest_phys_addr_end = r->guest_phys_addr + + r->size; + r_cache->size = r->size; + r_cache->host_user_addr_offset = r->guest_phys_addr - + r->host_user_addr; + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) *len = r->guest_phys_addr + r->size - gpa; -- 2.17.1
Disable software pre-fetch actions on Skylake and later platforms. Hardware can fetch needed data for vhost, additional software pre-fetch will impact performance. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 30839a001..5f3b42e56 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep AVX512F) + +ifneq ($(AVX512_SUPPORT),) +CFLAGS += -DDISABLE_SWPREFETCH +endif + ifeq ($(RTE_TOOLCHAIN), gcc) ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 20624efdc..e3872e384 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1075,7 +1075,9 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) for (i = 0; i < PACKED_BATCH_SIZE; i++) { +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); +#endif hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) (uintptr_t)desc_addrs[i]; lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; @@ -1144,8 +1146,10 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t remained = count; do { +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]); +#endif if (remained >= PACKED_BATCH_SIZE) { if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { pkt_idx += PACKED_BATCH_SIZE; @@ -1790,7 +1794,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) for (i = 0; i < PACKED_BATCH_SIZE; i++) { +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); +#endif rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), (void 
*)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); @@ -2046,8 +2052,10 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t remained = count; do { +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->desc_packed[vq->last_avail_idx & (vq->size - 1)]); +#endif if (remained >= PACKED_BATCH_SIZE) { if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index e3872e384..1e113fb3a 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -214,6 +220,29 @@ flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } + + vq->last_used_idx += PACKED_BATCH_SIZE; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static __rte_always_inline void update_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t *ids) @@ -321,6 +350,32 @@ update_dequeue_shadow_used_ring_packed(struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_shadow_used_ring_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, uint16_t count) +{ + vq->shadow_used_packed[0].id = buf_id; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + 
vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1801,8 +1856,12 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); } + if (virtio_net_is_inorder(dev)) + update_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_MASK]); + else + update_dequeue_batch_packed(dev, vq, ids); - update_dequeue_batch_packed(dev, vq, ids); if (virtio_net_with_host_offload(dev)) { UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) @@ -1865,7 +1924,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; - update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + update_dequeue_shadow_used_ring_packed_inorder(vq, buf_id, + desc_count); + else + update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count); vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { -- 2.17.1
On 2019-09-25 19:13, Marvin Liu wrote: > Batch enqueue function will first check whether descriptors are cache > aligned. It will also check prerequisites in the beginning. Batch > enqueue function not support chained mbufs, single packet enqueue > function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 4cba8c5ef..e241436c7 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,10 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > + sizeof(struct vring_packed_desc)) > +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > + > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > #define UNROLL_PRAGMA_PARAM "GCC unroll 4" > #endif > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 520c4c6a8..5e08f7d9b 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -883,6 +883,86 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return pkt_idx; > } > > +static __rte_unused int > +virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > + struct rte_mbuf **pkts) > +{ > + bool wrap_counter = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + uint16_t avail_idx = vq->last_avail_idx; > + uint64_t desc_addrs[PACKED_BATCH_SIZE]; > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint64_t lens[PACKED_BATCH_SIZE]; > + uint16_t i; > + > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > + return -1; Does this really generate better code than just "avail_idx < PACKED_BATCH_SIZE"? and+jne vs cmp+jbe. 
> + > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > + return -1; > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > + if (unlikely(pkts[i]->next != NULL)) > + return -1; > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > + wrap_counter))) > + return -1; > + } > + > + rte_smp_rmb(); > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + lens[i] = descs[avail_idx + i].len; > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > + return -1; > + } > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], > + VHOST_ACCESS_RW); > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > + return -1; > + } > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) > + (uintptr_t)desc_addrs[i]; > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > + } > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > + > + vq->last_avail_idx += PACKED_BATCH_SIZE; > + if (vq->last_avail_idx >= vq->size) { > + vq->last_avail_idx -= vq->size; > + vq->avail_wrap_counter ^= 1; > + } > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + pkts[i]->pkt_len); > + } > + > + return 0; > +} > + > static __rte_unused int16_t > virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > struct rte_mbuf *pkt) >
On Thu, Sep 26, 2019 at 01:13:27AM +0800, Marvin Liu wrote: > Cache address translation result and use it in next translation. Due > to limited regions are supported, buffers are most likely in same > region when doing data transmission. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h > index 7fb172912..d90235cd6 100644 > --- a/lib/librte_vhost/rte_vhost.h > +++ b/lib/librte_vhost/rte_vhost.h > @@ -91,10 +91,18 @@ struct rte_vhost_mem_region { > int fd; > }; > > +struct rte_vhost_mem_region_cache { > + uint64_t guest_phys_addr; > + uint64_t guest_phys_addr_end; > + int64_t host_user_addr_offset; > + uint64_t size; > +}; > + > /** > * Memory structure includes region and mapping information. > */ > struct rte_vhost_memory { > + struct rte_vhost_mem_region_cache cache_region; This breaks ABI. > uint32_t nregions; > struct rte_vhost_mem_region regions[]; > }; > @@ -232,11 +240,30 @@ rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, > struct rte_vhost_mem_region *r; > uint32_t i; > > + struct rte_vhost_mem_region_cache *r_cache; > + /* check with cached region */ > + r_cache = &mem->cache_region; > + if (likely(gpa >= r_cache->guest_phys_addr && gpa < > + r_cache->guest_phys_addr_end)) { > + if (unlikely(*len > r_cache->guest_phys_addr_end - gpa)) > + *len = r_cache->guest_phys_addr_end - gpa; > + > + return gpa - r_cache->host_user_addr_offset; > + } Does this help a lot in performance? We can implement this caching for builtin backend first. 
> + > + > for (i = 0; i < mem->nregions; i++) { > r = &mem->regions[i]; > if (gpa >= r->guest_phys_addr && > gpa < r->guest_phys_addr + r->size) { > > + r_cache->guest_phys_addr = r->guest_phys_addr; > + r_cache->guest_phys_addr_end = r->guest_phys_addr + > + r->size; > + r_cache->size = r->size; > + r_cache->host_user_addr_offset = r->guest_phys_addr - > + r->host_user_addr; > + > if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) > *len = r->guest_phys_addr + r->size - gpa; > > -- > 2.17.1 >
On Thu, Sep 26, 2019 at 01:13:16AM +0800, Marvin Liu wrote:
> Add macro for unifying Clang/ICC/GCC unroll pragma format. Batch
> functions consisted of several small loops which are optimized by the
> compiler's loop unrolling pragma.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index 8623e91c0..30839a001 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user
> CFLAGS += -fno-strict-aliasing
> LDLIBS += -lpthread
>
> +ifeq ($(RTE_TOOLCHAIN), gcc)
> +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), clang)
> +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1)
> +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), icc)
> +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
> +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA
> +endif
> +endif
> +
> ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y)
> LDLIBS += -lnuma
> endif
You need to add meson support as well.
On Thu, Sep 26, 2019 at 01:13:24AM +0800, Marvin Liu wrote:
> static __rte_noinline uint32_t
> virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
> struct rte_mbuf **pkts, uint32_t count)
> {
> uint32_t pkt_idx = 0;
> - uint16_t num_buffers;
> - struct buf_vector buf_vec[BUF_VECTOR_MAX];
> -
> - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
> - uint16_t nr_vec = 0;
> - uint16_t nr_descs = 0;
> -
> - if (unlikely(reserve_avail_buf_packed(dev, vq,
> - pkt_len, buf_vec, &nr_vec,
> - &num_buffers, &nr_descs) < 0)) {
> - VHOST_LOG_DEBUG(VHOST_DATA,
> - "(%d) failed to get enough desc from vring\n",
> - dev->vid);
> - vq->shadow_used_idx -= num_buffers;
> - break;
> + uint32_t remained = count;
> +
> + do {
> + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx &
> + (vq->size - 1)]);
You can't assume packed ring size is a power of 2.
> -----Original Message----- > From: Marvin Liu <yong.liu@intel.com> > Sent: Thursday, September 26, 2019 1:13 AM > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com; stephen@networkplumber.org; Gavin Hu (Arm > Technology China) <Gavin.Hu@arm.com> > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > Subject: [PATCH v3 07/15] vhost: add flush function for batch enqueue > > Flush used flags when batched enqueue function is finished. Descriptor's > flags are pre-calculated as they will be reset by vhost. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 18a207fc6..7bf9ff9b7 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | > VRING_DESC_F_USED \ > + | VRING_DESC_F_WRITE) > +#define PACKED_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) > #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > sizeof(struct vring_packed_desc)) > #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index f85619dc2..a629e66d4 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -169,6 +169,49 @@ update_shadow_used_ring_packed(struct > vhost_virtqueue *vq, > vq->shadow_used_packed[i].count = count; > } > > +static __rte_always_inline void > +flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue > *vq, > + uint64_t *lens, uint16_t *ids, uint16_t flags) > +{ > + uint16_t i; > + > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; > + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; > + } > + > + rte_smp_wmb(); > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + vq->desc_packed[vq->last_used_idx + i].flags = 
flags; > + > + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * > + sizeof(struct vring_packed_desc), > + sizeof(struct vring_packed_desc) * > + PACKED_BATCH_SIZE); > + vhost_log_cache_sync(dev, vq); > + > + vq->last_used_idx += PACKED_BATCH_SIZE; > + if (vq->last_used_idx >= vq->size) { > + vq->used_wrap_counter ^= 1; > + vq->last_used_idx -= vq->size; > + } > +} > + > +static __rte_always_inline void > +flush_enqueue_batch_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > + uint64_t *lens, uint16_t *ids) > +{ > + uint16_t flags = 0; > + > + if (vq->used_wrap_counter) > + flags = PACKED_RX_USED_FLAG; > + else > + flags = PACKED_RX_USED_WRAP_FLAG; > + flush_used_batch_packed(dev, vq, lens, ids, flags); > +} > + > static __rte_always_inline void > update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, > uint16_t desc_idx, uint32_t len, uint16_t count) > @@ -937,6 +980,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, > struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; > uint32_t buf_offset = dev->vhost_hlen; > uint64_t lens[PACKED_BATCH_SIZE]; > + uint16_t ids[PACKED_BATCH_SIZE]; > uint16_t i; > > if (unlikely(avail_idx & PACKED_BATCH_MASK)) > @@ -1003,6 +1047,12 @@ virtio_dev_rx_batch_packed(struct virtio_net > *dev, struct vhost_virtqueue *vq, > pkts[i]->pkt_len); > } > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + ids[i] = descs[avail_idx + i].id; > + > + flush_enqueue_batch_packed(dev, vq, lens, ids); > + > return 0; > } Reviewed-by: Gavin Hu <gavin.hu@arm.com> > -- > 2.17.1
> -----Original Message----- > From: Bie, Tiwei > Sent: Thursday, September 26, 2019 1:32 PM > To: Liu, Yong <yong.liu@intel.com> > Cc: maxime.coquelin@redhat.com; Wang, Zhihong <zhihong.wang@intel.com>; > stephen@networkplumber.org; gavin.hu@arm.com; dev@dpdk.org > Subject: Re: [PATCH v3 13/15] vhost: cache address translation result > > On Thu, Sep 26, 2019 at 01:13:27AM +0800, Marvin Liu wrote: > > Cache address translation result and use it in next translation. Due > > to limited regions are supported, buffers are most likely in same > > region when doing data transmission. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h > > index 7fb172912..d90235cd6 100644 > > --- a/lib/librte_vhost/rte_vhost.h > > +++ b/lib/librte_vhost/rte_vhost.h > > @@ -91,10 +91,18 @@ struct rte_vhost_mem_region { > > int fd; > > }; > > > > +struct rte_vhost_mem_region_cache { > > + uint64_t guest_phys_addr; > > + uint64_t guest_phys_addr_end; > > + int64_t host_user_addr_offset; > > + uint64_t size; > > +}; > > + > > /** > > * Memory structure includes region and mapping information. > > */ > > struct rte_vhost_memory { > > + struct rte_vhost_mem_region_cache cache_region; > > This breaks ABI. > Got, will remove it as no clear performance gain with this patch. > > uint32_t nregions; > > struct rte_vhost_mem_region regions[]; > > }; > > @@ -232,11 +240,30 @@ rte_vhost_va_from_guest_pa(struct rte_vhost_memory > *mem, > > struct rte_vhost_mem_region *r; > > uint32_t i; > > > > + struct rte_vhost_mem_region_cache *r_cache; > > + /* check with cached region */ > > + r_cache = &mem->cache_region; > > + if (likely(gpa >= r_cache->guest_phys_addr && gpa < > > + r_cache->guest_phys_addr_end)) { > > + if (unlikely(*len > r_cache->guest_phys_addr_end - gpa)) > > + *len = r_cache->guest_phys_addr_end - gpa; > > + > > + return gpa - r_cache->host_user_addr_offset; > > + } > > Does this help a lot in performance? 
> We can implement this caching for builtin backend first. > Tiwei, It won’t help too much in performance as region number will be 1 at most of times. Will remove cache function in next version. Thanks, Marvin > > > + > > + > > for (i = 0; i < mem->nregions; i++) { > > r = &mem->regions[i]; > > if (gpa >= r->guest_phys_addr && > > gpa < r->guest_phys_addr + r->size) { > > > > + r_cache->guest_phys_addr = r->guest_phys_addr; > > + r_cache->guest_phys_addr_end = r->guest_phys_addr + > > + r->size; > > + r_cache->size = r->size; > > + r_cache->host_user_addr_offset = r->guest_phys_addr - > > + r->host_user_addr; > > + > > if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) > > *len = r->guest_phys_addr + r->size - gpa; > > > > -- > > 2.17.1 > >
> -----Original Message----- > From: Mattias Rönnblom [mailto:hofors@lysator.liu.se] > Sent: Thursday, September 26, 2019 3:31 AM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; > stephen@networkplumber.org; gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [dpdk-dev] [PATCH v3 03/15] vhost: add batch enqueue function > for packed ring > > On 2019-09-25 19:13, Marvin Liu wrote: > > Batch enqueue function will first check whether descriptors are cache > > aligned. It will also check prerequisites in the beginning. Batch > > enqueue function not support chained mbufs, single packet enqueue > > function will handle it. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index 4cba8c5ef..e241436c7 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,10 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > > + sizeof(struct vring_packed_desc)) > > +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > > + > > #ifdef SUPPORT_GCC_UNROLL_PRAGMA > > #define UNROLL_PRAGMA_PARAM "GCC unroll 4" > > #endif > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index 520c4c6a8..5e08f7d9b 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -883,6 +883,86 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > vhost_virtqueue *vq, > > return pkt_idx; > > } > > > > +static __rte_unused int > > +virtio_dev_rx_batch_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > > + struct rte_mbuf **pkts) > > +{ > > + bool wrap_counter = vq->avail_wrap_counter; > > + struct vring_packed_desc *descs = vq->desc_packed; > > + uint16_t avail_idx = vq->last_avail_idx; > > + uint64_t desc_addrs[PACKED_BATCH_SIZE]; > > + struct virtio_net_hdr_mrg_rxbuf 
*hdrs[PACKED_BATCH_SIZE]; > > + uint32_t buf_offset = dev->vhost_hlen; > > + uint64_t lens[PACKED_BATCH_SIZE]; > > + uint16_t i; > > + > > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > > + return -1; > > Does this really generate better code than just "avail_idx < > PACKED_BATCH_SIZE"? and+jne vs cmp+jbe. Hi Mattias, This comparison is to check whether descriptor location is cache aligned. In x86 cache line size is 64 bytes, so here mask is 0x3. This check will be and + test + je which is very simple. Most of times the cost of execution will be eliminated as the result can be predicted. Thanks, Marvin > > > + > > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > > + return -1; > > + > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > + if (unlikely(pkts[i]->next != NULL)) > > + return -1; > > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > > + wrap_counter))) > > + return -1; > > + } > > + > > + rte_smp_rmb(); > > + > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > > + lens[i] = descs[avail_idx + i].len; > > + > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > > + return -1; > > + } > > + > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > + descs[avail_idx + i].addr, > > + &lens[i], > > + VHOST_ACCESS_RW); > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > > + return -1; > > + } > > + > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) > > + (uintptr_t)desc_addrs[i]; > > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > > + } > > + > > + 
UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > > + > > + vq->last_avail_idx += PACKED_BATCH_SIZE; > > + if (vq->last_avail_idx >= vq->size) { > > + vq->last_avail_idx -= vq->size; > > + vq->avail_wrap_counter ^= 1; > > + } > > + > > + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > + pkts[i]->pkt_len); > > + } > > + > > + return 0; > > +} > > + > > static __rte_unused int16_t > > virtio_dev_rx_single_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > > struct rte_mbuf *pkt) > >
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proved in the virtio user driver; on a normal E5 Xeon cpu, single core performance can raise 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However, vhost performance with packed ring was decreased. Through analysis, most of the extra cost was from calculating each descriptor flag, which depends on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors, which will cause cache contention. Especially when doing the vhost enqueue function, the virtio refill packed ring function may write the same cache line while vhost is doing the enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, vhost enqueue and dequeue functions will be split into fast and normal paths. Several methods will be taken in the fast path: Handle descriptors in one cache line by batch. Split loop function into more pieces and unroll them. Prerequisite check whether I/O space can copy directly into mbuf space and vice versa. Prerequisite check whether descriptor mapping is successful. Distinguish vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. Disable software prefetch if hardware can do better. After all these methods are done, single core vhost PvP performance with 64B packet on Xeon 8180 can boost 40%. 
v4: - Support meson build - Remove memory region cache for no clear performance gain and ABI break - Not assume ring size is power of two v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (14): vhost: add single packet enqueue function vhost: unify unroll pragma parameter vhost: add batch enqueue function for packed ring vhost: add single packet dequeue function vhost: add batch dequeue function vhost: flush vhost enqueue shadow ring by batch vhost: add flush function for batch enqueue vhost: buffer vhost dequeue shadow ring vhost: split enqueue and dequeue flush functions vhost: optimize enqueue function of packed ring vhost: add batch and single zero dequeue functions vhost: optimize dequeue function of packed ring vhost: check whether disable software pre-fetch vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 24 + lib/librte_vhost/meson.build | 11 + lib/librte_vhost/vhost.h | 33 ++ lib/librte_vhost/virtio_net.c | 993 +++++++++++++++++++++++++++------- 4 files changed, 878 insertions(+), 183 deletions(-) -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5b85b832d..520c4c6a8 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -774,6 +774,58 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +static __rte_always_inline int +vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, struct buf_vector *buf_vec, uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + uint16_t avail_idx = vq->last_avail_idx; + uint16_t max_tries, tries = 0; + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + uint16_t num_buffers = 0; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -831,6 +883,36 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq->last_avail_idx += nr_descs; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Add macro for unifying Clang/ICC/GCC unroll pragma format. Batch functions were contained of several small loops which optimized by compiler’s loop unrolling pragma. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..30839a001 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +ifeq ($(RTE_TOOLCHAIN), gcc) +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), clang) +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), icc) +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA +endif +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index cb1123ae3..ddf0ee579 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -8,6 +8,13 @@ endif if has_libnuma == 1 dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) endif +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) + cflags += '-DSUPPORT_GCC_UNROLL_PRAGMA' +elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) + cflags += '-DSUPPORT_CLANG_UNROLL_PRAGMA' +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) + cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA' +endif dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', cc.has_header('linux/userfaultfd.h')) version = 4 diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 884befa85..4cba8c5ef 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,24 @@ #define VHOST_LOG_CACHE_NR 32 +#ifdef SUPPORT_GCC_UNROLL_PRAGMA +#define UNROLL_PRAGMA_PARAM "GCC 
unroll 4" +#endif + +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA +#define UNROLL_PRAGMA_PARAM "unroll 4" +#endif + +#ifdef SUPPORT_ICC_UNROLL_PRAGMA +#define UNROLL_PRAGMA_PARAM "unroll (4)" +#endif + +#ifdef UNROLL_PRAGMA_PARAM +#define UNROLL_PRAGMA(param) _Pragma(param) +#else +#define UNROLL_PRAGMA(param) do {} while (0); +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
Batch enqueue function will first check whether descriptors are cache aligned. It will also check prerequisites in the beginning. Batch enqueue function not support chained mbufs, single packet enqueue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 4cba8c5ef..e241436c7 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,10 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ + sizeof(struct vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + #ifdef SUPPORT_GCC_UNROLL_PRAGMA #define UNROLL_PRAGMA_PARAM "GCC unroll 4" #endif diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 520c4c6a8..5e08f7d9b 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -883,6 +883,86 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int +virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + uint64_t desc_addrs[PACKED_BATCH_SIZE]; + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(pkts[i]->next != NULL)) + return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + } + + rte_smp_rmb(); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + lens[i] = descs[avail_idx + i].len; + + 
UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) + return -1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], + VHOST_ACCESS_RW); + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq->last_avail_idx += PACKED_BATCH_SIZE; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile left space for shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5e08f7d9b..17aabe8eb 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1571,6 +1571,60 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t dummy_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &dummy_len, + VHOST_ACCESS_RO) < 0)) + return -1; + + *pkts = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add batch dequeue function like enqueue function for packed ring, batch dequeue function will not support chained descritpors, single packet dequeue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index e241436c7..e50e137ca 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -61,6 +61,8 @@ #define UNROLL_PRAGMA(param) do {} while (0); #endif +#define PACKED_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | VRING_DESC_F_INDIRECT) + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 17aabe8eb..2ff7329b2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1571,6 +1571,119 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t avail_idx, uintptr_t *desc_addrs, uint16_t *ids) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint64_t lens[PACKED_BATCH_SIZE]; + uint64_t buf_lens[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + if (unlikely(descs[avail_idx + i].flags & + PACKED_SINGLE_DEQUEUE_FLAG)) + return -1; + } + + rte_smp_rmb(); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + lens[i] = descs[avail_idx + i].len; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) 
{ + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + return 0; +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + struct virtio_net_hdr *hdr; + uint16_t i; + + if (vhost_dequeue_batch_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addrs, ids)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + } + + if (virtio_net_with_host_offload(dev)) { + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + vq->last_avail_idx += PACKED_BATCH_SIZE; + if 
(vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t *buf_id, -- 2.17.1
Buffer vhost enqueue shadow ring update, flush shadow ring until buffered descriptors number exceed one batch. Thus virtio can receive packets at a faster frequency. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index e50e137ca..18a207fc6 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -163,6 +163,7 @@ struct vhost_virtqueue { struct vring_used_elem_packed *shadow_used_packed; }; uint16_t shadow_used_idx; + uint16_t enqueue_shadow_count; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 2ff7329b2..f85619dc2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -169,6 +169,24 @@ update_shadow_used_ring_packed(struct vhost_virtqueue *vq, vq->shadow_used_packed[i].count = count; } +static __rte_always_inline void +update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t desc_idx, uint32_t len, uint16_t count) +{ + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->enqueue_shadow_count = vq->last_used_idx & + PACKED_BATCH_MASK; + + uint16_t i = vq->shadow_used_idx++; + + vq->shadow_used_packed[i].id = desc_idx; + vq->shadow_used_packed[i].len = len; + vq->shadow_used_packed[i].count = count; + + vq->enqueue_shadow_count += count; +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -198,6 +216,23 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +flush_enqueue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint32_t len[], uint16_t id[], + uint16_t count[], uint16_t num_buffers) +{ + int i; + for (i = 0; i < num_buffers; i++) { + update_enqueue_shadow_used_ring_packed(vq, id[i], len[i], + count[i]); + + if (vq->enqueue_shadow_count >= PACKED_BATCH_SIZE) { + 
do_data_copy_enqueue(dev, vq); + flush_shadow_used_ring_packed(dev, vq); + } + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -786,6 +821,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers = 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -811,6 +849,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -822,6 +863,8 @@ vhost_enqueue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + flush_enqueue_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); return 0; } -- 2.17.1
Flush used flags when batched enqueue function is finished. Descriptor's flags are pre-calculated as they will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 18a207fc6..7bf9ff9b7 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED \ + | VRING_DESC_F_WRITE) +#define PACKED_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index f85619dc2..a629e66d4 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -169,6 +169,49 @@ update_shadow_used_ring_packed(struct vhost_virtqueue *vq, vq->shadow_used_packed[i].count = count; } +static __rte_always_inline void +flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t *lens, uint16_t *ids, uint16_t flags) +{ + uint16_t i; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; + } + + rte_smp_wmb(); + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq->last_used_idx += PACKED_BATCH_SIZE; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static __rte_always_inline void +flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t *lens, uint16_t *ids) 
+{ + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = PACKED_RX_USED_FLAG; + else + flags = PACKED_RX_USED_WRAP_FLAG; + flush_used_batch_packed(dev, vq, lens, ids, flags); +} + static __rte_always_inline void update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -937,6 +980,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; uint32_t buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1003,6 +1047,12 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + ids[i] = descs[avail_idx + i].id; + + flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer used ring updates as many as possible in vhost dequeue function for coordinating with virtio driver. For supporting buffer, shadow used ring element should contain descriptor index and its wrap counter. First shadowed ring index is recorded for calculating buffered number. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 7bf9ff9b7..f62e9ec3f 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -42,6 +42,8 @@ #define PACKED_RX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED \ | VRING_DESC_F_WRITE) #define PACKED_RX_USED_WRAP_FLAG (VRING_DESC_F_WRITE) +#define PACKED_TX_USED_FLAG (0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) +#define PACKED_TX_USED_WRAP_FLAG (0x0) #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) @@ -110,9 +112,11 @@ struct log_cache_entry { }; struct vring_used_elem_packed { + uint16_t used_idx; uint16_t id; uint32_t len; uint32_t count; + uint16_t used_wrap_counter; }; /** @@ -167,6 +171,7 @@ struct vhost_virtqueue { }; uint16_t shadow_used_idx; uint16_t enqueue_shadow_count; + uint16_t dequeue_shadow_head; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index a629e66d4..8f7209f83 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -230,6 +230,41 @@ update_enqueue_shadow_used_ring_packed(struct vhost_virtqueue *vq, vq->enqueue_shadow_count += count; } +static __rte_always_inline void +update_dequeue_shadow_used_ring_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, uint16_t count) +{ + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].used_idx = 
vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + + if (vq->used_wrap_counter) + vq->desc_packed[vq->last_used_idx].flags = + PACKED_TX_USED_FLAG; + else + vq->desc_packed[vq->last_used_idx].flags = + PACKED_TX_USED_WRAP_FLAG; + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1822,6 +1857,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; + update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count); + vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { vq->last_avail_idx -= vq->size; -- 2.17.1
Vhost enqueue descriptors are updated by batch number, while vhost dequeue descriptors are buffered. Meanwhile in dequeue function only first descriptor is buffered. Due to these differences, split vhost enqueue and dequeue flush functions. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 8f7209f83..1b0fa2c64 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -92,8 +92,8 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, } static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) +flush_enqueue_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) { int i; uint16_t used_idx = vq->last_used_idx; @@ -158,6 +158,32 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +flush_dequeue_shadow_used_ring_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + uint16_t head_idx = vq->dequeue_shadow_head; + uint16_t head_flags; + struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0]; + + if (used_elem->used_wrap_counter) + head_flags = PACKED_TX_USED_FLAG; + else + head_flags = PACKED_TX_USED_WRAP_FLAG; + + vq->desc_packed[head_idx].id = used_elem->id; + + rte_smp_wmb(); + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -199,6 +225,47 @@ flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint16_t *ids) +{ + 
uint16_t flags = 0; + uint16_t i; + + if (vq->used_wrap_counter) + flags = PACKED_TX_USED_FLAG; + else + flags = PACKED_TX_USED_WRAP_FLAG; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 1; i < PACKED_BATCH_SIZE; i++) + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + rte_smp_wmb(); + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 1; i < PACKED_BATCH_SIZE; i++) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vq->shadow_used_idx = 1; + vq->last_used_idx += PACKED_BATCH_SIZE; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + } else { + uint64_t lens[PACKED_BATCH_SIZE] = {0}; + flush_used_batch_packed(dev, vq, lens, ids, flags); + } +} + static __rte_always_inline void flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t *lens, uint16_t *ids) @@ -306,11 +373,29 @@ flush_enqueue_packed(struct virtio_net *dev, if (vq->enqueue_shadow_count >= PACKED_BATCH_SIZE) { do_data_copy_enqueue(dev, vq); - flush_shadow_used_ring_packed(dev, vq); + flush_enqueue_shadow_used_ring_packed(dev, vq); } } } +static __rte_unused void +flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) +{ + if (!vq->shadow_used_idx) + return; + + int16_t shadow_count = vq->last_used_idx - vq->dequeue_shadow_head; + if (shadow_count <= 0) + shadow_count += vq->size; + + /* buffer used descs as many as possible when doing dequeue */ + if ((uint16_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + flush_dequeue_shadow_used_ring_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to 
lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1165,7 +1250,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_enqueue_shadow_used_ring_packed(dev, vq); vhost_vring_call_packed(dev, vq); } @@ -1796,6 +1881,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, pkts[i]->pkt_len); } + update_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_with_host_offload(dev)) { UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) for (i = 0; i < PACKED_BATCH_SIZE; i++) { @@ -1896,7 +1983,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_dequeue_shadow_used_ring_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } @@ -1975,7 +2062,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, if (unlikely(i < count)) vq->shadow_used_idx = i; if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + flush_dequeue_shadow_used_ring_packed(dev, vq); vhost_vring_call_packed(dev, vq); } } -- 2.17.1
Optimize vhost device Tx datapath by separate functions. Packets can be filled into one descriptor will be handled by batch and others will be handled one by one as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 1b0fa2c64..5f2822ba2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -753,64 +753,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *nr_vec, uint16_t *num_buffers, - uint16_t *nr_descs) -{ - uint16_t avail_idx; - uint16_t vec_idx = 0; - uint16_t max_tries, tries = 0; - - uint16_t buf_id = 0; - uint32_t len = 0; - uint16_t desc_count; - - *num_buffers = 0; - avail_idx = vq->last_avail_idx; - - if (rxvq_is_mergeable(dev)) - max_tries = vq->size - 1; - else - max_tries = 1; - - while (size > 0) { - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1089,7 +1031,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) { @@ -1176,7 +1118,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) { @@ -1205,52 +1147,36 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } - static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t remained = count; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get enough desc from vring\n", - dev->vid); - 
vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } - - do_data_copy_enqueue(dev, vq); + pkt_idx++; + remained--; + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { - flush_enqueue_shadow_used_ring_packed(dev, vq); + if (pkt_idx) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); + flush_enqueue_shadow_used_ring_packed(dev, vq); + } vhost_vring_call_packed(dev, vq); } -- 2.17.1
Optimize vhost zero copy dequeue path like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5f2822ba2..deb9d0e39 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1881,6 +1881,141 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } +static __rte_unused int +virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE]; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + if (vhost_dequeue_batch_packed(dev, vq, mbuf_pool, pkts, avail_idx, + desc_addrs, ids)) + return -1; + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + zmbufs[i] = get_zmbuf(vq); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + if (!zmbufs[i]) + goto free_pkt; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_mbuf_refcnt_update(pkts[i], 1); + + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_BATCH_SIZE; + vq->last_avail_idx += PACKED_BATCH_SIZE; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; + +free_pkt: + UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue 
*vq, struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq->last_avail_idx += desc_count; + if (vq->last_avail_idx >= vq->size) { + vq->last_avail_idx -= vq->size; + vq->avail_wrap_counter ^= 1; + } + + return 0; +} + +static __rte_always_inline void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags = 0; + + if (vq->used_wrap_counter) + flags = PACKED_TX_USED_FLAG; + else + flags = PACKED_TX_USED_WRAP_FLAG; + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq->last_used_idx += zmbuf->desc_count; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device Rx datapath by separate functions. No-chained and direct descriptors will be handled by batch and other will be handled one by one as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index deb9d0e39..56c2080fb 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -184,17 +184,6 @@ flush_dequeue_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static __rte_always_inline void flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t *lens, uint16_t *ids, uint16_t flags) @@ -378,7 +367,7 @@ flush_enqueue_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { if (!vq->shadow_used_idx) @@ -1784,7 +1773,7 @@ vhost_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) { @@ -1859,7 +1848,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) { @@ -1881,7 +1870,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -static __rte_unused int +static __rte_always_inline int 
virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1940,7 +1929,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts) @@ -2017,118 +2006,73 @@ free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + free_zmbuf(vq); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, + &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + pkt_idx++; + remained--; + } while (remained); - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u 
buffers\n", - dev->vid, count); + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t dummy_len; - uint16_t desc_count, nr_vec = 0; - int err; + return pkt_idx; +} - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &dummy_len, - VHOST_ACCESS_RO) < 0)) - break; +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; - } - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; - - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); - - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } - - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } - } + pkt_idx++; + remained--; + flush_dequeue_packed(dev, vq); + } while (remained); - if (likely(dev->dequeue_zero_copy == 0)) { - do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_dequeue_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } + if (pkt_idx) { + if (vq->shadow_used_idx) + do_data_copy_dequeue(vq); } - return i; + return pkt_idx; } uint16_t @@ -2204,9 +2148,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
Disable software pre-fetch actions on Skylake and later platforms. Hardware can fetch needed data for vhost, additional software pre-fetch will impact performance. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 30839a001..5f3b42e56 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep AVX512F) + +ifneq ($(AVX512_SUPPORT),) +CFLAGS += -DDISABLE_SWPREFETCH +endif + ifeq ($(RTE_TOOLCHAIN), gcc) ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index ddf0ee579..5c6f0c0b4 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -15,6 +15,10 @@ elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA' endif +r = run_command(toolchain, '-march=native', '-dM', '-E', '-', '</dev/null', '|', 'grep AVX512F') +if (r.stdout().strip() != '') + cflags += '-DDISABLE_SWPREFETCH' +endif dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', cc.has_header('linux/userfaultfd.h')) version = 4 diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 56c2080fb..046e497c2 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1075,7 +1075,9 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) for (i = 0; i < PACKED_BATCH_SIZE; i++) { +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); +#endif hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) (uintptr_t)desc_addrs[i]; lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; @@ -1144,7 +1146,9 @@ 
virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t remained = count; do { +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); +#endif if (remained >= PACKED_BATCH_SIZE) { if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { @@ -1790,7 +1794,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) for (i = 0; i < PACKED_BATCH_SIZE; i++) { +#ifndef DISABLE_SWPREFETCH rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); +#endif rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); @@ -2046,7 +2052,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint32_t remained = count; do { +#ifndef DISABLE_SWPREFETCH rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); +#endif if (remained >= PACKED_BATCH_SIZE) { if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 046e497c2..6f28082bc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -214,6 +220,29 @@ flush_used_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } + + vq->last_used_idx += PACKED_BATCH_SIZE; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static __rte_always_inline void update_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t *ids) @@ -321,6 +350,32 @@ update_dequeue_shadow_used_ring_packed(struct vhost_virtqueue *vq, } } +static __rte_always_inline void +update_dequeue_shadow_used_ring_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, uint16_t count) +{ + vq->shadow_used_packed[0].id = buf_id; + + if (!vq->shadow_used_idx) { + vq->dequeue_shadow_head = vq->last_used_idx; + + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + 
vq->shadow_used_packed[0].used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].used_wrap_counter = + vq->used_wrap_counter; + + vq->shadow_used_idx = 1; + } + + vq->last_used_idx += count; + + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1801,8 +1856,12 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); } + if (virtio_net_is_inorder(dev)) + update_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_MASK]); + else + update_dequeue_batch_packed(dev, vq, ids); - update_dequeue_batch_packed(dev, vq, ids); if (virtio_net_with_host_offload(dev)) { UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) @@ -1865,7 +1924,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, &desc_count)) return -1; - update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + update_dequeue_shadow_used_ring_packed_inorder(vq, buf_id, + desc_count); + else + update_dequeue_shadow_used_ring_packed(vq, buf_id, desc_count); vq->last_avail_idx += desc_count; if (vq->last_avail_idx >= vq->size) { -- 2.17.1
On 10/9/19 3:38 PM, Marvin Liu wrote:
> Add vhost enqueue function for single packet and meanwhile leave space
> for flush used ring function.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
On 10/9/19 3:38 PM, Marvin Liu wrote: > Add macro for unifying Clang/ICC/GCC unroll pragma format. Batch > functions were contained of several small loops which optimized by > compiler’s loop unrolling pragma. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > index 8623e91c0..30839a001 100644 > --- a/lib/librte_vhost/Makefile > +++ b/lib/librte_vhost/Makefile > @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user > CFLAGS += -fno-strict-aliasing > LDLIBS += -lpthread > > +ifeq ($(RTE_TOOLCHAIN), gcc) > +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) > +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > +endif > +endif > + > +ifeq ($(RTE_TOOLCHAIN), clang) > +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) > +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA > +endif > +endif > + > +ifeq ($(RTE_TOOLCHAIN), icc) > +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) > +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA > +endif > +endif > + > ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) > LDLIBS += -lnuma > endif > diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build > index cb1123ae3..ddf0ee579 100644 > --- a/lib/librte_vhost/meson.build > +++ b/lib/librte_vhost/meson.build > @@ -8,6 +8,13 @@ endif > if has_libnuma == 1 > dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) > endif > +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) > + cflags += '-DSUPPORT_GCC_UNROLL_PRAGMA' > +elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) > + cflags += '-DSUPPORT_CLANG_UNROLL_PRAGMA' > +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) > + cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA' > +endif > dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', > cc.has_header('linux/userfaultfd.h')) > version = 4 > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 884befa85..4cba8c5ef 100644 > --- a/lib/librte_vhost/vhost.h > 
+++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,24 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#ifdef SUPPORT_GCC_UNROLL_PRAGMA > +#define UNROLL_PRAGMA_PARAM "GCC unroll 4" Shouldn't al these defines be either prefixed with VHOST_, or being declared in EAL headers, so that it can be used by other DPDK libs? I will pick it as is for now, but please consider above comment and and send a patch on top if it makes sense. Thanks, Maxime > +#endif > + > +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA > +#define UNROLL_PRAGMA_PARAM "unroll 4" > +#endif > + > +#ifdef SUPPORT_ICC_UNROLL_PRAGMA > +#define UNROLL_PRAGMA_PARAM "unroll (4)" > +#endif > + > +#ifdef UNROLL_PRAGMA_PARAM > +#define UNROLL_PRAGMA(param) _Pragma(param) > +#else > +#define UNROLL_PRAGMA(param) do {} while (0); > +#endif > + > /** > * Structure contains buffer address, length and descriptor index > * from vring to do scatter RX. >
On 10/9/19 3:38 PM, Marvin Liu wrote:
> Batch enqueue function will first check whether descriptors are cache
> aligned. It will also check prerequisites in the beginning. Batch
> enqueue function does not support chained mbufs; the single packet enqueue
> function will handle them.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
On 10/9/19 3:38 PM, Marvin Liu wrote:
> Add vhost single packet dequeue function for packed ring and meanwhile
> leave space for the shadow used ring update function.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
On 10/9/19 3:38 PM, Marvin Liu wrote: > Disable software pre-fetch actions on Skylake and later platforms. > Hardware can fetch needed data for vhost, additional software pre-fetch > will impact performance. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > index 30839a001..5f3b42e56 100644 > --- a/lib/librte_vhost/Makefile > +++ b/lib/librte_vhost/Makefile > @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user > CFLAGS += -fno-strict-aliasing > LDLIBS += -lpthread > > +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep AVX512F) > + > +ifneq ($(AVX512_SUPPORT),) > +CFLAGS += -DDISABLE_SWPREFETCH > +endif That's problematic I think, because the machine running the lib may be different from the machine building it, for example distros. In this case, a Skylake or later may be used to build the package, but with passing "-march=haswell". It would end-up prefetching being disabled whereas we would expect it to be enabled. I see several solutions: - Check for CONFIG_RTE_ENABLE_AVX512 flag. - Keep prefetch instructions (what would be the impact on Skylake and later?) - Remove prefetch instructions (what would be the impact on pre- Skylake?) But really, I think we need some figures before applying such a patch. What performance gain do you measure with this patch? 
> ifeq ($(RTE_TOOLCHAIN), gcc) > ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) > CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build > index ddf0ee579..5c6f0c0b4 100644 > --- a/lib/librte_vhost/meson.build > +++ b/lib/librte_vhost/meson.build > @@ -15,6 +15,10 @@ elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) > elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) > cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA' > endif > +r = run_command(toolchain, '-march=native', '-dM', '-E', '-', '</dev/null', '|', 'grep AVX512F') > +if (r.stdout().strip() != '') > + cflags += '-DDISABLE_SWPREFETCH' > +endif > dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', > cc.has_header('linux/userfaultfd.h')) > version = 4 > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 56c2080fb..046e497c2 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -1075,7 +1075,9 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > > UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > for (i = 0; i < PACKED_BATCH_SIZE; i++) { > +#ifndef DISABLE_SWPREFETCH > rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > +#endif > hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) > (uintptr_t)desc_addrs[i]; > lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > @@ -1144,7 +1146,9 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > uint32_t remained = count; > > do { > +#ifndef DISABLE_SWPREFETCH > rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); > +#endif > > if (remained >= PACKED_BATCH_SIZE) { > if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { > @@ -1790,7 +1794,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > > UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > for (i = 0; i < PACKED_BATCH_SIZE; i++) { > +#ifndef DISABLE_SWPREFETCH > rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > +#endif > 
rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > pkts[i]->pkt_len); > @@ -2046,7 +2052,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > uint32_t remained = count; > > do { > +#ifndef DISABLE_SWPREFETCH > rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); > +#endif > > if (remained >= PACKED_BATCH_SIZE) { > if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, >
On 10/9/19 3:38 PM, Marvin Liu wrote:
> Batch enqueue function will first check whether descriptors are cache
> aligned. It will also check prerequisites in the beginning. Batch
> enqueue function does not support chained mbufs; the single packet enqueue
> function will handle them.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Thinking again about this patch and series in general...
So this series improves performance by 40% in cases where:
- descriptors are cache aligned
- single mbuf
But my understanding is that it will cause performance regression for
the other cases, which may not be that uncommon, no?
Do you have some number about the performance impact on these other
cases?
Thanks,
Maxime
> -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Friday, October 11, 2019 10:12 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v4 13/14] vhost: check whether disable software pre- > fetch > > > > On 10/9/19 3:38 PM, Marvin Liu wrote: > > Disable software pre-fetch actions on Skylake and later platforms. > > Hardware can fetch needed data for vhost, additional software pre-fetch > > will impact performance. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > > index 30839a001..5f3b42e56 100644 > > --- a/lib/librte_vhost/Makefile > > +++ b/lib/librte_vhost/Makefile > > @@ -16,6 +16,12 @@ CFLAGS += -I vhost_user > > CFLAGS += -fno-strict-aliasing > > LDLIBS += -lpthread > > > > +AVX512_SUPPORT=$(shell $(CC) -march=native -dM -E - </dev/null |grep > AVX512F) > > + > > +ifneq ($(AVX512_SUPPORT),) > > +CFLAGS += -DDISABLE_SWPREFETCH > > +endif > > That's problematic I think, because the machine running the lib may be > different from the machine building it, for example distros. > > In this case, a Skylake or later may be used to build the package, but > with passing "-march=haswell". It would end-up prefetching being > disabled whereas we would expect it to be enabled. > Thanks, Maxime. Got your idea. Compiling environment and running environment maybe different. Performance impact on skylake is around 1% in V1 patch under vhost/virtio loopback scenario. Since the impact is very small and has no impact in later revised version. I'd like to remove this patch. Regards, Marvin > I see several solutions: > - Check for CONFIG_RTE_ENABLE_AVX512 flag. > - Keep prefetch instructions (what would be the impact on Skylake and > later?) 
> - Remove prefetch instructions (what would be the impact on pre- > Skylake?) > > > But really, I think we need some figures before applying such a patch. > What performance gain do you measure with this patch? > > > ifeq ($(RTE_TOOLCHAIN), gcc) > > ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) > > CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > > diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build > > index ddf0ee579..5c6f0c0b4 100644 > > --- a/lib/librte_vhost/meson.build > > +++ b/lib/librte_vhost/meson.build > > @@ -15,6 +15,10 @@ elif (toolchain == 'clang' and > cc.version().version_compare('>=3.7.0')) > > elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) > > cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA' > > endif > > +r = run_command(toolchain, '-march=native', '-dM', '-E', '-', > '</dev/null', '|', 'grep AVX512F') > > +if (r.stdout().strip() != '') > > + cflags += '-DDISABLE_SWPREFETCH' > > +endif > > dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', > > cc.has_header('linux/userfaultfd.h')) > > version = 4 > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index 56c2080fb..046e497c2 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -1075,7 +1075,9 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, > > > > UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > +#ifndef DISABLE_SWPREFETCH > > rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > +#endif > > hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) > > (uintptr_t)desc_addrs[i]; > > lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > > @@ -1144,7 +1146,9 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > > uint32_t remained = count; > > > > do { > > +#ifndef DISABLE_SWPREFETCH > > rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); > > +#endif > > > > if (remained >= PACKED_BATCH_SIZE) { > > if 
(!virtio_dev_rx_batch_packed(dev, vq, pkts)) { > > @@ -1790,7 +1794,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, > > > > UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > > for (i = 0; i < PACKED_BATCH_SIZE; i++) { > > +#ifndef DISABLE_SWPREFETCH > > rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > +#endif > > rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > pkts[i]->pkt_len); > > @@ -2046,7 +2052,9 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq, > > uint32_t remained = count; > > > > do { > > +#ifndef DISABLE_SWPREFETCH > > rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); > > +#endif > > > > if (remained >= PACKED_BATCH_SIZE) { > > if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, > >
> -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Friday, October 11, 2019 8:49 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v4 02/14] vhost: unify unroll pragma parameter > > > > On 10/9/19 3:38 PM, Marvin Liu wrote: > > Add macro for unifying Clang/ICC/GCC unroll pragma format. Batch > > functions were contained of several small loops which optimized by > > compiler’s loop unrolling pragma. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile > > index 8623e91c0..30839a001 100644 > > --- a/lib/librte_vhost/Makefile > > +++ b/lib/librte_vhost/Makefile > > @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user > > CFLAGS += -fno-strict-aliasing > > LDLIBS += -lpthread > > > > +ifeq ($(RTE_TOOLCHAIN), gcc) > > +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) > > +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA > > +endif > > +endif > > + > > +ifeq ($(RTE_TOOLCHAIN), clang) > > +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 > && echo 1), 1) > > +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA > > +endif > > +endif > > + > > +ifeq ($(RTE_TOOLCHAIN), icc) > > +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) > > +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA > > +endif > > +endif > > + > > ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) > > LDLIBS += -lnuma > > endif > > diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build > > index cb1123ae3..ddf0ee579 100644 > > --- a/lib/librte_vhost/meson.build > > +++ b/lib/librte_vhost/meson.build > > @@ -8,6 +8,13 @@ endif > > if has_libnuma == 1 > > dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) > > endif > > +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) > > + cflags += '-DSUPPORT_GCC_UNROLL_PRAGMA' > > 
+elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) > > + cflags += '-DSUPPORT_CLANG_UNROLL_PRAGMA' > > +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) > > + cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA' > > +endif > > dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', > > cc.has_header('linux/userfaultfd.h')) > > version = 4 > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index 884befa85..4cba8c5ef 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,24 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#ifdef SUPPORT_GCC_UNROLL_PRAGMA > > +#define UNROLL_PRAGMA_PARAM "GCC unroll 4" > > Shouldn't al these defines be either prefixed with VHOST_, or being > declared in EAL headers, so that it can be used by other DPDK libs? > > I will pick it as is for now, but please consider above comment and > and send a patch on top if it makes sense. > Hi Maxime, For making loop unroll macro more generic, modified version as below. Since only vhost utilize the benefit of compiler's unroll feature, I'd like to keep it in vhost by now. 
#ifdef SUPPORT_GCC_UNROLL_PRAGMA #define for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ for (iter = val; iter < size; iter++) #endif #ifdef SUPPORT_CLANG_UNROLL_PRAGMA #define for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ for (iter = val; iter < size; iter++) #endif #ifdef SUPPORT_ICC_UNROLL_PRAGMA #define for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ for (iter = val; iter < size; iter++) #endif #ifndef for_each_try_unroll #define for_each_try_unroll(iter, val, num) \ for (iter = val; iter < num; iter++) #endif Regards, Marvin > Thanks, > Maxime > > +#endif > > + > > +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA > > +#define UNROLL_PRAGMA_PARAM "unroll 4" > > +#endif > > + > > +#ifdef SUPPORT_ICC_UNROLL_PRAGMA > > +#define UNROLL_PRAGMA_PARAM "unroll (4)" > > +#endif > > + > > +#ifdef UNROLL_PRAGMA_PARAM > > +#define UNROLL_PRAGMA(param) _Pragma(param) > > +#else > > +#define UNROLL_PRAGMA(param) do {} while (0); > > +#endif > > + > > /** > > * Structure contains buffer address, length and descriptor index > > * from vring to do scatter RX. > >
> -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Friday, October 11, 2019 10:22 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v4 03/14] vhost: add batch enqueue function for packed > ring > > > > On 10/9/19 3:38 PM, Marvin Liu wrote: > > Batch enqueue function will first check whether descriptors are cache > > aligned. It will also check prerequisites in the beginning. Batch > > enqueue function not support chained mbufs, single packet enqueue > > function will handle it. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > Thinking again about this patch and series in general... > > So this series improves performance by 40% in cases where: > - descriptors are cache aligned > - single mbuf > > But my understanding is that it will cause performance regression for > the other cases, which may not be that uncommon, no? > > Do you have some number about the performance impact on these other > cases? > Hi Maxime, Check prerequisites of batch handling is pretty simple and fast. It almost has no performance impact on uncommon case. Chained packets can slightly benefit from cache related optimization. As shown in below table, all cases I run can benefit from vhost optimization. From our experimental, more performance gain can be seen if more packets handled by batch. 
+---------------------------------------------------+ | | 19.08 | + opt | |-----------------------------------|-------|-------| | 1518B PvP | 2.63M | 2.98M | |-----------------------------------|-------|-------| | 64B loopback | 7.81M | 12.0M | |-----------------------------------|-------|-------| | 1518B loopback | 3.59M | 4.69M | |-----------------------------------|-------|-------| | 16K chained loopback | 297K | 306K | |-----------------------------------|-------|-------| | 50% 256B + 50% 16K | 296K | 309K | |-----------------------------------|-------|-------| | pktgen_sample03_burst_single_flow | 6.03M | 6.39M | +---------------------------------------------------+ Regards, Marvin > Thanks, > Maxime
Hi Marvin, > -----Original Message----- > From: Marvin Liu <yong.liu@intel.com> > Sent: Wednesday, October 16, 2019 12:08 AM > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > zhihong.wang@intel.com; stephen@networkplumber.org; Gavin Hu (Arm > Technology China) <Gavin.Hu@arm.com> > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > Subject: [PATCH v6 04/13] vhost: add packed ring batch enqueue > > Batch enqueue function will first check whether descriptors are cache > aligned. It will also check prerequisites in the beginning. Batch > enqueue function do not support chained mbufs, single packet enqueue > function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 142c14e04..a8130dc06 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -881,6 +881,76 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > vhost_virtqueue *vq, > return pkt_idx; > } > > +static __rte_unused int > +virtio_dev_rx_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mbuf **pkts) > +{ > + bool wrap_counter = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + uint16_t avail_idx = vq->last_avail_idx; > + uint64_t desc_addrs[PACKED_BATCH_SIZE]; > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint64_t lens[PACKED_BATCH_SIZE]; > + uint16_t i; > + > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > + return -1; > + > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > + return -1; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely(pkts[i]->next != NULL)) > + return -1; > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > + wrap_counter))) > + return -1; > + } > + > + rte_smp_rmb(); > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + 
lens[i] = descs[avail_idx + i].len; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > + return -1; > + } > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], > + VHOST_ACCESS_RW); > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > + return -1; > + } > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) > + (uintptr_t)desc_addrs[i]; > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > + } > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > + > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); Is the last_avail_idx a shared variable? Why is updated before the following payload copy? This will cause the other side get earlier-than-arrival data? /Gavin > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + pkts[i]->pkt_len); > + } > + > + return 0; > +} > + > static __rte_unused int16_t > virtio_dev_rx_single_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, > -- > 2.17.1
> -----Original Message----- > From: Gavin Hu (Arm Technology China) [mailto:Gavin.Hu@arm.com] > Sent: Tuesday, October 15, 2019 7:36 PM > To: Liu, Yong <yong.liu@intel.com>; maxime.coquelin@redhat.com; Bie, Tiwei > <tiwei.bie@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; > stephen@networkplumber.org > Cc: dev@dpdk.org; nd <nd@arm.com> > Subject: RE: [PATCH v6 04/13] vhost: add packed ring batch enqueue > > Hi Marvin, > > > -----Original Message----- > > From: Marvin Liu <yong.liu@intel.com> > > Sent: Wednesday, October 16, 2019 12:08 AM > > To: maxime.coquelin@redhat.com; tiwei.bie@intel.com; > > zhihong.wang@intel.com; stephen@networkplumber.org; Gavin Hu (Arm > > Technology China) <Gavin.Hu@arm.com> > > Cc: dev@dpdk.org; Marvin Liu <yong.liu@intel.com> > > Subject: [PATCH v6 04/13] vhost: add packed ring batch enqueue > > > > Batch enqueue function will first check whether descriptors are cache > > aligned. It will also check prerequisites in the beginning. Batch > > enqueue function do not support chained mbufs, single packet enqueue > > function will handle it. 
> > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> > > > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index 142c14e04..a8130dc06 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -881,6 +881,76 @@ virtio_dev_rx_split(struct virtio_net *dev, struct > > vhost_virtqueue *vq, > > return pkt_idx; > > } > > > > +static __rte_unused int > > +virtio_dev_rx_batch_packed(struct virtio_net *dev, > > + struct vhost_virtqueue *vq, > > + struct rte_mbuf **pkts) > > +{ > > + bool wrap_counter = vq->avail_wrap_counter; > > + struct vring_packed_desc *descs = vq->desc_packed; > > + uint16_t avail_idx = vq->last_avail_idx; > > + uint64_t desc_addrs[PACKED_BATCH_SIZE]; > > + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; > > + uint32_t buf_offset = dev->vhost_hlen; > > + uint64_t lens[PACKED_BATCH_SIZE]; > > + uint16_t i; > > + > > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > > + return -1; > > + > > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > > + return -1; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely(pkts[i]->next != NULL)) > > + return -1; > > + if (unlikely(!desc_is_avail(&descs[avail_idx + i], > > + wrap_counter))) > > + return -1; > > + } > > + > > + rte_smp_rmb(); > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + lens[i] = descs[avail_idx + i].len; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) > > + return -1; > > + } > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > + descs[avail_idx + i].addr, > > + &lens[i], > > + VHOST_ACCESS_RW); > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely(lens[i] != descs[avail_idx + i].len)) > > + return -1; > > + } > > + > > + for_each_try_unroll(i, 0, 
PACKED_BATCH_SIZE) { > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) > > + (uintptr_t)desc_addrs[i]; > > + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; > > + } > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); > > + > > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); > > Is the last_avail_idx a shared variable? Why is updated before the > following payload copy? > This will cause the other side get earlier-than-arrival data? > /Gavin Hi Gavin, Last_avail_idx and last_used_idx are all vhost local variables. They are used for tracking next available and used index of virtqueue. Last avail_idx value should increase after descs are consumed. Last used_idx value should increase after descs flags are updated. Thanks, Marvin > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > + pkts[i]->pkt_len); > > + } > > + > > + return 0; > > +} > > + > > static __rte_unused int16_t > > virtio_dev_rx_single_packed(struct virtio_net *dev, > > struct vhost_virtqueue *vq, > > -- > > 2.17.1
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proven in the virtio user driver; on a normal E5 Xeon cpu single core performance can raise 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However vhost performance with packed ring performance was decreased. Through analysis, mostly extra cost was from calculating each descriptor flag which depended on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors which will cause cache contention. Especially when doing vhost enqueue function, virtio refill packed ring function may write same cache line when vhost doing enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, vhost enqueue and dequeue function will be split into fast and normal path. Several methods will be taken in fast path: Handle descriptors in one cache line by batch. Split loop function into more pieces and unroll them. Prerequisite check that whether I/O space can copy directly into mbuf space and vice versa. Prerequisite check that whether descriptor mapping is successful. Distinguish vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. After all these methods done, single core vhost PvP performance with 64B packet on Xeon 8180 can boost 35%. 
v5: - Remove disable sw prefetch as performance impact is small - change unroll pragma macro format - Rename shadow counter elements names - clean dequeue update check condition - add inline functions replace of duplicated code - unify code style v4: - Support meson build - Remove memory region cache for no clear performance gain and ABI break - Not assume ring size is power of two v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (13): vhost: add packed ring indexes increasing function vhost: add packed ring single enqueue vhost: try to unroll for each loop vhost: add packed ring batch enqueue vhost: add packed ring single dequeue vhost: add packed ring batch dequeue vhost: flush enqueue updates by batch vhost: flush batched enqueue descs flags directly vhost: buffer packed ring dequeue updates vhost: optimize packed ring enqueue vhost: add packed ring zcopy batch and single dequeue vhost: optimize packed ring dequeue vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 18 + lib/librte_vhost/meson.build | 7 + lib/librte_vhost/vhost.h | 57 +++ lib/librte_vhost/virtio_net.c | 927 +++++++++++++++++++++++++++------- 4 files changed, 814 insertions(+), 195 deletions(-) -- 2.17.1
When vhost doing [de]nqueue, vq's local variable last_[used/avail]_idx will be inceased. Adding inline functions can avoid duplicated codes. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 5131a97a3..22a3ddc38 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -350,6 +350,26 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) wrap_counter != !!(flags & VRING_DESC_F_USED); } +static inline void +vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_used_idx += num; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static inline void +vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_avail_idx += num; + if (vq->last_avail_idx >= vq->size) { + vq->avail_wrap_counter ^= 1; + vq->last_avail_idx -= vq->size; + } +} + void __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5b85b832d..42b662080 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -140,11 +140,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, head_flags = flags; } - vq->last_used_idx += vq->shadow_used_packed[i].count; - if (vq->last_used_idx >= vq->size) { - vq->used_wrap_counter ^= 1; - vq->last_used_idx -= vq->size; - } + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); } vq->desc_packed[head_idx].flags = head_flags; @@ -865,11 +861,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, nr_descs); } do_data_copy_enqueue(dev, vq); @@ -1504,11 +1496,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, 
struct vhost_virtqueue *vq, TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, desc_count); } if (likely(dev->dequeue_zero_copy == 0)) { -- 2.17.1
Add a vhost enqueue function for single packets and meanwhile leave space for the flush used ring function.
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -827,6 +881,32 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq_inc_last_avail_packed(vq, nr_descs); + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Create a macro for adding an unroll pragma before each for loop. Batch functions will consist of several small loops which can be optimized by the compiler's loop unrolling pragma.
sizeof(struct vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + +#ifdef SUPPORT_GCC_UNROLL_PRAGMA +#define for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA +#define for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef SUPPORT_ICC_UNROLL_PRAGMA +#define for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ + for (iter = val; iter < size; iter++) +#endif + +#ifndef for_each_try_unroll +#define for_each_try_unroll(iter, val, num) \ + for (iter = val; iter < num; iter++) +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
The batch enqueue function will first check whether the descriptors are cache aligned. It will also check prerequisites at the beginning. The batch enqueue function does not support chained mbufs; the single packet enqueue function will handle them.
virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Add a vhost single packet dequeue function for the packed ring and meanwhile leave space for the shadow used ring update function.
Add a batch dequeue function like the enqueue function for the packed ring. The batch dequeue function will not support chained descriptors; the single packet dequeue function will handle them.
desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) + return -1; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + if (virtio_net_with_host_offload(dev)) { + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + return 0; + +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + } + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Buffer the vhost enqueue shadowed ring flush action until the buffered number exceeds one batch. Thus virtio can receive packets at a faster frequency.
sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + } else { + head_idx = vq->last_used_idx; + head_flags = flags; + } + + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); + } + + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, + head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -194,6 +257,33 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint32_t len[], + uint16_t id[], + uint16_t count[], + uint16_t num_buffers) +{ + uint16_t i; + for (i = 0; i < num_buffers; i++) { + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->shadow_aligned_idx = vq->last_used_idx & + PACKED_BATCH_MASK; + vq->shadow_used_packed[vq->shadow_used_idx].id = id[i]; + vq->shadow_used_packed[vq->shadow_used_idx].len = len[i]; + vq->shadow_used_packed[vq->shadow_used_idx].count = count[i]; + vq->shadow_aligned_idx += count[i]; + vq->shadow_used_idx++; + } + + if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) { + do_data_copy_enqueue(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -785,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers = 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -810,6 +903,9 @@ vhost_enqueue_single_packed(struct 
virtio_net *dev, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -821,6 +917,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } @@ -1017,7 +1116,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Flush the used flags when the batched enqueue function is finished. The descriptors' flags are pre-calculated as they will be reset by vhost.
lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1047,6 +1078,11 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + ids[i] = descs[avail_idx + i].id; + + vhost_flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer as many used ring updates as possible in the vhost dequeue function to coordinate with the virtio driver. To support buffering, the shadow used ring element should contain the descriptor's flags. The first shadowed ring index is recorded for calculating the buffered number.
sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void vhost_flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -247,6 +264,70 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t *ids) +{ + uint16_t flags; + uint16_t i; + uint16_t begin; + + flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + begin = 1; + } else + begin = 0; + + for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = 0; + } + + rte_smp_wmb(); + for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + +static __rte_always_inline void +vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_idx++; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + 
vq->desc_packed[vq->last_used_idx].len = 0; + vq->desc_packed[vq->last_used_idx].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + } + + vq_inc_last_used_packed(vq, count); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -314,6 +395,26 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } +static __rte_unused void +vhost_flush_dequeue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int shadow_count; + if (!vq->shadow_used_idx) + return; + + shadow_count = vq->last_used_idx - vq->shadow_last_used_idx; + if (shadow_count <= 0) + shadow_count += vq->size; + + /* buffer used descs as many as possible when doing dequeue */ + if ((uint32_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + vhost_flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1788,6 +1889,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + vhost_shadow_dequeue_batch_packed(dev, vq, ids); + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); return 0; @@ -1843,6 +1946,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + vq_inc_last_avail_packed(vq, desc_count); return 0; -- 2.17.1
Optimize the vhost device packed ring enqueue function by splitting it into batch and single functions. Packets that can fit into one descriptor will be handled by the batch function and others will be handled by the single function as before.
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1111,7 +1053,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) @@ -1187,7 +1129,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) @@ -1214,49 +1156,40 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, } static __rte_noinline uint32_t -virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf **pkts, uint32_t count) +virtio_dev_rx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, + uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t remained = count; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get 
enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, nr_descs); - } - - do_data_copy_enqueue(dev, vq); + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); vhost_flush_enqueue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); + return pkt_idx; } -- 2.17.1
Add vhost packed ring zero copy batch and single dequeue functions like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5cdca9a7f..01d1603e3 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1886,6 +1886,122 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } +static __rte_unused int +virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE]; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + zmbufs[i] = get_zmbuf(vq); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (!zmbufs[i]) + goto free_pkt; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_mbuf_refcnt_update(pkts[i], 1); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_BATCH_SIZE; + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; + +free_pkt: + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + 
rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq_inc_last_avail_packed(vq, desc_count); + return 0; +} + +static __rte_always_inline void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags = 0; + bool wrap; + + wrap = vq->used_wrap_counter; + flags = PACKED_DESC_DEQUEUE_USED_FLAG(wrap); + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq_inc_last_used_packed(vq, zmbuf->desc_count); + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device packed ring dequeue function by splitting batch and single functions. No-chained and direct descriptors will be handled by batch and other will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 01d1603e3..85ccc02da 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -201,69 +201,6 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } -static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) -{ - int i; - uint16_t used_idx = vq->last_used_idx; - uint16_t head_idx = vq->last_used_idx; - uint16_t head_flags = 0; - - /* Split loop in two to save memory barriers */ - for (i = 0; i < vq->shadow_used_idx; i++) { - vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; - vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; - - used_idx += vq->shadow_used_packed[i].count; - if (used_idx >= vq->size) - used_idx -= vq->size; - } - - rte_smp_wmb(); - - for (i = 0; i < vq->shadow_used_idx; i++) { - uint16_t flags; - - if (vq->shadow_used_packed[i].len) - flags = VRING_DESC_F_WRITE; - else - flags = 0; - - if (vq->used_wrap_counter) { - flags |= VRING_DESC_F_USED; - flags |= VRING_DESC_F_AVAIL; - } else { - flags &= ~VRING_DESC_F_USED; - flags &= ~VRING_DESC_F_AVAIL; - } - - if (i > 0) { - vq->desc_packed[vq->last_used_idx].flags = flags; - - vhost_log_cache_used_vring(dev, vq, - vq->last_used_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - } else { - head_idx = vq->last_used_idx; - head_flags = flags; - } - - vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); - } - - vq->desc_packed[head_idx].flags = head_flags; - - vhost_log_cache_used_vring(dev, vq, - head_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - - 
vq->shadow_used_idx = 0; - vhost_log_cache_sync(dev, vq); -} - static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -328,17 +265,6 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -395,7 +321,7 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void vhost_flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1799,7 +1725,7 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1866,7 +1792,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1886,7 +1812,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1935,7 +1861,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2003,114 +1929,78 @@ 
free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; + uint32_t pkt_idx = 0; + uint32_t remained = count; - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + free_zmbuf(vq); - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); - - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, + &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + pkt_idx++; + remained--; - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + } while (remained); - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t dummy_len; - uint16_t desc_count, nr_vec = 0; - int err; + return pkt_idx; +} - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &dummy_len, - VHOST_ACCESS_RO) < 0)) - break; +static
__rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + vhost_flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; - } - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; - - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; - - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); - - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } + vhost_flush_dequeue_packed(dev, vq); + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, desc_count); - } + } while (remained); - if (likely(dev->dequeue_zero_copy == 0)) { + if (vq->shadow_used_idx) do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - return i; + return pkt_idx; } uint16_t @@ -2186,9 +2076,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 85ccc02da..88632caff 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -201,6 +207,25 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, + uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -265,6 +290,26 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } +static __rte_always_inline void +vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + vq->shadow_used_packed[0].id = buf_id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + 
vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, count); +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1748,7 +1793,11 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } - vhost_shadow_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_SIZE - 1]); + else + vhost_shadow_dequeue_batch_packed(dev, vq, ids); vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); @@ -1805,7 +1854,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; - vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, + desc_count); + else + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); vq_inc_last_avail_packed(vq, desc_count); -- 2.17.1
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proven in the virtio user driver; on a normal E5 Xeon CPU, single core performance can rise by 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However, vhost performance with packed ring was decreased. Through analysis, most of the extra cost came from calculating each descriptor flag, which depends on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors, which will cause cache contention. Especially when doing the vhost enqueue function, the virtio refill packed ring function may write the same cache line while vhost is doing enqueue. This kind of extra cache cost will reduce the benefit of reducing cache misses. To optimize vhost packed ring performance, the vhost enqueue and dequeue functions will be split into fast and normal paths. Several methods will be taken in the fast path: Handle descriptors in one cache line by batch. Split loop function into more pieces and unroll them. Prerequisite check whether I/O space can be copied directly into mbuf space and vice versa. Prerequisite check whether descriptor mapping is successful. Distinguish the vhost used ring update function by enqueue and dequeue function. Buffer dequeued used descriptors as many as possible. Update enqueued used descriptors by cache line. After all these methods are done, single core vhost PvP performance with 64B packets on Xeon 8180 can boost by 35%. 
v6: - Fix dequeue zcopy result check v5: - Remove disable sw prefetch as performance impact is small - Change unroll pragma macro format - Rename shadow counter elements names - Clean dequeue update check condition - Add inline functions replace of duplicated code - Unify code style v4: - Support meson build - Remove memory region cache for no clear performance gain and ABI break - Not assume ring size is power of two v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (13): vhost: add packed ring indexes increasing function vhost: add packed ring single enqueue vhost: try to unroll for each loop vhost: add packed ring batch enqueue vhost: add packed ring single dequeue vhost: add packed ring batch dequeue vhost: flush enqueue updates by batch vhost: flush batched enqueue descs directly vhost: buffer packed ring dequeue updates vhost: optimize packed ring enqueue vhost: add packed ring zcopy batch and single dequeue vhost: optimize packed ring dequeue vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 18 + lib/librte_vhost/meson.build | 7 + lib/librte_vhost/vhost.h | 57 +++ lib/librte_vhost/virtio_net.c | 924 +++++++++++++++++++++++++++------- 4 files changed, 812 insertions(+), 194 deletions(-) -- 2.17.1
When vhost is doing [en/de]queue, vq's local variable last_[used/avail]_idx will be increased. Adding inline functions can avoid duplicated code. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 5131a97a3..22a3ddc38 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -350,6 +350,26 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) wrap_counter != !!(flags & VRING_DESC_F_USED); } +static inline void +vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_used_idx += num; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static inline void +vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_avail_idx += num; + if (vq->last_avail_idx >= vq->size) { + vq->avail_wrap_counter ^= 1; + vq->last_avail_idx -= vq->size; + } +} + void __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5b85b832d..42b662080 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -140,11 +140,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, head_flags = flags; } - vq->last_used_idx += vq->shadow_used_packed[i].count; - if (vq->last_used_idx >= vq->size) { - vq->used_wrap_counter ^= 1; - vq->last_used_idx -= vq->size; - } + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); } vq->desc_packed[head_idx].flags = head_flags; @@ -865,11 +861,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, nr_descs); } do_data_copy_enqueue(dev, vq); @@ -1504,11 +1496,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, 
struct vhost_virtqueue *vq, TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, desc_count); } if (likely(dev->dequeue_zero_copy == 0)) { -- 2.17.1
Add vhost enqueue function for single packet and meanwhile leave space for the flush used ring function.
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -827,6 +881,32 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq_inc_last_avail_packed(vq, nr_descs); + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Create macro for adding unroll pragma before each for loop. Batch functions will consist of several small loops which can be optimized by the compilers' loop unrolling pragma.
sizeof(struct vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + +#ifdef SUPPORT_GCC_UNROLL_PRAGMA +#define for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA +#define for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef SUPPORT_ICC_UNROLL_PRAGMA +#define for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ + for (iter = val; iter < size; iter++) +#endif + +#ifndef for_each_try_unroll +#define for_each_try_unroll(iter, val, num) \ + for (iter = val; iter < num; iter++) +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
The batch enqueue function does not support chained mbufs; the single packet enqueue function will handle them.
rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile leave space for the shadow used ring update function.
Add batch dequeue function like the enqueue function for packed ring; the batch dequeue function will not support chained descriptors, the single packet dequeue function will handle them.
desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) + return -1; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + if (virtio_net_with_host_offload(dev)) { + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + return 0; + +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + } + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Buffer the vhost enqueue shadowed ring flush action until the buffered number exceeds one batch. Thus virtio can receive packets at a higher frequency.
sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + } else { + head_idx = vq->last_used_idx; + head_flags = flags; + } + + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); + } + + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, + head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -194,6 +257,33 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint32_t len[], + uint16_t id[], + uint16_t count[], + uint16_t num_buffers) +{ + uint16_t i; + for (i = 0; i < num_buffers; i++) { + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->shadow_aligned_idx = vq->last_used_idx & + PACKED_BATCH_MASK; + vq->shadow_used_packed[vq->shadow_used_idx].id = id[i]; + vq->shadow_used_packed[vq->shadow_used_idx].len = len[i]; + vq->shadow_used_packed[vq->shadow_used_idx].count = count[i]; + vq->shadow_aligned_idx += count[i]; + vq->shadow_used_idx++; + } + + if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) { + do_data_copy_enqueue(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -785,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers = 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -810,6 +903,9 @@ vhost_enqueue_single_packed(struct 
virtio_net *dev, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -821,6 +917,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } @@ -1017,7 +1116,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Flush used elements when batched enqueue function is finished. Descriptor's flags are pre-calculated as they will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index a60b88d89..bf3c30f43 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ + VRING_DESC_F_WRITE) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 020c9b858..ec17353da 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,36 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_enqueue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t *lens, + uint16_t *ids) +{ + uint16_t i; + uint16_t flags; + + flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; + } + + rte_smp_wmb(); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -992,6 +1022,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; uint32_t 
buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1047,6 +1078,11 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + ids[i] = descs[avail_idx + i].id; + + vhost_flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer as many used ring updates as possible in the vhost dequeue function for coordinating with the virtio driver. To support buffering, the shadow used ring element should contain the descriptor's flags. The first shadowed ring index is recorded for calculating the buffered number.
sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void vhost_flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -247,6 +264,70 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t *ids) +{ + uint16_t flags; + uint16_t i; + uint16_t begin; + + flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + begin = 1; + } else + begin = 0; + + for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = 0; + } + + rte_smp_wmb(); + for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + +static __rte_always_inline void +vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_idx++; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + 
vq->desc_packed[vq->last_used_idx].len = 0; + vq->desc_packed[vq->last_used_idx].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + } + + vq_inc_last_used_packed(vq, count); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -314,6 +395,26 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } +static __rte_unused void +vhost_flush_dequeue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int shadow_count; + if (!vq->shadow_used_idx) + return; + + shadow_count = vq->last_used_idx - vq->shadow_last_used_idx; + if (shadow_count <= 0) + shadow_count += vq->size; + + /* buffer used descs as many as possible when doing dequeue */ + if ((uint32_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + vhost_flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1788,6 +1889,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + vhost_shadow_dequeue_batch_packed(dev, vq, ids); + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); return 0; @@ -1843,6 +1946,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + vq_inc_last_avail_packed(vq, desc_count); return 0; -- 2.17.1
Packets that can be filled into one desc will be handled by the batch function and others will be handled by the single function as before.
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1111,7 +1053,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) @@ -1187,7 +1129,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) @@ -1214,49 +1156,40 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, } static __rte_noinline uint32_t -virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf **pkts, uint32_t count) +virtio_dev_rx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, + uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t remained = count; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get 
enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, nr_descs); - } - - do_data_copy_enqueue(dev, vq); + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); vhost_flush_enqueue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); + return pkt_idx; } -- 2.17.1
Add vhost packed ring zero copy batch and single dequeue functions like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 5cdca9a7f..01d1603e3 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1886,6 +1886,122 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } +static __rte_unused int +virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE]; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + zmbufs[i] = get_zmbuf(vq); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (!zmbufs[i]) + goto free_pkt; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_mbuf_refcnt_update(pkts[i], 1); + + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_BATCH_SIZE; + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; + +free_pkt: + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + 
rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq_inc_last_avail_packed(vq, desc_count); + return 0; +} + +static __rte_always_inline void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags = 0; + bool wrap; + + wrap = vq->used_wrap_counter; + flags = PACKED_DESC_DEQUEUE_USED_FLAG(wrap); + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq_inc_last_used_packed(vq, zmbuf->desc_count); + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device packed ring dequeue function by splitting batch and single functions. No-chained and direct descriptors will be handled by batch and other will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 01d1603e3..7c22225b4 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -201,69 +201,6 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } -static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) -{ - int i; - uint16_t used_idx = vq->last_used_idx; - uint16_t head_idx = vq->last_used_idx; - uint16_t head_flags = 0; - - /* Split loop in two to save memory barriers */ - for (i = 0; i < vq->shadow_used_idx; i++) { - vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; - vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; - - used_idx += vq->shadow_used_packed[i].count; - if (used_idx >= vq->size) - used_idx -= vq->size; - } - - rte_smp_wmb(); - - for (i = 0; i < vq->shadow_used_idx; i++) { - uint16_t flags; - - if (vq->shadow_used_packed[i].len) - flags = VRING_DESC_F_WRITE; - else - flags = 0; - - if (vq->used_wrap_counter) { - flags |= VRING_DESC_F_USED; - flags |= VRING_DESC_F_AVAIL; - } else { - flags &= ~VRING_DESC_F_USED; - flags &= ~VRING_DESC_F_AVAIL; - } - - if (i > 0) { - vq->desc_packed[vq->last_used_idx].flags = flags; - - vhost_log_cache_used_vring(dev, vq, - vq->last_used_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - } else { - head_idx = vq->last_used_idx; - head_flags = flags; - } - - vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); - } - - vq->desc_packed[head_idx].flags = head_flags; - - vhost_log_cache_used_vring(dev, vq, - head_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - - 
vq->shadow_used_idx = 0; - vhost_log_cache_sync(dev, vq); -} - static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -328,17 +265,6 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -395,7 +321,7 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void vhost_flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1799,7 +1725,7 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1866,7 +1792,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1886,7 +1812,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1935,7 +1861,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2003,114 +1929,77 @@ 
free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; + uint32_t pkt_idx = 0; + uint32_t remained = count; - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + free_zmbuf(vq); - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); - - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + pkt_idx++; + remained--; - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); + } while (remained); - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t dummy_len; - uint16_t desc_count, nr_vec = 0; - int err; + return pkt_idx; +} - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &dummy_len, - VHOST_ACCESS_RO) < 0)) - break; +static 
__rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - pkts[i] = rte_pktmbuf_alloc(mbuf_pool); - if (unlikely(pkts[i] == NULL)) { - RTE_LOG(ERR, VHOST_DATA, - "Failed to allocate memory for mbuf.\n"); - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + vhost_flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; - } - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; - - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; - - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); - - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); - } + vhost_flush_dequeue_packed(dev, vq); + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, desc_count); - } + } while (remained); - if (likely(dev->dequeue_zero_copy == 0)) { + if (vq->shadow_used_idx) do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - return i; + return pkt_idx; } uint16_t @@ -2186,9 +2075,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 7c22225b4..93ebdd7b6 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -201,6 +207,25 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, + uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -265,6 +290,26 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } +static __rte_always_inline void +vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + vq->shadow_used_packed[0].id = buf_id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = count; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + 
vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, count); +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1748,7 +1793,11 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } - vhost_shadow_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_SIZE - 1]); + else + vhost_shadow_dequeue_batch_packed(dev, vq, ids); vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); @@ -1805,7 +1854,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; - vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, + desc_count); + else + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); vq_inc_last_avail_packed(vq, desc_count); -- 2.17.1
On 10/15/19 6:07 PM, Marvin Liu wrote: > When vhost doing [de]nqueue, vq's local variable last_[used/avail]_idx > will be inceased. Adding inline functions can avoid duplicated codes. When enqueuing or dequeuing, the virtqueue's local available and used indexes are increased. Other than that: Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 5131a97a3..22a3ddc38 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -350,6 +350,26 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) > wrap_counter != !!(flags & VRING_DESC_F_USED); > } > > +static inline void > +vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num) > +{ > + vq->last_used_idx += num; > + if (vq->last_used_idx >= vq->size) { > + vq->used_wrap_counter ^= 1; > + vq->last_used_idx -= vq->size; > + } > +} > + > +static inline void > +vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num) > +{ > + vq->last_avail_idx += num; > + if (vq->last_avail_idx >= vq->size) { > + vq->avail_wrap_counter ^= 1; > + vq->last_avail_idx -= vq->size; > + } > +} > + > void __vhost_log_cache_write(struct virtio_net *dev, > struct vhost_virtqueue *vq, > uint64_t addr, uint64_t len); > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 5b85b832d..42b662080 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -140,11 +140,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, > head_flags = flags; > } > > - vq->last_used_idx += vq->shadow_used_packed[i].count; > - if (vq->last_used_idx >= vq->size) { > - vq->used_wrap_counter ^= 1; > - vq->last_used_idx -= vq->size; > - } > + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); > } > > vq->desc_packed[head_idx].flags = head_flags; > @@ -865,11 +861,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct 
vhost_virtqueue *vq, > break; > } > > - vq->last_avail_idx += nr_descs; > - if (vq->last_avail_idx >= vq->size) { > - vq->last_avail_idx -= vq->size; > - vq->avail_wrap_counter ^= 1; > - } > + vq_inc_last_avail_packed(vq, nr_descs); > } > > do_data_copy_enqueue(dev, vq); > @@ -1504,11 +1496,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); > } > > - vq->last_avail_idx += desc_count; > - if (vq->last_avail_idx >= vq->size) { > - vq->last_avail_idx -= vq->size; > - vq->avail_wrap_counter ^= 1; > - } > + vq_inc_last_avail_packed(vq, desc_count); > } > > if (likely(dev->dequeue_zero_copy == 0)) { >
Hi Marvin,
On 10/15/19 6:07 PM, Marvin Liu wrote:
> Create macro for adding unroll pragma before for each loop. Batch
> functions will consist of several small loops which can be
> optimized by compilers' loop unrolling pragma.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index 8623e91c0..30839a001 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user
> CFLAGS += -fno-strict-aliasing
> LDLIBS += -lpthread
>
> +ifeq ($(RTE_TOOLCHAIN), gcc)
> +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1)
> +CFLAGS += -DSUPPORT_GCC_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), clang)
> +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1)
> +CFLAGS += -DSUPPORT_CLANG_UNROLL_PRAGMA
> +endif
> +endif
> +
> +ifeq ($(RTE_TOOLCHAIN), icc)
> +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1)
> +CFLAGS += -DSUPPORT_ICC_UNROLL_PRAGMA
> +endif
> +endif
> +
> ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y)
> LDLIBS += -lnuma
> endif
> diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> index cb1123ae3..ddf0ee579 100644
> --- a/lib/librte_vhost/meson.build
> +++ b/lib/librte_vhost/meson.build
> @@ -8,6 +8,13 @@ endif
> if has_libnuma == 1
> dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
> endif
> +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
> + cflags += '-DSUPPORT_GCC_UNROLL_PRAGMA'
> +elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> + cflags += '-DSUPPORT_CLANG_UNROLL_PRAGMA'
> +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0'))
> + cflags += '-DSUPPORT_ICC_UNROLL_PRAGMA'
> +endif
> dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY',
> cc.has_header('linux/userfaultfd.h'))
> version = 4
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 22a3ddc38..18d01cb19 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -39,6 +39,30 @@
>
> #define VHOST_LOG_CACHE_NR 32
>
> +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
> + sizeof(struct vring_packed_desc))
> +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> +
> +#ifdef SUPPORT_GCC_UNROLL_PRAGMA
> +#define for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \
> + for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifdef SUPPORT_CLANG_UNROLL_PRAGMA
> +#define for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
> + for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifdef SUPPORT_ICC_UNROLL_PRAGMA
> +#define for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
> + for (iter = val; iter < size; iter++)
> +#endif
> +
> +#ifndef for_each_try_unroll
> +#define for_each_try_unroll(iter, val, num) \
> + for (iter = val; iter < num; iter++)
> +#endif
> +
> /**
> * Structure contains buffer address, length and descriptor index
> * from vring to do scatter RX.
>
As it is Vhost specific, please prefix all the defines and macros with
VHOST_.
Thanks,
Maxime
On 10/15/19 6:07 PM, Marvin Liu wrote: > Add batch dequeue function like enqueue function for packed ring, batch > dequeue function will not support chained descritpors, single packet > dequeue function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 18d01cb19..96bf763b1 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ > + VRING_DESC_F_INDIRECT) > + > #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > sizeof(struct vring_packed_desc)) > #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index e1b06c1ce..274a28f99 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -1551,6 +1551,113 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return i; > } > > +static __rte_always_inline int > +vhost_reserve_avail_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts, > + uint16_t avail_idx, > + uintptr_t *desc_addrs, > + uint16_t *ids) > +{ > + bool wrap = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + struct virtio_net_hdr *hdr; > + uint64_t lens[PACKED_BATCH_SIZE]; > + uint64_t buf_lens[PACKED_BATCH_SIZE]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint16_t flags, i; > + > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > + return -1; > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > + return -1; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + flags = descs[avail_idx + i].flags; > + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || > + (wrap == !!(flags & VRING_DESC_F_USED)) || > + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) > + return -1; > + } > + > 
+ rte_smp_rmb(); > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + lens[i] = descs[avail_idx + i].len; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], VHOST_ACCESS_RW); > + } > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely((lens[i] != descs[avail_idx + i].len))) > + return -1; > + } > + > + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) > + return -1; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) > + goto free_buf; > + } > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; > + pkts[i]->data_len = pkts[i]->pkt_len; > + ids[i] = descs[avail_idx + i].id; > + } > + > + if (virtio_net_with_host_offload(dev)) { > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); > + vhost_dequeue_offload(hdr, pkts[i]); > + } > + } > + > + return 0; > + > +free_buf: > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + rte_pktmbuf_free(pkts[i]); > + > + return -1; > +} > + > +static __rte_unused int > +virtio_dev_tx_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts) > +{ > + uint16_t avail_idx = vq->last_avail_idx; > + uint32_t buf_offset = dev->vhost_hlen; > + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; > + uint16_t ids[PACKED_BATCH_SIZE]; > + uint16_t i; > + > + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, > + avail_idx, desc_addrs, ids)) > + return -1; > + > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); Wouldn't we have better performance with a dedicated unroll loop for the prefetches, so that desc_addrs[i+1] is 
being prefetched while the desc_addr[i] memcpy is done? > + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + pkts[i]->pkt_len); > + } > + > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); > + > + return 0; > +} > + > static __rte_always_inline int > vhost_dequeue_single_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, >
On 10/15/19 6:07 PM, Marvin Liu wrote: > Buffer vhost enqueue shadowed ring flush action buffered number exceed > one batch. Thus virtio can receive packets at a faster frequency. The commit message isn't very clear to me. Could you please improve it? > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index 96bf763b1..a60b88d89 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -166,6 +166,8 @@ struct vhost_virtqueue { > struct vring_used_elem_packed *shadow_used_packed; > }; > uint16_t shadow_used_idx; > + /* Record packed ring enqueue latest desc cache aligned index */ > + uint16_t shadow_aligned_idx; > struct vhost_vring_addr ring_addrs; > > struct batch_copy_elem *batch_copy_elems; > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 274a28f99..020c9b858 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -91,6 +91,69 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, > vq->shadow_used_split[i].len = len; > } > > +static __rte_always_inline void > +vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq) > +{ > + int i; > + uint16_t used_idx = vq->last_used_idx; > + uint16_t head_idx = vq->last_used_idx; > + uint16_t head_flags = 0; > + > + /* Split loop in two to save memory barriers */ > + for (i = 0; i < vq->shadow_used_idx; i++) { > + vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; > + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; > + > + used_idx += vq->shadow_used_packed[i].count; > + if (used_idx >= vq->size) > + used_idx -= vq->size; > + } > + > + rte_smp_wmb(); > + > + for (i = 0; i < vq->shadow_used_idx; i++) { > + uint16_t flags; > + > + if (vq->shadow_used_packed[i].len) > + flags = VRING_DESC_F_WRITE; > + else > + flags = 0; > + > + if (vq->used_wrap_counter) { > + flags |= VRING_DESC_F_USED; > + flags |= 
VRING_DESC_F_AVAIL; > + } else { > + flags &= ~VRING_DESC_F_USED; > + flags &= ~VRING_DESC_F_AVAIL; > + } > + > + if (i > 0) { > + vq->desc_packed[vq->last_used_idx].flags = flags; > + > + vhost_log_cache_used_vring(dev, vq, > + vq->last_used_idx * > + sizeof(struct vring_packed_desc), > + sizeof(struct vring_packed_desc)); > + } else { > + head_idx = vq->last_used_idx; > + head_flags = flags; > + } > + > + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); > + } > + > + vq->desc_packed[head_idx].flags = head_flags; > + > + vhost_log_cache_used_vring(dev, vq, > + head_idx * > + sizeof(struct vring_packed_desc), > + sizeof(struct vring_packed_desc)); > + > + vq->shadow_used_idx = 0; > + vhost_log_cache_sync(dev, vq); > +} > + > static __rte_always_inline void > flush_shadow_used_ring_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq) > @@ -194,6 +257,33 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) > vq->batch_copy_nb_elems = 0; > } > > +static __rte_always_inline void > +vhost_shadow_enqueue_single_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + uint32_t len[], > + uint16_t id[], > + uint16_t count[], > + uint16_t num_buffers) > +{ > + uint16_t i; > + for (i = 0; i < num_buffers; i++) { > + /* enqueue shadow flush action aligned with batch num */ > + if (!vq->shadow_used_idx) > + vq->shadow_aligned_idx = vq->last_used_idx & > + PACKED_BATCH_MASK; > + vq->shadow_used_packed[vq->shadow_used_idx].id = id[i]; > + vq->shadow_used_packed[vq->shadow_used_idx].len = len[i]; > + vq->shadow_used_packed[vq->shadow_used_idx].count = count[i]; > + vq->shadow_aligned_idx += count[i]; > + vq->shadow_used_idx++; > + } > + > + if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) { > + do_data_copy_enqueue(dev, vq); > + vhost_flush_enqueue_shadow_packed(dev, vq); > + } > +} > + > /* avoid write operation when necessary, to lessen cache issues */ > #define ASSIGN_UNLESS_EQUAL(var, val) do { \ > if ((var) != (val)) \ > @@ -785,6 +875,9 @@ 
vhost_enqueue_single_packed(struct virtio_net *dev, > uint16_t desc_count; > uint32_t size = pkt->pkt_len + dev->vhost_hlen; > uint16_t num_buffers = 0; > + uint32_t buffer_len[vq->size]; > + uint16_t buffer_buf_id[vq->size]; > + uint16_t buffer_desc_count[vq->size]; With rings up to 1024 elements, maybe it would be better to have that allocated as vq metadata like shadow_used_packed? > > if (rxvq_is_mergeable(dev)) > max_tries = vq->size - 1; > @@ -810,6 +903,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, > len = RTE_MIN(len, size); > size -= len; > > + buffer_len[num_buffers] = len; > + buffer_buf_id[num_buffers] = buf_id; > + buffer_desc_count[num_buffers] = desc_count; > num_buffers += 1; > > *nr_descs += desc_count; > @@ -821,6 +917,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, > if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) > return -1; > > + vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id, > + buffer_desc_count, num_buffers); > + > return 0; > } > > @@ -1017,7 +1116,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > do_data_copy_enqueue(dev, vq); > > if (likely(vq->shadow_used_idx)) { > - flush_shadow_used_ring_packed(dev, vq); > + vhost_flush_enqueue_shadow_packed(dev, vq); > vhost_vring_call_packed(dev, vq); > } > >
On 10/15/19 6:07 PM, Marvin Liu wrote:
> Flush used elements when batched enqueue function is finished.
> Descriptor's flags are pre-calculated as they will be reset by vhost.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
On 10/15/19 6:07 PM, Marvin Liu wrote:
> Buffer used ring updates as many as possible in vhost dequeue function
> for coordinating with virtio driver. For supporting buffer, shadow used
> ring element should contain descriptor's flags. First shadowed ring
> index was recorded for calculating buffered number.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
On 10/15/19 6:07 PM, Marvin Liu wrote:
> Optimize vhost device packed ring enqueue function by splitting batch
> and single functions. Packets that can be filled into one desc will be
> handled by batch and others will be handled by single as before.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Wednesday, October 16, 2019 6:36 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v6 06/13] vhost: add packed ring batch dequeue > > > > On 10/15/19 6:07 PM, Marvin Liu wrote: > > Add batch dequeue function like enqueue function for packed ring, batch > > dequeue function will not support chained descritpors, single packet > > dequeue function will handle it. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index 18d01cb19..96bf763b1 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,9 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ > > + VRING_DESC_F_INDIRECT) > > + > > #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > > sizeof(struct vring_packed_desc)) > > #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index e1b06c1ce..274a28f99 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -1551,6 +1551,113 @@ virtio_dev_tx_split(struct virtio_net *dev, > struct vhost_virtqueue *vq, > > return i; > > } > > > > +static __rte_always_inline int > > +vhost_reserve_avail_batch_packed(struct virtio_net *dev, > > + struct vhost_virtqueue *vq, > > + struct rte_mempool *mbuf_pool, > > + struct rte_mbuf **pkts, > > + uint16_t avail_idx, > > + uintptr_t *desc_addrs, > > + uint16_t *ids) > > +{ > > + bool wrap = vq->avail_wrap_counter; > > + struct vring_packed_desc *descs = vq->desc_packed; > > + struct virtio_net_hdr *hdr; > > + uint64_t lens[PACKED_BATCH_SIZE]; > > + uint64_t buf_lens[PACKED_BATCH_SIZE]; > > + 
uint32_t buf_offset = dev->vhost_hlen; > > + uint16_t flags, i; > > + > > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > > + return -1; > > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > > + return -1; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + flags = descs[avail_idx + i].flags; > > + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || > > + (wrap == !!(flags & VRING_DESC_F_USED)) || > > + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) > > + return -1; > > + } > > + > > + rte_smp_rmb(); > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + lens[i] = descs[avail_idx + i].len; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > + descs[avail_idx + i].addr, > > + &lens[i], VHOST_ACCESS_RW); > > + } > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely((lens[i] != descs[avail_idx + i].len))) > > + return -1; > > + } > > + > > + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) > > + return -1; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) > > + goto free_buf; > > + } > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; > > + pkts[i]->data_len = pkts[i]->pkt_len; > > + ids[i] = descs[avail_idx + i].id; > > + } > > + > > + if (virtio_net_with_host_offload(dev)) { > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); > > + vhost_dequeue_offload(hdr, pkts[i]); > > + } > > + } > > + > > + return 0; > > + > > +free_buf: > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > > + rte_pktmbuf_free(pkts[i]); > > + > > + return -1; > > +} > > + > > +static __rte_unused int > > +virtio_dev_tx_batch_packed(struct virtio_net *dev, > > + 
struct vhost_virtqueue *vq, > > + struct rte_mempool *mbuf_pool, > > + struct rte_mbuf **pkts) > > +{ > > + uint16_t avail_idx = vq->last_avail_idx; > > + uint32_t buf_offset = dev->vhost_hlen; > > + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; > > + uint16_t ids[PACKED_BATCH_SIZE]; > > + uint16_t i; > > + > > + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, > > + avail_idx, desc_addrs, ids)) > > + return -1; > > + > > + for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > Wouldn't we have better performance with a dedicated unroll loop for the > prefetches, so that desc_addrs[i+1] is being prefetched while the > desc_addr[i] memcpy is done? > Thanks, Maxime. It will be slightly better. Will have dedicated unroll loop in next version. Regards, marvin > > + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > + pkts[i]->pkt_len); > > + } > > + > > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); > > + > > + return 0; > > +} > > + > > static __rte_always_inline int > > vhost_dequeue_single_packed(struct virtio_net *dev, > > struct vhost_virtqueue *vq, > >
On 10/15/19 6:07 PM, Marvin Liu wrote:
> Add vhost packed ring zero copy batch and single dequeue functions like
> normal dequeue path.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
On 10/15/19 6:07 PM, Marvin Liu wrote:
> Optimize vhost device packed ring dequeue function by splitting batch
> and single functions. No-chained and direct descriptors will be handled
> by batch and others will be handled by single as before.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
On 10/15/19 6:07 PM, Marvin Liu wrote:
> When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue
> function by only updating the first used descriptor.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Hi Marvin, This is almost good, just fix the small comments I made. Also, please rebase on top of next-virtio branch, because I applied below patch from Flavio that you need to take into account: http://patches.dpdk.org/patch/61284/ Regards, Maxime On 10/15/19 6:07 PM, Marvin Liu wrote: > Packed ring has more compact ring format and thus can significantly > reduce the number of cache miss. It can lead to better performance. > This has been approved in virtio user driver, on normal E5 Xeon cpu > single core performance can raise 12%. > > http://mails.dpdk.org/archives/dev/2018-April/095470.html > > However vhost performance with packed ring performance was decreased. > Through analysis, mostly extra cost was from the calculating of each > descriptor flag which depended on ring wrap counter. Moreover, both > frontend and backend need to write same descriptors which will cause > cache contention. Especially when doing vhost enqueue function, virtio > refill packed ring function may write same cache line when vhost doing > enqueue function. This kind of extra cache cost will reduce the benefit > of reducing cache misses. > > For optimizing vhost packed ring performance, vhost enqueue and dequeue > function will be splitted into fast and normal path. > > Several methods will be taken in fast path: > Handle descriptors in one cache line by batch. > Split loop function into more pieces and unroll them. > Prerequisite check that whether I/O space can copy directly into mbuf > space and vice versa. > Prerequisite check that whether descriptor mapping is successful. > Distinguish vhost used ring update function by enqueue and dequeue > function. > Buffer dequeue used descriptors as many as possible. > Update enqueue used descriptors by cache line. > > After all these methods done, single core vhost PvP performance with 64B > packet on Xeon 8180 can boost 35%. 
> > v6: > - Fix dequeue zcopy result check > > v5: > - Remove disable sw prefetch as performance impact is small > - Change unroll pragma macro format > - Rename shadow counter elements names > - Clean dequeue update check condition > - Add inline functions replace of duplicated code > - Unify code style > > v4: > - Support meson build > - Remove memory region cache for no clear performance gain and ABI break > - Not assume ring size is power of two > > v3: > - Check available index overflow > - Remove dequeue remained descs number check > - Remove changes in split ring datapath > - Call memory write barriers once when updating used flags > - Rename some functions and macros > - Code style optimization > > v2: > - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc > - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) > - Optimize dequeue used ring update when in_order negotiated > > > Marvin Liu (13): > vhost: add packed ring indexes increasing function > vhost: add packed ring single enqueue > vhost: try to unroll for each loop > vhost: add packed ring batch enqueue > vhost: add packed ring single dequeue > vhost: add packed ring batch dequeue > vhost: flush enqueue updates by batch > vhost: flush batched enqueue descs directly > vhost: buffer packed ring dequeue updates > vhost: optimize packed ring enqueue > vhost: add packed ring zcopy batch and single dequeue > vhost: optimize packed ring dequeue > vhost: optimize packed ring dequeue when in-order > > lib/librte_vhost/Makefile | 18 + > lib/librte_vhost/meson.build | 7 + > lib/librte_vhost/vhost.h | 57 +++ > lib/librte_vhost/virtio_net.c | 924 +++++++++++++++++++++++++++------- > 4 files changed, 812 insertions(+), 194 deletions(-) >
> -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Thursday, October 17, 2019 3:31 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v6 00/13] vhost packed ring performance optimization > > Hi Marvin, > > This is almost good, just fix the small comments I made. > > Also, please rebase on top of next-virtio branch, because I applied > below patch from Flavio that you need to take into account: > > http://patches.dpdk.org/patch/61284/ Thanks, Maxime. I will start rebasing work. > > Regards, > Maxime > > On 10/15/19 6:07 PM, Marvin Liu wrote: > > Packed ring has more compact ring format and thus can significantly > > reduce the number of cache miss. It can lead to better performance. > > This has been approved in virtio user driver, on normal E5 Xeon cpu > > single core performance can raise 12%. > > > > http://mails.dpdk.org/archives/dev/2018-April/095470.html > > > > However vhost performance with packed ring performance was decreased. > > Through analysis, mostly extra cost was from the calculating of each > > descriptor flag which depended on ring wrap counter. Moreover, both > > frontend and backend need to write same descriptors which will cause > > cache contention. Especially when doing vhost enqueue function, virtio > > refill packed ring function may write same cache line when vhost doing > > enqueue function. This kind of extra cache cost will reduce the benefit > > of reducing cache misses. > > > > For optimizing vhost packed ring performance, vhost enqueue and dequeue > > function will be splitted into fast and normal path. > > > > Several methods will be taken in fast path: > > Handle descriptors in one cache line by batch. > > Split loop function into more pieces and unroll them. 
> > Prerequisite check that whether I/O space can copy directly into mbuf > > space and vice versa. > > Prerequisite check that whether descriptor mapping is successful. > > Distinguish vhost used ring update function by enqueue and dequeue > > function. > > Buffer dequeue used descriptors as many as possible. > > Update enqueue used descriptors by cache line. > > > > After all these methods done, single core vhost PvP performance with 64B > > packet on Xeon 8180 can boost 35%. > > > > v6: > > - Fix dequeue zcopy result check > > > > v5: > > - Remove disable sw prefetch as performance impact is small > > - Change unroll pragma macro format > > - Rename shadow counter elements names > > - Clean dequeue update check condition > > - Add inline functions replace of duplicated code > > - Unify code style > > > > v4: > > - Support meson build > > - Remove memory region cache for no clear performance gain and ABI break > > - Not assume ring size is power of two > > > > v3: > > - Check available index overflow > > - Remove dequeue remained descs number check > > - Remove changes in split ring datapath > > - Call memory write barriers once when updating used flags > > - Rename some functions and macros > > - Code style optimization > > > > v2: > > - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc > > - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) > > - Optimize dequeue used ring update when in_order negotiated > > > > > > Marvin Liu (13): > > vhost: add packed ring indexes increasing function > > vhost: add packed ring single enqueue > > vhost: try to unroll for each loop > > vhost: add packed ring batch enqueue > > vhost: add packed ring single dequeue > > vhost: add packed ring batch dequeue > > vhost: flush enqueue updates by batch > > vhost: flush batched enqueue descs directly > > vhost: buffer packed ring dequeue updates > > vhost: optimize packed ring enqueue > > vhost: add packed ring zcopy batch and single dequeue > > 
vhost: optimize packed ring dequeue > > vhost: optimize packed ring dequeue when in-order > > > > lib/librte_vhost/Makefile | 18 + > > lib/librte_vhost/meson.build | 7 + > > lib/librte_vhost/vhost.h | 57 +++ > > lib/librte_vhost/virtio_net.c | 924 +++++++++++++++++++++++++++------- > > 4 files changed, 812 insertions(+), 194 deletions(-) > >
On 10/21/19 5:40 PM, Marvin Liu wrote: > Add vhost single packet dequeue function for packed ring and meanwhile > left space for shadow used ring update function. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> > > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 4ddf26567..317be1aed 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -1635,6 +1635,61 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return i; > } > > +static __rte_always_inline int > +vhost_dequeue_single_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts, > + uint16_t *buf_id, > + uint16_t *desc_count) > +{ > + struct buf_vector buf_vec[BUF_VECTOR_MAX]; > + uint32_t dummy_len; > + uint16_t nr_vec = 0; > + int err; > + > + if (unlikely(fill_vec_buf_packed(dev, vq, > + vq->last_avail_idx, desc_count, > + buf_vec, &nr_vec, > + buf_id, &dummy_len, > + VHOST_ACCESS_RO) < 0)) > + return -1; > + > + *pkts = rte_pktmbuf_alloc(mbuf_pool); This is not properly rebased, you should no more call this API directly, but instead virtio_dev_pktmbuf_alloc() which was introduced withg flavio's patch. 
> + if (unlikely(*pkts == NULL)) { > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > + return -1; > + } > + > + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, > + mbuf_pool); > + if (unlikely(err)) { > + rte_pktmbuf_free(*pkts); > + return -1; > + } > + > + return 0; > +} > + > +static __rte_unused int > +virtio_dev_tx_single_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts) > +{ > + > + uint16_t buf_id, desc_count; > + > + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, > + &desc_count)) > + return -1; > + > + vq_inc_last_avail_packed(vq, desc_count); > + > + return 0; > +} > + > static __rte_noinline uint16_t > virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, > struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) >
On 10/21/19 5:40 PM, Marvin Liu wrote: > Add batch dequeue function like enqueue function for packed ring, batch > dequeue function will not support chained descritpors, single packet > dequeue function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index a2b9221e0..67724c342 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ > + VRING_DESC_F_INDIRECT) > + > #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > sizeof(struct vring_packed_desc)) > #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 317be1aed..f13fcafbb 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -1635,6 +1635,114 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return i; > } > > +static __rte_always_inline int > +vhost_reserve_avail_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts, > + uint16_t avail_idx, > + uintptr_t *desc_addrs, > + uint16_t *ids) > +{ > + bool wrap = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + struct virtio_net_hdr *hdr; > + uint64_t lens[PACKED_BATCH_SIZE]; > + uint64_t buf_lens[PACKED_BATCH_SIZE]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint16_t flags, i; > + > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > + return -1; > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > + return -1; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + flags = descs[avail_idx + i].flags; > + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || > + (wrap == !!(flags & VRING_DESC_F_USED)) || > + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) > + return -1; > + } 
> + > + rte_smp_rmb(); > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + lens[i] = descs[avail_idx + i].len; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], VHOST_ACCESS_RW); > + } > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely((lens[i] != descs[avail_idx + i].len))) > + return -1; > + } > + > + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) Same here, you may want to create a variant of Flavio's virtio_dev_pktmbuf_alloc for bulk allocations. > + return -1; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) > + goto free_buf; > + } > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; > + pkts[i]->data_len = pkts[i]->pkt_len; > + ids[i] = descs[avail_idx + i].id; > + } > + > + if (virtio_net_with_host_offload(dev)) { > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); > + vhost_dequeue_offload(hdr, pkts[i]); > + } > + } > + > + return 0; > + > +free_buf: > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + rte_pktmbuf_free(pkts[i]); > + > + return -1; > +} > + > +static __rte_unused int > +virtio_dev_tx_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts) > +{ > + uint16_t avail_idx = vq->last_avail_idx; > + uint32_t buf_offset = dev->vhost_hlen; > + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; > + uint16_t ids[PACKED_BATCH_SIZE]; > + uint16_t i; > + > + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, > + avail_idx, desc_addrs, ids)) > + return -1; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + 
rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + pkts[i]->pkt_len); > + > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); > + > + return 0; > +} > + > static __rte_always_inline int > vhost_dequeue_single_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, >
Thanks Maxime, has been modified in v8. > -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Monday, October 21, 2019 5:47 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v7 06/13] vhost: add packed ring batch dequeue > > > > On 10/21/19 5:40 PM, Marvin Liu wrote: > > Add batch dequeue function like enqueue function for packed ring, batch > > dequeue function will not support chained descritpors, single packet > > dequeue function will handle it. > > > > Signed-off-by: Marvin Liu <yong.liu@intel.com> > > > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > > index a2b9221e0..67724c342 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -39,6 +39,9 @@ > > > > #define VHOST_LOG_CACHE_NR 32 > > > > +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ > > + VRING_DESC_F_INDIRECT) > > + > > #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > > sizeof(struct vring_packed_desc)) > > #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > > diff --git a/lib/librte_vhost/virtio_net.c > b/lib/librte_vhost/virtio_net.c > > index 317be1aed..f13fcafbb 100644 > > --- a/lib/librte_vhost/virtio_net.c > > +++ b/lib/librte_vhost/virtio_net.c > > @@ -1635,6 +1635,114 @@ virtio_dev_tx_split(struct virtio_net *dev, > struct vhost_virtqueue *vq, > > return i; > > } > > > > +static __rte_always_inline int > > +vhost_reserve_avail_batch_packed(struct virtio_net *dev, > > + struct vhost_virtqueue *vq, > > + struct rte_mempool *mbuf_pool, > > + struct rte_mbuf **pkts, > > + uint16_t avail_idx, > > + uintptr_t *desc_addrs, > > + uint16_t *ids) > > +{ > > + bool wrap = vq->avail_wrap_counter; > > + struct vring_packed_desc *descs = vq->desc_packed; > > + struct virtio_net_hdr *hdr; > > + uint64_t lens[PACKED_BATCH_SIZE]; > > + uint64_t 
buf_lens[PACKED_BATCH_SIZE]; > > + uint32_t buf_offset = dev->vhost_hlen; > > + uint16_t flags, i; > > + > > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > > + return -1; > > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > > + return -1; > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + flags = descs[avail_idx + i].flags; > > + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || > > + (wrap == !!(flags & VRING_DESC_F_USED)) || > > + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) > > + return -1; > > + } > > + > > + rte_smp_rmb(); > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + lens[i] = descs[avail_idx + i].len; > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > > + descs[avail_idx + i].addr, > > + &lens[i], VHOST_ACCESS_RW); > > + } > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely((lens[i] != descs[avail_idx + i].len))) > > + return -1; > > + } > > + > > + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) > > Same here, you may want to create a variant of Flavio's > virtio_dev_pktmbuf_alloc for bulk allocations. 
> > > + return -1; > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) > > + goto free_buf; > > + } > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; > > + pkts[i]->data_len = pkts[i]->pkt_len; > > + ids[i] = descs[avail_idx + i].id; > > + } > > + > > + if (virtio_net_with_host_offload(dev)) { > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > > + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); > > + vhost_dequeue_offload(hdr, pkts[i]); > > + } > > + } > > + > > + return 0; > > + > > +free_buf: > > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > > + rte_pktmbuf_free(pkts[i]); > > + > > + return -1; > > +} > > + > > +static __rte_unused int > > +virtio_dev_tx_batch_packed(struct virtio_net *dev, > > + struct vhost_virtqueue *vq, > > + struct rte_mempool *mbuf_pool, > > + struct rte_mbuf **pkts) > > +{ > > + uint16_t avail_idx = vq->last_avail_idx; > > + uint32_t buf_offset = dev->vhost_hlen; > > + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; > > + uint16_t ids[PACKED_BATCH_SIZE]; > > + uint16_t i; > > + > > + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, > > + avail_idx, desc_addrs, ids)) > > + return -1; > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > > + > > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > > + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > > + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > > + pkts[i]->pkt_len); > > + > > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); > > + > > + return 0; > > +} > > + > > static __rte_always_inline int > > vhost_dequeue_single_packed(struct virtio_net *dev, > > struct vhost_virtqueue *vq, > >
Packed ring has more compact ring format and thus can significantly reduce the number of cache miss. It can lead to better performance. This has been approved in virtio user driver, on normal E5 Xeon cpu single core performance can raise 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However vhost performance with packed ring performance was decreased. Through analysis, mostly extra cost was from the calculating of each descriptor flag which depended on ring wrap counter. Moreover, both frontend and backend need to write same descriptors which will cause cache contention. Especially when doing vhost enqueue function, virtio refill packed ring function may write same cache line when vhost doing enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, vhost enqueue and dequeue function will be splitted into fast and normal path. Several methods will be taken in fast path: Handle descriptors in one cache line by batch. Split loop function into more pieces and unroll them. Prerequisite check that whether I/O space can copy directly into mbuf space and vice versa. Prerequisite check that whether descriptor mapping is successful. Distinguish vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. After all these methods done, single core vhost PvP performance with 64B packet on Xeon 8180 can boost 35%. 
v7: - Rebase code - Rename unroll macro and definitions - Calculate flags when doing single dequeue v6: - Fix dequeue zcopy result check v5: - Remove disable sw prefetch as performance impact is small - Change unroll pragma macro format - Rename shadow counter elements names - Clean dequeue update check condition - Add inline functions replace of duplicated code - Unify code style v4: - Support meson build - Remove memory region cache for no clear performance gain and ABI break - Not assume ring size is power of two v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (13): vhost: add packed ring indexes increasing function vhost: add packed ring single enqueue vhost: try to unroll for each loop vhost: add packed ring batch enqueue vhost: add packed ring single dequeue vhost: add packed ring batch dequeue vhost: flush enqueue updates by cacheline vhost: flush batched enqueue descs directly vhost: buffer packed ring dequeue updates vhost: optimize packed ring enqueue vhost: add packed ring zcopy batch and single dequeue vhost: optimize packed ring dequeue vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 18 + lib/librte_vhost/meson.build | 7 + lib/librte_vhost/vhost.h | 57 ++ lib/librte_vhost/virtio_net.c | 945 +++++++++++++++++++++++++++------- 4 files changed, 834 insertions(+), 193 deletions(-) -- 2.17.1
When enqueuing or dequeuing, the virtqueue's local available and used indexes are increased. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index c76d40115..02b3c91ff 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -367,6 +367,26 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) wrap_counter != !!(flags & VRING_DESC_F_USED); } +static inline void +vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_used_idx += num; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static inline void +vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_avail_idx += num; + if (vq->last_avail_idx >= vq->size) { + vq->avail_wrap_counter ^= 1; + vq->last_avail_idx -= vq->size; + } +} + void __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 66f0c7206..070d62bc0 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -138,11 +138,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, head_flags = flags; } - vq->last_used_idx += vq->shadow_used_packed[i].count; - if (vq->last_used_idx >= vq->size) { - vq->used_wrap_counter ^= 1; - vq->last_used_idx -= vq->size; - } + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); } __atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags, @@ -865,11 +861,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, nr_descs); } do_data_copy_enqueue(dev, vq); @@ -1585,11 +1577,7 @@ 
virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, desc_count); } if (likely(dev->dequeue_zero_copy == 0)) { -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 070d62bc0..4fb6552cc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -770,6 +770,60 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +static __rte_always_inline int +vhost_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, + struct buf_vector *buf_vec, + uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + uint16_t avail_idx = vq->last_avail_idx; + uint16_t max_tries, tries = 0; + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + uint16_t num_buffers = 0; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -827,6 +881,32 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq_inc_last_avail_packed(vq, nr_descs); + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Create macro for adding unroll pragma before for each loop. Batch functions will be contained of several small loops which can be optimized by compilers' loop unrolling pragma. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..87ce1fb27 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +ifeq ($(RTE_TOOLCHAIN), gcc) +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) +CFLAGS += -DVHOST_GCC_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), clang) +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) +CFLAGS += -DVHOST_CLANG_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), icc) +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) +CFLAGS += -DVHOST_ICC_UNROLL_PRAGMA +endif +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index cb1123ae3..00435777e 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -8,6 +8,13 @@ endif if has_libnuma == 1 dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) endif +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) + cflags += '-DVHOST_GCC_UNROLL_PRAGMA' +elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) + cflags += '-DVHOST_CLANG_UNROLL_PRAGMA' +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) + cflags += '-DVHOST_ICC_UNROLL_PRAGMA' +endif dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', cc.has_header('linux/userfaultfd.h')) version = 4 diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 02b3c91ff..a2b9221e0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,30 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ + sizeof(struct 
vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + +#ifdef VHOST_GCC_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VHOST_CLANG_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VHOST_ICC_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ + for (iter = val; iter < size; iter++) +#endif + +#ifndef vhost_for_each_try_unroll +#define vhost_for_each_try_unroll(iter, val, num) \ + for (iter = val; iter < num; iter++) +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
The batch enqueue function will first check whether the descriptors are cache aligned. It will also check prerequisites in the beginning. The batch enqueue function does not support chained mbufs; the single packet enqueue function will handle them. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 4fb6552cc..4ddf26567 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -881,6 +881,76 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int +virtio_dev_rx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + uint64_t desc_addrs[PACKED_BATCH_SIZE]; + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(pkts[i]->next != NULL)) + return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + } + + rte_smp_rmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + lens[i] = descs[avail_idx + i].len; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) + return -1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], + VHOST_ACCESS_RW); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } + + vhost_for_each_try_unroll(i, 
0, PACKED_BATCH_SIZE) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Add a vhost single packet dequeue function for the packed ring and meanwhile leave space for the shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 4ddf26567..317be1aed 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1635,6 +1635,61 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t dummy_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &dummy_len, + VHOST_ACCESS_RO) < 0)) + return -1; + + *pkts = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq_inc_last_avail_packed(vq, desc_count); + + return 0; +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add a batch dequeue function for the packed ring, similar to the batch enqueue function. The batch dequeue function will not support chained descriptors; the single packet dequeue function will handle them. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index a2b9221e0..67724c342 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ + VRING_DESC_F_INDIRECT) + #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 317be1aed..f13fcafbb 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1635,6 +1635,114 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_reserve_avail_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint16_t avail_idx, + uintptr_t *desc_addrs, + uint16_t *ids) +{ + bool wrap = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + struct virtio_net_hdr *hdr; + uint64_t lens[PACKED_BATCH_SIZE]; + uint64_t buf_lens[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint16_t flags, i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + flags = descs[avail_idx + i].flags; + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || + (wrap == !!(flags & VRING_DESC_F_USED)) || + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) + return -1; + } + + rte_smp_rmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + lens[i] = descs[avail_idx + i].len; + + vhost_for_each_try_unroll(i, 0, 
PACKED_BATCH_SIZE) { + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + if (virtio_net_with_host_offload(dev)) { + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + return 0; + +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, 
struct vhost_virtqueue *vq, -- 2.17.1
Buffer vhost packed ring enqueue updates, flush ring descs if buffered content filled up one cacheline. Thus virtio can receive packets at a faster frequency. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 67724c342..d59446442 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -174,6 +174,8 @@ struct vhost_virtqueue { struct vring_used_elem_packed *shadow_used_packed; }; uint16_t shadow_used_idx; + /* Record packed ring enqueue latest desc cache aligned index */ + uint16_t shadow_aligned_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index f13fcafbb..1cff9b86f 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -91,6 +91,69 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, vq->shadow_used_split[i].len = len; } +static __rte_always_inline void +vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int i; + uint16_t used_idx = vq->last_used_idx; + uint16_t head_idx = vq->last_used_idx; + uint16_t head_flags = 0; + + /* Split loop in two to save memory barriers */ + for (i = 0; i < vq->shadow_used_idx; i++) { + vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; + + used_idx += vq->shadow_used_packed[i].count; + if (used_idx >= vq->size) + used_idx -= vq->size; + } + + rte_smp_wmb(); + + for (i = 0; i < vq->shadow_used_idx; i++) { + uint16_t flags; + + if (vq->shadow_used_packed[i].len) + flags = VRING_DESC_F_WRITE; + else + flags = 0; + + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (i > 0) { + vq->desc_packed[vq->last_used_idx].flags = flags; + + vhost_log_cache_used_vring(dev, vq, + 
vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + } else { + head_idx = vq->last_used_idx; + head_flags = flags; + } + + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); + } + + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, + head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -194,6 +257,33 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint32_t len[], + uint16_t id[], + uint16_t count[], + uint16_t num_buffers) +{ + uint16_t i; + for (i = 0; i < num_buffers; i++) { + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->shadow_aligned_idx = vq->last_used_idx & + PACKED_BATCH_MASK; + vq->shadow_used_packed[vq->shadow_used_idx].id = id[i]; + vq->shadow_used_packed[vq->shadow_used_idx].len = len[i]; + vq->shadow_used_packed[vq->shadow_used_idx].count = count[i]; + vq->shadow_aligned_idx += count[i]; + vq->shadow_used_idx++; + } + + if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) { + do_data_copy_enqueue(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -785,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers = 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -810,6 +903,9 @@ 
vhost_enqueue_single_packed(struct virtio_net *dev, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -821,6 +917,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } @@ -1017,7 +1116,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Flush used elements when batched enqueue function is finished. Descriptor's flags are pre-calculated as they will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index d59446442..f8dbe841c 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ + VRING_DESC_F_WRITE) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 1cff9b86f..eefa1efa3 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,36 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_enqueue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t *lens, + uint16_t *ids) +{ + uint16_t i; + uint16_t flags; + + flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; + } + + rte_smp_wmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -992,6 +1022,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, 
struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; uint32_t buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1047,6 +1078,11 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + ids[i] = descs[avail_idx + i].id; + + vhost_flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer used ring updates as many as possible in vhost dequeue function for coordinating with virtio driver. For supporting buffer, shadow used ring element should contain descriptor's flags. First shadowed ring index was recorded for calculating buffered number. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index f8dbe841c..9f11b28a3 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -42,6 +42,8 @@ #define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ VRING_DESC_F_WRITE) +#define PACKED_DESC_DEQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) : 0x0) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) @@ -114,6 +116,7 @@ struct log_cache_entry { struct vring_used_elem_packed { uint16_t id; + uint16_t flags; uint32_t len; uint32_t count; }; @@ -179,6 +182,8 @@ struct vhost_virtqueue { uint16_t shadow_used_idx; /* Record packed ring enqueue latest desc cache aligned index */ uint16_t shadow_aligned_idx; + /* Record packed ring first dequeue desc index */ + uint16_t shadow_last_used_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index eefa1efa3..511b80afc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,23 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_dequeue_shadow_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0]; + + vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id; + rte_smp_wmb(); + vq->desc_packed[vq->shadow_last_used_idx].flags = used_elem->flags; + + 
vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void vhost_flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -246,6 +263,78 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t *ids) +{ + uint16_t flags; + uint16_t i; + uint16_t begin; + + flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + begin = 1; + } else + begin = 0; + + vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = 0; + } + + rte_smp_wmb(); + vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + +static __rte_always_inline void +vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + uint16_t flags; + + flags = vq->desc_packed[vq->last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + 
vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + vq->desc_packed[vq->last_used_idx].flags = flags; + } + + vq_inc_last_used_packed(vq, count); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -314,6 +403,25 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } +static __rte_unused void +vhost_flush_dequeue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int shadow_count; + if (!vq->shadow_used_idx) + return; + + shadow_count = vq->last_used_idx - vq->shadow_last_used_idx; + if (shadow_count <= 0) + shadow_count += vq->size; + + if ((uint32_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + vhost_flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1873,6 +1981,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); + vhost_shadow_dequeue_batch_packed(dev, vq, ids); + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); return 0; @@ -1928,6 +2038,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + vq_inc_last_avail_packed(vq, desc_count); return 0; -- 2.17.1
Optimize vhost device packed ring enqueue function by splitting batch and single functions. Packets can be filled into one desc will be handled by batch and others will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 511b80afc..2f90c3a7c 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -778,64 +778,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *nr_vec, uint16_t *num_buffers, - uint16_t *nr_descs) -{ - uint16_t avail_idx; - uint16_t vec_idx = 0; - uint16_t max_tries, tries = 0; - - uint16_t buf_id = 0; - uint32_t len = 0; - uint16_t desc_count; - - *num_buffers = 0; - avail_idx = vq->last_avail_idx; - - if (rxvq_is_mergeable(dev)) - max_tries = vq->size - 1; - else - max_tries = 1; - - while (size > 0) { - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1118,7 +1060,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) @@ -1194,7 +1136,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) @@ -1221,49 +1163,40 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, } static __rte_noinline uint32_t -virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf **pkts, uint32_t count) +virtio_dev_rx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, + uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t remained = count; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get 
enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, nr_descs); - } - - do_data_copy_enqueue(dev, vq); + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); vhost_flush_enqueue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); + return pkt_idx; } -- 2.17.1
Add vhost packed ring zero copy batch and single dequeue functions like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 2f90c3a7c..e053766f5 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1978,6 +1978,126 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } +static __rte_unused int +virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE]; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + zmbufs[i] = get_zmbuf(vq); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (!zmbufs[i]) + goto free_pkt; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_mbuf_refcnt_update(pkts[i], 1); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_BATCH_SIZE; + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; + +free_pkt: + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, 
&buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq_inc_last_avail_packed(vq, desc_count); + return 0; +} + +static __rte_always_inline void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags; + flags = vq->desc_packed[last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq_inc_last_used_packed(vq, zmbuf->desc_count); + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device packed ring dequeue function by splitting batch and single functions. No-chained and direct descriptors will be handled by batch and other will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index e053766f5..ca3a6551b 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -201,68 +201,6 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } -static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) -{ - int i; - uint16_t used_idx = vq->last_used_idx; - uint16_t head_idx = vq->last_used_idx; - uint16_t head_flags = 0; - - /* Split loop in two to save memory barriers */ - for (i = 0; i < vq->shadow_used_idx; i++) { - vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; - vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; - - used_idx += vq->shadow_used_packed[i].count; - if (used_idx >= vq->size) - used_idx -= vq->size; - } - - for (i = 0; i < vq->shadow_used_idx; i++) { - uint16_t flags; - - if (vq->shadow_used_packed[i].len) - flags = VRING_DESC_F_WRITE; - else - flags = 0; - - if (vq->used_wrap_counter) { - flags |= VRING_DESC_F_USED; - flags |= VRING_DESC_F_AVAIL; - } else { - flags &= ~VRING_DESC_F_USED; - flags &= ~VRING_DESC_F_AVAIL; - } - - if (i > 0) { - vq->desc_packed[vq->last_used_idx].flags = flags; - - vhost_log_cache_used_vring(dev, vq, - vq->last_used_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - } else { - head_idx = vq->last_used_idx; - head_flags = flags; - } - - vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); - } - - __atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags, - __ATOMIC_RELEASE); - - vhost_log_cache_used_vring(dev, vq, - head_idx * - 
sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - - vq->shadow_used_idx = 0; - vhost_log_cache_sync(dev, vq); -} - static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -335,17 +273,6 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -403,7 +330,7 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void vhost_flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1890,7 +1817,7 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1958,7 +1885,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1978,7 +1905,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2027,7 +1954,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct 
vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2099,111 +2026,77 @@ free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + free_zmbuf(vq); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); - - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t buf_len; - uint16_t desc_count, nr_vec = 0; - int err; - - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &buf_len, - VHOST_ACCESS_RO) < 0)) + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; + pkt_idx++; + remained--; - if (likely(dev->dequeue_zero_copy == 
0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + } while (remained); - pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); - if (unlikely(pkts[i] == NULL)) - break; - - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; - } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; + return pkt_idx; +} - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + vhost_flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - vq_inc_last_avail_packed(vq, desc_count); - } + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + vhost_flush_dequeue_packed(dev, vq); + pkt_idx++; + remained--; - if (likely(dev->dequeue_zero_copy == 0)) { + } while (remained); + + if (vq->shadow_used_idx) do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - return i; + return pkt_idx; } uint16_t @@ -2279,9 +2172,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index ca3a6551b..670790850 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -201,6 +207,24 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, + uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -273,6 +297,34 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } +static __rte_always_inline void +vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + uint16_t flags; + + vq->shadow_used_packed[0].id = buf_id; + + flags = vq->desc_packed[vq->last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= 
~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, count); +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1841,7 +1893,11 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); - vhost_shadow_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_SIZE - 1]); + else + vhost_shadow_dequeue_batch_packed(dev, vq, ids); vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); @@ -1898,7 +1954,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; - vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, + desc_count); + else + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); vq_inc_last_avail_packed(vq, desc_count); -- 2.17.1
Packed ring has a more compact ring format and thus can significantly reduce the number of cache misses. It can lead to better performance. This has been proven in the virtio user driver; on a normal E5 Xeon CPU, single core performance can be raised by 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However, vhost performance with the packed ring was decreased. Through analysis, most of the extra cost came from calculating each descriptor flag, which depended on the ring wrap counter. Moreover, both frontend and backend need to write the same descriptors, which will cause cache contention. Especially when doing the vhost enqueue function, the virtio refill packed ring function may write the same cache line while vhost is doing the enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. To optimize vhost packed ring performance, the vhost enqueue and dequeue functions will be split into fast and normal paths. Several methods will be taken in the fast path: Handle descriptors in one cache line by batch. Split the loop function into more pieces and unroll them. Prerequisite check whether I/O space can be copied directly into mbuf space and vice versa. Prerequisite check whether descriptor mapping is successful. Distinguish the vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. With all these methods applied, single core vhost PvP performance with 64B packets on Xeon 8180 is boosted by 35%.
v8: - Allocate mbuf by virtio_dev_pktmbuf_alloc v7: - Rebase code - Rename unroll macro and definitions - Calculate flags when doing single dequeue v6: - Fix dequeue zcopy result check v5: - Remove disable sw prefetch as performance impact is small - Change unroll pragma macro format - Rename shadow counter elements names - Clean dequeue update check condition - Add inline functions replace of duplicated code - Unify code style v4: - Support meson build - Remove memory region cache for no clear performance gain and ABI break - Not assume ring size is power of two v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (13): vhost: add packed ring indexes increasing function vhost: add packed ring single enqueue vhost: try to unroll for each loop vhost: add packed ring batch enqueue vhost: add packed ring single dequeue vhost: add packed ring batch dequeue vhost: flush enqueue updates by cacheline vhost: flush batched enqueue descs directly vhost: buffer packed ring dequeue updates vhost: optimize packed ring enqueue vhost: add packed ring zcopy batch and single dequeue vhost: optimize packed ring dequeue vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 18 + lib/librte_vhost/meson.build | 7 + lib/librte_vhost/vhost.h | 57 ++ lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++------- 4 files changed, 837 insertions(+), 193 deletions(-) -- 2.17.1
When enqueuing or dequeuing, the virtqueue's local available and used indexes are increased. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index c76d40115..02b3c91ff 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -367,6 +367,26 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) wrap_counter != !!(flags & VRING_DESC_F_USED); } +static inline void +vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_used_idx += num; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static inline void +vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_avail_idx += num; + if (vq->last_avail_idx >= vq->size) { + vq->avail_wrap_counter ^= 1; + vq->last_avail_idx -= vq->size; + } +} + void __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 66f0c7206..070d62bc0 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -138,11 +138,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, head_flags = flags; } - vq->last_used_idx += vq->shadow_used_packed[i].count; - if (vq->last_used_idx >= vq->size) { - vq->used_wrap_counter ^= 1; - vq->last_used_idx -= vq->size; - } + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); } __atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags, @@ -865,11 +861,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, nr_descs); } do_data_copy_enqueue(dev, vq); @@ -1585,11 +1577,7 @@ 
virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, desc_count); } if (likely(dev->dequeue_zero_copy == 0)) { -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 070d62bc0..4fb6552cc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -770,6 +770,60 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +static __rte_always_inline int +vhost_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, + struct buf_vector *buf_vec, + uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + uint16_t avail_idx = vq->last_avail_idx; + uint16_t max_tries, tries = 0; + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + uint16_t num_buffers = 0; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -827,6 +881,32 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq_inc_last_avail_packed(vq, nr_descs); + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Create macro for adding unroll pragma before for each loop. Batch functions will consist of several small loops which can be optimized by compilers' loop unrolling pragma. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..87ce1fb27 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +ifeq ($(RTE_TOOLCHAIN), gcc) +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) +CFLAGS += -DVHOST_GCC_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), clang) +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) +CFLAGS += -DVHOST_CLANG_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), icc) +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) +CFLAGS += -DVHOST_ICC_UNROLL_PRAGMA +endif +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index cb1123ae3..00435777e 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -8,6 +8,13 @@ endif if has_libnuma == 1 dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) endif +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) + cflags += '-DVHOST_GCC_UNROLL_PRAGMA' +elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) + cflags += '-DVHOST_CLANG_UNROLL_PRAGMA' +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) + cflags += '-DVHOST_ICC_UNROLL_PRAGMA' +endif dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', cc.has_header('linux/userfaultfd.h')) version = 4 diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 02b3c91ff..a2b9221e0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,30 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ + sizeof(struct 
vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + +#ifdef VHOST_GCC_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VHOST_CLANG_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VHOST_ICC_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ + for (iter = val; iter < size; iter++) +#endif + +#ifndef vhost_for_each_try_unroll +#define vhost_for_each_try_unroll(iter, val, num) \ + for (iter = val; iter < num; iter++) +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
Batch enqueue function will first check whether descriptors are cache aligned. It will also check prerequisites in the beginning. Batch enqueue function does not support chained mbufs, single packet enqueue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 4fb6552cc..4ddf26567 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -881,6 +881,76 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int +virtio_dev_rx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + uint64_t desc_addrs[PACKED_BATCH_SIZE]; + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(pkts[i]->next != NULL)) + return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + } + + rte_smp_rmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + lens[i] = descs[avail_idx + i].len; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) + return -1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], + VHOST_ACCESS_RW); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } + + vhost_for_each_try_unroll(i, 
0, PACKED_BATCH_SIZE) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile left space for shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 4ddf26567..87f2ae49e 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1635,6 +1635,61 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t buf_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &buf_len, + VHOST_ACCESS_RO) < 0)) + return -1; + + *pkts = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq_inc_last_avail_packed(vq, desc_count); + + return 0; +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add batch dequeue function like enqueue function for packed ring, batch dequeue function will not support chained descriptors, single packet dequeue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index a2b9221e0..67724c342 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ + VRING_DESC_F_INDIRECT) + #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 87f2ae49e..76435204f 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1635,6 +1635,117 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_reserve_avail_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint16_t avail_idx, + uintptr_t *desc_addrs, + uint16_t *ids) +{ + bool wrap = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + struct virtio_net_hdr *hdr; + uint64_t lens[PACKED_BATCH_SIZE]; + uint64_t buf_lens[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint16_t flags, i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + flags = descs[avail_idx + i].flags; + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || + (wrap == !!(flags & VRING_DESC_F_USED)) || + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) + return -1; + } + + rte_smp_rmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + lens[i] = descs[avail_idx + i].len; + + vhost_for_each_try_unroll(i, 0, 
PACKED_BATCH_SIZE) { + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, lens[i]); + if (!pkts[i]) + goto free_buf; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + if (virtio_net_with_host_offload(dev)) { + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + return 0; + +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; +} + static 
__rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Buffer vhost packed ring enqueue updates, flush ring descs if buffered content filled up one cacheline. Thus virtio can receive packets at a faster frequency. Signed-off-by: Marvin Liu <yong.liu@intel.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 67724c342..d59446442 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -174,6 +174,8 @@ struct vhost_virtqueue { struct vring_used_elem_packed *shadow_used_packed; }; uint16_t shadow_used_idx; + /* Record packed ring enqueue latest desc cache aligned index */ + uint16_t shadow_aligned_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 76435204f..25bffdd52 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -91,6 +91,69 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, vq->shadow_used_split[i].len = len; } +static __rte_always_inline void +vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int i; + uint16_t used_idx = vq->last_used_idx; + uint16_t head_idx = vq->last_used_idx; + uint16_t head_flags = 0; + + /* Split loop in two to save memory barriers */ + for (i = 0; i < vq->shadow_used_idx; i++) { + vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; + + used_idx += vq->shadow_used_packed[i].count; + if (used_idx >= vq->size) + used_idx -= vq->size; + } + + rte_smp_wmb(); + + for (i = 0; i < vq->shadow_used_idx; i++) { + uint16_t flags; + + if (vq->shadow_used_packed[i].len) + flags = VRING_DESC_F_WRITE; + else + flags = 0; + + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (i > 0) { + vq->desc_packed[vq->last_used_idx].flags = flags; + + vhost_log_cache_used_vring(dev, vq, + 
vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + } else { + head_idx = vq->last_used_idx; + head_flags = flags; + } + + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); + } + + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, + head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -194,6 +257,33 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint32_t len[], + uint16_t id[], + uint16_t count[], + uint16_t num_buffers) +{ + uint16_t i; + for (i = 0; i < num_buffers; i++) { + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->shadow_aligned_idx = vq->last_used_idx & + PACKED_BATCH_MASK; + vq->shadow_used_packed[vq->shadow_used_idx].id = id[i]; + vq->shadow_used_packed[vq->shadow_used_idx].len = len[i]; + vq->shadow_used_packed[vq->shadow_used_idx].count = count[i]; + vq->shadow_aligned_idx += count[i]; + vq->shadow_used_idx++; + } + + if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) { + do_data_copy_enqueue(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -785,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers = 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -810,6 +903,9 @@ 
vhost_enqueue_single_packed(struct virtio_net *dev, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -821,6 +917,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } @@ -1017,7 +1116,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Flush used elements when batched enqueue function is finished. Descriptor's flags are pre-calculated as they will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index d59446442..f8dbe841c 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ + VRING_DESC_F_WRITE) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 25bffdd52..51ce32064 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,36 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_enqueue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t *lens, + uint16_t *ids) +{ + uint16_t i; + uint16_t flags; + + flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; + } + + rte_smp_wmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -992,6 +1022,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, 
struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; uint32_t buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1047,6 +1078,11 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + ids[i] = descs[avail_idx + i].id; + + vhost_flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer used ring updates as many as possible in vhost dequeue function for coordinating with virtio driver. For supporting buffer, shadow used ring element should contain descriptor's flags. First shadowed ring index was recorded for calculating buffered number. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index f8dbe841c..9f11b28a3 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -42,6 +42,8 @@ #define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ VRING_DESC_F_WRITE) +#define PACKED_DESC_DEQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) : 0x0) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) @@ -114,6 +116,7 @@ struct log_cache_entry { struct vring_used_elem_packed { uint16_t id; + uint16_t flags; uint32_t len; uint32_t count; }; @@ -179,6 +182,8 @@ struct vhost_virtqueue { uint16_t shadow_used_idx; /* Record packed ring enqueue latest desc cache aligned index */ uint16_t shadow_aligned_idx; + /* Record packed ring first dequeue desc index */ + uint16_t shadow_last_used_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 51ce32064..b09e03fbc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,23 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_dequeue_shadow_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0]; + + vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id; + rte_smp_wmb(); + vq->desc_packed[vq->shadow_last_used_idx].flags = used_elem->flags; + + 
vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void vhost_flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -246,6 +263,78 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t *ids) +{ + uint16_t flags; + uint16_t i; + uint16_t begin; + + flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + begin = 1; + } else + begin = 0; + + vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = 0; + } + + rte_smp_wmb(); + vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + +static __rte_always_inline void +vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + uint16_t flags; + + flags = vq->desc_packed[vq->last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + 
vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + vq->desc_packed[vq->last_used_idx].flags = flags; + } + + vq_inc_last_used_packed(vq, count); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -314,6 +403,25 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } +static __rte_unused void +vhost_flush_dequeue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int shadow_count; + if (!vq->shadow_used_idx) + return; + + shadow_count = vq->last_used_idx - vq->shadow_last_used_idx; + if (shadow_count <= 0) + shadow_count += vq->size; + + if ((uint32_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + vhost_flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1876,6 +1984,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); + vhost_shadow_dequeue_batch_packed(dev, vq, ids); + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); return 0; @@ -1931,6 +2041,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + vq_inc_last_avail_packed(vq, desc_count); return 0; -- 2.17.1
Optimize vhost device packed ring enqueue function by splitting batch and single functions. Packets can be filled into one desc will be handled by batch and others will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index b09e03fbc..1c63262ce 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -778,64 +778,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *nr_vec, uint16_t *num_buffers, - uint16_t *nr_descs) -{ - uint16_t avail_idx; - uint16_t vec_idx = 0; - uint16_t max_tries, tries = 0; - - uint16_t buf_id = 0; - uint32_t len = 0; - uint16_t desc_count; - - *num_buffers = 0; - avail_idx = vq->last_avail_idx; - - if (rxvq_is_mergeable(dev)) - max_tries = vq->size - 1; - else - max_tries = 1; - - while (size > 0) { - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1118,7 +1060,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) @@ -1194,7 +1136,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) @@ -1221,49 +1163,40 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, } static __rte_noinline uint32_t -virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf **pkts, uint32_t count) +virtio_dev_rx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, + uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t remained = count; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get 
enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, nr_descs); - } - - do_data_copy_enqueue(dev, vq); + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); vhost_flush_enqueue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); + return pkt_idx; } -- 2.17.1
Add vhost packed ring zero copy batch and single dequeue functions like normal dequeue path. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 1c63262ce..528e0fab1 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1981,6 +1981,126 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } +static __rte_unused int +virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + struct zcopy_mbuf *zmbufs[PACKED_BATCH_SIZE]; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + uint16_t avail_idx = vq->last_avail_idx; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + zmbufs[i] = get_zmbuf(vq); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (!zmbufs[i]) + goto free_pkt; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + zmbufs[i]->mbuf = pkts[i]; + zmbufs[i]->desc_idx = avail_idx + i; + zmbufs[i]->desc_count = 1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_mbuf_refcnt_update(pkts[i], 1); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbufs[i], next); + + vq->nr_zmbuf += PACKED_BATCH_SIZE; + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; + +free_pkt: + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, 
&buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq_inc_last_avail_packed(vq, desc_count); + return 0; +} + +static __rte_always_inline void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags; + flags = vq->desc_packed[last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq_inc_last_used_packed(vq, zmbuf->desc_count); + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device packed ring dequeue function by splitting batch and single functions. No-chained and direct descriptors will be handled by batch and other will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 528e0fab1..ab6726996 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -201,68 +201,6 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } -static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) -{ - int i; - uint16_t used_idx = vq->last_used_idx; - uint16_t head_idx = vq->last_used_idx; - uint16_t head_flags = 0; - - /* Split loop in two to save memory barriers */ - for (i = 0; i < vq->shadow_used_idx; i++) { - vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; - vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; - - used_idx += vq->shadow_used_packed[i].count; - if (used_idx >= vq->size) - used_idx -= vq->size; - } - - for (i = 0; i < vq->shadow_used_idx; i++) { - uint16_t flags; - - if (vq->shadow_used_packed[i].len) - flags = VRING_DESC_F_WRITE; - else - flags = 0; - - if (vq->used_wrap_counter) { - flags |= VRING_DESC_F_USED; - flags |= VRING_DESC_F_AVAIL; - } else { - flags &= ~VRING_DESC_F_USED; - flags &= ~VRING_DESC_F_AVAIL; - } - - if (i > 0) { - vq->desc_packed[vq->last_used_idx].flags = flags; - - vhost_log_cache_used_vring(dev, vq, - vq->last_used_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - } else { - head_idx = vq->last_used_idx; - head_flags = flags; - } - - vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); - } - - __atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags, - __ATOMIC_RELEASE); - - vhost_log_cache_used_vring(dev, vq, - head_idx * - 
sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - - vq->shadow_used_idx = 0; - vhost_log_cache_sync(dev, vq); -} - static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -335,17 +273,6 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -403,7 +330,7 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void vhost_flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1893,7 +1820,7 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1961,7 +1888,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1981,7 +1908,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2030,7 +1957,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct 
vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2102,111 +2029,77 @@ free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + free_zmbuf(vq); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); - - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - uint32_t buf_len; - uint16_t desc_count, nr_vec = 0; - int err; - - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &buf_len, - VHOST_ACCESS_RO) < 0)) + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; + pkt_idx++; + remained--; - if (likely(dev->dequeue_zero_copy == 
0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + } while (remained); - pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); - if (unlikely(pkts[i] == NULL)) - break; - - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; - } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; + return pkt_idx; +} - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + vhost_flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - vq_inc_last_avail_packed(vq, desc_count); - } + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + vhost_flush_dequeue_packed(dev, vq); + pkt_idx++; + remained--; - if (likely(dev->dequeue_zero_copy == 0)) { + } while (remained); + + if (vq->shadow_used_idx) do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - return i; + return pkt_idx; } uint16_t @@ -2282,9 +2175,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index ab6726996..5b8cb9e63 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -201,6 +207,24 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, + uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -273,6 +297,34 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } +static __rte_always_inline void +vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + uint16_t flags; + + vq->shadow_used_packed[0].id = buf_id; + + flags = vq->desc_packed[vq->last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= 
~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, count); +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1844,7 +1896,11 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); - vhost_shadow_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_SIZE - 1]); + else + vhost_shadow_dequeue_batch_packed(dev, vq, ids); vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); @@ -1901,7 +1957,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; - vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, + desc_count); + else + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); vq_inc_last_avail_packed(vq, desc_count); -- 2.17.1
On 10/22/19 12:08 AM, Marvin Liu wrote:
> Create macro for adding unroll pragma before for each loop. Batch
> functions will contain several small loops which can be
> optimized by compilers' loop unrolling pragma.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
On 10/22/19 12:08 AM, Marvin Liu wrote: > Add batch dequeue function like enqueue function for packed ring, batch > dequeue function will not support chained descritpors, single packet > dequeue function will handle it. > > Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> Thanks, Maxime p.s. It would be better if you could generate the series with providing the diffstat in every patch (which is default behavior of git format- patch). > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h > index a2b9221e0..67724c342 100644 > --- a/lib/librte_vhost/vhost.h > +++ b/lib/librte_vhost/vhost.h > @@ -39,6 +39,9 @@ > > #define VHOST_LOG_CACHE_NR 32 > > +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ > + VRING_DESC_F_INDIRECT) > + > #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ > sizeof(struct vring_packed_desc)) > #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c > index 87f2ae49e..76435204f 100644 > --- a/lib/librte_vhost/virtio_net.c > +++ b/lib/librte_vhost/virtio_net.c > @@ -1635,6 +1635,117 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, > return i; > } > > +static __rte_always_inline int > +vhost_reserve_avail_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts, > + uint16_t avail_idx, > + uintptr_t *desc_addrs, > + uint16_t *ids) > +{ > + bool wrap = vq->avail_wrap_counter; > + struct vring_packed_desc *descs = vq->desc_packed; > + struct virtio_net_hdr *hdr; > + uint64_t lens[PACKED_BATCH_SIZE]; > + uint64_t buf_lens[PACKED_BATCH_SIZE]; > + uint32_t buf_offset = dev->vhost_hlen; > + uint16_t flags, i; > + > + if (unlikely(avail_idx & PACKED_BATCH_MASK)) > + return -1; > + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) > + return -1; > + > + vhost_for_each_try_unroll(i, 0, 
PACKED_BATCH_SIZE) { > + flags = descs[avail_idx + i].flags; > + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || > + (wrap == !!(flags & VRING_DESC_F_USED)) || > + (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG))) > + return -1; > + } > + > + rte_smp_rmb(); > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + lens[i] = descs[avail_idx + i].len; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + desc_addrs[i] = vhost_iova_to_vva(dev, vq, > + descs[avail_idx + i].addr, > + &lens[i], VHOST_ACCESS_RW); > + } > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely((lens[i] != descs[avail_idx + i].len))) > + return -1; > + } > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, lens[i]); > + if (!pkts[i]) > + goto free_buf; > + } > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) > + goto free_buf; > + } > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; > + pkts[i]->data_len = pkts[i]->pkt_len; > + ids[i] = descs[avail_idx + i].id; > + } > + > + if (virtio_net_with_host_offload(dev)) { > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { > + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); > + vhost_dequeue_offload(hdr, pkts[i]); > + } > + } > + > + return 0; > + > +free_buf: > + for (i = 0; i < PACKED_BATCH_SIZE; i++) > + rte_pktmbuf_free(pkts[i]); > + > + return -1; > +} > + > +static __rte_unused int > +virtio_dev_tx_batch_packed(struct virtio_net *dev, > + struct vhost_virtqueue *vq, > + struct rte_mempool *mbuf_pool, > + struct rte_mbuf **pkts) > +{ > + uint16_t avail_idx = vq->last_avail_idx; > + uint32_t buf_offset = dev->vhost_hlen; > + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; > + uint16_t 
ids[PACKED_BATCH_SIZE]; > + uint16_t i; > + > + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, > + avail_idx, desc_addrs, ids)) > + return -1; > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); > + > + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) > + rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), > + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), > + pkts[i]->pkt_len); > + > + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); > + > + return 0; > +} > + > static __rte_always_inline int > vhost_dequeue_single_packed(struct virtio_net *dev, > struct vhost_virtqueue *vq, >
On 10/22/19 12:08 AM, Marvin Liu wrote:
> Buffer vhost packed ring enqueue updates; flush ring descriptors once the
> buffered content fills up one cacheline. Thus virtio can receive packets
> at a faster frequency.
>
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
>
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Thanks,
Maxime
I get some checkpatch warnings, and build fails with clang.
Could you please fix these issues and send v9?
Thanks,
Maxime
### [PATCH] vhost: try to unroll for each loop
WARNING:CAMELCASE: Avoid CamelCase: <_Pragma>
#78: FILE: lib/librte_vhost/vhost.h:47:
+#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll
4") \
ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in
parenthesis
#78: FILE: lib/librte_vhost/vhost.h:47:
+#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll
4") \
+ for (iter = val; iter < size; iter++)
ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in
parenthesis
#83: FILE: lib/librte_vhost/vhost.h:52:
+#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \
+ for (iter = val; iter < size; iter++)
ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in
parenthesis
#88: FILE: lib/librte_vhost/vhost.h:57:
+#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \
+ for (iter = val; iter < size; iter++)
total: 3 errors, 1 warnings, 67 lines checked
0/1 valid patch

/tmp/dpdk_build/lib/librte_vhost/virtio_net.c:2065:1:
error: unused function 'free_zmbuf' [-Werror,-Wunused-function]
free_zmbuf(struct vhost_virtqueue *vq)
^
1 error generated.
make[5]: *** [virtio_net.o] Error 1
make[4]: *** [librte_vhost] Error 2
make[4]: *** Waiting for unfinished jobs....
make[3]: *** [lib] Error 2
make[2]: *** [all] Error 2
make[1]: *** [pre_install] Error 2
make: *** [install] Error 2
On 10/22/19 12:08 AM, Marvin Liu wrote:
> Packed ring has more compact ring format and thus can significantly
> reduce the number of cache miss. It can lead to better performance.
> This has been proved in the virtio user driver: on a normal E5 Xeon CPU,
> single core performance can be raised by 12%.
>
> http://mails.dpdk.org/archives/dev/2018-April/095470.html
>
> However, vhost performance with the packed ring was decreased.
> Through analysis, mostly extra cost was from the calculating of each
> descriptor flag which depended on ring wrap counter. Moreover, both
> frontend and backend need to write same descriptors which will cause
> cache contention. Especially when doing vhost enqueue function, virtio
> refill packed ring function may write same cache line when vhost doing
> enqueue function. This kind of extra cache cost will reduce the benefit
> of reducing cache misses.
>
> For optimizing vhost packed ring performance, vhost enqueue and dequeue
> function will be split into fast and normal path.
>
> Several methods will be taken in fast path:
> Handle descriptors in one cache line by batch.
> Split loop function into more pieces and unroll them.
> Prerequisite check that whether I/O space can copy directly into mbuf
> space and vice versa.
> Prerequisite check that whether descriptor mapping is successful.
> Distinguish vhost used ring update function by enqueue and dequeue
> function.
> Buffer dequeue used descriptors as many as possible.
> Update enqueue used descriptors by cache line.
>
> After all these methods done, single core vhost PvP performance with 64B
> packet on Xeon 8180 can boost 35%.
>
> v8:
> - Allocate mbuf by virtio_dev_pktmbuf_alloc
>
> v7:
> - Rebase code
> - Rename unroll macro and definitions
> - Calculate flags when doing single dequeue
>
> v6:
> - Fix dequeue zcopy result check
>
> v5:
> - Remove disable sw prefetch as performance impact is small
> - Change unroll pragma macro format
> - Rename shadow counter elements names
> - Clean dequeue update check condition
> - Add inline functions replace of duplicated code
> - Unify code style
>
> v4:
> - Support meson build
> - Remove memory region cache for no clear performance gain and ABI break
> - Not assume ring size is power of two
>
> v3:
> - Check available index overflow
> - Remove dequeue remained descs number check
> - Remove changes in split ring datapath
> - Call memory write barriers once when updating used flags
> - Rename some functions and macros
> - Code style optimization
>
> v2:
> - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc
> - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST)
> - Optimize dequeue used ring update when in_order negotiated
>
>
> Marvin Liu (13):
> vhost: add packed ring indexes increasing function
> vhost: add packed ring single enqueue
> vhost: try to unroll for each loop
> vhost: add packed ring batch enqueue
> vhost: add packed ring single dequeue
> vhost: add packed ring batch dequeue
> vhost: flush enqueue updates by cacheline
> vhost: flush batched enqueue descs directly
> vhost: buffer packed ring dequeue updates
> vhost: optimize packed ring enqueue
> vhost: add packed ring zcopy batch and single dequeue
> vhost: optimize packed ring dequeue
> vhost: optimize packed ring dequeue when in-order
>
> lib/librte_vhost/Makefile | 18 +
> lib/librte_vhost/meson.build | 7 +
> lib/librte_vhost/vhost.h | 57 ++
> lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++-------
> 4 files changed, 837 insertions(+), 193 deletions(-)
>
> -----Original Message----- > From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] > Sent: Thursday, October 24, 2019 2:50 PM > To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, > Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; > gavin.hu@arm.com > Cc: dev@dpdk.org > Subject: Re: [PATCH v8 00/13] vhost packed ring performance optimization > > I get some checkpatch warnings, and build fails with clang. > Could you please fix these issues and send v9? > Hi Maxime, Clang build fails will be fixed in v9. For checkpatch warning, it was due to pragma string inside. Previous version can avoid such warning, while format is a little messy as below. I prefer to keep code clean and more readable. How about your idea? +#ifdef UNROLL_PRAGMA_PARAM +#define VHOST_UNROLL_PRAGMA(param) _Pragma(param) +#else +#define VHOST_UNROLL_PRAGMA(param) do {} while (0); +#endif + VHOST_UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) + for (i = 0; i < PACKED_BATCH_SIZE; i++) Regards, Marvin > Thanks, > Maxime > > ### [PATCH] vhost: try to unroll for each loop > > WARNING:CAMELCASE: Avoid CamelCase: <_Pragma> > #78: FILE: lib/librte_vhost/vhost.h:47: > +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll > 4") \ > > ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in > parenthesis > #78: FILE: lib/librte_vhost/vhost.h:47: > +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll > 4") \ > + for (iter = val; iter < size; iter++) > > ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in > parenthesis > #83: FILE: lib/librte_vhost/vhost.h:52: > +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ > + for (iter = val; iter < size; iter++) > > ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in > parenthesis > #88: FILE: lib/librte_vhost/vhost.h:57: > +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ > + for (iter = val; iter < size; 
iter++) > > total: 3 errors, 1 warnings, 67 lines checked > > 0/1 valid patch/tmp/dpdk_build/lib/librte_vhost/virtio_net.c:2065:1: > error: unused function 'free_zmbuf' [-Werror,-Wunused-function] > free_zmbuf(struct vhost_virtqueue *vq) > ^ > 1 error generated. > make[5]: *** [virtio_net.o] Error 1 > make[4]: *** [librte_vhost] Error 2 > make[4]: *** Waiting for unfinished jobs.... > make[3]: *** [lib] Error 2 > make[2]: *** [all] Error 2 > make[1]: *** [pre_install] Error 2 > make: *** [install] Error 2 > > > On 10/22/19 12:08 AM, Marvin Liu wrote: > > Packed ring has more compact ring format and thus can significantly > > reduce the number of cache miss. It can lead to better performance. > > This has been approved in virtio user driver, on normal E5 Xeon cpu > > single core performance can raise 12%. > > > > http://mails.dpdk.org/archives/dev/2018-April/095470.html > > > > However vhost performance with packed ring performance was decreased. > > Through analysis, mostly extra cost was from the calculating of each > > descriptor flag which depended on ring wrap counter. Moreover, both > > frontend and backend need to write same descriptors which will cause > > cache contention. Especially when doing vhost enqueue function, virtio > > refill packed ring function may write same cache line when vhost doing > > enqueue function. This kind of extra cache cost will reduce the benefit > > of reducing cache misses. > > > > For optimizing vhost packed ring performance, vhost enqueue and dequeue > > function will be splitted into fast and normal path. > > > > Several methods will be taken in fast path: > > Handle descriptors in one cache line by batch. > > Split loop function into more pieces and unroll them. > > Prerequisite check that whether I/O space can copy directly into mbuf > > space and vice versa. > > Prerequisite check that whether descriptor mapping is successful. > > Distinguish vhost used ring update function by enqueue and dequeue > > function. 
> > Buffer dequeue used descriptors as many as possible. > > Update enqueue used descriptors by cache line. > > > > After all these methods done, single core vhost PvP performance with 64B > > packet on Xeon 8180 can boost 35%. > > > > v8: > > - Allocate mbuf by virtio_dev_pktmbuf_alloc > > > > v7: > > - Rebase code > > - Rename unroll macro and definitions > > - Calculate flags when doing single dequeue > > > > v6: > > - Fix dequeue zcopy result check > > > > v5: > > - Remove disable sw prefetch as performance impact is small > > - Change unroll pragma macro format > > - Rename shadow counter elements names > > - Clean dequeue update check condition > > - Add inline functions replace of duplicated code > > - Unify code style > > > > v4: > > - Support meson build > > - Remove memory region cache for no clear performance gain and ABI break > > - Not assume ring size is power of two > > > > v3: > > - Check available index overflow > > - Remove dequeue remained descs number check > > - Remove changes in split ring datapath > > - Call memory write barriers once when updating used flags > > - Rename some functions and macros > > - Code style optimization > > > > v2: > > - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc > > - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) > > - Optimize dequeue used ring update when in_order negotiated > > > > > > Marvin Liu (13): > > vhost: add packed ring indexes increasing function > > vhost: add packed ring single enqueue > > vhost: try to unroll for each loop > > vhost: add packed ring batch enqueue > > vhost: add packed ring single dequeue > > vhost: add packed ring batch dequeue > > vhost: flush enqueue updates by cacheline > > vhost: flush batched enqueue descs directly > > vhost: buffer packed ring dequeue updates > > vhost: optimize packed ring enqueue > > vhost: add packed ring zcopy batch and single dequeue > > vhost: optimize packed ring dequeue > > vhost: optimize packed ring dequeue 
when in-order > > > > lib/librte_vhost/Makefile | 18 + > > lib/librte_vhost/meson.build | 7 + > > lib/librte_vhost/vhost.h | 57 ++ > > lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++------- > > 4 files changed, 837 insertions(+), 193 deletions(-) > >
On 10/24/19 9:18 AM, Liu, Yong wrote: > > >> -----Original Message----- >> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com] >> Sent: Thursday, October 24, 2019 2:50 PM >> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang, >> Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org; >> gavin.hu@arm.com >> Cc: dev@dpdk.org >> Subject: Re: [PATCH v8 00/13] vhost packed ring performance optimization >> >> I get some checkpatch warnings, and build fails with clang. >> Could you please fix these issues and send v9? >> > > > Hi Maxime, > Clang build fails will be fixed in v9. For checkpatch warning, it was due to pragma string inside. > Previous version can avoid such warning, while format is a little messy as below. > I prefer to keep code clean and more readable. How about your idea? > > +#ifdef UNROLL_PRAGMA_PARAM > +#define VHOST_UNROLL_PRAGMA(param) _Pragma(param) > +#else > +#define VHOST_UNROLL_PRAGMA(param) do {} while (0); > +#endif > > + VHOST_UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM) > + for (i = 0; i < PACKED_BATCH_SIZE; i++) That's less clean indeed. I agree to waive the checkpatch errors. just fix the Clang build for patch 8 and we're good. 
Thanks, Maxime > Regards, > Marvin > >> Thanks, >> Maxime >> >> ### [PATCH] vhost: try to unroll for each loop >> >> WARNING:CAMELCASE: Avoid CamelCase: <_Pragma> >> #78: FILE: lib/librte_vhost/vhost.h:47: >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll >> 4") \ >> >> ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in >> parenthesis >> #78: FILE: lib/librte_vhost/vhost.h:47: >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll >> 4") \ >> + for (iter = val; iter < size; iter++) >> >> ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in >> parenthesis >> #83: FILE: lib/librte_vhost/vhost.h:52: >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ >> + for (iter = val; iter < size; iter++) >> >> ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in >> parenthesis >> #88: FILE: lib/librte_vhost/vhost.h:57: >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ >> + for (iter = val; iter < size; iter++) >> >> total: 3 errors, 1 warnings, 67 lines checked >> >> 0/1 valid patch/tmp/dpdk_build/lib/librte_vhost/virtio_net.c:2065:1: >> error: unused function 'free_zmbuf' [-Werror,-Wunused-function] >> free_zmbuf(struct vhost_virtqueue *vq) >> ^ >> 1 error generated. >> make[5]: *** [virtio_net.o] Error 1 >> make[4]: *** [librte_vhost] Error 2 >> make[4]: *** Waiting for unfinished jobs.... >> make[3]: *** [lib] Error 2 >> make[2]: *** [all] Error 2 >> make[1]: *** [pre_install] Error 2 >> make: *** [install] Error 2 >> >> >> On 10/22/19 12:08 AM, Marvin Liu wrote: >>> Packed ring has more compact ring format and thus can significantly >>> reduce the number of cache miss. It can lead to better performance. >>> This has been approved in virtio user driver, on normal E5 Xeon cpu >>> single core performance can raise 12%. 
>>> >>> http://mails.dpdk.org/archives/dev/2018-April/095470.html >>> >>> However vhost performance with packed ring performance was decreased. >>> Through analysis, mostly extra cost was from the calculating of each >>> descriptor flag which depended on ring wrap counter. Moreover, both >>> frontend and backend need to write same descriptors which will cause >>> cache contention. Especially when doing vhost enqueue function, virtio >>> refill packed ring function may write same cache line when vhost doing >>> enqueue function. This kind of extra cache cost will reduce the benefit >>> of reducing cache misses. >>> >>> For optimizing vhost packed ring performance, vhost enqueue and dequeue >>> function will be splitted into fast and normal path. >>> >>> Several methods will be taken in fast path: >>> Handle descriptors in one cache line by batch. >>> Split loop function into more pieces and unroll them. >>> Prerequisite check that whether I/O space can copy directly into mbuf >>> space and vice versa. >>> Prerequisite check that whether descriptor mapping is successful. >>> Distinguish vhost used ring update function by enqueue and dequeue >>> function. >>> Buffer dequeue used descriptors as many as possible. >>> Update enqueue used descriptors by cache line. >>> >>> After all these methods done, single core vhost PvP performance with 64B >>> packet on Xeon 8180 can boost 35%. 
>>> >>> v8: >>> - Allocate mbuf by virtio_dev_pktmbuf_alloc >>> >>> v7: >>> - Rebase code >>> - Rename unroll macro and definitions >>> - Calculate flags when doing single dequeue >>> >>> v6: >>> - Fix dequeue zcopy result check >>> >>> v5: >>> - Remove disable sw prefetch as performance impact is small >>> - Change unroll pragma macro format >>> - Rename shadow counter elements names >>> - Clean dequeue update check condition >>> - Add inline functions replace of duplicated code >>> - Unify code style >>> >>> v4: >>> - Support meson build >>> - Remove memory region cache for no clear performance gain and ABI break >>> - Not assume ring size is power of two >>> >>> v3: >>> - Check available index overflow >>> - Remove dequeue remained descs number check >>> - Remove changes in split ring datapath >>> - Call memory write barriers once when updating used flags >>> - Rename some functions and macros >>> - Code style optimization >>> >>> v2: >>> - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc >>> - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) >>> - Optimize dequeue used ring update when in_order negotiated >>> >>> >>> Marvin Liu (13): >>> vhost: add packed ring indexes increasing function >>> vhost: add packed ring single enqueue >>> vhost: try to unroll for each loop >>> vhost: add packed ring batch enqueue >>> vhost: add packed ring single dequeue >>> vhost: add packed ring batch dequeue >>> vhost: flush enqueue updates by cacheline >>> vhost: flush batched enqueue descs directly >>> vhost: buffer packed ring dequeue updates >>> vhost: optimize packed ring enqueue >>> vhost: add packed ring zcopy batch and single dequeue >>> vhost: optimize packed ring dequeue >>> vhost: optimize packed ring dequeue when in-order >>> >>> lib/librte_vhost/Makefile | 18 + >>> lib/librte_vhost/meson.build | 7 + >>> lib/librte_vhost/vhost.h | 57 ++ >>> lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++------- >>> 4 files changed, 
837 insertions(+), 193 deletions(-) >>> >
Thanks, Maxime. Just sent out v9.
> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Thursday, October 24, 2019 4:25 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang,
> Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org;
> gavin.hu@arm.com
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v8 00/13] vhost packed ring performance optimization
>
>
>
> On 10/24/19 9:18 AM, Liu, Yong wrote:
> >
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> >> Sent: Thursday, October 24, 2019 2:50 PM
> >> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> Wang,
> >> Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org;
> >> gavin.hu@arm.com
> >> Cc: dev@dpdk.org
> >> Subject: Re: [PATCH v8 00/13] vhost packed ring performance optimization
> >>
> >> I get some checkpatch warnings, and build fails with clang.
> >> Could you please fix these issues and send v9?
> >>
> >
> >
> > Hi Maxime,
> > Clang build fails will be fixed in v9. For checkpatch warning, it was due
> to pragma string inside.
> > Previous version can avoid such warning, while format is a little messy
> as below.
> > I prefer to keep code clean and more readable. How about your idea?
> >
> > +#ifdef UNROLL_PRAGMA_PARAM
> > +#define VHOST_UNROLL_PRAGMA(param) _Pragma(param)
> > +#else
> > +#define VHOST_UNROLL_PRAGMA(param) do {} while (0);
> > +#endif
> >
> > + VHOST_UNROLL_PRAGMA(UNROLL_PRAGMA_PARAM)
> > + for (i = 0; i < PACKED_BATCH_SIZE; i++)
>
> That's less clean indeed. I agree to waive the checkpatch errors.
> just fix the Clang build for patch 8 and we're good.
>
> Thanks,
> Maxime
>
> > Regards,
> > Marvin
> >
> >> Thanks,
> >> Maxime
> >>
> >> ### [PATCH] vhost: try to unroll for each loop
> >>
> >> WARNING:CAMELCASE: Avoid CamelCase: <_Pragma>
> >> #78: FILE: lib/librte_vhost/vhost.h:47:
> >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll
> >> 4") \
> >>
> >> ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in
> >> parenthesis
> >> #78: FILE: lib/librte_vhost/vhost.h:47:
> >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll
> >> 4") \
> >> + for (iter = val; iter < size; iter++)
> >>
> >> ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in
> >> parenthesis
> >> #83: FILE: lib/librte_vhost/vhost.h:52:
> >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4")
> \
> >> + for (iter = val; iter < size; iter++)
> >>
> >> ERROR:COMPLEX_MACRO: Macros with complex values should be enclosed in
> >> parenthesis
> >> #88: FILE: lib/librte_vhost/vhost.h:57:
> >> +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)")
> \
> >> + for (iter = val; iter < size; iter++)
> >>
> >> total: 3 errors, 1 warnings, 67 lines checked
> >>
> >> 0/1 valid patch/tmp/dpdk_build/lib/librte_vhost/virtio_net.c:2065:1:
> >> error: unused function 'free_zmbuf' [-Werror,-Wunused-function]
> >> free_zmbuf(struct vhost_virtqueue *vq)
> >> ^
> >> 1 error generated.
> >> make[5]: *** [virtio_net.o] Error 1
> >> make[4]: *** [librte_vhost] Error 2
> >> make[4]: *** Waiting for unfinished jobs....
> >> make[3]: *** [lib] Error 2
> >> make[2]: *** [all] Error 2
> >> make[1]: *** [pre_install] Error 2
> >> make: *** [install] Error 2
> >>
> >>
> >> On 10/22/19 12:08 AM, Marvin Liu wrote:
> >>> Packed ring has more compact ring format and thus can significantly
> >>> reduce the number of cache miss. It can lead to better performance.
> >>> This has been approved in virtio user driver, on normal E5 Xeon cpu
> >>> single core performance can raise 12%.
> >>>
> >>> http://mails.dpdk.org/archives/dev/2018-April/095470.html
> >>>
> >>> However vhost performance with packed ring performance was decreased.
> >>> Through analysis, mostly extra cost was from the calculating of each
> >>> descriptor flag which depended on ring wrap counter. Moreover, both
> >>> frontend and backend need to write same descriptors which will cause
> >>> cache contention. Especially when doing vhost enqueue function, virtio
> >>> refill packed ring function may write same cache line when vhost doing
> >>> enqueue function. This kind of extra cache cost will reduce the benefit
> >>> of reducing cache misses.
> >>>
> >>> For optimizing vhost packed ring performance, vhost enqueue and dequeue
> >>> function will be splitted into fast and normal path.
> >>>
> >>> Several methods will be taken in fast path:
> >>> Handle descriptors in one cache line by batch.
> >>> Split loop function into more pieces and unroll them.
> >>> Prerequisite check that whether I/O space can copy directly into mbuf
> >>> space and vice versa.
> >>> Prerequisite check that whether descriptor mapping is successful.
> >>> Distinguish vhost used ring update function by enqueue and dequeue
> >>> function.
> >>> Buffer dequeue used descriptors as many as possible.
> >>> Update enqueue used descriptors by cache line.
> >>>
> >>> After all these methods done, single core vhost PvP performance with
> 64B
> >>> packet on Xeon 8180 can boost 35%.
> >>>
> >>> v8:
> >>> - Allocate mbuf by virtio_dev_pktmbuf_alloc
> >>>
> >>> v7:
> >>> - Rebase code
> >>> - Rename unroll macro and definitions
> >>> - Calculate flags when doing single dequeue
> >>>
> >>> v6:
> >>> - Fix dequeue zcopy result check
> >>>
> >>> v5:
> >>> - Remove disable sw prefetch as performance impact is small
> >>> - Change unroll pragma macro format
> >>> - Rename shadow counter elements names
> >>> - Clean dequeue update check condition
> >>> - Add inline functions replace of duplicated code
> >>> - Unify code style
> >>>
> >>> v4:
> >>> - Support meson build
> >>> - Remove memory region cache for no clear performance gain and ABI
> break
> >>> - Not assume ring size is power of two
> >>>
> >>> v3:
> >>> - Check available index overflow
> >>> - Remove dequeue remained descs number check
> >>> - Remove changes in split ring datapath
> >>> - Call memory write barriers once when updating used flags
> >>> - Rename some functions and macros
> >>> - Code style optimization
> >>>
> >>> v2:
> >>> - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc
> >>> - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST)
> >>> - Optimize dequeue used ring update when in_order negotiated
> >>>
> >>>
> >>> Marvin Liu (13):
> >>> vhost: add packed ring indexes increasing function
> >>> vhost: add packed ring single enqueue
> >>> vhost: try to unroll for each loop
> >>> vhost: add packed ring batch enqueue
> >>> vhost: add packed ring single dequeue
> >>> vhost: add packed ring batch dequeue
> >>> vhost: flush enqueue updates by cacheline
> >>> vhost: flush batched enqueue descs directly
> >>> vhost: buffer packed ring dequeue updates
> >>> vhost: optimize packed ring enqueue
> >>> vhost: add packed ring zcopy batch and single dequeue
> >>> vhost: optimize packed ring dequeue
> >>> vhost: optimize packed ring dequeue when in-order
> >>>
> >>> lib/librte_vhost/Makefile | 18 +
> >>> lib/librte_vhost/meson.build | 7 +
> >>> lib/librte_vhost/vhost.h | 57 ++
> >>> lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++-------
> >>> 4 files changed, 837 insertions(+), 193 deletions(-)
> >>>
> >
On 10/24/19 6:08 PM, Marvin Liu wrote:
> Packed ring has more compact ring format and thus can significantly
> reduce the number of cache miss. It can lead to better performance.
> This has been approved in virtio user driver, on normal E5 Xeon cpu
> single core performance can raise 12%.
>
> http://mails.dpdk.org/archives/dev/2018-April/095470.html
>
> However vhost performance with packed ring performance was decreased.
> Through analysis, mostly extra cost was from the calculating of each
> descriptor flag which depended on ring wrap counter. Moreover, both
> frontend and backend need to write same descriptors which will cause
> cache contention. Especially when doing vhost enqueue function, virtio
> refill packed ring function may write same cache line when vhost doing
> enqueue function. This kind of extra cache cost will reduce the benefit
> of reducing cache misses.
>
> For optimizing vhost packed ring performance, vhost enqueue and dequeue
> function will be split into fast and normal path.
>
> Several methods will be taken in fast path:
> Handle descriptors in one cache line by batch.
> Split loop function into more pieces and unroll them.
> Prerequisite check that whether I/O space can copy directly into mbuf
> space and vice versa.
> Prerequisite check that whether descriptor mapping is successful.
> Distinguish vhost used ring update function by enqueue and dequeue
> function.
> Buffer dequeue used descriptors as many as possible.
> Update enqueue used descriptors by cache line.
>
> After all these methods done, single core vhost PvP performance with 64B
> packet on Xeon 8180 can boost 35%.
>
> v9:
> - Fix clang build error
>
> v8:
> - Allocate mbuf by virtio_dev_pktmbuf_alloc
>
> v7:
> - Rebase code
> - Rename unroll macro and definitions
> - Calculate flags when doing single dequeue
>
> v6:
> - Fix dequeue zcopy result check
>
> v5:
> - Remove disable sw prefetch as performance impact is small
> - Change unroll pragma macro format
> - Rename shadow counter elements names
> - Clean dequeue update check condition
> - Add inline functions replace of duplicated code
> - Unify code style
>
> v4:
> - Support meson build
> - Remove memory region cache for no clear performance gain and ABI break
> - Not assume ring size is power of two
>
> v3:
> - Check available index overflow
> - Remove dequeue remained descs number check
> - Remove changes in split ring datapath
> - Call memory write barriers once when updating used flags
> - Rename some functions and macros
> - Code style optimization
>
> v2:
> - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc
> - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST)
> - Optimize dequeue used ring update when in_order negotiated
>
>
> Marvin Liu (13):
> vhost: add packed ring indexes increasing function
> vhost: add packed ring single enqueue
> vhost: try to unroll for each loop
> vhost: add packed ring batch enqueue
> vhost: add packed ring single dequeue
> vhost: add packed ring batch dequeue
> vhost: flush enqueue updates by cacheline
> vhost: flush batched enqueue descs directly
> vhost: buffer packed ring dequeue updates
> vhost: optimize packed ring enqueue
> vhost: add packed ring zcopy batch and single dequeue
> vhost: optimize packed ring dequeue
> vhost: optimize packed ring dequeue when in-order
>
> lib/librte_vhost/Makefile | 18 +
> lib/librte_vhost/meson.build | 7 +
> lib/librte_vhost/vhost.h | 57 ++
> lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++-------
> 4 files changed, 837 insertions(+), 193 deletions(-)
>
Applied to dpdk-next-virtio/master.
Thanks,
Maxime
Packed ring has more compact ring format and thus can significantly reduce the number of cache miss. It can lead to better performance. This has been approved in virtio user driver, on normal E5 Xeon cpu single core performance can raise 12%. http://mails.dpdk.org/archives/dev/2018-April/095470.html However vhost performance with packed ring performance was decreased. Through analysis, mostly extra cost was from the calculating of each descriptor flag which depended on ring wrap counter. Moreover, both frontend and backend need to write same descriptors which will cause cache contention. Especially when doing vhost enqueue function, virtio refill packed ring function may write same cache line when vhost doing enqueue function. This kind of extra cache cost will reduce the benefit of reducing cache misses. For optimizing vhost packed ring performance, vhost enqueue and dequeue function will be split into fast and normal path. Several methods will be taken in fast path: Handle descriptors in one cache line by batch. Split loop function into more pieces and unroll them. Prerequisite check that whether I/O space can copy directly into mbuf space and vice versa. Prerequisite check that whether descriptor mapping is successful. Distinguish vhost used ring update function by enqueue and dequeue function. Buffer dequeue used descriptors as many as possible. Update enqueue used descriptors by cache line. After all these methods done, single core vhost PvP performance with 64B packet on Xeon 8180 can boost 35%. 
v9: - Fix clang build error v8: - Allocate mbuf by virtio_dev_pktmbuf_alloc v7: - Rebase code - Rename unroll macro and definitions - Calculate flags when doing single dequeue v6: - Fix dequeue zcopy result check v5: - Remove disable sw prefetch as performance impact is small - Change unroll pragma macro format - Rename shadow counter elements names - Clean dequeue update check condition - Add inline functions replace of duplicated code - Unify code style v4: - Support meson build - Remove memory region cache for no clear performance gain and ABI break - Not assume ring size is power of two v3: - Check available index overflow - Remove dequeue remained descs number check - Remove changes in split ring datapath - Call memory write barriers once when updating used flags - Rename some functions and macros - Code style optimization v2: - Utilize compiler's pragma to unroll loop, distinguish clang/icc/gcc - Buffered dequeue used desc number changed to (RING_SZ - PKT_BURST) - Optimize dequeue used ring update when in_order negotiated Marvin Liu (13): vhost: add packed ring indexes increasing function vhost: add packed ring single enqueue vhost: try to unroll for each loop vhost: add packed ring batch enqueue vhost: add packed ring single dequeue vhost: add packed ring batch dequeue vhost: flush enqueue updates by cacheline vhost: flush batched enqueue descs directly vhost: buffer packed ring dequeue updates vhost: optimize packed ring enqueue vhost: add packed ring zcopy batch and single dequeue vhost: optimize packed ring dequeue vhost: optimize packed ring dequeue when in-order lib/librte_vhost/Makefile | 18 + lib/librte_vhost/meson.build | 7 + lib/librte_vhost/vhost.h | 57 ++ lib/librte_vhost/virtio_net.c | 948 +++++++++++++++++++++++++++------- 4 files changed, 837 insertions(+), 193 deletions(-) -- 2.17.1
When enqueuing or dequeuing, the virtqueue's local available and used indexes are increased. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/vhost.h | 20 ++++++++++++++++++++ lib/librte_vhost/virtio_net.c | 18 +++--------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index c76d40115..02b3c91ff 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -367,6 +367,26 @@ desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter) wrap_counter != !!(flags & VRING_DESC_F_USED); } +static inline void +vq_inc_last_used_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_used_idx += num; + if (vq->last_used_idx >= vq->size) { + vq->used_wrap_counter ^= 1; + vq->last_used_idx -= vq->size; + } +} + +static inline void +vq_inc_last_avail_packed(struct vhost_virtqueue *vq, uint16_t num) +{ + vq->last_avail_idx += num; + if (vq->last_avail_idx >= vq->size) { + vq->avail_wrap_counter ^= 1; + vq->last_avail_idx -= vq->size; + } +} + void __vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq, uint64_t addr, uint64_t len); diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 66f0c7206..070d62bc0 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -138,11 +138,7 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, head_flags = flags; } - vq->last_used_idx += vq->shadow_used_packed[i].count; - if (vq->last_used_idx >= vq->size) { - vq->used_wrap_counter ^= 1; - vq->last_used_idx -= vq->size; - } + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); } __atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags, @@ -865,11 +861,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, break; } - vq->last_avail_idx += nr_descs; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= 
vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, nr_descs); } do_data_copy_enqueue(dev, vq); @@ -1585,11 +1577,7 @@ virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); } - vq->last_avail_idx += desc_count; - if (vq->last_avail_idx >= vq->size) { - vq->last_avail_idx -= vq->size; - vq->avail_wrap_counter ^= 1; - } + vq_inc_last_avail_packed(vq, desc_count); } if (likely(dev->dequeue_zero_copy == 0)) { -- 2.17.1
Add vhost enqueue function for single packet and meanwhile left space for flush used ring function. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/virtio_net.c | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 070d62bc0..4fb6552cc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -770,6 +770,60 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, return error; } +static __rte_always_inline int +vhost_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt, + struct buf_vector *buf_vec, + uint16_t *nr_descs) +{ + uint16_t nr_vec = 0; + uint16_t avail_idx = vq->last_avail_idx; + uint16_t max_tries, tries = 0; + uint16_t buf_id = 0; + uint32_t len = 0; + uint16_t desc_count; + uint32_t size = pkt->pkt_len + dev->vhost_hlen; + uint16_t num_buffers = 0; + + if (rxvq_is_mergeable(dev)) + max_tries = vq->size - 1; + else + max_tries = 1; + + while (size > 0) { + /* + * if we tried all available ring items, and still + * can't get enough buf, it means something abnormal + * happened. 
+ */ + if (unlikely(++tries > max_tries)) + return -1; + + if (unlikely(fill_vec_buf_packed(dev, vq, + avail_idx, &desc_count, + buf_vec, &nr_vec, + &buf_id, &len, + VHOST_ACCESS_RW) < 0)) + return -1; + + len = RTE_MIN(len, size); + size -= len; + + num_buffers += 1; + + *nr_descs += desc_count; + avail_idx += desc_count; + if (avail_idx >= vq->size) + avail_idx -= vq->size; + } + + if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) + return -1; + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) @@ -827,6 +881,32 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int16_t +virtio_dev_rx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf *pkt) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint16_t nr_descs = 0; + + rte_smp_rmb(); + if (unlikely(vhost_enqueue_single_packed(dev, vq, pkt, buf_vec, + &nr_descs) < 0)) { + VHOST_LOG_DEBUG(VHOST_DATA, + "(%d) failed to get enough desc from vring\n", + dev->vid); + return -1; + } + + VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", + dev->vid, vq->last_avail_idx, + vq->last_avail_idx + nr_descs); + + vq_inc_last_avail_packed(vq, nr_descs); + + return 0; +} + static __rte_noinline uint32_t virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts, uint32_t count) -- 2.17.1
Create macro for adding unroll pragma before for each loop. Batch functions will be contained of several small loops which can be optimized by compilers' loop unrolling pragma. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/Makefile | 18 ++++++++++++++++++ lib/librte_vhost/meson.build | 7 +++++++ lib/librte_vhost/vhost.h | 24 ++++++++++++++++++++++++ 3 files changed, 49 insertions(+) diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile index 8623e91c0..87ce1fb27 100644 --- a/lib/librte_vhost/Makefile +++ b/lib/librte_vhost/Makefile @@ -16,6 +16,24 @@ CFLAGS += -I vhost_user CFLAGS += -fno-strict-aliasing LDLIBS += -lpthread +ifeq ($(RTE_TOOLCHAIN), gcc) +ifeq ($(shell test $(GCC_VERSION) -ge 83 && echo 1), 1) +CFLAGS += -DVHOST_GCC_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), clang) +ifeq ($(shell test $(CLANG_MAJOR_VERSION)$(CLANG_MINOR_VERSION) -ge 37 && echo 1), 1) +CFLAGS += -DVHOST_CLANG_UNROLL_PRAGMA +endif +endif + +ifeq ($(RTE_TOOLCHAIN), icc) +ifeq ($(shell test $(ICC_MAJOR_VERSION) -ge 16 && echo 1), 1) +CFLAGS += -DVHOST_ICC_UNROLL_PRAGMA +endif +endif + ifeq ($(CONFIG_RTE_LIBRTE_VHOST_NUMA),y) LDLIBS += -lnuma endif diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build index cb1123ae3..00435777e 100644 --- a/lib/librte_vhost/meson.build +++ b/lib/librte_vhost/meson.build @@ -8,6 +8,13 @@ endif if has_libnuma == 1 dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true) endif +if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0')) + cflags += '-DVHOST_GCC_UNROLL_PRAGMA' +elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0')) + cflags += '-DVHOST_CLANG_UNROLL_PRAGMA' +elif (toolchain == 'icc' and cc.version().version_compare('>=16.0.0')) + cflags += '-DVHOST_ICC_UNROLL_PRAGMA' +endif dpdk_conf.set('RTE_LIBRTE_VHOST_POSTCOPY', cc.has_header('linux/userfaultfd.h')) version = 4 diff --git a/lib/librte_vhost/vhost.h 
b/lib/librte_vhost/vhost.h index 02b3c91ff..a2b9221e0 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,30 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ + sizeof(struct vring_packed_desc)) +#define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) + +#ifdef VHOST_GCC_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("GCC unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VHOST_CLANG_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll 4") \ + for (iter = val; iter < size; iter++) +#endif + +#ifdef VHOST_ICC_UNROLL_PRAGMA +#define vhost_for_each_try_unroll(iter, val, size) _Pragma("unroll (4)") \ + for (iter = val; iter < size; iter++) +#endif + +#ifndef vhost_for_each_try_unroll +#define vhost_for_each_try_unroll(iter, val, num) \ + for (iter = val; iter < num; iter++) +#endif + /** * Structure contains buffer address, length and descriptor index * from vring to do scatter RX. -- 2.17.1
Batch enqueue function will first check whether descriptors are cache aligned. It will also check prerequisites in the beginning. Batch enqueue function does not support chained mbufs, single packet enqueue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/virtio_net.c | 70 +++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 4fb6552cc..4ddf26567 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -881,6 +881,76 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } +static __rte_unused int +virtio_dev_rx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts) +{ + bool wrap_counter = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + uint16_t avail_idx = vq->last_avail_idx; + uint64_t desc_addrs[PACKED_BATCH_SIZE]; + struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(pkts[i]->next != NULL)) + return -1; + if (unlikely(!desc_is_avail(&descs[avail_idx + i], + wrap_counter))) + return -1; + } + + rte_smp_rmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + lens[i] = descs[avail_idx + i].len; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset))) + return -1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], + VHOST_ACCESS_RW); + + vhost_for_each_try_unroll(i, 0, 
PACKED_BATCH_SIZE) { + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *) + (uintptr_t)desc_addrs[i]; + lens[i] = pkts[i]->pkt_len + dev->vhost_hlen; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + virtio_enqueue_offload(pkts[i], &hdrs[i]->hdr); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + rte_memcpy((void *)(uintptr_t)(desc_addrs[i] + buf_offset), + rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + pkts[i]->pkt_len); + } + + return 0; +} + static __rte_unused int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Add vhost single packet dequeue function for packed ring and meanwhile left space for shadow used ring update function. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/virtio_net.c | 55 +++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 4ddf26567..87f2ae49e 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1635,6 +1635,61 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_dequeue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint16_t *buf_id, + uint16_t *desc_count) +{ + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t buf_len; + uint16_t nr_vec = 0; + int err; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, + buf_vec, &nr_vec, + buf_id, &buf_len, + VHOST_ACCESS_RO) < 0)) + return -1; + + *pkts = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); + if (unlikely(*pkts == NULL)) { + RTE_LOG(ERR, VHOST_DATA, + "Failed to allocate memory for mbuf.\n"); + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(*pkts); + return -1; + } + + return 0; +} + +static __rte_unused int +virtio_dev_tx_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + + uint16_t buf_id, desc_count; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + vq_inc_last_avail_packed(vq, desc_count); + + return 0; +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Add batch dequeue function like enqueue function for packed ring, batch dequeue function will not support chained descriptors, single packet dequeue function will handle it. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/vhost.h | 3 + lib/librte_vhost/virtio_net.c | 111 ++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index a2b9221e0..67724c342 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ + VRING_DESC_F_INDIRECT) + #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \ sizeof(struct vring_packed_desc)) #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 87f2ae49e..76435204f 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -1635,6 +1635,117 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return i; } +static __rte_always_inline int +vhost_reserve_avail_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint16_t avail_idx, + uintptr_t *desc_addrs, + uint16_t *ids) +{ + bool wrap = vq->avail_wrap_counter; + struct vring_packed_desc *descs = vq->desc_packed; + struct virtio_net_hdr *hdr; + uint64_t lens[PACKED_BATCH_SIZE]; + uint64_t buf_lens[PACKED_BATCH_SIZE]; + uint32_t buf_offset = dev->vhost_hlen; + uint16_t flags, i; + + if (unlikely(avail_idx & PACKED_BATCH_MASK)) + return -1; + if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + flags = descs[avail_idx + i].flags; + if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) || + (wrap == !!(flags & VRING_DESC_F_USED)) || + (flags & 
PACKED_DESC_SINGLE_DEQUEUE_FLAG))) + return -1; + } + + rte_smp_rmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + lens[i] = descs[avail_idx + i].len; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + desc_addrs[i] = vhost_iova_to_vva(dev, vq, + descs[avail_idx + i].addr, + &lens[i], VHOST_ACCESS_RW); + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, lens[i]); + if (!pkts[i]) + goto free_buf; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + if (unlikely(buf_lens[i] < (lens[i] - buf_offset))) + goto free_buf; + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset; + pkts[i]->data_len = pkts[i]->pkt_len; + ids[i] = descs[avail_idx + i].id; + } + + if (virtio_net_with_host_offload(dev)) { + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + hdr = (struct virtio_net_hdr *)(desc_addrs[i]); + vhost_dequeue_offload(hdr, pkts[i]); + } + } + + return 0; + +free_buf: + for (i = 0; i < PACKED_BATCH_SIZE; i++) + rte_pktmbuf_free(pkts[i]); + + return -1; +} + +static __rte_unused int +virtio_dev_tx_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts) +{ + uint16_t avail_idx = vq->last_avail_idx; + uint32_t buf_offset = dev->vhost_hlen; + uintptr_t desc_addrs[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; + uint16_t i; + + if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts, + avail_idx, desc_addrs, ids)) + return -1; + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + rte_prefetch0((void *)(uintptr_t)desc_addrs[i]); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + 
rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0), + (void *)(uintptr_t)(desc_addrs[i] + buf_offset), + pkts[i]->pkt_len); + + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); + + return 0; +} + static __rte_always_inline int vhost_dequeue_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, -- 2.17.1
Buffer vhost packed ring enqueue updates, flush ring descs if buffered content filled up one cacheline. Thus virtio can receive packets at a faster frequency. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/vhost.h | 2 + lib/librte_vhost/virtio_net.c | 101 +++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index 67724c342..d59446442 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -174,6 +174,8 @@ struct vhost_virtqueue { struct vring_used_elem_packed *shadow_used_packed; }; uint16_t shadow_used_idx; + /* Record packed ring enqueue latest desc cache aligned index */ + uint16_t shadow_aligned_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 76435204f..25bffdd52 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -91,6 +91,69 @@ update_shadow_used_ring_split(struct vhost_virtqueue *vq, vq->shadow_used_split[i].len = len; } +static __rte_always_inline void +vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int i; + uint16_t used_idx = vq->last_used_idx; + uint16_t head_idx = vq->last_used_idx; + uint16_t head_flags = 0; + + /* Split loop in two to save memory barriers */ + for (i = 0; i < vq->shadow_used_idx; i++) { + vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; + vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; + + used_idx += vq->shadow_used_packed[i].count; + if (used_idx >= vq->size) + used_idx -= vq->size; + } + + rte_smp_wmb(); + + for (i = 0; i < vq->shadow_used_idx; i++) { + uint16_t flags; + + if (vq->shadow_used_packed[i].len) + flags = VRING_DESC_F_WRITE; + else + flags = 0; + + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= 
VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (i > 0) { + vq->desc_packed[vq->last_used_idx].flags = flags; + + vhost_log_cache_used_vring(dev, vq, + vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + } else { + head_idx = vq->last_used_idx; + head_flags = flags; + } + + vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); + } + + vq->desc_packed[head_idx].flags = head_flags; + + vhost_log_cache_used_vring(dev, vq, + head_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -194,6 +257,33 @@ do_data_copy_dequeue(struct vhost_virtqueue *vq) vq->batch_copy_nb_elems = 0; } +static __rte_always_inline void +vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint32_t len[], + uint16_t id[], + uint16_t count[], + uint16_t num_buffers) +{ + uint16_t i; + for (i = 0; i < num_buffers; i++) { + /* enqueue shadow flush action aligned with batch num */ + if (!vq->shadow_used_idx) + vq->shadow_aligned_idx = vq->last_used_idx & + PACKED_BATCH_MASK; + vq->shadow_used_packed[vq->shadow_used_idx].id = id[i]; + vq->shadow_used_packed[vq->shadow_used_idx].len = len[i]; + vq->shadow_used_packed[vq->shadow_used_idx].count = count[i]; + vq->shadow_aligned_idx += count[i]; + vq->shadow_used_idx++; + } + + if (vq->shadow_aligned_idx >= PACKED_BATCH_SIZE) { + do_data_copy_enqueue(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -785,6 +875,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, uint16_t desc_count; uint32_t size = pkt->pkt_len + dev->vhost_hlen; uint16_t num_buffers 
= 0; + uint32_t buffer_len[vq->size]; + uint16_t buffer_buf_id[vq->size]; + uint16_t buffer_desc_count[vq->size]; if (rxvq_is_mergeable(dev)) max_tries = vq->size - 1; @@ -810,6 +903,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, len = RTE_MIN(len, size); size -= len; + buffer_len[num_buffers] = len; + buffer_buf_id[num_buffers] = buf_id; + buffer_desc_count[num_buffers] = desc_count; num_buffers += 1; *nr_descs += desc_count; @@ -821,6 +917,9 @@ vhost_enqueue_single_packed(struct virtio_net *dev, if (copy_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers) < 0) return -1; + vhost_shadow_enqueue_single_packed(dev, vq, buffer_len, buffer_buf_id, + buffer_desc_count, num_buffers); + return 0; } @@ -1017,7 +1116,7 @@ virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, do_data_copy_enqueue(dev, vq); if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); + vhost_flush_enqueue_shadow_packed(dev, vq); vhost_vring_call_packed(dev, vq); } -- 2.17.1
Flush used elements when batched enqueue function is finished. Descriptor's flags are pre-calculated as they will be reset by vhost. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Gavin Hu <gavin.hu@arm.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/vhost.h | 3 +++ lib/librte_vhost/virtio_net.c | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index d59446442..f8dbe841c 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -39,6 +39,9 @@ #define VHOST_LOG_CACHE_NR 32 +#define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ + VRING_DESC_F_WRITE) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 25bffdd52..51ce32064 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,36 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_enqueue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t *lens, + uint16_t *ids) +{ + uint16_t i; + uint16_t flags; + + flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = lens[i]; + } + + rte_smp_wmb(); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void 
flush_shadow_used_ring_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) @@ -992,6 +1022,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, struct virtio_net_hdr_mrg_rxbuf *hdrs[PACKED_BATCH_SIZE]; uint32_t buf_offset = dev->vhost_hlen; uint64_t lens[PACKED_BATCH_SIZE]; + uint16_t ids[PACKED_BATCH_SIZE]; uint16_t i; if (unlikely(avail_idx & PACKED_BATCH_MASK)) @@ -1047,6 +1078,11 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, pkts[i]->pkt_len); } + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) + ids[i] = descs[avail_idx + i].id; + + vhost_flush_enqueue_batch_packed(dev, vq, lens, ids); + return 0; } -- 2.17.1
Buffer used ring updates as many as possible in vhost dequeue function for coordinating with virtio driver. For supporting buffer, shadow used ring element should contain descriptor's flags. First shadowed ring index was recorded for calculating buffered number. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/vhost.h | 5 ++ lib/librte_vhost/virtio_net.c | 112 ++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index f8dbe841c..9f11b28a3 100644 --- a/lib/librte_vhost/vhost.h +++ b/lib/librte_vhost/vhost.h @@ -42,6 +42,8 @@ #define PACKED_DESC_ENQUEUE_USED_FLAG(w) \ ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \ VRING_DESC_F_WRITE) +#define PACKED_DESC_DEQUEUE_USED_FLAG(w) \ + ((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED) : 0x0) #define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \ VRING_DESC_F_INDIRECT) @@ -114,6 +116,7 @@ struct log_cache_entry { struct vring_used_elem_packed { uint16_t id; + uint16_t flags; uint32_t len; uint32_t count; }; @@ -179,6 +182,8 @@ struct vhost_virtqueue { uint16_t shadow_used_idx; /* Record packed ring enqueue latest desc cache aligned index */ uint16_t shadow_aligned_idx; + /* Record packed ring first dequeue desc index */ + uint16_t shadow_last_used_idx; struct vhost_vring_addr ring_addrs; struct batch_copy_elem *batch_copy_elems; diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 51ce32064..b09e03fbc 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -154,6 +154,23 @@ vhost_flush_enqueue_shadow_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_flush_dequeue_shadow_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + struct vring_used_elem_packed *used_elem = &vq->shadow_used_packed[0]; + + 
vq->desc_packed[vq->shadow_last_used_idx].id = used_elem->id; + rte_smp_wmb(); + vq->desc_packed[vq->shadow_last_used_idx].flags = used_elem->flags; + + vhost_log_cache_used_vring(dev, vq, vq->shadow_last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc)); + vq->shadow_used_idx = 0; + vhost_log_cache_sync(dev, vq); +} + static __rte_always_inline void vhost_flush_enqueue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -246,6 +263,78 @@ flush_shadow_used_ring_packed(struct virtio_net *dev, vhost_log_cache_sync(dev, vq); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint16_t *ids) +{ + uint16_t flags; + uint16_t i; + uint16_t begin; + + flags = PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].id = ids[0]; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + begin = 1; + } else + begin = 0; + + vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) { + vq->desc_packed[vq->last_used_idx + i].id = ids[i]; + vq->desc_packed[vq->last_used_idx + i].len = 0; + } + + rte_smp_wmb(); + vhost_for_each_try_unroll(i, begin, PACKED_BATCH_SIZE) + vq->desc_packed[vq->last_used_idx + i].flags = flags; + + vhost_log_cache_used_vring(dev, vq, vq->last_used_idx * + sizeof(struct vring_packed_desc), + sizeof(struct vring_packed_desc) * + PACKED_BATCH_SIZE); + vhost_log_cache_sync(dev, vq); + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + +static __rte_always_inline void +vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + uint16_t flags; + + flags = vq->desc_packed[vq->last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + 
flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + + vq->shadow_used_packed[0].id = buf_id; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + } else { + vq->desc_packed[vq->last_used_idx].id = buf_id; + vq->desc_packed[vq->last_used_idx].len = 0; + vq->desc_packed[vq->last_used_idx].flags = flags; + } + + vq_inc_last_used_packed(vq, count); +} + static __rte_always_inline void update_shadow_used_ring_packed(struct vhost_virtqueue *vq, uint16_t desc_idx, uint32_t len, uint16_t count) @@ -314,6 +403,25 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } +static __rte_unused void +vhost_flush_dequeue_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq) +{ + int shadow_count; + if (!vq->shadow_used_idx) + return; + + shadow_count = vq->last_used_idx - vq->shadow_last_used_idx; + if (shadow_count <= 0) + shadow_count += vq->size; + + if ((uint32_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { + do_data_copy_dequeue(vq); + vhost_flush_dequeue_shadow_packed(dev, vq); + vhost_vring_call_packed(dev, vq); + } +} + /* avoid write operation when necessary, to lessen cache issues */ #define ASSIGN_UNLESS_EQUAL(var, val) do { \ if ((var) != (val)) \ @@ -1876,6 +1984,8 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); + vhost_shadow_dequeue_batch_packed(dev, vq, ids); + vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); return 0; @@ -1931,6 +2041,8 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + vq_inc_last_avail_packed(vq, desc_count); return 0; -- 2.17.1
Optimize vhost device packed ring enqueue function by splitting batch and single functions. Packets can be filled into one desc will be handled by batch and others will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/virtio_net.c | 115 +++++++--------------------------- 1 file changed, 24 insertions(+), 91 deletions(-) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index b09e03fbc..1c63262ce 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -778,64 +778,6 @@ fill_vec_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, return 0; } -/* - * Returns -1 on fail, 0 on success - */ -static inline int -reserve_avail_buf_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - uint32_t size, struct buf_vector *buf_vec, - uint16_t *nr_vec, uint16_t *num_buffers, - uint16_t *nr_descs) -{ - uint16_t avail_idx; - uint16_t vec_idx = 0; - uint16_t max_tries, tries = 0; - - uint16_t buf_id = 0; - uint32_t len = 0; - uint16_t desc_count; - - *num_buffers = 0; - avail_idx = vq->last_avail_idx; - - if (rxvq_is_mergeable(dev)) - max_tries = vq->size - 1; - else - max_tries = 1; - - while (size > 0) { - /* - * if we tried all available ring items, and still - * can't get enough buf, it means something abnormal - * happened. 
- */ - if (unlikely(++tries > max_tries)) - return -1; - - if (unlikely(fill_vec_buf_packed(dev, vq, - avail_idx, &desc_count, - buf_vec, &vec_idx, - &buf_id, &len, - VHOST_ACCESS_RW) < 0)) - return -1; - - len = RTE_MIN(len, size); - update_shadow_used_ring_packed(vq, buf_id, len, desc_count); - size -= len; - - avail_idx += desc_count; - if (avail_idx >= vq->size) - avail_idx -= vq->size; - - *nr_descs += desc_count; - *num_buffers += 1; - } - - *nr_vec = vec_idx; - - return 0; -} - static __rte_noinline void copy_vnet_hdr_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq, struct buf_vector *buf_vec, @@ -1118,7 +1060,7 @@ virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, return pkt_idx; } -static __rte_unused int +static __rte_always_inline int virtio_dev_rx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf **pkts) @@ -1194,7 +1136,7 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, return 0; } -static __rte_unused int16_t +static __rte_always_inline int16_t virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mbuf *pkt) @@ -1221,49 +1163,40 @@ virtio_dev_rx_single_packed(struct virtio_net *dev, } static __rte_noinline uint32_t -virtio_dev_rx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mbuf **pkts, uint32_t count) +virtio_dev_rx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mbuf **pkts, + uint32_t count) { uint32_t pkt_idx = 0; - uint16_t num_buffers; - struct buf_vector buf_vec[BUF_VECTOR_MAX]; + uint32_t remained = count; - for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { - uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen; - uint16_t nr_vec = 0; - uint16_t nr_descs = 0; + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - if (unlikely(reserve_avail_buf_packed(dev, vq, - pkt_len, buf_vec, &nr_vec, - &num_buffers, &nr_descs) < 0)) { - VHOST_LOG_DEBUG(VHOST_DATA, - "(%d) failed to get 
enough desc from vring\n", - dev->vid); - vq->shadow_used_idx -= num_buffers; - break; + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_rx_batch_packed(dev, vq, pkts)) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n", - dev->vid, vq->last_avail_idx, - vq->last_avail_idx + num_buffers); - - if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx], - buf_vec, nr_vec, - num_buffers) < 0) { - vq->shadow_used_idx -= num_buffers; + if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx])) break; - } + pkt_idx++; + remained--; - vq_inc_last_avail_packed(vq, nr_descs); - } - - do_data_copy_enqueue(dev, vq); + } while (pkt_idx < count); - if (likely(vq->shadow_used_idx)) { + if (vq->shadow_used_idx) { + do_data_copy_enqueue(dev, vq); vhost_flush_enqueue_shadow_packed(dev, vq); - vhost_vring_call_packed(dev, vq); } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); + return pkt_idx; } -- 2.17.1
Add vhost packed ring zero-copy batch and single-packet dequeue functions, mirroring the normal dequeue path.
buf_id, desc_count; + struct zcopy_mbuf *zmbuf; + + if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, + &desc_count)) + return -1; + + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(*pkts); + return -1; + } + zmbuf->mbuf = *pkts; + zmbuf->desc_idx = vq->last_avail_idx; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); + + vq->nr_zmbuf += 1; + TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + + vq_inc_last_avail_packed(vq, desc_count); + return 0; +} + +static __rte_unused void +free_zmbuf(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *next = NULL; + struct zcopy_mbuf *zmbuf; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + uint16_t last_used_idx = vq->last_used_idx; + + if (mbuf_is_consumed(zmbuf->mbuf)) { + uint16_t flags; + flags = vq->desc_packed[last_used_idx].flags; + if (vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + vq->desc_packed[last_used_idx].id = zmbuf->desc_idx; + vq->desc_packed[last_used_idx].len = 0; + + rte_smp_wmb(); + vq->desc_packed[last_used_idx].flags = flags; + + vq_inc_last_used_packed(vq, zmbuf->desc_count); + + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + restore_mbuf(zmbuf->mbuf); + rte_pktmbuf_free(zmbuf->mbuf); + put_zmbuf(zmbuf); + vq->nr_zmbuf -= 1; + } + } +} + static __rte_noinline uint16_t virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) -- 2.17.1
Optimize vhost device packed ring dequeue function by splitting batch and single functions. No-chained and direct descriptors will be handled by batch and other will be handled by single as before. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/virtio_net.c | 236 ++++++++++------------------------ 1 file changed, 67 insertions(+), 169 deletions(-) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index 0243573a3..ab6726996 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -201,68 +201,6 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } -static __rte_always_inline void -flush_shadow_used_ring_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq) -{ - int i; - uint16_t used_idx = vq->last_used_idx; - uint16_t head_idx = vq->last_used_idx; - uint16_t head_flags = 0; - - /* Split loop in two to save memory barriers */ - for (i = 0; i < vq->shadow_used_idx; i++) { - vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id; - vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len; - - used_idx += vq->shadow_used_packed[i].count; - if (used_idx >= vq->size) - used_idx -= vq->size; - } - - for (i = 0; i < vq->shadow_used_idx; i++) { - uint16_t flags; - - if (vq->shadow_used_packed[i].len) - flags = VRING_DESC_F_WRITE; - else - flags = 0; - - if (vq->used_wrap_counter) { - flags |= VRING_DESC_F_USED; - flags |= VRING_DESC_F_AVAIL; - } else { - flags &= ~VRING_DESC_F_USED; - flags &= ~VRING_DESC_F_AVAIL; - } - - if (i > 0) { - vq->desc_packed[vq->last_used_idx].flags = flags; - - vhost_log_cache_used_vring(dev, vq, - vq->last_used_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - } else { - head_idx = vq->last_used_idx; - head_flags = flags; - } - - vq_inc_last_used_packed(vq, vq->shadow_used_packed[i].count); - } - - 
__atomic_store_n(&vq->desc_packed[head_idx].flags, head_flags, - __ATOMIC_RELEASE); - - vhost_log_cache_used_vring(dev, vq, - head_idx * - sizeof(struct vring_packed_desc), - sizeof(struct vring_packed_desc)); - - vq->shadow_used_idx = 0; - vhost_log_cache_sync(dev, vq); -} - static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -335,17 +273,6 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } -static __rte_always_inline void -update_shadow_used_ring_packed(struct vhost_virtqueue *vq, - uint16_t desc_idx, uint32_t len, uint16_t count) -{ - uint16_t i = vq->shadow_used_idx++; - - vq->shadow_used_packed[i].id = desc_idx; - vq->shadow_used_packed[i].len = len; - vq->shadow_used_packed[i].count = count; -} - static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -403,7 +330,7 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, } } -static __rte_unused void +static __rte_always_inline void vhost_flush_dequeue_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1893,7 +1820,7 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1961,7 +1888,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -1981,7 +1908,7 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, return 0; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2030,7 +1957,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net 
*dev, return -1; } -static __rte_unused int +static __rte_always_inline int virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, struct rte_mempool *mbuf_pool, @@ -2061,7 +1988,7 @@ virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, return 0; } -static __rte_unused void +static __rte_always_inline void free_zmbuf(struct vhost_virtqueue *vq) { struct zcopy_mbuf *next = NULL; @@ -2102,111 +2029,77 @@ free_zmbuf(struct vhost_virtqueue *vq) } static __rte_noinline uint16_t -virtio_dev_tx_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, - struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +virtio_dev_tx_packed_zmbuf(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) { - uint16_t i; - - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf, *next; - - for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); - zmbuf != NULL; zmbuf = next) { - next = TAILQ_NEXT(zmbuf, next); + uint32_t pkt_idx = 0; + uint32_t remained = count; - if (mbuf_is_consumed(zmbuf->mbuf)) { - update_shadow_used_ring_packed(vq, - zmbuf->desc_idx, - 0, - zmbuf->desc_count); + free_zmbuf(vq); - TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); - restore_mbuf(zmbuf->mbuf); - rte_pktmbuf_free(zmbuf->mbuf); - put_zmbuf(zmbuf); - vq->nr_zmbuf -= 1; + do { + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed_zmbuf(dev, vq, + mbuf_pool, &pkts[pkt_idx])) { + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; } } - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__); - - count = RTE_MIN(count, MAX_PKT_BURST); - VHOST_LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n", - dev->vid, count); - - for (i = 0; i < count; i++) { - struct buf_vector buf_vec[BUF_VECTOR_MAX]; - uint16_t buf_id; - 
uint32_t buf_len; - uint16_t desc_count, nr_vec = 0; - int err; - - if (unlikely(fill_vec_buf_packed(dev, vq, - vq->last_avail_idx, &desc_count, - buf_vec, &nr_vec, - &buf_id, &buf_len, - VHOST_ACCESS_RO) < 0)) + if (virtio_dev_tx_single_packed_zmbuf(dev, vq, mbuf_pool, + &pkts[pkt_idx])) break; + pkt_idx++; + remained--; - if (likely(dev->dequeue_zero_copy == 0)) - update_shadow_used_ring_packed(vq, buf_id, 0, - desc_count); + } while (remained); - pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); - if (unlikely(pkts[i] == NULL)) - break; - - err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], - mbuf_pool); - if (unlikely(err)) { - rte_pktmbuf_free(pkts[i]); - break; - } + if (pkt_idx) + vhost_vring_call_packed(dev, vq); - if (unlikely(dev->dequeue_zero_copy)) { - struct zcopy_mbuf *zmbuf; + return pkt_idx; +} - zmbuf = get_zmbuf(vq); - if (!zmbuf) { - rte_pktmbuf_free(pkts[i]); - break; - } - zmbuf->mbuf = pkts[i]; - zmbuf->desc_idx = buf_id; - zmbuf->desc_count = desc_count; +static __rte_noinline uint16_t +virtio_dev_tx_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, + uint32_t count) +{ + uint32_t pkt_idx = 0; + uint32_t remained = count; - /* - * Pin lock the mbuf; we will check later to see - * whether the mbuf is freed (when we are the last - * user) or not. If that's the case, we then could - * update the used ring safely. 
- */ - rte_mbuf_refcnt_update(pkts[i], 1); + do { + rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]); - vq->nr_zmbuf += 1; - TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next); + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { + vhost_flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; + } } - vq_inc_last_avail_packed(vq, desc_count); - } + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; + vhost_flush_dequeue_packed(dev, vq); + pkt_idx++; + remained--; - if (likely(dev->dequeue_zero_copy == 0)) { + } while (remained); + + if (vq->shadow_used_idx) do_data_copy_dequeue(vq); - if (unlikely(i < count)) - vq->shadow_used_idx = i; - if (likely(vq->shadow_used_idx)) { - flush_shadow_used_ring_packed(dev, vq); - vhost_vring_call_packed(dev, vq); - } - } - return i; + return pkt_idx; } uint16_t @@ -2282,9 +2175,14 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, count -= 1; } - if (vq_is_packed(dev)) - count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, count); - else + if (vq_is_packed(dev)) { + if (unlikely(dev->dequeue_zero_copy)) + count = virtio_dev_tx_packed_zmbuf(dev, vq, mbuf_pool, + pkts, count); + else + count = virtio_dev_tx_packed(dev, vq, mbuf_pool, pkts, + count); + } else count = virtio_dev_tx_split(dev, vq, mbuf_pool, pkts, count); out: -- 2.17.1
When VIRTIO_F_IN_ORDER feature is negotiated, vhost can optimize dequeue function by only update first used descriptor. Signed-off-by: Marvin Liu <yong.liu@intel.com> Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com> --- lib/librte_vhost/virtio_net.c | 64 +++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c index ab6726996..5b8cb9e63 100644 --- a/lib/librte_vhost/virtio_net.c +++ b/lib/librte_vhost/virtio_net.c @@ -31,6 +31,12 @@ rxvq_is_mergeable(struct virtio_net *dev) return dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF); } +static __rte_always_inline bool +virtio_net_is_inorder(struct virtio_net *dev) +{ + return dev->features & (1ULL << VIRTIO_F_IN_ORDER); +} + static bool is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) { @@ -201,6 +207,24 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); } +static __rte_always_inline void +vhost_shadow_dequeue_batch_packed_inorder(struct vhost_virtqueue *vq, + uint16_t id) +{ + vq->shadow_used_packed[0].id = id; + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].flags = + PACKED_DESC_DEQUEUE_USED_FLAG(vq->used_wrap_counter); + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].count = 1; + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, PACKED_BATCH_SIZE); +} + static __rte_always_inline void vhost_shadow_dequeue_batch_packed(struct virtio_net *dev, struct vhost_virtqueue *vq, @@ -273,6 +297,34 @@ vhost_shadow_dequeue_single_packed(struct vhost_virtqueue *vq, vq_inc_last_used_packed(vq, count); } +static __rte_always_inline void +vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq, + uint16_t buf_id, + uint16_t count) +{ + uint16_t flags; + + vq->shadow_used_packed[0].id = buf_id; + + flags = vq->desc_packed[vq->last_used_idx].flags; + if 
(vq->used_wrap_counter) { + flags |= VRING_DESC_F_USED; + flags |= VRING_DESC_F_AVAIL; + } else { + flags &= ~VRING_DESC_F_USED; + flags &= ~VRING_DESC_F_AVAIL; + } + + if (!vq->shadow_used_idx) { + vq->shadow_last_used_idx = vq->last_used_idx; + vq->shadow_used_packed[0].len = 0; + vq->shadow_used_packed[0].flags = flags; + vq->shadow_used_idx++; + } + + vq_inc_last_used_packed(vq, count); +} + static inline void do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) { @@ -1844,7 +1896,11 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev, (void *)(uintptr_t)(desc_addrs[i] + buf_offset), pkts[i]->pkt_len); - vhost_shadow_dequeue_batch_packed(dev, vq, ids); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_batch_packed_inorder(vq, + ids[PACKED_BATCH_SIZE - 1]); + else + vhost_shadow_dequeue_batch_packed(dev, vq, ids); vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE); @@ -1901,7 +1957,11 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, &desc_count)) return -1; - vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); + if (virtio_net_is_inorder(dev)) + vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, + desc_count); + else + vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); vq_inc_last_avail_packed(vq, desc_count); -- 2.17.1