From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jianfeng Tan Subject: [PATCH 2/3] net/virtio_user: add vhost kernel support Date: Fri, 2 Dec 2016 14:31:14 +0000 Message-ID: <1480689075-66977-3-git-send-email-jianfeng.tan@intel.com> References: <1480689075-66977-1-git-send-email-jianfeng.tan@intel.com> Cc: yuanhan.liu@linux.intel.com, ferruh.yigit@intel.com, cunming.liang@intel.com, Jianfeng Tan To: dev@dpdk.org Return-path: Received: from mga06.intel.com (mga06.intel.com [134.134.136.31]) by dpdk.org (Postfix) with ESMTP id 4BFF0FA37 for ; Fri, 2 Dec 2016 15:30:43 +0100 (CET) In-Reply-To: <1480689075-66977-1-git-send-email-jianfeng.tan@intel.com> List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" This patch adds vhost kernel support under the vhost abstraction layer. Three main hook functions are added: - vhost_kernel_setup() to open the char device; each vq pair (Rx and Tx) would need to open it once; - vhost_kernel_ioctl() to communicate control messages with the vhost kernel module; - vhost_kernel_enable_queue_pair() to open the tap device and set it as the backend of the corresponding vhost fd (that is to say, vq pair). 
Signed-off-by: Jianfeng Tan --- drivers/net/virtio/Makefile | 1 + drivers/net/virtio/virtio_user/vhost.c | 21 +- drivers/net/virtio/virtio_user/vhost.h | 5 + drivers/net/virtio/virtio_user/vhost_kernel.c | 413 ++++++++++++++++++++++++++ 4 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.c diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile index 17f7129..f671f1f 100644 --- a/drivers/net/virtio/Makefile +++ b/drivers/net/virtio/Makefile @@ -61,6 +61,7 @@ endif ifeq ($(CONFIG_RTE_VIRTIO_USER),y) SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost.c SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c endif diff --git a/drivers/net/virtio/virtio_user/vhost.c b/drivers/net/virtio/virtio_user/vhost.c index 09e2e92..5384992 100644 --- a/drivers/net/virtio/virtio_user/vhost.c +++ b/drivers/net/virtio/virtio_user/vhost.c @@ -56,7 +56,7 @@ vhost_internal_valid_vid(int vid) static int vhost_internal_alloc(void) { - int i; + int i, q; for (i = 0; i < VHOST_MAX_DEVICES; ++i) if (internals[i].ops == NULL) @@ -69,16 +69,32 @@ vhost_internal_alloc(void) internals[i].vhostfd = -1; + for (q = 0; q < VHOST_KERNEL_MAX_QUEUES; ++q) { + internals[i].vhostfds[q] = -1; + internals[i].tapfds[q] = -1; + } + return 0; } static void vhost_internal_free(int id) { + int q; + internals[id].ops = NULL; if (internals[id].vhostfd >= 0) close(internals[id].vhostfd); + + for (q = 0; q < VHOST_KERNEL_MAX_QUEUES; ++q) { + if (internals[id].vhostfds[q] >= 0) + close(internals[id].vhostfds[q]); + if (internals[id].tapfds[q] >= 0) + close(internals[id].tapfds[q]); + } + if (internals[id].ifname) + free(internals[id].ifname); } @@ -107,6 +123,9 @@ vhost_setup(const char *path) if (is_vhost_user_by_type(path)) { 
ret = vhost_ops_user.setup(&internals[vid], path); internals[vid].ops = &vhost_ops_user; + } else { + ret = vhost_ops_kernel.setup(&internals[vid], path); + internals[vid].ops = &vhost_ops_kernel; } if (ret < 0) diff --git a/drivers/net/virtio/virtio_user/vhost.h b/drivers/net/virtio/virtio_user/vhost.h index b476ecc..b6fd092 100644 --- a/drivers/net/virtio/virtio_user/vhost.h +++ b/drivers/net/virtio/virtio_user/vhost.h @@ -114,6 +114,10 @@ struct vhost_internal { int vhostfd; /* for vhost-kernel */ + char *ifname; +#define VHOST_KERNEL_MAX_QUEUES 8 + int vhostfds[VHOST_KERNEL_MAX_QUEUES]; + int tapfds[VHOST_KERNEL_MAX_QUEUES]; }; typedef int (*vhost_setup_t)(struct vhost_internal *internal, @@ -132,6 +136,7 @@ struct vhost_ops { }; struct vhost_ops vhost_ops_user; +struct vhost_ops vhost_ops_kernel; int vhost_setup(const char *path); int vhost_call(int vid, enum vhost_user_request req, void *arg); diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c new file mode 100644 index 0000000..9a9e8bd --- /dev/null +++ b/drivers/net/virtio/virtio_user/vhost_kernel.c @@ -0,0 +1,413 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "vhost.h" + +struct vhost_memory_kernel { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[0]; +}; + +/* vhost kernel ioctls */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel) +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct 
vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/* TUN ioctls */ +#define TUNSETIFF _IOW('T', 202, int) +#define TUNGETFEATURES _IOR('T', 207, unsigned int) +#define TUNSETOFFLOAD _IOW('T', 208, unsigned int) +#define TUNGETIFF _IOR('T', 210, unsigned int) +#define TUNSETSNDBUF _IOW('T', 212, int) +#define TUNGETVNETHDRSZ _IOR('T', 215, int) +#define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETVNETLE _IOW('T', 220, int) +#define TUNSETVNETBE _IOW('T', 222, int) + +/* TUNSETIFF ifr flags */ +#define IFF_TAP 0x0002 +#define IFF_NO_PI 0x1000 +#define IFF_ONE_QUEUE 0x2000 +#define IFF_VNET_HDR 0x4000 +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 + +/* Features for GSO (TUNSETOFFLOAD). */ +#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ +#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ +#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ +#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. 
*/ +#define TUN_F_UFO 0x10 /* I can handle UFO packets */ + +/* Constants */ +#define TUN_DEF_SNDBUF (1ull << 20) +#define PATH_NET_TUN "/dev/net/tun" +#define VHOST_KERNEL_MAX_REGIONS 64 + +static uint64_t vhost_req_user_to_kernel[] = { + [VHOST_USER_SET_OWNER] = VHOST_SET_OWNER, + [VHOST_USER_RESET_OWNER] = VHOST_RESET_OWNER, + [VHOST_USER_SET_FEATURES] = VHOST_SET_FEATURES, + [VHOST_USER_GET_FEATURES] = VHOST_GET_FEATURES, + [VHOST_USER_SET_VRING_CALL] = VHOST_SET_VRING_CALL, + [VHOST_USER_SET_VRING_NUM] = VHOST_SET_VRING_NUM, + [VHOST_USER_SET_VRING_BASE] = VHOST_SET_VRING_BASE, + [VHOST_USER_GET_VRING_BASE] = VHOST_GET_VRING_BASE, + [VHOST_USER_SET_VRING_ADDR] = VHOST_SET_VRING_ADDR, + [VHOST_USER_SET_VRING_KICK] = VHOST_SET_VRING_KICK, + [VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE, +}; + +/* By default, vhost kernel module allows 64 regions, but DPDK allows + * 256 segments. As a relief, below function merges those virtually + * adjacent memsegs into one region. + */ +static struct vhost_memory_kernel * +prepare_vhost_memory_kernel(void) +{ + uint32_t i, j, k = 0; + struct rte_memseg *seg; + struct vhost_memory_region *mr; + struct vhost_memory_kernel *vm; + + vm = malloc(sizeof(struct vhost_memory_kernel) + + VHOST_KERNEL_MAX_REGIONS * + sizeof(struct vhost_memory_region)); + + for (i = 0; i < RTE_MAX_MEMSEG; ++i) { + seg = &rte_eal_get_configuration()->mem_config->memseg[i]; + if (!seg->addr) + break; + + int new_region = 1; + + for (j = 0; j < k; ++j) { + mr = &vm->regions[j]; + + if (mr->userspace_addr + mr->memory_size == + (uint64_t)seg->addr) { + mr->memory_size += seg->len; + new_region = 0; + break; + } + + if ((uint64_t)seg->addr + seg->len == + mr->userspace_addr) { + mr->guest_phys_addr = (uint64_t)seg->addr; + mr->userspace_addr = (uint64_t)seg->addr; + mr->memory_size += seg->len; + new_region = 0; + break; + } + } + + if (new_region == 0) + continue; + + mr = &vm->regions[k++]; + mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr 
here! */ + mr->userspace_addr = (uint64_t)seg->addr; + mr->memory_size = seg->len; + mr->mmap_offset = 0; + + if (k >= VHOST_KERNEL_MAX_REGIONS) { + free(vm); + return NULL; + } + } + + vm->nregions = k; + vm->padding = 0; + return vm; +} + +static const uint64_t guest_offloads_mask = + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | + (1ULL << VIRTIO_NET_F_GUEST_ECN) | + (1ULL << VIRTIO_NET_F_GUEST_UFO); + +static int +vhost_kernel_ioctl(struct vhost_internal *internal, + enum vhost_user_request req, + void *arg) +{ + int i, ret = -1; + uint64_t req_kernel; + struct vhost_memory_kernel *vm = NULL; + + req_kernel = vhost_req_user_to_kernel[req]; + + if (req_kernel == VHOST_SET_MEM_TABLE) { + vm = prepare_vhost_memory_kernel(); + if (!vm) + return -1; + arg = (void *)vm; + } + + /* Does not work when VIRTIO_F_IOMMU_PLATFORM now, why? */ + if (req_kernel == VHOST_SET_FEATURES) + *(uint64_t *)arg &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM); + + for (i = 0; i < VHOST_KERNEL_MAX_QUEUES; ++i) { + if (internal->vhostfds[i] < 0) + continue; + + ret = ioctl(internal->vhostfds[i], req_kernel, arg); + if (ret < 0) + break; + } + + if (!ret && req_kernel == VHOST_SET_FEATURES) + internal->features = *((uint64_t *)arg); + + /* with tap as the backend, all these features are supported but not + * claimed by vhost-net, so we add them back when reporting to upper + * layer + */ + if (!ret && req_kernel == VHOST_GET_FEATURES) + *((uint64_t *)arg) |= guest_offloads_mask; + + if (vm) + free(vm); + + return ret; +} + +/** + * Set up environment to talk with a vhost kernel backend. + * @param path + * - The path to vhost net (kernel) character file. + * + * @return + * - (-1) if fail to set up; + * - (>=0) if successful. 
+ */ +static int +vhost_kernel_setup(struct vhost_internal *internal, const char *path) +{ + int vhostfd; + uint32_t q; + uint32_t queue_pairs = 1; + + for (q = 0; q < queue_pairs; ++q) { + vhostfd = open(path, O_RDWR); + if (vhostfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s, %s", + path, strerror(errno)); + return -1; + } + + internal->vhostfds[q] = vhostfd; + } + + return 0; +} + +static int +_vhost_kernel_enable_queue_pair(int vhostfd, int tapfd) +{ + struct vhost_vring_file f; + + f.fd = tapfd; + f.index = 0; + if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) { + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", + strerror(errno)); + return -1; + } + + f.index = 1; + if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) { + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", + strerror(errno)); + return -1; + } + + return 0; +} + +static int +vhost_kernel_enable_queue_pair(struct vhost_internal *internal, + uint16_t pair_idx, + int enable) +{ + unsigned int offload = + TUN_F_CSUM | + TUN_F_TSO4 | + TUN_F_TSO6 | + TUN_F_TSO_ECN | + TUN_F_UFO; + unsigned int features; + int sndbuf = TUN_DEF_SNDBUF; + struct ifreq ifr; + int hdr_size; + int vhostfd; + int tapfd; + int req_mq; + + + vhostfd = internal->vhostfds[pair_idx]; + + if (!enable) { + if (internal->tapfds[pair_idx]) { + close(internal->tapfds[pair_idx]); + internal->tapfds[pair_idx] = -1; + } + return _vhost_kernel_enable_queue_pair(vhostfd, -1); + } + + if ((internal->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) || + (internal->features & (1ULL << VIRTIO_F_VERSION_1))) + hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + hdr_size = sizeof(struct virtio_net_hdr); + + req_mq = !!(internal->features & (1ULL << VIRTIO_NET_F_MQ)); + + /* TODO: + * 1. get/set offload capability, tap_probe_has_ufo, tap_fd_set_offload + * 2. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len + * 3. 
get number of memory regions from vhost module parameter + * max_mem_regions, supported in newer version linux kernel + */ + tapfd = open(PATH_NET_TUN, O_RDWR); + if (tapfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s: %s", + PATH_NET_TUN, strerror(errno)); + return -1; + } + + /* Construct ifr */ + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + + if (ioctl(tapfd, TUNGETFEATURES, &features) == -1) { + PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); + goto error; + } + if (features & IFF_ONE_QUEUE) + ifr.ifr_flags |= IFF_ONE_QUEUE; + + /* Let tap instead of vhost-net handle vnet header, as the latter does + * not support offloading. And in this case, we should not set feature + * bit VHOST_NET_F_VIRTIO_NET_HDR. + */ + if (features & IFF_VNET_HDR) { + ifr.ifr_flags |= IFF_VNET_HDR; + } else { + PMD_DRV_LOG(ERR, "TAP does not support IFF_VNET_HDR"); + goto error; + } + + if (req_mq) { + if (features & IFF_MULTI_QUEUE) + ifr.ifr_flags |= IFF_MULTI_QUEUE; + else { + PMD_DRV_LOG(ERR, "multiqueue not supported by kernel"); + goto error; + } + } + + if (internal->ifname) + strncpy(ifr.ifr_name, internal->ifname, IFNAMSIZ); + else + strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ); + if (ioctl(tapfd, TUNSETIFF, (void *)&ifr) == -1) { + PMD_DRV_LOG(ERR, "TUNSETIFF failed: %s", strerror(errno)); + goto error; + } + + fcntl(tapfd, F_SETFL, O_NONBLOCK); + + if (ioctl(tapfd, TUNSETVNETHDRSZ, &hdr_size) < 0) { + PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ failed: %s", strerror(errno)); + goto error; + } + + if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) { + PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno)); + goto error; + } + + if (_vhost_kernel_enable_queue_pair(vhostfd, tapfd) < 0) + goto error; + + /* Only try best to set offload */ + if (ioctl(tapfd, TUNSETOFFLOAD, offload) != 0) + PMD_DRV_LOG(ERR, "TUNSETOFFLOAD ioctl() failed: %s", + strerror(errno)); + + internal->tapfds[pair_idx] = tapfd; + if (!internal->ifname) + internal->ifname = 
strdup(ifr.ifr_name); + + return 0; +error: + return -1; +} + +struct vhost_ops vhost_ops_kernel = { + .setup = vhost_kernel_setup, + .control = vhost_kernel_ioctl, + .enable_qp = vhost_kernel_enable_queue_pair +}; -- 2.7.4