From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jianfeng Tan Subject: [PATCH v6 4/7] virtio-user: add vhost adapter layer Date: Thu, 2 Jun 2016 09:54:34 +0000 Message-ID: <1464861277-130265-5-git-send-email-jianfeng.tan@intel.com> References: <1446748276-132087-1-git-send-email-jianfeng.tan@intel.com> <1464861277-130265-1-git-send-email-jianfeng.tan@intel.com> Cc: Jianfeng Tan , Huawei Xie , rich.lane@bigswitch.com, yuanhan.liu@linux.intel.com, mst@redhat.com, nakajima.yoshihiro@lab.ntt.co.jp, p.fedin@samsung.com, ann.zhuangyanying@huawei.com, mukawa@igel.co.jp, nhorman@tuxdriver.com To: dev@dpdk.org Return-path: Received: from mga09.intel.com (mga09.intel.com [134.134.136.24]) by dpdk.org (Postfix) with ESMTP id 0A233568E for ; Thu, 2 Jun 2016 11:54:59 +0200 (CEST) In-Reply-To: <1464861277-130265-1-git-send-email-jianfeng.tan@intel.com> List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" This patch is to provide vhost adapter layer implementations. Instead of relying on a hypervisor to translate between device emulation and vhost backend, here we directly talk with vhost backend through the vhost file. Depending on the type of vhost file, - vhost-user is used if the given path points to a unix socket; - vhost-kernel is used if the given path points to a char device. Here three main APIs are provided to upper layer (device emulation): - vhost_user_setup(), to set up env to talk to a vhost user backend; - vhost_kernel_setup(), to set up env to talk to a vhost kernel backend. - vhost_call(), to provide a unified interface to communicate with vhost backend. ---------------------- | ------------------ | | | virtio driver | | | ------------------ | | | | | ------------------ | ------> virtio-user PMD | | device emulate | | | | | | | | vhost adapter |-|----> (vhost_user.c, vhost_kernel.c, vhost.c) | ------------------ | ---------------------- | | -------------- --> (vhost-user protocol or vhost-net ioctls) | ------------------ | vhost backend | ------------------ Signed-off-by: Huawei Xie Signed-off-by: Jianfeng Tan Acked-by: Neil Horman --- config/common_linuxapp | 3 + drivers/net/virtio/Makefile | 6 + drivers/net/virtio/virtio_user/vhost.c | 105 +++++++ drivers/net/virtio/virtio_user/vhost.h | 222 +++++++++++++++ drivers/net/virtio/virtio_user/vhost_kernel.c | 254 +++++++++++++++++ drivers/net/virtio/virtio_user/vhost_user.c | 378 ++++++++++++++++++++++++++ 6 files changed, 968 insertions(+) create mode 100644 drivers/net/virtio/virtio_user/vhost.c create mode 100644 drivers/net/virtio/virtio_user/vhost.h create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.c create mode 100644 drivers/net/virtio/virtio_user/vhost_user.c diff --git a/config/common_linuxapp b/config/common_linuxapp index 7e698e2..946a6d4 100644 --- a/config/common_linuxapp +++ b/config/common_linuxapp @@ -43,3 +43,6 @@ CONFIG_RTE_LIBRTE_VHOST=y CONFIG_RTE_LIBRTE_PMD_VHOST=y CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y CONFIG_RTE_LIBRTE_POWER=y + +# Enable virtio-user +CONFIG_RTE_VIRTIO_VDEV=y diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile index ef84f60..c9f2bc0 100644 --- a/drivers/net/virtio/Makefile +++ b/drivers/net/virtio/Makefile @@ -55,6 +55,12 @@ ifeq ($(findstring RTE_MACHINE_CPUFLAG_SSSE3,$(CFLAGS)),RTE_MACHINE_CPUFLAG_SSSE SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_rxtx_simple.c endif +ifeq ($(CONFIG_RTE_VIRTIO_VDEV),y) +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c +endif + # this lib depends upon: DEPDIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += lib/librte_eal lib/librte_ether DEPDIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += lib/librte_mempool lib/librte_mbuf diff --git a/drivers/net/virtio/virtio_user/vhost.c b/drivers/net/virtio/virtio_user/vhost.c new file mode 100644 index 0000000..1944a97 --- /dev/null +++ b/drivers/net/virtio/virtio_user/vhost.c @@ -0,0 +1,105 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vhost.h" + +static const char * const vhost_msg_strings[] = { + [VHOST_MSG_SET_OWNER] = "VHOST_MSG_SET_OWNER", + [VHOST_MSG_RESET_OWNER] = "VHOST_MSG_RESET_OWNER", + [VHOST_MSG_SET_FEATURES] = "VHOST_MSG_SET_FEATURES", + [VHOST_MSG_GET_FEATURES] = "VHOST_MSG_GET_FEATURES", + [VHOST_MSG_SET_VRING_CALL] = "VHOST_MSG_SET_VRING_CALL", + [VHOST_MSG_SET_VRING_NUM] = "VHOST_MSG_SET_VRING_NUM", + [VHOST_MSG_SET_VRING_BASE] = "VHOST_MSG_SET_VRING_BASE", + [VHOST_MSG_GET_VRING_BASE] = "VHOST_MSG_GET_VRING_BASE", + [VHOST_MSG_SET_VRING_ADDR] = "VHOST_MSG_SET_VRING_ADDR", + [VHOST_MSG_SET_VRING_KICK] = "VHOST_MSG_SET_VRING_KICK", + [VHOST_MSG_SET_MEM_TABLE] = "VHOST_MSG_SET_MEM_TABLE", + NULL, +}; + +static uint64_t vhost_req_map[][2] = { + [VHOST_MSG_SET_OWNER] = { + VHOST_SET_OWNER, VHOST_USER_SET_OWNER + }, + [VHOST_MSG_RESET_OWNER] = { + VHOST_RESET_OWNER, VHOST_USER_RESET_OWNER + }, + [VHOST_MSG_SET_FEATURES] = { + VHOST_SET_FEATURES, VHOST_USER_SET_FEATURES + }, + [VHOST_MSG_GET_FEATURES] = { + VHOST_GET_FEATURES, VHOST_USER_GET_FEATURES + }, + [VHOST_MSG_SET_VRING_CALL] = { + VHOST_SET_VRING_CALL, VHOST_USER_SET_VRING_CALL + }, + [VHOST_MSG_SET_VRING_NUM] = { + VHOST_SET_VRING_NUM, VHOST_USER_SET_VRING_NUM + }, + [VHOST_MSG_SET_VRING_BASE] = { + VHOST_SET_VRING_BASE, VHOST_USER_SET_VRING_BASE + }, + [VHOST_MSG_GET_VRING_BASE] = { + VHOST_GET_VRING_BASE, VHOST_USER_GET_VRING_BASE + }, + [VHOST_MSG_SET_VRING_ADDR] = { + VHOST_SET_VRING_ADDR, VHOST_USER_SET_VRING_ADDR + }, + [VHOST_MSG_SET_VRING_KICK] = { + VHOST_SET_VRING_KICK, VHOST_USER_SET_VRING_KICK + }, + [VHOST_MSG_SET_MEM_TABLE] = { + VHOST_SET_MEM_TABLE, VHOST_USER_SET_MEM_TABLE + }, +}; + +int +vhost_call(int vhostfd, int type, uint64_t req, void *arg) +{ + uint64_t req_new; + int ret; + + PMD_DRV_LOG(INFO, "%s", vhost_msg_strings[req]); + + req_new = vhost_req_map[req][type]; + if (type == VHOST_USER) + ret = vhost_user_sock(vhostfd, req_new, arg); + else + ret = vhost_kernel_ioctl(vhostfd, req_new, arg); + + if (ret < 0) + PMD_DRV_LOG(ERR, "vhost_call %s failed: %s\n", + vhost_msg_strings[req], strerror(errno)); + return ret; +} diff --git a/drivers/net/virtio/virtio_user/vhost.h b/drivers/net/virtio/virtio_user/vhost.h new file mode 100644 index 0000000..6bac1e8 --- /dev/null +++ b/drivers/net/virtio/virtio_user/vhost.h @@ -0,0 +1,222 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include +#include +#include + +#include "../virtio_pci.h" +#include "../virtio_logs.h" +#include "../virtqueue.h" + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; +}; + +struct vhost_vring_addr { + unsigned int index; + /* Option flags. */ + unsigned int flags; + /* Flag values: */ + /* Whether log address is valid. If set enables logging. */ +#define VHOST_VRING_F_LOG 0 + + /* Start of array of descriptors (virtually contiguous) */ + uint64_t desc_user_addr; + /* Used structure address. Must be 32 bit aligned */ + uint64_t used_user_addr; + /* Available structure address. Must be 16 bit aligned */ + uint64_t avail_user_addr; + /* Logging support. */ + /* Log writes to used structure, at offset calculated from specified + * address. Address must be 32 bit aligned. + */ + uint64_t log_guest_addr; +}; + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_MAX +}; + +struct vhost_memory_region { + uint64_t guest_phys_addr; + uint64_t memory_size; /* bytes */ + uint64_t userspace_addr; + uint64_t mmap_offset; +}; + +struct vhost_memory_kernel { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[0]; +}; + +struct vhost_memory { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[VHOST_MEMORY_MAX_NREGIONS]; +}; + +struct vhost_user_msg { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_memory memory; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)); + +#define VHOST_USER_HDR_SIZE offsetof(struct vhost_user_msg, payload.u64) +#define VHOST_USER_PAYLOAD_SIZE \ + (sizeof(struct vhost_user_msg) - VHOST_USER_HDR_SIZE) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel) +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +/*****************************************************************************/ + +/* Ioctl defines */ +#define TUNSETIFF _IOW('T', 202, int) +#define TUNGETFEATURES _IOR('T', 207, unsigned int) +#define TUNSETOFFLOAD _IOW('T', 208, unsigned int) +#define TUNGETIFF _IOR('T', 210, unsigned int) +#define TUNSETSNDBUF _IOW('T', 212, int) +#define TUNGETVNETHDRSZ _IOR('T', 215, int) +#define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNSETQUEUE _IOW('T', 217, int) +#define TUNSETVNETLE _IOW('T', 220, int) +#define TUNSETVNETBE _IOW('T', 222, int) + +/* TUNSETIFF ifr flags */ +#define IFF_TAP 0x0002 +#define IFF_NO_PI 0x1000 +#define IFF_ONE_QUEUE 0x2000 +#define IFF_VNET_HDR 0x4000 +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 + +/* Features for GSO (TUNSETOFFLOAD). */ +#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */ +#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */ +#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */ +#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */ +#define TUN_F_UFO 0x10 /* I can handle UFO packets */ + +enum { + VHOST_MSG_SET_OWNER, + VHOST_MSG_RESET_OWNER, + VHOST_MSG_SET_FEATURES, + VHOST_MSG_GET_FEATURES, + VHOST_MSG_SET_VRING_CALL, + VHOST_MSG_SET_VRING_NUM, + VHOST_MSG_SET_VRING_BASE, + VHOST_MSG_GET_VRING_BASE, + VHOST_MSG_SET_VRING_ADDR, + VHOST_MSG_SET_VRING_KICK, + VHOST_MSG_SET_MEM_TABLE, + VHOST_MSG_MAX, +}; + +#define VHOST_KERNEL 0 +#define VHOST_USER 1 + +int vhost_user_sock(int vhostfd, uint64_t req, void *arg); +int vhost_user_setup(const char *path); + +int vhost_kernel_ioctl(int vhostfd, uint64_t req, void *arg); +int vhost_kernel_setup(const char *path, const char *ifname, int *p_tapfd); +int vhost_kernel_post(int vhostfd, int tapfd, uint64_t features, uint32_t nvqs); + +int vhost_call(int vhostfd, int type, uint64_t req, void *arg); + +#endif diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c new file mode 100644 index 0000000..41196cf --- /dev/null +++ b/drivers/net/virtio/virtio_user/vhost_kernel.c @@ -0,0 +1,254 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "vhost.h" + +#define TUN_DEF_SNDBUF (1ull << 20) +#define PATH_NET_TUN "/dev/net/tun" + +/** Merge those virtually adjacent memsegs into one region. + */ +static void +prepare_vhost_memory_kernel(struct vhost_memory_kernel **p_vm) +{ + uint32_t i, j, k = 0; + struct rte_memseg *seg; + struct vhost_memory_region *mr; + struct vhost_memory_kernel *vm; + + vm = malloc(sizeof(struct vhost_memory_kernel) + + RTE_MAX_MEMSEG * sizeof(struct vhost_memory_region)); + + for (i = 0; i < RTE_MAX_MEMSEG; ++i) { + seg = &rte_eal_get_configuration()->mem_config->memseg[i]; + if (!seg->addr) + break; + + int new_region = 1; + + for (j = 0; j < k; ++j) { + mr = &vm->regions[j]; + + if (mr->userspace_addr + mr->memory_size == + (uint64_t)seg->addr) { + mr->memory_size += seg->len; + new_region = 0; + break; + } + + if ((uint64_t)seg->addr + seg->len == + mr->userspace_addr) { + mr->guest_phys_addr = (uint64_t)seg->addr; + mr->userspace_addr = (uint64_t)seg->addr; + mr->memory_size += seg->len; + new_region = 0; + break; + } + } + + if (new_region == 0) + continue; + + mr = &vm->regions[k++]; + mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr here! */ + mr->userspace_addr = (uint64_t)seg->addr; + mr->memory_size = seg->len; + mr->mmap_offset = 0; + } + + vm->nregions = k; + vm->padding = 0; + *p_vm = vm; +} + +int +vhost_kernel_ioctl(int vhostfd, uint64_t req, void *arg) +{ + int ret; + struct vhost_memory_kernel *vm = NULL; + + if (req == VHOST_SET_MEM_TABLE) { + prepare_vhost_memory_kernel(&vm); + arg = (void *)vm; + } + + ret = ioctl(vhostfd, req, arg); + + if (req == VHOST_SET_MEM_TABLE) + free(vm); + + return ret; +} + +/** + * Set up environment to talk with a vhost kernel backend. + * @param path + * - The path to vhost net (kernel) character file. + * + * @param ifname + * - Specify the tap device name if any, or NULL. + * + * @param p_tapfd + * - Pointer to store the fd of tap device. + * + * @return + * - (-1) if fail to set up; + * - (>=0) if successful, and it is the fd to vhostfd. + */ +int +vhost_kernel_setup(const char *path, const char *ifname, int *p_tapfd) +{ + int vhostfd, tapfd; + int len = sizeof(struct virtio_net_hdr_mrg_rxbuf); + int req_mq = 0; + int sndbuf = TUN_DEF_SNDBUF; + unsigned int features; + struct ifreq ifr; + + /* TODO: + * 1. get/set offload capability, tap_probe_has_ufo, tap_fd_set_offload + * 2. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len + * 3. get number of memory regions from vhost module parameter + * max_mem_regions, supported in newer version linux kernel + */ + + tapfd = open(PATH_NET_TUN, O_RDWR); + if (tapfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s: %s", + PATH_NET_TUN, strerror(errno)); + return -1; + } + + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + + if (ioctl(tapfd, TUNGETFEATURES, &features) == -1) { + PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno)); + goto error; + } + + if (features & IFF_ONE_QUEUE) + ifr.ifr_flags |= IFF_ONE_QUEUE; + + if (features & IFF_VNET_HDR) + ifr.ifr_flags |= IFF_VNET_HDR; + else { + PMD_DRV_LOG(ERR, "vnet_hdr not supported by kernel"); + goto error; + } + + if (req_mq) { + if (features & IFF_MULTI_QUEUE) + ifr.ifr_flags |= IFF_MULTI_QUEUE; + else { + PMD_DRV_LOG(ERR, "multiqueue not supported by kernel"); + goto error; + } + } + + if (ifname) + strncpy(ifr.ifr_name, ifname, IFNAMSIZ); + else + strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ); + if (ioctl(tapfd, TUNSETIFF, (void *)&ifr) == -1) { + PMD_DRV_LOG(ERR, "TUNSETIFF failed: %s", strerror(errno)); + goto error; + } + fcntl(tapfd, F_SETFL, O_NONBLOCK); + + if (ioctl(tapfd, TUNSETVNETHDRSZ, &len) < 0) { + PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ failed: %s", strerror(errno)); + goto error; + } + + if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) { + PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno)); + goto error; + } + + vhostfd = open(path, O_RDWR); + if (vhostfd < 0) { + PMD_DRV_LOG(ERR, "fail to open %s, %s", path, strerror(errno)); + goto error; + } + + *p_tapfd = tapfd; + return vhostfd; + +error: + close(tapfd); + return -1; +} + +int +vhost_kernel_post(int vhostfd, int tapfd, uint64_t features, uint32_t nvqs) +{ + struct vhost_vring_file f; + uint16_t hdr_size; + int ret; + + if ((features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) || + (features & (1ULL << VIRTIO_F_VERSION_1))) + hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); + else + hdr_size = sizeof(struct virtio_net_hdr); + + if (ioctl(tapfd, TUNSETVNETHDRSZ, &hdr_size) == -1) { + PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ fails, %s", strerror(errno)); + return -1; + } + + f.fd = tapfd; + for (f.index = 0; f.index < nvqs; ++f.index) { + ret = vhost_kernel_ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f); + if (ret < 0) { + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s", + strerror(errno)); + return -1; + } + } + + return 0; +} diff --git a/drivers/net/virtio/virtio_user/vhost_user.c b/drivers/net/virtio/virtio_user/vhost_user.c new file mode 100644 index 0000000..b0a84cf --- /dev/null +++ b/drivers/net/virtio/virtio_user/vhost_user.c @@ -0,0 +1,378 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +static int +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = fd_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + + do { + r = sendmsg(fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + return r; +} + +static int +vhost_user_read(int fd, struct vhost_user_msg *msg) +{ + uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + int ret, sz_hdr = VHOST_USER_HDR_SIZE, sz_payload; + + ret = recv(fd, (void *)msg, sz_hdr, 0); + if (ret < sz_hdr) { + PMD_DRV_LOG(ERR, "Failed to recv msg hdr: %d instead of %d.", + ret, sz_hdr); + goto fail; + } + + /* validate msg flags */ + if (msg->flags != (valid_flags)) { + PMD_DRV_LOG(ERR, "Failed to recv msg: flags %x instead of %x.", + msg->flags, valid_flags); + goto fail; + } + + sz_payload = msg->size; + if (sz_payload) { + ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0); + if (ret < sz_payload) { + PMD_DRV_LOG(ERR, "Failed to recv msg payload: %d instead of %d.", + ret, msg->size); + goto fail; + } + } + + return 0; + +fail: + return -1; +} + +struct hugepage_file_info { + uint64_t addr; /**< virtual addr */ + size_t size; /**< the file size */ + char path[PATH_MAX]; /**< path to backing file */ +}; + +/* Two possible options: + * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file + * array. This is simple but cannot be used in secondary process because + * secondary process will close and munmap that file. + * 2. Match HUGEFILE_FMT to find hugepage files directly. + * + * We choose option 2. + */ +static int +get_hugepage_file_info(struct hugepage_file_info huges[], int max) +{ + int idx; + FILE *f; + char buf[BUFSIZ], *tmp, *tail; + char *str_underline, *str_start; + int huge_index; + uint64_t v_start, v_end; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + PMD_DRV_LOG(ERR, "cannot open /proc/self/maps"); + return -1; + } + + idx = 0; + while (fgets(buf, sizeof(buf), f) != NULL) { + if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { + PMD_DRV_LOG(ERR, "Failed to parse address"); + goto error; + } + + tmp = strchr(buf, ' ') + 1; /** skip address */ + tmp = strchr(tmp, ' ') + 1; /** skip perm */ + tmp = strchr(tmp, ' ') + 1; /** skip offset */ + tmp = strchr(tmp, ' ') + 1; /** skip dev */ + tmp = strchr(tmp, ' ') + 1; /** skip inode */ + while (*tmp == ' ') /** skip spaces */ + tmp++; + tail = strrchr(tmp, '\n'); /** remove newline if exists */ + if (tail) + *tail = '\0'; + + /* Match HUGEFILE_FMT, aka "%s/%smap_%d", + * which is defined in eal_filesystem.h + */ + str_underline = strrchr(tmp, '_'); + if (!str_underline) + continue; + + str_start = str_underline - strlen("map"); + if (str_start < tmp) + continue; + + if (sscanf(str_start, "map_%d", &huge_index) != 1) + continue; + + if (idx >= max) { + PMD_DRV_LOG(ERR, "Exceed maximum of %d", max); + goto error; + } + huges[idx].addr = v_start; + huges[idx].size = v_end - v_start; + strcpy(huges[idx].path, tmp); + idx++; + } + + fclose(f); + return idx; + +error: + fclose(f); + return -1; +} + +static int +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) +{ + int i, num; + struct hugepage_file_info huges[VHOST_MEMORY_MAX_NREGIONS]; + struct vhost_memory_region *mr; + + num = get_hugepage_file_info(huges, VHOST_MEMORY_MAX_NREGIONS); + if (num < 0) { + PMD_INIT_LOG(ERR, "Failed to prepare memory for vhost-user"); + return -1; + } + + for (i = 0; i < num; ++i) { + mr = &msg->payload.memory.regions[i]; + mr->guest_phys_addr = huges[i].addr; /* use vaddr! */ + mr->userspace_addr = huges[i].addr; + mr->memory_size = huges[i].size; + mr->mmap_offset = 0; + fds[i] = open(huges[i].path, O_RDWR); + } + + msg->payload.memory.nregions = num; + msg->payload.memory.padding = 0; + + return 0; +} + +static struct vhost_user_msg m; + +int +vhost_user_sock(int vhostfd, uint64_t req, void *arg) +{ + struct vhost_user_msg msg; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num = 0; + int i, len; + + msg.request = req; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (req) { + case VHOST_USER_GET_FEATURES: + need_reply = 1; + break; + + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_LOG_BASE: + msg.payload.u64 = *((__u64 *)arg); + msg.size = sizeof(m.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + break; + + case VHOST_USER_SET_MEM_TABLE: + if (prepare_vhost_memory_user(&msg, fds) < 0) + return -1; + fd_num = msg.payload.memory.nregions; + msg.size = sizeof(m.payload.memory.nregions); + msg.size += sizeof(m.payload.memory.padding); + msg.size += fd_num * sizeof(struct vhost_memory_region); + break; + + case VHOST_USER_SET_LOG_FD: + fds[fd_num++] = *((int *)arg); + break; + + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(m.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(m.payload.state); + need_reply = 1; + break; + + case VHOST_USER_SET_VRING_ADDR: + memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); + msg.size = sizeof(m.payload.addr); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + file = arg; + msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(m.payload.u64); + if (file->fd > 0) + fds[fd_num++] = file->fd; + else + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + break; + + default: + PMD_DRV_LOG(ERR, "trying to send unhandled msg type"); + return -1; + } + + len = VHOST_USER_HDR_SIZE + msg.size; + if (vhost_user_write(vhostfd, &msg, len, fds, fd_num) < 0) + return 0; + + if (req == VHOST_USER_SET_MEM_TABLE) + for (i = 0; i < fd_num; ++i) + close(fds[i]); + + if (need_reply) { + if (vhost_user_read(vhostfd, &msg) < 0) + return -1; + + if (req != msg.request) { + PMD_DRV_LOG(ERR, "Received unexpected msg type."); + return -1; + } + + switch (req) { + case VHOST_USER_GET_FEATURES: + if (msg.size != sizeof(m.payload.u64)) { + PMD_DRV_LOG(ERR, "Received bad msg size."); + return -1; + } + *((__u64 *)arg) = msg.payload.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(m.payload.state)) { + PMD_DRV_LOG(ERR, "Received bad msg size."); + return -1; + } + memcpy(arg, &msg.payload.state, + sizeof(struct vhost_vring_state)); + break; + default: + PMD_DRV_LOG(ERR, "Received unexpected msg type."); + return -1; + } + } + + return 0; +} + +/** + * Set up environment to talk with a vhost user backend. + * @param path + * - The path to vhost user unix socket file. + * + * @return + * - (-1) if fail to set up; + * - (>=0) if successful, and it is the fd to vhostfd. + */ +int +vhost_user_setup(const char *path) +{ + int fd; + int flag; + struct sockaddr_un un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + PMD_DRV_LOG(ERR, "socket() error, %s", strerror(errno)); + return -1; + } + + flag = fcntl(fd, F_GETFD); + fcntl(fd, F_SETFD, flag | FD_CLOEXEC); + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + snprintf(un.sun_path, sizeof(un.sun_path), "%s", path); + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + PMD_DRV_LOG(ERR, "connect error, %s", strerror(errno)); + close(fd); + return -1; + } + + return fd; +} -- 2.1.4