From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jason Wang Subject: [PATCH 01/12] tap: multiqueue support Date: Fri, 28 Dec 2012 18:31:53 +0800 Message-ID: <1356690724-37891-2-git-send-email-jasowang@redhat.com> References: <1356690724-37891-1-git-send-email-jasowang@redhat.com> Cc: rusty@rustcorp.com.au, kvm@vger.kernel.org, mprivozn@redhat.com, shiyer@redhat.com, krkumar2@in.ibm.com, jwhan@filewood.snu.ac.kr, Jason Wang To: mst@redhat.com, aliguori@us.ibm.com, stefanha@redhat.com, qemu-devel@nongnu.org Return-path: Received: from mx1.redhat.com ([209.132.183.28]:30211 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752979Ab2L1KlL (ORCPT ); Fri, 28 Dec 2012 05:41:11 -0500 In-Reply-To: <1356690724-37891-1-git-send-email-jasowang@redhat.com> Sender: kvm-owner@vger.kernel.org List-ID: Recently, Linux gained support for multiqueue tap, which lets userspace call TUNSETIFF on a single device many times to create multiple file descriptors that act as independent queues. Userspace can also enable/disable a specific queue through TUNSETQUEUE. This patch adds the generic infrastructure to create multiqueue taps. To achieve this, a new parameter "queues" is introduced to specify how many queues are expected to be created for the tap. The "fd" parameter is also changed to support a list of file descriptors, which can be used by management software (such as libvirt) to pass pre-created file descriptors (queues) to qemu. Each TAPState is still associated with one tap fd, which means multiple TAPStates are created when the user needs a multiqueue tap. Only the Linux part is implemented for now, since it is the only OS that supports multiqueue tap. 
Signed-off-by: Jason Wang --- net/tap-aix.c | 18 ++++- net/tap-bsd.c | 18 ++++- net/tap-haiku.c | 18 ++++- net/tap-linux.c | 70 +++++++++++++++- net/tap-linux.h | 4 + net/tap-solaris.c | 18 ++++- net/tap-win32.c | 10 ++ net/tap.c | 248 +++++++++++++++++++++++++++++++++++++---------------- net/tap.h | 8 ++- qapi-schema.json | 5 +- 10 files changed, 335 insertions(+), 82 deletions(-) diff --git a/net/tap-aix.c b/net/tap-aix.c index f27c177..f931ef3 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -25,7 +25,8 @@ #include "net/tap.h" #include -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, "no tap on AIX\n"); return -1; @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ + return -1; +} + +int tap_fd_detach(int fd) +{ + return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ + return -1; +} diff --git a/net/tap-bsd.c b/net/tap-bsd.c index a3b717d..07c287d 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -33,7 +33,8 @@ #include #endif -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { int fd; #ifdef TAPGIFNAME @@ -145,3 +146,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ + return -1; +} + +int tap_fd_detach(int fd) +{ + return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ + return -1; +} diff --git a/net/tap-haiku.c b/net/tap-haiku.c index 34739d1..62ab423 100644 --- a/net/tap-haiku.c +++ b/net/tap-haiku.c @@ -25,7 +25,8 @@ #include "net/tap.h" #include -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int 
vnet_hdr_required, int mq_required) { fprintf(stderr, "no tap on Haiku\n"); return -1; @@ -59,3 +60,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ + return -1; +} + +int tap_fd_detach(int fd) +{ + return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ + return -1; +} diff --git a/net/tap-linux.c b/net/tap-linux.c index c6521be..0854ef5 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -35,7 +35,8 @@ #define PATH_NET_TUN "/dev/net/tun" -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { struct ifreq ifr; int fd, ret; @@ -67,6 +68,20 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required } } + if (mq_required) { + unsigned int features; + + if ((ioctl(fd, TUNGETFEATURES, &features) != 0) || + !(features & IFF_MULTI_QUEUE)) { + error_report("multiqueue required, but no kernel " + "support for IFF_MULTI_QUEUE available"); + close(fd); + return -1; + } else { + ifr.ifr_flags |= IFF_MULTI_QUEUE; + } + } + if (ifname[0] != '\0') pstrcpy(ifr.ifr_name, IFNAMSIZ, ifname); else @@ -200,3 +215,56 @@ void tap_fd_set_offload(int fd, int csum, int tso4, } } } + +/* Attach a file descriptor to a TUN/TAP device. This descriptor should be + * detached before. + */ +int tap_fd_attach(int fd) +{ + struct ifreq ifr; + int ret; + + memset(&ifr, 0, sizeof(ifr)); + + ifr.ifr_flags = IFF_ATTACH_QUEUE; + ret = ioctl(fd, TUNSETQUEUE, (void *) &ifr); + + if (ret != 0) { + error_report("could not attach fd to tap"); + } + + return ret; +} + +/* Detach a file descriptor to a TUN/TAP device. This file descriptor must have + * been attach to a device. 
+ */ +int tap_fd_detach(int fd) +{ + struct ifreq ifr; + int ret; + + memset(&ifr, 0, sizeof(ifr)); + + ifr.ifr_flags = IFF_DETACH_QUEUE; + ret = ioctl(fd, TUNSETQUEUE, (void *) &ifr); + + if (ret != 0) { + error_report("could not detach fd"); + } + + return ret; +} + +int tap_get_ifname(int fd, char *ifname) +{ + struct ifreq ifr; + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) { + error_report("TUNGETIFF ioctl() failed: %s", strerror(errno)); + return -1; + } + + pstrcpy(ifname, sizeof(ifr.ifr_name), ifr.ifr_name); + return 0; +} diff --git a/net/tap-linux.h b/net/tap-linux.h index 659e981..648d29f 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -29,6 +29,7 @@ #define TUNSETSNDBUF _IOW('T', 212, int) #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNSETQUEUE _IOW('T', 217, int) #endif @@ -36,6 +37,9 @@ #define IFF_TAP 0x0002 #define IFF_NO_PI 0x1000 #define IFF_VNET_HDR 0x4000 +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. 
*/ diff --git a/net/tap-solaris.c b/net/tap-solaris.c index 5d6ac42..2df3ec1 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -173,7 +173,8 @@ static int tap_alloc(char *dev, size_t dev_size) return tap_fd; } -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { char dev[10]=""; int fd; @@ -225,3 +226,18 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_attach(int fd) +{ + return -1; +} + +int tap_fd_detach(int fd) +{ + return -1; +} + +int tap_fd_ifname(int fd, char *ifname) +{ + return -1; +} diff --git a/net/tap-win32.c b/net/tap-win32.c index f9bd741..d7b1f7a 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -763,3 +763,13 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len) { assert(0); } + +int tap_attach(NetClientState *nc) +{ + assert(0); +} + +int tap_detach(NetClientState *nc) +{ + assert(0); +} diff --git a/net/tap.c b/net/tap.c index 1abfd44..01f826a 100644 --- a/net/tap.c +++ b/net/tap.c @@ -60,6 +60,7 @@ typedef struct TAPState { unsigned int write_poll : 1; unsigned int using_vnet_hdr : 1; unsigned int has_ufo: 1; + unsigned int enabled:1; VHostNetState *vhost_net; unsigned host_vnet_hdr_len; } TAPState; @@ -73,9 +74,9 @@ static void tap_writable(void *opaque); static void tap_update_fd_handler(TAPState *s) { qemu_set_fd_handler2(s->fd, - s->read_poll ? tap_can_send : NULL, - s->read_poll ? tap_send : NULL, - s->write_poll ? tap_writable : NULL, + s->read_poll && s->enabled ? tap_can_send : NULL, + s->read_poll && s->enabled ? tap_send : NULL, + s->write_poll && s->enabled ? tap_writable : NULL, s); } @@ -340,6 +341,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer, s->host_vnet_hdr_len = vnet_hdr ? 
sizeof(struct virtio_net_hdr) : 0; s->using_vnet_hdr = 0; s->has_ufo = tap_probe_has_ufo(s->fd); + s->enabled = 1; tap_set_offload(&s->nc, 0, 0, 0, 0, 0); /* * Make sure host header length is set correctly in tap: @@ -559,17 +561,10 @@ int net_init_bridge(const NetClientOptions *opts, const char *name, static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, const char *setup_script, char *ifname, - size_t ifname_sz) + size_t ifname_sz, int mq_required) { int fd, vnet_hdr_required; - if (tap->has_ifname) { - pstrcpy(ifname, ifname_sz, tap->ifname); - } else { - assert(ifname_sz > 0); - ifname[0] = '\0'; - } - if (tap->has_vnet_hdr) { *vnet_hdr = tap->vnet_hdr; vnet_hdr_required = *vnet_hdr; @@ -578,7 +573,8 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, vnet_hdr_required = 0; } - TFR(fd = tap_open(ifname, ifname_sz, vnet_hdr, vnet_hdr_required)); + TFR(fd = tap_open(ifname, ifname_sz, vnet_hdr, vnet_hdr_required, + mq_required)); if (fd < 0) { return -1; } @@ -594,69 +590,37 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, return fd; } -int net_init_tap(const NetClientOptions *opts, const char *name, - NetClientState *peer) -{ - const NetdevTapOptions *tap; - - int fd, vnet_hdr = 0; - const char *model; - TAPState *s; +#define MAX_TAP_QUEUES 1024 - /* for the no-fd, no-helper case */ - const char *script = NULL; /* suppress wrong "uninit'd use" gcc warning */ - char ifname[128]; - - assert(opts->kind == NET_CLIENT_OPTIONS_KIND_TAP); - tap = opts->tap; - - if (tap->has_fd) { - if (tap->has_ifname || tap->has_script || tap->has_downscript || - tap->has_vnet_hdr || tap->has_helper) { - error_report("ifname=, script=, downscript=, vnet_hdr=, " - "and helper= are invalid with fd="); - return -1; - } - - fd = monitor_handle_fd_param(cur_mon, tap->fd); - if (fd == -1) { - return -1; - } - - fcntl(fd, F_SETFL, O_NONBLOCK); - - vnet_hdr = tap_probe_vnet_hdr(fd); - - model = "tap"; - - } else if (tap->has_helper) { - 
if (tap->has_ifname || tap->has_script || tap->has_downscript || - tap->has_vnet_hdr) { - error_report("ifname=, script=, downscript=, and vnet_hdr= " - "are invalid with helper="); - return -1; - } - - fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE); - if (fd == -1) { - return -1; - } +static int tap_fd(const StringList *fd, const char **fds) +{ + const StringList *c = fd; + size_t i = 0, num_opts = 0; - fcntl(fd, F_SETFL, O_NONBLOCK); + while (c) { + num_opts++; + c = c->next; + } - vnet_hdr = tap_probe_vnet_hdr(fd); + if (num_opts == 0) { + return 0; + } - model = "bridge"; + c = fd; + while (c) { + fds[i++] = c->value->str; + c = c->next; + } - } else { - script = tap->has_script ? tap->script : DEFAULT_NETWORK_SCRIPT; - fd = net_tap_init(tap, &vnet_hdr, script, ifname, sizeof ifname); - if (fd == -1) { - return -1; - } + return num_opts; +} - model = "tap"; - } +static int __net_init_tap(const NetdevTapOptions *tap, NetClientState *peer, + const char *model, const char *name, + const char *ifname, const char *script, + const char *downscript, int vnet_hdr, int fd) +{ + TAPState *s; s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); if (!s) { @@ -674,11 +638,6 @@ int net_init_tap(const NetClientOptions *opts, const char *name, snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s", tap->helper); } else { - const char *downscript; - - downscript = tap->has_downscript ? 
tap->downscript : - DEFAULT_NETWORK_DOWN_SCRIPT; - snprintf(s->nc.info_str, sizeof(s->nc.info_str), "ifname=%s,script=%s,downscript=%s", ifname, script, downscript); @@ -716,9 +675,150 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return 0; } +int net_init_tap(const NetClientOptions *opts, const char *name, + NetClientState *peer) +{ + const NetdevTapOptions *tap; + const char *fds[MAX_TAP_QUEUES]; + int fd, vnet_hdr = 0, i, queues; + /* for the no-fd, no-helper case */ + const char *script = NULL; /* suppress wrong "uninit'd use" gcc warning */ + const char *downscript = NULL; + char ifname[128]; + + assert(opts->kind == NET_CLIENT_OPTIONS_KIND_TAP); + tap = opts->tap; + queues = tap->has_queues ? tap->queues : 1; + + if (tap->has_fd) { + if (tap->has_ifname || tap->has_script || tap->has_downscript || + tap->has_vnet_hdr || tap->has_helper) { + error_report("ifname=, script=, downscript=, vnet_hdr=, " + "and helper= are invalid with fd="); + return -1; + } + + if (queues != tap_fd(tap->fd, fds)) { + error_report("the number of fds were not equal to queues"); + return -1; + } + + for (i = 0; i < queues; i++) { + fd = monitor_handle_fd_param(cur_mon, fds[i]); + if (fd == -1) { + return -1; + } + + fcntl(fd, F_SETFL, O_NONBLOCK); + + if (i == 0) { + vnet_hdr = tap_probe_vnet_hdr(fd); + } + + if (__net_init_tap(tap, peer, "tap", name, ifname, + script, downscript, vnet_hdr, fd)) { + return -1; + } + } + } else if (tap->has_helper) { + if (tap->has_ifname || tap->has_script || tap->has_downscript || + tap->has_vnet_hdr) { + error_report("ifname=, script=, downscript=, and vnet_hdr= " + "are invalid with helper="); + return -1; + } + + /* FIXME: correct ? 
*/ + for (i = 0; i < queues; i++) { + fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE); + if (fd == -1) { + return -1; + } + + fcntl(fd, F_SETFL, O_NONBLOCK); + + if (i == 0) { + vnet_hdr = tap_probe_vnet_hdr(fd); + } + + if (__net_init_tap(tap, peer, "bridge", name, ifname, + script, downscript, vnet_hdr, fd)) { + return -1; + } + } + } else { + script = tap->has_script ? tap->script : DEFAULT_NETWORK_SCRIPT; + downscript = tap->has_downscript ? tap->downscript : + DEFAULT_NETWORK_DOWN_SCRIPT; + + if (tap->has_ifname) { + pstrcpy(ifname, sizeof ifname, tap->ifname); + } else { + ifname[0] = '\0'; + } + + for (i = 0; i < queues; i++) { + fd = net_tap_init(tap, &vnet_hdr, i >= 1 ? "no" : script, + ifname, sizeof ifname, queues > 1); + if (fd == -1) { + return -1; + } + + if (i == 0 && tap_get_ifname(fd, ifname) != 0) { + error_report("could not get ifname"); + return -1; + } + + if (__net_init_tap(tap, peer, "tap", name, ifname, + i >= 1 ? "no" : script, + i >= 1 ? "no" : downscript, + vnet_hdr, fd)) { + return -1; + } + } + } + + return 0; +} + VHostNetState *tap_get_vhost_net(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP); return s->vhost_net; } + +int tap_attach(NetClientState *nc) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + int ret; + + if (s->enabled) { + return 0; + } else { + ret = tap_fd_attach(s->fd); + if (ret == 0) { + s->enabled = 1; + tap_update_fd_handler(s); + } + return ret; + } +} + +int tap_detach(NetClientState *nc) +{ + TAPState *s = DO_UPCAST(TAPState, nc, nc); + int ret; + + if (s->enabled == 0) { + return 0; + } else { + ret = tap_fd_detach(s->fd); + if (ret == 0) { + qemu_purge_queued_packets(nc); + s->enabled = 0; + tap_update_fd_handler(s); + } + return ret; + } +} diff --git a/net/tap.h b/net/tap.h index d44d83a..02f154e 100644 --- a/net/tap.h +++ b/net/tap.h @@ -32,7 +32,8 @@ #define DEFAULT_NETWORK_SCRIPT "/etc/qemu-ifup" #define 
DEFAULT_NETWORK_DOWN_SCRIPT "/etc/qemu-ifdown" -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required); +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required); ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen); @@ -49,6 +50,11 @@ int tap_probe_vnet_hdr_len(int fd, int len); int tap_probe_has_ufo(int fd); void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); void tap_fd_set_vnet_hdr_len(int fd, int len); +int tap_fd_attach(int fd); +int tap_fd_detach(int fd); +int tap_attach(NetClientState *nc); +int tap_detach(NetClientState *nc); +int tap_get_ifname(int fd, char *ifname); int tap_get_fd(NetClientState *nc); diff --git a/qapi-schema.json b/qapi-schema.json index 5dfa052..583eb7c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -2465,7 +2465,7 @@ { 'type': 'NetdevTapOptions', 'data': { '*ifname': 'str', - '*fd': 'str', + '*fd': ['String'], '*script': 'str', '*downscript': 'str', '*helper': 'str', @@ -2473,7 +2473,8 @@ '*vnet_hdr': 'bool', '*vhost': 'bool', '*vhostfd': 'str', - '*vhostforce': 'bool' } } + '*vhostforce': 'bool', + '*queues': 'uint32'} } ## # @NetdevSocketOptions -- 1.7.1