All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
@ 2010-01-26 20:40 Sridhar Samudrala
  2010-01-26 20:47 ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-26 20:40 UTC (permalink / raw)
  To: avi, markmc, Anthony Liguori, Michael S. Tsirkin, ogerlitz
  Cc: kvm, qemu-devel

This patch adds raw socket backend to qemu and is based on Or Gerlitz's
patch re-factored and ported to the latest qemu-kvm git tree.
It also includes support for vnet_hdr option that enables gso/checksum
offload with raw backend. You can find the linux kernel patch to support
this feature here.
   http://thread.gmane.org/gmane.linux.network/150308

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com> 

diff --git a/Makefile.objs b/Makefile.objs
index 357d305..4468124 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -34,6 +34,8 @@ net-nested-$(CONFIG_SOLARIS) += tap-solaris.o
 net-nested-$(CONFIG_AIX) += tap-aix.o
 net-nested-$(CONFIG_SLIRP) += slirp.o
 net-nested-$(CONFIG_VDE) += vde.o
+net-nested-$(CONFIG_POSIX) += raw.o
+net-nested-$(CONFIG_LINUX) += raw-linux.o
 net-obj-y += $(addprefix net/, $(net-nested-y))
 
 ######################################################################
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index eba578a..4aa40f2 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -15,6 +15,7 @@
 #include "net.h"
 #include "net/checksum.h"
 #include "net/tap.h"
+#include "net/raw.h"
 #include "qemu-timer.h"
 #include "virtio-net.h"
 
@@ -133,6 +134,9 @@ static int peer_has_vnet_hdr(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_RAW:
+        n->has_vnet_hdr = raw_has_vnet_hdr(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -149,6 +153,9 @@ static int peer_has_ufo(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_ufo = tap_has_ufo(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_RAW:
+        n->has_ufo = raw_has_ufo(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -165,6 +172,9 @@ static void peer_using_vnet_hdr(VirtIONet *n, int using_vnet_hdr)
     case NET_CLIENT_TYPE_TAP:
         tap_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
         break;
+    case NET_CLIENT_TYPE_RAW:
+        raw_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
+        break;
     default:
         break; 
     }
@@ -180,6 +190,9 @@ static void peer_set_offload(VirtIONet *n, int csum, int tso4, int tso6,
     case NET_CLIENT_TYPE_TAP:
         tap_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
         break;
+    case NET_CLIENT_TYPE_RAW:
+        raw_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
+        break;
     default:
         break; 
     }
diff --git a/net.c b/net.c
index 6ef93e6..1ca2415 100644
--- a/net.c
+++ b/net.c
@@ -26,6 +26,7 @@
 #include "config-host.h"
 
 #include "net/tap.h"
+#include "net/raw.h"
 #include "net/socket.h"
 #include "net/dump.h"
 #include "net/slirp.h"
@@ -1004,6 +1005,27 @@ static struct {
             },
             { /* end of list */ }
         },
+    }, {
+        .type = "raw",
+        .init = net_init_raw,
+        .desc = {
+            NET_COMMON_PARAMS_DESC,
+            {
+                .name = "fd",
+                .type = QEMU_OPT_STRING,
+                .help = "file descriptor of an already opened raw socket",
+            }, {
+                .name = "ifname",
+                .type = QEMU_OPT_STRING,
+                .help = "interface name",
+           }, {
+               .name = "vnet_hdr",
+               .type = QEMU_OPT_BOOL,
+               .help = "enable PACKET_VNET_HDR option on the raw interface"
+           },
+            { /* end of list */ }
+       },
+
 #ifdef CONFIG_VDE
     }, {
         .type = "vde",
@@ -1076,6 +1098,7 @@ int net_client_init(Monitor *mon, QemuOpts *opts, int is_netdev)
 #ifdef CONFIG_VDE
             strcmp(type, "vde") != 0 &&
 #endif
+            strcmp(type, "raw") != 0 &&
             strcmp(type, "socket") != 0) {
             qemu_error("The '%s' network backend type is not valid with -netdev\n",
                        type);
diff --git a/net.h b/net.h
index 116bb80..4722185 100644
--- a/net.h
+++ b/net.h
@@ -34,7 +34,8 @@ typedef enum {
     NET_CLIENT_TYPE_TAP,
     NET_CLIENT_TYPE_SOCKET,
     NET_CLIENT_TYPE_VDE,
-    NET_CLIENT_TYPE_DUMP
+    NET_CLIENT_TYPE_DUMP,
+    NET_CLIENT_TYPE_RAW,
 } net_client_type;
 
 typedef void (NetPoll)(VLANClientState *, bool enable);
diff --git a/net/raw-linux.c b/net/raw-linux.c
new file mode 100644
index 0000000..9ed2e6a
--- /dev/null
+++ b/net/raw-linux.c
@@ -0,0 +1,97 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "net/raw.h"
+#include "net/raw-linux.h"
+
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <netinet/in.h>
+
+#include "sysemu.h"
+#include "qemu-common.h"
+
+/*
+ * Open an AF_PACKET/SOCK_RAW socket bound to the interface named by ifname
+ * and switch it to non-blocking mode.
+ *
+ * ifname             name of the interface to bind to (at most IFNAMSIZ-1
+ *                    characters are used)
+ * ifname_size        size of the ifname buffer; not used by this function
+ * vnet_hdr           in:  non-zero to try to enable PACKET_VNET_HDR
+ *                    out: 1 if the option was enabled, 0 otherwise
+ * vnet_hdr_required  non-zero to fail when PACKET_VNET_HDR cannot be enabled
+ *
+ * Returns the socket descriptor on success, or -1 on failure.  The socket is
+ * closed on every failure path, so the caller never owns a half-set-up fd.
+ */
+int raw_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required)
+{
+    struct ifreq req;
+    struct sockaddr_ll lladdr;
+    int fd, flags;
+
+    fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+    if (fd < 0) {
+        fprintf(stderr, "packet socket failed\n");
+        return -1;
+    }
+
+    memset(&req, 0, sizeof(req));
+    strncpy(req.ifr_name, ifname, IFNAMSIZ-1);
+    if (ioctl(fd, SIOCGIFINDEX, &req) < 0) {
+        /*
+         * Do not fall through: with the index left at 0, bind() would
+         * attach the socket to every interface on the host.
+         */
+        fprintf(stderr, "SIOCGIFINDEX failed\n");
+        goto fail;
+    }
+
+    memset(&lladdr, 0, sizeof(lladdr));
+    lladdr.sll_family   = AF_PACKET;
+    lladdr.sll_protocol = htons(ETH_P_ALL);
+    lladdr.sll_ifindex  = req.ifr_ifindex;
+    if (bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)) < 0) {
+        fprintf(stderr, "bind failed\n");
+        goto fail;
+    }
+
+    if (*vnet_hdr) {
+        int val = 1;
+
+        if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, (const char *)&val,
+                       sizeof(val)) < 0) {
+            fprintf(stderr, "setsockopt(SOL_PACKET, PACKET_VNET_HDR) failed\n");
+            *vnet_hdr = 0;
+            if (vnet_hdr_required) {
+                qemu_error("vnet_hdr=1 requested, but kernel "
+                           "doesn't support PACKET_VNET_HDR");
+                goto fail;
+            }
+        } else {
+            *vnet_hdr = 1;
+        }
+    }
+
+    /* Fetch the flags separately so a failed F_GETFL is not OR-ed into F_SETFL. */
+    flags = fcntl(fd, F_GETFL);
+    if (flags < 0 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+        fprintf(stderr, "O_NONBLOCK set failed\n");
+        goto fail;
+    }
+
+    return fd;
+
+fail:
+    close(fd);
+    return -1;
+}
+
+/*
+ * Report whether PACKET_VNET_HDR is enabled on an already-open packet socket.
+ *
+ * Returns 1 when getsockopt() succeeds and reports a non-zero option value,
+ * and 0 otherwise (including when the kernel rejects the option).
+ */
+int raw_probe_vnet_hdr(int fd)
+{
+    int val = 0;
+    socklen_t len = sizeof(val);
+
+    if (getsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, &len) < 0) {
+        return 0;
+    }
+
+    /* Success only proves the option exists; the value says whether it is on. */
+    return val != 0;
+}
diff --git a/net/raw-linux.h b/net/raw-linux.h
new file mode 100644
index 0000000..ca463f4
--- /dev/null
+++ b/net/raw-linux.h
@@ -0,0 +1,42 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_RAW_LINUX_H
+#define QEMU_RAW_LINUX_H
+
+#include <stdint.h>
+
+#define PACKET_VNET_HDR	15
+
+struct virtio_net_hdr
+{
+    uint8_t flags;
+    uint8_t gso_type;
+    uint16_t hdr_len;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+#endif /* QEMU_RAW_LINUX_H */
diff --git a/net/raw.c b/net/raw.c
new file mode 100644
index 0000000..9dbc2f4
--- /dev/null
+++ b/net/raw.c
@@ -0,0 +1,362 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "net/raw.h"
+
+#include "config-host.h"
+
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <net/if.h>
+
+#include "net.h"
+#include "sysemu.h"
+#include "qemu-char.h"
+#include "qemu-common.h"
+
+#include "net/raw-linux.h"
+
+/* Maximum GSO packet size (64k) plus plenty of room for
+ * the ethernet and virtio_net headers
+ */
+#define RAW_BUFSIZE (4096 + 65536)
+
+typedef struct RAWState {
+    VLANClientState nc;
+    int fd;
+    uint8_t buf[RAW_BUFSIZE];
+    int promisc;
+    unsigned int read_poll:1;
+    unsigned int write_poll:1;
+    unsigned int has_vnet_hdr:1;
+    unsigned int using_vnet_hdr:1;	
+    unsigned int has_ufo:1;
+} RAWState;
+
+static int raw_can_send(void *opaque);
+static void raw_send(void *opaque);
+static void raw_writable(void *opaque);
+
+static void raw_update_fd_handler(RAWState *s)
+{
+    qemu_set_fd_handler2(s->fd,
+                         s->read_poll  ? raw_can_send : NULL,
+                         s->read_poll  ? raw_send     : NULL,
+                         s->write_poll ? raw_writable : NULL,
+                         s);
+}
+
+static void raw_read_poll(RAWState *s, int enable)
+{
+    s->read_poll = !!enable;
+    raw_update_fd_handler(s);
+}
+
+static void raw_write_poll(RAWState *s, int enable)
+{
+    s->write_poll = !!enable;
+    raw_update_fd_handler(s);
+}
+
+static void raw_writable(void *opaque)
+{
+    RAWState *s = opaque;
+
+    raw_write_poll(s, 0);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * Write one packet, described by iov/iovcnt, to the raw socket.
+ *
+ * The write is retried while it is interrupted by a signal.  Returns the
+ * number of bytes written; 0 when the socket would block, in which case write
+ * polling is switched on so that raw_writable() runs once the socket is
+ * writable again; or -1 on any other error.
+ */
+static ssize_t raw_write_packet(RAWState *s, const struct iovec *iov, int iovcnt)
+{
+    ssize_t len;
+
+    do {
+        len = writev(s->fd, iov, iovcnt);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1 && errno == EAGAIN) {
+        raw_write_poll(s, 1);
+        return 0;
+    }
+
+    if (len == -1) {
+        /* Diagnostics go to stderr, like the other errors in this backend. */
+        fprintf(stderr, "raw_write_packet: errno:%d\n", errno);
+    }
+
+    return len;
+}
+
+static ssize_t raw_receive_iov(VLANClientState *nc, const struct iovec *iov,
+                               int iovcnt)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+    const struct iovec *iovp = iov;
+    struct iovec iov_copy[iovcnt + 1];
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+        iov_copy[0].iov_base = &hdr;
+        iov_copy[0].iov_len =  sizeof(hdr);
+        memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
+        iovp = iov_copy;
+        iovcnt++;
+    }
+
+    return raw_write_packet(s, iovp, iovcnt);
+}
+
+static ssize_t raw_receive_raw(VLANClientState *nc, const uint8_t *buf, size_t size)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+    struct iovec iov[2];
+    int iovcnt = 0;
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr) {
+        iov[iovcnt].iov_base = &hdr;
+        iov[iovcnt].iov_len  = sizeof(hdr);
+        iovcnt++;
+    }
+
+    iov[iovcnt].iov_base = (char *)buf;
+    iov[iovcnt].iov_len  = size;
+    iovcnt++;
+
+    return raw_write_packet(s, iov, iovcnt);
+}
+
+static ssize_t raw_receive(VLANClientState *nc, const uint8_t *buf, size_t size)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+    struct iovec iov[1];
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr)
+        return raw_receive_raw(nc, buf, size);
+
+    iov[0].iov_base = (char *)buf;
+    iov[0].iov_len  = size;
+
+    return raw_write_packet(s, iov, 1);
+}
+
+static int raw_can_send(void *opaque)
+{
+    RAWState *s = opaque;
+
+    return qemu_can_send_packet(&s->nc);
+}
+
+ssize_t raw_read_packet(int rawfd, uint8_t *buf, int maxlen, int flags)
+{
+    int ret;
+
+    ret = recv(rawfd, buf, maxlen, flags);
+    return ret;
+}
+
+static void raw_send_completed(VLANClientState *nc, ssize_t len)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    raw_read_poll(s, 1);
+}
+
+/*
+ * Read handler for the raw socket: read packets from the socket into s->buf
+ * and pass them to qemu_send_packet_async() until a read returns nothing,
+ * the packet is queued (return value 0), or the peer cannot accept more.
+ *
+ * When the socket delivers a virtio_net_hdr that the peer is not using, the
+ * header is stripped before the packet is forwarded.
+ */
+static void raw_send(void *opaque)
+{
+    RAWState *s = opaque;
+    int size;
+
+    do {
+        uint8_t *buf = s->buf;
+
+        size = raw_read_packet(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC);
+        if (size <= 0) {
+            break;
+        }
+
+        /*
+         * With MSG_TRUNC, recv() returns the real packet length even when it
+         * is larger than the buffer.  Such a packet was truncated; drop it
+         * instead of forwarding more bytes than s->buf actually holds.
+         */
+        if ((size_t)size > sizeof(s->buf)) {
+            continue;
+        }
+
+        if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+            /* Nothing would remain after stripping the header: drop it. */
+            if ((size_t)size <= sizeof(struct virtio_net_hdr)) {
+                continue;
+            }
+            buf  += sizeof(struct virtio_net_hdr);
+            size -= sizeof(struct virtio_net_hdr);
+        }
+
+        size = qemu_send_packet_async(&s->nc, buf, size, raw_send_completed);
+        if (size == 0) {
+            /* Stop reading; raw_send_completed() re-enables read polling. */
+            raw_read_poll(s, 0);
+        }
+
+    } while (size > 0 && qemu_can_send_packet(&s->nc));
+}
+
+int raw_has_ufo(VLANClientState *nc)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_RAW);
+
+    return s->has_ufo;
+}
+
+int raw_has_vnet_hdr(VLANClientState *nc)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_RAW);
+
+    return s->has_vnet_hdr;
+}
+
+void raw_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    using_vnet_hdr = using_vnet_hdr != 0;
+
+    assert(nc->info->type == NET_CLIENT_TYPE_RAW);
+    assert(s->has_vnet_hdr == using_vnet_hdr);
+
+    s->using_vnet_hdr = using_vnet_hdr;
+}
+
+void raw_set_offload(VLANClientState *nc, int csum, int tso4,
+                     int tso6, int ecn, int ufo)
+{
+    return;
+}
+
+static void raw_cleanup(VLANClientState *nc)
+{
+    RAWState *s = DO_UPCAST(RAWState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    raw_read_poll(s, 0);
+    raw_write_poll(s, 0);
+    close(s->fd);
+}
+
+/* fd support */
+
+static NetClientInfo net_raw_info = {
+    .type = NET_CLIENT_TYPE_RAW,
+    .size = sizeof(RAWState),
+    .receive = raw_receive,
+    .receive_raw = NULL,
+    .receive_iov = raw_receive_iov,
+    .cleanup = raw_cleanup,
+};
+
+
+static RAWState *net_raw_fd_init(VLANState *vlan, const char *model,
+                                 const char *name, int fd, int vnet_hdr)
+{
+    VLANClientState *nc;
+    RAWState *s;
+
+    nc = qemu_new_net_client(&net_raw_info, vlan, NULL, model, name);
+
+    s = DO_UPCAST(RAWState, nc, nc);
+
+    s->fd = fd;
+    s->has_vnet_hdr = vnet_hdr != 0;
+    s->using_vnet_hdr = 0;
+    s->has_ufo = 1;
+    raw_read_poll(s, 1);
+
+    return s;
+}
+
+static int net_raw_init(QemuOpts *opts, int *vnet_hdr)
+{
+    int fd, vnet_hdr_required;
+    char ifname[128] = {0,};
+
+    if (qemu_opt_get(opts, "ifname")) {
+        pstrcpy(ifname, sizeof(ifname), qemu_opt_get(opts, "ifname"));
+    }
+
+    *vnet_hdr = qemu_opt_get_bool(opts, "vnet_hdr", 1);
+    if (qemu_opt_get(opts, "vnet_hdr")) {
+        vnet_hdr_required = *vnet_hdr;
+    } else {
+        vnet_hdr_required = 0;
+    }
+
+    TFR(fd = raw_open(ifname, sizeof(ifname), vnet_hdr, vnet_hdr_required));
+    if (fd < 0)
+        return -1;
+
+    qemu_opt_set(opts, "ifname", ifname);
+
+    return fd;
+}
+
+/*
+ * Create a "raw" network client from the given options.
+ *
+ * With fd=, an already-open socket is taken over: it is switched to
+ * non-blocking mode and probed for PACKET_VNET_HDR.  Otherwise ifname= is
+ * required and a new socket is opened through net_raw_init().
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int net_init_raw(QemuOpts *opts, Monitor *mon, const char *name,
+                 VLANState *vlan)
+{
+    RAWState *s;
+    int fd, vnet_hdr = 0;
+
+    if (qemu_opt_get(opts, "fd")) {
+        int flags;
+
+        if (qemu_opt_get(opts, "ifname")) {
+            qemu_error("ifname=, is invalid with fd=\n");
+            return -1;
+        }
+
+        fd = net_handle_fd_param(mon, qemu_opt_get(opts, "fd"));
+        if (fd == -1) {
+            return -1;
+        }
+
+        /* Add O_NONBLOCK without discarding the flags already set on the fd. */
+        flags = fcntl(fd, F_GETFL);
+        fcntl(fd, F_SETFL, (flags == -1 ? 0 : flags) | O_NONBLOCK);
+
+        vnet_hdr = raw_probe_vnet_hdr(fd);
+    } else {
+        /*
+         * Without an interface name the socket cannot be bound to a single
+         * device, so refuse instead of opening an unbound packet socket.
+         */
+        if (!qemu_opt_get(opts, "ifname")) {
+            qemu_error("raw: either ifname= or fd= must be specified\n");
+            return -1;
+        }
+
+        fd = net_raw_init(opts, &vnet_hdr);
+        if (fd == -1) {
+            return -1;
+        }
+    }
+
+    s = net_raw_fd_init(vlan, "raw", name, fd, vnet_hdr);
+    if (!s) {
+        close(fd);
+        return -1;
+    }
+
+    if (qemu_opt_get(opts, "fd")) {
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str), "fd=%d", fd);
+    } else {
+        const char *ifname;
+
+        ifname = qemu_opt_get(opts, "ifname");
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str), "ifname=%s", ifname);
+    }
+
+    if (vlan) {
+        vlan->nb_host_devs++;
+    }
+
+    return 0;
+}
diff --git a/net/raw.h b/net/raw.h
new file mode 100644
index 0000000..7260080
--- /dev/null
+++ b/net/raw.h
@@ -0,0 +1,40 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_NET_RAW_H
+#define QEMU_NET_RAW_H
+
+#include "qemu-common.h"
+#include "qemu-option.h"
+
+int net_init_raw(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan);
+int raw_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required);
+ssize_t raw_read_packet(int rawfd, uint8_t *buf, int maxlen, int flags);
+int raw_has_ufo(VLANClientState *vc);
+int raw_has_vnet_hdr(VLANClientState *vc);
+void raw_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr);
+int raw_probe_vnet_hdr(int fd);
+void raw_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, int ecn, int ufo);
+
+#endif /* QEMU_NET_RAW_H */





^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-26 20:40 [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu Sridhar Samudrala
@ 2010-01-26 20:47 ` Anthony Liguori
  2010-01-26 20:50   ` Anthony Liguori
  2010-01-26 23:15     ` [Qemu-devel] " Sridhar Samudrala
  0 siblings, 2 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-26 20:47 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: avi, markmc, Michael S. Tsirkin, ogerlitz, kvm, qemu-devel

On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
> This patch adds raw socket backend to qemu and is based on Or Gerlitz's
> patch re-factored and ported to the latest qemu-kvm git tree.
> It also includes support for vnet_hdr option that enables gso/checksum
> offload with raw backend. You can find the linux kernel patch to support
> this feature here.
>     http://thread.gmane.org/gmane.linux.network/150308
>
> Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
>    

See the previous discussion about the raw backend from Or's original 
patch.  There's no obvious reason why we should have this in addition to 
a tun/tap backend.

The only use-case I know of is macvlan, but macvtap addresses this
functionality without introducing the rather nasty security problems
associated with a raw backend.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-26 20:47 ` Anthony Liguori
@ 2010-01-26 20:50   ` Anthony Liguori
  2010-01-26 23:19     ` Sridhar Samudrala
  2010-01-27  9:24     ` Michael S. Tsirkin
  2010-01-26 23:15     ` [Qemu-devel] " Sridhar Samudrala
  1 sibling, 2 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-26 20:50 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: avi, markmc, Michael S. Tsirkin, ogerlitz, kvm, qemu-devel

On 01/26/2010 02:47 PM, Anthony Liguori wrote:
> On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
>> This patch adds raw socket backend to qemu and is based on Or Gerlitz's
>> patch re-factored and ported to the latest qemu-kvm git tree.
>> It also includes support for vnet_hdr option that enables gso/checksum
>> offload with raw backend. You can find the linux kernel patch to support
>> this feature here.
>>     http://thread.gmane.org/gmane.linux.network/150308
>>
>> Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
>
> See the previous discussion about the raw backend from Or's original 
> patch.  There's no obvious reason why we should have this in addition 
> to a tun/tap backend.
>
> The only use-case I know of is macvlan but macvtap addresses this 
> functionality while not introduce the rather nasty security problems 
> associated with a raw backend.

Not to mention that from a user perspective, raw makes almost no sense 
as it's an obscure socket protocol family.

A user wants to do useful things like bridged networking or direct VF 
assignment.  We should have -net backends that reflect things that make 
sense to a user.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-26 20:47 ` Anthony Liguori
@ 2010-01-26 23:15     ` Sridhar Samudrala
  2010-01-26 23:15     ` [Qemu-devel] " Sridhar Samudrala
  1 sibling, 0 replies; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-26 23:15 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: avi, markmc, Michael S. Tsirkin, ogerlitz, kvm, qemu-devel

On Tue, 2010-01-26 at 14:47 -0600, Anthony Liguori wrote:
> On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
> > This patch adds raw socket backend to qemu and is based on Or Gerlitz's
> > patch re-factored and ported to the latest qemu-kvm git tree.
> > It also includes support for vnet_hdr option that enables gso/checksum
> > offload with raw backend. You can find the linux kernel patch to support
> > this feature here.
> >     http://thread.gmane.org/gmane.linux.network/150308
> >
> > Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
> >    
> 
> See the previous discussion about the raw backend from Or's original 
> patch.  There's no obvious reason why we should have this in addition to 
> a tun/tap backend.
> 
> The only use-case I know of is macvlan but macvtap addresses this 
> functionality while not introduce the rather nasty security problems 
> associated with a raw backend.

The raw backend can be attached to a physical device, macvlan or SR-IOV VF.
I don't think an AF_PACKET socket itself introduces any security problems. The
raw socket can be created only by a user with the CAP_NET_RAW capability. The only
issue is if we need to assume that qemu itself is an untrusted process and a
raw fd cannot be passed to it.
But I think it is a useful backend to support in qemu, one that provides guest to
remote host connectivity without the need for a bridge/tap.

macvtap could be an alternative if it supports binding to SR-IOV VFs too.

Thanks
Sridhar



^ permalink raw reply	[flat|nested] 45+ messages in thread

* [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
@ 2010-01-26 23:15     ` Sridhar Samudrala
  0 siblings, 0 replies; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-26 23:15 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: markmc, kvm, Michael S. Tsirkin, qemu-devel, ogerlitz, avi

On Tue, 2010-01-26 at 14:47 -0600, Anthony Liguori wrote:
> On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
> > This patch adds raw socket backend to qemu and is based on Or Gerlitz's
> > patch re-factored and ported to the latest qemu-kvm git tree.
> > It also includes support for vnet_hdr option that enables gso/checksum
> > offload with raw backend. You can find the linux kernel patch to support
> > this feature here.
> >     http://thread.gmane.org/gmane.linux.network/150308
> >
> > Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
> >    
> 
> See the previous discussion about the raw backend from Or's original 
> patch.  There's no obvious reason why we should have this in addition to 
> a tun/tap backend.
> 
> The only use-case I know of is macvlan but macvtap addresses this 
> functionality while not introduce the rather nasty security problems 
> associated with a raw backend.

The raw backend can be attached to a physical device, macvlan or SR-IOV VF.
I don't think an AF_PACKET socket itself introduces any security problems. The
raw socket can be created only by a user with the CAP_NET_RAW capability. The only
issue is if we need to assume that qemu itself is an untrusted process and a
raw fd cannot be passed to it.
But I think it is a useful backend to support in qemu, one that provides guest to
remote host connectivity without the need for a bridge/tap.

macvtap could be an alternative if it supports binding to SR-IOV VFs too.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-26 20:50   ` Anthony Liguori
@ 2010-01-26 23:19     ` Sridhar Samudrala
  2010-01-27  9:24     ` Michael S. Tsirkin
  1 sibling, 0 replies; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-26 23:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: avi, markmc, Michael S. Tsirkin, ogerlitz, kvm, qemu-devel

On Tue, 2010-01-26 at 14:50 -0600, Anthony Liguori wrote:
> On 01/26/2010 02:47 PM, Anthony Liguori wrote:
> > On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
> >> This patch adds raw socket backend to qemu and is based on Or Gerlitz's
> >> patch re-factored and ported to the latest qemu-kvm git tree.
> >> It also includes support for vnet_hdr option that enables gso/checksum
> >> offload with raw backend. You can find the linux kernel patch to support
> >> this feature here.
> >>     http://thread.gmane.org/gmane.linux.network/150308
> >>
> >> Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
> >
> > See the previous discussion about the raw backend from Or's original 
> > patch.  There's no obvious reason why we should have this in addition 
> > to a tun/tap backend.
> >
> > The only use-case I know of is macvlan but macvtap addresses this 
> > functionality while not introduce the rather nasty security problems 
> > associated with a raw backend.
> 
> Not to mention that from a user perspective, raw makes almost no sense 
> as it's an obscure socket protocol family.
Not clear what you mean here. AF_PACKET socket is just a transport
mechanism for the host kernel to put the packets from the guest directly
to an attached interface and vice-versa.

> A user wants to do useful things like bridged networking or direct VF 
> assignment.  We should have -net backends that reflect things that make 
> sense to a user.

Binding to an SR-IOV VF is one of the use cases supported by the raw
backend.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-26 23:15     ` [Qemu-devel] " Sridhar Samudrala
  (?)
@ 2010-01-27  0:06     ` Anthony Liguori
  2010-01-27  6:52         ` Arnd Bergmann
  -1 siblings, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27  0:06 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: markmc, kvm, Michael S. Tsirkin, qemu-devel, ogerlitz, avi

On 01/26/2010 05:15 PM, Sridhar Samudrala wrote:
> On Tue, 2010-01-26 at 14:47 -0600, Anthony Liguori wrote:
>    
>> On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
>>      
>>> This patch adds raw socket backend to qemu and is based on Or Gerlitz's
>>> patch re-factored and ported to the latest qemu-kvm git tree.
>>> It also includes support for vnet_hdr option that enables gso/checksum
>>> offload with raw backend. You can find the linux kernel patch to support
>>> this feature here.
>>>      http://thread.gmane.org/gmane.linux.network/150308
>>>
>>> Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
>>>
>>>        
>> See the previous discussion about the raw backend from Or's original
>> patch.  There's no obvious reason why we should have this in addition to
>> a tun/tap backend.
>>
>> The only use-case I know of is macvlan but macvtap addresses this
>> functionality while not introduce the rather nasty security problems
>> associated with a raw backend.
>>      
> The raw backend can be attached to a physical device

This is equivalent to bridging with tun/tap except that it has the 
unexpected behaviour of unreliable host/guest networking (which is not 
universally consistent across platforms either).  This is not a mode we 
want to encourage users to use.

> , macvlan

macvtap is a superior way to achieve this use case because a macvtap fd 
can safely be given to a lesser privilege process without allowing 
escalation of privileges.

>   or SR-IOV VF.
>    

This depends on vhost-net.  In general, what I would like to see for 
this is something more user friendly that dealt specifically with this 
use-case.  Although honestly, given the recent security concerns around 
raw sockets, I'm very concerned about supporting raw sockets in qemu at all.

Essentially, you get worse security doing vhost-net + raw + VF than with
PCI passthrough + VF because at least in the latter case you can run qemu
without privileges.  CAP_NET_RAW is a very big privilege.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27  0:06     ` Anthony Liguori
@ 2010-01-27  6:52         ` Arnd Bergmann
  0 siblings, 0 replies; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-27  6:52 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Sridhar Samudrala, markmc, kvm, Michael S. Tsirkin, qemu-devel,
	ogerlitz, avi

On Wednesday 27 January 2010, Anthony Liguori wrote:
> > The raw backend can be attached to a physical device
> 
> This is equivalent to bridging with tun/tap except that it has the 
> unexpected behaviour of unreliable host/guest networking (which is not 
> universally consistent across platforms either).  This is not a mode we 
> want to encourage users to use.

It's not the most common scenario, but I've seen systems (I remember
one on s/390 with z/VM) where you really want to isolate the guest
network as much as possible from the host network. Besides PCI
passthrough, giving the host device to a guest using a raw socket
is the next best approximation of that.

Then again, macvtap will do that too, if the device driver supports
multiple unicast MAC addresses without forcing promiscuous mode.

> > , macvlan
> 
> macvtap is a superior way to achieve this use case because a macvtap fd 
> can safely be given to a lesser privilege process without allowing 
> escalation of privileges.

Yes.

> >   or SR-IOV VF.
> >    
> 
> This depends on vhost-net.

Why? I don't see anything in this scenario that is vhost-net specific.
I also plan to cover this aspect in macvtap in the future, but the current
code does not do it yet. It also requires device driver changes.

>                               In general, what I would like to see for 
> this is something more user friendly that dealt specifically with this 
> use-case.  Although honestly, given the recent security concerns around 
> raw sockets, I'm very concerned about supporting raw sockets in qemu at all.
> 
> Essentially, you get worse security doing vhost-net + raw + VF then with 
> PCI passthrough + VF because at least in the later case you can run qemu 
> without privileges.  CAP_NET_RAW is a very big privilege.

It can be contained to a large degree with network namespaces. When you
run qemu in its own namespace and add the VF to that, CAP_NET_RAW
should ideally have no effect on other parts of the system (except
bugs in the namespace implementation).

	Arnd

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
@ 2010-01-27  6:52         ` Arnd Bergmann
  0 siblings, 0 replies; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-27  6:52 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: markmc, kvm, Michael S. Tsirkin, qemu-devel, ogerlitz, avi,
	Sridhar Samudrala

On Wednesday 27 January 2010, Anthony Liguori wrote:
> > The raw backend can be attached to a physical device
> 
> This is equivalent to bridging with tun/tap except that it has the 
> unexpected behaviour of unreliable host/guest networking (which is not 
> universally consistent across platforms either).  This is not a mode we 
> want to encourage users to use.

It's not the most common scenario, but I've seen systems (I remember
one on s/390 with z/VM) where you really want to isolate the guest
network as much as possible from the host network. Besides PCI
passthrough, giving the host device to a guest using a raw socket
is the next best approximation of that.

Then again, macvtap will do that too, if the device driver supports
multiple unicast MAC addresses without forcing promiscuous mode.

> > , macvlan
> 
> macvtap is a superior way to achieve this use case because a macvtap fd 
> can safely be given to a lesser privilege process without allowing 
> escalation of privileges.

Yes.

> >   or SR-IOV VF.
> >    
> 
> This depends on vhost-net.

Why? I don't see anything in this scenario that is vhost-net specific.
I also plan to cover this aspect in macvtap in the future, but the current
code does not do it yet. It also requires device driver changes.

>                               In general, what I would like to see for 
> this is something more user friendly that dealt specifically with this 
> use-case.  Although honestly, given the recent security concerns around 
> raw sockets, I'm very concerned about supporting raw sockets in qemu at all.
> 
> Essentially, you get worse security doing vhost-net + raw + VF then with 
> PCI passthrough + VF because at least in the later case you can run qemu 
> without privileges.  CAP_NET_RAW is a very big privilege.

It can be contained to a large degree with network namespaces. When you
run qemu in its own namespace and add the VF to that, CAP_NET_RAW
should ideally have no effect on other parts of the system (except
bugs in the namespace implementation).

	Arnd

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-26 20:50   ` Anthony Liguori
  2010-01-26 23:19     ` Sridhar Samudrala
@ 2010-01-27  9:24     ` Michael S. Tsirkin
  2010-01-27  9:34       ` Arnd Bergmann
  2010-01-27 14:07       ` Anthony Liguori
  1 sibling, 2 replies; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-27  9:24 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On Tue, Jan 26, 2010 at 02:50:28PM -0600, Anthony Liguori wrote:
> On 01/26/2010 02:47 PM, Anthony Liguori wrote:
>> On 01/26/2010 02:40 PM, Sridhar Samudrala wrote:
>>> This patch adds raw socket backend to qemu and is based on Or Gerlitz's
>>> patch re-factored and ported to the latest qemu-kvm git tree.
>>> It also includes support for vnet_hdr option that enables gso/checksum
>>> offload with raw backend. You can find the linux kernel patch to support
>>> this feature here.
>>>     http://thread.gmane.org/gmane.linux.network/150308
>>>
>>> Signed-off-by: Sridhar Samudrala<sri@us.ibm.com>
>>
>> See the previous discussion about the raw backend from Or's original  
>> patch.  There's no obvious reason why we should have this in addition  
>> to a tun/tap backend.

I thought this was cleared already: vepa support is the requirement
here.  Existing tap solution requires management of host linux
networking which some users would rather avoid.

>> The only use-case I know of is macvlan but macvtap addresses this  
>> functionality while not introduce the rather nasty security problems  
>> associated with a raw backend.

I am not sure I agree with this sentiment.  The main issue being that
macvtap doesn't exist on all kernels :). macvlan also requires hardware
support, packet socket can work with any network card in promisc mode.

> Not to mention that from a user perspective, raw makes almost no sense  
> as it's an obscure socket protocol family.
>
> A user wants to do useful things like bridged networking or direct VF  
> assignment.  We should have -net backends that reflect things that make  
> sense to a user.
>
> Regards,
>
> Anthony Liguori


I agree to that. People don't even seem to agree whether it's a raw
socket or a packet socket :) We need a better name for this option: what
it really does is rely on an external device to loopback a packet to us,
so how about -net loopback or -net extbridge?

-- 
MST

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27  9:24     ` Michael S. Tsirkin
@ 2010-01-27  9:34       ` Arnd Bergmann
  2010-01-27  9:44         ` Michael S. Tsirkin
  2010-01-27 14:07       ` Anthony Liguori
  1 sibling, 1 reply; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-27  9:34 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel

On Wednesday 27 January 2010, Michael S. Tsirkin wrote:
> I am not sure I agree with this sentiment.  The main issue being that
> macvtap doesn't exist on all kernels :). macvlan also requires hardware
> support, packet socket can work with any network card in promisc mode.

To be clear, macvlan does not require hardware support, it will happily
put cards into promiscuous mode if they don't support multiple mac addresses.

> I agree to that. People don't even seem to agree whether it's a raw
> socket or a packet socket :) We need a better name for this option: what
> it really does is rely on an external device to loopback a packet to us,
> so how about -net loopback or -net extbridge?

I think -net socket,fd should just be (trivially) extended to work with raw
sockets out of the box, with no support for opening it. Then you can have
libvirt or some wrapper open a raw socket and a private namespace and just pass it
down. If you really want to let qemu open the socket itself, -net socket,raw=eth0
is probably closer to what you want than a new -net xxx option.

	Arnd

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27  9:34       ` Arnd Bergmann
@ 2010-01-27  9:44         ` Michael S. Tsirkin
  2010-01-27 14:03           ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-27  9:44 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Anthony Liguori, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel

On Wed, Jan 27, 2010 at 10:34:35AM +0100, Arnd Bergmann wrote:
> On Wednesday 27 January 2010, Michael S. Tsirkin wrote:
> > I am not sure I agree with this sentiment.  The main issue being that
> > macvtap doesn't exist on all kernels :). macvlan also requires hardware
> > support, packet socket can work with any network card in promisc mode.
> 
> To be clear, macvlan does not require hardware support, it will happily
> put cards into promiscous mode if they don't support multiple mac addresses.
> 
> > I agree to that. People don't even seem to agree whether it's a raw
> > socket or a packet socket :) We need a better name for this option: what
> > it really does is rely on an external device to loopback a packet to us,
> > so how about -net loopback or -net extbridge?
> 
> I think -net socket,fd should just be (trivially) extended to work with raw
> sockets out of the box, with no support for opening it. Then you can have
> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> down.

That'd work. Anthony?

> If you really want to let qemu open the socket itself, -net socket,raw=eth0
> is probably closer to what you want than a new -net xxx option.
> 
> 	Arnd

So again if implemented this probably should be 
-net socket,raw,loopback=eth0 or -net socket,raw,extbridge=eth0
or some such, just to make it abundantly clear that you must not bind it
to a regular ethernet device.

-- 
MST

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27  9:44         ` Michael S. Tsirkin
@ 2010-01-27 14:03           ` Anthony Liguori
  2010-01-27 21:39             ` Arnd Bergmann
  0 siblings, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 14:03 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/27/2010 03:44 AM, Michael S. Tsirkin wrote:
> On Wed, Jan 27, 2010 at 10:34:35AM +0100, Arnd Bergmann wrote:
>    
>> On Wednesday 27 January 2010, Michael S. Tsirkin wrote:
>>      
>>> I am not sure I agree with this sentiment.  The main issue being that
>>> macvtap doesn't exist on all kernels :). macvlan also requires hardware
>>> support, packet socket can work with any network card in promisc mode.
>>>        
>> To be clear, macvlan does not require hardware support, it will happily
>> put cards into promiscous mode if they don't support multiple mac addresses.
>>
>>      
>>> I agree to that. People don't even seem to agree whether it's a raw
>>> socket or a packet socket :) We need a better name for this option: what
>>> it really does is rely on an external device to loopback a packet to us,
>>> so how about -net loopback or -net extbridge?
>>>        
>> I think -net socket,fd should just be (trivially) extended to work with raw
>> sockets out of the box, with no support for opening it. Then you can have
>> libvirt or some wrapper open a raw socket and a private namespace and just pass it
>> down.
>>      
> That'd work. Anthony?
>    

What functionality are we trying to achieve?  Let's be very specific 
about use-cases here.  If it's VEPA, like you mentioned earlier, why 
isn't macvtap a better solution from a security perspective?

The fundamental problem that I have with all of this is that we should 
not be introducing new network backends that are based around something 
only a developer is going to understand.  If I'm a user and I want to 
use an external switch in VEPA mode, how in the world am I going to know 
that I'm supposed to use the -net raw backend or the -net socket 
backend?  It might as well be the -net butterflies backend as far as a 
user is concerned.

Networking in QEMU is already hard enough for users, we shouldn't make 
it worse than it already is.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27  9:24     ` Michael S. Tsirkin
  2010-01-27  9:34       ` Arnd Bergmann
@ 2010-01-27 14:07       ` Anthony Liguori
  2010-01-27 16:59         ` Michael S. Tsirkin
  1 sibling, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 14:07 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/27/2010 03:24 AM, Michael S. Tsirkin wrote:
> I am not sure I agree with this sentiment.  The main issue being that
> macvtap doesn't exist on all kernels :).

Neither does vhost ;-)  If it were just that as the difference, I'd be 
inclined to agree, but macvtap is much better from a security PoV.

>> Not to mention that from a user perspective, raw makes almost no sense
>> as it's an obscure socket protocol family.
>>
>> A user wants to do useful things like bridged networking or direct VF
>> assignment.  We should have -net backends that reflect things that make
>> sense to a user.
>>
>> Regards,
>>
>> Anthony Liguori
>>      
>
> I agree to that. People don't even seem to agree whether it's a raw
> socket or a packet socket :) We need a better name for this option: what
> it really does is rely on an external device to loopback a packet to us,
> so how about -net loopback or -net extbridge?
>    

Specifically for VEPA, something like:

-net extbridge,if=eth0

or even

-net vepa,if=eth0

Would be fantastic.  I think the best way to achieve this is to 
introduce a small helper that gets called and can create a macvtap 
device and hand the file descriptor back to qemu :-)  A builtin backend 
would also be fine since we don't have the helper infrastructure.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27  6:52         ` Arnd Bergmann
@ 2010-01-27 14:14           ` Anthony Liguori
  -1 siblings, 0 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 14:14 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Sridhar Samudrala, markmc, kvm, Michael S. Tsirkin, qemu-devel,
	ogerlitz, avi

On 01/27/2010 12:52 AM, Arnd Bergmann wrote:
> On Wednesday 27 January 2010, Anthony Liguori wrote:
>    
>>> The raw backend can be attached to a physical device
>>>        
>> This is equivalent to bridging with tun/tap except that it has the
>> unexpected behaviour of unreliable host/guest networking (which is not
>> universally consistent across platforms either).  This is not a mode we
>> want to encourage users to use.
>>      
> It's not the most common scenario, but I've seen systems (I remember
> one on s/390 with z/VM) where you really want to isolate the guest
> network as much as possible from the host network. Besides PCI
> passthrough, giving the host device to a guest using a raw socket
> is the next best approximation of that.
>    

But if you care about isolation, it's the worst possible thing to do.  
If a guest breaks into qemu, it's one bind() away from accessing any 
other guests network.

Using a bridge with a single interface on it is much better from an 
isolation perspective.

>>                                In general, what I would like to see for
>> this is something more user friendly that dealt specifically with this
>> use-case.  Although honestly, given the recent security concerns around
>> raw sockets, I'm very concerned about supporting raw sockets in qemu at all.
>>
>> Essentially, you get worse security doing vhost-net + raw + VF then with
>> PCI passthrough + VF because at least in the later case you can run qemu
>> without privileges.  CAP_NET_RAW is a very big privilege.
>>      
> It can be contained to a large degree with network namespaces. When you
> run qemu in its own namespace and add the VF to that, CAP_NET_RAW
> should ideally have no effect on other parts of the system (except
> bugs in the namespace implementation).
>    

That's a pretty big hammer to hit this problem with.  QEMU should not 
require CAP_NET_RAW and so far has been able to avoid it quite 
successfully.  So far, I haven't heard a compelling reason to use 
raw other than that bridging can be complicated to set up.

If we had the equivalent of a raw socket that could be bound to a socket 
and then "locked" such that it could be safely handed to a 
non-privileged process, then it would be a different story.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
@ 2010-01-27 14:14           ` Anthony Liguori
  0 siblings, 0 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 14:14 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: markmc, kvm, Michael S. Tsirkin, qemu-devel, ogerlitz, avi,
	Sridhar Samudrala

On 01/27/2010 12:52 AM, Arnd Bergmann wrote:
> On Wednesday 27 January 2010, Anthony Liguori wrote:
>    
>>> The raw backend can be attached to a physical device
>>>        
>> This is equivalent to bridging with tun/tap except that it has the
>> unexpected behaviour of unreliable host/guest networking (which is not
>> universally consistent across platforms either).  This is not a mode we
>> want to encourage users to use.
>>      
> It's not the most common scenario, but I've seen systems (I remember
> one on s/390 with z/VM) where you really want to isolate the guest
> network as much as possible from the host network. Besides PCI
> passthrough, giving the host device to a guest using a raw socket
> is the next best approximation of that.
>    

But if you care about isolation, it's the worst possible thing to do.  
If a guest breaks into qemu, it's one bind() away from accessing any 
other guests network.

Using a bridge with a single interface on it is much better from an 
isolation perspective.

>>                                In general, what I would like to see for
>> this is something more user friendly that dealt specifically with this
>> use-case.  Although honestly, given the recent security concerns around
>> raw sockets, I'm very concerned about supporting raw sockets in qemu at all.
>>
>> Essentially, you get worse security doing vhost-net + raw + VF then with
>> PCI passthrough + VF because at least in the later case you can run qemu
>> without privileges.  CAP_NET_RAW is a very big privilege.
>>      
> It can be contained to a large degree with network namespaces. When you
> run qemu in its own namespace and add the VF to that, CAP_NET_RAW
> should ideally have no effect on other parts of the system (except
> bugs in the namespace implementation).
>    

That's a pretty big hammer to hit this problem with.  QEMU should not 
require CAP_NET_RAW and so far has been able to avoid it quite 
successfully.  So far, I haven't heard a compelling reason to use 
raw other than that bridging can be complicated to set up.

If we had the equivalent of a raw socket that could be bound to a socket 
and then "locked" such that it could be safely handed to a 
non-privileged process, then it would be a different story.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 14:07       ` Anthony Liguori
@ 2010-01-27 16:59         ` Michael S. Tsirkin
  2010-01-27 17:07           ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-27 16:59 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On Wed, Jan 27, 2010 at 08:07:11AM -0600, Anthony Liguori wrote:
> On 01/27/2010 03:24 AM, Michael S. Tsirkin wrote:
>> I am not sure I agree with this sentiment.  The main issue being that
>> macvtap doesn't exist on all kernels :).
>
> Neither does vhost ;-)  If it were just that as the difference, I'd be  
> inclined to agree, but macvtap is much better from a security PoV.
>
>>> Not to mention that from a user perspective, raw makes almost no sense
>>> as it's an obscure socket protocol family.
>>>
>>> A user wants to do useful things like bridged networking or direct VF
>>> assignment.  We should have -net backends that reflect things that make
>>> sense to a user.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      
>>
>> I agree to that. People don't even seem to agree whether it's a raw
>> socket or a packet socket :) We need a better name for this option: what
>> it really does is rely on an external device to loopback a packet to us,
>> so how about -net loopback or -net extbridge?
>>    
>
> Specifically for VEPA, something like:
>
> -net extbridge,if=eth0
>
> or even
>
> -net vepa,if=eth0
>
> Would be fantastic.

extbridge is IMO better.

> I think the best way to achieve this is to  
> introduce a small helper that gets called and can create a macvtap  
> device and hand the file descriptor back to qemu :-)  A builtin backend  
> would also be fine since we don't have the helper infrastructure.

Excellent.
Sridhar, this is actually not a lot of work on top of what you
already posted.

> Regards,
>
> Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 16:59         ` Michael S. Tsirkin
@ 2010-01-27 17:07           ` Anthony Liguori
  2010-01-27 17:25             ` Michael S. Tsirkin
  0 siblings, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 17:07 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/27/2010 10:59 AM, Michael S. Tsirkin wrote:
> On Wed, Jan 27, 2010 at 08:07:11AM -0600, Anthony Liguori wrote:
>    
>> On 01/27/2010 03:24 AM, Michael S. Tsirkin wrote:
>>      
>>> I am not sure I agree with this sentiment.  The main issue being that
>>> macvtap doesn't exist on all kernels :).
>>>        
>> Neither does vhost ;-)  If it were just that as the difference, I'd be
>> inclined to agree, but macvtap is much better from a security PoV.
>>
>>      
>>>> Not to mention that from a user perspective, raw makes almost no sense
>>>> as it's an obscure socket protocol family.
>>>>
>>>> A user wants to do useful things like bridged networking or direct VF
>>>> assignment.  We should have -net backends that reflect things that make
>>>> sense to a user.
>>>>
>>>> Regards,
>>>>
>>>> Anthony Liguori
>>>>
>>>>          
>>> I agree to that. People don't even seem to agree whether it's a raw
>>> socket or a packet socket :) We need a better name for this option: what
>>> it really does is rely on an external device to loopback a packet to us,
>>> so how about -net loopback or -net extbridge?
>>>
>>>        
>> Specifically for VEPA, something like:
>>
>> -net extbridge,if=eth0
>>
>> or even
>>
>> -net vepa,if=eth0
>>
>> Would be fantastic.
>>      
> extbridge is IMO better.
>
>    
>> I think the best way to achieve this is to
>> introduce a small helper that gets called and can create a macvtap
>> device and hand the file descriptor back to qemu :-)  A builtin backend
>> would also be fine since we don't have the helper infrastructure.
>>      
> Excellent.
> Sridhar, this is actually not a lot of work on top of what you
> already posted.
>
>    

N.B.  I had suggested using macvtap, not raw.  In this case, the full 
syntax would be:

-net vepa,if=eth0

or

-net vepa,fd=N

where N is a macvtap fd.

For raw, I think there's a real problem wrt security.  I think it's 
important that we support running qemu as a non-privileged user.  In 
fact, this seems to be the mode libvirt is now preferring to operate in.

I think we need to re-evaluate the use of any raw socket by qemu as it's 
very dangerous from a security perspective (assuming we cannot 
introduce a "locked" raw socket mode).

Regards,

Anthony Liguori

>> Regards,
>>
>> Anthony Liguori
>>      


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 17:07           ` Anthony Liguori
@ 2010-01-27 17:25             ` Michael S. Tsirkin
  2010-01-27 17:36               ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-27 17:25 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On Wed, Jan 27, 2010 at 11:07:45AM -0600, Anthony Liguori wrote:
> On 01/27/2010 10:59 AM, Michael S. Tsirkin wrote:
>> On Wed, Jan 27, 2010 at 08:07:11AM -0600, Anthony Liguori wrote:
>>    
>>> On 01/27/2010 03:24 AM, Michael S. Tsirkin wrote:
>>>      
>>>> I am not sure I agree with this sentiment.  The main issue being that
>>>> macvtap doesn't exist on all kernels :).
>>>>        
>>> Neither does vhost ;-)  If it were just that as the difference, I'd be
>>> inclined to agree, but macvtap is much better from a security PoV.
>>>
>>>      
>>>>> Not to mention that from a user perspective, raw makes almost no sense
>>>>> as it's an obscure socket protocol family.
>>>>>
>>>>> A user wants to do useful things like bridged networking or direct VF
>>>>> assignment.  We should have -net backends that reflect things that make
>>>>> sense to a user.
>>>>>
>>>>> Regards,
>>>>>
>>>>> Anthony Liguori
>>>>>
>>>>>          
>>>> I agree to that. People don't even seem to agree whether it's a raw
>>>> socket or a packet socket :) We need a better name for this option: what
>>>> it really does is rely on an external device to loopback a packet to us,
>>>> so how about -net loopback or -net extbridge?
>>>>
>>>>        
>>> Specifically for VEPA, something like:
>>>
>>> -net extbridge,if=eth0
>>>
>>> or even
>>>
>>> -net vepa,if=eth0
>>>
>>> Would be fantastic.
>>>      
>> extbridge is IMO better.
>>
>>    
>>> I think the best way to achieve this is to
>>> introduce a small helper that gets called and can create a macvtap
>>> device and hand the file descriptor back to qemu :-)  A builtin backend
>>> would also be fine since we don't have the helper infrastructure.
>>>      
>> Excellent.
>> Sridhar, this is actually not a lot of work on top of what you
>> already posted.
>>
>>    
>
> N.B.  I had suggested using macvtap, not raw.

Well, this is an implementation detail :) In fact, I don't have any
objections to using macvtap.  As I tried to hint, macvtap doesn't seem
to exist in any Linux yet, while packet sockets have been supported for
ages. So we might want to support packet sockets at least optionally
as a backend for extbridge.

>  In this case, the full  
> syntax would be:
>
> -net vepa,if=eth0
>
> or
>
> -net vepa,fd=N

I still hope it's extbridge; vepa is an acronym that will likely not be
known to 99% of users.

> where N is a macvtap fd.
>
> For raw, I think there's a real problem wrt security.  I think it's  
> important that we support running qemu as a non-privileged user.  In  
> fact, this seems to be the mode libvirt is now preferring to operate in.
>
> I think we need to re-evaluate the use of any raw socket by qemu as it's  
> very dangerous from a security perspective (assuming we cannot  
> introduced a "locked" raw socket mode).

As was pointed out on netdev and elsewhere this seems to be what
namespaces/selinux are there for. Can qemu be run within a namespace and
if yes would that address your concerns?  Security is probably a wrong
reason to use character devices: they are much more likely to have
security problems than standard interfaces.  Ease of setup/compatibility
with tap would be a better reason.

> Regards,
>
> Anthony Liguori
>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 17:25             ` Michael S. Tsirkin
@ 2010-01-27 17:36               ` Anthony Liguori
  2010-01-27 17:54                 ` Sridhar Samudrala
  2010-01-27 18:12                 ` Michael S. Tsirkin
  0 siblings, 2 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 17:36 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/27/2010 11:25 AM, Michael S. Tsirkin wrote:
>>   In this case, the full
>> syntax would be:
>>
>> -net vepa,if=eth0
>>
>> or
>>
>> -net vepa,fd=N
>>      
> I still hope it's extbridge, vepa is an acronym that will likely not be
> known for 99% of users.
>    

Oh sorry, I don't care about the name at all.  If you prefer extbridge, 
I'm all for it :-)

>> where N is a macvtap fd.
>>
>> For raw, I think there's a real problem wrt security.  I think it's
>> important that we support running qemu as a non-privileged user.  In
>> fact, this seems to be the mode libvirt is now preferring to operate in.
>>
>> I think we need to re-evaluate the use of any raw socket by qemu as it's
>> very dangerous from a security perspective (assuming we cannot
>> introduced a "locked" raw socket mode).
>>      
> As was pointed out on netdev and elsewhere this seems to be what
> namespaces/selinux are there for. Can qemu be run within a namespace and
> if yes would that address your concerns?

It's unclear to me what this would even involve.  But really, we just 
want an interface to inject packets directly into a physical device.  
raw sockets give us that but it also gives us way more.  Using network 
namespaces to restrict this is a bit convoluted.  It seems to me that 
providing an interface that never gives us way more to start with is 
better overall from a security perspective.

Regards,

Anthony Liguori

>    Security is probably a wrong
> reason to use character devices: they are much more likely to have
> security problems than standard interfaces.  Ease of setup/compatibility
> with tap would be a better reason.
>
>    
>> Regards,
>>
>> Anthony Liguori
>>
>>      
>>>> Regards,
>>>>
>>>> Anthony Liguori
>>>>
>>>>          


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 17:36               ` Anthony Liguori
@ 2010-01-27 17:54                 ` Sridhar Samudrala
  2010-01-27 18:02                   ` Anthony Liguori
  2010-01-27 18:12                 ` Michael S. Tsirkin
  1 sibling, 1 reply; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-27 17:54 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Michael S. Tsirkin, avi, markmc, ogerlitz, kvm, qemu-devel

On Wed, 2010-01-27 at 11:36 -0600, Anthony Liguori wrote:
> On 01/27/2010 11:25 AM, Michael S. Tsirkin wrote:
> >>   In this case, the full
> >> syntax would be:
> >>
> >> -net vepa,if=eth0
> >>
> >> or
> >>
> >> -net vepa,fd=N
> >>      
> > I still hope it's extbridge, vepa is an acronym that will likely not be
> > known for 99% of users.
> >    
> 
> Oh sorry, I don't care about the name at all.  If you prefer extbridge, 
> I'm all for it :-)
> 
> >> where N is a macvtap fd.
> >>
> >> For raw, I think there's a real problem wrt security.  I think it's
> >> important that we support running qemu as a non-privileged user.  In
> >> fact, this seems to be the mode libvirt is now preferring to operate in.
> >>
> >> I think we need to re-evaluate the use of any raw socket by qemu as it's
> >> very dangerous from a security perspective (assuming we cannot
> >> introduced a "locked" raw socket mode).
> >>      
> > As was pointed out on netdev and elsewhere this seems to be what
> > namespaces/selinux are there for. Can qemu be run within a namespace and
> > if yes would that address your concerns?
> 
> It's unclear to me what this would even involve.  But really, we just 
> want an interface to inject packets directly into a physical device.  
> raw sockets give us that but it also gives us way more.  Using network 
> namespaces to restrict this is a bit convoluted.  It seems to me that 
> providing an interface that never gives us way more to start with is 
> better overall from a security perspective.

I too think that we should not block raw backend in qemu just because of
security reasons. It should be perfectly fine to use raw backend in
scenarios where qemu can be run as a privileged process.

libvirt need not support the raw backend until we figure out a secure way to
start qemu when passing a raw fd. Using network namespaces seems like a
good option.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 17:54                 ` Sridhar Samudrala
@ 2010-01-27 18:02                   ` Anthony Liguori
  2010-01-27 18:03                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 18:02 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Michael S. Tsirkin, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/27/2010 11:54 AM, Sridhar Samudrala wrote:
> I too think that we should not block raw backend in qemu just because of
> security reasons. It should be perfectly fine to use raw backend in
> scenarios where qemu can be run as a privileged process.
>
> libvirt need not support raw backend until we figure out a secure way to
> start qemu when passing raw fd. using network namespaces seems like a
> good option.
>    

Introducing something that is known to be problematic from a security 
perspective without any clear idea of what the use-case for it is is a 
bad idea IMHO.

Regards,

Anthony Liguori

> Thanks
> Sridhar
>
>    


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 18:02                   ` Anthony Liguori
@ 2010-01-27 18:03                     ` Michael S. Tsirkin
  2010-01-27 19:54                       ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-27 18:03 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On Wed, Jan 27, 2010 at 12:02:34PM -0600, Anthony Liguori wrote:
> On 01/27/2010 11:54 AM, Sridhar Samudrala wrote:
>> I too think that we should not block raw backend in qemu just because of
>> security reasons. It should be perfectly fine to use raw backend in
>> scenarios where qemu can be run as a privileged process.
>>
>> libvirt need not support raw backend until we figure out a secure way to
>> start qemu when passing raw fd. using network namespaces seems like a
>> good option.
>>    
>
> Introducing something that is known to be problematic from a security  
> perspective without any clear idea of what the use-case for it is is a  
> bad idea IMHO.

vepa on existing kernels is one use-case.

> Regards,
>
> Anthony Liguori
>
>> Thanks
>> Sridhar
>>
>>    

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 17:36               ` Anthony Liguori
  2010-01-27 17:54                 ` Sridhar Samudrala
@ 2010-01-27 18:12                 ` Michael S. Tsirkin
  1 sibling, 0 replies; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-27 18:12 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On Wed, Jan 27, 2010 at 11:36:31AM -0600, Anthony Liguori wrote:
> On 01/27/2010 11:25 AM, Michael S. Tsirkin wrote:
>>>   In this case, the full
>>> syntax would be:
>>>
>>> -net vepa,if=eth0
>>>
>>> or
>>>
>>> -net vepa,fd=N
>>>      
>> I still hope it's extbridge, vepa is an acronym that will likely not be
>> known for 99% of users.
>>    
>
> Oh sorry, I don't care about the name at all.  If you prefer extbridge,  
> I'm all for it :-)
>
>>> where N is a macvtap fd.
>>>
>>> For raw, I think there's a real problem wrt security.  I think it's
>>> important that we support running qemu as a non-privileged user.  In
>>> fact, this seems to be the mode libvirt is now preferring to operate in.
>>>
>>> I think we need to re-evaluate the use of any raw socket by qemu as it's
>>> very dangerous from a security perspective (assuming we cannot
>>> introduced a "locked" raw socket mode).
>>>      
>> As was pointed out on netdev and elsewhere this seems to be what
>> namespaces/selinux are there for. Can qemu be run within a namespace and
>> if yes would that address your concerns?
>
> It's unclear to me what this would even involve.  But really, we just  
> want an interface to inject packets directly into a physical device.   

Not only. We also want to program filters by mac/vlan, enable/disable
promisc mode, set mac, maybe more, all this in response to guest
activity so it's not as trivial as doing it in a helper script.  The
patches supplied do not do this and do filtering in userspace but I
trust this is short-term.

> raw sockets give us that but it also gives us way more.  Using network  
> namespaces to restrict this is a bit convoluted.  It seems to me that  
> providing an interface that never gives us way more to start with is  
> better overall from a security perspective.

You are thinking about qemu security, so custom groups and permissions on
character devices and/or suid scripts with custom configuration files
look nice to you.  But think in terms of overall system security. If
you write custom kernel interfaces you end up with an unmanageable
security policy. And a system administrator not being in control of
security policy is very bad for security.

All of the above is basically repeating what others said on netdev.
If you care, please argue on the disablenetwork thread.

> Regards,
>
> Anthony Liguori
>
>>    Security is probably a wrong
>> reason to use character devices: they are much more likely to have
>> security problems than standard interfaces.  Ease of setup/compatibility
>> with tap would be a better reason.
>>
>>    
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>      
>>>>> Regards,
>>>>>
>>>>> Anthony Liguori
>>>>>
>>>>>          

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 18:03                     ` Michael S. Tsirkin
@ 2010-01-27 19:54                       ` Anthony Liguori
  2010-01-28  8:12                         ` Arnd Bergmann
  2010-02-01 15:47                         ` Or Gerlitz
  0 siblings, 2 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-27 19:54 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/27/2010 12:03 PM, Michael S. Tsirkin wrote:
> On Wed, Jan 27, 2010 at 12:02:34PM -0600, Anthony Liguori wrote:
>    
>> On 01/27/2010 11:54 AM, Sridhar Samudrala wrote:
>>      
>>> I too think that we should not block raw backend in qemu just because of
>>> security reasons. It should be perfectly fine to use raw backend in
>>> scenarios where qemu can be run as a privileged process.
>>>
>>> libvirt need not support raw backend until we figure out a secure way to
>>> start qemu when passing raw fd. using network namespaces seems like a
>>> good option.
>>>
>>>        
>> Introducing something that is known to be problematic from a security
>> perspective without any clear idea of what the use-case for it is is a
>> bad idea IMHO.
>>      
> vepa on existing kernels is one use-case.
>    

Considering VEPA enabled hardware doesn't exist today and the standards 
aren't even finished being defined, I don't think it's a really strong 
use case ;-)

Regards,

Anthony Liguori

>> Regards,
>>
>> Anthony Liguori
>>
>>      
>>> Thanks
>>> Sridhar
>>>
>>>
>>>        


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 14:03           ` Anthony Liguori
@ 2010-01-27 21:39             ` Arnd Bergmann
  2010-01-27 22:56               ` Sridhar Samudrala
  0 siblings, 1 reply; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-27 21:39 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Michael S. Tsirkin, Sridhar Samudrala, avi, markmc, ogerlitz,
	kvm, qemu-devel

On Wednesday 27 January 2010, Anthony Liguori wrote:
> >> I think -net socket,fd should just be (trivially) extended to work with raw
> >> sockets out of the box, with no support for opening it. Then you can have
> >> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> >> down.
> >>      
> > That'd work. Anthony?
> 
> The fundamental problem that I have with all of this is that we should 
> not be introducing new network backends that are based around something 
> only a developer is going to understand.  If I'm a user and I want to 
> use an external switch in VEPA mode, how in the world am I going to know 
> that I'm supposed to use the -net raw backend or the -net socket 
> backend?  It might as well be the -net butterflies backend as far as a 
> user is concerned.

My point is that we already have -net socket,fd and any user that passes
an fd into that already knows what he wants to do with it. Making it
work with raw sockets is just a natural extension to this, which works
on all kernels and (with separate namespaces) is reasonably secure.

I fully agree that we should not introduce further network backends
that would confuse users, but making the existing backends more
flexible is something entirely different.

	Arnd

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 21:39             ` Arnd Bergmann
@ 2010-01-27 22:56               ` Sridhar Samudrala
  2010-01-28  6:06                 ` Arnd Bergmann
                                   ` (2 more replies)
  0 siblings, 3 replies; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-27 22:56 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Anthony Liguori, Michael S. Tsirkin, avi, markmc, ogerlitz, kvm,
	qemu-devel

On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> On Wednesday 27 January 2010, Anthony Liguori wrote:
> > >> I think -net socket,fd should just be (trivially) extended to work with raw
> > >> sockets out of the box, with no support for opening it. Then you can have
> > >> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> > >> down.
> > >>      
> > > That'd work. Anthony?
> > 
> > The fundamental problem that I have with all of this is that we should 
> > not be introducing new network backends that are based around something 
> > only a developer is going to understand.  If I'm a user and I want to 
> > use an external switch in VEPA mode, how in the world am I going to know 
> > that I'm supposed to use the -net raw backend or the -net socket 
> > backend?  It might as well be the -net butterflies backend as far as a 
> > user is concerned.
> 
> My point is that we already have -net socket,fd and any user that passes
> an fd into that already knows what he wants to do with it. Making it
> work with raw sockets is just a natural extension to this, which works
> on all kernels and (with separate namespaces) is reasonably secure.

Didn't realize that -net socket is already there and supports TCP and
UDP sockets. I will look into extending -net socket to support AF_PACKET
SOCK_RAW type sockets.

Thanks
Sridhar


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 22:56               ` Sridhar Samudrala
@ 2010-01-28  6:06                 ` Arnd Bergmann
  2010-01-28 16:53                   ` Jens Osterkamp
  2010-01-28 11:22                 ` Or Gerlitz
  2010-01-29 20:52                   ` [Qemu-devel] " Sridhar Samudrala
  2 siblings, 1 reply; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-28  6:06 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Anthony Liguori, Michael S. Tsirkin, avi, markmc, ogerlitz, kvm,
	qemu-devel, Jens Osterkamp

On Wednesday 27 January 2010, Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> > On Wednesday 27 January 2010, Anthony Liguori wrote:
> > > >> I think -net socket,fd should just be (trivially) extended to work with raw
> > > >> sockets out of the box, with no support for opening it. Then you can have
> > > >> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> > > >> down.
> > > >>      
> > > > That'd work. Anthony?
> > > 
> > > The fundamental problem that I have with all of this is that we should 
> > > not be introducing new network backends that are based around something 
> > > only a developer is going to understand.  If I'm a user and I want to 
> > > use an external switch in VEPA mode, how in the world am I going to know 
> > > that I'm supposed to use the -net raw backend or the -net socket 
> > > backend?  It might as well be the -net butterflies backend as far as a 
> > > user is concerned.
> > 
> > My point is that we already have -net socket,fd and any user that passes
> > an fd into that already knows what he wants to do with it. Making it
> > work with raw sockets is just a natural extension to this, which works
> > on all kernels and (with separate namespaces) is reasonably secure.
> 
> Didn't realize that -net socket is already there and supports TCP and
> UDP sockets. I will look into extending -net socket to support AF_PACKET
> SOCK_RAW type sockets.

Actually, Jens had a patch doing this in early 2009 already but we
decided to not send that one out at the time after Or had sent his
version of the raw socket interface, which was a superset. Maybe Jens
can post his patch again if that still applies?

	Arnd

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 19:54                       ` Anthony Liguori
@ 2010-01-28  8:12                         ` Arnd Bergmann
  2010-01-28 13:56                           ` Michael S. Tsirkin
  2010-02-01 15:47                         ` Or Gerlitz
  1 sibling, 1 reply; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-28  8:12 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Michael S. Tsirkin, Sridhar Samudrala, avi, markmc, ogerlitz,
	kvm, qemu-devel

On Wednesday 27 January 2010, Anthony Liguori wrote:
> >>>        
> >> Introducing something that is known to be problematic from a security
> >> perspective without any clear idea of what the use-case for it is is a
> >> bad idea IMHO.
> >>      
> > vepa on existing kernels is one use-case.
> >    
> 
> Considering VEPA enabled hardware doesn't exist today and the standards 
> aren't even finished being defined, I don't think it's a really strong 
> use case ;-)

The hairpin turn (the part that is required on the bridge) was implemented
in the Linux bridge in 2.6.32, so that is one existing implementation you
can use as a peer.

The VEPA mode in macvlan only made it into 2.6.33, so using the raw socket
on older kernels does not give you actual VEPA semantics.

The part of the standard that is still under discussion is the management
side, which is almost entirely unrelated to this question though. With
Linux-2.6.33 on both sides using raw/macvlan and bridge respectively,
you can have a working VEPA setup. The only thing missing is that the
hypervisor will not be able to tell the bridge to automatically enable
hairpin mode (you need to do that on the bridge on a per-port basis).


Now, the most important use case I see for the raw socket interface
in qemu is to get vhost-net and the qemu user implementation to
support the same feature set. If you ask for a network setup involving
a raw socket and vhost-net and the kernel can support raw sockets
but for some reason fails to set up vhost-net, you should have a
fallback that has the exact same semantics at a possibly significant
performance loss.

	Arnd


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 22:56               ` Sridhar Samudrala
  2010-01-28  6:06                 ` Arnd Bergmann
@ 2010-01-28 11:22                 ` Or Gerlitz
  2010-01-29 20:52                   ` [Qemu-devel] " Sridhar Samudrala
  2 siblings, 0 replies; 45+ messages in thread
From: Or Gerlitz @ 2010-01-28 11:22 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Arnd Bergmann, Anthony Liguori, Michael S. Tsirkin, avi, markmc,
	kvm, qemu-devel

Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
>> we already have -net socket,fd and any user that passes an fd into 
>> that already knows what he wants to do with it. Making it work with 
>> raw sockets is just a natural extension to this
> Didn't realize that -net socket is already there and supports TCP and UDP sockets. I will look into extending -net socket to support AF_PACKET SOCK_RAW type sockets
The original thought was that the -raw option would be integrated in a 
pass-through manner, that is, bypassing the qemu vlan (internal bridge). 
This would allow qemu to use the mac address of the SR-IOV (e.g. HW VF, 
software macvlan) NIC as the mac delivered to the VM; in that sense it 
is pretty different from the -net socket option.

Or.

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28  8:12                         ` Arnd Bergmann
@ 2010-01-28 13:56                           ` Michael S. Tsirkin
  2010-01-28 14:13                             ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-28 13:56 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Anthony Liguori, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel

On Thu, Jan 28, 2010 at 09:12:04AM +0100, Arnd Bergmann wrote:
> On Wednesday 27 January 2010, Anthony Liguori wrote:
> > >>>        
> > >> Introducing something that is known to be problematic from a security
> > >> perspective without any clear idea of what the use-case for it is is a
> > >> bad idea IMHO.
> > >>      
> > > vepa on existing kernels is one use-case.
> > >    
> > 
> > Considering VEPA enabled hardware doesn't exist today and the standards 
> > aren't even finished being defined, I don't think it's a really strong 
> > use case ;-)
> 
> The hairpin turn (the part that is required on the bridge) was implemented
> in the Linux bridge in 2.6.32, so that is one existing implementation you
> can use as a peer.
> 
> The VEPA mode in macvlan only made it into 2.6.33, so using the raw socket
> on older kernels does not give you actual VEPA semantics.
> 
> The part of the standard that is still under discussion is the management
> side, which is almost entirely unrelated to this question though. With
> Linux-2.6.33 on both sides using raw/macvlan and bridge respectively,
> you can have a working VEPA setup. The only thing missing is that the
> hypervisor will not be able to tell the bridge to automatically enable
> hairpin mode (you need to do that on the bridge on a per-port basis).
> 
> 
> Now, the most important use case I see for the raw socket interface
> in qemu is to get vhost-net and the qemu user implementation to
> support the same feature set. If you ask for a network setup involving
> a raw socket and vhost-net and the kernel can support raw sockets
> but for some reason fails to set up vhost-net, you should have a
> fallback that has the exact same semantics at a possibly significant
> performance loss.
> 
> 	Arnd

Makes sense. A simple reason you can't do vhost-net would be
that you are using tcg.

-- 
MST

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 13:56                           ` Michael S. Tsirkin
@ 2010-01-28 14:13                             ` Anthony Liguori
  2010-01-28 14:39                               ` Anthony Liguori
  2010-01-28 14:52                               ` Michael S. Tsirkin
  0 siblings, 2 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-28 14:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/28/2010 07:56 AM, Michael S. Tsirkin wrote:
>> Now, the most important use case I see for the raw socket interface
>> in qemu is to get vhost-net and the qemu user implementation to
>> support the same feature set. If you ask for a network setup involving
>> a raw socket and vhost-net and the kernel can support raw sockets
>> but for some reason fails to set up vhost-net, you should have a
>> fallback that has the exact same semantics at a possibly significant
>> performance loss.
>>
>> 	Arnd
>>      
> Makes sense. A simple reason you can't do vhost-net would be
> that you are using tcg.
>    

Some good arguments have been raised in this thread.  I really don't 
like making our security depend on something external to qemu that is 
not widely used or understood.

That said, I'm not seeing a lot of great alternatives.  I definitely 
like -net socket better than -net raw.  In the absence of an 
extraordinarily clever solution, I think we may be stuck with doing this.

Regards,

Anthony Liguori


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 14:13                             ` Anthony Liguori
@ 2010-01-28 14:39                               ` Anthony Liguori
  2010-01-28 14:52                               ` Michael S. Tsirkin
  1 sibling, 0 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-28 14:39 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On 01/28/2010 08:13 AM, Anthony Liguori wrote:
> On 01/28/2010 07:56 AM, Michael S. Tsirkin wrote:
>>> Now, the most important use case I see for the raw socket interface
>>> in qemu is to get vhost-net and the qemu user implementation to
>>> support the same feature set. If you ask for a network setup involving
>>> a raw socket and vhost-net and the kernel can support raw sockets
>>> but for some reason fails to set up vhost-net, you should have a
>>> fallback that has the exact same semantics at a possibly significant
>>> performance loss.
>>>
>>>     Arnd
>> Makes sense. A simple reason you can't do vhost-net would be
>> that you are using tcg.
>
> Some good arguments have been raised in this thread.  I really don't 
> like making our security depend on something external to qemu that is 
> not widely used or understood.

Thinking about it, I don't think network namespaces actually provide us 
with the security that we need.  It's quite easy to break out of them if 
they are not being used in the context of a full container.

But this discussion belongs in netdev, I'll raise the issue there.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 14:13                             ` Anthony Liguori
  2010-01-28 14:39                               ` Anthony Liguori
@ 2010-01-28 14:52                               ` Michael S. Tsirkin
  2010-01-28 15:05                                 ` Anthony Liguori
  1 sibling, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-28 14:52 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm, qemu-devel

On Thu, Jan 28, 2010 at 08:13:53AM -0600, Anthony Liguori wrote:
> On 01/28/2010 07:56 AM, Michael S. Tsirkin wrote:
>>> Now, the most important use case I see for the raw socket interface
>>> in qemu is to get vhost-net and the qemu user implementation to
>>> support the same feature set. If you ask for a network setup involving
>>> a raw socket and vhost-net and the kernel can support raw sockets
>>> but for some reason fails to set up vhost-net, you should have a
>>> fallback that has the exact same semantics at a possibly significant
>>> performance loss.
>>>
>>> 	Arnd
>>>      
>> Makes sense. A simple reason you can't do vhost-net would be
>> that you are using tcg.
>>    
>
> Some good arguments have been raised in this thread.  I really don't  
> like making our security depend on something external to qemu that is  
> not widely used or understood.
>
> That said, I'm not seeing a lot of great alternatives.  I definitely  
> like -net socket better than -net raw.  In the absence of an  
> extraordinarily clever solution, I think we may be stuck with doing this.

Agreed on all points.

> Regards,
>
> Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 14:52                               ` Michael S. Tsirkin
@ 2010-01-28 15:05                                 ` Anthony Liguori
  2010-01-28 16:37                                   ` Michael S. Tsirkin
  2010-01-28 20:29                                   ` Arnd Bergmann
  0 siblings, 2 replies; 45+ messages in thread
From: Anthony Liguori @ 2010-01-28 15:05 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel, Chris Wright

On 01/28/2010 08:52 AM, Michael S. Tsirkin wrote:
> On Thu, Jan 28, 2010 at 08:13:53AM -0600, Anthony Liguori wrote:
>    
>> On 01/28/2010 07:56 AM, Michael S. Tsirkin wrote:
>>      
>>>> Now, the most important use case I see for the raw socket interface
>>>> in qemu is to get vhost-net and the qemu user implementation to
>>>> support the same feature set. If you ask for a network setup involving
>>>> a raw socket and vhost-net and the kernel can support raw sockets
>>>> but for some reason fails to set up vhost-net, you should have a
>>>> fallback that has the exact same semantics at a possibly significant
>>>> performance loss.
>>>>
>>>> 	Arnd
>>>>
>>>>          
>>> Makes sense. A simple reason you can't do vhost-net would be
>>> that you are using tcg.
>>>
>>>        
>> Some good arguments have been raised in this thread.  I really don't
>> like making our security depend on something external to qemu that is
>> not widely used or understood.
>>
>> That said, I'm not seeing a lot of great alternatives.  I definitely
>> like -net socket better than -net raw.  In the absence of an
>> extraordinarily clever solution, I think we may be stuck with doing this.
>>      
> Agreed on all points.
>    

The scenario I'm concerned about is:

A normal user uses libvirt to launch a custom qemu instance.  libvirt passes 
an fd of a raw socket to qemu and puts the qemu process in a restricted 
network namespace.  The user has another program running, listening on a unix 
domain socket, and does something to the qemu process that causes it to 
open the domain socket and send the fd it received from libvirt via 
SCM_RIGHTS.

user now has a raw socket that is not confined to a network namespace.

I'm trying to digest the disablenetwork thread right now.  Basically 
though, what would be ideal is a /dev/net/ethN that we could open, and 
use read/write to send packets to and use ioctls to issue commands to do 
things like enable/disable offloads.

I understand that raw socket is the interface we have today but I think 
we aren't going to be able to get around the need for a restricted file 
descriptor vs. using process restrictions to achieve isolation.

Regards,

Anthony Liguori

>> Regards,
>>
>> Anthony Liguori
>>      


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 15:05                                 ` Anthony Liguori
@ 2010-01-28 16:37                                   ` Michael S. Tsirkin
  2010-01-28 17:58                                     ` Anthony Liguori
  2010-01-28 20:29                                   ` Arnd Bergmann
  1 sibling, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-28 16:37 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel, Chris Wright

On Thu, Jan 28, 2010 at 09:05:45AM -0600, Anthony Liguori wrote:
> On 01/28/2010 08:52 AM, Michael S. Tsirkin wrote:
>> On Thu, Jan 28, 2010 at 08:13:53AM -0600, Anthony Liguori wrote:
>>    
>>> On 01/28/2010 07:56 AM, Michael S. Tsirkin wrote:
>>>      
>>>>> Now, the most important use case I see for the raw socket interface
>>>>> in qemu is to get vhost-net and the qemu user implementation to
>>>>> support the same feature set. If you ask for a network setup involving
>>>>> a raw socket and vhost-net and the kernel can support raw sockets
>>>>> but for some reason fails to set up vhost-net, you should have a
>>>>> fallback that has the exact same semantics at a possibly significant
>>>>> performance loss.
>>>>>
>>>>> 	Arnd
>>>>>
>>>>>          
>>>> Makes sense. A simple reason you can't do vhost-net would be
>>>> that you are using tcg.
>>>>
>>>>        
>>> Some good arguments have been raised in this thread.  I really don't
>>> like making our security depend on something external to qemu that is
>>> not widely used or understood.
>>>
>>> That said, I'm not seeing a lot of great alternatives.  I definitely
>>> like -net socket better than -net raw.  In the absence of an
>>> extraordinarily clever solution, I think we may be stuck with doing this.
>>>      
>> Agreed on all points.
>>    
>
> The scenario I'm concerned about is:
>
> normal user uses libvirt to launch custom qemu instance.  libvirt passes  
> an fd of a raw socket to qemu and puts the qemu process in a restricted  
> network namespace.  user has another program running listening on a unix  
> domain socket and does something to the qemu process that causes it to  
> open the domain socket and send the fd it received from libvirt via  
> SCM_RIGHTS.
>
> user now has a raw socket that is not confined to a network namespace.
>
> I'm trying to digest the disablenetwork thread right now.  Basically  
> though, what would be ideal is a /dev/net/ethN that we could open, and  
> use read/write to send packets to and use ioctls to issue commands to do  
> things like enable/disable offloads.
>
> I understand that raw socket is the interface we have today but I think  
> we aren't going to be able to get around the need for a restricted file  
> descriptor vs. using process restrictions to achieve isolation.


So actually, this is an interesting argument in favor of
turning disablenetwork from per-process as it is now
to per-file.


> Regards,
>
> Anthony Liguori
>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28  6:06                 ` Arnd Bergmann
@ 2010-01-28 16:53                   ` Jens Osterkamp
  0 siblings, 0 replies; 45+ messages in thread
From: Jens Osterkamp @ 2010-01-28 16:53 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Sridhar Samudrala, Anthony Liguori, Michael S. Tsirkin, avi,
	markmc, ogerlitz, kvm, qemu-devel

On Thursday 28 January 2010, Arnd Bergmann wrote:
> On Wednesday 27 January 2010, Sridhar Samudrala wrote:
> > On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> > > On Wednesday 27 January 2010, Anthony Liguori wrote:
> > > > >> I think -net socket,fd should just be (trivially) extended to work
> > > > >> with raw sockets out of the box, with no support for opening it.
> > > > >> Then you can have libvirt or some wrapper open a raw socket and a
> > > > >> private namespace and just pass it down.
> > > > >
> > > > > That'd work. Anthony?
> > > >
> > > > The fundamental problem that I have with all of this is that we
> > > > should not be introducing new network backends that are based around
> > > > something only a developer is going to understand.  If I'm a user and
> > > > I want to use an external switch in VEPA mode, how in the world am I
> > > > going to know that I'm supposed to use the -net raw backend or the
> > > > -net socket backend?  It might as well be the -net butterflies
> > > > backend as far as a user is concerned.
> > >
> > > My point is that we already have -net socket,fd and any user that
> > > passes an fd into that already knows what he wants to do with it.
> > > Making it work with raw sockets is just a natural extension to this,
> > > which works on all kernels and (with separate namespaces) is reasonably
> > > secure.
> >
> > Didn't realize that -net socket is already there and supports TCP and
> > UDP sockets. I will look into extending -net socket to support AF_PACKET
> > SOCK_RAW type sockets.
>
> Actually, Jens had a patch doing this in early 2009 already but we
> decided to not send that one out at the time after Or had sent his
> version of the raw socket interface, which was a superset. Maybe Jens
> can post his patch again if that still applies?

It's been a while since I last looked at it. I think it will need a bit of
massaging before it will apply again.

Jens


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 16:37                                   ` Michael S. Tsirkin
@ 2010-01-28 17:58                                     ` Anthony Liguori
  2010-01-28 18:04                                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-28 17:58 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel, Chris Wright

On 01/28/2010 10:37 AM, Michael S. Tsirkin wrote:
> So actually, this is an interesting argument in favor of
> turning disablenetwork from per-process as it is now
> to per-file.
>    

Yup.  I think we really need a file-based restriction mechanism and so 
far, neither disablenetwork nor network namespace seems to do that.

I think you might be able to mitigate this with SELinux since I'm fairly 
certain it can prevent SCM_RIGHTS but SELinux is not something that can 
be enforced within a set of applications so we'd be relying on SELinux 
being enabled (honestly, unlikely) and the policy being correctly 
configured (unlikely in the general case at least).

Regards,

Anthony Liguori

>    
>> Regards,
>>
>> Anthony Liguori
>>
>>      
>>>> Regards,
>>>>
>>>> Anthony Liguori
>>>>
>>>>          


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 17:58                                     ` Anthony Liguori
@ 2010-01-28 18:04                                       ` Michael S. Tsirkin
  2010-01-28 19:57                                         ` Anthony Liguori
  0 siblings, 1 reply; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-28 18:04 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel, Chris Wright

On Thu, Jan 28, 2010 at 11:58:48AM -0600, Anthony Liguori wrote:
> On 01/28/2010 10:37 AM, Michael S. Tsirkin wrote:
>> So actually, this is an interesting argument in favor of
>> turning disablenetwork from per-process as it is now
>> to per-file.
>>    
>
> Yup.  I think we really need a file-based restriction mechanism and so  
> far, neither disablenetwork or network namespace seems to do that.
>
> I think you might be able to mitigate this with SELinux since I'm fairly  
> certain it can prevent SCM_RIGHTS but SELinux is not something that can  
> be enforced within a set of applications so we'd be relying on SELinux  
> being enabled (honestly, unlikely) and the policy being correctly  
> configured (unlikely in the general case at least).
>
> Regards,
>
> Anthony Liguori

I am not convinced SELinux being disabled is a problem we necessarily
need to deal with, and qemu does not verify e.g. that it is not run as
root either. A more serious problem IMO is that SCM_RIGHTS might be
needed for some other functionality.

>>    
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>      
>>>>> Regards,
>>>>>
>>>>> Anthony Liguori
>>>>>
>>>>>          

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 18:04                                       ` Michael S. Tsirkin
@ 2010-01-28 19:57                                         ` Anthony Liguori
  2010-01-29 11:26                                           ` Michael S. Tsirkin
  0 siblings, 1 reply; 45+ messages in thread
From: Anthony Liguori @ 2010-01-28 19:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel, Chris Wright, Daniel P. Berrange

On 01/28/2010 12:04 PM, Michael S. Tsirkin wrote:
> On Thu, Jan 28, 2010 at 11:58:48AM -0600, Anthony Liguori wrote:
>    
>> On 01/28/2010 10:37 AM, Michael S. Tsirkin wrote:
>>      
>>> So actually, this is an interesting argument in favor of
>>> turning disablenetwork from per-process as it is now
>>> to per-file.
>>>
>>>        
>> Yup.  I think we really need a file-based restriction mechanism and so
>> far, neither disablenetwork or network namespace seems to do that.
>>
>> I think you might be able to mitigate this with SELinux since I'm fairly
>> certain it can prevent SCM_RIGHTS but SELinux is not something that can
>> be enforced within a set of applications so we'd be relying on SELinux
>> being enabled (honestly, unlikely) and the policy being correctly
>> configured (unlikely in the general case at least).
>>
>> Regards,
>>
>> Anthony Liguori
>>      
> I am not convinced SELinux being disabled is a problem we necessarily
> need to deal with, and qemu does not verify e.g. that it is not run as
> root either. A more serious problem IMO is that SCM_RIGHTS might be
> needed for some other functionality.
>    

It would mean that libvirt is insecure unless SELinux is enabled.  
That's a pretty fundamental flaw IMHO.

At any rate, I think we both agree that we need to figure out a 
solution, so that's good :-)

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 15:05                                 ` Anthony Liguori
  2010-01-28 16:37                                   ` Michael S. Tsirkin
@ 2010-01-28 20:29                                   ` Arnd Bergmann
  1 sibling, 0 replies; 45+ messages in thread
From: Arnd Bergmann @ 2010-01-28 20:29 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Michael S. Tsirkin, Sridhar Samudrala, avi, markmc, ogerlitz,
	kvm, qemu-devel, Chris Wright

On Thursday 28 January 2010, Anthony Liguori wrote:
> normal user uses libvirt to launch custom qemu instance.  libvirt passes 
> an fd of a raw socket to qemu and puts the qemu process in a restricted 
> network namespace.  user has another program running listening on a unix 
> domain socket and does something to the qemu process that causes it to 
> open the domain socket and send the fd it received from libvirt via 
> SCM_RIGHTS.

I looked at the af_unix code and it seems to suggest that this is not
possible, because you cannot bind to a socket that belongs to a different
network namespace. I haven't tried it though, so I may have missed
something.

	Arnd


^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-28 19:57                                         ` Anthony Liguori
@ 2010-01-29 11:26                                           ` Michael S. Tsirkin
  0 siblings, 0 replies; 45+ messages in thread
From: Michael S. Tsirkin @ 2010-01-29 11:26 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Arnd Bergmann, Sridhar Samudrala, avi, markmc, ogerlitz, kvm,
	qemu-devel, Chris Wright, Daniel P. Berrange

On Thu, Jan 28, 2010 at 01:57:33PM -0600, Anthony Liguori wrote:
> On 01/28/2010 12:04 PM, Michael S. Tsirkin wrote:
>> On Thu, Jan 28, 2010 at 11:58:48AM -0600, Anthony Liguori wrote:
>>    
>>> On 01/28/2010 10:37 AM, Michael S. Tsirkin wrote:
>>>      
>>>> So actually, this is an interesting argument in favor of
>>>> turning disablenetwork from per-process as it is now
>>>> to per-file.
>>>>
>>>>        
>>> Yup.  I think we really need a file-based restriction mechanism and so
>>> far, neither disablenetwork or network namespace seems to do that.
>>>
>>> I think you might be able to mitigate this with SELinux since I'm fairly
>>> certain it can prevent SCM_RIGHTS but SELinux is not something that can
>>> be enforced within a set of applications so we'd be relying on SELinux
>>> being enabled (honestly, unlikely) and the policy being correctly
>>> configured (unlikely in the general case at least).
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      
>> I am not convinced SELinux being disabled is a problem we necessarily
>> need to deal with, and qemu does not verify e.g. that it is not run as
>> root either. A more serious problem IMO is that SCM_RIGHTS might be
>> needed for some other functionality.
>>    
>
> It would mean that libvirt is insecure unless SELinux is enabled.   
> That's a pretty fundamental flaw IMHO.
>
> At any rate, I think we both agree that we need to figure out a  
> solution, so that's good :-)
>
> Regards,
>
> Anthony Liguori

Yes, but I am still not sure the problem is real. Pls discuss on netdev.

-- 
MST

^ permalink raw reply	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 22:56               ` Sridhar Samudrala
@ 2010-01-29 20:52                   ` Sridhar Samudrala
  2010-01-28 11:22                 ` Or Gerlitz
  2010-01-29 20:52                   ` [Qemu-devel] " Sridhar Samudrala
  2 siblings, 0 replies; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-29 20:52 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Anthony Liguori, Michael S. Tsirkin, avi, markmc, ogerlitz, kvm,
	qemu-devel, vivk

On Wed, 2010-01-27 at 14:56 -0800, Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> > On Wednesday 27 January 2010, Anthony Liguori wrote:
> > > >> I think -net socket,fd should just be (trivially) extended to work with raw
> > > >> sockets out of the box, with no support for opening it. Then you can have
> > > >> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> > > >> down.
> > > >>      
> > > > That'd work. Anthony?
> > > 
> > > The fundamental problem that I have with all of this is that we should 
> > > not be introducing new network backends that are based around something 
> > > only a developer is going to understand.  If I'm a user and I want to 
> > > use an external switch in VEPA mode, how in the world am I going to know 
> > > that I'm supposed to use the -net raw backend or the -net socket 
> > > backend?  It might as well be the -net butterflies backend as far as a 
> > > user is concerned.
> > 
> > My point is that we already have -net socket,fd and any user that passes
> > an fd into that already knows what he wants to do with it. Making it
> > work with raw sockets is just a natural extension to this, which works
> > on all kernels and (with separate namespaces) is reasonably secure.
> 
> Didn't realize that -net socket is already there and supports TCP and
> UDP sockets. I will look into extending -net socket to support AF_PACKET
> SOCK_RAW type sockets.

OK. Here is a patch that adds AF_PACKET SOCK_RAW support to the -netdev socket
backend. It allows specifying either an already opened raw fd or an ifname to
which a raw socket can be bound.

   -netdev socket,fd=X,id=str
   -netdev socket,ifname=<ethX/macvlanX>,id=str

However, I found that struct NetSocketState doesn't include all the state info that
is required to support AF_PACKET raw sockets. So I had to add NetSocketRawState
and also couldn't re-use much of the code.

I think the -net socket backend is more geared towards AF_INET sockets. Adding support
for a new socket family doesn't fit nicely with the existing code.
But if this approach is more acceptable than a new -net raw,fd backend, I am fine
with it.

Thanks
Sridhar

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index eba578a..7d62dd9 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -15,6 +15,7 @@
 #include "net.h"
 #include "net/checksum.h"
 #include "net/tap.h"
+#include "net/socket.h"
 #include "qemu-timer.h"
 #include "virtio-net.h"
 
@@ -133,6 +134,9 @@ static int peer_has_vnet_hdr(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        n->has_vnet_hdr = sock_raw_has_vnet_hdr(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -149,6 +153,9 @@ static int peer_has_ufo(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_ufo = tap_has_ufo(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        n->has_ufo = sock_raw_has_ufo(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -165,6 +172,9 @@ static void peer_using_vnet_hdr(VirtIONet *n, int using_vnet_hdr)
     case NET_CLIENT_TYPE_TAP:
         tap_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        sock_raw_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
+        break;
     default:
         break; 
     }
@@ -180,6 +190,9 @@ static void peer_set_offload(VirtIONet *n, int csum, int tso4, int tso6,
     case NET_CLIENT_TYPE_TAP:
         tap_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        sock_raw_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
+        break;
     default:
         break; 
     }
diff --git a/net.c b/net.c
index 6ef93e6..3d25d64 100644
--- a/net.c
+++ b/net.c
@@ -1002,6 +1002,11 @@ static struct {
                 .type = QEMU_OPT_STRING,
                 .help = "UDP multicast address and port number",
             },
+            {
+                .name = "ifname",
+                .type = QEMU_OPT_STRING,
+                .help = "interface name",
+            },
             { /* end of list */ }
         },
 #ifdef CONFIG_VDE
diff --git a/net.h b/net.h
index 116bb80..74b3e69 100644
--- a/net.h
+++ b/net.h
@@ -34,7 +34,8 @@ typedef enum {
     NET_CLIENT_TYPE_TAP,
     NET_CLIENT_TYPE_SOCKET,
     NET_CLIENT_TYPE_VDE,
-    NET_CLIENT_TYPE_DUMP
+    NET_CLIENT_TYPE_DUMP,
+    NET_CLIENT_TYPE_SOCKET_RAW,
 } net_client_type;
 
 typedef void (NetPoll)(VLANClientState *, bool enable);
diff --git a/net/socket.c b/net/socket.c
index 5533737..56f5bad 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -32,6 +32,327 @@
 #include "qemu_socket.h"
 #include "sysemu.h"
 
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+/* Maximum GSO packet size (64k) plus plenty of room for
+ * the ethernet and virtio_net headers
+ */
+#define RAW_BUFSIZE (4096 + 65536)
+
+typedef struct NetSocketRawState {
+    VLANClientState nc;
+    int fd;
+    uint8_t buf[RAW_BUFSIZE];
+    int promisc;
+    unsigned int read_poll:1;
+    unsigned int write_poll:1;
+    unsigned int has_vnet_hdr:1;
+    unsigned int using_vnet_hdr:1;	
+    unsigned int has_ufo:1;
+} NetSocketRawState;
+
+struct virtio_net_hdr
+{
+    uint8_t flags;
+    uint8_t gso_type;
+    uint16_t hdr_len;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+static int sock_raw_can_send(void *opaque);
+static void sock_raw_send(void *opaque);
+static void sock_raw_writable(void *opaque);
+
+static void sock_raw_update_fd_handler(NetSocketRawState *s)
+{
+    qemu_set_fd_handler2(s->fd,
+                         s->read_poll  ? sock_raw_can_send : NULL,
+                         s->read_poll  ? sock_raw_send     : NULL,
+                         s->write_poll ? sock_raw_writable : NULL,
+                         s);
+}
+
+static void sock_raw_read_poll(NetSocketRawState *s, int enable)
+{
+    s->read_poll = !!enable;
+    sock_raw_update_fd_handler(s);
+}
+
+static void sock_raw_write_poll(NetSocketRawState *s, int enable)
+{
+    s->write_poll = !!enable;
+    sock_raw_update_fd_handler(s);
+}
+
+static void sock_raw_writable(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+
+    sock_raw_write_poll(s, 0);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+static ssize_t sock_raw_write_packet(NetSocketRawState *s,
+                                     const struct iovec *iov,
+                                     int iovcnt)
+{
+    ssize_t len;
+
+    do {
+        len = writev(s->fd, iov, iovcnt);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1 && errno == EAGAIN) {
+        sock_raw_write_poll(s, 1);
+        return 0;
+    }
+
+    if (len == -1)
+        printf("raw_write_packet: errno:%d\n", errno);
+
+    return len;
+}
+
+static ssize_t sock_raw_receive_iov(VLANClientState *nc,
+                                    const struct iovec *iov,
+                                    int iovcnt)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    const struct iovec *iovp = iov;
+    struct iovec iov_copy[iovcnt + 1];
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+        iov_copy[0].iov_base = &hdr;
+        iov_copy[0].iov_len =  sizeof(hdr);
+        memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
+        iovp = iov_copy;
+        iovcnt++;
+    }
+
+    return sock_raw_write_packet(s, iovp, iovcnt);
+}
+
+static ssize_t sock_raw_receive_raw(VLANClientState *nc, const uint8_t *buf,
+                                    size_t size)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct iovec iov[2];
+    int iovcnt = 0;
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr) {
+        iov[iovcnt].iov_base = &hdr;
+        iov[iovcnt].iov_len  = sizeof(hdr);
+        iovcnt++;
+    }
+
+    iov[iovcnt].iov_base = (char *)buf;
+    iov[iovcnt].iov_len  = size;
+    iovcnt++;
+
+    return sock_raw_write_packet(s, iov, iovcnt);
+}
+
+static ssize_t sock_raw_receive(VLANClientState *nc, const uint8_t *buf,
+                                size_t size)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct iovec iov[1];
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr)
+        return sock_raw_receive_raw(nc, buf, size);
+
+    iov[0].iov_base = (char *)buf;
+    iov[0].iov_len  = size;
+
+    return sock_raw_write_packet(s, iov, 1);
+}
+
+static int sock_raw_can_send(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+
+    return qemu_can_send_packet(&s->nc);
+}
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags)
+{
+    int ret;
+
+    ret = recv(fd, buf, maxlen, flags);
+    return ret;
+}
+
+static void sock_raw_send_completed(VLANClientState *nc, ssize_t len)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    sock_raw_read_poll(s, 1);
+}
+
+static void sock_raw_send(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+    int size;
+
+    do {
+        uint8_t *buf = s->buf;
+
+        size = sock_raw_read_packet(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC);
+        if (size <= 0)
+            break;
+
+        if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+            buf  += sizeof(struct virtio_net_hdr);
+            size -= sizeof(struct virtio_net_hdr);
+        }
+
+        size = qemu_send_packet_async(&s->nc, buf, size,
+                                      sock_raw_send_completed);
+        if (size == 0)
+            sock_raw_read_poll(s, 0);
+
+    } while (size > 0 && qemu_can_send_packet(&s->nc));
+}
+
+int sock_raw_has_ufo(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+    return s->has_ufo;
+}
+
+int sock_raw_has_vnet_hdr(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+    return s->has_vnet_hdr;
+}
+
+void sock_raw_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    using_vnet_hdr = using_vnet_hdr != 0;
+
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+    assert(s->has_vnet_hdr == using_vnet_hdr);
+
+    s->using_vnet_hdr = using_vnet_hdr;
+}
+
+void sock_raw_set_offload(VLANClientState *nc, int csum, int tso4,
+                     int tso6, int ecn, int ufo)
+{
+    return;
+}
+
+static void sock_raw_cleanup(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    sock_raw_read_poll(s, 0);
+    sock_raw_write_poll(s, 0);
+    close(s->fd);
+}
+
+int sock_raw_probe_vnet_hdr(int fd)
+{
+    int val, len;
+	
+    len = sizeof(val);
+    if (getsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, (socklen_t *)&len)
+                                                                         == 0) 
+        return 1;	
+	
+    return 0;
+}
+
+static NetClientInfo net_raw_info = {
+    .type = NET_CLIENT_TYPE_SOCKET_RAW,
+    .size = sizeof(NetSocketRawState),
+    .receive = sock_raw_receive,
+    .receive_raw = NULL,
+    .receive_iov = sock_raw_receive_iov,
+    .cleanup = sock_raw_cleanup,
+};
+
+
+static NetSocketRawState *net_socket_fd_init_raw(VLANState *vlan,
+                                                 const char *model,
+                                                 const char *name, int fd)
+{
+    VLANClientState *nc;
+    NetSocketRawState *s;
+
+    nc = qemu_new_net_client(&net_raw_info, vlan, NULL, model, name);
+
+    s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    s->fd = fd;
+    s->has_vnet_hdr = sock_raw_probe_vnet_hdr(fd);
+    s->using_vnet_hdr = 0;
+    s->has_ufo = 1;
+    sock_raw_read_poll(s, 1);
+
+    return s;
+}
+
+static int net_socket_raw_ifname_init(VLANState *vlan, const char *model,
+				      const char *name, const char *ifname)
+{
+    struct ifreq req;
+    int fd, ret;
+    struct sockaddr_ll lladdr;
+    int val;
+
+    fd = qemu_socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+    if (fd < 0)
+        fprintf(stderr, "packet socket failed\n");
+
+    memset(&req, 0, sizeof(req));
+    strncpy(req.ifr_name, ifname, IFNAMSIZ-1);
+    ret = ioctl(fd, SIOCGIFINDEX, &req);
+    if (ret < 0)
+        fprintf(stderr, "SIOCGIFINDEX failed\n");
+
+    memset(&lladdr, 0, sizeof(lladdr));
+    lladdr.sll_family   = AF_PACKET;
+    lladdr.sll_protocol = htons(ETH_P_ALL);
+    lladdr.sll_ifindex  = req.ifr_ifindex;
+    ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr));
+    if (ret < 0)
+        fprintf(stderr, "bind failed\n");
+
+    val = 1;
+    ret=setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, (const char *)&val,
+                   sizeof(val));
+    if (ret < 0) {
+        fprintf(stderr, "setsockopt(SOL_PACKET, PACKET_VNET_HDR) failed\n");
+    } 
+
+    ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);
+    if (ret < 0)
+        fprintf(stderr, "fcntl(O_NONBLOCK) set failed\n");
+
+    net_socket_fd_init_raw(vlan, model, name, fd);
+
+    return 0;
+}
+
+
 typedef struct NetSocketState {
     VLANClientState nc;
     int fd;
@@ -337,6 +658,8 @@ static NetSocketState *net_socket_fd_init(VLANState *vlan,
         return net_socket_fd_init_dgram(vlan, model, name, fd, is_connected);
     case SOCK_STREAM:
         return net_socket_fd_init_stream(vlan, model, name, fd, is_connected);
+    case SOCK_RAW:
+        return (struct NetSocketState *)net_socket_fd_init_raw(vlan, model, name, fd);
     default:
         /* who knows ... this could be a eg. a pty, do warn and continue as stream */
         fprintf(stderr, "qemu: warning: socket type=%d for fd=%d is not SOCK_DGRAM or SOCK_STREAM\n", so_type, fd);
@@ -519,6 +842,22 @@ int net_init_socket(QemuOpts *opts,
             close(fd);
             return -1;
         }
+    } else if (qemu_opt_get(opts, "ifname")) {
+        const char *ifname;
+
+        if (qemu_opt_get(opts, "fd") ||
+            qemu_opt_get(opts, "connect") ||
+            qemu_opt_get(opts, "listen") ||
+            qemu_opt_get(opts, "mcast")) {
+            qemu_error("fd=, connect= and mcast= and listen= is invalid with ifname=\n");
+            return -1;
+        }
+
+        ifname = qemu_opt_get(opts, "ifname");
+
+        if (net_socket_raw_ifname_init(vlan, "socket", name, ifname) == -1) {
+            return -1;
+        }
     } else if (qemu_opt_get(opts, "listen")) {
         const char *listen;
 
diff --git a/net/socket.h b/net/socket.h
index ea46f02..cc09866 100644
--- a/net/socket.h
+++ b/net/socket.h
@@ -30,4 +30,13 @@
 int net_init_socket(QemuOpts *opts, Monitor *mon,
                     const char *name, VLANState *vlan);
 
+#define PACKET_VNET_HDR	15
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags);
+int sock_raw_has_ufo(VLANClientState *vc);
+int sock_raw_has_vnet_hdr(VLANClientState *vc);
+void sock_raw_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr);
+int sock_raw_probe_vnet_hdr(int fd);
+void sock_raw_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, int ecn, int ufo);
+
 #endif /* QEMU_NET_SOCKET_H */



^ permalink raw reply related	[flat|nested] 45+ messages in thread

* [Qemu-devel] Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
@ 2010-01-29 20:52                   ` Sridhar Samudrala
  0 siblings, 0 replies; 45+ messages in thread
From: Sridhar Samudrala @ 2010-01-29 20:52 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: markmc, vivk, kvm, Michael S. Tsirkin, qemu-devel, ogerlitz, avi

On Wed, 2010-01-27 at 14:56 -0800, Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> > On Wednesday 27 January 2010, Anthony Liguori wrote:
> > > >> I think -net socket,fd should just be (trivially) extended to work with raw
> > > >> sockets out of the box, with no support for opening it. Then you can have
> > > >> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> > > >> down.
> > > >>      
> > > > That'd work. Anthony?
> > > 
> > > The fundamental problem that I have with all of this is that we should 
> > > not be introducing new network backends that are based around something 
> > > only a developer is going to understand.  If I'm a user and I want to 
> > > use an external switch in VEPA mode, how in the world am I going to know 
> > > that I'm supposed to use the -net raw backend or the -net socket 
> > > backend?  It might as well be the -net butterflies backend as far as a 
> > > user is concerned.
> > 
> > My point is that we already have -net socket,fd and any user that passes
> > an fd into that already knows what he wants to do with it. Making it
> > work with raw sockets is just a natural extension to this, which works
> > on all kernels and (with separate namespaces) is reasonably secure.
> 
> Didn't realize that -net socket is already there and supports TCP and
> UDP sockets. I will look into extending -net socket to support AF_PACKET
> SOCK_RAW type sockets.

OK. Here is a patch that adds AF_PACKET SOCK_RAW support to the -netdev socket
backend. It allows specifying either an already opened raw fd or an ifname to
which a raw socket can be bound.

   -netdev socket,fd=X,id=str
   -netdev socket,ifname=<ethX/macvlanX>,id=str

However, I found that struct NetSocketState doesn't include all the state info that
is required to support AF_PACKET raw sockets. So I had to add NetSocketRawState
and also couldn't re-use much of the code.

I think the -net socket backend is more geared towards AF_INET sockets. Adding support
for a new socket family doesn't fit nicely with the existing code.
But if this approach is more acceptable than a new -net raw,fd backend, I am fine
with it.

Thanks
Sridhar

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index eba578a..7d62dd9 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -15,6 +15,7 @@
 #include "net.h"
 #include "net/checksum.h"
 #include "net/tap.h"
+#include "net/socket.h"
 #include "qemu-timer.h"
 #include "virtio-net.h"
 
@@ -133,6 +134,9 @@ static int peer_has_vnet_hdr(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        n->has_vnet_hdr = sock_raw_has_vnet_hdr(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -149,6 +153,9 @@ static int peer_has_ufo(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_ufo = tap_has_ufo(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        n->has_ufo = sock_raw_has_ufo(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -165,6 +172,9 @@ static void peer_using_vnet_hdr(VirtIONet *n, int using_vnet_hdr)
     case NET_CLIENT_TYPE_TAP:
         tap_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        sock_raw_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
+        break;
     default:
         break; 
     }
@@ -180,6 +190,9 @@ static void peer_set_offload(VirtIONet *n, int csum, int tso4, int tso6,
     case NET_CLIENT_TYPE_TAP:
         tap_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        sock_raw_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
+        break;
     default:
         break; 
     }
diff --git a/net.c b/net.c
index 6ef93e6..3d25d64 100644
--- a/net.c
+++ b/net.c
@@ -1002,6 +1002,11 @@ static struct {
                 .type = QEMU_OPT_STRING,
                 .help = "UDP multicast address and port number",
             },
+            {
+                .name = "ifname",
+                .type = QEMU_OPT_STRING,
+                .help = "interface name",
+            },
             { /* end of list */ }
         },
 #ifdef CONFIG_VDE
diff --git a/net.h b/net.h
index 116bb80..74b3e69 100644
--- a/net.h
+++ b/net.h
@@ -34,7 +34,8 @@ typedef enum {
     NET_CLIENT_TYPE_TAP,
     NET_CLIENT_TYPE_SOCKET,
     NET_CLIENT_TYPE_VDE,
-    NET_CLIENT_TYPE_DUMP
+    NET_CLIENT_TYPE_DUMP,
+    NET_CLIENT_TYPE_SOCKET_RAW,
 } net_client_type;
 
 typedef void (NetPoll)(VLANClientState *, bool enable);
diff --git a/net/socket.c b/net/socket.c
index 5533737..56f5bad 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -32,6 +32,327 @@
 #include "qemu_socket.h"
 #include "sysemu.h"
 
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+/* Maximum GSO packet size (64k) plus plenty of room for
+ * the ethernet and virtio_net headers
+ */
+#define RAW_BUFSIZE (4096 + 65536)
+
+typedef struct NetSocketRawState {
+    VLANClientState nc;
+    int fd;
+    uint8_t buf[RAW_BUFSIZE];
+    int promisc;
+    unsigned int read_poll:1;
+    unsigned int write_poll:1;
+    unsigned int has_vnet_hdr:1;
+    unsigned int using_vnet_hdr:1;	
+    unsigned int has_ufo:1;
+} NetSocketRawState;
+
+struct virtio_net_hdr
+{
+    uint8_t flags;
+    uint8_t gso_type;
+    uint16_t hdr_len;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+static int sock_raw_can_send(void *opaque);
+static void sock_raw_send(void *opaque);
+static void sock_raw_writable(void *opaque);
+
+static void sock_raw_update_fd_handler(NetSocketRawState *s)
+{
+    qemu_set_fd_handler2(s->fd,
+                         s->read_poll  ? sock_raw_can_send : NULL,
+                         s->read_poll  ? sock_raw_send     : NULL,
+                         s->write_poll ? sock_raw_writable : NULL,
+                         s);
+}
+
+static void sock_raw_read_poll(NetSocketRawState *s, int enable)
+{
+    s->read_poll = !!enable;
+    sock_raw_update_fd_handler(s);
+}
+
+static void sock_raw_write_poll(NetSocketRawState *s, int enable)
+{
+    s->write_poll = !!enable;
+    sock_raw_update_fd_handler(s);
+}
+
+static void sock_raw_writable(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+
+    sock_raw_write_poll(s, 0);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+static ssize_t sock_raw_write_packet(NetSocketRawState *s,
+                                     const struct iovec *iov,
+                                     int iovcnt)
+{
+    ssize_t len;
+
+    do {
+        len = writev(s->fd, iov, iovcnt);
+    } while (len == -1 && errno == EINTR);
+
+    if (len == -1 && errno == EAGAIN) {
+        sock_raw_write_poll(s, 1);
+        return 0;
+    }
+
+    if (len == -1)
+        printf("raw_write_packet: errno:%d\n", errno);
+
+    return len;
+}
+
+static ssize_t sock_raw_receive_iov(VLANClientState *nc,
+                                    const struct iovec *iov,
+                                    int iovcnt)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    const struct iovec *iovp = iov;
+    struct iovec iov_copy[iovcnt + 1];
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+        iov_copy[0].iov_base = &hdr;
+        iov_copy[0].iov_len =  sizeof(hdr);
+        memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
+        iovp = iov_copy;
+        iovcnt++;
+    }
+
+    return sock_raw_write_packet(s, iovp, iovcnt);
+}
+
+static ssize_t sock_raw_receive_raw(VLANClientState *nc, const uint8_t *buf,
+                                    size_t size)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct iovec iov[2];
+    int iovcnt = 0;
+    struct virtio_net_hdr hdr = { 0, };
+
+    if (s->has_vnet_hdr) {
+        iov[iovcnt].iov_base = &hdr;
+        iov[iovcnt].iov_len  = sizeof(hdr);
+        iovcnt++;
+    }
+
+    iov[iovcnt].iov_base = (char *)buf;
+    iov[iovcnt].iov_len  = size;
+    iovcnt++;
+
+    return sock_raw_write_packet(s, iov, iovcnt);
+}
+
+static ssize_t sock_raw_receive(VLANClientState *nc, const uint8_t *buf,
+                                size_t size)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct iovec iov[1];
+
+    if (s->has_vnet_hdr && !s->using_vnet_hdr)
+        return sock_raw_receive_raw(nc, buf, size);
+
+    iov[0].iov_base = (char *)buf;
+    iov[0].iov_len  = size;
+
+    return sock_raw_write_packet(s, iov, 1);
+}
+
+static int sock_raw_can_send(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+
+    return qemu_can_send_packet(&s->nc);
+}
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags)
+{
+    int ret;
+
+    ret = recv(fd, buf, maxlen, flags);
+    return ret;
+}
+
+static void sock_raw_send_completed(VLANClientState *nc, ssize_t len)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    sock_raw_read_poll(s, 1);
+}
+
+static void sock_raw_send(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+    int size;
+
+    do {
+        uint8_t *buf = s->buf;
+
+        size = sock_raw_read_packet(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC);
+        if (size <= 0)
+            break;
+
+        if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+            buf  += sizeof(struct virtio_net_hdr);
+            size -= sizeof(struct virtio_net_hdr);
+        }
+
+        size = qemu_send_packet_async(&s->nc, buf, size,
+                                      sock_raw_send_completed);
+        if (size == 0)
+            sock_raw_read_poll(s, 0);
+
+    } while (size > 0 && qemu_can_send_packet(&s->nc));
+}
+
+int sock_raw_has_ufo(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+    return s->has_ufo;
+}
+
+int sock_raw_has_vnet_hdr(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+    return s->has_vnet_hdr;
+}
+
+void sock_raw_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    using_vnet_hdr = using_vnet_hdr != 0;
+
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+    assert(s->has_vnet_hdr == using_vnet_hdr);
+
+    s->using_vnet_hdr = using_vnet_hdr;
+}
+
+void sock_raw_set_offload(VLANClientState *nc, int csum, int tso4,
+                     int tso6, int ecn, int ufo)
+{
+    return;
+}
+
+static void sock_raw_cleanup(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    sock_raw_read_poll(s, 0);
+    sock_raw_write_poll(s, 0);
+    close(s->fd);
+}
+
+int sock_raw_probe_vnet_hdr(int fd)
+{
+    int val, len;
+	
+    len = sizeof(val);
+    if (getsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, (socklen_t *)&len)
+                                                                         == 0) 
+        return 1;	
+	
+    return 0;
+}
+
+static NetClientInfo net_raw_info = {
+    .type = NET_CLIENT_TYPE_SOCKET_RAW,
+    .size = sizeof(NetSocketRawState),
+    .receive = sock_raw_receive,
+    .receive_raw = NULL,
+    .receive_iov = sock_raw_receive_iov,
+    .cleanup = sock_raw_cleanup,
+};
+
+
+static NetSocketRawState *net_socket_fd_init_raw(VLANState *vlan,
+                                                 const char *model,
+                                                 const char *name, int fd)
+{
+    VLANClientState *nc;
+    NetSocketRawState *s;
+
+    nc = qemu_new_net_client(&net_raw_info, vlan, NULL, model, name);
+
+    s = DO_UPCAST(NetSocketRawState, nc, nc);
+
+    s->fd = fd;
+    s->has_vnet_hdr = sock_raw_probe_vnet_hdr(fd);
+    s->using_vnet_hdr = 0;
+    s->has_ufo = 1;
+    sock_raw_read_poll(s, 1);
+
+    return s;
+}
+
+static int net_socket_raw_ifname_init(VLANState *vlan, const char *model,
+				      const char *name, const char *ifname)
+{
+    struct ifreq req;
+    int fd, ret;
+    struct sockaddr_ll lladdr;
+    int val;
+
+    fd = qemu_socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+    if (fd < 0)
+        fprintf(stderr, "packet socket failed\n");
+
+    memset(&req, 0, sizeof(req));
+    strncpy(req.ifr_name, ifname, IFNAMSIZ-1);
+    ret = ioctl(fd, SIOCGIFINDEX, &req);
+    if (ret < 0)
+        fprintf(stderr, "SIOCGIFINDEX failed\n");
+
+    memset(&lladdr, 0, sizeof(lladdr));
+    lladdr.sll_family   = AF_PACKET;
+    lladdr.sll_protocol = htons(ETH_P_ALL);
+    lladdr.sll_ifindex  = req.ifr_ifindex;
+    ret = bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr));
+    if (ret < 0)
+        fprintf(stderr, "bind failed\n");
+
+    val = 1;
+    ret=setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, (const char *)&val,
+                   sizeof(val));
+    if (ret < 0) {
+        fprintf(stderr, "setsockopt(SOL_PACKET, PACKET_VNET_HDR) failed\n");
+    } 
+
+    ret = fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);
+    if (ret < 0)
+        fprintf(stderr, "fcntl(O_NONBLOCK) set failed\n");
+
+    net_socket_fd_init_raw(vlan, model, name, fd);
+
+    return 0;
+}
+
+
 typedef struct NetSocketState {
     VLANClientState nc;
     int fd;
@@ -337,6 +658,8 @@ static NetSocketState *net_socket_fd_init(VLANState *vlan,
         return net_socket_fd_init_dgram(vlan, model, name, fd, is_connected);
     case SOCK_STREAM:
         return net_socket_fd_init_stream(vlan, model, name, fd, is_connected);
+    case SOCK_RAW:
+        return (struct NetSocketState *)net_socket_fd_init_raw(vlan, model, name, fd);
     default:
         /* who knows ... this could be a eg. a pty, do warn and continue as stream */
         fprintf(stderr, "qemu: warning: socket type=%d for fd=%d is not SOCK_DGRAM or SOCK_STREAM\n", so_type, fd);
@@ -519,6 +842,22 @@ int net_init_socket(QemuOpts *opts,
             close(fd);
             return -1;
         }
+    } else if (qemu_opt_get(opts, "ifname")) {
+        const char *ifname;
+
+        if (qemu_opt_get(opts, "fd") ||
+            qemu_opt_get(opts, "connect") ||
+            qemu_opt_get(opts, "listen") ||
+            qemu_opt_get(opts, "mcast")) {
+            qemu_error("fd=, connect= and mcast= and listen= is invalid with ifname=\n");
+            return -1;
+        }
+
+        ifname = qemu_opt_get(opts, "ifname");
+
+        if (net_socket_raw_ifname_init(vlan, "socket", name, ifname) == -1) {
+            return -1;
+        }
     } else if (qemu_opt_get(opts, "listen")) {
         const char *listen;
 
diff --git a/net/socket.h b/net/socket.h
index ea46f02..cc09866 100644
--- a/net/socket.h
+++ b/net/socket.h
@@ -30,4 +30,13 @@
 int net_init_socket(QemuOpts *opts, Monitor *mon,
                     const char *name, VLANState *vlan);
 
+#define PACKET_VNET_HDR	15
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags);
+int sock_raw_has_ufo(VLANClientState *vc);
+int sock_raw_has_vnet_hdr(VLANClientState *vc);
+void sock_raw_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr);
+int sock_raw_probe_vnet_hdr(int fd);
+void sock_raw_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, int ecn, int ufo);
+
 #endif /* QEMU_NET_SOCKET_H */

^ permalink raw reply related	[flat|nested] 45+ messages in thread

* Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu
  2010-01-27 19:54                       ` Anthony Liguori
  2010-01-28  8:12                         ` Arnd Bergmann
@ 2010-02-01 15:47                         ` Or Gerlitz
  1 sibling, 0 replies; 45+ messages in thread
From: Or Gerlitz @ 2010-02-01 15:47 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Michael S. Tsirkin, Sridhar Samudrala, avi, markmc, kvm, qemu-devel

Anthony Liguori wrote:
> Considering VEPA enabled hardware doesn't exist today and the 
> standards aren't even finished being defined, I don't think it's a 
> really strong use case ;-)

Anthony,

VEPA-enabled NIC hardware is alive and kicking, maybe even in your onboard 
1Gb/s NIC: the Intel 82576 (driven by the Linux igb network driver) supports 
SR-IOV and VEPA:

1. A register exists which dictates whether the NIC switches between 
the different VFs or just sends every packet transmitted from a VF to 
the "uplink" PF.

2. Logic exists which makes sure a "downstream" (incoming from the 
network) packet is never sent to the VF whose MAC address is the source 
MAC of this packet, which accounts for multicast support.

To learn more about that, see the "Intel 82576 SR-IOV Driver Companion 
Guide", available on the web.

Or.



^ permalink raw reply	[flat|nested] 45+ messages in thread

end of thread, other threads:[~2010-02-01 15:47 UTC | newest]

Thread overview: 45+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-26 20:40 [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu Sridhar Samudrala
2010-01-26 20:47 ` Anthony Liguori
2010-01-26 20:50   ` Anthony Liguori
2010-01-26 23:19     ` Sridhar Samudrala
2010-01-27  9:24     ` Michael S. Tsirkin
2010-01-27  9:34       ` Arnd Bergmann
2010-01-27  9:44         ` Michael S. Tsirkin
2010-01-27 14:03           ` Anthony Liguori
2010-01-27 21:39             ` Arnd Bergmann
2010-01-27 22:56               ` Sridhar Samudrala
2010-01-28  6:06                 ` Arnd Bergmann
2010-01-28 16:53                   ` Jens Osterkamp
2010-01-28 11:22                 ` Or Gerlitz
2010-01-29 20:52                 ` Sridhar Samudrala
2010-01-29 20:52                   ` [Qemu-devel] " Sridhar Samudrala
2010-01-27 14:07       ` Anthony Liguori
2010-01-27 16:59         ` Michael S. Tsirkin
2010-01-27 17:07           ` Anthony Liguori
2010-01-27 17:25             ` Michael S. Tsirkin
2010-01-27 17:36               ` Anthony Liguori
2010-01-27 17:54                 ` Sridhar Samudrala
2010-01-27 18:02                   ` Anthony Liguori
2010-01-27 18:03                     ` Michael S. Tsirkin
2010-01-27 19:54                       ` Anthony Liguori
2010-01-28  8:12                         ` Arnd Bergmann
2010-01-28 13:56                           ` Michael S. Tsirkin
2010-01-28 14:13                             ` Anthony Liguori
2010-01-28 14:39                               ` Anthony Liguori
2010-01-28 14:52                               ` Michael S. Tsirkin
2010-01-28 15:05                                 ` Anthony Liguori
2010-01-28 16:37                                   ` Michael S. Tsirkin
2010-01-28 17:58                                     ` Anthony Liguori
2010-01-28 18:04                                       ` Michael S. Tsirkin
2010-01-28 19:57                                         ` Anthony Liguori
2010-01-29 11:26                                           ` Michael S. Tsirkin
2010-01-28 20:29                                   ` Arnd Bergmann
2010-02-01 15:47                         ` Or Gerlitz
2010-01-27 18:12                 ` Michael S. Tsirkin
2010-01-26 23:15   ` Sridhar Samudrala
2010-01-26 23:15     ` [Qemu-devel] " Sridhar Samudrala
2010-01-27  0:06     ` Anthony Liguori
2010-01-27  6:52       ` Arnd Bergmann
2010-01-27  6:52         ` Arnd Bergmann
2010-01-27 14:14         ` Anthony Liguori
2010-01-27 14:14           ` Anthony Liguori

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.