All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] Unified Socket Driver
@ 2017-07-18 17:08 anton.ivanov
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport anton.ivanov
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: anton.ivanov @ 2017-07-18 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: jasowang

Hi Jason, hi list,

Following is the unified socket transport patch split into parts
to make it more digestible.

Part 1 is a rewrite of the L2TPv3 driver to isolate the common
portions. 

The remaining two patches add further transports (GRE and raw).

In all cases the transports are switchless - they allow connecting
a VM to a host, another VM or a remote network device without involving
any switches in between. They can be used with a switch too.

My longer term goal is to try to map RX and TX to either virtio
or some other mechanism which allows me to grab a multi-packet
event on TX so I can implement sendmmsg. I will try that in a
future patch.

It should be possible to migrate the UDP half of the existing
socket driver to this infrastructure. Unfortunately, it does
not apply to TCP, so only half of the driver can be migrated.

A.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-18 17:08 [Qemu-devel] Unified Socket Driver anton.ivanov
@ 2017-07-18 17:08 ` anton.ivanov
  2017-07-19  5:39   ` Jason Wang
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support anton.ivanov
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support anton.ivanov
  2 siblings, 1 reply; 23+ messages in thread
From: anton.ivanov @ 2017-07-18 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: jasowang, Anton Ivanov

From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

1. Creates a common backend for socket transports using
recvmmsg().
2. Migrates L2TPv3 to the new backend

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 configure         |  10 +-
 net/Makefile.objs |   2 +-
 net/l2tpv3.c      | 531 +++++++++---------------------------------------------
 net/net.c         |   4 +-
 net/unified.c     | 406 +++++++++++++++++++++++++++++++++++++++++
 net/unified.h     | 118 ++++++++++++
 6 files changed, 613 insertions(+), 458 deletions(-)
 create mode 100644 net/unified.c
 create mode 100644 net/unified.h

diff --git a/configure b/configure
index a3f0522e8f..99a60b723c 100755
--- a/configure
+++ b/configure
@@ -1862,7 +1862,7 @@ if ! compile_object -Werror ; then
 fi
 
 ##########################################
-# L2TPV3 probe
+# UNIFIED probe
 
 cat > $TMPC <<EOF
 #include <sys/socket.h>
@@ -1870,9 +1870,9 @@ cat > $TMPC <<EOF
 int main(void) { return sizeof(struct mmsghdr); }
 EOF
 if compile_prog "" "" ; then
-  l2tpv3=yes
+  unified=yes
 else
-  l2tpv3=no
+  unified=no
 fi
 
 ##########################################
@@ -5458,8 +5458,8 @@ fi
 if test "$netmap" = "yes" ; then
   echo "CONFIG_NETMAP=y" >> $config_host_mak
 fi
-if test "$l2tpv3" = "yes" ; then
-  echo "CONFIG_L2TPV3=y" >> $config_host_mak
+if test "$unified" = "yes" ; then
+  echo "CONFIG_UNIFIED=y" >> $config_host_mak
 fi
 if test "$cap_ng" = "yes" ; then
   echo "CONFIG_LIBCAP=y" >> $config_host_mak
diff --git a/net/Makefile.objs b/net/Makefile.objs
index 67ba5e26fb..8026ad778a 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
 common-obj-y += socket.o
 common-obj-y += dump.o
 common-obj-y += eth.o
-common-obj-$(CONFIG_L2TPV3) += l2tpv3.o
+common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
 common-obj-$(CONFIG_POSIX) += vhost-user.o
 common-obj-$(CONFIG_SLIRP) += slirp.o
 common-obj-$(CONFIG_VDE) += vde.o
diff --git a/net/l2tpv3.c b/net/l2tpv3.c
index 6745b78990..05413c9cbd 100644
--- a/net/l2tpv3.c
+++ b/net/l2tpv3.c
@@ -1,6 +1,7 @@
 /*
  * QEMU System Emulator
  *
+ * Copyright (c) 2015-2017 Cambridge Greys Limited
  * Copyright (c) 2003-2008 Fabrice Bellard
  * Copyright (c) 2012-2014 Cisco Systems
  *
@@ -34,19 +35,9 @@
 #include "qemu/sockets.h"
 #include "qemu/iov.h"
 #include "qemu/main-loop.h"
+#include "unified.h"
 
 
-/* The buffer size needs to be investigated for optimum numbers and
- * optimum means of paging in on different systems. This size is
- * chosen to be sufficient to accommodate one packet with some headers
- */
-
-#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
-#define BUFFER_SIZE 2048
-#define IOVSIZE 2
-#define MAX_L2TPV3_MSGCNT 64
-#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE)
-
 /* Header set to 0x30000 signifies a data packet */
 
 #define L2TPV3_DATA_PACKET 0x30000
@@ -57,31 +48,7 @@
 #define IPPROTO_L2TP 0x73
 #endif
 
-typedef struct NetL2TPV3State {
-    NetClientState nc;
-    int fd;
-
-    /*
-     * these are used for xmit - that happens packet a time
-     * and for first sign of life packet (easier to parse that once)
-     */
-
-    uint8_t *header_buf;
-    struct iovec *vec;
-
-    /*
-     * these are used for receive - try to "eat" up to 32 packets at a time
-     */
-
-    struct mmsghdr *msgvec;
-
-    /*
-     * peer address
-     */
-
-    struct sockaddr_storage *dgram_dst;
-    uint32_t dst_size;
-
+typedef struct L2TPV3TunnelParams {
     /*
      * L2TPv3 parameters
      */
@@ -90,37 +57,8 @@ typedef struct NetL2TPV3State {
     uint64_t tx_cookie;
     uint32_t rx_session;
     uint32_t tx_session;
-    uint32_t header_size;
     uint32_t counter;
 
-    /*
-    * DOS avoidance in error handling
-    */
-
-    bool header_mismatch;
-
-    /*
-     * Ring buffer handling
-     */
-
-    int queue_head;
-    int queue_tail;
-    int queue_depth;
-
-    /*
-     * Precomputed offsets
-     */
-
-    uint32_t offset;
-    uint32_t cookie_offset;
-    uint32_t counter_offset;
-    uint32_t session_offset;
-
-    /* Poll Control */
-
-    bool read_poll;
-    bool write_poll;
-
     /* Flags */
 
     bool ipv6;
@@ -130,189 +68,62 @@ typedef struct NetL2TPV3State {
     bool cookie;
     bool cookie_is_64;
 
-} NetL2TPV3State;
-
-static void net_l2tpv3_send(void *opaque);
-static void l2tpv3_writable(void *opaque);
-
-static void l2tpv3_update_fd_handler(NetL2TPV3State *s)
-{
-    qemu_set_fd_handler(s->fd,
-                        s->read_poll ? net_l2tpv3_send : NULL,
-                        s->write_poll ? l2tpv3_writable : NULL,
-                        s);
-}
-
-static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable)
-{
-    if (s->read_poll != enable) {
-        s->read_poll = enable;
-        l2tpv3_update_fd_handler(s);
-    }
-}
+    /* Precomputed L2TPV3 specific offsets */
+    uint32_t cookie_offset;
+    uint32_t counter_offset;
+    uint32_t session_offset;
 
-static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable)
-{
-    if (s->write_poll != enable) {
-        s->write_poll = enable;
-        l2tpv3_update_fd_handler(s);
-    }
-}
+} L2TPV3TunnelParams;
 
-static void l2tpv3_writable(void *opaque)
-{
-    NetL2TPV3State *s = opaque;
-    l2tpv3_write_poll(s, false);
-    qemu_flush_queued_packets(&s->nc);
-}
 
-static void l2tpv3_send_completed(NetClientState *nc, ssize_t len)
-{
-    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
-    l2tpv3_read_poll(s, true);
-}
 
-static void l2tpv3_poll(NetClientState *nc, bool enable)
+static void l2tpv3_form_header(void *us)
 {
-    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
-    l2tpv3_write_poll(s, enable);
-    l2tpv3_read_poll(s, enable);
-}
+    NetUnifiedState *s = (NetUnifiedState *) us;
+    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
 
-static void l2tpv3_form_header(NetL2TPV3State *s)
-{
     uint32_t *counter;
 
-    if (s->udp) {
+    if (p->udp) {
         stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET);
     }
     stl_be_p(
-            (uint32_t *) (s->header_buf + s->session_offset),
-            s->tx_session
+            (uint32_t *) (s->header_buf + p->session_offset),
+            p->tx_session
         );
-    if (s->cookie) {
-        if (s->cookie_is_64) {
+    if (p->cookie) {
+        if (p->cookie_is_64) {
             stq_be_p(
-                (uint64_t *)(s->header_buf + s->cookie_offset),
-                s->tx_cookie
+                (uint64_t *)(s->header_buf + p->cookie_offset),
+                p->tx_cookie
             );
         } else {
             stl_be_p(
-                (uint32_t *) (s->header_buf + s->cookie_offset),
-                s->tx_cookie
+                (uint32_t *) (s->header_buf + p->cookie_offset),
+                p->tx_cookie
             );
         }
     }
-    if (s->has_counter) {
-        counter = (uint32_t *)(s->header_buf + s->counter_offset);
-        if (s->pin_counter) {
+    if (p->has_counter) {
+        counter = (uint32_t *)(s->header_buf + p->counter_offset);
+        if (p->pin_counter) {
             *counter = 0;
         } else {
-            stl_be_p(counter, ++s->counter);
-        }
-    }
-}
-
-static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc,
-                    const struct iovec *iov,
-                    int iovcnt)
-{
-    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
-
-    struct msghdr message;
-    int ret;
-
-    if (iovcnt > MAX_L2TPV3_IOVCNT - 1) {
-        error_report(
-            "iovec too long %d > %d, change l2tpv3.h",
-            iovcnt, MAX_L2TPV3_IOVCNT
-        );
-        return -1;
-    }
-    l2tpv3_form_header(s);
-    memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
-    s->vec->iov_base = s->header_buf;
-    s->vec->iov_len = s->offset;
-    message.msg_name = s->dgram_dst;
-    message.msg_namelen = s->dst_size;
-    message.msg_iov = s->vec;
-    message.msg_iovlen = iovcnt + 1;
-    message.msg_control = NULL;
-    message.msg_controllen = 0;
-    message.msg_flags = 0;
-    do {
-        ret = sendmsg(s->fd, &message, 0);
-    } while ((ret == -1) && (errno == EINTR));
-    if (ret > 0) {
-        ret -= s->offset;
-    } else if (ret == 0) {
-        /* belt and braces - should not occur on DGRAM
-        * we should get an error and never a 0 send
-        */
-        ret = iov_size(iov, iovcnt);
-    } else {
-        /* signal upper layer that socket buffer is full */
-        ret = -errno;
-        if (ret == -EAGAIN || ret == -ENOBUFS) {
-            l2tpv3_write_poll(s, true);
-            ret = 0;
+            stl_be_p(counter, ++p->counter);
         }
     }
-    return ret;
 }
 
-static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc,
-                    const uint8_t *buf,
-                    size_t size)
-{
-    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
-
-    struct iovec *vec;
-    struct msghdr message;
-    ssize_t ret = 0;
-
-    l2tpv3_form_header(s);
-    vec = s->vec;
-    vec->iov_base = s->header_buf;
-    vec->iov_len = s->offset;
-    vec++;
-    vec->iov_base = (void *) buf;
-    vec->iov_len = size;
-    message.msg_name = s->dgram_dst;
-    message.msg_namelen = s->dst_size;
-    message.msg_iov = s->vec;
-    message.msg_iovlen = 2;
-    message.msg_control = NULL;
-    message.msg_controllen = 0;
-    message.msg_flags = 0;
-    do {
-        ret = sendmsg(s->fd, &message, 0);
-    } while ((ret == -1) && (errno == EINTR));
-    if (ret > 0) {
-        ret -= s->offset;
-    } else if (ret == 0) {
-        /* belt and braces - should not occur on DGRAM
-        * we should get an error and never a 0 send
-        */
-        ret = size;
-    } else {
-        ret = -errno;
-        if (ret == -EAGAIN || ret == -ENOBUFS) {
-            /* signal upper layer that socket buffer is full */
-            l2tpv3_write_poll(s, true);
-            ret = 0;
-        }
-    }
-    return ret;
-}
 
-static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
+static int l2tpv3_verify_header(void *us, uint8_t *buf)
 {
 
+    NetUnifiedState *s = (NetUnifiedState *) us;
+    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
     uint32_t *session;
     uint64_t cookie;
 
-    if ((!s->udp) && (!s->ipv6)) {
+    if ((!p->udp) && (!p->ipv6)) {
         buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
     }
 
@@ -321,21 +132,21 @@ static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
     * that anyway.
     */
 
-    if (s->cookie) {
-        if (s->cookie_is_64) {
-            cookie = ldq_be_p(buf + s->cookie_offset);
+    if (p->cookie) {
+        if (p->cookie_is_64) {
+            cookie = ldq_be_p(buf + p->cookie_offset);
         } else {
-            cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL;
+            cookie = ldl_be_p(buf + p->cookie_offset) & 0xffffffffULL;
         }
-        if (cookie != s->rx_cookie) {
+        if (cookie != p->rx_cookie) {
             if (!s->header_mismatch) {
                 error_report("unknown cookie id");
             }
             return -1;
         }
     }
-    session = (uint32_t *) (buf + s->session_offset);
-    if (ldl_be_p(session) != s->rx_session) {
+    session = (uint32_t *) (buf + p->session_offset);
+    if (ldl_be_p(session) != p->rx_session) {
         if (!s->header_mismatch) {
             error_report("session mismatch");
         }
@@ -344,203 +155,31 @@ static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
     return 0;
 }
 
-static void net_l2tpv3_process_queue(NetL2TPV3State *s)
-{
-    int size = 0;
-    struct iovec *vec;
-    bool bad_read;
-    int data_size;
-    struct mmsghdr *msgvec;
-
-    /* go into ring mode only if there is a "pending" tail */
-    if (s->queue_depth > 0) {
-        do {
-            msgvec = s->msgvec + s->queue_tail;
-            if (msgvec->msg_len > 0) {
-                data_size = msgvec->msg_len - s->header_size;
-                vec = msgvec->msg_hdr.msg_iov;
-                if ((data_size > 0) &&
-                    (l2tpv3_verify_header(s, vec->iov_base) == 0)) {
-                    vec++;
-                    /* Use the legacy delivery for now, we will
-                     * switch to using our own ring as a queueing mechanism
-                     * at a later date
-                     */
-                    size = qemu_send_packet_async(
-                            &s->nc,
-                            vec->iov_base,
-                            data_size,
-                            l2tpv3_send_completed
-                        );
-                    if (size == 0) {
-                        l2tpv3_read_poll(s, false);
-                    }
-                    bad_read = false;
-                } else {
-                    bad_read = true;
-                    if (!s->header_mismatch) {
-                        /* report error only once */
-                        error_report("l2tpv3 header verification failed");
-                        s->header_mismatch = true;
-                    }
-                }
-            } else {
-                bad_read = true;
-            }
-            s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT;
-            s->queue_depth--;
-        } while (
-                (s->queue_depth > 0) &&
-                 qemu_can_send_packet(&s->nc) &&
-                ((size > 0) || bad_read)
-            );
-    }
-}
-
-static void net_l2tpv3_send(void *opaque)
-{
-    NetL2TPV3State *s = opaque;
-    int target_count, count;
-    struct mmsghdr *msgvec;
-
-    /* go into ring mode only if there is a "pending" tail */
-
-    if (s->queue_depth) {
-
-        /* The ring buffer we use has variable intake
-         * count of how much we can read varies - adjust accordingly
-         */
-
-        target_count = MAX_L2TPV3_MSGCNT - s->queue_depth;
-
-        /* Ensure we do not overrun the ring when we have
-         * a lot of enqueued packets
-         */
-
-        if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) {
-            target_count = MAX_L2TPV3_MSGCNT - s->queue_head;
-        }
-    } else {
-
-        /* we do not have any pending packets - we can use
-        * the whole message vector linearly instead of using
-        * it as a ring
-        */
-
-        s->queue_head = 0;
-        s->queue_tail = 0;
-        target_count = MAX_L2TPV3_MSGCNT;
-    }
-
-    msgvec = s->msgvec + s->queue_head;
-    if (target_count > 0) {
-        do {
-            count = recvmmsg(
-                s->fd,
-                msgvec,
-                target_count, MSG_DONTWAIT, NULL);
-        } while ((count == -1) && (errno == EINTR));
-        if (count < 0) {
-            /* Recv error - we still need to flush packets here,
-             * (re)set queue head to current position
-             */
-            count = 0;
-        }
-        s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT;
-        s->queue_depth += count;
-    }
-    net_l2tpv3_process_queue(s);
-}
-
-static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount)
-{
-    int i, j;
-    struct iovec *iov;
-    struct mmsghdr *cleanup = msgvec;
-    if (cleanup) {
-        for (i = 0; i < count; i++) {
-            if (cleanup->msg_hdr.msg_iov) {
-                iov = cleanup->msg_hdr.msg_iov;
-                for (j = 0; j < iovcount; j++) {
-                    g_free(iov->iov_base);
-                    iov++;
-                }
-                g_free(cleanup->msg_hdr.msg_iov);
-            }
-            cleanup++;
-        }
-        g_free(msgvec);
-    }
-}
-
-static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count)
-{
-    int i;
-    struct iovec *iov;
-    struct mmsghdr *msgvec, *result;
-
-    msgvec = g_new(struct mmsghdr, count);
-    result = msgvec;
-    for (i = 0; i < count ; i++) {
-        msgvec->msg_hdr.msg_name = NULL;
-        msgvec->msg_hdr.msg_namelen = 0;
-        iov =  g_new(struct iovec, IOVSIZE);
-        msgvec->msg_hdr.msg_iov = iov;
-        iov->iov_base = g_malloc(s->header_size);
-        iov->iov_len = s->header_size;
-        iov++ ;
-        iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
-        iov->iov_len = BUFFER_SIZE;
-        msgvec->msg_hdr.msg_iovlen = 2;
-        msgvec->msg_hdr.msg_control = NULL;
-        msgvec->msg_hdr.msg_controllen = 0;
-        msgvec->msg_hdr.msg_flags = 0;
-        msgvec++;
-    }
-    return result;
-}
-
-static void net_l2tpv3_cleanup(NetClientState *nc)
-{
-    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
-    qemu_purge_queued_packets(nc);
-    l2tpv3_read_poll(s, false);
-    l2tpv3_write_poll(s, false);
-    if (s->fd >= 0) {
-        close(s->fd);
-    }
-    destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE);
-    g_free(s->vec);
-    g_free(s->header_buf);
-    g_free(s->dgram_dst);
-}
-
-static NetClientInfo net_l2tpv3_info = {
-    .type = NET_CLIENT_DRIVER_L2TPV3,
-    .size = sizeof(NetL2TPV3State),
-    .receive = net_l2tpv3_receive_dgram,
-    .receive_iov = net_l2tpv3_receive_dgram_iov,
-    .poll = l2tpv3_poll,
-    .cleanup = net_l2tpv3_cleanup,
-};
-
 int net_init_l2tpv3(const Netdev *netdev,
                     const char *name,
                     NetClientState *peer, Error **errp)
 {
     /* FIXME error_setg(errp, ...) on failure */
     const NetdevL2TPv3Options *l2tpv3;
-    NetL2TPV3State *s;
+    NetUnifiedState *s;
     NetClientState *nc;
+    L2TPV3TunnelParams *p;
+
     int fd = -1, gairet;
     struct addrinfo hints;
     struct addrinfo *result = NULL;
     char *srcport, *dstport;
 
-    nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name);
+    nc = qemu_new_unified_net_client(name, peer);
+
+    s = DO_UPCAST(NetUnifiedState, nc, nc);
+
+    p = g_malloc(sizeof(L2TPV3TunnelParams));
 
-    s = DO_UPCAST(NetL2TPV3State, nc, nc);
+    s->params = p;
 
+    s->form_header = &l2tpv3_form_header;
+    s->verify_header = &l2tpv3_verify_header;
     s->queue_head = 0;
     s->queue_tail = 0;
     s->header_mismatch = false;
@@ -549,9 +188,9 @@ int net_init_l2tpv3(const Netdev *netdev,
     l2tpv3 = &netdev->u.l2tpv3;
 
     if (l2tpv3->has_ipv6 && l2tpv3->ipv6) {
-        s->ipv6 = l2tpv3->ipv6;
+        p->ipv6 = l2tpv3->ipv6;
     } else {
-        s->ipv6 = false;
+        p->ipv6 = false;
     }
 
     if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) {
@@ -561,22 +200,22 @@ int net_init_l2tpv3(const Netdev *netdev,
 
     if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) {
         if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) {
-            s->cookie = true;
+            p->cookie = true;
         } else {
             goto outerr;
         }
     } else {
-        s->cookie = false;
+        p->cookie = false;
     }
 
     if (l2tpv3->has_cookie64 || l2tpv3->cookie64) {
-        s->cookie_is_64  = true;
+        p->cookie_is_64  = true;
     } else {
-        s->cookie_is_64  = false;
+        p->cookie_is_64  = false;
     }
 
     if (l2tpv3->has_udp && l2tpv3->udp) {
-        s->udp = true;
+        p->udp = true;
         if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) {
             error_report("l2tpv3_open : need both src and dst port for udp");
             goto outerr;
@@ -585,52 +224,52 @@ int net_init_l2tpv3(const Netdev *netdev,
             dstport = l2tpv3->dstport;
         }
     } else {
-        s->udp = false;
+        p->udp = false;
         srcport = NULL;
         dstport = NULL;
     }
 
 
     s->offset = 4;
-    s->session_offset = 0;
-    s->cookie_offset = 4;
-    s->counter_offset = 4;
+    p->session_offset = 0;
+    p->cookie_offset = 4;
+    p->counter_offset = 4;
 
-    s->tx_session = l2tpv3->txsession;
+    p->tx_session = l2tpv3->txsession;
     if (l2tpv3->has_rxsession) {
-        s->rx_session = l2tpv3->rxsession;
+        p->rx_session = l2tpv3->rxsession;
     } else {
-        s->rx_session = s->tx_session;
+        p->rx_session = p->tx_session;
     }
 
-    if (s->cookie) {
-        s->rx_cookie = l2tpv3->rxcookie;
-        s->tx_cookie = l2tpv3->txcookie;
-        if (s->cookie_is_64 == true) {
+    if (p->cookie) {
+        p->rx_cookie = l2tpv3->rxcookie;
+        p->tx_cookie = l2tpv3->txcookie;
+        if (p->cookie_is_64 == true) {
             /* 64 bit cookie */
             s->offset += 8;
-            s->counter_offset += 8;
+            p->counter_offset += 8;
         } else {
             /* 32 bit cookie */
             s->offset += 4;
-            s->counter_offset += 4;
+            p->counter_offset += 4;
         }
     }
 
     memset(&hints, 0, sizeof(hints));
 
-    if (s->ipv6) {
+    if (p->ipv6) {
         hints.ai_family = AF_INET6;
     } else {
         hints.ai_family = AF_INET;
     }
-    if (s->udp) {
+    if (p->udp) {
         hints.ai_socktype = SOCK_DGRAM;
         hints.ai_protocol = 0;
         s->offset += 4;
-        s->counter_offset += 4;
-        s->session_offset += 4;
-        s->cookie_offset += 4;
+        p->counter_offset += 4;
+        p->session_offset += 4;
+        p->cookie_offset += 4;
     } else {
         hints.ai_socktype = SOCK_RAW;
         hints.ai_protocol = IPPROTO_L2TP;
@@ -661,12 +300,12 @@ int net_init_l2tpv3(const Netdev *netdev,
 
     memset(&hints, 0, sizeof(hints));
 
-    if (s->ipv6) {
+    if (p->ipv6) {
         hints.ai_family = AF_INET6;
     } else {
         hints.ai_family = AF_INET;
     }
-    if (s->udp) {
+    if (p->udp) {
         hints.ai_socktype = SOCK_DGRAM;
         hints.ai_protocol = 0;
     } else {
@@ -693,17 +332,17 @@ int net_init_l2tpv3(const Netdev *netdev,
     }
 
     if (l2tpv3->has_counter && l2tpv3->counter) {
-        s->has_counter = true;
+        p->has_counter = true;
         s->offset += 4;
     } else {
-        s->has_counter = false;
+        p->has_counter = false;
     }
 
     if (l2tpv3->has_pincounter && l2tpv3->pincounter) {
-        s->has_counter = true;  /* pin counter implies that there is counter */
-        s->pin_counter = true;
+        p->has_counter = true;  /* pin counter implies that there is counter */
+        p->pin_counter = true;
     } else {
-        s->pin_counter = false;
+        p->pin_counter = false;
     }
 
     if (l2tpv3->has_offset) {
@@ -711,22 +350,14 @@ int net_init_l2tpv3(const Netdev *netdev,
         s->offset += l2tpv3->offset;
     }
 
-    if ((s->ipv6) || (s->udp)) {
+    if ((p->ipv6) || (p->udp)) {
         s->header_size = s->offset;
     } else {
         s->header_size = s->offset + sizeof(struct iphdr);
     }
 
-    s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT);
-    s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT);
-    s->header_buf = g_malloc(s->header_size);
-
-    qemu_set_nonblock(fd);
-
-    s->fd = fd;
-    s->counter = 0;
-
-    l2tpv3_read_poll(s, true);
+    qemu_net_finalize_unified_init(s, fd);
+    p->counter = 0;
 
     snprintf(s->nc.info_str, sizeof(s->nc.info_str),
              "l2tpv3: connected");
diff --git a/net/net.c b/net/net.c
index 6235aabed8..9270b52ac8 100644
--- a/net/net.c
+++ b/net/net.c
@@ -959,8 +959,8 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
 #ifdef CONFIG_VHOST_NET_USED
         [NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user,
 #endif
-#ifdef CONFIG_L2TPV3
-        [NET_CLIENT_DRIVER_L2TPV3]    = net_init_l2tpv3,
+#ifdef CONFIG_UNIFIED
+        [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
 #endif
 };
 
diff --git a/net/unified.c b/net/unified.c
new file mode 100644
index 0000000000..f15d1e1eed
--- /dev/null
+++ b/net/unified.c
@@ -0,0 +1,406 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2015-2017 Cambridge Greys Limited
+ * Copyright (c) 2012-2014 Cisco Systems
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <linux/ip.h>
+#include <netdb.h>
+#include "net/net.h"
+#include "clients.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "unified.h"
+
+static void net_unified_send(void *opaque);
+static void unified_writable(void *opaque);
+
+static void unified_update_fd_handler(NetUnifiedState *s)
+{
+    qemu_set_fd_handler(s->fd,
+                        s->read_poll ? net_unified_send : NULL,
+                        s->write_poll ? unified_writable : NULL,
+                        s);
+}
+
+static void unified_read_poll(NetUnifiedState *s, bool enable)
+{
+    if (s->read_poll != enable) {
+        s->read_poll = enable;
+        unified_update_fd_handler(s);
+    }
+}
+
+static void unified_write_poll(NetUnifiedState *s, bool enable)
+{
+    if (s->write_poll != enable) {
+        s->write_poll = enable;
+        unified_update_fd_handler(s);
+    }
+}
+
+static void unified_writable(void *opaque)
+{
+    NetUnifiedState *s = opaque;
+    unified_write_poll(s, false);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+static void unified_send_completed(NetClientState *nc, ssize_t len)
+{
+    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
+    unified_read_poll(s, true);
+}
+
+static void unified_poll(NetClientState *nc, bool enable)
+{
+    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
+    unified_write_poll(s, enable);
+    unified_read_poll(s, enable);
+}
+
+static ssize_t net_unified_receive_dgram_iov(NetClientState *nc,
+                    const struct iovec *iov,
+                    int iovcnt)
+{
+    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
+
+    struct msghdr message;
+    int ret;
+
+    if (iovcnt > MAX_UNIFIED_IOVCNT - 1) {
+        error_report(
+            "iovec too long %d > %d, change unified.h",
+            iovcnt, MAX_UNIFIED_IOVCNT
+        );
+        return -1;
+    }
+    if (s->offset > 0) {
+        s->form_header(s);
+        memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
+        s->vec->iov_base = s->header_buf;
+        s->vec->iov_len = s->offset;
+        message.msg_iovlen = iovcnt + 1;
+    } else {
+        memcpy(s->vec, iov, iovcnt * sizeof(struct iovec));
+        message.msg_iovlen = iovcnt;
+    }
+    message.msg_name = s->dgram_dst;
+    message.msg_namelen = s->dst_size;
+    message.msg_iov = s->vec;
+    message.msg_control = NULL;
+    message.msg_controllen = 0;
+    message.msg_flags = 0;
+    do {
+        ret = sendmsg(s->fd, &message, 0);
+    } while ((ret == -1) && (errno == EINTR));
+    if (ret > 0) {
+        ret -= s->offset;
+    } else if (ret == 0) {
+        /* belt and braces - should not occur on DGRAM
+        * we should get an error and never a 0 send
+        */
+        ret = iov_size(iov, iovcnt);
+    } else {
+        /* signal upper layer that socket buffer is full */
+        ret = -errno;
+        if (ret == -EAGAIN || ret == -ENOBUFS) {
+            unified_write_poll(s, true);
+            ret = 0;
+        }
+    }
+    return ret;
+}
+
+static ssize_t net_unified_receive_dgram(NetClientState *nc,
+                    const uint8_t *buf,
+                    size_t size)
+{
+    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
+
+    struct iovec *vec;
+    struct msghdr message;
+    ssize_t ret = 0;
+
+    vec = s->vec;
+    if (s->offset > 0) {
+        s->form_header(s);
+        vec->iov_base = s->header_buf;
+        vec->iov_len = s->offset;
+        message.msg_iovlen = 2;
+        vec++;
+    } else {
+        message.msg_iovlen = 1;
+    }
+    vec->iov_base = (void *) buf;
+    vec->iov_len = size;
+    message.msg_name = s->dgram_dst;
+    message.msg_namelen = s->dst_size;
+    message.msg_iov = s->vec;
+    message.msg_control = NULL;
+    message.msg_controllen = 0;
+    message.msg_flags = 0;
+    do {
+        ret = sendmsg(s->fd, &message, 0);
+    } while ((ret == -1) && (errno == EINTR));
+    if (ret > 0) {
+        ret -= s->offset;
+    } else if (ret == 0) {
+        /* belt and braces - should not occur on DGRAM
+        * we should get an error and never a 0 send
+        */
+        ret = size;
+    } else {
+        ret = -errno;
+        if (ret == -EAGAIN || ret == -ENOBUFS) {
+            /* signal upper layer that socket buffer is full */
+            unified_write_poll(s, true);
+            ret = 0;
+        }
+    }
+    return ret;
+}
+
+
+/*
+ * Deliver packets that recvmmsg() has already placed in the s->msgvec ring.
+ *
+ * Slots are examined starting at s->queue_tail.  A slot is delivered with
+ * qemu_send_packet_async() when its length leaves a positive payload after
+ * s->header_size is subtracted and s->verify_header() returns 0; any other
+ * slot is treated as a bad read and dropped.  Every examined slot is
+ * consumed (tail advanced, depth decremented), including one for which
+ * qemu_send_packet_async() returned 0.
+ *
+ * The loop ends when the ring is empty, when qemu_can_send_packet() returns
+ * false, or when the last delivered packet returned a size that is not
+ * positive.
+ */
+static void net_unified_process_queue(NetUnifiedState *s)
+{
+    int size = 0;
+    struct iovec *vec;
+    bool bad_read;
+    int data_size;
+    struct mmsghdr *msgvec;
+
+    /* nothing to do unless there are received packets still pending */
+    if (s->queue_depth > 0) {
+        do {
+            msgvec = s->msgvec + s->queue_tail;
+            if (msgvec->msg_len > 0) {
+                /* payload length = datagram length minus the header area */
+                data_size = msgvec->msg_len - s->header_size;
+                vec = msgvec->msg_hdr.msg_iov;
+                if ((data_size > 0) &&
+                    (s->verify_header(s, vec->iov_base) == 0)) {
+                    /* with a header present the payload is the next iovec */
+                    if (s->header_size > 0) {
+                        vec++;
+                    }
+                    /* Use the legacy delivery for now, we will
+                     * switch to using our own ring as a queueing mechanism
+                     * at a later date
+                     */
+                    size = qemu_send_packet_async(
+                            &s->nc,
+                            vec->iov_base,
+                            data_size,
+                            unified_send_completed
+                        );
+                    /* size == 0: stop read polling; unified_send_completed
+                     * is the completion callback (presumably it turns
+                     * polling back on - confirm against its definition)
+                     */
+                    if (size == 0) {
+                        unified_read_poll(s, false);
+                    }
+                    bad_read = false;
+                } else {
+                    bad_read = true;
+                    if (!s->header_mismatch) {
+                        /* report error only once */
+                        error_report("unified header verification failed");
+                        s->header_mismatch = true;
+                    }
+                }
+            } else {
+                /* empty slot - skip it without reporting anything */
+                bad_read = true;
+            }
+            s->queue_tail = (s->queue_tail + 1) % MAX_UNIFIED_MSGCNT;
+            s->queue_depth--;
+        } while (
+                (s->queue_depth > 0) &&
+                 qemu_can_send_packet(&s->nc) &&
+                ((size > 0) || bad_read)
+            );
+    }
+}
+
+/*
+ * Read handler for the socket: pull as many datagrams as currently fit
+ * into the s->msgvec ring with a single recvmmsg() call, then hand the
+ * pending packets to net_unified_process_queue().
+ *
+ * @opaque is the NetUnifiedState of the transport.
+ */
+static void net_unified_send(void *opaque)
+{
+    NetUnifiedState *s = opaque;
+    int room;
+    int received;
+
+    if (s->queue_depth == 0) {
+        /* Ring is empty: rewind it and use the whole vector linearly. */
+        s->queue_head = 0;
+        s->queue_tail = 0;
+        room = MAX_UNIFIED_MSGCNT;
+    } else {
+        /*
+         * Packets are still pending: read at most the number of free
+         * slots, and never past the end of the vector so that a single
+         * recvmmsg() call does not have to wrap around.
+         */
+        int free_slots = MAX_UNIFIED_MSGCNT - s->queue_depth;
+        int until_wrap = MAX_UNIFIED_MSGCNT - s->queue_head;
+
+        room = free_slots > until_wrap ? until_wrap : free_slots;
+    }
+
+    if (room > 0) {
+        for (;;) {
+            received = recvmmsg(s->fd, s->msgvec + s->queue_head,
+                                room, MSG_DONTWAIT, NULL);
+            if (received != -1 || errno != EINTR) {
+                break;
+            }
+        }
+        /* A receive error adds nothing, pending packets are still flushed */
+        if (received < 0) {
+            received = 0;
+        }
+        s->queue_head = (s->queue_head + received) % MAX_UNIFIED_MSGCNT;
+        s->queue_depth += received;
+    }
+
+    net_unified_process_queue(s);
+}
+
+/*
+ * Release a receive vector created by build_unified_vector().
+ *
+ * @msgvec:   array of @count message headers, may be NULL
+ * @count:    number of entries in @msgvec
+ * @iovcount: number of iovec entries in use per message
+ *
+ * Within each message the last used iovec is the payload buffer, which
+ * build_unified_vector() obtains from qemu_memalign(); it therefore has to
+ * be returned with qemu_vfree().  Any entry before it (the header buffer)
+ * comes from g_malloc() and is returned with g_free().
+ */
+static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount)
+{
+    int i, j;
+
+    if (!msgvec) {
+        return;
+    }
+    for (i = 0; i < count; i++) {
+        struct iovec *iov = msgvec[i].msg_hdr.msg_iov;
+
+        if (!iov) {
+            continue;
+        }
+        for (j = 0; j < iovcount; j++) {
+            if (j == iovcount - 1) {
+                qemu_vfree(iov[j].iov_base);
+            } else {
+                g_free(iov[j].iov_base);
+            }
+        }
+        g_free(iov);
+    }
+    g_free(msgvec);
+}
+
+
+
+/*
+ * Allocate the receive vector handed to recvmmsg().
+ *
+ * Returns an array of @count message headers.  Each message owns an iovec
+ * array of IOVSIZE entries: when s->header_size is non-zero the first entry
+ * is a header buffer of that size and the second the payload buffer,
+ * otherwise the payload buffer is the only entry in use.  msg_iovlen is set
+ * to the number of entries actually filled in, so the kernel never sees an
+ * unused (and previously uninitialised) iovec.
+ *
+ * The payload buffer is BUFFER_SIZE bytes from qemu_memalign(); the header
+ * buffer comes from g_malloc().
+ */
+static struct mmsghdr *build_unified_vector(NetUnifiedState *s, int count)
+{
+    int i;
+    struct mmsghdr *msgvec = g_new0(struct mmsghdr, count);
+
+    for (i = 0; i < count; i++) {
+        struct msghdr *hdr = &msgvec[i].msg_hdr;
+        struct iovec *iov = g_new0(struct iovec, IOVSIZE);
+        int used = 0;
+
+        if (s->header_size > 0) {
+            iov[used].iov_base = g_malloc(s->header_size);
+            iov[used].iov_len = s->header_size;
+            used++;
+        }
+        iov[used].iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
+        iov[used].iov_len = BUFFER_SIZE;
+        used++;
+
+        hdr->msg_name = NULL;
+        hdr->msg_namelen = 0;
+        hdr->msg_iov = iov;
+        hdr->msg_iovlen = used;
+        hdr->msg_control = NULL;
+        hdr->msg_controllen = 0;
+        hdr->msg_flags = 0;
+    }
+    return msgvec;
+}
+
+/*
+ * NetClientInfo.cleanup callback: tear down a unified transport.
+ *
+ * Drops queued packets, disables read and write polling, closes the socket
+ * and releases every buffer hanging off the state, including the
+ * transport-private s->params block which no other callback gets a chance
+ * to free.  All pointers may still be NULL if initialisation did not
+ * complete; g_free() and destroy_vector() accept that.
+ */
+static void net_unified_cleanup(NetClientState *nc)
+{
+    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+    unified_read_poll(s, false);
+    unified_write_poll(s, false);
+    if (s->fd >= 0) {
+        close(s->fd);
+        s->fd = -1;
+    }
+    /* without a header only the first iovec of each message is in use */
+    destroy_vector(s->msgvec, MAX_UNIFIED_MSGCNT,
+                   s->header_size > 0 ? IOVSIZE : 1);
+    s->msgvec = NULL;
+    g_free(s->vec);
+    g_free(s->header_buf);
+    g_free(s->dgram_dst);
+    g_free(s->params);
+    s->params = NULL;
+}
+
+/*
+ * Callback table used by every net client created through
+ * qemu_new_unified_net_client().
+ */
+static NetClientInfo net_unified_info = {
+    /*
+     * NOTE: the table is shared by all unified transports, so each of them
+     * is registered with the L2TPv3 driver type - this is a known
+     * shortcut and is wrong for any other transport.
+     */
+    .type = NET_CLIENT_DRIVER_L2TPV3,
+    .size = sizeof(NetUnifiedState),
+    .receive = net_unified_receive_dgram,
+    .receive_iov = net_unified_receive_dgram_iov,
+    .poll = unified_poll,
+    .cleanup = net_unified_cleanup,
+};
+
+/*
+ * Create a net client backed by the shared unified callback table.
+ *
+ * @name and @peer are passed straight to qemu_new_net_client(); the model
+ * string is always "unified".
+ */
+NetClientState *qemu_new_unified_net_client(const char *name,
+                                            NetClientState *peer)
+{
+    NetClientState *nc;
+
+    nc = qemu_new_net_client(&net_unified_info, peer, "unified", name);
+    return nc;
+}
+
+/*
+ * Second half of transport setup, to be called once s->header_size is
+ * known and the socket @fd has been opened.
+ *
+ * Allocates the receive vector, the transmit iovec array and (when the
+ * transport has a header) the transmit header buffer, switches @fd to
+ * non-blocking mode, stores it in the state and enables read polling.
+ */
+void qemu_net_finalize_unified_init(NetUnifiedState *s, int fd)
+{
+    /* receive side */
+    s->msgvec = build_unified_vector(s, MAX_UNIFIED_MSGCNT);
+
+    /* transmit side */
+    s->vec = g_new(struct iovec, MAX_UNIFIED_IOVCNT);
+    s->header_buf = s->header_size > 0 ? g_malloc(s->header_size) : NULL;
+
+    qemu_set_nonblock(fd);
+    s->fd = fd;
+
+    unified_read_poll(s, true);
+}
+
diff --git a/net/unified.h b/net/unified.h
new file mode 100644
index 0000000000..97ec743f0e
--- /dev/null
+++ b/net/unified.h
@@ -0,0 +1,118 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2015-2017 Cambridge Greys Limited
+ * Copyright (c) 2012-2014 Cisco Systems
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+
+#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
+#define BUFFER_SIZE 2048
+#define IOVSIZE 2
+#define MAX_UNIFIED_MSGCNT 64
+#define MAX_UNIFIED_IOVCNT (MAX_UNIFIED_MSGCNT * IOVSIZE)
+
+#ifndef QEMU_NET_UNIFIED_H
+#define QEMU_NET_UNIFIED_H
+
+/*
+ * State shared by all datagram transports built on the unified backend.
+ * The transport specific code fills in offset, header_size, params,
+ * dgram_dst and the two header callbacks, then calls
+ * qemu_net_finalize_unified_init().
+ */
+typedef struct NetUnifiedState {
+    NetClientState nc;
+
+    /* socket used for both sendmsg() and recvmmsg() */
+    int fd;
+
+    /*
+     * Transmit side - packets are sent one at a time.
+     * header_buf holds the header written by form_header(); vec is the
+     * iovec array (MAX_UNIFIED_IOVCNT entries) passed to sendmsg().
+     */
+
+    uint8_t *header_buf;
+    struct iovec *vec;
+
+    /*
+     * Receive side - array of MAX_UNIFIED_MSGCNT message headers so that
+     * one recvmmsg() call can pick up that many packets at a time.
+     */
+
+    struct mmsghdr *msgvec;
+
+    /*
+     * Peer address and its length, used as msg_name/msg_namelen on transmit
+     */
+
+    struct sockaddr_storage *dgram_dst;
+    uint32_t dst_size;
+
+    /*
+     * Set once a header verification failure has been reported so that
+     * the error is logged a single time instead of once per bad packet.
+     */
+
+    bool header_mismatch;
+
+    /*
+     * Ring buffer handling for msgvec:
+     * queue_head  - slot where the next recvmmsg() starts writing
+     * queue_tail  - next slot to be delivered to the peer
+     * queue_depth - number of received slots not yet delivered
+     */
+
+    int queue_head;
+    int queue_tail;
+    int queue_depth;
+
+    /*
+     * Length of the header sent in front of each transmitted packet;
+     * zero means no header is prepended.  Common for all protocols.
+     */
+
+    uint32_t offset;
+
+    /*
+     * Bytes reserved in front of the payload of each received packet.
+     * Common for all protocols.
+     */
+
+    uint32_t header_size;
+
+    /*
+     * Poll control - NOTE(review): presumably maintained by
+     * unified_read_poll() and unified_write_poll(); confirm in unified.c
+     */
+
+    bool read_poll;
+    bool write_poll;
+
+    /* Transport private parameters, interpreted by the header callbacks */
+
+    void *params;
+
+    /*
+     * Header callbacks supplied by the transport; @s is this state.
+     * verify_header() returns 0 when the received header in @buf is
+     * acceptable.  form_header() fills header_buf before a transmit.
+     */
+
+    int (*verify_header)(void *s, uint8_t *buf);
+    void (*form_header)(void *s);
+
+} NetUnifiedState;
+
+extern NetClientState *qemu_new_unified_net_client(const char *name,
+                    NetClientState *peer);
+
+extern void qemu_net_finalize_unified_init(NetUnifiedState *s, int fd);
+#endif
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-18 17:08 [Qemu-devel] Unified Socket Driver anton.ivanov
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport anton.ivanov
@ 2017-07-18 17:08 ` anton.ivanov
  2017-07-19  5:48   ` Jason Wang
  2017-07-19 14:40   ` Eric Blake
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support anton.ivanov
  2 siblings, 2 replies; 23+ messages in thread
From: anton.ivanov @ 2017-07-18 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: jasowang, Anton Ivanov

From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

This adds GRETAP support to the unified socket driver.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 net/Makefile.objs |   2 +-
 net/clients.h     |   4 +
 net/gre.c         | 313 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/net.c         |   5 +
 qapi-schema.json  |  46 +++++++-
 qemu-options.hx   |  63 ++++++++++-
 6 files changed, 425 insertions(+), 8 deletions(-)
 create mode 100644 net/gre.c

diff --git a/net/Makefile.objs b/net/Makefile.objs
index 8026ad778a..128164e39b 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
 common-obj-y += socket.o
 common-obj-y += dump.o
 common-obj-y += eth.o
-common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
+common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o
 common-obj-$(CONFIG_POSIX) += vhost-user.o
 common-obj-$(CONFIG_SLIRP) += slirp.o
 common-obj-$(CONFIG_VDE) += vde.o
diff --git a/net/clients.h b/net/clients.h
index 5cae479730..8f8a59aee3 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -49,6 +49,10 @@ int net_init_bridge(const Netdev *netdev, const char *name,
 
 int net_init_l2tpv3(const Netdev *netdev, const char *name,
                     NetClientState *peer, Error **errp);
+
+int net_init_gre(const Netdev *netdev, const char *name,
+                    NetClientState *peer, Error **errp);
+
 #ifdef CONFIG_VDE
 int net_init_vde(const Netdev *netdev, const char *name,
                  NetClientState *peer, Error **errp);
diff --git a/net/gre.c b/net/gre.c
new file mode 100644
index 0000000000..ee8c36dd4d
--- /dev/null
+++ b/net/gre.c
@@ -0,0 +1,313 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2015-2017 Cambridge GREys Limited
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2012-2014 Cisco Systems
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <linux/ip.h>
+#include <netdb.h>
+#include "net/net.h"
+#include "clients.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "unified.h"
+
+/* IANA-assigned IP protocol ID for GRE */
+
+
+#ifndef IPPROTO_GRE
+#define IPPROTO_GRE 0x2F
+#endif
+
+#define GRE_MODE_CHECKSUM     htons(8 << 12)   /* checksum present */
+#define GRE_MODE_RESERVED     htons(4 << 12)   /* unused */
+#define GRE_MODE_KEY          htons(2 << 12)   /* key present */
+#define GRE_MODE_SEQUENCE     htons(1 << 12)   /* sequence number present */
+
+
+/* GRE TYPE for Ethernet in GRE aka GRETAP */
+
+#define GRE_IRB htons(0x6558)
+
+/*
+ * The first four bytes of a GRE header.  Both fields are kept in network
+ * byte order: they are only ever assigned from the htons() based
+ * GRE_MODE_* and GRE_IRB constants.
+ */
+struct gre_minimal_header {
+   uint16_t header;   /* flag bits, an OR of GRE_MODE_* values */
+   uint16_t arptype;  /* type of the carried protocol, set to GRE_IRB */
+};
+
+/*
+ * GRE specific settings, stored in NetUnifiedState.params and used by
+ * gre_form_header() and gre_verify_header().
+ */
+typedef struct GRETunnelParams {
+    /*
+     * GRE parameters
+     * rx_key   - key a received packet must carry to be accepted
+     * tx_key   - key written into every transmitted header
+     * sequence - last sequence number sent (incremented before each use)
+     */
+
+    uint32_t rx_key;
+    uint32_t tx_key;
+    uint32_t sequence;
+
+    /*
+     * Flags
+     * ipv6         - tunnel runs over IPv6; when false, received buffers
+     *                start with an IPv4 header that is skipped
+     * udp          - NOTE(review): read when sizing the receive header but
+     *                not set explicitly by net_init_gre()
+     * has_sequence - a sequence field is written into transmitted headers
+     * pin_sequence - write zero instead of a running sequence number
+     * checksum     - not used by the GRE code in this file
+     * key          - header carries a key field
+     */
+
+    bool ipv6;
+    bool udp;
+    bool has_sequence;
+    bool pin_sequence;
+    bool checksum;
+    bool key;
+
+    /*
+     * Precomputed byte offsets of the optional fields from the start of
+     * the GRE header (checksum_offset is set but not otherwise used)
+     */
+
+    uint32_t key_offset;
+    uint32_t sequence_offset;
+    uint32_t checksum_offset;
+
+    /* Expected/transmitted first four bytes of the GRE header */
+
+    struct gre_minimal_header header_bits;
+
+} GRETunnelParams;
+
+
+
+/*
+ * form_header callback: build the GRE header for the next transmitted
+ * packet in s->header_buf.
+ *
+ * Writes the four fixed header bytes, then the transmit key (big endian)
+ * when keys are enabled, then the sequence field when sequencing is
+ * enabled - either zero (pinned) or the incremented running counter.
+ */
+static void gre_form_header(void *us)
+{
+    NetUnifiedState *s = us;
+    GRETunnelParams *p = s->params;
+    uint8_t *hdr = s->header_buf;
+
+    memcpy(hdr, &p->header_bits, sizeof(uint32_t));
+
+    if (p->key) {
+        stl_be_p(hdr + p->key_offset, p->tx_key);
+    }
+
+    if (!p->has_sequence) {
+        return;
+    }
+    if (p->pin_sequence) {
+        stl_be_p(hdr + p->sequence_offset, 0);
+    } else {
+        p->sequence++;
+        stl_be_p(hdr + p->sequence_offset, p->sequence);
+    }
+}
+
+/*
+ * verify_header callback: check the header of a received packet.
+ *
+ * @buf points at the start of the received header area.  For IPv4 the
+ * buffer begins with the IP header, which is stepped over using
+ * sizeof(struct iphdr) (NOTE(review): this assumes an IPv4 header without
+ * options).
+ *
+ * Returns 0 when the first four GRE bytes equal the configured ones and,
+ * if keys are enabled, the key equals rx_key; returns -1 otherwise.
+ * Problems are only logged while s->header_mismatch is still false.
+ */
+static int gre_verify_header(void *us, uint8_t *buf)
+{
+    NetUnifiedState *s = us;
+    GRETunnelParams *p = s->params;
+    uint32_t expected, received, key;
+
+    if (!p->ipv6) {
+        buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
+    }
+
+    memcpy(&expected, &p->header_bits, sizeof(expected));
+    memcpy(&received, buf, sizeof(received));
+
+    if (received != expected) {
+        if (!s->header_mismatch) {
+            error_report("header type disagreement, expecting %0x, got %0x",
+                expected, received);
+        }
+        return -1;
+    }
+
+    if (!p->key) {
+        return 0;
+    }
+
+    key = ldl_be_p(buf + p->key_offset);
+    if (key == p->rx_key) {
+        return 0;
+    }
+    if (!s->header_mismatch) {
+        error_report("unknown key id %0x, expecting %0x",
+            key, p->rx_key);
+    }
+    return -1;
+}
+
+/*
+ * Create a GRETAP (Ethernet over GRE) net client on a raw IP socket.
+ *
+ * @netdev: parsed options, must be of type NET_CLIENT_DRIVER_GRE
+ * @name:   client name
+ * @peer:   peer net client
+ * @errp:   currently unused, failures are reported with error_report()
+ *
+ * The header layout is derived from the options: four fixed bytes,
+ * followed by a four byte key when rxkey/txkey are given (both or
+ * neither), followed by a four byte sequence number when 'sequence' or
+ * 'pinsequence' is on.  The socket is bound to 'src' and packets are sent
+ * to 'dst'.
+ *
+ * Returns 0 on success and -1 on failure; on failure the net client and
+ * everything allocated here is released again.
+ */
+int net_init_gre(const Netdev *netdev,
+                    const char *name,
+                    NetClientState *peer, Error **errp)
+{
+    /* FIXME error_setg(errp, ...) on failure */
+    const NetdevGREOptions *gre;
+    NetUnifiedState *s;
+    NetClientState *nc;
+    GRETunnelParams *p;
+
+    int fd = -1, gairet;
+    struct addrinfo hints;
+    struct addrinfo *result = NULL;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_GRE);
+    gre = &netdev->u.gre;
+
+    nc = qemu_new_unified_net_client(name, peer);
+
+    s = DO_UPCAST(NetUnifiedState, nc, nc);
+
+    /*
+     * The cleanup callback closes any s->fd >= 0.  Mark the socket as not
+     * open yet so that an early failure cannot close descriptor 0.
+     */
+    s->fd = -1;
+
+    /* zero-initialised so that every flag has a defined value */
+    p = g_new0(GRETunnelParams, 1);
+
+    s->params = p;
+    p->header_bits.arptype = GRE_IRB;
+    p->header_bits.header = 0;
+
+    s->form_header = &gre_form_header;
+    s->verify_header = &gre_verify_header;
+    s->queue_head = 0;
+    s->queue_tail = 0;
+    s->header_mismatch = false;
+
+    p->ipv6 = gre->has_ipv6 && gre->ipv6;
+
+    /* the optional fields start right after the four fixed bytes */
+    s->offset = 4;
+    p->key_offset = 4;
+    p->sequence_offset = 4;
+    p->checksum_offset = 4;
+
+    if (gre->has_rxkey != gre->has_txkey) {
+        error_report("gre_open : rxkey and txkey must be specified together");
+        goto outerr;
+    }
+    p->key = gre->has_rxkey;
+
+    if (p->key) {
+        p->header_bits.header |= GRE_MODE_KEY;
+        p->rx_key = gre->rxkey;
+        p->tx_key = gre->txkey;
+        s->offset += 4;
+        p->sequence_offset += 4;
+    }
+
+    /*
+     * pin sequence implies that there is sequence: in both cases the
+     * header needs room for the field and must advertise it.
+     */
+    p->pin_sequence = gre->has_pinsequence && gre->pinsequence;
+    p->has_sequence = p->pin_sequence ||
+                      (gre->has_sequence && gre->sequence);
+    if (p->has_sequence) {
+        s->offset += 4;
+        p->header_bits.header |= GRE_MODE_SEQUENCE;
+    }
+    p->sequence = 0;
+
+    /* the same lookup hints serve both the source and the destination */
+    memset(&hints, 0, sizeof(hints));
+    hints.ai_family = p->ipv6 ? AF_INET6 : AF_INET;
+    hints.ai_socktype = SOCK_RAW;
+    hints.ai_protocol = IPPROTO_GRE;
+
+    gairet = getaddrinfo(gre->src, NULL, &hints, &result);
+
+    if ((gairet != 0) || (result == NULL)) {
+        error_report(
+            "gre_open : could not resolve src, errno = %s",
+            gai_strerror(gairet)
+        );
+        goto outerr;
+    }
+    fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol);
+    if (fd == -1) {
+        error_report("gre_open : socket creation failed, errno = %d", errno);
+        goto outerr;
+    }
+    if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) {
+        error_report("gre_open :  could not bind socket err=%i", errno);
+        goto outerr;
+    }
+    freeaddrinfo(result);
+    result = NULL;
+
+    gairet = getaddrinfo(gre->dst, NULL, &hints, &result);
+    if ((gairet != 0) || (result == NULL)) {
+        error_report(
+            "gre_open : could not resolve dst, error = %s",
+            gai_strerror(gairet)
+        );
+        goto outerr;
+    }
+
+    s->dgram_dst = g_new0(struct sockaddr_storage, 1);
+    memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen);
+    s->dst_size = result->ai_addrlen;
+
+    freeaddrinfo(result);
+    result = NULL;
+
+    /* a raw IPv4 socket also delivers the IP header on receive */
+    if ((p->ipv6) || (p->udp)) {
+        s->header_size = s->offset;
+    } else {
+        s->header_size = s->offset + sizeof(struct iphdr);
+    }
+
+    qemu_net_finalize_unified_init(s, fd);
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "gre: connected");
+    return 0;
+outerr:
+    /* release the parameter block here and leave nothing dangling */
+    g_free(s->params);
+    s->params = NULL;
+    qemu_del_net_client(nc);
+    if (fd >= 0) {
+        close(fd);
+    }
+    if (result) {
+        freeaddrinfo(result);
+    }
+    return -1;
+}
diff --git a/net/net.c b/net/net.c
index 9270b52ac8..b75b6e8154 100644
--- a/net/net.c
+++ b/net/net.c
@@ -961,6 +961,7 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
 #endif
 #ifdef CONFIG_UNIFIED
         [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
+        [NET_CLIENT_DRIVER_GRE] = net_init_gre,
 #endif
 };
 
@@ -1012,6 +1013,10 @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
             legacy.type = NET_CLIENT_DRIVER_L2TPV3;
             legacy.u.l2tpv3 = opts->u.l2tpv3;
             break;
+        case NET_LEGACY_OPTIONS_TYPE_GRE:
+            legacy.type = NET_CLIENT_DRIVER_GRE;
+            legacy.u.gre = opts->u.gre;
+            break;
         case NET_LEGACY_OPTIONS_TYPE_SOCKET:
             legacy.type = NET_CLIENT_DRIVER_SOCKET;
             legacy.u.socket = opts->u.socket;
diff --git a/qapi-schema.json b/qapi-schema.json
index ab438ead70..aec303a14e 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3847,7 +3847,41 @@
     'txsession':    'uint32',
     '*rxsession':   'uint32',
     '*offset':      'uint32' } }
-
+##
+# @NetdevGREOptions:
+#
+# Connect the VLAN to an Ethernet over GRE (GRETAP) tunnel
+#
+# @src: source address
+#
+# @dst: destination address
+#
+# @ipv6: force the use of ipv6
+#
+# @sequence: have sequence counter
+#
+# @pinsequence: pin sequence counter to zero -
+#              workaround for buggy implementations or
+#              networks with packet reorder
+#
+# @txkey: 32 bit transmit key
+#
+# @rxkey: 32 bit receive key
+#
+# Note - gre checksums are not supported at present
+#
+#
+# Since 2.9
+##
+{ 'struct': 'NetdevGREOptions',
+  'data': {
+    'src':          'str',
+    'dst':          'str',
+    '*ipv6':        'bool',
+    '*sequence':     'bool',
+    '*pinsequence':  'bool',
+    '*txkey':    'uint32',
+    '*rxkey':    'uint32' } }
 ##
 # @NetdevVdeOptions:
 #
@@ -3966,7 +4000,7 @@
 ##
 { 'enum': 'NetClientDriver',
   'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
-            'bridge', 'hubport', 'netmap', 'vhost-user' ] }
+            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
 
 ##
 # @Netdev:
@@ -3996,7 +4030,8 @@
     'bridge':   'NetdevBridgeOptions',
     'hubport':  'NetdevHubPortOptions',
     'netmap':   'NetdevNetmapOptions',
-    'vhost-user': 'NetdevVhostUserOptions' } }
+    'vhost-user': 'NetdevVhostUserOptions',
+    'gre':      'NetdevGREOptions' } }
 
 ##
 # @NetLegacy:
@@ -4027,7 +4062,7 @@
 ##
 { 'enum': 'NetLegacyOptionsType',
   'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
-           'dump', 'bridge', 'netmap', 'vhost-user'] }
+           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
 
 ##
 # @NetLegacyOptions:
@@ -4050,7 +4085,8 @@
     'dump':     'NetdevDumpOptions',
     'bridge':   'NetdevBridgeOptions',
     'netmap':   'NetdevNetmapOptions',
-    'vhost-user': 'NetdevVhostUserOptions' } }
+    'vhost-user': 'NetdevVhostUserOptions',
+    'gre':      'NetdevGREOptions' } }
 
 ##
 # @NetFilterDirection:
diff --git a/qemu-options.hx b/qemu-options.hx
index 2cc70b9cfc..6f8d5cbe21 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1945,7 +1945,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
     "                connected to a bridge (default=" DEFAULT_BRIDGE_INTERFACE ")\n"
     "                using the program 'helper (default=" DEFAULT_BRIDGE_HELPER ")\n"
 #endif
-#ifdef __linux__
+#ifdef CONFIG_UNIFIED
     "-netdev l2tpv3,id=str,src=srcaddr,dst=dstaddr[,srcport=srcport][,dstport=dstport]\n"
     "         [,rxsession=rxsession],txsession=txsession[,ipv6=on/off][,udp=on/off]\n"
     "         [,cookie64=on/off][,counter][,pincounter][,txcookie=txcookie]\n"
@@ -1971,6 +1971,23 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
     "                use 'counter=off' to force a 'cut-down' L2TPv3 with no counter\n"
     "                use 'pincounter=on' to work around broken counter handling in peer\n"
     "                use 'offset=X' to add an extra offset between header and data\n"
+    "-netdev gre,id=str,src=srcaddr,dst=dstaddr[,rxkey=rxkey],txkey=txkey[,ipv6=on/off]\n"
+    "         [,sequence][,pinsequence]\n"
+    "                configure a network backend with ID 'str' connected to\n"
+    "                an Ethernet over GRE pseudowire (aka GRE TAP).\n"
+    "                Linux kernel 3.3+ as well as most routers and some switches\n"
+    "                can talk GRETAP. This transport allows connecting a VM to a VM,\n"
+    "                VM to a router and even VM to Host. It is a nearly-universal\n"
+    "                standard (RFC1701).\n"
+    "                use 'src=' to specify source address\n"
+    "                use 'dst=' to specify destination address\n"
+    "                use 'ipv6=on' to force v6\n"
+    "                GRE may use keys to prevent misconfiguration as\n"
+    "                well as a weak security measure\n"
+    "                use 'rxkey=0x01234' to specify a rxkey\n"
+    "                use 'txkey=0x01234' to specify a txkey\n"
+    "                use 'sequence=on' to add frame sequence to each packet\n"
+    "                use 'pinsequence=on' to work around broken sequence handling in peer\n"
 #endif
     "-netdev socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
     "                configure a network backend to connect to another network\n"
@@ -2394,12 +2411,54 @@ ip l2tp add session tunnel_id 1 name vmtunnel0 session_id \
 ifconfig vmtunnel0 mtu 1500
 ifconfig vmtunnel0 up
 brctl addif br-lan vmtunnel0
+@end example
+
+Alternatively, it is possible to assign an IP address to vmtunnel0, which allows
+the VM to connect to the host directly without using Linux bridging.
+
+
+@item -netdev gre,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,ipv6][,sequence][,pinsequence][,txkey=@var{txkey}][,rxkey=@var{rxkey}]
+@itemx -net gre[,vlan=@var{n}][,name=@var{name}],src=@var{srcaddr},dst=@var{dstaddr}[,ipv6][,sequence][,pinsequence][,txkey=@var{txkey}][,rxkey=@var{rxkey}]
+Connect VLAN @var{n} to a GRE pseudowire. GRE (RFC1701) is a popular
+protocol to transport various data frames between two systems.
+We are interested in a specific GRE variety where the transported
+frames are Ethernet. This GRE type is usually referred to as GRETAP.
+It is present in routers, firewalls, switches and the Linux kernel
+(from version 3.3 onwards).
+
+This transport allows a VM to communicate with another VM, router or firewall directly.
+
+@item src=@var{srcaddr}
+    source address (mandatory)
+@item dst=@var{dstaddr}
+    destination address (mandatory)
+@item ipv6
+    force v6, otherwise defaults to v4.
+@item rxkey=@var{rxkey}
+@itemx txkey=@var{txkey}
+    Keys are a weak form of security in the gre specification.
+Their function is mostly to prevent misconfiguration.
+@item sequence=on
+    Add frame sequence to GRE frames
+@item pinsequence=on
+    Work around broken sequence handling in peer. This may also help on
+networks which have packet reorder.
+
+For example, to attach a VM running on host 4.3.2.1 via GRETAP to the bridge br-lan
+on the remote Linux host 1.2.3.4:
+@example
+# Setup tunnel on linux host using raw ip as encapsulation
+# on 1.2.3.4
+ip link add gt0 type gretap local 1.2.3.4 remote 4.3.2.1
+ifconfig gt0 mtu 1500
+ifconfig gt0 up
+brctl addif br-lan gt0
 
 
 # on 4.3.2.1
 # launch QEMU instance - if your network has reorder or is very lossy add ,pincounter
 
-qemu-system-i386 linux.img -net nic -net l2tpv3,src=4.2.3.1,dst=1.2.3.4,udp,srcport=16384,dstport=16384,rxsession=0xffffffff,txsession=0xffffffff,counter
+qemu-system-i386 linux.img -net nic -net gre,src=4.3.2.1,dst=1.2.3.4
 
 
 @end example
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-18 17:08 [Qemu-devel] Unified Socket Driver anton.ivanov
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport anton.ivanov
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support anton.ivanov
@ 2017-07-18 17:08 ` anton.ivanov
  2017-07-19  5:58   ` Jason Wang
  2017-07-19 14:42   ` Eric Blake
  2 siblings, 2 replies; 23+ messages in thread
From: anton.ivanov @ 2017-07-18 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: jasowang, Anton Ivanov

From: Anton Ivanov <anton.ivanov@cambridgegreys.com>

This adds raw socket support to the unified socket driver.

Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
---
 net/Makefile.objs |   2 +-
 net/clients.h     |   3 ++
 net/net.c         |   5 +++
 net/raw.c         | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qapi-schema.json  |  25 +++++++++--
 qemu-options.hx   |  33 +++++++++++++++
 6 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 net/raw.c

diff --git a/net/Makefile.objs b/net/Makefile.objs
index 128164e39b..54cf7dd194 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
 common-obj-y += socket.o
 common-obj-y += dump.o
 common-obj-y += eth.o
-common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o
+common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o raw.o
 common-obj-$(CONFIG_POSIX) += vhost-user.o
 common-obj-$(CONFIG_SLIRP) += slirp.o
 common-obj-$(CONFIG_VDE) += vde.o
diff --git a/net/clients.h b/net/clients.h
index 8f8a59aee3..98d8ae59b7 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -53,6 +53,9 @@ int net_init_l2tpv3(const Netdev *netdev, const char *name,
 int net_init_gre(const Netdev *netdev, const char *name,
                     NetClientState *peer, Error **errp);
 
+int net_init_raw(const Netdev *netdev, const char *name,
+                    NetClientState *peer, Error **errp);
+
 #ifdef CONFIG_VDE
 int net_init_vde(const Netdev *netdev, const char *name,
                  NetClientState *peer, Error **errp);
diff --git a/net/net.c b/net/net.c
index b75b6e8154..2d988a120c 100644
--- a/net/net.c
+++ b/net/net.c
@@ -962,6 +962,7 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
 #ifdef CONFIG_UNIFIED
         [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
         [NET_CLIENT_DRIVER_GRE] = net_init_gre,
+        [NET_CLIENT_DRIVER_RAW] = net_init_raw,
 #endif
 };
 
@@ -1017,6 +1018,10 @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
             legacy.type = NET_CLIENT_DRIVER_GRE;
             legacy.u.gre = opts->u.gre;
             break;
+        case NET_LEGACY_OPTIONS_TYPE_RAW:
+            legacy.type = NET_CLIENT_DRIVER_RAW;
+            legacy.u.raw = opts->u.raw;
+            break;
         case NET_LEGACY_OPTIONS_TYPE_SOCKET:
             legacy.type = NET_CLIENT_DRIVER_SOCKET;
             legacy.u.socket = opts->u.socket;
diff --git a/net/raw.c b/net/raw.c
new file mode 100644
index 0000000000..73e2fd9fe3
--- /dev/null
+++ b/net/raw.c
@@ -0,0 +1,123 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2015-2017 Cambridge Greys Limited
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2012-2014 Cisco Systems
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <linux/ip.h>
+#include <netdb.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include "net/net.h"
+ #include <sys/socket.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include "clients.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "unified.h"
+
+static int noop(void *us, uint8_t *buf)
+{
+    return 0;
+}
+
+int net_init_raw(const Netdev *netdev,
+                    const char *name,
+                    NetClientState *peer, Error **errp)
+{
+
+    const NetdevRawOptions *raw;
+    NetUnifiedState *s;
+    NetClientState *nc;
+
+    int fd = -1;
+    int err;
+
+    struct ifreq ifr;
+    struct sockaddr_ll sock;
+
+
+    nc = qemu_new_unified_net_client(name, peer);
+
+    s = DO_UPCAST(NetUnifiedState, nc, nc);
+
+    fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+    if (fd == -1) {
+        err = -errno;
+        error_report("raw_open : raw socket creation failed, errno = %d", -err);
+        goto outerr;
+    }
+
+
+    s->form_header = NULL;
+    s->verify_header = &noop;
+    s->queue_head = 0;
+    s->queue_tail = 0;
+    s->header_mismatch = false;
+    s->dgram_dst = NULL;
+    s->dst_size = 0;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_RAW);
+    raw = &netdev->u.raw;
+
+    memset(&ifr, 0, sizeof(struct ifreq));
+    strncpy((char *) &ifr.ifr_name, raw->ifname, sizeof(ifr.ifr_name) - 1);
+
+    if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
+        err = -errno;
+        error_report("SIOCGIFINDEX, failed to get raw interface index for %s",
+            raw->ifname);
+        goto outerr;
+    }
+
+    sock.sll_family = AF_PACKET;
+    sock.sll_protocol = htons(ETH_P_ALL);
+    sock.sll_ifindex = ifr.ifr_ifindex;
+
+    if (bind(fd, (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
+        error_report("raw: failed to bind raw socket");
+        err = -errno;
+        goto outerr;
+    }
+
+    s->offset = 0;
+
+    qemu_net_finalize_unified_init(s, fd);
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "raw: connected");
+    return 0;
+outerr:
+    qemu_del_net_client(nc);
+    if (fd >= 0) {
+        close(fd);
+    }
+    return -1;
+}
+
diff --git a/qapi-schema.json b/qapi-schema.json
index aec303a14e..cde78ce3a1 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -3883,6 +3883,21 @@
     '*txkey':    'uint32',
     '*rxkey':    'uint32' } }
 ##
+# @NetdevRawOptions:
+#
+# Connect the VLAN to a network interface using raw sockets
+#
+# @ifname: network interface name
+#
+
+# Since 2.9
+##
+{ 'struct': 'NetdevRawOptions',
+  'data': {
+    'ifname':          'str'
+} }
+
+##
 # @NetdevVdeOptions:
 #
 # Connect the VLAN to a vde switch running on the host.
@@ -4000,7 +4015,7 @@
 ##
 { 'enum': 'NetClientDriver',
   'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
-            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
+            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre', 'raw' ] }
 
 ##
 # @Netdev:
@@ -4031,7 +4046,8 @@
     'hubport':  'NetdevHubPortOptions',
     'netmap':   'NetdevNetmapOptions',
     'vhost-user': 'NetdevVhostUserOptions',
-    'gre':      'NetdevGREOptions' } }
+    'gre':      'NetdevGREOptions',
+    'raw':      'NetdevRawOptions' } }
 
 ##
 # @NetLegacy:
@@ -4062,7 +4078,7 @@
 ##
 { 'enum': 'NetLegacyOptionsType',
   'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
-           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
+           'dump', 'bridge', 'netmap', 'vhost-user', 'gre', 'raw'] }
 
 ##
 # @NetLegacyOptions:
@@ -4086,7 +4102,8 @@
     'bridge':   'NetdevBridgeOptions',
     'netmap':   'NetdevNetmapOptions',
     'vhost-user': 'NetdevVhostUserOptions',
-    'gre':      'NetdevGREOptions' } }
+    'gre':      'NetdevGREOptions',
+    'raw':      'NetdevRawOptions' } }
 
 ##
 # @NetFilterDirection:
diff --git a/qemu-options.hx b/qemu-options.hx
index 6f8d5cbe21..d9db8b576b 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1988,6 +1988,13 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
     "                use 'txkey=0x01234' to specify a txkey\n"
     "                use 'sequence=on' to add frame sequence to each packet\n"
     "                use 'pinsequence=on' to work around broken sequence handling in peer\n"
+    "-netdev raw,id=str,ifname=ifname\n"
+    "                configure a network backend with ID 'str' connected to\n"
+    "                an Ethernet interface named ifname via raw socket.\n"
+    "                This backend does not change the interface settings.\n"
+    "                Most interfaces will require being set into promisc mode,\n"
+    "                as well having most offloads (TSO, etc) turned off.\n"
+    "                Some virtual interfaces like tap support only RX.\n"
 #endif
     "-netdev socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
     "                configure a network backend to connect to another network\n"
@@ -2463,6 +2470,32 @@ qemu-system-i386 linux.img -net nic -net gre,src=4.2.3.1,dst=1.2.3.4
 
 @end example
 
+@item -netdev raw,id=@var{id},ifname=@var{ifname}
+@itemx -net raw[,vlan=@var{n}][,name=@var{name}],ifname=@var{ifname}
+Connect VLAN @var{n} directly to an Ethernet interface using a raw socket.
+
+This transport allows a VM to bypass most of the network stack, which is
+extremely useful for tapping.
+
+@item ifname=@var{ifname}
+    interface name (mandatory)
+
+@example
+# set up the interface - put it in promiscuous mode and turn off offloads
+ifconfig eth0 up
+ifconfig eth0 promisc
+
+/sbin/ethtool -K eth0 gro off
+/sbin/ethtool -K eth0 tso off
+/sbin/ethtool -K eth0 gso off
+/sbin/ethtool -K eth0 tx off
+
+# launch QEMU instance
+
+qemu-system-i386 linux.img -net nic -net raw,ifname=eth0
+
+@end example
+
 @item -netdev vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
 @itemx -net vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
 Connect VLAN @var{n} to PORT @var{n} of a vde switch running on host and
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport anton.ivanov
@ 2017-07-19  5:39   ` Jason Wang
  2017-07-19  5:48     ` Anton Ivanov
  2017-07-21 17:50     ` Anton Ivanov
  0 siblings, 2 replies; 23+ messages in thread
From: Jason Wang @ 2017-07-19  5:39 UTC (permalink / raw)
  To: anton.ivanov, qemu-devel



On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>
> 1. Creates a common backend for socket transports using
> recvmmsg().
> 2. Migrates L2TPv3 to the new backend

It would be better if you could split item 2 (the L2TPv3 migration) out of this patch into a separate one.

>
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>   configure         |  10 +-
>   net/Makefile.objs |   2 +-
>   net/l2tpv3.c      | 531 +++++++++---------------------------------------------
>   net/net.c         |   4 +-
>   net/unified.c     | 406 +++++++++++++++++++++++++++++++++++++++++
>   net/unified.h     | 118 ++++++++++++
>   6 files changed, 613 insertions(+), 458 deletions(-)
>   create mode 100644 net/unified.c
>   create mode 100644 net/unified.h
>
> diff --git a/configure b/configure
> index a3f0522e8f..99a60b723c 100755
> --- a/configure
> +++ b/configure
> @@ -1862,7 +1862,7 @@ if ! compile_object -Werror ; then
>   fi
>   
>   ##########################################
> -# L2TPV3 probe
> +# UNIFIED probe
>   
>   cat > $TMPC <<EOF
>   #include <sys/socket.h>
> @@ -1870,9 +1870,9 @@ cat > $TMPC <<EOF
>   int main(void) { return sizeof(struct mmsghdr); }
>   EOF
>   if compile_prog "" "" ; then
> -  l2tpv3=yes
> +  unified=yes
>   else
> -  l2tpv3=no
> +  unified=no
>   fi
>   
>   ##########################################
> @@ -5458,8 +5458,8 @@ fi
>   if test "$netmap" = "yes" ; then
>     echo "CONFIG_NETMAP=y" >> $config_host_mak
>   fi
> -if test "$l2tpv3" = "yes" ; then
> -  echo "CONFIG_L2TPV3=y" >> $config_host_mak
> +if test "$unified" = "yes" ; then
> +  echo "CONFIG_UNIFIED=y" >> $config_host_mak
>   fi

Could we keep the l2tpv3 option?

>   if test "$cap_ng" = "yes" ; then
>     echo "CONFIG_LIBCAP=y" >> $config_host_mak
> diff --git a/net/Makefile.objs b/net/Makefile.objs
> index 67ba5e26fb..8026ad778a 100644
> --- a/net/Makefile.objs
> +++ b/net/Makefile.objs
> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>   common-obj-y += socket.o
>   common-obj-y += dump.o
>   common-obj-y += eth.o
> -common-obj-$(CONFIG_L2TPV3) += l2tpv3.o
> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>   common-obj-$(CONFIG_SLIRP) += slirp.o
>   common-obj-$(CONFIG_VDE) += vde.o
> diff --git a/net/l2tpv3.c b/net/l2tpv3.c
> index 6745b78990..05413c9cbd 100644
> --- a/net/l2tpv3.c
> +++ b/net/l2tpv3.c
> @@ -1,6 +1,7 @@
>   /*
>    * QEMU System Emulator
>    *
> + * Copyright (c) 2015-2017 Cambridge Greys Limited
>    * Copyright (c) 2003-2008 Fabrice Bellard
>    * Copyright (c) 2012-2014 Cisco Systems
>    *
> @@ -34,19 +35,9 @@
>   #include "qemu/sockets.h"
>   #include "qemu/iov.h"
>   #include "qemu/main-loop.h"
> +#include "unified.h"
>   
>   
> -/* The buffer size needs to be investigated for optimum numbers and
> - * optimum means of paging in on different systems. This size is
> - * chosen to be sufficient to accommodate one packet with some headers
> - */
> -
> -#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
> -#define BUFFER_SIZE 2048
> -#define IOVSIZE 2
> -#define MAX_L2TPV3_MSGCNT 64
> -#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE)
> -
>   /* Header set to 0x30000 signifies a data packet */
>   
>   #define L2TPV3_DATA_PACKET 0x30000
> @@ -57,31 +48,7 @@
>   #define IPPROTO_L2TP 0x73
>   #endif
>   
> -typedef struct NetL2TPV3State {
> -    NetClientState nc;
> -    int fd;
> -
> -    /*
> -     * these are used for xmit - that happens packet a time
> -     * and for first sign of life packet (easier to parse that once)
> -     */
> -
> -    uint8_t *header_buf;
> -    struct iovec *vec;
> -
> -    /*
> -     * these are used for receive - try to "eat" up to 32 packets at a time
> -     */
> -
> -    struct mmsghdr *msgvec;
> -
> -    /*
> -     * peer address
> -     */
> -
> -    struct sockaddr_storage *dgram_dst;
> -    uint32_t dst_size;
> -
> +typedef struct L2TPV3TunnelParams {
>       /*
>        * L2TPv3 parameters
>        */
> @@ -90,37 +57,8 @@ typedef struct NetL2TPV3State {
>       uint64_t tx_cookie;
>       uint32_t rx_session;
>       uint32_t tx_session;
> -    uint32_t header_size;
>       uint32_t counter;
>   
> -    /*
> -    * DOS avoidance in error handling
> -    */
> -
> -    bool header_mismatch;
> -
> -    /*
> -     * Ring buffer handling
> -     */
> -
> -    int queue_head;
> -    int queue_tail;
> -    int queue_depth;
> -
> -    /*
> -     * Precomputed offsets
> -     */
> -
> -    uint32_t offset;
> -    uint32_t cookie_offset;
> -    uint32_t counter_offset;
> -    uint32_t session_offset;
> -
> -    /* Poll Control */
> -
> -    bool read_poll;
> -    bool write_poll;
> -
>       /* Flags */
>   
>       bool ipv6;
> @@ -130,189 +68,62 @@ typedef struct NetL2TPV3State {
>       bool cookie;
>       bool cookie_is_64;
>   
> -} NetL2TPV3State;
> -
> -static void net_l2tpv3_send(void *opaque);
> -static void l2tpv3_writable(void *opaque);
> -
> -static void l2tpv3_update_fd_handler(NetL2TPV3State *s)
> -{
> -    qemu_set_fd_handler(s->fd,
> -                        s->read_poll ? net_l2tpv3_send : NULL,
> -                        s->write_poll ? l2tpv3_writable : NULL,
> -                        s);
> -}
> -
> -static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable)
> -{
> -    if (s->read_poll != enable) {
> -        s->read_poll = enable;
> -        l2tpv3_update_fd_handler(s);
> -    }
> -}
> +    /* Precomputed L2TPV3 specific offsets */
> +    uint32_t cookie_offset;
> +    uint32_t counter_offset;
> +    uint32_t session_offset;
>   
> -static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable)
> -{
> -    if (s->write_poll != enable) {
> -        s->write_poll = enable;
> -        l2tpv3_update_fd_handler(s);
> -    }
> -}
> +} L2TPV3TunnelParams;
>   
> -static void l2tpv3_writable(void *opaque)
> -{
> -    NetL2TPV3State *s = opaque;
> -    l2tpv3_write_poll(s, false);
> -    qemu_flush_queued_packets(&s->nc);
> -}
>   
> -static void l2tpv3_send_completed(NetClientState *nc, ssize_t len)
> -{
> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
> -    l2tpv3_read_poll(s, true);
> -}
>   
> -static void l2tpv3_poll(NetClientState *nc, bool enable)
> +static void l2tpv3_form_header(void *us)
>   {
> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
> -    l2tpv3_write_poll(s, enable);
> -    l2tpv3_read_poll(s, enable);
> -}
> +    NetUnifiedState *s = (NetUnifiedState *) us;
> +    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;

How about embedding NetUnifiedState in this structure and continuing to
use NetL2TPV3State? Then:

-  's' could be kept, which would save a lot of changed lines here
and in l2tpv3_verify_header()
-  each transport could have its own type instead of using
NET_CLIENT_DRIVER_L2TPV3

?

>   
> -static void l2tpv3_form_header(NetL2TPV3State *s)
> -{
>       uint32_t *counter;
>   
> -    if (s->udp) {
> +    if (p->udp) {
>           stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET);
>       }
>       stl_be_p(
> -            (uint32_t *) (s->header_buf + s->session_offset),
> -            s->tx_session
> +            (uint32_t *) (s->header_buf + p->session_offset),
> +            p->tx_session
>           );
> -    if (s->cookie) {
> -        if (s->cookie_is_64) {
> +    if (p->cookie) {
> +        if (p->cookie_is_64) {
>               stq_be_p(
> -                (uint64_t *)(s->header_buf + s->cookie_offset),
> -                s->tx_cookie
> +                (uint64_t *)(s->header_buf + p->cookie_offset),
> +                p->tx_cookie
>               );
>           } else {
>               stl_be_p(
> -                (uint32_t *) (s->header_buf + s->cookie_offset),
> -                s->tx_cookie
> +                (uint32_t *) (s->header_buf + p->cookie_offset),
> +                p->tx_cookie
>               );
>           }
>       }
> -    if (s->has_counter) {
> -        counter = (uint32_t *)(s->header_buf + s->counter_offset);
> -        if (s->pin_counter) {
> +    if (p->has_counter) {
> +        counter = (uint32_t *)(s->header_buf + p->counter_offset);
> +        if (p->pin_counter) {
>               *counter = 0;
>           } else {
> -            stl_be_p(counter, ++s->counter);
> -        }
> -    }
> -}
> -
> -static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc,
> -                    const struct iovec *iov,
> -                    int iovcnt)
> -{
> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
> -
> -    struct msghdr message;
> -    int ret;
> -
> -    if (iovcnt > MAX_L2TPV3_IOVCNT - 1) {
> -        error_report(
> -            "iovec too long %d > %d, change l2tpv3.h",
> -            iovcnt, MAX_L2TPV3_IOVCNT
> -        );
> -        return -1;
> -    }
> -    l2tpv3_form_header(s);
> -    memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
> -    s->vec->iov_base = s->header_buf;
> -    s->vec->iov_len = s->offset;
> -    message.msg_name = s->dgram_dst;
> -    message.msg_namelen = s->dst_size;
> -    message.msg_iov = s->vec;
> -    message.msg_iovlen = iovcnt + 1;
> -    message.msg_control = NULL;
> -    message.msg_controllen = 0;
> -    message.msg_flags = 0;
> -    do {
> -        ret = sendmsg(s->fd, &message, 0);
> -    } while ((ret == -1) && (errno == EINTR));
> -    if (ret > 0) {
> -        ret -= s->offset;
> -    } else if (ret == 0) {
> -        /* belt and braces - should not occur on DGRAM
> -        * we should get an error and never a 0 send
> -        */
> -        ret = iov_size(iov, iovcnt);
> -    } else {
> -        /* signal upper layer that socket buffer is full */
> -        ret = -errno;
> -        if (ret == -EAGAIN || ret == -ENOBUFS) {
> -            l2tpv3_write_poll(s, true);
> -            ret = 0;
> +            stl_be_p(counter, ++p->counter);
>           }
>       }
> -    return ret;
>   }
>   
> -static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc,
> -                    const uint8_t *buf,
> -                    size_t size)
> -{
> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
> -
> -    struct iovec *vec;
> -    struct msghdr message;
> -    ssize_t ret = 0;
> -
> -    l2tpv3_form_header(s);
> -    vec = s->vec;
> -    vec->iov_base = s->header_buf;
> -    vec->iov_len = s->offset;
> -    vec++;
> -    vec->iov_base = (void *) buf;
> -    vec->iov_len = size;
> -    message.msg_name = s->dgram_dst;
> -    message.msg_namelen = s->dst_size;
> -    message.msg_iov = s->vec;
> -    message.msg_iovlen = 2;
> -    message.msg_control = NULL;
> -    message.msg_controllen = 0;
> -    message.msg_flags = 0;
> -    do {
> -        ret = sendmsg(s->fd, &message, 0);
> -    } while ((ret == -1) && (errno == EINTR));
> -    if (ret > 0) {
> -        ret -= s->offset;
> -    } else if (ret == 0) {
> -        /* belt and braces - should not occur on DGRAM
> -        * we should get an error and never a 0 send
> -        */
> -        ret = size;
> -    } else {
> -        ret = -errno;
> -        if (ret == -EAGAIN || ret == -ENOBUFS) {
> -            /* signal upper layer that socket buffer is full */
> -            l2tpv3_write_poll(s, true);
> -            ret = 0;
> -        }
> -    }
> -    return ret;
> -}
>   
> -static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
> +static int l2tpv3_verify_header(void *us, uint8_t *buf)
>   {
>   
> +    NetUnifiedState *s = (NetUnifiedState *) us;
> +    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
>       uint32_t *session;
>       uint64_t cookie;
>   
> -    if ((!s->udp) && (!s->ipv6)) {
> +    if ((!p->udp) && (!p->ipv6)) {
>           buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
>       }
>   
> @@ -321,21 +132,21 @@ static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
>       * that anyway.
>       */
>   
> -    if (s->cookie) {
> -        if (s->cookie_is_64) {
> -            cookie = ldq_be_p(buf + s->cookie_offset);
> +    if (p->cookie) {
> +        if (p->cookie_is_64) {
> +            cookie = ldq_be_p(buf + p->cookie_offset);
>           } else {
> -            cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL;
> +            cookie = ldl_be_p(buf + p->cookie_offset) & 0xffffffffULL;
>           }
> -        if (cookie != s->rx_cookie) {
> +        if (cookie != p->rx_cookie) {
>               if (!s->header_mismatch) {
>                   error_report("unknown cookie id");
>               }
>               return -1;
>           }
>       }
> -    session = (uint32_t *) (buf + s->session_offset);
> -    if (ldl_be_p(session) != s->rx_session) {
> +    session = (uint32_t *) (buf + p->session_offset);
> +    if (ldl_be_p(session) != p->rx_session) {
>           if (!s->header_mismatch) {
>               error_report("session mismatch");
>           }
> @@ -344,203 +155,31 @@ static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
>       return 0;
>   }
>   
> -static void net_l2tpv3_process_queue(NetL2TPV3State *s)
> -{
> -    int size = 0;
> -    struct iovec *vec;
> -    bool bad_read;
> -    int data_size;
> -    struct mmsghdr *msgvec;
> -
> -    /* go into ring mode only if there is a "pending" tail */
> -    if (s->queue_depth > 0) {
> -        do {
> -            msgvec = s->msgvec + s->queue_tail;
> -            if (msgvec->msg_len > 0) {
> -                data_size = msgvec->msg_len - s->header_size;
> -                vec = msgvec->msg_hdr.msg_iov;
> -                if ((data_size > 0) &&
> -                    (l2tpv3_verify_header(s, vec->iov_base) == 0)) {
> -                    vec++;
> -                    /* Use the legacy delivery for now, we will
> -                     * switch to using our own ring as a queueing mechanism
> -                     * at a later date
> -                     */
> -                    size = qemu_send_packet_async(
> -                            &s->nc,
> -                            vec->iov_base,
> -                            data_size,
> -                            l2tpv3_send_completed
> -                        );
> -                    if (size == 0) {
> -                        l2tpv3_read_poll(s, false);
> -                    }
> -                    bad_read = false;
> -                } else {
> -                    bad_read = true;
> -                    if (!s->header_mismatch) {
> -                        /* report error only once */
> -                        error_report("l2tpv3 header verification failed");
> -                        s->header_mismatch = true;
> -                    }
> -                }
> -            } else {
> -                bad_read = true;
> -            }
> -            s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT;
> -            s->queue_depth--;
> -        } while (
> -                (s->queue_depth > 0) &&
> -                 qemu_can_send_packet(&s->nc) &&
> -                ((size > 0) || bad_read)
> -            );
> -    }
> -}
> -
> -static void net_l2tpv3_send(void *opaque)
> -{
> -    NetL2TPV3State *s = opaque;
> -    int target_count, count;
> -    struct mmsghdr *msgvec;
> -
> -    /* go into ring mode only if there is a "pending" tail */
> -
> -    if (s->queue_depth) {
> -
> -        /* The ring buffer we use has variable intake
> -         * count of how much we can read varies - adjust accordingly
> -         */
> -
> -        target_count = MAX_L2TPV3_MSGCNT - s->queue_depth;
> -
> -        /* Ensure we do not overrun the ring when we have
> -         * a lot of enqueued packets
> -         */
> -
> -        if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) {
> -            target_count = MAX_L2TPV3_MSGCNT - s->queue_head;
> -        }
> -    } else {
> -
> -        /* we do not have any pending packets - we can use
> -        * the whole message vector linearly instead of using
> -        * it as a ring
> -        */
> -
> -        s->queue_head = 0;
> -        s->queue_tail = 0;
> -        target_count = MAX_L2TPV3_MSGCNT;
> -    }
> -
> -    msgvec = s->msgvec + s->queue_head;
> -    if (target_count > 0) {
> -        do {
> -            count = recvmmsg(
> -                s->fd,
> -                msgvec,
> -                target_count, MSG_DONTWAIT, NULL);
> -        } while ((count == -1) && (errno == EINTR));
> -        if (count < 0) {
> -            /* Recv error - we still need to flush packets here,
> -             * (re)set queue head to current position
> -             */
> -            count = 0;
> -        }
> -        s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT;
> -        s->queue_depth += count;
> -    }
> -    net_l2tpv3_process_queue(s);
> -}
> -
> -static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount)
> -{
> -    int i, j;
> -    struct iovec *iov;
> -    struct mmsghdr *cleanup = msgvec;
> -    if (cleanup) {
> -        for (i = 0; i < count; i++) {
> -            if (cleanup->msg_hdr.msg_iov) {
> -                iov = cleanup->msg_hdr.msg_iov;
> -                for (j = 0; j < iovcount; j++) {
> -                    g_free(iov->iov_base);
> -                    iov++;
> -                }
> -                g_free(cleanup->msg_hdr.msg_iov);
> -            }
> -            cleanup++;
> -        }
> -        g_free(msgvec);
> -    }
> -}
> -
> -static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count)
> -{
> -    int i;
> -    struct iovec *iov;
> -    struct mmsghdr *msgvec, *result;
> -
> -    msgvec = g_new(struct mmsghdr, count);
> -    result = msgvec;
> -    for (i = 0; i < count ; i++) {
> -        msgvec->msg_hdr.msg_name = NULL;
> -        msgvec->msg_hdr.msg_namelen = 0;
> -        iov =  g_new(struct iovec, IOVSIZE);
> -        msgvec->msg_hdr.msg_iov = iov;
> -        iov->iov_base = g_malloc(s->header_size);
> -        iov->iov_len = s->header_size;
> -        iov++ ;
> -        iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
> -        iov->iov_len = BUFFER_SIZE;
> -        msgvec->msg_hdr.msg_iovlen = 2;
> -        msgvec->msg_hdr.msg_control = NULL;
> -        msgvec->msg_hdr.msg_controllen = 0;
> -        msgvec->msg_hdr.msg_flags = 0;
> -        msgvec++;
> -    }
> -    return result;
> -}
> -
> -static void net_l2tpv3_cleanup(NetClientState *nc)
> -{
> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
> -    qemu_purge_queued_packets(nc);
> -    l2tpv3_read_poll(s, false);
> -    l2tpv3_write_poll(s, false);
> -    if (s->fd >= 0) {
> -        close(s->fd);
> -    }
> -    destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE);
> -    g_free(s->vec);
> -    g_free(s->header_buf);
> -    g_free(s->dgram_dst);
> -}
> -
> -static NetClientInfo net_l2tpv3_info = {
> -    .type = NET_CLIENT_DRIVER_L2TPV3,
> -    .size = sizeof(NetL2TPV3State),
> -    .receive = net_l2tpv3_receive_dgram,
> -    .receive_iov = net_l2tpv3_receive_dgram_iov,
> -    .poll = l2tpv3_poll,
> -    .cleanup = net_l2tpv3_cleanup,
> -};
> -
>   int net_init_l2tpv3(const Netdev *netdev,
>                       const char *name,
>                       NetClientState *peer, Error **errp)
>   {
>       /* FIXME error_setg(errp, ...) on failure */
>       const NetdevL2TPv3Options *l2tpv3;
> -    NetL2TPV3State *s;
> +    NetUnifiedState *s;
>       NetClientState *nc;
> +    L2TPV3TunnelParams *p;
> +
>       int fd = -1, gairet;
>       struct addrinfo hints;
>       struct addrinfo *result = NULL;
>       char *srcport, *dstport;
>   
> -    nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name);
> +    nc = qemu_new_unified_net_client(name, peer);
> +
> +    s = DO_UPCAST(NetUnifiedState, nc, nc);
> +
> +    p = g_malloc(sizeof(L2TPV3TunnelParams));

Where was this freed?

>   
> -    s = DO_UPCAST(NetL2TPV3State, nc, nc);
> +    s->params = p;
>   
> +    s->form_header = &l2tpv3_form_header;
> +    s->verify_header = &l2tpv3_verify_header;
>       s->queue_head = 0;
>       s->queue_tail = 0;
>       s->header_mismatch = false;

Why not move all of the above into qemu_new_unified_net()?

> @@ -549,9 +188,9 @@ int net_init_l2tpv3(const Netdev *netdev,
>       l2tpv3 = &netdev->u.l2tpv3;
>   
>       if (l2tpv3->has_ipv6 && l2tpv3->ipv6) {
> -        s->ipv6 = l2tpv3->ipv6;
> +        p->ipv6 = l2tpv3->ipv6;
>       } else {
> -        s->ipv6 = false;
> +        p->ipv6 = false;
>       }
>   
>       if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) {
> @@ -561,22 +200,22 @@ int net_init_l2tpv3(const Netdev *netdev,
>   
>       if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) {
>           if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) {
> -            s->cookie = true;
> +            p->cookie = true;
>           } else {
>               goto outerr;
>           }
>       } else {
> -        s->cookie = false;
> +        p->cookie = false;
>       }
>   
>       if (l2tpv3->has_cookie64 || l2tpv3->cookie64) {
> -        s->cookie_is_64  = true;
> +        p->cookie_is_64  = true;
>       } else {
> -        s->cookie_is_64  = false;
> +        p->cookie_is_64  = false;
>       }
>   
>       if (l2tpv3->has_udp && l2tpv3->udp) {
> -        s->udp = true;
> +        p->udp = true;
>           if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) {
>               error_report("l2tpv3_open : need both src and dst port for udp");
>               goto outerr;
> @@ -585,52 +224,52 @@ int net_init_l2tpv3(const Netdev *netdev,
>               dstport = l2tpv3->dstport;
>           }
>       } else {
> -        s->udp = false;
> +        p->udp = false;
>           srcport = NULL;
>           dstport = NULL;
>       }
>   
>   
>       s->offset = 4;
> -    s->session_offset = 0;
> -    s->cookie_offset = 4;
> -    s->counter_offset = 4;
> +    p->session_offset = 0;
> +    p->cookie_offset = 4;
> +    p->counter_offset = 4;
>   
> -    s->tx_session = l2tpv3->txsession;
> +    p->tx_session = l2tpv3->txsession;
>       if (l2tpv3->has_rxsession) {
> -        s->rx_session = l2tpv3->rxsession;
> +        p->rx_session = l2tpv3->rxsession;
>       } else {
> -        s->rx_session = s->tx_session;
> +        p->rx_session = p->tx_session;
>       }
>   
> -    if (s->cookie) {
> -        s->rx_cookie = l2tpv3->rxcookie;
> -        s->tx_cookie = l2tpv3->txcookie;
> -        if (s->cookie_is_64 == true) {
> +    if (p->cookie) {
> +        p->rx_cookie = l2tpv3->rxcookie;
> +        p->tx_cookie = l2tpv3->txcookie;
> +        if (p->cookie_is_64 == true) {
>               /* 64 bit cookie */
>               s->offset += 8;
> -            s->counter_offset += 8;
> +            p->counter_offset += 8;
>           } else {
>               /* 32 bit cookie */
>               s->offset += 4;
> -            s->counter_offset += 4;
> +            p->counter_offset += 4;
>           }
>       }
>   
>       memset(&hints, 0, sizeof(hints));
>   
> -    if (s->ipv6) {
> +    if (p->ipv6) {
>           hints.ai_family = AF_INET6;
>       } else {
>           hints.ai_family = AF_INET;
>       }
> -    if (s->udp) {
> +    if (p->udp) {
>           hints.ai_socktype = SOCK_DGRAM;
>           hints.ai_protocol = 0;
>           s->offset += 4;
> -        s->counter_offset += 4;
> -        s->session_offset += 4;
> -        s->cookie_offset += 4;
> +        p->counter_offset += 4;
> +        p->session_offset += 4;
> +        p->cookie_offset += 4;
>       } else {
>           hints.ai_socktype = SOCK_RAW;
>           hints.ai_protocol = IPPROTO_L2TP;
> @@ -661,12 +300,12 @@ int net_init_l2tpv3(const Netdev *netdev,
>   
>       memset(&hints, 0, sizeof(hints));
>   
> -    if (s->ipv6) {
> +    if (p->ipv6) {
>           hints.ai_family = AF_INET6;
>       } else {
>           hints.ai_family = AF_INET;
>       }
> -    if (s->udp) {
> +    if (p->udp) {
>           hints.ai_socktype = SOCK_DGRAM;
>           hints.ai_protocol = 0;
>       } else {
> @@ -693,17 +332,17 @@ int net_init_l2tpv3(const Netdev *netdev,
>       }
>   
>       if (l2tpv3->has_counter && l2tpv3->counter) {
> -        s->has_counter = true;
> +        p->has_counter = true;
>           s->offset += 4;
>       } else {
> -        s->has_counter = false;
> +        p->has_counter = false;
>       }
>   
>       if (l2tpv3->has_pincounter && l2tpv3->pincounter) {
> -        s->has_counter = true;  /* pin counter implies that there is counter */
> -        s->pin_counter = true;
> +        p->has_counter = true;  /* pin counter implies that there is counter */
> +        p->pin_counter = true;
>       } else {
> -        s->pin_counter = false;
> +        p->pin_counter = false;
>       }
>   
>       if (l2tpv3->has_offset) {
> @@ -711,22 +350,14 @@ int net_init_l2tpv3(const Netdev *netdev,
>           s->offset += l2tpv3->offset;
>       }
>   
> -    if ((s->ipv6) || (s->udp)) {
> +    if ((p->ipv6) || (p->udp)) {
>           s->header_size = s->offset;
>       } else {
>           s->header_size = s->offset + sizeof(struct iphdr);
>       }
>   
> -    s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT);
> -    s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT);
> -    s->header_buf = g_malloc(s->header_size);
> -
> -    qemu_set_nonblock(fd);
> -
> -    s->fd = fd;
> -    s->counter = 0;
> -
> -    l2tpv3_read_poll(s, true);
> +    qemu_net_finalize_unified_init(s, fd);
> +    p->counter = 0;
>   
>       snprintf(s->nc.info_str, sizeof(s->nc.info_str),
>                "l2tpv3: connected");
> diff --git a/net/net.c b/net/net.c
> index 6235aabed8..9270b52ac8 100644
> --- a/net/net.c
> +++ b/net/net.c
> @@ -959,8 +959,8 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>   #ifdef CONFIG_VHOST_NET_USED
>           [NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user,
>   #endif
> -#ifdef CONFIG_L2TPV3
> -        [NET_CLIENT_DRIVER_L2TPV3]    = net_init_l2tpv3,
> +#ifdef CONFIG_UNIFIED
> +        [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
>   #endif
>   };
>   
> diff --git a/net/unified.c b/net/unified.c

Not a native speaker, but I think we need a better name here, e.g. udst,
which is short for Unified Datagram Socket Transport?

> new file mode 100644
> index 0000000000..f15d1e1eed
> --- /dev/null
> +++ b/net/unified.c
> @@ -0,0 +1,406 @@
> +/*
> + * QEMU System Emulator
> + *
> + * Copyright (c) 2015-2017 Cambridge Greys Limited
> + * Copyright (c) 2012-2014 Cisco Systems
> + * Copyright (c) 2003-2008 Fabrice Bellard
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +#include <linux/ip.h>
> +#include <netdb.h>
> +#include "net/net.h"
> +#include "clients.h"
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/option.h"
> +#include "qemu/sockets.h"
> +#include "qemu/iov.h"
> +#include "qemu/main-loop.h"
> +#include "unified.h"
> +
> +static void net_unified_send(void *opaque);
> +static void unified_writable(void *opaque);
> +
> +static void unified_update_fd_handler(NetUnifiedState *s)
> +{
> +    qemu_set_fd_handler(s->fd,
> +                        s->read_poll ? net_unified_send : NULL,
> +                        s->write_poll ? unified_writable : NULL,
> +                        s);
> +}
> +
> +static void unified_read_poll(NetUnifiedState *s, bool enable)
> +{
> +    if (s->read_poll != enable) {
> +        s->read_poll = enable;
> +        unified_update_fd_handler(s);
> +    }
> +}
> +
> +static void unified_write_poll(NetUnifiedState *s, bool enable)
> +{
> +    if (s->write_poll != enable) {
> +        s->write_poll = enable;
> +        unified_update_fd_handler(s);
> +    }
> +}
> +
> +static void unified_writable(void *opaque)
> +{
> +    NetUnifiedState *s = opaque;
> +    unified_write_poll(s, false);
> +    qemu_flush_queued_packets(&s->nc);
> +}
> +
> +static void unified_send_completed(NetClientState *nc, ssize_t len)
> +{
> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
> +    unified_read_poll(s, true);
> +}
> +
> +static void unified_poll(NetClientState *nc, bool enable)
> +{
> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
> +    unified_write_poll(s, enable);
> +    unified_read_poll(s, enable);
> +}
> +
> +static ssize_t net_unified_receive_dgram_iov(NetClientState *nc,
> +                    const struct iovec *iov,
> +                    int iovcnt)
> +{
> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
> +
> +    struct msghdr message;
> +    int ret;
> +
> +    if (iovcnt > MAX_UNIFIED_IOVCNT - 1) {
> +        error_report(
> +            "iovec too long %d > %d, change unified.h",
> +            iovcnt, MAX_UNIFIED_IOVCNT
> +        );
> +        return -1;
> +    }
> +    if (s->offset > 0) {

net_l2tpv3_receive_dgram_iov() does not have this check. I guess
s->offset=0 will be used by other transports. Maybe it's better to delay
this change until it has a real user, or add a comment here.

> +        s->form_header(s);
> +        memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
> +        s->vec->iov_base = s->header_buf;
> +        s->vec->iov_len = s->offset;
> +        message.msg_iovlen = iovcnt + 1;
> +    } else {
> +        memcpy(s->vec, iov, iovcnt * sizeof(struct iovec));
> +        message.msg_iovlen = iovcnt;
> +    }
> +    message.msg_name = s->dgram_dst;
> +    message.msg_namelen = s->dst_size;
> +    message.msg_iov = s->vec;
> +    message.msg_control = NULL;
> +    message.msg_controllen = 0;
> +    message.msg_flags = 0;
> +    do {
> +        ret = sendmsg(s->fd, &message, 0);
> +    } while ((ret == -1) && (errno == EINTR));
> +    if (ret > 0) {
> +        ret -= s->offset;
> +    } else if (ret == 0) {
> +        /* belt and braces - should not occur on DGRAM
> +        * we should get an error and never a 0 send
> +        */
> +        ret = iov_size(iov, iovcnt);
> +    } else {
> +        /* signal upper layer that socket buffer is full */
> +        ret = -errno;
> +        if (ret == -EAGAIN || ret == -ENOBUFS) {
> +            unified_write_poll(s, true);
> +            ret = 0;
> +        }
> +    }
> +    return ret;
> +}
> +
> +static ssize_t net_unified_receive_dgram(NetClientState *nc,
> +                    const uint8_t *buf,
> +                    size_t size)
> +{
> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
> +
> +    struct iovec *vec;
> +    struct msghdr message;
> +    ssize_t ret = 0;
> +
> +    vec = s->vec;
> +    if (s->offset > 0) {
> +        s->form_header(s);
> +        vec->iov_base = s->header_buf;
> +        vec->iov_len = s->offset;
> +        message.msg_iovlen = 2;
> +        vec++;
> +    } else {
> +        message.msg_iovlen = 1;
> +    }
> +    vec->iov_base = (void *) buf;
> +    vec->iov_len = size;
> +    message.msg_name = s->dgram_dst;
> +    message.msg_namelen = s->dst_size;
> +    message.msg_iov = s->vec;
> +    message.msg_control = NULL;
> +    message.msg_controllen = 0;
> +    message.msg_flags = 0;
> +    do {
> +        ret = sendmsg(s->fd, &message, 0);
> +    } while ((ret == -1) && (errno == EINTR));
> +    if (ret > 0) {
> +        ret -= s->offset;
> +    } else if (ret == 0) {
> +        /* belt and braces - should not occur on DGRAM
> +        * we should get an error and never a 0 send
> +        */
> +        ret = size;
> +    } else {
> +        ret = -errno;
> +        if (ret == -EAGAIN || ret == -ENOBUFS) {
> +            /* signal upper layer that socket buffer is full */
> +            unified_write_poll(s, true);
> +            ret = 0;
> +        }
> +    }
> +    return ret;
> +}
> +
> +
> +static void net_unified_process_queue(NetUnifiedState *s)
> +{
> +    int size = 0;
> +    struct iovec *vec;
> +    bool bad_read;
> +    int data_size;
> +    struct mmsghdr *msgvec;
> +
> +    /* go into ring mode only if there is a "pending" tail */
> +    if (s->queue_depth > 0) {
> +        do {
> +            msgvec = s->msgvec + s->queue_tail;
> +            if (msgvec->msg_len > 0) {
> +                data_size = msgvec->msg_len - s->header_size;
> +                vec = msgvec->msg_hdr.msg_iov;
> +                if ((data_size > 0) &&
> +                    (s->verify_header(s, vec->iov_base) == 0)) {
> +                    if (s->header_size > 0) {
> +                        vec++;
> +                    }
> +                    /* Use the legacy delivery for now, we will
> +                     * switch to using our own ring as a queueing mechanism
> +                     * at a later date
> +                     */
> +                    size = qemu_send_packet_async(
> +                            &s->nc,
> +                            vec->iov_base,
> +                            data_size,
> +                            unified_send_completed
> +                        );
> +                    if (size == 0) {
> +                        unified_read_poll(s, false);
> +                    }
> +                    bad_read = false;
> +                } else {
> +                    bad_read = true;
> +                    if (!s->header_mismatch) {
> +                        /* report error only once */
> +                        error_report("unified header verification failed");
> +                        s->header_mismatch = true;
> +                    }
> +                }
> +            } else {
> +                bad_read = true;
> +            }
> +            s->queue_tail = (s->queue_tail + 1) % MAX_UNIFIED_MSGCNT;
> +            s->queue_depth--;
> +        } while (
> +                (s->queue_depth > 0) &&
> +                 qemu_can_send_packet(&s->nc) &&
> +                ((size > 0) || bad_read)
> +            );
> +    }
> +}
> +
> +static void net_unified_send(void *opaque)
> +{
> +    NetUnifiedState *s = opaque;
> +    int target_count, count;
> +    struct mmsghdr *msgvec;
> +
> +    /* go into ring mode only if there is a "pending" tail */
> +
> +    if (s->queue_depth) {
> +
> +        /* The ring buffer we use has variable intake
> +         * count of how much we can read varies - adjust accordingly
> +         */
> +
> +        target_count = MAX_UNIFIED_MSGCNT - s->queue_depth;
> +
> +        /* Ensure we do not overrun the ring when we have
> +         * a lot of enqueued packets
> +         */
> +
> +        if (s->queue_head + target_count > MAX_UNIFIED_MSGCNT) {
> +            target_count = MAX_UNIFIED_MSGCNT - s->queue_head;
> +        }
> +    } else {
> +
> +        /* we do not have any pending packets - we can use
> +        * the whole message vector linearly instead of using
> +        * it as a ring
> +        */
> +
> +        s->queue_head = 0;
> +        s->queue_tail = 0;
> +        target_count = MAX_UNIFIED_MSGCNT;
> +    }
> +
> +    msgvec = s->msgvec + s->queue_head;
> +    if (target_count > 0) {
> +        do {
> +            count = recvmmsg(
> +                s->fd,
> +                msgvec,
> +                target_count, MSG_DONTWAIT, NULL);
> +        } while ((count == -1) && (errno == EINTR));
> +        if (count < 0) {
> +            /* Recv error - we still need to flush packets here,
> +             * (re)set queue head to current position
> +             */
> +            count = 0;
> +        }
> +        s->queue_head = (s->queue_head + count) % MAX_UNIFIED_MSGCNT;
> +        s->queue_depth += count;
> +    }
> +    net_unified_process_queue(s);
> +}
> +
> +static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount)
> +{
> +    int i, j;
> +    struct iovec *iov;
> +    struct mmsghdr *cleanup = msgvec;
> +    if (cleanup) {
> +        for (i = 0; i < count; i++) {
> +            if (cleanup->msg_hdr.msg_iov) {
> +                iov = cleanup->msg_hdr.msg_iov;
> +                for (j = 0; j < iovcount; j++) {
> +                    g_free(iov->iov_base);
> +                    iov++;
> +                }
> +                g_free(cleanup->msg_hdr.msg_iov);
> +            }
> +            cleanup++;
> +        }
> +        g_free(msgvec);
> +    }
> +}
> +
> +
> +
> +static struct mmsghdr *build_unified_vector(NetUnifiedState *s, int count)
> +{
> +    int i;
> +    struct iovec *iov;
> +    struct mmsghdr *msgvec, *result;
> +
> +    msgvec = g_new(struct mmsghdr, count);
> +    result = msgvec;
> +    for (i = 0; i < count ; i++) {
> +        msgvec->msg_hdr.msg_name = NULL;
> +        msgvec->msg_hdr.msg_namelen = 0;
> +        iov =  g_new(struct iovec, IOVSIZE);
> +        msgvec->msg_hdr.msg_iov = iov;
> +        if (s->header_size > 0) {

Same here.

> +            iov->iov_base = g_malloc(s->header_size);
> +            iov->iov_len = s->header_size;
> +            iov++ ;
> +        }
> +        iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
> +        iov->iov_len = BUFFER_SIZE;
> +        msgvec->msg_hdr.msg_iovlen = 2;
> +        msgvec->msg_hdr.msg_control = NULL;
> +        msgvec->msg_hdr.msg_controllen = 0;
> +        msgvec->msg_hdr.msg_flags = 0;
> +        msgvec++;
> +    }
> +    return result;
> +}
> +
> +static void net_unified_cleanup(NetClientState *nc)
> +{
> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
> +    qemu_purge_queued_packets(nc);
> +    unified_read_poll(s, false);
> +    unified_write_poll(s, false);
> +    if (s->fd >= 0) {
> +        close(s->fd);
> +    }
> +    if (s->header_size > 0) {
> +        destroy_vector(s->msgvec, MAX_UNIFIED_MSGCNT, IOVSIZE);
> +    } else {
> +        destroy_vector(s->msgvec, MAX_UNIFIED_MSGCNT, 1);
> +    }
> +    g_free(s->vec);
> +    if (s->header_buf != NULL) {
> +        g_free(s->header_buf);
> +    }
> +    if (s->dgram_dst != NULL) {
> +        g_free(s->dgram_dst);
> +    }
> +}
> +
> +static NetClientInfo net_unified_info = {
> +    /* we share this one for all types for now, wrong I know :) */
> +    .type = NET_CLIENT_DRIVER_L2TPV3,

Like I said above, it is better to have a transport-specific type.

Thanks

> +    .size = sizeof(NetUnifiedState),
> +    .receive = net_unified_receive_dgram,
> +    .receive_iov = net_unified_receive_dgram_iov,
> +    .poll = unified_poll,
> +    .cleanup = net_unified_cleanup,
> +};
> +
> +NetClientState *qemu_new_unified_net_client(const char *name,
> +                    NetClientState *peer) {
> +    return qemu_new_net_client(&net_unified_info, peer, "unified", name);
> +}
> +
> +void qemu_net_finalize_unified_init(NetUnifiedState *s, int fd)
> +{
> +
> +    s->msgvec = build_unified_vector(s, MAX_UNIFIED_MSGCNT);
> +    s->vec = g_new(struct iovec, MAX_UNIFIED_IOVCNT);
> +    if (s->header_size > 0) {
> +        s->header_buf = g_malloc(s->header_size);
> +    } else {
> +        s->header_buf = NULL;
> +    }
> +    qemu_set_nonblock(fd);
> +
> +    s->fd = fd;
> +    unified_read_poll(s, true);
> +
> +}
> +
> diff --git a/net/unified.h b/net/unified.h
> new file mode 100644
> index 0000000000..97ec743f0e
> --- /dev/null
> +++ b/net/unified.h
> @@ -0,0 +1,118 @@
> +/*
> + * QEMU System Emulator
> + *
> + * Copyright (c) 2015-2017 Cambridge Greys Limited
> + * Copyright (c) 2012-2014 Cisco Systems
> + * Copyright (c) 2003-2008 Fabrice Bellard
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +
> +
> +#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
> +#define BUFFER_SIZE 2048
> +#define IOVSIZE 2
> +#define MAX_UNIFIED_MSGCNT 64
> +#define MAX_UNIFIED_IOVCNT (MAX_UNIFIED_MSGCNT * IOVSIZE)
> +
> +#ifndef QEMU_NET_UNIFIED_H
> +#define QEMU_NET_UNIFIED_H
> +
> +typedef struct NetUnifiedState {
> +    NetClientState nc;
> +
> +    int fd;
> +
> +    /*
> +     * these are used for xmit - that happens packet a time
> +     * and for first sign of life packet (easier to parse that once)
> +     */
> +
> +    uint8_t *header_buf;
> +    struct iovec *vec;
> +
> +    /*
> +     * these are used for receive - try to "eat" up to 32 packets at a time
> +     */
> +
> +    struct mmsghdr *msgvec;
> +
> +    /*
> +     * peer address
> +     */
> +
> +    struct sockaddr_storage *dgram_dst;
> +    uint32_t dst_size;
> +
> +    /*
> +     * Internal Queue
> +     */
> +
> +    /*
> +    * DOS avoidance in error handling
> +    */
> +
> +    /* Easier to keep l2tpv3 specific */
> +
> +    bool header_mismatch;
> +
> +    /*
> +     *
> +     * Ring buffer handling
> +     *
> +     */
> +
> +    int queue_head;
> +    int queue_tail;
> +    int queue_depth;
> +
> +    /*
> +     * Offset to data - common for all protocols
> +     */
> +
> +    uint32_t offset;
> +
> +    /*
> +     * Header size - common for all protocols
> +     */
> +
> +    uint32_t header_size;
> +    /* Poll Control */
> +
> +    bool read_poll;
> +    bool write_poll;
> +
> +    /* Parameters */
> +
> +    void *params;
> +
> +    /* header forming functions */
> +
> +    int (*verify_header)(void *s, uint8_t *buf);
> +    void (*form_header)(void *s);
> +
> +} NetUnifiedState;
> +
> +extern NetClientState *qemu_new_unified_net_client(const char *name,
> +                    NetClientState *peer);
> +
> +extern void qemu_net_finalize_unified_init(NetUnifiedState *s, int fd);
> +#endif

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support anton.ivanov
@ 2017-07-19  5:48   ` Jason Wang
  2017-07-19  5:50     ` Anton Ivanov
  2017-07-19 14:40   ` Eric Blake
  1 sibling, 1 reply; 23+ messages in thread
From: Jason Wang @ 2017-07-19  5:48 UTC (permalink / raw)
  To: anton.ivanov, qemu-devel



On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>
> This adds GRETAP support to the unified socket driver.
>
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>   net/Makefile.objs |   2 +-
>   net/clients.h     |   4 +
>   net/gre.c         | 313 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   net/net.c         |   5 +
>   qapi-schema.json  |  46 +++++++-
>   qemu-options.hx   |  63 ++++++++++-
>   6 files changed, 425 insertions(+), 8 deletions(-)
>   create mode 100644 net/gre.c
>
> diff --git a/net/Makefile.objs b/net/Makefile.objs
> index 8026ad778a..128164e39b 100644
> --- a/net/Makefile.objs
> +++ b/net/Makefile.objs
> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>   common-obj-y += socket.o
>   common-obj-y += dump.o
>   common-obj-y += eth.o
> -common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o
>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>   common-obj-$(CONFIG_SLIRP) += slirp.o
>   common-obj-$(CONFIG_VDE) += vde.o
> diff --git a/net/clients.h b/net/clients.h
> index 5cae479730..8f8a59aee3 100644
> --- a/net/clients.h
> +++ b/net/clients.h
> @@ -49,6 +49,10 @@ int net_init_bridge(const Netdev *netdev, const char *name,
>   
>   int net_init_l2tpv3(const Netdev *netdev, const char *name,
>                       NetClientState *peer, Error **errp);
> +
> +int net_init_gre(const Netdev *netdev, const char *name,
> +                    NetClientState *peer, Error **errp);
> +
>   #ifdef CONFIG_VDE
>   int net_init_vde(const Netdev *netdev, const char *name,
>                    NetClientState *peer, Error **errp);
> diff --git a/net/gre.c b/net/gre.c
> new file mode 100644
> index 0000000000..ee8c36dd4d
> --- /dev/null
> +++ b/net/gre.c
> @@ -0,0 +1,313 @@
> +/*
> + * QEMU System Emulator
> + *
> + * Copyright (c) 2015-2017 Cambridge GREys Limited
> + * Copyright (c) 2003-2008 Fabrice Bellard
> + * Copyright (c) 2012-2014 Cisco Systems
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +#include <linux/ip.h>
> +#include <netdb.h>
> +#include "net/net.h"
> +#include "clients.h"
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/option.h"
> +#include "qemu/sockets.h"
> +#include "qemu/iov.h"
> +#include "qemu/main-loop.h"
> +#include "unified.h"
> +
> +/* IANA-assigned IP protocol ID for GRE */
> +
> +
> +#ifndef IPPROTO_GRE
> +#define IPPROTO_GRE 0x2F
> +#endif
> +
> +#define GRE_MODE_CHECKSUM     htons(8 << 12)   /* checksum */
> +#define GRE_MODE_RESERVED     htons(4 << 12)   /* unused */
> +#define GRE_MODE_KEY          htons(2 << 12)   /* KEY present */
> +#define GRE_MODE_SEQUENCE     htons(1 << 12)   /* no sequence */
> +
> +
> +/* GRE TYPE for Ethernet in GRE aka GRETAP */
> +
> +#define GRE_IRB htons(0x6558)
> +
> +struct gre_minimal_header {
> +   uint16_t header;
> +   uint16_t arptype;
> +};
> +
> +typedef struct GRETunnelParams {
> +    /*
> +     * GRE parameters
> +     */
> +
> +    uint32_t rx_key;
> +    uint32_t tx_key;
> +    uint32_t sequence;
> +
> +    /* Flags */
> +
> +    bool ipv6;
> +    bool udp;
> +    bool has_sequence;
> +    bool pin_sequence;
> +    bool checksum;
> +    bool key;
> +
> +    /* Precomputed GRE specific offsets */
> +
> +    uint32_t key_offset;
> +    uint32_t sequence_offset;
> +    uint32_t checksum_offset;
> +
> +    struct gre_minimal_header header_bits;
> +
> +} GRETunnelParams;
> +
> +
> +
> +static void gre_form_header(void *us)
> +{
> +    NetUnifiedState *s = (NetUnifiedState *) us;
> +    GRETunnelParams *p = (GRETunnelParams *) s->params;
> +
> +    uint32_t *sequence;
> +
> +    *((uint32_t *) s->header_buf) = *((uint32_t *) &p->header_bits);
> +
> +    if (p->key) {
> +        stl_be_p(
> +            (uint32_t *) (s->header_buf + p->key_offset),
> +            p->tx_key
> +        );
> +    }
> +    if (p->has_sequence) {
> +        sequence = (uint32_t *)(s->header_buf + p->sequence_offset);
> +        if (p->pin_sequence) {
> +            *sequence = 0;
> +        } else {
> +            stl_be_p(sequence, ++p->sequence);
> +        }
> +    }
> +}
> +
> +static int gre_verify_header(void *us, uint8_t *buf)
> +{
> +
> +    NetUnifiedState *s = (NetUnifiedState *) us;
> +    GRETunnelParams *p = (GRETunnelParams *) s->params;
> +    uint32_t key;
> +
> +
> +    if (!p->ipv6) {
> +        buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
> +    }
> +
> +    if (*((uint32_t *) buf) != *((uint32_t *) &p->header_bits)) {
> +        if (!s->header_mismatch) {
> +            error_report("header type disagreement, expecting %0x, got %0x",
> +                *((uint32_t *) &p->header_bits), *((uint32_t *) buf));
> +        }
> +        return -1;
> +    }
> +
> +    if (p->key) {
> +        key = ldl_be_p(buf + p->key_offset);
> +        if (key != p->rx_key) {
> +            if (!s->header_mismatch) {
> +                error_report("unknown key id %0x, expecting %0x",
> +                    key, p->rx_key);
> +            }
> +            return -1;
> +        }
> +    }
> +    return 0;
> +}
> +
> +int net_init_gre(const Netdev *netdev,
> +                    const char *name,
> +                    NetClientState *peer, Error **errp)
> +{
> +    /* FIXME error_setg(errp, ...) on failure */

Let's do this in the next version.

> +    const NetdevGREOptions *gre;
> +    NetUnifiedState *s;
> +    NetClientState *nc;
> +    GRETunnelParams *p;
> +
> +    int fd = -1, gairet;
> +    struct addrinfo hints;
> +    struct addrinfo *result = NULL;
> +
> +    nc = qemu_new_unified_net_client(name, peer);
> +
> +    s = DO_UPCAST(NetUnifiedState, nc, nc);
> +
> +    p = g_malloc(sizeof(GRETunnelParams));
> +
> +    s->params = p;
> +    p->header_bits.arptype = GRE_IRB;
> +    p->header_bits.header = 0;
> +
> +    s->form_header = &gre_form_header;
> +    s->verify_header = &gre_verify_header;
> +    s->queue_head = 0;
> +    s->queue_tail = 0;
> +    s->header_mismatch = false;
> +
> +    assert(netdev->type == NET_CLIENT_DRIVER_GRE);
> +    gre = &netdev->u.gre;
> +
> +    if (gre->has_ipv6 && gre->ipv6) {
> +        p->ipv6 = gre->ipv6;
> +    } else {
> +        p->ipv6 = false;
> +    }
> +
> +    s->offset = 4;
> +    p->key_offset = 4;
> +    p->sequence_offset = 4;
> +    p->checksum_offset = 4;
> +
> +    if (gre->has_rxkey || gre->has_txkey) {
> +        if (gre->has_rxkey && gre->has_txkey) {
> +            p->key = true;
> +            p->header_bits.header |= GRE_MODE_KEY;
> +        } else {
> +            goto outerr;
> +        }
> +    } else {
> +        p->key = false;
> +    }
> +
> +    if (p->key) {
> +        p->rx_key = gre->rxkey;
> +        p->tx_key = gre->txkey;
> +        s->offset += 4;
> +        p->sequence_offset += 4;
> +    }
> +
> +
> +    if (gre->has_sequence && gre->sequence) {
> +        s->offset += 4;
> +        p->has_sequence = true;
> +        p->header_bits.header |= GRE_MODE_SEQUENCE;
> +    } else {
> +        p->sequence = false;
> +    }
> +
> +    if (gre->has_pinsequence && gre->pinsequence) {
> +        /* pin sequence implies that there is sequence */
> +        p->has_sequence = true;
> +        p->pin_sequence = true;
> +    } else {
> +        p->pin_sequence = false;
> +    }
> +
> +    memset(&hints, 0, sizeof(hints));
> +
> +    if (p->ipv6) {
> +        hints.ai_family = AF_INET6;
> +    } else {
> +        hints.ai_family = AF_INET;
> +    }
> +
> +    hints.ai_socktype = SOCK_RAW;
> +    hints.ai_protocol = IPPROTO_GRE;
> +
> +    gairet = getaddrinfo(gre->src, NULL, &hints, &result);
> +
> +    if ((gairet != 0) || (result == NULL)) {
> +        error_report(
> +            "gre_open : could not resolve src, errno = %s",
> +            gai_strerror(gairet)
> +        );
> +        goto outerr;
> +    }
> +    fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol);
> +    if (fd == -1) {
> +        fd = -errno;
> +        error_report("gre_open : socket creation failed, errno = %d", -fd);
> +        goto outerr;
> +    }
> +    if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) {
> +        error_report("gre_open :  could not bind socket err=%i", errno);
> +        goto outerr;
> +    }
> +    if (result) {
> +        freeaddrinfo(result);
> +    }
> +
> +    memset(&hints, 0, sizeof(hints));
> +
> +    if (p->ipv6) {
> +        hints.ai_family = AF_INET6;
> +    } else {
> +        hints.ai_family = AF_INET;
> +    }
> +    hints.ai_socktype = SOCK_RAW;
> +    hints.ai_protocol = IPPROTO_GRE;
> +
> +    result = NULL;
> +    gairet = getaddrinfo(gre->dst, NULL, &hints, &result);
> +    if ((gairet != 0) || (result == NULL)) {
> +        error_report(
> +            "gre_open : could not resolve dst, error = %s",
> +            gai_strerror(gairet)
> +        );
> +        goto outerr;
> +    }
> +
> +    s->dgram_dst = g_new0(struct sockaddr_storage, 1);
> +    memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen);

Use g_memdup() instead?

> +    s->dst_size = result->ai_addrlen;
> +
> +    if (result) {
> +        freeaddrinfo(result);
> +    }
> +
> +    if ((p->ipv6) || (p->udp)) {
> +        s->header_size = s->offset;
> +    } else {
> +        s->header_size = s->offset + sizeof(struct iphdr);
> +    }
> +
> +    qemu_net_finalize_unified_init(s, fd);
> +
> +    p->sequence = 0;
> +
> +    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
> +             "gre: connected");
> +    return 0;
> +outerr:
> +    qemu_del_net_client(nc);
> +    if (fd >= 0) {
> +        close(fd);
> +    }
> +    if (result) {
> +        freeaddrinfo(result);
> +    }
> +    return -1;
> +}
> diff --git a/net/net.c b/net/net.c
> index 9270b52ac8..b75b6e8154 100644
> --- a/net/net.c
> +++ b/net/net.c
> @@ -961,6 +961,7 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>   #endif
>   #ifdef CONFIG_UNIFIED
>           [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
> +        [NET_CLIENT_DRIVER_GRE] = net_init_gre,
>   #endif
>   };
>   
> @@ -1012,6 +1013,10 @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
>               legacy.type = NET_CLIENT_DRIVER_L2TPV3;
>               legacy.u.l2tpv3 = opts->u.l2tpv3;
>               break;
> +        case NET_LEGACY_OPTIONS_TYPE_GRE:
> +            legacy.type = NET_CLIENT_DRIVER_GRE;
> +            legacy.u.gre = opts->u.gre;
> +            break;
>           case NET_LEGACY_OPTIONS_TYPE_SOCKET:
>               legacy.type = NET_CLIENT_DRIVER_SOCKET;
>               legacy.u.socket = opts->u.socket;
> diff --git a/qapi-schema.json b/qapi-schema.json
> index ab438ead70..aec303a14e 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -3847,7 +3847,41 @@
>       'txsession':    'uint32',
>       '*rxsession':   'uint32',
>       '*offset':      'uint32' } }
> -
> +##
> +# @NetdevGREOptions:
> +#
> +# Connect the VLAN to Ethernet over Ethernet over GRE (GRETAP) tunnel
> +#
> +# @src: source address
> +#
> +# @dst: destination address
> +#
> +# @ipv6: force the use of ipv6
> +#
> +# @sequence: have sequence counter
> +#
> +# @pinsequence: pin sequence counter to zero -
> +#              workaround for buggy implementations or
> +#              networks with packet reorder
> +#
> +# @txkey: 32 bit transmit key
> +#
> +# @rxkey: 32 bit receive key
> +#
> +# Note - gre checksums are not supported at present
> +#
> +#
> +# Since 2.9

We are in soft freeze now, so I will target this for 2.11.

> +##
> +{ 'struct': 'NetdevGREOptions',
> +  'data': {
> +    'src':          'str',
> +    'dst':          'str',
> +    '*ipv6':        'bool',
> +    '*sequence':     'bool',
> +    '*pinsequence':  'bool',
> +    '*txkey':    'uint32',
> +    '*rxkey':    'uint32' } }
>   ##
>   # @NetdevVdeOptions:
>   #
> @@ -3966,7 +4000,7 @@
>   ##
>   { 'enum': 'NetClientDriver',
>     'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
> -            'bridge', 'hubport', 'netmap', 'vhost-user' ] }
> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
>   
>   ##
>   # @Netdev:
> @@ -3996,7 +4030,8 @@
>       'bridge':   'NetdevBridgeOptions',
>       'hubport':  'NetdevHubPortOptions',
>       'netmap':   'NetdevNetmapOptions',
> -    'vhost-user': 'NetdevVhostUserOptions' } }
> +    'vhost-user': 'NetdevVhostUserOptions',
> +    'gre':      'NetdevGREOptions' } }
>   
>   ##
>   # @NetLegacy:
> @@ -4027,7 +4062,7 @@
>   ##
>   { 'enum': 'NetLegacyOptionsType',
>     'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
> -           'dump', 'bridge', 'netmap', 'vhost-user'] }
> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
>   
>   ##
>   # @NetLegacyOptions:
> @@ -4050,7 +4085,8 @@
>       'dump':     'NetdevDumpOptions',
>       'bridge':   'NetdevBridgeOptions',
>       'netmap':   'NetdevNetmapOptions',
> -    'vhost-user': 'NetdevVhostUserOptions' } }
> +    'vhost-user': 'NetdevVhostUserOptions',
> +    'gre':      'NetdevGREOptions' } }
>   
>   ##
>   # @NetFilterDirection:
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 2cc70b9cfc..6f8d5cbe21 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -1945,7 +1945,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>       "                connected to a bridge (default=" DEFAULT_BRIDGE_INTERFACE ")\n"
>       "                using the program 'helper (default=" DEFAULT_BRIDGE_HELPER ")\n"
>   #endif
> -#ifdef __linux__
> +#ifdef CONFIG_UNIFIED
>       "-netdev l2tpv3,id=str,src=srcaddr,dst=dstaddr[,srcport=srcport][,dstport=dstport]\n"
>       "         [,rxsession=rxsession],txsession=txsession[,ipv6=on/off][,udp=on/off]\n"
>       "         [,cookie64=on/off][,counter][,pincounter][,txcookie=txcookie]\n"
> @@ -1971,6 +1971,23 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>       "                use 'counter=off' to force a 'cut-down' L2TPv3 with no counter\n"
>       "                use 'pincounter=on' to work around broken counter handling in peer\n"
>       "                use 'offset=X' to add an extra offset between header and data\n"
> +    "-netdev gre,id=str,src=srcaddr,dst=dstaddr[,rxkey=rxkey],txkey=txkey[,ipv6=on/off]\n"
> +    "         [,sequence][,pinsequence]\n"
> +    "                configure a network backend with ID 'str' connected to\n"
> +    "                an Ethernet over GRE pseudowire (aka GRE TAP).\n"
> +    "                Linux kernel 3.3+ as well as most routers and some switches\n"
> +    "                can talk GRETAP. This transport allows connecting a VM to a VM,\n"
> +    "                VM to a router and even VM to Host. It is a nearly-universal\n"
> +    "                standard (RFC1701).\n"
> +    "                use 'src=' to specify source address\n"
> +    "                use 'dst=' to specify destination address\n"
> +    "                use 'ipv6=on' to force v6\n"
> +    "                GRE may use keys to prevent misconfiguration as\n"
> +    "                well as a weak security measure\n"
> +    "                use 'rxkey=0x01234' to specify a rxkey\n"
> +    "                use 'txkey=0x01234' to specify a txkey\n"
> +    "                use 'sequence=on' to add frame sequence to each packet\n"
> +    "                use 'pinsequence=on' to work around broken sequence handling in peer\n"
>   #endif
>       "-netdev socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
>       "                configure a network backend to connect to another network\n"
> @@ -2394,12 +2411,54 @@ ip l2tp add session tunnel_id 1 name vmtunnel0 session_id \
>   ifconfig vmtunnel0 mtu 1500
>   ifconfig vmtunnel0 up
>   brctl addif br-lan vmtunnel0
> +@end example
> +
> +Alternatively, it is possible to assign an IP address to vmtunnel0, which allows
> +the VM to connect to the host directly without using Linux bridging.
> +
> +
> +@item -netdev gre,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,ipv6][,sequence][,pinsequence][,txkey=@var{txkey}][,rxkey=@var{rxkey}]
> +@itemx -net gre[,vlan=@var{n}][,name=@var{name}],src=@var{srcaddr},dst=@var{dstaddr}[,ipv6][,sequence][,pinsequence][,txkey=@var{txkey}][,rxkey=@var{rxkey}]
> +Connect VLAN @var{n} to a GRE pseudowire. GRE (RFC1701) is a popular
> +protocol to transport various data frames between two systems.
> +We are interested in a specific GRE variety where the transported
> +frames are Ethernet. This GRE type is usually referred to as GRETAP.
> +It is present in routers, firewalls, switches and the Linux kernel
> +(from version 3.3 onwards).
> +
> +This transport allows a VM to communicate with another VM, router or firewall directly.
> +
> +@item src=@var{srcaddr}
> +    source address (mandatory)
> +@item dst=@var{dstaddr}
> +    destination address (mandatory)
> +@item ipv6
> +    force v6, otherwise defaults to v4.
> +@item rxkey=@var{rxkey}
> +@itemx txkey=@var{txkey}
> +    Keys are a weak form of security in the gre specification.
> +Their function is mostly to prevent misconfiguration.
> +@item sequence=on
> +    Add frame sequence to GRE frames
> +@item pinsequence=on
> +    Work around broken sequence handling in peer. This may also help on
> +networks which have packet reorder.
> +
> +For example, to attach a VM running on host 4.3.2.1 via GRETAP to the bridge br-lan
> +on the remote Linux host 1.2.3.4:
> +@example
> +# Set up tunnel on linux host using raw ip as encapsulation
> +# on 1.2.3.4
> +ip link add gt0 type gretap local 1.2.3.4 remote 4.3.2.1
> +ifconfig gt0 mtu 1500
> +ifconfig gt0 up
> +brctl addif br-lan gt0
>   
>   
>   # on 4.3.2.1
>   # launch QEMU instance - if your network has reorder or is very lossy add ,pincounter
>   
> -qemu-system-i386 linux.img -net nic -net l2tpv3,src=4.2.3.1,dst=1.2.3.4,udp,srcport=16384,dstport=16384,rxsession=0xffffffff,txsession=0xffffffff,counter
> +qemu-system-i386 linux.img -net nic -net gre,src=4.3.2.1,dst=1.2.3.4
>   
>   
>   @end example

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-19  5:39   ` Jason Wang
@ 2017-07-19  5:48     ` Anton Ivanov
  2017-07-19  6:07       ` Jason Wang
  2017-07-21 17:50     ` Anton Ivanov
  1 sibling, 1 reply; 23+ messages in thread
From: Anton Ivanov @ 2017-07-19  5:48 UTC (permalink / raw)
  To: Jason Wang, qemu-devel



On 19/07/17 06:39, Jason Wang wrote:
>
>
> On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> 1. Creates a common backend for socket transports using
>> recvmmsg().
>> 2. Migrates L2TPv3 to the new backend
>
> It would be better if you could further split out 2 from this patch.
>
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   configure         |  10 +-
>>   net/Makefile.objs |   2 +-
>>   net/l2tpv3.c      | 531 +++++++++---------------------------------------------
>>   net/net.c         |   4 +-
>>   net/unified.c     | 406 +++++++++++++++++++++++++++++++++++++++++
>>   net/unified.h     | 118 ++++++++++++
>>   6 files changed, 613 insertions(+), 458 deletions(-)
>>   create mode 100644 net/unified.c
>>   create mode 100644 net/unified.h
>>
>> diff --git a/configure b/configure
>> index a3f0522e8f..99a60b723c 100755
>> --- a/configure
>> +++ b/configure
>> @@ -1862,7 +1862,7 @@ if ! compile_object -Werror ; then
>>   fi
>>     ##########################################
>> -# L2TPV3 probe
>> +# UNIFIED probe
>>     cat > $TMPC <<EOF
>>   #include <sys/socket.h>
>> @@ -1870,9 +1870,9 @@ cat > $TMPC <<EOF
>>   int main(void) { return sizeof(struct mmsghdr); }
>>   EOF
>>   if compile_prog "" "" ; then
>> -  l2tpv3=yes
>> +  unified=yes
>>   else
>> -  l2tpv3=no
>> +  unified=no
>>   fi
>>     ##########################################
>> @@ -5458,8 +5458,8 @@ fi
>>   if test "$netmap" = "yes" ; then
>>     echo "CONFIG_NETMAP=y" >> $config_host_mak
>>   fi
>> -if test "$l2tpv3" = "yes" ; then
>> -  echo "CONFIG_L2TPV3=y" >> $config_host_mak
>> +if test "$unified" = "yes" ; then
>> +  echo "CONFIG_UNIFIED=y" >> $config_host_mak
>>   fi
>
> Could we keep l2tpv3 option?

The l2tpv3 test is actually a test for recvmmsg. If you can do one 
recvmmsg transport you can do all of them.

>
>>   if test "$cap_ng" = "yes" ; then
>>     echo "CONFIG_LIBCAP=y" >> $config_host_mak
>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>> index 67ba5e26fb..8026ad778a 100644
>> --- a/net/Makefile.objs
>> +++ b/net/Makefile.objs
>> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>>   common-obj-y += socket.o
>>   common-obj-y += dump.o
>>   common-obj-y += eth.o
>> -common-obj-$(CONFIG_L2TPV3) += l2tpv3.o
>> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
>>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>>   common-obj-$(CONFIG_SLIRP) += slirp.o
>>   common-obj-$(CONFIG_VDE) += vde.o
>> diff --git a/net/l2tpv3.c b/net/l2tpv3.c
>> index 6745b78990..05413c9cbd 100644
>> --- a/net/l2tpv3.c
>> +++ b/net/l2tpv3.c
>> @@ -1,6 +1,7 @@
>>   /*
>>    * QEMU System Emulator
>>    *
>> + * Copyright (c) 2015-2017 Cambridge Greys Limited
>>    * Copyright (c) 2003-2008 Fabrice Bellard
>>    * Copyright (c) 2012-2014 Cisco Systems
>>    *
>> @@ -34,19 +35,9 @@
>>   #include "qemu/sockets.h"
>>   #include "qemu/iov.h"
>>   #include "qemu/main-loop.h"
>> +#include "unified.h"
>>     -/* The buffer size needs to be investigated for optimum numbers and
>> - * optimum means of paging in on different systems. This size is
>> - * chosen to be sufficient to accommodate one packet with some headers
>> - */
>> -
>> -#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
>> -#define BUFFER_SIZE 2048
>> -#define IOVSIZE 2
>> -#define MAX_L2TPV3_MSGCNT 64
>> -#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE)
>> -
>>   /* Header set to 0x30000 signifies a data packet */
>>     #define L2TPV3_DATA_PACKET 0x30000
>> @@ -57,31 +48,7 @@
>>   #define IPPROTO_L2TP 0x73
>>   #endif
>>   -typedef struct NetL2TPV3State {
>> -    NetClientState nc;
>> -    int fd;
>> -
>> -    /*
>> -     * these are used for xmit - that happens packet a time
>> -     * and for first sign of life packet (easier to parse that once)
>> -     */
>> -
>> -    uint8_t *header_buf;
>> -    struct iovec *vec;
>> -
>> -    /*
>> -     * these are used for receive - try to "eat" up to 32 packets at 
>> a time
>> -     */
>> -
>> -    struct mmsghdr *msgvec;
>> -
>> -    /*
>> -     * peer address
>> -     */
>> -
>> -    struct sockaddr_storage *dgram_dst;
>> -    uint32_t dst_size;
>> -
>> +typedef struct L2TPV3TunnelParams {
>>       /*
>>        * L2TPv3 parameters
>>        */
>> @@ -90,37 +57,8 @@ typedef struct NetL2TPV3State {
>>       uint64_t tx_cookie;
>>       uint32_t rx_session;
>>       uint32_t tx_session;
>> -    uint32_t header_size;
>>       uint32_t counter;
>>   -    /*
>> -    * DOS avoidance in error handling
>> -    */
>> -
>> -    bool header_mismatch;
>> -
>> -    /*
>> -     * Ring buffer handling
>> -     */
>> -
>> -    int queue_head;
>> -    int queue_tail;
>> -    int queue_depth;
>> -
>> -    /*
>> -     * Precomputed offsets
>> -     */
>> -
>> -    uint32_t offset;
>> -    uint32_t cookie_offset;
>> -    uint32_t counter_offset;
>> -    uint32_t session_offset;
>> -
>> -    /* Poll Control */
>> -
>> -    bool read_poll;
>> -    bool write_poll;
>> -
>>       /* Flags */
>>         bool ipv6;
>> @@ -130,189 +68,62 @@ typedef struct NetL2TPV3State {
>>       bool cookie;
>>       bool cookie_is_64;
>>   -} NetL2TPV3State;
>> -
>> -static void net_l2tpv3_send(void *opaque);
>> -static void l2tpv3_writable(void *opaque);
>> -
>> -static void l2tpv3_update_fd_handler(NetL2TPV3State *s)
>> -{
>> -    qemu_set_fd_handler(s->fd,
>> -                        s->read_poll ? net_l2tpv3_send : NULL,
>> -                        s->write_poll ? l2tpv3_writable : NULL,
>> -                        s);
>> -}
>> -
>> -static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable)
>> -{
>> -    if (s->read_poll != enable) {
>> -        s->read_poll = enable;
>> -        l2tpv3_update_fd_handler(s);
>> -    }
>> -}
>> +    /* Precomputed L2TPV3 specific offsets */
>> +    uint32_t cookie_offset;
>> +    uint32_t counter_offset;
>> +    uint32_t session_offset;
>>   -static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable)
>> -{
>> -    if (s->write_poll != enable) {
>> -        s->write_poll = enable;
>> -        l2tpv3_update_fd_handler(s);
>> -    }
>> -}
>> +} L2TPV3TunnelParams;
>>   -static void l2tpv3_writable(void *opaque)
>> -{
>> -    NetL2TPV3State *s = opaque;
>> -    l2tpv3_write_poll(s, false);
>> -    qemu_flush_queued_packets(&s->nc);
>> -}
>>   -static void l2tpv3_send_completed(NetClientState *nc, ssize_t len)
>> -{
>> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
>> -    l2tpv3_read_poll(s, true);
>> -}
>>   -static void l2tpv3_poll(NetClientState *nc, bool enable)
>> +static void l2tpv3_form_header(void *us)
>>   {
>> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
>> -    l2tpv3_write_poll(s, enable);
>> -    l2tpv3_read_poll(s, enable);
>> -}
>> +    NetUnifiedState *s = (NetUnifiedState *) us;
>> +    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
>
> How about embedding NetUnifiedState into this structure and keep using 
> NetL2TPV3State? Then:
>
> -  's' could be kept and lots of lines of changes could be saved here 
> and l2tpv3_verify_header()
> -  each transport could have their own type instead of using 
> NET_CLIENT_DRIVER_L2TPV3

Good idea. I will try it and see how it pans out.

>
> ?
>
>>   -static void l2tpv3_form_header(NetL2TPV3State *s)
>> -{
>>       uint32_t *counter;
>>   -    if (s->udp) {
>> +    if (p->udp) {
>>           stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET);
>>       }
>>       stl_be_p(
>> -            (uint32_t *) (s->header_buf + s->session_offset),
>> -            s->tx_session
>> +            (uint32_t *) (s->header_buf + p->session_offset),
>> +            p->tx_session
>>           );
>> -    if (s->cookie) {
>> -        if (s->cookie_is_64) {
>> +    if (p->cookie) {
>> +        if (p->cookie_is_64) {
>>               stq_be_p(
>> -                (uint64_t *)(s->header_buf + s->cookie_offset),
>> -                s->tx_cookie
>> +                (uint64_t *)(s->header_buf + p->cookie_offset),
>> +                p->tx_cookie
>>               );
>>           } else {
>>               stl_be_p(
>> -                (uint32_t *) (s->header_buf + s->cookie_offset),
>> -                s->tx_cookie
>> +                (uint32_t *) (s->header_buf + p->cookie_offset),
>> +                p->tx_cookie
>>               );
>>           }
>>       }
>> -    if (s->has_counter) {
>> -        counter = (uint32_t *)(s->header_buf + s->counter_offset);
>> -        if (s->pin_counter) {
>> +    if (p->has_counter) {
>> +        counter = (uint32_t *)(s->header_buf + p->counter_offset);
>> +        if (p->pin_counter) {
>>               *counter = 0;
>>           } else {
>> -            stl_be_p(counter, ++s->counter);
>> -        }
>> -    }
>> -}
>> -
>> -static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc,
>> -                    const struct iovec *iov,
>> -                    int iovcnt)
>> -{
>> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
>> -
>> -    struct msghdr message;
>> -    int ret;
>> -
>> -    if (iovcnt > MAX_L2TPV3_IOVCNT - 1) {
>> -        error_report(
>> -            "iovec too long %d > %d, change l2tpv3.h",
>> -            iovcnt, MAX_L2TPV3_IOVCNT
>> -        );
>> -        return -1;
>> -    }
>> -    l2tpv3_form_header(s);
>> -    memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
>> -    s->vec->iov_base = s->header_buf;
>> -    s->vec->iov_len = s->offset;
>> -    message.msg_name = s->dgram_dst;
>> -    message.msg_namelen = s->dst_size;
>> -    message.msg_iov = s->vec;
>> -    message.msg_iovlen = iovcnt + 1;
>> -    message.msg_control = NULL;
>> -    message.msg_controllen = 0;
>> -    message.msg_flags = 0;
>> -    do {
>> -        ret = sendmsg(s->fd, &message, 0);
>> -    } while ((ret == -1) && (errno == EINTR));
>> -    if (ret > 0) {
>> -        ret -= s->offset;
>> -    } else if (ret == 0) {
>> -        /* belt and braces - should not occur on DGRAM
>> -        * we should get an error and never a 0 send
>> -        */
>> -        ret = iov_size(iov, iovcnt);
>> -    } else {
>> -        /* signal upper layer that socket buffer is full */
>> -        ret = -errno;
>> -        if (ret == -EAGAIN || ret == -ENOBUFS) {
>> -            l2tpv3_write_poll(s, true);
>> -            ret = 0;
>> +            stl_be_p(counter, ++p->counter);
>>           }
>>       }
>> -    return ret;
>>   }
>>   -static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc,
>> -                    const uint8_t *buf,
>> -                    size_t size)
>> -{
>> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
>> -
>> -    struct iovec *vec;
>> -    struct msghdr message;
>> -    ssize_t ret = 0;
>> -
>> -    l2tpv3_form_header(s);
>> -    vec = s->vec;
>> -    vec->iov_base = s->header_buf;
>> -    vec->iov_len = s->offset;
>> -    vec++;
>> -    vec->iov_base = (void *) buf;
>> -    vec->iov_len = size;
>> -    message.msg_name = s->dgram_dst;
>> -    message.msg_namelen = s->dst_size;
>> -    message.msg_iov = s->vec;
>> -    message.msg_iovlen = 2;
>> -    message.msg_control = NULL;
>> -    message.msg_controllen = 0;
>> -    message.msg_flags = 0;
>> -    do {
>> -        ret = sendmsg(s->fd, &message, 0);
>> -    } while ((ret == -1) && (errno == EINTR));
>> -    if (ret > 0) {
>> -        ret -= s->offset;
>> -    } else if (ret == 0) {
>> -        /* belt and braces - should not occur on DGRAM
>> -        * we should get an error and never a 0 send
>> -        */
>> -        ret = size;
>> -    } else {
>> -        ret = -errno;
>> -        if (ret == -EAGAIN || ret == -ENOBUFS) {
>> -            /* signal upper layer that socket buffer is full */
>> -            l2tpv3_write_poll(s, true);
>> -            ret = 0;
>> -        }
>> -    }
>> -    return ret;
>> -}
>>   -static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
>> +static int l2tpv3_verify_header(void *us, uint8_t *buf)
>>   {
>>   +    NetUnifiedState *s = (NetUnifiedState *) us;
>> +    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
>>       uint32_t *session;
>>       uint64_t cookie;
>>   -    if ((!s->udp) && (!s->ipv6)) {
>> +    if ((!p->udp) && (!p->ipv6)) {
>>           buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
>>       }
>>   @@ -321,21 +132,21 @@ static int 
>> l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
>>       * that anyway.
>>       */
>>   -    if (s->cookie) {
>> -        if (s->cookie_is_64) {
>> -            cookie = ldq_be_p(buf + s->cookie_offset);
>> +    if (p->cookie) {
>> +        if (p->cookie_is_64) {
>> +            cookie = ldq_be_p(buf + p->cookie_offset);
>>           } else {
>> -            cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL;
>> +            cookie = ldl_be_p(buf + p->cookie_offset) & 0xffffffffULL;
>>           }
>> -        if (cookie != s->rx_cookie) {
>> +        if (cookie != p->rx_cookie) {
>>               if (!s->header_mismatch) {
>>                   error_report("unknown cookie id");
>>               }
>>               return -1;
>>           }
>>       }
>> -    session = (uint32_t *) (buf + s->session_offset);
>> -    if (ldl_be_p(session) != s->rx_session) {
>> +    session = (uint32_t *) (buf + p->session_offset);
>> +    if (ldl_be_p(session) != p->rx_session) {
>>           if (!s->header_mismatch) {
>>               error_report("session mismatch");
>>           }
>> @@ -344,203 +155,31 @@ static int l2tpv3_verify_header(NetL2TPV3State 
>> *s, uint8_t *buf)
>>       return 0;
>>   }
>>   -static void net_l2tpv3_process_queue(NetL2TPV3State *s)
>> -{
>> -    int size = 0;
>> -    struct iovec *vec;
>> -    bool bad_read;
>> -    int data_size;
>> -    struct mmsghdr *msgvec;
>> -
>> -    /* go into ring mode only if there is a "pending" tail */
>> -    if (s->queue_depth > 0) {
>> -        do {
>> -            msgvec = s->msgvec + s->queue_tail;
>> -            if (msgvec->msg_len > 0) {
>> -                data_size = msgvec->msg_len - s->header_size;
>> -                vec = msgvec->msg_hdr.msg_iov;
>> -                if ((data_size > 0) &&
>> -                    (l2tpv3_verify_header(s, vec->iov_base) == 0)) {
>> -                    vec++;
>> -                    /* Use the legacy delivery for now, we will
>> -                     * switch to using our own ring as a queueing 
>> mechanism
>> -                     * at a later date
>> -                     */
>> -                    size = qemu_send_packet_async(
>> -                            &s->nc,
>> -                            vec->iov_base,
>> -                            data_size,
>> -                            l2tpv3_send_completed
>> -                        );
>> -                    if (size == 0) {
>> -                        l2tpv3_read_poll(s, false);
>> -                    }
>> -                    bad_read = false;
>> -                } else {
>> -                    bad_read = true;
>> -                    if (!s->header_mismatch) {
>> -                        /* report error only once */
>> -                        error_report("l2tpv3 header verification 
>> failed");
>> -                        s->header_mismatch = true;
>> -                    }
>> -                }
>> -            } else {
>> -                bad_read = true;
>> -            }
>> -            s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT;
>> -            s->queue_depth--;
>> -        } while (
>> -                (s->queue_depth > 0) &&
>> -                 qemu_can_send_packet(&s->nc) &&
>> -                ((size > 0) || bad_read)
>> -            );
>> -    }
>> -}
>> -
>> -static void net_l2tpv3_send(void *opaque)
>> -{
>> -    NetL2TPV3State *s = opaque;
>> -    int target_count, count;
>> -    struct mmsghdr *msgvec;
>> -
>> -    /* go into ring mode only if there is a "pending" tail */
>> -
>> -    if (s->queue_depth) {
>> -
>> -        /* The ring buffer we use has variable intake
>> -         * count of how much we can read varies - adjust accordingly
>> -         */
>> -
>> -        target_count = MAX_L2TPV3_MSGCNT - s->queue_depth;
>> -
>> -        /* Ensure we do not overrun the ring when we have
>> -         * a lot of enqueued packets
>> -         */
>> -
>> -        if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) {
>> -            target_count = MAX_L2TPV3_MSGCNT - s->queue_head;
>> -        }
>> -    } else {
>> -
>> -        /* we do not have any pending packets - we can use
>> -        * the whole message vector linearly instead of using
>> -        * it as a ring
>> -        */
>> -
>> -        s->queue_head = 0;
>> -        s->queue_tail = 0;
>> -        target_count = MAX_L2TPV3_MSGCNT;
>> -    }
>> -
>> -    msgvec = s->msgvec + s->queue_head;
>> -    if (target_count > 0) {
>> -        do {
>> -            count = recvmmsg(
>> -                s->fd,
>> -                msgvec,
>> -                target_count, MSG_DONTWAIT, NULL);
>> -        } while ((count == -1) && (errno == EINTR));
>> -        if (count < 0) {
>> -            /* Recv error - we still need to flush packets here,
>> -             * (re)set queue head to current position
>> -             */
>> -            count = 0;
>> -        }
>> -        s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT;
>> -        s->queue_depth += count;
>> -    }
>> -    net_l2tpv3_process_queue(s);
>> -}
>> -
>> -static void destroy_vector(struct mmsghdr *msgvec, int count, int 
>> iovcount)
>> -{
>> -    int i, j;
>> -    struct iovec *iov;
>> -    struct mmsghdr *cleanup = msgvec;
>> -    if (cleanup) {
>> -        for (i = 0; i < count; i++) {
>> -            if (cleanup->msg_hdr.msg_iov) {
>> -                iov = cleanup->msg_hdr.msg_iov;
>> -                for (j = 0; j < iovcount; j++) {
>> -                    g_free(iov->iov_base);
>> -                    iov++;
>> -                }
>> -                g_free(cleanup->msg_hdr.msg_iov);
>> -            }
>> -            cleanup++;
>> -        }
>> -        g_free(msgvec);
>> -    }
>> -}
>> -
>> -static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int 
>> count)
>> -{
>> -    int i;
>> -    struct iovec *iov;
>> -    struct mmsghdr *msgvec, *result;
>> -
>> -    msgvec = g_new(struct mmsghdr, count);
>> -    result = msgvec;
>> -    for (i = 0; i < count ; i++) {
>> -        msgvec->msg_hdr.msg_name = NULL;
>> -        msgvec->msg_hdr.msg_namelen = 0;
>> -        iov =  g_new(struct iovec, IOVSIZE);
>> -        msgvec->msg_hdr.msg_iov = iov;
>> -        iov->iov_base = g_malloc(s->header_size);
>> -        iov->iov_len = s->header_size;
>> -        iov++ ;
>> -        iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
>> -        iov->iov_len = BUFFER_SIZE;
>> -        msgvec->msg_hdr.msg_iovlen = 2;
>> -        msgvec->msg_hdr.msg_control = NULL;
>> -        msgvec->msg_hdr.msg_controllen = 0;
>> -        msgvec->msg_hdr.msg_flags = 0;
>> -        msgvec++;
>> -    }
>> -    return result;
>> -}
>> -
>> -static void net_l2tpv3_cleanup(NetClientState *nc)
>> -{
>> -    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
>> -    qemu_purge_queued_packets(nc);
>> -    l2tpv3_read_poll(s, false);
>> -    l2tpv3_write_poll(s, false);
>> -    if (s->fd >= 0) {
>> -        close(s->fd);
>> -    }
>> -    destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE);
>> -    g_free(s->vec);
>> -    g_free(s->header_buf);
>> -    g_free(s->dgram_dst);
>> -}
>> -
>> -static NetClientInfo net_l2tpv3_info = {
>> -    .type = NET_CLIENT_DRIVER_L2TPV3,
>> -    .size = sizeof(NetL2TPV3State),
>> -    .receive = net_l2tpv3_receive_dgram,
>> -    .receive_iov = net_l2tpv3_receive_dgram_iov,
>> -    .poll = l2tpv3_poll,
>> -    .cleanup = net_l2tpv3_cleanup,
>> -};
>> -
>>   int net_init_l2tpv3(const Netdev *netdev,
>>                       const char *name,
>>                       NetClientState *peer, Error **errp)
>>   {
>>       /* FIXME error_setg(errp, ...) on failure */
>>       const NetdevL2TPv3Options *l2tpv3;
>> -    NetL2TPV3State *s;
>> +    NetUnifiedState *s;
>>       NetClientState *nc;
>> +    L2TPV3TunnelParams *p;
>> +
>>       int fd = -1, gairet;
>>       struct addrinfo hints;
>>       struct addrinfo *result = NULL;
>>       char *srcport, *dstport;
>>   -    nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name);
>> +    nc = qemu_new_unified_net_client(name, peer);
>> +
>> +    s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +
>> +    p = g_malloc(sizeof(L2TPV3TunnelParams));
>
> Where was this freed?
>
>>   -    s = DO_UPCAST(NetL2TPV3State, nc, nc);
>> +    s->params = p;
>>   +    s->form_header = &l2tpv3_form_header;
>> +    s->verify_header = &l2tpv3_verify_header;
>>       s->queue_head = 0;
>>       s->queue_tail = 0;
>>       s->header_mismatch = false;
>
> Why not move all above into qemu_new_unified_net()?

Only queue head/tail assignment can move.

The raw transport, which uses the same backend, does not use
header_mismatch. The form/verify header functions are different for each
sub-transport: e.g. for gre you need the gre one, for raw you need the
raw one, etc.

>
>> @@ -549,9 +188,9 @@ int net_init_l2tpv3(const Netdev *netdev,
>>       l2tpv3 = &netdev->u.l2tpv3;
>>         if (l2tpv3->has_ipv6 && l2tpv3->ipv6) {
>> -        s->ipv6 = l2tpv3->ipv6;
>> +        p->ipv6 = l2tpv3->ipv6;
>>       } else {
>> -        s->ipv6 = false;
>> +        p->ipv6 = false;
>>       }
>>         if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) {
>> @@ -561,22 +200,22 @@ int net_init_l2tpv3(const Netdev *netdev,
>>         if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) {
>>           if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) {
>> -            s->cookie = true;
>> +            p->cookie = true;
>>           } else {
>>               goto outerr;
>>           }
>>       } else {
>> -        s->cookie = false;
>> +        p->cookie = false;
>>       }
>>         if (l2tpv3->has_cookie64 || l2tpv3->cookie64) {
>> -        s->cookie_is_64  = true;
>> +        p->cookie_is_64  = true;
>>       } else {
>> -        s->cookie_is_64  = false;
>> +        p->cookie_is_64  = false;
>>       }
>>         if (l2tpv3->has_udp && l2tpv3->udp) {
>> -        s->udp = true;
>> +        p->udp = true;
>>           if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) {
>>               error_report("l2tpv3_open : need both src and dst port 
>> for udp");
>>               goto outerr;
>> @@ -585,52 +224,52 @@ int net_init_l2tpv3(const Netdev *netdev,
>>               dstport = l2tpv3->dstport;
>>           }
>>       } else {
>> -        s->udp = false;
>> +        p->udp = false;
>>           srcport = NULL;
>>           dstport = NULL;
>>       }
>>           s->offset = 4;
>> -    s->session_offset = 0;
>> -    s->cookie_offset = 4;
>> -    s->counter_offset = 4;
>> +    p->session_offset = 0;
>> +    p->cookie_offset = 4;
>> +    p->counter_offset = 4;
>>   -    s->tx_session = l2tpv3->txsession;
>> +    p->tx_session = l2tpv3->txsession;
>>       if (l2tpv3->has_rxsession) {
>> -        s->rx_session = l2tpv3->rxsession;
>> +        p->rx_session = l2tpv3->rxsession;
>>       } else {
>> -        s->rx_session = s->tx_session;
>> +        p->rx_session = p->tx_session;
>>       }
>>   -    if (s->cookie) {
>> -        s->rx_cookie = l2tpv3->rxcookie;
>> -        s->tx_cookie = l2tpv3->txcookie;
>> -        if (s->cookie_is_64 == true) {
>> +    if (p->cookie) {
>> +        p->rx_cookie = l2tpv3->rxcookie;
>> +        p->tx_cookie = l2tpv3->txcookie;
>> +        if (p->cookie_is_64 == true) {
>>               /* 64 bit cookie */
>>               s->offset += 8;
>> -            s->counter_offset += 8;
>> +            p->counter_offset += 8;
>>           } else {
>>               /* 32 bit cookie */
>>               s->offset += 4;
>> -            s->counter_offset += 4;
>> +            p->counter_offset += 4;
>>           }
>>       }
>>         memset(&hints, 0, sizeof(hints));
>>   -    if (s->ipv6) {
>> +    if (p->ipv6) {
>>           hints.ai_family = AF_INET6;
>>       } else {
>>           hints.ai_family = AF_INET;
>>       }
>> -    if (s->udp) {
>> +    if (p->udp) {
>>           hints.ai_socktype = SOCK_DGRAM;
>>           hints.ai_protocol = 0;
>>           s->offset += 4;
>> -        s->counter_offset += 4;
>> -        s->session_offset += 4;
>> -        s->cookie_offset += 4;
>> +        p->counter_offset += 4;
>> +        p->session_offset += 4;
>> +        p->cookie_offset += 4;
>>       } else {
>>           hints.ai_socktype = SOCK_RAW;
>>           hints.ai_protocol = IPPROTO_L2TP;
>> @@ -661,12 +300,12 @@ int net_init_l2tpv3(const Netdev *netdev,
>>         memset(&hints, 0, sizeof(hints));
>>   -    if (s->ipv6) {
>> +    if (p->ipv6) {
>>           hints.ai_family = AF_INET6;
>>       } else {
>>           hints.ai_family = AF_INET;
>>       }
>> -    if (s->udp) {
>> +    if (p->udp) {
>>           hints.ai_socktype = SOCK_DGRAM;
>>           hints.ai_protocol = 0;
>>       } else {
>> @@ -693,17 +332,17 @@ int net_init_l2tpv3(const Netdev *netdev,
>>       }
>>         if (l2tpv3->has_counter && l2tpv3->counter) {
>> -        s->has_counter = true;
>> +        p->has_counter = true;
>>           s->offset += 4;
>>       } else {
>> -        s->has_counter = false;
>> +        p->has_counter = false;
>>       }
>>         if (l2tpv3->has_pincounter && l2tpv3->pincounter) {
>> -        s->has_counter = true;  /* pin counter implies that there is 
>> counter */
>> -        s->pin_counter = true;
>> +        p->has_counter = true;  /* pin counter implies that there is 
>> counter */
>> +        p->pin_counter = true;
>>       } else {
>> -        s->pin_counter = false;
>> +        p->pin_counter = false;
>>       }
>>         if (l2tpv3->has_offset) {
>> @@ -711,22 +350,14 @@ int net_init_l2tpv3(const Netdev *netdev,
>>           s->offset += l2tpv3->offset;
>>       }
>>   -    if ((s->ipv6) || (s->udp)) {
>> +    if ((p->ipv6) || (p->udp)) {
>>           s->header_size = s->offset;
>>       } else {
>>           s->header_size = s->offset + sizeof(struct iphdr);
>>       }
>>   -    s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT);
>> -    s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT);
>> -    s->header_buf = g_malloc(s->header_size);
>> -
>> -    qemu_set_nonblock(fd);
>> -
>> -    s->fd = fd;
>> -    s->counter = 0;
>> -
>> -    l2tpv3_read_poll(s, true);
>> +    qemu_net_finalize_unified_init(s, fd);
>> +    p->counter = 0;
>>         snprintf(s->nc.info_str, sizeof(s->nc.info_str),
>>                "l2tpv3: connected");
>> diff --git a/net/net.c b/net/net.c
>> index 6235aabed8..9270b52ac8 100644
>> --- a/net/net.c
>> +++ b/net/net.c
>> @@ -959,8 +959,8 @@ static int (* const 
>> net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>>   #ifdef CONFIG_VHOST_NET_USED
>>           [NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user,
>>   #endif
>> -#ifdef CONFIG_L2TPV3
>> -        [NET_CLIENT_DRIVER_L2TPV3]    = net_init_l2tpv3,
>> +#ifdef CONFIG_UNIFIED
>> +        [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
>>   #endif
>>   };
>>   diff --git a/net/unified.c b/net/unified.c
>
> Not a native speaker, but I think we need a better name here, e.g. udst,
> which is short for Unified Datagram Socket Transport?

I am not a native speaker either :)

I am OK with that - let's call it udst, as it is more descriptive and
clearly delineates that you cannot migrate tcp/socket to it.

>
>> new file mode 100644
>> index 0000000000..f15d1e1eed
>> --- /dev/null
>> +++ b/net/unified.c
>> @@ -0,0 +1,406 @@
>> +/*
>> + * QEMU System Emulator
>> + *
>> + * Copyright (c) 2015-2017 Cambridge Greys Limited
>> + * Copyright (c) 2012-2014 Cisco Systems
>> + * Copyright (c) 2003-2008 Fabrice Bellard
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a copy
>> + * of this software and associated documentation files (the 
>> "Software"), to deal
>> + * in the Software without restriction, including without limitation 
>> the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, 
>> and/or sell
>> + * copies of the Software, and to permit persons to whom the 
>> Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be 
>> included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
>> DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include <linux/ip.h>
>> +#include <netdb.h>
>> +#include "net/net.h"
>> +#include "clients.h"
>> +#include "qemu-common.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/option.h"
>> +#include "qemu/sockets.h"
>> +#include "qemu/iov.h"
>> +#include "qemu/main-loop.h"
>> +#include "unified.h"
>> +
>> +static void net_unified_send(void *opaque);
>> +static void unified_writable(void *opaque);
>> +
>> +static void unified_update_fd_handler(NetUnifiedState *s)
>> +{
>> +    qemu_set_fd_handler(s->fd,
>> +                        s->read_poll ? net_unified_send : NULL,
>> +                        s->write_poll ? unified_writable : NULL,
>> +                        s);
>> +}
>> +
>> +static void unified_read_poll(NetUnifiedState *s, bool enable)
>> +{
>> +    if (s->read_poll != enable) {
>> +        s->read_poll = enable;
>> +        unified_update_fd_handler(s);
>> +    }
>> +}
>> +
>> +static void unified_write_poll(NetUnifiedState *s, bool enable)
>> +{
>> +    if (s->write_poll != enable) {
>> +        s->write_poll = enable;
>> +        unified_update_fd_handler(s);
>> +    }
>> +}
>> +
>> +static void unified_writable(void *opaque)
>> +{
>> +    NetUnifiedState *s = opaque;
>> +    unified_write_poll(s, false);
>> +    qemu_flush_queued_packets(&s->nc);
>> +}
>> +
>> +static void unified_send_completed(NetClientState *nc, ssize_t len)
>> +{
>> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +    unified_read_poll(s, true);
>> +}
>> +
>> +static void unified_poll(NetClientState *nc, bool enable)
>> +{
>> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +    unified_write_poll(s, enable);
>> +    unified_read_poll(s, enable);
>> +}
>> +
>> +static ssize_t net_unified_receive_dgram_iov(NetClientState *nc,
>> +                    const struct iovec *iov,
>> +                    int iovcnt)
>> +{
>> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +
>> +    struct msghdr message;
>> +    int ret;
>> +
>> +    if (iovcnt > MAX_UNIFIED_IOVCNT - 1) {
>> +        error_report(
>> +            "iovec too long %d > %d, change unified.h",
>> +            iovcnt, MAX_UNIFIED_IOVCNT
>> +        );
>> +        return -1;
>> +    }
>> +    if (s->offset > 0) {
>
> net_l2tpv3_receive_dgram_iov() does not have this check. I guess 
> s->offset=0 will be used by another transport. Maybe it's better to 
> delay this change until it has a real user or add a comment here.

The real user is in patch No 3 (raw).

>
>> +        s->form_header(s);
>> +        memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
>> +        s->vec->iov_base = s->header_buf;
>> +        s->vec->iov_len = s->offset;
>> +        message.msg_iovlen = iovcnt + 1;
>> +    } else {
>> +        memcpy(s->vec, iov, iovcnt * sizeof(struct iovec));
>> +        message.msg_iovlen = iovcnt;
>> +    }
>> +    message.msg_name = s->dgram_dst;
>> +    message.msg_namelen = s->dst_size;
>> +    message.msg_iov = s->vec;
>> +    message.msg_control = NULL;
>> +    message.msg_controllen = 0;
>> +    message.msg_flags = 0;
>> +    do {
>> +        ret = sendmsg(s->fd, &message, 0);
>> +    } while ((ret == -1) && (errno == EINTR));
>> +    if (ret > 0) {
>> +        ret -= s->offset;
>> +    } else if (ret == 0) {
>> +        /* belt and braces - should not occur on DGRAM
>> +        * we should get an error and never a 0 send
>> +        */
>> +        ret = iov_size(iov, iovcnt);
>> +    } else {
>> +        /* signal upper layer that socket buffer is full */
>> +        ret = -errno;
>> +        if (ret == -EAGAIN || ret == -ENOBUFS) {
>> +            unified_write_poll(s, true);
>> +            ret = 0;
>> +        }
>> +    }
>> +    return ret;
>> +}
>> +
>> +static ssize_t net_unified_receive_dgram(NetClientState *nc,
>> +                    const uint8_t *buf,
>> +                    size_t size)
>> +{
>> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +
>> +    struct iovec *vec;
>> +    struct msghdr message;
>> +    ssize_t ret = 0;
>> +
>> +    vec = s->vec;
>> +    if (s->offset > 0) {
>> +        s->form_header(s);
>> +        vec->iov_base = s->header_buf;
>> +        vec->iov_len = s->offset;
>> +        message.msg_iovlen = 2;
>> +        vec++;
>> +    } else {
>> +        message.msg_iovlen = 1;
>> +    }
>> +    vec->iov_base = (void *) buf;
>> +    vec->iov_len = size;
>> +    message.msg_name = s->dgram_dst;
>> +    message.msg_namelen = s->dst_size;
>> +    message.msg_iov = s->vec;
>> +    message.msg_control = NULL;
>> +    message.msg_controllen = 0;
>> +    message.msg_flags = 0;
>> +    do {
>> +        ret = sendmsg(s->fd, &message, 0);
>> +    } while ((ret == -1) && (errno == EINTR));
>> +    if (ret > 0) {
>> +        ret -= s->offset;
>> +    } else if (ret == 0) {
>> +        /* belt and braces - should not occur on DGRAM
>> +        * we should get an error and never a 0 send
>> +        */
>> +        ret = size;
>> +    } else {
>> +        ret = -errno;
>> +        if (ret == -EAGAIN || ret == -ENOBUFS) {
>> +            /* signal upper layer that socket buffer is full */
>> +            unified_write_poll(s, true);
>> +            ret = 0;
>> +        }
>> +    }
>> +    return ret;
>> +}
>> +
>> +
>> +static void net_unified_process_queue(NetUnifiedState *s)
>> +{
>> +    int size = 0;
>> +    struct iovec *vec;
>> +    bool bad_read;
>> +    int data_size;
>> +    struct mmsghdr *msgvec;
>> +
>> +    /* go into ring mode only if there is a "pending" tail */
>> +    if (s->queue_depth > 0) {
>> +        do {
>> +            msgvec = s->msgvec + s->queue_tail;
>> +            if (msgvec->msg_len > 0) {
>> +                data_size = msgvec->msg_len - s->header_size;
>> +                vec = msgvec->msg_hdr.msg_iov;
>> +                if ((data_size > 0) &&
>> +                    (s->verify_header(s, vec->iov_base) == 0)) {
>> +                    if (s->header_size > 0) {
>> +                        vec++;
>> +                    }
>> +                    /* Use the legacy delivery for now, we will
>> +                     * switch to using our own ring as a queueing 
>> mechanism
>> +                     * at a later date
>> +                     */
>> +                    size = qemu_send_packet_async(
>> +                            &s->nc,
>> +                            vec->iov_base,
>> +                            data_size,
>> +                            unified_send_completed
>> +                        );
>> +                    if (size == 0) {
>> +                        unified_read_poll(s, false);
>> +                    }
>> +                    bad_read = false;
>> +                } else {
>> +                    bad_read = true;
>> +                    if (!s->header_mismatch) {
>> +                        /* report error only once */
>> +                        error_report("unified header verification 
>> failed");
>> +                        s->header_mismatch = true;
>> +                    }
>> +                }
>> +            } else {
>> +                bad_read = true;
>> +            }
>> +            s->queue_tail = (s->queue_tail + 1) % MAX_UNIFIED_MSGCNT;
>> +            s->queue_depth--;
>> +        } while (
>> +                (s->queue_depth > 0) &&
>> +                 qemu_can_send_packet(&s->nc) &&
>> +                ((size > 0) || bad_read)
>> +            );
>> +    }
>> +}
>> +
>> +static void net_unified_send(void *opaque)
>> +{
>> +    NetUnifiedState *s = opaque;
>> +    int target_count, count;
>> +    struct mmsghdr *msgvec;
>> +
>> +    /* go into ring mode only if there is a "pending" tail */
>> +
>> +    if (s->queue_depth) {
>> +
>> +        /* The ring buffer we use has variable intake
>> +         * count of how much we can read varies - adjust accordingly
>> +         */
>> +
>> +        target_count = MAX_UNIFIED_MSGCNT - s->queue_depth;
>> +
>> +        /* Ensure we do not overrun the ring when we have
>> +         * a lot of enqueued packets
>> +         */
>> +
>> +        if (s->queue_head + target_count > MAX_UNIFIED_MSGCNT) {
>> +            target_count = MAX_UNIFIED_MSGCNT - s->queue_head;
>> +        }
>> +    } else {
>> +
>> +        /* we do not have any pending packets - we can use
>> +        * the whole message vector linearly instead of using
>> +        * it as a ring
>> +        */
>> +
>> +        s->queue_head = 0;
>> +        s->queue_tail = 0;
>> +        target_count = MAX_UNIFIED_MSGCNT;
>> +    }
>> +
>> +    msgvec = s->msgvec + s->queue_head;
>> +    if (target_count > 0) {
>> +        do {
>> +            count = recvmmsg(
>> +                s->fd,
>> +                msgvec,
>> +                target_count, MSG_DONTWAIT, NULL);
>> +        } while ((count == -1) && (errno == EINTR));
>> +        if (count < 0) {
>> +            /* Recv error - we still need to flush packets here,
>> +             * (re)set queue head to current position
>> +             */
>> +            count = 0;
>> +        }
>> +        s->queue_head = (s->queue_head + count) % MAX_UNIFIED_MSGCNT;
>> +        s->queue_depth += count;
>> +    }
>> +    net_unified_process_queue(s);
>> +}
>> +
>> +static void destroy_vector(struct mmsghdr *msgvec, int count, int 
>> iovcount)
>> +{
>> +    int i, j;
>> +    struct iovec *iov;
>> +    struct mmsghdr *cleanup = msgvec;
>> +    if (cleanup) {
>> +        for (i = 0; i < count; i++) {
>> +            if (cleanup->msg_hdr.msg_iov) {
>> +                iov = cleanup->msg_hdr.msg_iov;
>> +                for (j = 0; j < iovcount; j++) {
>> +                    g_free(iov->iov_base);
>> +                    iov++;
>> +                }
>> +                g_free(cleanup->msg_hdr.msg_iov);
>> +            }
>> +            cleanup++;
>> +        }
>> +        g_free(msgvec);
>> +    }
>> +}
>> +
>> +
>> +
>> +static struct mmsghdr *build_unified_vector(NetUnifiedState *s, int 
>> count)
>> +{
>> +    int i;
>> +    struct iovec *iov;
>> +    struct mmsghdr *msgvec, *result;
>> +
>> +    msgvec = g_new(struct mmsghdr, count);
>> +    result = msgvec;
>> +    for (i = 0; i < count ; i++) {
>> +        msgvec->msg_hdr.msg_name = NULL;
>> +        msgvec->msg_hdr.msg_namelen = 0;
>> +        iov =  g_new(struct iovec, IOVSIZE);
>> +        msgvec->msg_hdr.msg_iov = iov;
>> +        if (s->header_size > 0) {
>
> Same here.
>
>> +            iov->iov_base = g_malloc(s->header_size);
>> +            iov->iov_len = s->header_size;
>> +            iov++ ;
>> +        }
>> +        iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
>> +        iov->iov_len = BUFFER_SIZE;
>> +        msgvec->msg_hdr.msg_iovlen = 2;
>> +        msgvec->msg_hdr.msg_control = NULL;
>> +        msgvec->msg_hdr.msg_controllen = 0;
>> +        msgvec->msg_hdr.msg_flags = 0;
>> +        msgvec++;
>> +    }
>> +    return result;
>> +}
>> +
>> +static void net_unified_cleanup(NetClientState *nc)
>> +{
>> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +    qemu_purge_queued_packets(nc);
>> +    unified_read_poll(s, false);
>> +    unified_write_poll(s, false);
>> +    if (s->fd >= 0) {
>> +        close(s->fd);
>> +    }
>> +    if (s->header_size > 0) {
>> +        destroy_vector(s->msgvec, MAX_UNIFIED_MSGCNT, IOVSIZE);
>> +    } else {
>> +        destroy_vector(s->msgvec, MAX_UNIFIED_MSGCNT, 1);
>> +    }
>> +    g_free(s->vec);
>> +    if (s->header_buf != NULL) {
>> +        g_free(s->header_buf);
>> +    }
>> +    if (s->dgram_dst != NULL) {
>> +        g_free(s->dgram_dst);
>> +    }
>> +}
>> +
>> +static NetClientInfo net_unified_info = {
>> +    /* we share this one for all types for now, wrong I know :) */
>> +    .type = NET_CLIENT_DRIVER_L2TPV3,
>
> Like I said above, better to have transport specific type.

Agreed. I will get on with it. I may need some help on how to introduce a 
transport which is not selectable by users (just used as a backend for 
other transports) into the JSON schema.

As it stands, the schema is designed to produce "end-user-visible" options.

A.

>
> Thanks
>
>> +    .size = sizeof(NetUnifiedState),
>> +    .receive = net_unified_receive_dgram,
>> +    .receive_iov = net_unified_receive_dgram_iov,
>> +    .poll = unified_poll,
>> +    .cleanup = net_unified_cleanup,
>> +};
>> +
>> +NetClientState *qemu_new_unified_net_client(const char *name,
>> +                    NetClientState *peer) {
>> +    return qemu_new_net_client(&net_unified_info, peer, "unified", 
>> name);
>> +}
>> +
>> +void qemu_net_finalize_unified_init(NetUnifiedState *s, int fd)
>> +{
>> +
>> +    s->msgvec = build_unified_vector(s, MAX_UNIFIED_MSGCNT);
>> +    s->vec = g_new(struct iovec, MAX_UNIFIED_IOVCNT);
>> +    if (s->header_size > 0) {
>> +        s->header_buf = g_malloc(s->header_size);
>> +    } else {
>> +        s->header_buf = NULL;
>> +    }
>> +    qemu_set_nonblock(fd);
>> +
>> +    s->fd = fd;
>> +    unified_read_poll(s, true);
>> +
>> +}
>> +
>> diff --git a/net/unified.h b/net/unified.h
>> new file mode 100644
>> index 0000000000..97ec743f0e
>> --- /dev/null
>> +++ b/net/unified.h
>> @@ -0,0 +1,118 @@
>> +/*
>> + * QEMU System Emulator
>> + *
>> + * Copyright (c) 2015-2017 Cambridge Greys Limited
>> + * Copyright (c) 2012-2014 Cisco Systems
>> + * Copyright (c) 2003-2008 Fabrice Bellard
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a copy
>> + * of this software and associated documentation files (the 
>> "Software"), to deal
>> + * in the Software without restriction, including without limitation 
>> the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, 
>> and/or sell
>> + * copies of the Software, and to permit persons to whom the 
>> Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be 
>> included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
>> DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +
>> +
>> +#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
>> +#define BUFFER_SIZE 2048
>> +#define IOVSIZE 2
>> +#define MAX_UNIFIED_MSGCNT 64
>> +#define MAX_UNIFIED_IOVCNT (MAX_UNIFIED_MSGCNT * IOVSIZE)
>> +
>> +#ifndef QEMU_NET_UNIFIED_H
>> +#define QEMU_NET_UNIFIED_H
>> +
>> +typedef struct NetUnifiedState {
>> +    NetClientState nc;
>> +
>> +    int fd;
>> +
>> +    /*
>> +     * these are used for xmit - that happens packet a time
>> +     * and for first sign of life packet (easier to parse that once)
>> +     */
>> +
>> +    uint8_t *header_buf;
>> +    struct iovec *vec;
>> +
>> +    /*
>> +     * these are used for receive - try to "eat" up to 32 packets at 
>> a time
>> +     */
>> +
>> +    struct mmsghdr *msgvec;
>> +
>> +    /*
>> +     * peer address
>> +     */
>> +
>> +    struct sockaddr_storage *dgram_dst;
>> +    uint32_t dst_size;
>> +
>> +    /*
>> +     * Internal Queue
>> +     */
>> +
>> +    /*
>> +    * DOS avoidance in error handling
>> +    */
>> +
>> +    /* Easier to keep l2tpv3 specific */
>> +
>> +    bool header_mismatch;
>> +
>> +    /*
>> +     *
>> +     * Ring buffer handling
>> +     *
>> +     */
>> +
>> +    int queue_head;
>> +    int queue_tail;
>> +    int queue_depth;
>> +
>> +    /*
>> +     * Offset to data - common for all protocols
>> +     */
>> +
>> +    uint32_t offset;
>> +
>> +    /*
>> +     * Header size - common for all protocols
>> +     */
>> +
>> +    uint32_t header_size;
>> +    /* Poll Control */
>> +
>> +    bool read_poll;
>> +    bool write_poll;
>> +
>> +    /* Parameters */
>> +
>> +    void *params;
>> +
>> +    /* header forming functions */
>> +
>> +    int (*verify_header)(void *s, uint8_t *buf);
>> +    void (*form_header)(void *s);
>> +
>> +} NetUnifiedState;
>> +
>> +extern NetClientState *qemu_new_unified_net_client(const char *name,
>> +                    NetClientState *peer);
>> +
>> +extern void qemu_net_finalize_unified_init(NetUnifiedState *s, int fd);
>> +#endif
>
>

-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-19  5:48   ` Jason Wang
@ 2017-07-19  5:50     ` Anton Ivanov
  0 siblings, 0 replies; 23+ messages in thread
From: Anton Ivanov @ 2017-07-19  5:50 UTC (permalink / raw)
  To: Jason Wang, qemu-devel

OK. Will address both comments in the next version.

Brgds,

A.


On 19/07/17 06:48, Jason Wang wrote:
>
>
> On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> This adds GRETAP support to the unified socket driver.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   net/Makefile.objs |   2 +-
>>   net/clients.h     |   4 +
>>   net/gre.c         | 313 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   net/net.c         |   5 +
>>   qapi-schema.json  |  46 +++++++-
>>   qemu-options.hx   |  63 ++++++++++-
>>   6 files changed, 425 insertions(+), 8 deletions(-)
>>   create mode 100644 net/gre.c
>>
>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>> index 8026ad778a..128164e39b 100644
>> --- a/net/Makefile.objs
>> +++ b/net/Makefile.objs
>> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>>   common-obj-y += socket.o
>>   common-obj-y += dump.o
>>   common-obj-y += eth.o
>> -common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
>> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o
>>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>>   common-obj-$(CONFIG_SLIRP) += slirp.o
>>   common-obj-$(CONFIG_VDE) += vde.o
>> diff --git a/net/clients.h b/net/clients.h
>> index 5cae479730..8f8a59aee3 100644
>> --- a/net/clients.h
>> +++ b/net/clients.h
>> @@ -49,6 +49,10 @@ int net_init_bridge(const Netdev *netdev, const 
>> char *name,
>>     int net_init_l2tpv3(const Netdev *netdev, const char *name,
>>                       NetClientState *peer, Error **errp);
>> +
>> +int net_init_gre(const Netdev *netdev, const char *name,
>> +                    NetClientState *peer, Error **errp);
>> +
>>   #ifdef CONFIG_VDE
>>   int net_init_vde(const Netdev *netdev, const char *name,
>>                    NetClientState *peer, Error **errp);
>> diff --git a/net/gre.c b/net/gre.c
>> new file mode 100644
>> index 0000000000..ee8c36dd4d
>> --- /dev/null
>> +++ b/net/gre.c
>> @@ -0,0 +1,313 @@
>> +/*
>> + * QEMU System Emulator
>> + *
>> + * Copyright (c) 2015-2017 Cambridge GREys Limited
>> + * Copyright (c) 2003-2008 Fabrice Bellard
>> + * Copyright (c) 2012-2014 Cisco Systems
>> + *
>> + * Permission is hereby granted, free of charge, to any person 
>> obtaining a copy
>> + * of this software and associated documentation files (the 
>> "Software"), to deal
>> + * in the Software without restriction, including without limitation 
>> the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, 
>> and/or sell
>> + * copies of the Software, and to permit persons to whom the 
>> Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be 
>> included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
>> ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
>> DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include <linux/ip.h>
>> +#include <netdb.h>
>> +#include "net/net.h"
>> +#include "clients.h"
>> +#include "qemu-common.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/option.h"
>> +#include "qemu/sockets.h"
>> +#include "qemu/iov.h"
>> +#include "qemu/main-loop.h"
>> +#include "unified.h"
>> +
>> +/* IANA-assigned IP protocol ID for GRE */
>> +
>> +
>> +#ifndef IPPROTO_GRE
>> +#define IPPROTO_GRE 0x2F
>> +#endif
>> +
>> +#define GRE_MODE_CHECKSUM     htons(8 << 12)   /* checksum */
>> +#define GRE_MODE_RESERVED     htons(4 << 12)   /* unused */
>> +#define GRE_MODE_KEY          htons(2 << 12)   /* KEY present */
>> +#define GRE_MODE_SEQUENCE     htons(1 << 12)   /* no sequence */
>> +
>> +
>> +/* GRE TYPE for Ethernet in GRE aka GRETAP */
>> +
>> +#define GRE_IRB htons(0x6558)
>> +
>> +struct gre_minimal_header {
>> +   uint16_t header;
>> +   uint16_t arptype;
>> +};
>> +
>> +typedef struct GRETunnelParams {
>> +    /*
>> +     * GRE parameters
>> +     */
>> +
>> +    uint32_t rx_key;
>> +    uint32_t tx_key;
>> +    uint32_t sequence;
>> +
>> +    /* Flags */
>> +
>> +    bool ipv6;
>> +    bool udp;
>> +    bool has_sequence;
>> +    bool pin_sequence;
>> +    bool checksum;
>> +    bool key;
>> +
>> +    /* Precomputed GRE specific offsets */
>> +
>> +    uint32_t key_offset;
>> +    uint32_t sequence_offset;
>> +    uint32_t checksum_offset;
>> +
>> +    struct gre_minimal_header header_bits;
>> +
>> +} GRETunnelParams;
>> +
>> +
>> +
>> +static void gre_form_header(void *us)
>> +{
>> +    NetUnifiedState *s = (NetUnifiedState *) us;
>> +    GRETunnelParams *p = (GRETunnelParams *) s->params;
>> +
>> +    uint32_t *sequence;
>> +
>> +    *((uint32_t *) s->header_buf) = *((uint32_t *) &p->header_bits);
>> +
>> +    if (p->key) {
>> +        stl_be_p(
>> +            (uint32_t *) (s->header_buf + p->key_offset),
>> +            p->tx_key
>> +        );
>> +    }
>> +    if (p->has_sequence) {
>> +        sequence = (uint32_t *)(s->header_buf + p->sequence_offset);
>> +        if (p->pin_sequence) {
>> +            *sequence = 0;
>> +        } else {
>> +            stl_be_p(sequence, ++p->sequence);
>> +        }
>> +    }
>> +}
>> +
>> +static int gre_verify_header(void *us, uint8_t *buf)
>> +{
>> +
>> +    NetUnifiedState *s = (NetUnifiedState *) us;
>> +    GRETunnelParams *p = (GRETunnelParams *) s->params;
>> +    uint32_t key;
>> +
>> +
>> +    if (!p->ipv6) {
>> +        buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
>> +    }
>> +
>> +    if (*((uint32_t *) buf) != *((uint32_t *) &p->header_bits)) {
>> +        if (!s->header_mismatch) {
>> +            error_report("header type disagreement, expecting %0x, 
>> got %0x",
>> +                *((uint32_t *) &p->header_bits), *((uint32_t *) buf));
>> +        }
>> +        return -1;
>> +    }
>> +
>> +    if (p->key) {
>> +        key = ldl_be_p(buf + p->key_offset);
>> +        if (key != p->rx_key) {
>> +            if (!s->header_mismatch) {
>> +                error_report("unknown key id %0x, expecting %0x",
>> +                    key, p->rx_key);
>> +            }
>> +            return -1;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +int net_init_gre(const Netdev *netdev,
>> +                    const char *name,
>> +                    NetClientState *peer, Error **errp)
>> +{
>> +    /* FIXME error_setg(errp, ...) on failure */
>
> Let's do this in the next version.
>
>> +    const NetdevGREOptions *gre;
>> +    NetUnifiedState *s;
>> +    NetClientState *nc;
>> +    GRETunnelParams *p;
>> +
>> +    int fd = -1, gairet;
>> +    struct addrinfo hints;
>> +    struct addrinfo *result = NULL;
>> +
>> +    nc = qemu_new_unified_net_client(name, peer);
>> +
>> +    s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +
>> +    p = g_malloc(sizeof(GRETunnelParams));
>> +
>> +    s->params = p;
>> +    p->header_bits.arptype = GRE_IRB;
>> +    p->header_bits.header = 0;
>> +
>> +    s->form_header = &gre_form_header;
>> +    s->verify_header = &gre_verify_header;
>> +    s->queue_head = 0;
>> +    s->queue_tail = 0;
>> +    s->header_mismatch = false;
>> +
>> +    assert(netdev->type == NET_CLIENT_DRIVER_GRE);
>> +    gre = &netdev->u.gre;
>> +
>> +    if (gre->has_ipv6 && gre->ipv6) {
>> +        p->ipv6 = gre->ipv6;
>> +    } else {
>> +        p->ipv6 = false;
>> +    }
>> +
>> +    s->offset = 4;
>> +    p->key_offset = 4;
>> +    p->sequence_offset = 4;
>> +    p->checksum_offset = 4;
>> +
>> +    if (gre->has_rxkey || gre->has_txkey) {
>> +        if (gre->has_rxkey && gre->has_txkey) {
>> +            p->key = true;
>> +            p->header_bits.header |= GRE_MODE_KEY;
>> +        } else {
>> +            goto outerr;
>> +        }
>> +    } else {
>> +        p->key = false;
>> +    }
>> +
>> +    if (p->key) {
>> +        p->rx_key = gre->rxkey;
>> +        p->tx_key = gre->txkey;
>> +        s->offset += 4;
>> +        p->sequence_offset += 4;
>> +    }
>> +
>> +
>> +    if (gre->has_sequence && gre->sequence) {
>> +        s->offset += 4;
>> +        p->has_sequence = true;
>> +        p->header_bits.header |= GRE_MODE_SEQUENCE;
>> +    } else {
>> +        p->sequence = false;
>> +    }
>> +
>> +    if (gre->has_pinsequence && gre->pinsequence) {
>> +        /* pin sequence implies that there is sequence */
>> +        p->has_sequence = true;
>> +        p->pin_sequence = true;
>> +    } else {
>> +        p->pin_sequence = false;
>> +    }
>> +
>> +    memset(&hints, 0, sizeof(hints));
>> +
>> +    if (p->ipv6) {
>> +        hints.ai_family = AF_INET6;
>> +    } else {
>> +        hints.ai_family = AF_INET;
>> +    }
>> +
>> +    hints.ai_socktype = SOCK_RAW;
>> +    hints.ai_protocol = IPPROTO_GRE;
>> +
>> +    gairet = getaddrinfo(gre->src, NULL, &hints, &result);
>> +
>> +    if ((gairet != 0) || (result == NULL)) {
>> +        error_report(
>> +            "gre_open : could not resolve src, errno = %s",
>> +            gai_strerror(gairet)
>> +        );
>> +        goto outerr;
>> +    }
>> +    fd = socket(result->ai_family, result->ai_socktype, 
>> result->ai_protocol);
>> +    if (fd == -1) {
>> +        fd = -errno;
>> +        error_report("gre_open : socket creation failed, errno = 
>> %d", -fd);
>> +        goto outerr;
>> +    }
>> +    if (bind(fd, (struct sockaddr *) result->ai_addr, 
>> result->ai_addrlen)) {
>> +        error_report("gre_open :  could not bind socket err=%i", 
>> errno);
>> +        goto outerr;
>> +    }
>> +    if (result) {
>> +        freeaddrinfo(result);
>> +    }
>> +
>> +    memset(&hints, 0, sizeof(hints));
>> +
>> +    if (p->ipv6) {
>> +        hints.ai_family = AF_INET6;
>> +    } else {
>> +        hints.ai_family = AF_INET;
>> +    }
>> +    hints.ai_socktype = SOCK_RAW;
>> +    hints.ai_protocol = IPPROTO_GRE;
>> +
>> +    result = NULL;
>> +    gairet = getaddrinfo(gre->dst, NULL, &hints, &result);
>> +    if ((gairet != 0) || (result == NULL)) {
>> +        error_report(
>> +            "gre_open : could not resolve dst, error = %s",
>> +            gai_strerror(gairet)
>> +        );
>> +        goto outerr;
>> +    }
>> +
>> +    s->dgram_dst = g_new0(struct sockaddr_storage, 1);
>> +    memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen);
>
> Use g_memdup() instead?
>
>> +    s->dst_size = result->ai_addrlen;
>> +
>> +    if (result) {
>> +        freeaddrinfo(result);
>> +    }
>> +
>> +    if ((p->ipv6) || (p->udp)) {
>> +        s->header_size = s->offset;
>> +    } else {
>> +        s->header_size = s->offset + sizeof(struct iphdr);
>> +    }
>> +
>> +    qemu_net_finalize_unified_init(s, fd);
>> +
>> +    p->sequence = 0;
>> +
>> +    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
>> +             "gre: connected");
>> +    return 0;
>> +outerr:
>> +    qemu_del_net_client(nc);
>> +    if (fd >= 0) {
>> +        close(fd);
>> +    }
>> +    if (result) {
>> +        freeaddrinfo(result);
>> +    }
>> +    return -1;
>> +}
>> diff --git a/net/net.c b/net/net.c
>> index 9270b52ac8..b75b6e8154 100644
>> --- a/net/net.c
>> +++ b/net/net.c
>> @@ -961,6 +961,7 @@ static int (* const 
>> net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>>   #endif
>>   #ifdef CONFIG_UNIFIED
>>           [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
>> +        [NET_CLIENT_DRIVER_GRE] = net_init_gre,
>>   #endif
>>   };
>>   @@ -1012,6 +1013,10 @@ static int net_client_init1(const void 
>> *object, bool is_netdev, Error **errp)
>>               legacy.type = NET_CLIENT_DRIVER_L2TPV3;
>>               legacy.u.l2tpv3 = opts->u.l2tpv3;
>>               break;
>> +        case NET_LEGACY_OPTIONS_TYPE_GRE:
>> +            legacy.type = NET_CLIENT_DRIVER_GRE;
>> +            legacy.u.gre = opts->u.gre;
>> +            break;
>>           case NET_LEGACY_OPTIONS_TYPE_SOCKET:
>>               legacy.type = NET_CLIENT_DRIVER_SOCKET;
>>               legacy.u.socket = opts->u.socket;
>> diff --git a/qapi-schema.json b/qapi-schema.json
>> index ab438ead70..aec303a14e 100644
>> --- a/qapi-schema.json
>> +++ b/qapi-schema.json
>> @@ -3847,7 +3847,41 @@
>>       'txsession':    'uint32',
>>       '*rxsession':   'uint32',
>>       '*offset':      'uint32' } }
>> -
>> +##
>> +# @NetdevGREOptions:
>> +#
>> +# Connect the VLAN to Ethernet over Ethernet over GRE (GRETAP) tunnel
>> +#
>> +# @src: source address
>> +#
>> +# @dst: destination address
>> +#
>> +# @ipv6: force the use of ipv6
>> +#
>> +# @sequence: have sequence counter
>> +#
>> +# @pinsequence: pin sequence counter to zero -
>> +#              workaround for buggy implementations or
>> +#              networks with packet reorder
>> +#
>> +# @txkey: 32 bit transmit key
>> +#
>> +# @rxkey: 32 bit receive key
>> +#
>> +# Note - gre checksums are not supported at present
>> +#
>> +#
>> +# Since 2.9
>
> We are soft freeze now. So I will target this for 2.11.
>
>> +##
>> +{ 'struct': 'NetdevGREOptions',
>> +  'data': {
>> +    'src':          'str',
>> +    'dst':          'str',
>> +    '*ipv6':        'bool',
>> +    '*sequence':     'bool',
>> +    '*pinsequence':  'bool',
>> +    '*txkey':    'uint32',
>> +    '*rxkey':    'uint32' } }
>>   ##
>>   # @NetdevVdeOptions:
>>   #
>> @@ -3966,7 +4000,7 @@
>>   ##
>>   { 'enum': 'NetClientDriver',
>>     'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 
>> 'vde', 'dump',
>> -            'bridge', 'hubport', 'netmap', 'vhost-user' ] }
>> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
>>     ##
>>   # @Netdev:
>> @@ -3996,7 +4030,8 @@
>>       'bridge':   'NetdevBridgeOptions',
>>       'hubport':  'NetdevHubPortOptions',
>>       'netmap':   'NetdevNetmapOptions',
>> -    'vhost-user': 'NetdevVhostUserOptions' } }
>> +    'vhost-user': 'NetdevVhostUserOptions',
>> +    'gre':      'NetdevGREOptions' } }
>>     ##
>>   # @NetLegacy:
>> @@ -4027,7 +4062,7 @@
>>   ##
>>   { 'enum': 'NetLegacyOptionsType',
>>     'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
>> -           'dump', 'bridge', 'netmap', 'vhost-user'] }
>> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
>>     ##
>>   # @NetLegacyOptions:
>> @@ -4050,7 +4085,8 @@
>>       'dump':     'NetdevDumpOptions',
>>       'bridge':   'NetdevBridgeOptions',
>>       'netmap':   'NetdevNetmapOptions',
>> -    'vhost-user': 'NetdevVhostUserOptions' } }
>> +    'vhost-user': 'NetdevVhostUserOptions',
>> +    'gre':      'NetdevGREOptions' } }
>>     ##
>>   # @NetFilterDirection:
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 2cc70b9cfc..6f8d5cbe21 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -1945,7 +1945,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>>       "                connected to a bridge (default=" 
>> DEFAULT_BRIDGE_INTERFACE ")\n"
>>       "                using the program 'helper (default=" 
>> DEFAULT_BRIDGE_HELPER ")\n"
>>   #endif
>> -#ifdef __linux__
>> +#ifdef CONFIG_UNIFIED
>>       "-netdev 
>> l2tpv3,id=str,src=srcaddr,dst=dstaddr[,srcport=srcport][,dstport=dstport]\n"
>>       " 
>> [,rxsession=rxsession],txsession=txsession[,ipv6=on/off][,udp=on/off]\n"
>>       " [,cookie64=on/off][,counter][,pincounter][,txcookie=txcookie]\n"
>> @@ -1971,6 +1971,23 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>>       "                use 'counter=off' to force a 'cut-down' L2TPv3 
>> with no counter\n"
>>       "                use 'pincounter=on' to work around broken 
>> counter handling in peer\n"
>>       "                use 'offset=X' to add an extra offset between 
>> header and data\n"
>> +    "-netdev 
>> gre,id=str,src=srcaddr,dst=dstaddr[,rxkey=rxkey],txkey=txkey[,ipv6=on/off]\n"
>> +    "         [,sequence][,pinsequence]\n"
>> +    "                configure a network backend with ID 'str' 
>> connected to\n"
>> +    "                an Ethernet over GRE pseudowire (aka GRE TAP).\n"
>> +    "                Linux kernel 3.3+ as well as most routers and 
>> some switches\n"
>> +    "                can talk GRETAP. This transport allows 
>> connecting a VM to a VM,\n"
>> +    "                VM to a router and even VM to Host. It is a 
>> nearly-universal\n"
>> +    "                standard (RFC1701).\n"
>> +    "                use 'src=' to specify source address\n"
>> +    "                use 'dst=' to specify destination address\n"
>> +    "                use 'ipv6=on' to force v6\n"
>> +    "                GRE may use keys to prevent misconfiguration as\n"
>> +    "                well as a weak security measure\n"
>> +    "                use 'rxkey=0x01234' to specify a rxkey\n"
>> +    "                use 'txkey=0x01234' to specify a txkey\n"
>> +    "                use 'sequence=on' to add frame sequence to each 
>> packet\n"
>> +    "                use 'pinsequence=on' to work around broken 
>> sequence handling in peer\n"
>>   #endif
>>       "-netdev 
>> socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
>>       "                configure a network backend to connect to 
>> another network\n"
>> @@ -2394,12 +2411,54 @@ ip l2tp add session tunnel_id 1 name 
>> vmtunnel0 session_id \
>>   ifconfig vmtunnel0 mtu 1500
>>   ifconfig vmtunnel0 up
>>   brctl addif br-lan vmtunnel0
>> +@end example
>> +
>> +Alternatively, it is possible to assign an IP address to vmtunnel0, 
>> which allows
>> +the VM to connect to the host directly without using Linux bridging.
>> +
>> +
>> +@item -netdev 
>> gre,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,ipv6][,sequence][,pinsequence][,txkey=@var{txkey}][,rxkey=@var{rxkey}]
>> +@itemx -net 
>> gre[,vlan=@var{n}][,name=@var{name}],src=@var{srcaddr},dst=@var{dstaddr}[,ipv6][,sequence][,pinsequence][,txkey=@var{txkey}][,rxkey=@var{rxkey}]
>> +Connect VLAN @var{n} to a GRE pseudowire. GRE (RFC1701) is a popular
>> +protocol to transport various data frames between two systems.
>> +We are interested in a specific GRE variety where the transported
>> +frames are Ethernet. This GRE type is usually referred to as GRETAP.
>> +It is present in routers, firewalls, switches and the Linux kernel
>> +(from version 3.3 onwards).
>> +
>> +This transport allows a VM to communicate to another VM, router or 
>> firewall directly.
>> +
>> +@item src=@var{srcaddr}
>> +    source address (mandatory)
>> +@item dst=@var{dstaddr}
>> +    destination address (mandatory)
>> +@item ipv6
>> +    force v6, otherwise defaults to v4.
>> +@item rxkey=@var{rxkey}
>> +@itemx txkey=@var{txkey}
>> +    Keys are a weak form of security in the gre specification.
>> +Their function is mostly to prevent misconfiguration.
>> +@item sequence=on
>> +    Add frame sequence to GRE frames
>> +@item pinsequence=on
>> +    Work around broken sequence handling in peer. This may also help on
>> +networks which have packet reorder.
>> +
>> +For example, to attach a VM running on host 4.3.2.1 via GRETAP to 
>> the bridge br-lan
>> +on the remote Linux host 1.2.3.4:
>> +@example
>> +# Setup tunnel on linux host using raw ip as encapsulation
>> +# on 1.2.3.4
>> +ip link add gt0 type gretap local 1.2.3.4 remote 4.3.2.1
>> +ifconfig gt0 mtu 1500
>> +ifconfig gt0 up
>> +brctl addif br-lan gt0
>>       # on 4.3.2.1
>>   # launch QEMU instance - if your network has reorder or is very 
>> lossy add ,pincounter
>>   -qemu-system-i386 linux.img -net nic -net 
>> l2tpv3,src=4.2.3.1,dst=1.2.3.4,udp,srcport=16384,dstport=16384,rxsession=0xffffffff,txsession=0xffffffff,counter
>> +qemu-system-i386 linux.img -net nic -net gre,src=4.2.3.1,dst=1.2.3.4
>>       @end example
>
>

-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support anton.ivanov
@ 2017-07-19  5:58   ` Jason Wang
  2017-07-19  6:02     ` Anton Ivanov
  2017-07-21 18:50     ` Anton Ivanov
  2017-07-19 14:42   ` Eric Blake
  1 sibling, 2 replies; 23+ messages in thread
From: Jason Wang @ 2017-07-19  5:58 UTC (permalink / raw)
  To: anton.ivanov, qemu-devel



On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>
> This adds raw socket support to the unified socket driver.

Interesting. In fact, I've finished a tpacket backend. Let me post it 
sometime after the hard freeze.

> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>   net/Makefile.objs |   2 +-
>   net/clients.h     |   3 ++
>   net/net.c         |   5 +++
>   net/raw.c         | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   qapi-schema.json  |  25 +++++++++--
>   qemu-options.hx   |  33 +++++++++++++++
>   6 files changed, 186 insertions(+), 5 deletions(-)
>   create mode 100644 net/raw.c
>
> diff --git a/net/Makefile.objs b/net/Makefile.objs
> index 128164e39b..54cf7dd194 100644
> --- a/net/Makefile.objs
> +++ b/net/Makefile.objs
> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>   common-obj-y += socket.o
>   common-obj-y += dump.o
>   common-obj-y += eth.o
> -common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o
> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o raw.o
>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>   common-obj-$(CONFIG_SLIRP) += slirp.o
>   common-obj-$(CONFIG_VDE) += vde.o
> diff --git a/net/clients.h b/net/clients.h
> index 8f8a59aee3..98d8ae59b7 100644
> --- a/net/clients.h
> +++ b/net/clients.h
> @@ -53,6 +53,9 @@ int net_init_l2tpv3(const Netdev *netdev, const char *name,
>   int net_init_gre(const Netdev *netdev, const char *name,
>                       NetClientState *peer, Error **errp);
>   
> +int net_init_raw(const Netdev *netdev, const char *name,
> +                    NetClientState *peer, Error **errp);
> +
>   #ifdef CONFIG_VDE
>   int net_init_vde(const Netdev *netdev, const char *name,
>                    NetClientState *peer, Error **errp);
> diff --git a/net/net.c b/net/net.c
> index b75b6e8154..2d988a120c 100644
> --- a/net/net.c
> +++ b/net/net.c
> @@ -962,6 +962,7 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>   #ifdef CONFIG_UNIFIED
>           [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
>           [NET_CLIENT_DRIVER_GRE] = net_init_gre,
> +        [NET_CLIENT_DRIVER_RAW] = net_init_raw,
>   #endif
>   };
>   
> @@ -1017,6 +1018,10 @@ static int net_client_init1(const void *object, bool is_netdev, Error **errp)
>               legacy.type = NET_CLIENT_DRIVER_GRE;
>               legacy.u.gre = opts->u.gre;
>               break;
> +        case NET_LEGACY_OPTIONS_TYPE_RAW:
> +            legacy.type = NET_CLIENT_DRIVER_RAW;
> +            legacy.u.raw = opts->u.raw;
> +            break;
>           case NET_LEGACY_OPTIONS_TYPE_SOCKET:
>               legacy.type = NET_CLIENT_DRIVER_SOCKET;
>               legacy.u.socket = opts->u.socket;
> diff --git a/net/raw.c b/net/raw.c
> new file mode 100644
> index 0000000000..73e2fd9fe3
> --- /dev/null
> +++ b/net/raw.c
> @@ -0,0 +1,123 @@
> +/*
> + * QEMU System Emulator
> + *
> + * Copyright (c) 2015-2017 Cambridge Greys Limited
> + * Copyright (c) 2003-2008 Fabrice Bellard
> + * Copyright (c) 2012-2014 Cisco Systems
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "qemu/osdep.h"
> +#include <linux/ip.h>
> +#include <netdb.h>
> +#include <sys/ioctl.h>
> +#include <net/if.h>
> +#include "net/net.h"
> + #include <sys/socket.h>
> +#include <linux/if_packet.h>
> +#include <net/ethernet.h>
> +#include "clients.h"
> +#include "qemu-common.h"
> +#include "qemu/error-report.h"
> +#include "qemu/option.h"
> +#include "qemu/sockets.h"
> +#include "qemu/iov.h"
> +#include "qemu/main-loop.h"
> +#include "unified.h"
> +
> +static int noop(void *us, uint8_t *buf)
> +{
> +    return 0;
> +}
> +
> +int net_init_raw(const Netdev *netdev,
> +                    const char *name,
> +                    NetClientState *peer, Error **errp)
> +{
> +
> +    const NetdevRawOptions *raw;
> +    NetUnifiedState *s;
> +    NetClientState *nc;
> +
> +    int fd = -1;
> +    int err;
> +
> +    struct ifreq ifr;
> +    struct sockaddr_ll sock;
> +
> +
> +    nc = qemu_new_unified_net_client(name, peer);
> +
> +    s = DO_UPCAST(NetUnifiedState, nc, nc);
> +
> +    fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
> +    if (fd == -1) {
> +        err = -errno;
> +        error_report("raw_open : raw socket creation failed, errno = %d", -err);
> +        goto outerr;
> +    }
> +
> +
> +    s->form_header = NULL;
> +    s->verify_header = &noop;
> +    s->queue_head = 0;
> +    s->queue_tail = 0;
> +    s->header_mismatch = false;
> +    s->dgram_dst = NULL;
> +    s->dst_size = 0;
> +
> +    assert(netdev->type == NET_CLIENT_DRIVER_RAW);
> +    raw = &netdev->u.raw;
> +
> +    memset(&ifr, 0, sizeof(struct ifreq));
> +    strncpy((char *) &ifr.ifr_name, raw->ifname, sizeof(ifr.ifr_name) - 1);
> +
> +    if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
> +        err = -errno;
> +        error_report("SIOCGIFINDEX, failed to get raw interface index for %s",
> +            raw->ifname);
> +        goto outerr;
> +    }
> +
> +    sock.sll_family = AF_PACKET;
> +    sock.sll_protocol = htons(ETH_P_ALL);
> +    sock.sll_ifindex = ifr.ifr_ifindex;
> +
> +    if (bind(fd, (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
> +        error_report("raw: failed to bind raw socket");
> +        err = -errno;
> +        goto outerr;
> +    }
> +
> +    s->offset = 0;
> +
> +    qemu_net_finalize_unified_init(s, fd);
> +
> +    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
> +             "raw: connected");
> +    return 0;
> +outerr:
> +    qemu_del_net_client(nc);
> +    if (fd >= 0) {
> +        close(fd);
> +    }
> +    return -1;
> +}
> +
> diff --git a/qapi-schema.json b/qapi-schema.json
> index aec303a14e..cde78ce3a1 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -3883,6 +3883,21 @@
>       '*txkey':    'uint32',
>       '*rxkey':    'uint32' } }
>   ##
> +# @NetdevRawOptions:
> +#
> +# Connect the VLAN to an network interface using raw sockets
> +#
> +# @ifname: network interface name
> +#
> +
> +# Since 2.9

2.11.

> +##
> +{ 'struct': 'NetdevRawOptions',
> +  'data': {
> +    'ifname':          'str'
> +} }
> +
> +##
>   # @NetdevVdeOptions:
>   #
>   # Connect the VLAN to a vde switch running on the host.
> @@ -4000,7 +4015,7 @@
>   ##
>   { 'enum': 'NetClientDriver',
>     'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
> -            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre', 'raw' ] }
>   
>   ##
>   # @Netdev:
> @@ -4031,7 +4046,8 @@
>       'hubport':  'NetdevHubPortOptions',
>       'netmap':   'NetdevNetmapOptions',
>       'vhost-user': 'NetdevVhostUserOptions',
> -    'gre':      'NetdevGREOptions' } }
> +    'gre':      'NetdevGREOptions',
> +    'raw':      'NetdevRawOptions' } }
>   
>   ##
>   # @NetLegacy:
> @@ -4062,7 +4078,7 @@
>   ##
>   { 'enum': 'NetLegacyOptionsType',
>     'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
> -           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre', 'raw'] }
>   
>   ##
>   # @NetLegacyOptions:
> @@ -4086,7 +4102,8 @@
>       'bridge':   'NetdevBridgeOptions',
>       'netmap':   'NetdevNetmapOptions',
>       'vhost-user': 'NetdevVhostUserOptions',
> -    'gre':      'NetdevGREOptions' } }
> +    'gre':      'NetdevGREOptions',
> +    'raw':      'NetdevRawOptions' } }
>   
>   ##
>   # @NetFilterDirection:
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 6f8d5cbe21..d9db8b576b 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -1988,6 +1988,13 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>       "                use 'txkey=0x01234' to specify a txkey\n"
>       "                use 'sequence=on' to add frame sequence to each packet\n"
>       "                use 'pinsequence=on' to work around broken sequence handling in peer\n"
> +    "-netdev raw,id=str,ifname=ifname\n"
> +    "                configure a network backend with ID 'str' connected to\n"
> +    "                an Ethernet interface named ifname via raw socket.\n"
> +    "                This backend does not change the interface settings.\n"
> +    "                Most interfaces will require being set into promisc mode,\n"
> +    "                as well having most offloads (TSO, etc) turned off.\n"
> +    "                Some virtual interfaces like tap support only RX.\n"

Note that QEMU supports the vnet header. So is there any reason to turn off 
e.g. TSO here?

>   #endif
>       "-netdev socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
>       "                configure a network backend to connect to another network\n"
> @@ -2463,6 +2470,32 @@ qemu-system-i386 linux.img -net nic -net gre,src=4.2.3.1,dst=1.2.3.4
>   
>   @end example
>   
> +@item -netdev raw,id=@var{id},ifname=@var{ifname}
> +@itemx -net raw[,vlan=@var{n}][,name=@var{name}],ifname=@var{ifname}
> +Connect VLAN @var{n} directly to an Ethernet interface using raw socket.
> +
> +This transport allows a VM to bypass most of the network stack which is
> +extremely useful for tapping.
> +
> +@item ifname=@var{ifname}
> +    interface name (mandatory)
> +
> +@example
> +# set up the interface - put it in promiscuous mode and turn off offloads
> +ifconfig eth0 up
> +ifconfig eth0 promisc
> +
> +/sbin/ethtool -K eth0 gro off
> +/sbin/ethtool -K eth0 tso off
> +/sbin/ethtool -K eth0 gso off
> +/sbin/ethtool -K eth0 tx off

Any reason to turn off tx here?

> +
> +# launch QEMU instance - if your network has reorder or is very lossy add ,pincounter
> +
> +qemu-system-i386 linux.img -net nic -net raw,ifname=eth0

Can we switch to use -netdev here?

Thanks

> +
> +@end example
> +
>   @item -netdev vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>   @itemx -net vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>   Connect VLAN @var{n} to PORT @var{n} of a vde switch running on host and

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-19  5:58   ` Jason Wang
@ 2017-07-19  6:02     ` Anton Ivanov
  2017-07-21 18:50     ` Anton Ivanov
  1 sibling, 0 replies; 23+ messages in thread
From: Anton Ivanov @ 2017-07-19  6:02 UTC (permalink / raw)
  To: Jason Wang, anton.ivanov, qemu-devel

I tried tpacket long ago when I started working on this. It was slower because there was no way to turn off timestamps on the packet ring, and they are quite expensive.

I have not looked at it lately. It will be interesting to compare.

On 19 July 2017 07:58:48 CEST, Jason Wang <jasowang@redhat.com> wrote:
>
>
>On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> This adds raw socket support to the unified socket driver.
>
>Interesting, in fact, I've finished a tpacket backend. Let me post it 
>sometime after hardfreeze.
>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   net/Makefile.objs |   2 +-
>>   net/clients.h     |   3 ++
>>   net/net.c         |   5 +++
>>   net/raw.c         | 123
>++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   qapi-schema.json  |  25 +++++++++--
>>   qemu-options.hx   |  33 +++++++++++++++
>>   6 files changed, 186 insertions(+), 5 deletions(-)
>>   create mode 100644 net/raw.c
>>
>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>> index 128164e39b..54cf7dd194 100644
>> --- a/net/Makefile.objs
>> +++ b/net/Makefile.objs
>> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>>   common-obj-y += socket.o
>>   common-obj-y += dump.o
>>   common-obj-y += eth.o
>> -common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o
>> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o gre.o raw.o
>>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>>   common-obj-$(CONFIG_SLIRP) += slirp.o
>>   common-obj-$(CONFIG_VDE) += vde.o
>> diff --git a/net/clients.h b/net/clients.h
>> index 8f8a59aee3..98d8ae59b7 100644
>> --- a/net/clients.h
>> +++ b/net/clients.h
>> @@ -53,6 +53,9 @@ int net_init_l2tpv3(const Netdev *netdev, const
>char *name,
>>   int net_init_gre(const Netdev *netdev, const char *name,
>>                       NetClientState *peer, Error **errp);
>>   
>> +int net_init_raw(const Netdev *netdev, const char *name,
>> +                    NetClientState *peer, Error **errp);
>> +
>>   #ifdef CONFIG_VDE
>>   int net_init_vde(const Netdev *netdev, const char *name,
>>                    NetClientState *peer, Error **errp);
>> diff --git a/net/net.c b/net/net.c
>> index b75b6e8154..2d988a120c 100644
>> --- a/net/net.c
>> +++ b/net/net.c
>> @@ -962,6 +962,7 @@ static int (* const
>net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>>   #ifdef CONFIG_UNIFIED
>>           [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
>>           [NET_CLIENT_DRIVER_GRE] = net_init_gre,
>> +        [NET_CLIENT_DRIVER_RAW] = net_init_raw,
>>   #endif
>>   };
>>   
>> @@ -1017,6 +1018,10 @@ static int net_client_init1(const void
>*object, bool is_netdev, Error **errp)
>>               legacy.type = NET_CLIENT_DRIVER_GRE;
>>               legacy.u.gre = opts->u.gre;
>>               break;
>> +        case NET_LEGACY_OPTIONS_TYPE_RAW:
>> +            legacy.type = NET_CLIENT_DRIVER_RAW;
>> +            legacy.u.raw = opts->u.raw;
>> +            break;
>>           case NET_LEGACY_OPTIONS_TYPE_SOCKET:
>>               legacy.type = NET_CLIENT_DRIVER_SOCKET;
>>               legacy.u.socket = opts->u.socket;
>> diff --git a/net/raw.c b/net/raw.c
>> new file mode 100644
>> index 0000000000..73e2fd9fe3
>> --- /dev/null
>> +++ b/net/raw.c
>> @@ -0,0 +1,123 @@
>> +/*
>> + * QEMU System Emulator
>> + *
>> + * Copyright (c) 2015-2017 Cambridge Greys Limited
>> + * Copyright (c) 2003-2008 Fabrice Bellard
>> + * Copyright (c) 2012-2014 Cisco Systems
>> + *
>> + * Permission is hereby granted, free of charge, to any person
>obtaining a copy
>> + * of this software and associated documentation files (the
>"Software"), to deal
>> + * in the Software without restriction, including without limitation
>the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense,
>and/or sell
>> + * copies of the Software, and to permit persons to whom the
>Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be
>included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
>SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
>OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include <linux/ip.h>
>> +#include <netdb.h>
>> +#include <sys/ioctl.h>
>> +#include <net/if.h>
>> +#include "net/net.h"
>> + #include <sys/socket.h>
>> +#include <linux/if_packet.h>
>> +#include <net/ethernet.h>
>> +#include "clients.h"
>> +#include "qemu-common.h"
>> +#include "qemu/error-report.h"
>> +#include "qemu/option.h"
>> +#include "qemu/sockets.h"
>> +#include "qemu/iov.h"
>> +#include "qemu/main-loop.h"
>> +#include "unified.h"
>> +
>> +static int noop(void *us, uint8_t *buf)
>> +{
>> +    return 0;
>> +}
>> +
>> +int net_init_raw(const Netdev *netdev,
>> +                    const char *name,
>> +                    NetClientState *peer, Error **errp)
>> +{
>> +
>> +    const NetdevRawOptions *raw;
>> +    NetUnifiedState *s;
>> +    NetClientState *nc;
>> +
>> +    int fd = -1;
>> +    int err;
>> +
>> +    struct ifreq ifr;
>> +    struct sockaddr_ll sock;
>> +
>> +
>> +    nc = qemu_new_unified_net_client(name, peer);
>> +
>> +    s = DO_UPCAST(NetUnifiedState, nc, nc);
>> +
>> +    fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
>> +    if (fd == -1) {
>> +        err = -errno;
>> +        error_report("raw_open : raw socket creation failed, errno =
>%d", -err);
>> +        goto outerr;
>> +    }
>> +
>> +
>> +    s->form_header = NULL;
>> +    s->verify_header = &noop;
>> +    s->queue_head = 0;
>> +    s->queue_tail = 0;
>> +    s->header_mismatch = false;
>> +    s->dgram_dst = NULL;
>> +    s->dst_size = 0;
>> +
>> +    assert(netdev->type == NET_CLIENT_DRIVER_RAW);
>> +    raw = &netdev->u.raw;
>> +
>> +    memset(&ifr, 0, sizeof(struct ifreq));
>> +    strncpy((char *) &ifr.ifr_name, raw->ifname,
>sizeof(ifr.ifr_name) - 1);
>> +
>> +    if (ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
>> +        err = -errno;
>> +        error_report("SIOCGIFINDEX, failed to get raw interface
>index for %s",
>> +            raw->ifname);
>> +        goto outerr;
>> +    }
>> +
>> +    sock.sll_family = AF_PACKET;
>> +    sock.sll_protocol = htons(ETH_P_ALL);
>> +    sock.sll_ifindex = ifr.ifr_ifindex;
>> +
>> +    if (bind(fd, (struct sockaddr *) &sock, sizeof(struct
>sockaddr_ll)) < 0) {
>> +        error_report("raw: failed to bind raw socket");
>> +        err = -errno;
>> +        goto outerr;
>> +    }
>> +
>> +    s->offset = 0;
>> +
>> +    qemu_net_finalize_unified_init(s, fd);
>> +
>> +    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
>> +             "raw: connected");
>> +    return 0;
>> +outerr:
>> +    qemu_del_net_client(nc);
>> +    if (fd >= 0) {
>> +        close(fd);
>> +    }
>> +    return -1;
>> +}
>> +
>> diff --git a/qapi-schema.json b/qapi-schema.json
>> index aec303a14e..cde78ce3a1 100644
>> --- a/qapi-schema.json
>> +++ b/qapi-schema.json
>> @@ -3883,6 +3883,21 @@
>>       '*txkey':    'uint32',
>>       '*rxkey':    'uint32' } }
>>   ##
>> +# @NetdevRawOptions:
>> +#
>> +# Connect the VLAN to an network interface using raw sockets
>> +#
>> +# @ifname: network interface name
>> +#
>> +
>> +# Since 2.9
>
>2.11.
>
>> +##
>> +{ 'struct': 'NetdevRawOptions',
>> +  'data': {
>> +    'ifname':          'str'
>> +} }
>> +
>> +##
>>   # @NetdevVdeOptions:
>>   #
>>   # Connect the VLAN to a vde switch running on the host.
>> @@ -4000,7 +4015,7 @@
>>   ##
>>   { 'enum': 'NetClientDriver',
>>     'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket',
>'vde', 'dump',
>> -            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
>> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre',
>'raw' ] }
>>   
>>   ##
>>   # @Netdev:
>> @@ -4031,7 +4046,8 @@
>>       'hubport':  'NetdevHubPortOptions',
>>       'netmap':   'NetdevNetmapOptions',
>>       'vhost-user': 'NetdevVhostUserOptions',
>> -    'gre':      'NetdevGREOptions' } }
>> +    'gre':      'NetdevGREOptions',
>> +    'raw':      'NetdevRawOptions' } }
>>   
>>   ##
>>   # @NetLegacy:
>> @@ -4062,7 +4078,7 @@
>>   ##
>>   { 'enum': 'NetLegacyOptionsType',
>>     'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
>> -           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
>> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre', 'raw'] }
>>   
>>   ##
>>   # @NetLegacyOptions:
>> @@ -4086,7 +4102,8 @@
>>       'bridge':   'NetdevBridgeOptions',
>>       'netmap':   'NetdevNetmapOptions',
>>       'vhost-user': 'NetdevVhostUserOptions',
>> -    'gre':      'NetdevGREOptions' } }
>> +    'gre':      'NetdevGREOptions',
>> +    'raw':      'NetdevRawOptions' } }
>>   
>>   ##
>>   # @NetFilterDirection:
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 6f8d5cbe21..d9db8b576b 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -1988,6 +1988,13 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
>>       "                use 'txkey=0x01234' to specify a txkey\n"
>>       "                use 'sequence=on' to add frame sequence to
>each packet\n"
>>       "                use 'pinsequence=on' to work around broken
>sequence handling in peer\n"
>> +    "-netdev raw,id=str,ifname=ifname\n"
>> +    "                configure a network backend with ID 'str'
>connected to\n"
>> +    "                an Ethernet interface named ifname via raw
>socket.\n"
>> +    "                This backend does not change the interface
>settings.\n"
>> +    "                Most interfaces will require being set into
>promisc mode,\n"
>> +    "                as well having most offloads (TSO, etc) turned
>off.\n"
>> +    "                Some virtual interfaces like tap support only
>RX.\n"
>
>Pay attention that qemu supports vnet header. So any reason to turn off
>
>e.g TSO here?
>
>>   #endif
>>       "-netdev
>socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
>>       "                configure a network backend to connect to
>another network\n"
>> @@ -2463,6 +2470,32 @@ qemu-system-i386 linux.img -net nic -net
>gre,src=4.2.3.1,dst=1.2.3.4
>>   
>>   @end example
>>   
>> +@item -netdev raw,id=@var{id},ifname=@var{ifname}
>> +@itemx -net raw[,vlan=@var{n}][,name=@var{name}],ifname=@var{ifname}
>> +Connect VLAN @var{n} directly to an Ethernet interface using raw
>socket.
>> +
>> +This transport allows a VM to bypass most of the network stack which
>is
>> +extremely useful for tapping.
>> +
>> +@item ifname=@var{ifname}
>> +    interface name (mandatory)
>> +
>> +@example
>> +# set up the interface - put it in promiscuous mode and turn off
>offloads
>> +ifconfig eth0 up
>> +ifconfig eth0 promisc
>> +
>> +/sbin/ethtool -K eth0 gro off
>> +/sbin/ethtool -K eth0 tso off
>> +/sbin/ethtool -K eth0 gso off
>> +/sbin/ethtool -K eth0 tx off
>
>Any reason to turn off tx here?
>
>> +
>> +# launch QEMU instance - if your network has reorder or is very
>lossy add ,pincounter
>> +
>> +qemu-system-i386 linux.img -net nic -net raw,ifname=eth0
>
>Can we switch to use -netdev here?
>
>Thanks
>
>> +
>> +@end example
>> +
>>   @item -netdev
>vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>>   @itemx -net
>vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}]
>[,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>>   Connect VLAN @var{n} to PORT @var{n} of a vde switch running on
>host and

-- 
Sent from my Android device with K-9 Mail. Please excuse my brevity.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-19  5:48     ` Anton Ivanov
@ 2017-07-19  6:07       ` Jason Wang
  2017-07-19  6:48         ` Anton Ivanov
  0 siblings, 1 reply; 23+ messages in thread
From: Jason Wang @ 2017-07-19  6:07 UTC (permalink / raw)
  To: Anton Ivanov, qemu-devel



On 2017年07月19日 13:48, Anton Ivanov wrote:
>
>
> On 19/07/17 06:39, Jason Wang wrote:
>>
>>
>> On 2017年07月19日 01:08, anton.ivanov@cambridgegreys.com wrote:
>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>
>>> 1. Creates a common backend for socket transports using
>>> recvmmsg().
>>> 2. Migrates L2TPv3 to the new backend
>>
>> It would be better if you could further split out 2 from this patch.
>>
>>>
>>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>> ---
>>>   configure         |  10 +-
>>>   net/Makefile.objs |   2 +-
>>>   net/l2tpv3.c      | 531 
>>> +++++++++---------------------------------------------
>>>   net/net.c         |   4 +-
>>>   net/unified.c     | 406 +++++++++++++++++++++++++++++++++++++++++
>>>   net/unified.h     | 118 ++++++++++++
>>>   6 files changed, 613 insertions(+), 458 deletions(-)
>>>   create mode 100644 net/unified.c
>>>   create mode 100644 net/unified.h
>>>
>>> diff --git a/configure b/configure
>>> index a3f0522e8f..99a60b723c 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -1862,7 +1862,7 @@ if ! compile_object -Werror ; then
>>>   fi
>>>     ##########################################
>>> -# L2TPV3 probe
>>> +# UNIFIED probe
>>>     cat > $TMPC <<EOF
>>>   #include <sys/socket.h>
>>> @@ -1870,9 +1870,9 @@ cat > $TMPC <<EOF
>>>   int main(void) { return sizeof(struct mmsghdr); }
>>>   EOF
>>>   if compile_prog "" "" ; then
>>> -  l2tpv3=yes
>>> +  unified=yes
>>>   else
>>> -  l2tpv3=no
>>> +  unified=no
>>>   fi
>>>     ##########################################
>>> @@ -5458,8 +5458,8 @@ fi
>>>   if test "$netmap" = "yes" ; then
>>>     echo "CONFIG_NETMAP=y" >> $config_host_mak
>>>   fi
>>> -if test "$l2tpv3" = "yes" ; then
>>> -  echo "CONFIG_L2TPV3=y" >> $config_host_mak
>>> +if test "$unified" = "yes" ; then
>>> +  echo "CONFIG_UNIFIED=y" >> $config_host_mak
>>>   fi
>>
>> Could we keep l2tpv3 option?
>
> The l2tpv3 test is actually a test for recvmmsg. If you can do one 
> recvmmsg transport you can do all of them.

Yes, but I wonder whether or not the check for recvmmsg is too simple. 
We probably want something like what AF_VSOCK did: test the support of 
each transport through socket().

>
>>
>>>   if test "$cap_ng" = "yes" ; then
>>>     echo "CONFIG_LIBCAP=y" >> $config_host_mak
>>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>>> index 67ba5e26fb..8026ad778a 100644
>>> --- a/net/Makefile.objs
>>> +++ b/net/Makefile.objs
>>> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>>>   common-obj-y += socket.o
>>>   common-obj-y += dump.o
>>>   common-obj-y += eth.o
>>> -common-obj-$(CONFIG_L2TPV3) += l2tpv3.o
>>> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
>>>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>>>   common-obj-$(CONFIG_SLIRP) += slirp.o
>>>   common-obj-$(CONFIG_VDE) += vde.o

[...]

>>>>   -    s = DO_UPCAST(NetL2TPV3State, nc, nc);
>>>> +    s->params = p;
>>>>   +    s->form_header = &l2tpv3_form_header;
>>>> +    s->verify_header = &l2tpv3_verify_header;
>>>>       s->queue_head = 0;
>>>>       s->queue_tail = 0;
>>>>       s->header_mismatch = false;
>>>
>>> Why not move all above into qemu_new_unified_net()?
>
> Only queue head/tail assignment can move.
>
> raw which uses same backend does not use header_mismatch. Form/verify 
> header are different for each sub-transport. F.e. for gre you need the 
> gre one, for raw you need the raw one, etc.

Right, I mean pass function pointer to qemu_new_unified_net().

>
>>
>>> @@ -549,9 +188,9 @@ int net_init_l2tpv3(const Netdev *netdev,
>>>       l2tpv3 = &netdev->u.l2tpv3;
>>>         if (l2tpv3->has_ipv6 && l2tpv3->ipv6) {
>>> -        s->ipv6 = l2tpv3->ipv6;
>>> +        p->ipv6 = l2tpv3->ipv6;
>>>       } else {
>>> -        s->ipv6 = false;
>>> +        p->ipv6 = false;

[...]

>>>   diff --git a/net/unified.c b/net/unified.c
>>
>> Not a native speaker, but I think we need a better name here e.g udst 
>> which is short for Unified Datagram Socket Transport?
>
> I am not a native speaker either :)
>
> I am OK - let's call it udst as this is more descriptive and this 
> clearly delineates that you cannot
> migrate tcp/socket to it.

Ok.

>
>>
>>>

[...]

>>> +
>>> +static ssize_t net_unified_receive_dgram_iov(NetClientState *nc,
>>> +                    const struct iovec *iov,
>>> +                    int iovcnt)
>>> +{
>>> +    NetUnifiedState *s = DO_UPCAST(NetUnifiedState, nc, nc);
>>> +
>>> +    struct msghdr message;
>>> +    int ret;
>>> +
>>> +    if (iovcnt > MAX_UNIFIED_IOVCNT - 1) {
>>> +        error_report(
>>> +            "iovec too long %d > %d, change unified.h",
>>> +            iovcnt, MAX_UNIFIED_IOVCNT
>>> +        );
>>> +        return -1;
>>> +    }
>>> +    if (s->offset > 0) {
>>
>> net_l2tpv3_receive_dgram_iov() does not have this check. I guess it 
>> s->offset=0 will be used by other transport. Maybe it's better to 
>> delay this change until is has a real user or add a comment here.
>
> The real user is in patch No 2. Raw.

Ok.

Thanks.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-19  6:07       ` Jason Wang
@ 2017-07-19  6:48         ` Anton Ivanov
  0 siblings, 0 replies; 23+ messages in thread
From: Anton Ivanov @ 2017-07-19  6:48 UTC (permalink / raw)
  To: Jason Wang, qemu-devel

[snip]

>>> Could we keep l2tpv3 option?
>>
>> The l2tpv3 test is actually a test for recvmmsg. If you can do one 
>> recvmmsg transport you can do all of them.
>
> Yes, but I wonder whether or not the check for recvmmsg is too simple. 
> We probably want something like what AV_VSOCK did, test the support of 
> each transport through socket().

We may need this in the future.

I do not think we need it for the first 3 transports lined up for this - 
l2tpv3, gre and raw.  The only reqs are recvmmsg (and sendmmsg in the 
future) and raw sockets. They are very simple :)

So unless we try to fold all of raw initialization (on/off for offloads, 
etc) into the driver we should not need more tests for now. We will need 
them once we add more transports.

By the way - on raw, in addition to cost of timestamps, recvmmsg and 
especially sendmmsg in most cases will have lower number of copies 
compared to tpacket. IMHO there is still a very important use case for 
tpacket, but it will require hw/ work - vm used as a forensic tap. We 
will need to emulate one of the drivers which convey the timestamp so 
that a pcap/tpacket implementation in the VM can get a precise timestamp 
"at real capture".

>
>>
>>>
>>>>   if test "$cap_ng" = "yes" ; then
>>>>     echo "CONFIG_LIBCAP=y" >> $config_host_mak
>>>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>>>> index 67ba5e26fb..8026ad778a 100644
>>>> --- a/net/Makefile.objs
>>>> +++ b/net/Makefile.objs
>>>> @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o
>>>>   common-obj-y += socket.o
>>>>   common-obj-y += dump.o
>>>>   common-obj-y += eth.o
>>>> -common-obj-$(CONFIG_L2TPV3) += l2tpv3.o
>>>> +common-obj-$(CONFIG_UNIFIED) += l2tpv3.o unified.o
>>>>   common-obj-$(CONFIG_POSIX) += vhost-user.o
>>>>   common-obj-$(CONFIG_SLIRP) += slirp.o
>>>>   common-obj-$(CONFIG_VDE) += vde.o
>
> [...]
>
>>>>>   -    s = DO_UPCAST(NetL2TPV3State, nc, nc);
>>>>> +    s->params = p;
>>>>>   +    s->form_header = &l2tpv3_form_header;
>>>>> +    s->verify_header = &l2tpv3_verify_header;
>>>>>       s->queue_head = 0;
>>>>>       s->queue_tail = 0;
>>>>>       s->header_mismatch = false;
>>>>
>>>> Why not move all above into qemu_new_unified_net()?
>>
>> Only queue head/tail assignment can move.
>>
>> raw which uses same backend does not use header_mismatch. Form/verify 
>> header are different for each sub-transport. F.e. for gre you need 
>> the gre one, for raw you need the raw one, etc.
>
> Right, I mean pass function pointer to qemu_new_unified_net().

Ack - will do in the next revision.


[snip]

-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support anton.ivanov
  2017-07-19  5:48   ` Jason Wang
@ 2017-07-19 14:40   ` Eric Blake
  2017-07-19 14:46     ` Anton Ivanov
  2017-07-19 17:32     ` Anton Ivanov
  1 sibling, 2 replies; 23+ messages in thread
From: Eric Blake @ 2017-07-19 14:40 UTC (permalink / raw)
  To: anton.ivanov, qemu-devel; +Cc: jasowang

[-- Attachment #1: Type: text/plain, Size: 3278 bytes --]

On 07/18/2017 12:08 PM, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> This adds GRETAP support to the unified socket driver.
> 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---
>  net/Makefile.objs |   2 +-
>  net/clients.h     |   4 +
>  net/gre.c         | 313 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/net.c         |   5 +
>  qapi-schema.json  |  46 +++++++-
>  qemu-options.hx   |  63 ++++++++++-
>  6 files changed, 425 insertions(+), 8 deletions(-)
>  create mode 100644 net/gre.c
> 

Just an interface review:

> +++ b/qapi-schema.json
> @@ -3847,7 +3847,41 @@
>      'txsession':    'uint32',
>      '*rxsession':   'uint32',
>      '*offset':      'uint32' } }
> -
> +##
> +# @NetdevGREOptions:
> +#
> +# Connect the VLAN to Ethernet over Ethernet over GRE (GRETAP) tunnel
> +#
> +# @src: source address
> +#
> +# @dst: destination address
> +#
> +# @ipv6: force the use of ipv6

This doesn't quite match what we do with other sockets (where we have
both ipv4 and ipv6 booleans to allow IPv4-only, IPv6-only, or both).  Is
this something where we can reuse InetSocketAddress instead of inventing
yet another way of doing things?

Then again, it does match what NetdevL2TPv3Options did :(

> +#
> +# @sequence: have sequence counter
> +#
> +# @pinsequence: pin sequence counter to zero -
> +#              workaround for buggy implementations or
> +#              networks with packet reorder
> +#
> +# @txkey: 32 bit transmit key
> +#
> +# @rxkey: 32 bit receive key

Worth listing what the defaults are for these optional fields when not
present?

> +#
> +# Note - gre checksums are not supported at present
> +#
> +#
> +# Since 2.9

You've missed 2.9 by a long shot.  You've also missed 2.10 softfreeze
for a new feature, so this should read since 2.11.

> @@ -3966,7 +4000,7 @@
>  ##
>  { 'enum': 'NetClientDriver',
>    'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
> -            'bridge', 'hubport', 'netmap', 'vhost-user' ] }
> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }

Worth adding a comment that 'gre' is since 2.11 (look for other enums
that have had additions after the initial release of the enum, for an
example of the preferred format).

>  
>  ##
>  # @Netdev:
> @@ -3996,7 +4030,8 @@
>      'bridge':   'NetdevBridgeOptions',
>      'hubport':  'NetdevHubPortOptions',
>      'netmap':   'NetdevNetmapOptions',
> -    'vhost-user': 'NetdevVhostUserOptions' } }
> +    'vhost-user': 'NetdevVhostUserOptions',
> +    'gre':      'NetdevGREOptions' } }

Okay.

>  
>  ##
>  # @NetLegacy:
> @@ -4027,7 +4062,7 @@
>  ##
>  { 'enum': 'NetLegacyOptionsType',
>    'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
> -           'dump', 'bridge', 'netmap', 'vhost-user'] }
> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }

Not okay.  NetLegacy should never grow again (that's the whole point of
it being legacy - we're trying to phase it out).

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 619 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-18 17:08 ` [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support anton.ivanov
  2017-07-19  5:58   ` Jason Wang
@ 2017-07-19 14:42   ` Eric Blake
  1 sibling, 0 replies; 23+ messages in thread
From: Eric Blake @ 2017-07-19 14:42 UTC (permalink / raw)
  To: anton.ivanov, qemu-devel; +Cc: jasowang

[-- Attachment #1: Type: text/plain, Size: 1661 bytes --]

On 07/18/2017 12:08 PM, anton.ivanov@cambridgegreys.com wrote:
> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> 
> This adds raw socket support to the unified socket driver.
> 
> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
> ---

Interface review only.


> +++ b/qapi-schema.json
> @@ -3883,6 +3883,21 @@
>      '*txkey':    'uint32',
>      '*rxkey':    'uint32' } }
>  ##
> +# @NetdevRawOptions:
> +#
> +# Connect the VLAN to an network interface using raw sockets
> +#
> +# @ifname: network interface name
> +#
> +

Missing #

> +# Since 2.9

Same comments as on 2/3 - you've got the wrong version.

> +##
> +{ 'struct': 'NetdevRawOptions',
> +  'data': {
> +    'ifname':          'str'
> +} }
> +
> +##
>  # @NetdevVdeOptions:
>  #
>  # Connect the VLAN to a vde switch running on the host.
> @@ -4000,7 +4015,7 @@
>  ##
>  { 'enum': 'NetClientDriver',
>    'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
> -            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre', 'raw' ] }

Missing comment on the 2.11 addition

> @@ -4062,7 +4078,7 @@
>  ##
>  { 'enum': 'NetLegacyOptionsType',
>    'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
> -           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre', 'raw'] }

Again, this should NOT be added to NetLegacy.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 619 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-19 14:40   ` Eric Blake
@ 2017-07-19 14:46     ` Anton Ivanov
  2017-07-19 17:32     ` Anton Ivanov
  1 sibling, 0 replies; 23+ messages in thread
From: Anton Ivanov @ 2017-07-19 14:46 UTC (permalink / raw)
  To: Eric Blake, qemu-devel; +Cc: jasowang



On 19/07/17 15:40, Eric Blake wrote:
> On 07/18/2017 12:08 PM, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> This adds GRETAP support to the unified socket driver.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   net/Makefile.objs |   2 +-
>>   net/clients.h     |   4 +
>>   net/gre.c         | 313 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   net/net.c         |   5 +
>>   qapi-schema.json  |  46 +++++++-
>>   qemu-options.hx   |  63 ++++++++++-
>>   6 files changed, 425 insertions(+), 8 deletions(-)
>>   create mode 100644 net/gre.c
>>
> Just an interface review:
>
>> +++ b/qapi-schema.json
>> @@ -3847,7 +3847,41 @@
>>       'txsession':    'uint32',
>>       '*rxsession':   'uint32',
>>       '*offset':      'uint32' } }
>> -
>> +##
>> +# @NetdevGREOptions:
>> +#
>> +# Connect the VLAN to Ethernet over Ethernet over GRE (GRETAP) tunnel
>> +#
>> +# @src: source address
>> +#
>> +# @dst: destination address
>> +#
>> +# @ipv6: force the use of ipv6
> This doesn't quite match what we do with other sockets (where we have
> both ipv4 and ipv6 booleans to allow IPv4-only, IPv6-only, or both).  Is
> this something where we can reuse InetSocketAddress instead of inventing
> yet another way of doing things?

I can try that in the next version.

>
> Then again, it does match what NetdevL2TPv3Options did :(
>
>> +#
>> +# @sequence: have sequence counter
>> +#
>> +# @pinsequence: pin sequence counter to zero -
>> +#              workaround for buggy implementations or
>> +#              networks with packet reorder
>> +#
>> +# @txkey: 32 bit transmit key
>> +#
>> +# @rxkey: 32 bit receive key
> Worth listing what the defaults are for these optional fields when not
> present?

GRE header is incremental.

If there is no tx/rxkey the header is shorter and these are not present 
in the header.

If a key is specified the header is longer to accommodate the key.

They are optional - similar to the l2tpv3 cookie.

>
>> +#
>> +# Note - gre checksums are not supported at present
>> +#
>> +#
>> +# Since 2.9
> You've missed 2.9 by a long shot.  You've also missed 2.10 softfreeze
> for a new feature, so this should read since 2.11.

The patch has been sitting in my outgoing queue for reasons outside my 
control for months. I apologise for that and I am re-aligning all of 
them for 2.11

>
>> @@ -3966,7 +4000,7 @@
>>   ##
>>   { 'enum': 'NetClientDriver',
>>     'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde', 'dump',
>> -            'bridge', 'hubport', 'netmap', 'vhost-user' ] }
>> +            'bridge', 'hubport', 'netmap', 'vhost-user', 'gre' ] }
> Worth adding a comment that 'gre' is since 2.11 (look for other enums
> that have had additions after the initial release of the enum, for an
> example of the preferred format).

Ack.

>
>>   
>>   ##
>>   # @Netdev:
>> @@ -3996,7 +4030,8 @@
>>       'bridge':   'NetdevBridgeOptions',
>>       'hubport':  'NetdevHubPortOptions',
>>       'netmap':   'NetdevNetmapOptions',
>> -    'vhost-user': 'NetdevVhostUserOptions' } }
>> +    'vhost-user': 'NetdevVhostUserOptions',
>> +    'gre':      'NetdevGREOptions' } }
> Okay.
>
>>   
>>   ##
>>   # @NetLegacy:
>> @@ -4027,7 +4062,7 @@
>>   ##
>>   { 'enum': 'NetLegacyOptionsType',
>>     'data': ['none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
>> -           'dump', 'bridge', 'netmap', 'vhost-user'] }
>> +           'dump', 'bridge', 'netmap', 'vhost-user', 'gre'] }
> Not okay.  NetLegacy should never grow again (that's the whole point of
> it being legacy - we're trying to phase it out).

OK, I will revise the patch before the next submission.

A.


-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-19 14:40   ` Eric Blake
  2017-07-19 14:46     ` Anton Ivanov
@ 2017-07-19 17:32     ` Anton Ivanov
  2017-07-21 19:14       ` Eric Blake
  1 sibling, 1 reply; 23+ messages in thread
From: Anton Ivanov @ 2017-07-19 17:32 UTC (permalink / raw)
  To: Eric Blake, qemu-devel; +Cc: jasowang



On 19/07/17 15:40, Eric Blake wrote:
> On 07/18/2017 12:08 PM, anton.ivanov@cambridgegreys.com wrote:
>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>
>> This adds GRETAP support to the unified socket driver.
>>
>> Signed-off-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>> ---
>>   net/Makefile.objs |   2 +-
>>   net/clients.h     |   4 +
>>   net/gre.c         | 313 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   net/net.c         |   5 +
>>   qapi-schema.json  |  46 +++++++-
>>   qemu-options.hx   |  63 ++++++++++-
>>   6 files changed, 425 insertions(+), 8 deletions(-)
>>   create mode 100644 net/gre.c
>>
> Just an interface review:
>
>> +++ b/qapi-schema.json
>> @@ -3847,7 +3847,41 @@
>>       'txsession':    'uint32',
>>       '*rxsession':   'uint32',
>>       '*offset':      'uint32' } }
>> -
>> +##
>> +# @NetdevGREOptions:
>> +#
>> +# Connect the VLAN to Ethernet over Ethernet over GRE (GRETAP) tunnel
>> +#
>> +# @src: source address
>> +#
>> +# @dst: destination address
>> +#
>> +# @ipv6: force the use of ipv6
> This doesn't quite match what we do with other sockets (where we have
> both ipv4 and ipv6 booleans to allow IPv4-only, IPv6-only, or both).  Is
> this something where we can reuse InetSocketAddress instead of inventing
> yet another way of doing things?
>
> Then again, it does match what NetdevL2TPv3Options did :(

I just reviewed this again.

I do not think  we can today. This is the declaration:

##
{ 'struct': 'InetSocketAddressBase',
   'data': {
     'host': 'str',
     'port': 'str' } }

##

If I read this right port is mandatory, correct?

We may be able to do it if the port portion of InetSocketAddress becomes 
optional. There is no such thing as port for the protocols which use the 
raw families.

I now recall it being the reason why L2TPv3 does it this way.

I am addressing the rest of the comments in the meantime.

A.

[snip]

-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-19  5:39   ` Jason Wang
  2017-07-19  5:48     ` Anton Ivanov
@ 2017-07-21 17:50     ` Anton Ivanov
  2017-07-24  3:51       ` Jason Wang
  1 sibling, 1 reply; 23+ messages in thread
From: Anton Ivanov @ 2017-07-21 17:50 UTC (permalink / raw)
  To: Jason Wang, qemu-devel

[snip]

>> +    NetUnifiedState *s = (NetUnifiedState *) us;
>> +    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
>
> How about embedding NetUnifiedState into this structure and keep using 
> NetL2TPV3State? Then:
>
> -  's' could be kept and lots of lines of changes could be saved here 
> and l2tpv3_verify_header()
> -  each transport could have their own type instead of using 
> NET_CLIENT_DRIVER_L2TPV3

That means each of them having their own read/write functions in each 
transport, destroy functions, etc.

I am trying to achieve exactly the opposite which across all transports 
should save more code. There should be nothing in a transport which 
leverages the common datagram processing backend except:

1. Init and parse arguments
2. Form Header
3. Verify Header

All the rest can be common for a large family of datagram based 
transports - L2TPv3, GRE, RAW (both full interface and just pulling a 
specific vlan out of it), etc.

It is trivial to do that for fixed size headers (as in the current 
patchset family). It is a bit more difficult to do that for variable 
headers, but still datagram (GUE, Geneve, etc).

These may also add 4 - I/O to control plane, but it remains to be seen 
if that is needed.

This also makes any improvements to the backend - f.e. switching from 
send() to sendmmsg() automatically available for all transports.

What cannot be done is to shoehorn stream-based transports into this. I believe we 
have only one of those - the original socket.c in tcp mode and we can 
leave it to stay that way and switch only the datagram mode to a better 
backend.

I am going through the other comments in the meantime to see if I missed 
something else and fixing the omissions.

A.

[snip]

-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-19  5:58   ` Jason Wang
  2017-07-19  6:02     ` Anton Ivanov
@ 2017-07-21 18:50     ` Anton Ivanov
  2017-07-24  4:03       ` Jason Wang
  1 sibling, 1 reply; 23+ messages in thread
From: Anton Ivanov @ 2017-07-21 18:50 UTC (permalink / raw)
  To: Jason Wang, qemu-devel


[snip]

>> +    "-netdev raw,id=str,ifname=ifname\n"
>> +    "                configure a network backend with ID 'str' 
>> connected to\n"
>> +    "                an Ethernet interface named ifname via raw 
>> socket.\n"
>> +    "                This backend does not change the interface 
>> settings.\n"
>> +    "                Most interfaces will require being set into 
>> promisc mode,\n"
>> +    "                as well having most offloads (TSO, etc) turned 
>> off.\n"
>> +    "                Some virtual interfaces like tap support only 
>> RX.\n"
>
> Pay attention that qemu supports vnet header. So any reason to turn 
> off e.g TSO here?

I am not aware of any means to make extra info like checksums, etc. show 
up on raw socket read.

If you know a way to make them show up, this is worth investigating.

>
>>   #endif
>>       "-netdev 
>> socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
>>       "                configure a network backend to connect to 
>> another network\n"
>> @@ -2463,6 +2470,32 @@ qemu-system-i386 linux.img -net nic -net 
>> gre,src=4.2.3.1,dst=1.2.3.4
>>     @end example
>>   +@item -netdev raw,id=@var{id},ifname=@var{ifname}
>> +@itemx -net raw[,vlan=@var{n}][,name=@var{name}],ifname=@var{ifname}
>> +Connect VLAN @var{n} directly to an Ethernet interface using raw 
>> socket.
>> +
>> +This transport allows a VM to bypass most of the network stack which is
>> +extremely useful for tapping.
>> +
>> +@item ifname=@var{ifname}
>> +    interface name (mandatory)
>> +
>> +@example
>> +# set up the interface - put it in promiscuous mode and turn off 
>> offloads
>> +ifconfig eth0 up
>> +ifconfig eth0 promisc
>> +
>> +/sbin/ethtool -K eth0 gro off
>> +/sbin/ethtool -K eth0 tso off
>> +/sbin/ethtool -K eth0 gso off
>> +/sbin/ethtool -K eth0 tx off
>
> Any reason to turn off tx here?

Yes - we already have it computed and we have written it as is as a 
whole packet. You do not want it
re-computed as at least some adapters do silly things if you start 
writing raw and the checksum already exists.

Once again, this is one of the pros/cons of using tpacket vs recv/send 
(with or without mmsg) on a raw socket.

recvm(m)sg/sendm(m)sg are brute force as far as offloads are concerned, but 
things like scatter/gather work correctly so there are few copies.

Compared to that, tpacket will allow you some access to checksumming 
which you can map onto checksum offload in a vNIC. As a payback for this 
you end up copying in more cases than for send/recvmmsg and you pay 
penalty for timestamping if you do not have a hardware timestamp source 
in the NIC.

The other issue I always had with tpacket is that you "see" your own 
packets so you have to manage a  RX side BPF filter which removes those 
so you do not see your own packets. That can get quite interesting if 
you have a lot of MACs on a NIC (f.e. when there are multicast apps). 
Not sure if this is still the case - it definitely was in mid 3.x Linux 
kernels. If you use raw sendm(m)sg there is no issue - the packets are 
not looped when writing to physical interfaces.

>
>> +
>> +# launch QEMU instance - if your network has reorder or is very 
>> lossy add ,pincounter
>> +
>> +qemu-system-i386 linux.img -net nic -net raw,ifname=eth0
>
> Can we switch to use -netdev here?

This is done in the new revisions.

>
> Thanks
>
>> +
>> +@end example
>> +
>>   @item -netdev 
>> vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>>   @itemx -net 
>> vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] 
>> [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>>   Connect VLAN @var{n} to PORT @var{n} of a vde switch running on 
>> host and
>
>
-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-19 17:32     ` Anton Ivanov
@ 2017-07-21 19:14       ` Eric Blake
  2017-07-22  7:52         ` Anton Ivanov
  0 siblings, 1 reply; 23+ messages in thread
From: Eric Blake @ 2017-07-21 19:14 UTC (permalink / raw)
  To: Anton Ivanov, qemu-devel; +Cc: jasowang

[-- Attachment #1: Type: text/plain, Size: 1678 bytes --]

On 07/19/2017 12:32 PM, Anton Ivanov wrote:
> 
> 
> On 19/07/17 15:40, Eric Blake wrote:
>> On 07/18/2017 12:08 PM, anton.ivanov@cambridgegreys.com wrote:
>>> From: Anton Ivanov <anton.ivanov@cambridgegreys.com>
>>>
>>> This adds GRETAP support to the unified socket driver.
>>>

>>> +#
>>> +# @ipv6: force the use of ipv6
>> This doesn't quite match what we do with other sockets (where we have
>> both ipv4 and ipv6 booleans to allow IPv4-only, IPv6-only, or both).  Is
>> this something where we can reuse InetSocketAddress instead of inventing
>> yet another way of doing things?
>>
>> Then again, it does match what NetdevL2TPv3Options did :(
> 
> I just reviewed this again.
> 
> I do not think  we can today. This is the declaration:
> 
> ##
> { 'struct': 'InetSocketAddressBase',
>   'data': {
>     'host': 'str',
>     'port': 'str' } }
> 
> ##
> 
> If I read this right port is mandatory, correct?

Okay, so it sounds like reusing InetSocket directly may not be possible.
 But there's still the interface question of whether we want dual 'ipv4'
and 'ipv6' switches to allow finer-grain control over which (or both)
families to be used.

> 
> We may be able to do it if the port portion if InetSocketAddress becomes
> optional. There is no such thing as port for the protocols which use the
> raw families.

We can always create a new QAPI type that expresses only the fields we
need; I don't think InetSocketAddress should be changed to have an
optional port just for your code additions.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3266
Virtualization:  qemu.org | libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 619 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support
  2017-07-21 19:14       ` Eric Blake
@ 2017-07-22  7:52         ` Anton Ivanov
  0 siblings, 0 replies; 23+ messages in thread
From: Anton Ivanov @ 2017-07-22  7:52 UTC (permalink / raw)
  To: Eric Blake, qemu-devel; +Cc: jasowang



>> ##
>>
>> If I read this right port is mandatory, correct?
> Okay, so it sounds like reusing InetSocket directly may not be possible.
>   But there's still the interface question of whether we want dual 'ipv4'
> and 'ipv6' switches to allow finer-grain control over which (or both)
> families to be used.

I have that in the new version ready for submission. Behavior is 
identical with other arguments which have a v4 and v6 switch.

>
>> We may be able to do it if the port portion if InetSocketAddress becomes
>> optional. There is no such thing as port for the protocols which use the
>> raw families.
> We can always create a new QAPI type that expresses only the fields we
> need; I don't think InetSocketAddress should be changed to have an
> optional port just for your code additions.
>
Concur - there are places where people rely on the port being 
mandatory. Changing it to optional will break too much unrelated code.

-- 
Anton R. Ivanov

Cambridge Greys Limited, England and Wales company No 10273661
http://www.cambridgegreys.com/

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport
  2017-07-21 17:50     ` Anton Ivanov
@ 2017-07-24  3:51       ` Jason Wang
  0 siblings, 0 replies; 23+ messages in thread
From: Jason Wang @ 2017-07-24  3:51 UTC (permalink / raw)
  To: Anton Ivanov, qemu-devel



On 2017年07月22日 01:50, Anton Ivanov wrote:
> [snip]
>
>>> +    NetUnifiedState *s = (NetUnifiedState *) us;
>>> +    L2TPV3TunnelParams *p = (L2TPV3TunnelParams *) s->params;
>>
>> How about embedding NetUnifiedState into this structure and keep 
>> using NetL2TPV3State? Then:
>>
>> -  's' could be kept and lots of lines of changes could be saved here 
>> and l2tpv3_verify_header()
>> -  each transport could have their own type instead of using 
>> NET_CLIENT_DRIVER_L2TPV3
>
> That means each of them having their own read/write functions in each 
> transport, destroy functions, etc.

Looks not? Just something like

typedef struct L2TPV3State {
     NetUDSTState udst;
     /* L2TPV3 specific data */
     ....
};

static NetClientInfo l2tpv3_info = {
     /* we share this one for all types for now, wrong I know :) */
     .type = NET_CLIENT_DRIVER_L2TPV3,
     .size = sizeof(L2TPV3State),
     .receive = net_udst_receive_dgram,
     .receive_iov = net_udst_receive_dgram_iov,
     .poll = udst_poll,
     .cleanup = net_udst_cleanup,
};

Thanks

>
> I am trying to achieve exactly the opposite which across all 
> transports should save more code. There should be nothing in a 
> transport which leverages the common datagram processing backend except:
>
> 1. Init and parse arguments
> 2. Form Header
> 3. Verify Header
>
> All the rest can be common for a large family of datagram based 
> transports - L2TPv3, GRE, RAW (both full interface and just pulling a 
> specific vlan out of it), etc.
>
> It is trivial to do that for fixed size headers (as in the current 
> patchset family). It is a bit more difficult to do that for variable 
> size headers which are still datagram based (GUE, Geneve, etc).
>
> These may also add 4 - I/O to control plane, but it remains to be seen 
> if that is needed.
>
> This also makes any improvements to the backend - f.e. switching from 
> send() to sendmmsg() automatically available for all transports.
>
> What cannot be done is to shoehorn stream based transports into this. 
> I believe we have only one of those - the original socket.c in tcp 
> mode - and we can leave it that way and switch only the datagram mode 
> to a better backend.
>
> I am going through the other comments in the meantime to see if I 
> missed something else and fixing the omissions.
>
> A.
>
> [snip]
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-21 18:50     ` Anton Ivanov
@ 2017-07-24  4:03       ` Jason Wang
  2017-09-08 17:22         ` Anton Ivanov
  0 siblings, 1 reply; 23+ messages in thread
From: Jason Wang @ 2017-07-24  4:03 UTC (permalink / raw)
  To: Anton Ivanov, qemu-devel



On 2017年07月22日 02:50, Anton Ivanov wrote:
>
> [snip]
>
>>> +    "-netdev raw,id=str,ifname=ifname\n"
>>> +    "                configure a network backend with ID 'str' 
>>> connected to\n"
>>> +    "                an Ethernet interface named ifname via raw 
>>> socket.\n"
>>> +    "                This backend does not change the interface 
>>> settings.\n"
>>> +    "                Most interfaces will require being set into 
>>> promisc mode,\n"
>>> +    "                as well having most offloads (TSO, etc) turned 
>>> off.\n"
>>> +    "                Some virtual interfaces like tap support only 
>>> RX.\n"
>>
>> Pay attention that qemu supports vnet header. So any reason to turn 
>> off e.g TSO here?
>
> I am not aware of any means to get extra info like checksums, etc show 
> up on raw socket read.
>
> If you know a way to make them show up, this is worth investigating.

See packet_rcv_vnet(). But a known 'issue' for raw sockets is that they 
forbid changing the vnet header length after creation, so we may need 
some workaround in qemu.

>
>>
>>>   #endif
>>>       "-netdev 
>>> socket,id=str[,fd=h][,listen=[host]:port][,connect=host:port]\n"
>>>       "                configure a network backend to connect to 
>>> another network\n"
>>> @@ -2463,6 +2470,32 @@ qemu-system-i386 linux.img -net nic -net 
>>> gre,src=4.2.3.1,dst=1.2.3.4
>>>     @end example
>>>   +@item -netdev raw,id=@var{id},ifname=@var{ifname}
>>> +@itemx -net raw[,vlan=@var{n}][,name=@var{name}],ifname=@var{ifname}
>>> +Connect VLAN @var{n} directly to an Ethernet interface using raw 
>>> socket.
>>> +
>>> +This transport allows a VM to bypass most of the network stack 
>>> which is
>>> +extremely useful for tapping.
>>> +
>>> +@item ifname=@var{ifname}
>>> +    interface name (mandatory)
>>> +
>>> +@example
>>> +# set up the interface - put it in promiscuous mode and turn off 
>>> offloads
>>> +ifconfig eth0 up
>>> +ifconfig eth0 promisc
>>> +
>>> +/sbin/ethtool -K eth0 gro off
>>> +/sbin/ethtool -K eth0 tso off
>>> +/sbin/ethtool -K eth0 gso off
>>> +/sbin/ethtool -K eth0 tx off
>>
>> Any reason to turn off tx here?
>
> Yes - we already have it computed and we have written it as is as a 
> whole packet. You do not want it
> re-computed as at least some adapters do silly things if you start 
> writing raw and the checksum already exists.

This looks like a bug of the driver?

For GRO it's easier to understand, since the guest may not handle big 
packets with partial checksums. But turning off tso, gso and tx still 
looks questionable for a nic which may want to offload them to the card 
(e.g. virtio-net).

>
> Once again, this one of the pros/cons of using tpacket vs recv/send 
> (with or without mmsg) on a raw socket.
>
> recvm(m)sg/sendm(m)sg are brute force as far as offloads, but things 
> like scatter/gather work correctly so there are little copies.
>
> Compared to that, tpacket will allow you some access to checksumming 
> which you can map onto checksum offload in a vNIC. As a payback for 
> this you end up copying in more cases than for send/recvmmsg and you 
> pay penalty for timestamping if you do not have a hardware timestamp 
> source in the NIC.
>
> The other issue I always had with tpacket is that you "see" your own 
> packets so you have to manage a  RX side BPF filter which removes 
> those so you do not see your own packets.

I don't follow here; it looks like I don't understand this 'issue'. 
Anyway, we can discuss this when I post the tpacket backend.

Thanks.

> That can get quite interesting if you have a lot of MACs on a NIC 
> (f.e. when there are multicast apps). Not sure if this is still the 
> case - it definitely was in mid 3.x Linux kernels. If you use raw 
> sendm(m)sg there is no issue - the packets are not looped when writing 
> to physical interfaces.
>
>>
>>> +
>>> +# launch QEMU instance - if your network has reorder or is very 
>>> lossy add ,pincounter
>>> +
>>> +qemu-system-i386 linux.img -net nic -net raw,ifname=eth0
>>
>> Can we switch to use -netdev here?
>
> This is done in the new revisions.
>
>>
>> Thanks
>>
>>> +
>>> +@end example
>>> +
>>>   @item -netdev 
>>> vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>>>   @itemx -net 
>>> vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] 
>>> [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}]
>>>   Connect VLAN @var{n} to PORT @var{n} of a vde switch running on 
>>> host and
>>
>>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support
  2017-07-24  4:03       ` Jason Wang
@ 2017-09-08 17:22         ` Anton Ivanov
  0 siblings, 0 replies; 23+ messages in thread
From: Anton Ivanov @ 2017-09-08 17:22 UTC (permalink / raw)
  To: Jason Wang, qemu-devel

On 24/07/17 05:03, Jason Wang wrote:
>
>
> On 2017年07月22日 02:50, Anton Ivanov wrote:
>>
>> [snip]
>>
>>>> +    "-netdev raw,id=str,ifname=ifname\n"
>>>> +    "                configure a network backend with ID 'str'
>>>> connected to\n"
>>>> +    "                an Ethernet interface named ifname via raw
>>>> socket.\n"
>>>> +    "                This backend does not change the interface
>>>> settings.\n"
>>>> +    "                Most interfaces will require being set into
>>>> promisc mode,\n"
>>>> +    "                as well having most offloads (TSO, etc)
>>>> turned off.\n"
>>>> +    "                Some virtual interfaces like tap support only
>>>> RX.\n"
>>>
>>> Pay attention that qemu supports vnet header. So any reason to turn
>>> off e.g TSO here?
>>
>> I am not aware of any means to get extra info like checksums, etc
>> show up on raw socket read.
>>
>> If you know a way to make them show up, this is worth investigating.
>
> See packet_rcv_vnet(). But a known 'issue' for raw sockets is that they
> forbid changing the vnet header length after creation, so we may need
> some workaround in qemu.

There are a couple of other issues - if I understand the situation
correctly, probing the vnet header size relies on tap ioctls.

I have some support for this working now. I will try to get around to
redoing the patches as per your last comments, adding initial vnet
header support, and re-submitting.

One definite advantage of having it is that it allows detecting whether
GSO/TSO is enabled on the interface, as this is not something that gets
along very well with fixed buffer size vector IO.

A.

[snip]
>>>
>
>


-- 
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2017-09-08 17:22 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-18 17:08 [Qemu-devel] Unified Socket Driver anton.ivanov
2017-07-18 17:08 ` [Qemu-devel] [PATCH 1/3] Unified Datagram Socket Transport anton.ivanov
2017-07-19  5:39   ` Jason Wang
2017-07-19  5:48     ` Anton Ivanov
2017-07-19  6:07       ` Jason Wang
2017-07-19  6:48         ` Anton Ivanov
2017-07-21 17:50     ` Anton Ivanov
2017-07-24  3:51       ` Jason Wang
2017-07-18 17:08 ` [Qemu-devel] [PATCH 2/3] Unified Datagram Socket Transport - GRE support anton.ivanov
2017-07-19  5:48   ` Jason Wang
2017-07-19  5:50     ` Anton Ivanov
2017-07-19 14:40   ` Eric Blake
2017-07-19 14:46     ` Anton Ivanov
2017-07-19 17:32     ` Anton Ivanov
2017-07-21 19:14       ` Eric Blake
2017-07-22  7:52         ` Anton Ivanov
2017-07-18 17:08 ` [Qemu-devel] [PATCH 3/3] Unified Datagram Socket Transport - raw support anton.ivanov
2017-07-19  5:58   ` Jason Wang
2017-07-19  6:02     ` Anton Ivanov
2017-07-21 18:50     ` Anton Ivanov
2017-07-24  4:03       ` Jason Wang
2017-09-08 17:22         ` Anton Ivanov
2017-07-19 14:42   ` Eric Blake

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.