* [PATCH net-next V2 0/3] support changing steering policies in tuntap
@ 2017-10-31 10:32 Jason Wang
2017-10-31 10:32 ` [PATCH net-next V2 1/3] tun: abstract flow steering logic Jason Wang
` (2 more replies)
0 siblings, 3 replies; 21+ messages in thread
From: Jason Wang @ 2017-10-31 10:32 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemdebruijn.kernel, tom, Jason Wang
Hi all:
We use flow caches based flow steering policy now. This is good for
connection-oriented communication such as TCP but not for the others
e.g connectionless unidirectional workload which cares only about
pps. This calls the ability of supporting changing steering policies
in tuntap which was done by this series.
Flow steering policy was abstracted into tun_steering_ops in the first
patch. Then new ioctls to set or query current policy were introduced,
and the last patch introduces a very simple policy that select txq
based on processor id as an example.
An example policy was implemented which allows user to load an eBPF to
choose the txq. Test was done by using xdp_redirect to redirect
traffic generated from MoonGen that was running on a remote
machine. And I see 37% improvement for processor id policy compared to
automatic flow steering policy.
Changes from V1:
- drop processor id example, implement an eBPF based queue selection
policy instead
Jason Wang (3):
tun: abstract flow steering logic
tun: introduce ioctls to set and get steering policies
tun: add eBPF based queue selection method
drivers/net/tun.c | 197 ++++++++++++++++++++++++++++++++++++++------
include/uapi/linux/if_tun.h | 9 ++
2 files changed, 183 insertions(+), 23 deletions(-)
--
2.7.4
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH net-next V2 1/3] tun: abstract flow steering logic
2017-10-31 10:32 [PATCH net-next V2 0/3] support changing steering policies in tuntap Jason Wang
@ 2017-10-31 10:32 ` Jason Wang
2017-11-02 1:11 ` Willem de Bruijn
2017-10-31 10:32 ` [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies Jason Wang
2017-10-31 10:32 ` [PATCH net-next V2 3/3] tun: add eBPF based queue selection method Jason Wang
2 siblings, 1 reply; 21+ messages in thread
From: Jason Wang @ 2017-10-31 10:32 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemdebruijn.kernel, tom, Jason Wang
tun now use flow caches based automatic queue steering method. This
may not suffice all user cases. To extend it to be able to use more
flow steering policy, this patch abstracts flow steering logic into
tun_steering_ops, then we can declare and use different methods in
the future.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 63 insertions(+), 22 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ea29da9..bff6259 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -182,6 +182,14 @@ struct tun_file {
struct skb_array tx_array;
};
+struct tun_steering_ops {
+ u16 (*select_queue) (struct tun_struct *tun, struct sk_buff *skb);
+ void (*xmit) (struct tun_struct *tun, struct sk_buff *skb);
+ u32 (*pre_rx) (struct tun_struct *tun, struct sk_buff *skb);
+ void (*post_rx) (struct tun_struct *tun, struct tun_file *tfile,
+ u32 data);
+};
+
struct tun_flow_entry {
struct hlist_node hash_link;
struct rcu_head rcu;
@@ -232,6 +240,7 @@ struct tun_struct {
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
+ struct tun_steering_ops *steering_ops;
};
static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -537,10 +546,8 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
* different rxq no. here. If we could not get rxhash, then we would
* hope the rxq no. may help here.
*/
-static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
- void *accel_priv, select_queue_fallback_t fallback)
+static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
- struct tun_struct *tun = netdev_priv(dev);
struct tun_flow_entry *e;
u32 txq = 0;
u32 numqueues = 0;
@@ -564,9 +571,18 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
}
rcu_read_unlock();
+
return txq;
}
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
+ void *accel_priv, select_queue_fallback_t fallback)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+
+ return tun->steering_ops->select_queue(tun, skb);
+}
+
static inline bool tun_not_capable(struct tun_struct *tun)
{
const struct cred *cred = current_cred();
@@ -936,24 +952,10 @@ static int tun_net_close(struct net_device *dev)
return 0;
}
-/* Net device start xmit */
-static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
- struct tun_struct *tun = netdev_priv(dev);
- int txq = skb->queue_mapping;
- struct tun_file *tfile;
- u32 numqueues = 0;
-
- rcu_read_lock();
- tfile = rcu_dereference(tun->tfiles[txq]);
- numqueues = ACCESS_ONCE(tun->numqueues);
-
- /* Drop packet if interface is not attached */
- if (txq >= numqueues)
- goto drop;
-
#ifdef CONFIG_RPS
- if (numqueues == 1 && static_key_false(&rps_needed)) {
+ if (ACCESS_ONCE(tun->numqueues) == 1 && static_key_false(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here.
*/
@@ -969,6 +971,25 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
}
}
#endif
+}
+
+/* Net device start xmit */
+static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct tun_struct *tun = netdev_priv(dev);
+ int txq = skb->queue_mapping;
+ struct tun_file *tfile;
+ u32 numqueues = 0;
+
+ rcu_read_lock();
+ tfile = rcu_dereference(tun->tfiles[txq]);
+ numqueues = ACCESS_ONCE(tun->numqueues);
+
+ /* Drop packet if interface is not attached */
+ if (txq >= numqueues)
+ goto drop;
+
+ tun->steering_ops->xmit(tun, skb);
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
@@ -1532,6 +1553,17 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
return NULL;
}
+u32 tun_automq_pre_rx(struct tun_struct *tun, struct sk_buff *skb)
+{
+ return __skb_get_hash_symmetric(skb);
+}
+
+void tun_automq_post_rx(struct tun_struct *tun, struct tun_file *tfile,
+ u32 rxhash)
+{
+ tun_flow_update(tun, rxhash, tfile);
+}
+
/* Get packet from user space buffer */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
void *msg_control, struct iov_iter *from,
@@ -1547,7 +1579,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
int copylen;
bool zerocopy = false;
int err;
- u32 rxhash;
+ u32 data;
int skb_xdp = 1;
bool frags = tun_napi_frags_enabled(tun);
@@ -1735,7 +1767,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
rcu_read_unlock();
}
- rxhash = __skb_get_hash_symmetric(skb);
+ data = tun->steering_ops->pre_rx(tun, skb);
if (frags) {
/* Exercise flow dissector code path. */
@@ -1779,7 +1811,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
u64_stats_update_end(&stats->syncp);
put_cpu_ptr(stats);
- tun_flow_update(tun, rxhash, tfile);
+ tun->steering_ops->post_rx(tun, tfile, data);
return total_len;
}
@@ -2119,6 +2151,13 @@ static struct proto tun_proto = {
.obj_size = sizeof(struct tun_file),
};
+static struct tun_steering_ops tun_automq_ops = {
+ .select_queue = tun_automq_select_queue,
+ .xmit = tun_automq_xmit,
+ .pre_rx = tun_automq_pre_rx,
+ .post_rx = tun_automq_post_rx,
+};
+
static int tun_flags(struct tun_struct *tun)
{
return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
@@ -2278,6 +2317,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
goto err_free_dev;
}
+ tun->steering_ops = &tun_automq_ops;
+
spin_lock_init(&tun->lock);
err = security_tun_dev_alloc_security(&tun->security);
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies
2017-10-31 10:32 [PATCH net-next V2 0/3] support changing steering policies in tuntap Jason Wang
2017-10-31 10:32 ` [PATCH net-next V2 1/3] tun: abstract flow steering logic Jason Wang
@ 2017-10-31 10:32 ` Jason Wang
2017-11-02 1:15 ` Willem de Bruijn
2017-10-31 10:32 ` [PATCH net-next V2 3/3] tun: add eBPF based queue selection method Jason Wang
2 siblings, 1 reply; 21+ messages in thread
From: Jason Wang @ 2017-10-31 10:32 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemdebruijn.kernel, tom, Jason Wang
This patch introduces new ioctl for change packet steering policy for
tun. Only automatic flow steering is supported, more policies will
come.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 35 ++++++++++++++++++++++++++++++++++-
include/uapi/linux/if_tun.h | 7 +++++++
2 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bff6259..ab109ff 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -122,7 +122,8 @@ do { \
#define TUN_VNET_BE 0x40000000
#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
- IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+ IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS | \
+ IFF_MULTI_STEERING)
#define GOODCOPY_LEN 128
@@ -2516,6 +2517,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned int ifindex;
int le;
int ret;
+ unsigned int steering;
if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == SOCK_IOC_TYPE) {
if (copy_from_user(&ifr, argp, ifreq_len))
@@ -2774,6 +2776,37 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = 0;
break;
+ case TUNSETSTEERING:
+ ret = -EFAULT;
+ if (copy_from_user(&steering, argp, sizeof(steering)))
+ break;
+ ret = 0;
+ switch (steering) {
+ case TUN_STEERING_AUTOMQ:
+ tun->steering_ops = &tun_automq_ops;
+ break;
+ default:
+ ret = -EFAULT;
+ }
+ break;
+
+ case TUNGETSTEERING:
+ ret = 0;
+ if (tun->steering_ops == &tun_automq_ops)
+ steering = TUN_STEERING_AUTOMQ;
+ else
+ BUG();
+ if (copy_to_user(argp, &steering, sizeof(steering)))
+ ret = -EFAULT;
+ break;
+
+ case TUNGETSTEERINGFEATURES:
+ ret = 0;
+ steering = TUN_STEERING_AUTOMQ;
+ if (copy_to_user(argp, &steering, sizeof(steering)))
+ ret = -EFAULT;
+ break;
+
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 365ade5..109760e 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -56,6 +56,9 @@
*/
#define TUNSETVNETBE _IOW('T', 222, int)
#define TUNGETVNETBE _IOR('T', 223, int)
+#define TUNSETSTEERING _IOW('T', 224, unsigned int)
+#define TUNGETSTEERING _IOR('T', 225, unsigned int)
+#define TUNGETSTEERINGFEATURES _IOR('T', 226, unsigned int)
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
@@ -70,6 +73,8 @@
#define IFF_MULTI_QUEUE 0x0100
#define IFF_ATTACH_QUEUE 0x0200
#define IFF_DETACH_QUEUE 0x0400
+#define IFF_MULTI_STEERING 0x2000
+
/* read-only flag */
#define IFF_PERSIST 0x0800
#define IFF_NOFILTER 0x1000
@@ -106,4 +111,6 @@ struct tun_filter {
__u8 addr[0][ETH_ALEN];
};
+#define TUN_STEERING_AUTOMQ 0x01 /* Automatic flow steering */
+
#endif /* _UAPI__IF_TUN_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-10-31 10:32 [PATCH net-next V2 0/3] support changing steering policies in tuntap Jason Wang
2017-10-31 10:32 ` [PATCH net-next V2 1/3] tun: abstract flow steering logic Jason Wang
2017-10-31 10:32 ` [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies Jason Wang
@ 2017-10-31 10:32 ` Jason Wang
2017-10-31 16:45 ` Michael S. Tsirkin
2017-11-03 8:56 ` Willem de Bruijn
2 siblings, 2 replies; 21+ messages in thread
From: Jason Wang @ 2017-10-31 10:32 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemdebruijn.kernel, tom, Jason Wang
This patch introduces an eBPF based queue selection method based on
the flow steering policy ops. Userspace could load an eBPF program
through TUNSETSTEERINGEBPF. This gives much more flexibility compare
to simple but hard coded policy in kernel.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/if_tun.h | 2 ++
2 files changed, 80 insertions(+), 1 deletion(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ab109ff..4bdde21 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -191,6 +191,20 @@ struct tun_steering_ops {
u32 data);
};
+void tun_steering_xmit_nop(struct tun_struct *tun, struct sk_buff *skb)
+{
+}
+
+u32 tun_steering_pre_rx_nop(struct tun_struct *tun, struct sk_buff *skb)
+{
+ return 0;
+}
+
+void tun_steering_post_rx_nop(struct tun_struct *tun, struct tun_file *tfile,
+ u32 data)
+{
+}
+
struct tun_flow_entry {
struct hlist_node hash_link;
struct rcu_head rcu;
@@ -241,6 +255,7 @@ struct tun_struct {
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
+ struct bpf_prog __rcu *steering_prog;
struct tun_steering_ops *steering_ops;
};
@@ -576,6 +591,19 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
return txq;
}
+static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
+{
+ struct bpf_prog *prog;
+ u16 ret = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(tun->steering_prog);
+ if (prog)
+ ret = bpf_prog_run_clear_cb(prog, skb);
+ rcu_read_unlock();
+
+ return ret % tun->numqueues;
+}
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
void *accel_priv, select_queue_fallback_t fallback)
{
@@ -2017,6 +2045,20 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
+static void __tun_set_steering_ebpf(struct tun_struct *tun,
+ struct bpf_prog *new)
+{
+ struct bpf_prog *old;
+
+ old = rtnl_dereference(tun->steering_prog);
+ rcu_assign_pointer(tun->steering_prog, new);
+
+ if (old) {
+ synchronize_net();
+ bpf_prog_destroy(old);
+ }
+}
+
static void tun_free_netdev(struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
@@ -2025,6 +2067,7 @@ static void tun_free_netdev(struct net_device *dev)
free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
+ __tun_set_steering_ebpf(tun, NULL);
}
static void tun_setup(struct net_device *dev)
@@ -2159,6 +2202,13 @@ static struct tun_steering_ops tun_automq_ops = {
.post_rx = tun_automq_post_rx,
};
+static struct tun_steering_ops tun_ebpf_ops = {
+ .select_queue = tun_ebpf_select_queue,
+ .xmit = tun_steering_xmit_nop,
+ .pre_rx = tun_steering_pre_rx_nop,
+ .post_rx = tun_steering_post_rx_nop,
+};
+
static int tun_flags(struct tun_struct *tun)
{
return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
@@ -2311,6 +2361,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
tun->rx_batched = 0;
+ RCU_INIT_POINTER(tun->steering_prog, NULL);
tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
if (!tun->pcpu_stats) {
@@ -2503,6 +2554,23 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
return ret;
}
+static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+{
+ struct bpf_prog *prog;
+ u32 fd;
+
+ if (copy_from_user(&fd, data, sizeof(fd)))
+ return -EFAULT;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ __tun_set_steering_ebpf(tun, prog);
+
+ return 0;
+}
+
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg, int ifreq_len)
{
@@ -2785,6 +2853,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
case TUN_STEERING_AUTOMQ:
tun->steering_ops = &tun_automq_ops;
break;
+ case TUN_STEERING_EBPF:
+ tun->steering_ops = &tun_ebpf_ops;
+ break;
default:
ret = -EFAULT;
}
@@ -2794,6 +2865,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = 0;
if (tun->steering_ops == &tun_automq_ops)
steering = TUN_STEERING_AUTOMQ;
+ else if (tun->steering_ops == &tun_ebpf_ops)
+ steering = TUN_STEERING_EBPF;
else
BUG();
if (copy_to_user(argp, &steering, sizeof(steering)))
@@ -2802,11 +2875,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
case TUNGETSTEERINGFEATURES:
ret = 0;
- steering = TUN_STEERING_AUTOMQ;
+ steering = TUN_STEERING_AUTOMQ | TUN_STEERING_EBPF;
if (copy_to_user(argp, &steering, sizeof(steering)))
ret = -EFAULT;
break;
+ case TUNSETSTEERINGEBPF:
+ ret = tun_set_steering_ebpf(tun, argp);
+ break;
+
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 109760e..927f7e4 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -59,6 +59,7 @@
#define TUNSETSTEERING _IOW('T', 224, unsigned int)
#define TUNGETSTEERING _IOR('T', 225, unsigned int)
#define TUNGETSTEERINGFEATURES _IOR('T', 226, unsigned int)
+#define TUNSETSTEERINGEBPF _IOR('T', 227, int)
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
@@ -112,5 +113,6 @@ struct tun_filter {
};
#define TUN_STEERING_AUTOMQ 0x01 /* Automatic flow steering */
+#define TUN_STEERING_EBPF 0x02 /* eBPF based flow steering */
#endif /* _UAPI__IF_TUN_H */
--
2.7.4
^ permalink raw reply related [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-10-31 10:32 ` [PATCH net-next V2 3/3] tun: add eBPF based queue selection method Jason Wang
@ 2017-10-31 16:45 ` Michael S. Tsirkin
2017-11-01 13:02 ` Jason Wang
2017-11-03 8:56 ` Willem de Bruijn
1 sibling, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2017-10-31 16:45 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, willemdebruijn.kernel, tom
On Tue, Oct 31, 2017 at 06:32:18PM +0800, Jason Wang wrote:
> This patch introduces an eBPF based queue selection method based on
> the flow steering policy ops. Userspace could load an eBPF program
> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
> to simple but hard coded policy in kernel.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/tun.c | 79 ++++++++++++++++++++++++++++++++++++++++++++-
> include/uapi/linux/if_tun.h | 2 ++
> 2 files changed, 80 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ab109ff..4bdde21 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -191,6 +191,20 @@ struct tun_steering_ops {
> u32 data);
> };
>
> +void tun_steering_xmit_nop(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +}
> +
> +u32 tun_steering_pre_rx_nop(struct tun_struct *tun, struct sk_buff *skb)
> +{
> + return 0;
> +}
> +
> +void tun_steering_post_rx_nop(struct tun_struct *tun, struct tun_file *tfile,
> + u32 data)
> +{
> +}
> +
> struct tun_flow_entry {
> struct hlist_node hash_link;
> struct rcu_head rcu;
> @@ -241,6 +255,7 @@ struct tun_struct {
> u32 rx_batched;
> struct tun_pcpu_stats __percpu *pcpu_stats;
> struct bpf_prog __rcu *xdp_prog;
> + struct bpf_prog __rcu *steering_prog;
> struct tun_steering_ops *steering_ops;
> };
>
> @@ -576,6 +591,19 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> return txq;
> }
>
> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> +{
> + struct bpf_prog *prog;
> + u16 ret = 0;
> +
> + rcu_read_lock();
> + prog = rcu_dereference(tun->steering_prog);
> + if (prog)
> + ret = bpf_prog_run_clear_cb(prog, skb);
> + rcu_read_unlock();
> +
> + return ret % tun->numqueues;
> +}
> static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
> void *accel_priv, select_queue_fallback_t fallback)
> {
> @@ -2017,6 +2045,20 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
> return ret;
> }
>
> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> + struct bpf_prog *new)
> +{
> + struct bpf_prog *old;
> +
> + old = rtnl_dereference(tun->steering_prog);
> + rcu_assign_pointer(tun->steering_prog, new);
> +
> + if (old) {
> + synchronize_net();
> + bpf_prog_destroy(old);
> + }
> +}
> +
Is this really called under rtnl? If no then rtnl_dereference
is wrong. If yes I'm not sure you can call synchronize_net
under rtnl.
> static void tun_free_netdev(struct net_device *dev)
> {
> struct tun_struct *tun = netdev_priv(dev);
> @@ -2025,6 +2067,7 @@ static void tun_free_netdev(struct net_device *dev)
> free_percpu(tun->pcpu_stats);
> tun_flow_uninit(tun);
> security_tun_dev_free_security(tun->security);
> + __tun_set_steering_ebpf(tun, NULL);
> }
>
> static void tun_setup(struct net_device *dev)
> @@ -2159,6 +2202,13 @@ static struct tun_steering_ops tun_automq_ops = {
> .post_rx = tun_automq_post_rx,
> };
>
> +static struct tun_steering_ops tun_ebpf_ops = {
> + .select_queue = tun_ebpf_select_queue,
> + .xmit = tun_steering_xmit_nop,
> + .pre_rx = tun_steering_pre_rx_nop,
> + .post_rx = tun_steering_post_rx_nop,
> +};
> +
> static int tun_flags(struct tun_struct *tun)
> {
> return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
> @@ -2311,6 +2361,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
> tun->filter_attached = false;
> tun->sndbuf = tfile->socket.sk->sk_sndbuf;
> tun->rx_batched = 0;
> + RCU_INIT_POINTER(tun->steering_prog, NULL);
>
> tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
> if (!tun->pcpu_stats) {
> @@ -2503,6 +2554,23 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
> return ret;
> }
>
> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
> +{
> + struct bpf_prog *prog;
> + u32 fd;
> +
> + if (copy_from_user(&fd, data, sizeof(fd)))
> + return -EFAULT;
> +
> + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
> + if (IS_ERR(prog))
> + return PTR_ERR(prog);
> +
> + __tun_set_steering_ebpf(tun, prog);
> +
> + return 0;
> +}
> +
> static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> unsigned long arg, int ifreq_len)
> {
> @@ -2785,6 +2853,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> case TUN_STEERING_AUTOMQ:
> tun->steering_ops = &tun_automq_ops;
> break;
> + case TUN_STEERING_EBPF:
> + tun->steering_ops = &tun_ebpf_ops;
> + break;
> default:
> ret = -EFAULT;
> }
> @@ -2794,6 +2865,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> ret = 0;
> if (tun->steering_ops == &tun_automq_ops)
> steering = TUN_STEERING_AUTOMQ;
> + else if (tun->steering_ops == &tun_ebpf_ops)
> + steering = TUN_STEERING_EBPF;
> else
> BUG();
> if (copy_to_user(argp, &steering, sizeof(steering)))
> @@ -2802,11 +2875,15 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>
> case TUNGETSTEERINGFEATURES:
> ret = 0;
> - steering = TUN_STEERING_AUTOMQ;
> + steering = TUN_STEERING_AUTOMQ | TUN_STEERING_EBPF;
> if (copy_to_user(argp, &steering, sizeof(steering)))
> ret = -EFAULT;
> break;
>
> + case TUNSETSTEERINGEBPF:
> + ret = tun_set_steering_ebpf(tun, argp);
> + break;
> +
> default:
> ret = -EINVAL;
> break;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 109760e..927f7e4 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -59,6 +59,7 @@
> #define TUNSETSTEERING _IOW('T', 224, unsigned int)
> #define TUNGETSTEERING _IOR('T', 225, unsigned int)
> #define TUNGETSTEERINGFEATURES _IOR('T', 226, unsigned int)
> +#define TUNSETSTEERINGEBPF _IOR('T', 227, int)
>
> /* TUNSETIFF ifr flags */
> #define IFF_TUN 0x0001
> @@ -112,5 +113,6 @@ struct tun_filter {
> };
>
> #define TUN_STEERING_AUTOMQ 0x01 /* Automatic flow steering */
> +#define TUN_STEERING_EBPF 0x02 /* eBPF based flow steering */
>
> #endif /* _UAPI__IF_TUN_H */
> --
> 2.7.4
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-10-31 16:45 ` Michael S. Tsirkin
@ 2017-11-01 13:02 ` Jason Wang
2017-11-01 13:59 ` Michael S. Tsirkin
0 siblings, 1 reply; 21+ messages in thread
From: Jason Wang @ 2017-11-01 13:02 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, willemdebruijn.kernel, tom
On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
>> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
>> + struct bpf_prog *new)
>> +{
>> + struct bpf_prog *old;
>> +
>> + old = rtnl_dereference(tun->steering_prog);
>> + rcu_assign_pointer(tun->steering_prog, new);
>> +
>> + if (old) {
>> + synchronize_net();
>> + bpf_prog_destroy(old);
>> + }
>> +}
>> +
> Is this really called under rtnl?
Yes it is __tun_chr_ioctl() will call rtnl_lock().
> If no then rtnl_dereference
> is wrong. If yes I'm not sure you can call synchronize_net
> under rtnl.
>
Are you worrying about the long wait? Looking at synchronize_net(), it does:
void synchronize_net(void)
{
might_sleep();
if (rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-01 13:02 ` Jason Wang
@ 2017-11-01 13:59 ` Michael S. Tsirkin
2017-11-01 19:12 ` Alexei Starovoitov
2017-11-02 3:45 ` Jason Wang
0 siblings, 2 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2017-11-01 13:59 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, willemdebruijn.kernel, tom
On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
>
>
> On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
> > > +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> > > + struct bpf_prog *new)
> > > +{
> > > + struct bpf_prog *old;
> > > +
> > > + old = rtnl_dereference(tun->steering_prog);
> > > + rcu_assign_pointer(tun->steering_prog, new);
> > > +
> > > + if (old) {
> > > + synchronize_net();
> > > + bpf_prog_destroy(old);
> > > + }
> > > +}
> > > +
> > Is this really called under rtnl?
>
> Yes it is __tun_chr_ioctl() will call rtnl_lock().
Is the call from tun_free_netdev under rtnl too?
> > If no then rtnl_dereference
> > is wrong. If yes I'm not sure you can call synchronize_net
> > under rtnl.
> >
>
> Are you worrying about the long wait? Looking at synchronize_net(), it does:
>
> void synchronize_net(void)
> {
> might_sleep();
> if (rtnl_is_locked())
> synchronize_rcu_expedited();
> else
> synchronize_rcu();
> }
> EXPORT_SYMBOL(synchronize_net);
>
> Thanks
Not the wait - expedited is not a good thing to allow unpriveledged
userspace to do, it interrupts all VMs running on the same box.
We could use a callback though the docs warn userspace can use that
to cause a DOS and needs to be limited.
--
MST
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-01 13:59 ` Michael S. Tsirkin
@ 2017-11-01 19:12 ` Alexei Starovoitov
2017-11-02 3:24 ` Jason Wang
2017-11-02 3:45 ` Jason Wang
1 sibling, 1 reply; 21+ messages in thread
From: Alexei Starovoitov @ 2017-11-01 19:12 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, netdev, linux-kernel, willemdebruijn.kernel, tom,
Daniel Borkmann
On Wed, Nov 01, 2017 at 03:59:48PM +0200, Michael S. Tsirkin wrote:
> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
> >
> >
> > On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
> > > > +static void __tun_set_steering_ebpf(struct tun_struct *tun,
> > > > + struct bpf_prog *new)
> > > > +{
> > > > + struct bpf_prog *old;
> > > > +
> > > > + old = rtnl_dereference(tun->steering_prog);
> > > > + rcu_assign_pointer(tun->steering_prog, new);
> > > > +
> > > > + if (old) {
> > > > + synchronize_net();
> > > > + bpf_prog_destroy(old);
> > > > + }
> > > > +}
> > > > +
> > > Is this really called under rtnl?
> >
> > Yes it is __tun_chr_ioctl() will call rtnl_lock().
>
> Is the call from tun_free_netdev under rtnl too?
>
> > > If no then rtnl_dereference
> > > is wrong. If yes I'm not sure you can call synchronize_net
> > > under rtnl.
> > >
> >
> > Are you worrying about the long wait? Looking at synchronize_net(), it does:
> >
> > void synchronize_net(void)
> > {
> > might_sleep();
> > if (rtnl_is_locked())
> > synchronize_rcu_expedited();
> > else
> > synchronize_rcu();
> > }
> > EXPORT_SYMBOL(synchronize_net);
> >
> > Thanks
>
> Not the wait - expedited is not a good thing to allow unpriveledged
> userspace to do, it interrupts all VMs running on the same box.
>
> We could use a callback though the docs warn userspace can use that
> to cause a DOS and needs to be limited.
the whole __tun_set_steering_ebpf() looks odd to me.
There is tun_attach_filter/tun_detach_filter pattern
that works for classic BPF. Why for eBPF this strange
synchronize_net() is there?
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic
2017-10-31 10:32 ` [PATCH net-next V2 1/3] tun: abstract flow steering logic Jason Wang
@ 2017-11-02 1:11 ` Willem de Bruijn
2017-11-02 3:43 ` Jason Wang
0 siblings, 1 reply; 21+ messages in thread
From: Willem de Bruijn @ 2017-11-02 1:11 UTC (permalink / raw)
To: Jason Wang; +Cc: Network Development, LKML, Michael S. Tsirkin, Tom Herbert
On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> tun now use flow caches based automatic queue steering method. This
> may not suffice all user cases. To extend it to be able to use more
> flow steering policy, this patch abstracts flow steering logic into
> tun_steering_ops, then we can declare and use different methods in
> the future.
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 63 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index ea29da9..bff6259 100644
The previous RFC enabled support for multiple pluggable steering
policies. But as all can be implemented in BPF and we only plan to
support an eBPF policy besides the legacy one, this patch is no longer
needed. We can save a few indirect function calls.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies
2017-10-31 10:32 ` [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies Jason Wang
@ 2017-11-02 1:15 ` Willem de Bruijn
0 siblings, 0 replies; 21+ messages in thread
From: Willem de Bruijn @ 2017-11-02 1:15 UTC (permalink / raw)
To: Jason Wang; +Cc: Network Development, LKML, Michael S. Tsirkin, Tom Herbert
On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces new ioctl for change packet steering policy for
> tun. Only automatic flow steering is supported, more policies will
> come.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/net/tun.c | 35 ++++++++++++++++++++++++++++++++++-
> include/uapi/linux/if_tun.h | 7 +++++++
> 2 files changed, 41 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index bff6259..ab109ff 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -122,7 +122,8 @@ do { \
> #define TUN_VNET_BE 0x40000000
>
> #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
> - IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
> + IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS | \
> + IFF_MULTI_STEERING)
>
> #define GOODCOPY_LEN 128
>
> @@ -2516,6 +2517,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> unsigned int ifindex;
> int le;
> int ret;
> + unsigned int steering;
>
> if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == SOCK_IOC_TYPE) {
> if (copy_from_user(&ifr, argp, ifreq_len))
> @@ -2774,6 +2776,37 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
> ret = 0;
> break;
>
> + case TUNSETSTEERING:
> + ret = -EFAULT;
> + if (copy_from_user(&steering, argp, sizeof(steering)))
> + break;
> + ret = 0;
> + switch (steering) {
> + case TUN_STEERING_AUTOMQ:
> + tun->steering_ops = &tun_automq_ops;
> + break;
> + default:
> + ret = -EFAULT;
> + }
> + break;
> +
> + case TUNGETSTEERING:
> + ret = 0;
> + if (tun->steering_ops == &tun_automq_ops)
> + steering = TUN_STEERING_AUTOMQ;
> + else
> + BUG();
> + if (copy_to_user(argp, &steering, sizeof(steering)))
> + ret = -EFAULT;
> + break;
> +
> + case TUNGETSTEERINGFEATURES:
> + ret = 0;
> + steering = TUN_STEERING_AUTOMQ;
> + if (copy_to_user(argp, &steering, sizeof(steering)))
> + ret = -EFAULT;
> + break;
> +
Similar to my comment in patch 1/3: if only eBPF is used, these
calls can be avoided in favor of only TUNSETSTEERINGEBPF
from patch 3/3.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-01 19:12 ` Alexei Starovoitov
@ 2017-11-02 3:24 ` Jason Wang
0 siblings, 0 replies; 21+ messages in thread
From: Jason Wang @ 2017-11-02 3:24 UTC (permalink / raw)
To: Alexei Starovoitov, Michael S. Tsirkin
Cc: netdev, linux-kernel, willemdebruijn.kernel, tom, Daniel Borkmann
On 2017年11月02日 03:12, Alexei Starovoitov wrote:
> On Wed, Nov 01, 2017 at 03:59:48PM +0200, Michael S. Tsirkin wrote:
>> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
>>>
>>> On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
>>>>> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
>>>>> + struct bpf_prog *new)
>>>>> +{
>>>>> + struct bpf_prog *old;
>>>>> +
>>>>> + old = rtnl_dereference(tun->steering_prog);
>>>>> + rcu_assign_pointer(tun->steering_prog, new);
>>>>> +
>>>>> + if (old) {
>>>>> + synchronize_net();
>>>>> + bpf_prog_destroy(old);
>>>>> + }
>>>>> +}
>>>>> +
>>>> Is this really called under rtnl?
>>> Yes it is __tun_chr_ioctl() will call rtnl_lock().
>> Is the call from tun_free_netdev under rtnl too?
>>
>>>> If no then rtnl_dereference
>>>> is wrong. If yes I'm not sure you can call synchronize_net
>>>> under rtnl.
>>>>
>>> Are you worrying about the long wait? Looking at synchronize_net(), it does:
>>>
>>> void synchronize_net(void)
>>> {
>>> might_sleep();
>>> if (rtnl_is_locked())
>>> synchronize_rcu_expedited();
>>> else
>>> synchronize_rcu();
>>> }
>>> EXPORT_SYMBOL(synchronize_net);
>>>
>>> Thanks
>> Not the wait - expedited is not a good thing to allow unpriveledged
>> userspace to do, it interrupts all VMs running on the same box.
>>
>> We could use a callback though the docs warn userspace can use that
>> to cause a DOS and needs to be limited.
> the whole __tun_set_steering_ebpf() looks odd to me.
> There is tun_attach_filter/tun_detach_filter pattern
> that works for classic BPF. Why for eBPF this strange
> synchronize_net() is there?
>
I'm not sure I get the question. eBPF here is used to do queue
selection, so we could not reuse socket filter (tun_detach_filter use
call_rcu()). cBPF could be used here, but I'm not quite sure it's worth
to support it. And I agree we should use call_rcu() here.
Hope this answer your question.
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic
2017-11-02 1:11 ` Willem de Bruijn
@ 2017-11-02 3:43 ` Jason Wang
2017-11-02 3:45 ` Michael S. Tsirkin
0 siblings, 1 reply; 21+ messages in thread
From: Jason Wang @ 2017-11-02 3:43 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Network Development, LKML, Michael S. Tsirkin, Tom Herbert
On 2017年11月02日 09:11, Willem de Bruijn wrote:
> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>> tun now use flow caches based automatic queue steering method. This
>> may not suffice all user cases. To extend it to be able to use more
>> flow steering policy, this patch abstracts flow steering logic into
>> tun_steering_ops, then we can declare and use different methods in
>> the future.
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++++++++++--------------
>> 1 file changed, 63 insertions(+), 22 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index ea29da9..bff6259 100644
> The previous RFC enabled support for multiple pluggable steering
> policies. But as all can be implemented in BPF and we only plan to
> support an eBPF policy besides the legacy one, this patch is no longer
> needed. We can save a few indirect function calls.
But we should at least support two kinds of steering policy, so this is
still needed?
And I'm not quite sure we can implement all kinds of policies through
BPF e.g RSS or we may want to offload the queue selection to underlayer
switch or nic .
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic
2017-11-02 3:43 ` Jason Wang
@ 2017-11-02 3:45 ` Michael S. Tsirkin
2017-11-02 3:51 ` Jason Wang
0 siblings, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2017-11-02 3:45 UTC (permalink / raw)
To: Jason Wang; +Cc: Willem de Bruijn, Network Development, LKML, Tom Herbert
On Thu, Nov 02, 2017 at 11:43:48AM +0800, Jason Wang wrote:
>
>
> On 2017年11月02日 09:11, Willem de Bruijn wrote:
> > On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> > > tun now use flow caches based automatic queue steering method. This
> > > may not suffice all user cases. To extend it to be able to use more
> > > flow steering policy, this patch abstracts flow steering logic into
> > > tun_steering_ops, then we can declare and use different methods in
> > > the future.
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > ---
> > > drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++++++++++--------------
> > > 1 file changed, 63 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > > index ea29da9..bff6259 100644
> > The previous RFC enabled support for multiple pluggable steering
> > policies. But as all can be implemented in BPF and we only plan to
> > support an eBPF policy besides the legacy one, this patch is no longer
> > needed. We can save a few indirect function calls.
>
> But we should at least support two kinds of steering policy, so this is
> still needed?
>
> And I'm not quite sure we can implement all kinds of policies through BPF
> e.g RSS or we may want to offload the queue selection to underlayer switch
> or nic .
>
> Thanks
I think a simple if condition is preferable for now, too. Let's wait
until we get some 3/4 of these.
--
MST
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-01 13:59 ` Michael S. Tsirkin
2017-11-01 19:12 ` Alexei Starovoitov
@ 2017-11-02 3:45 ` Jason Wang
1 sibling, 0 replies; 21+ messages in thread
From: Jason Wang @ 2017-11-02 3:45 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: netdev, linux-kernel, willemdebruijn.kernel, tom
On 2017年11月01日 21:59, Michael S. Tsirkin wrote:
> On Wed, Nov 01, 2017 at 09:02:03PM +0800, Jason Wang wrote:
>>
>> On 2017年11月01日 00:45, Michael S. Tsirkin wrote:
>>>> +static void __tun_set_steering_ebpf(struct tun_struct *tun,
>>>> + struct bpf_prog *new)
>>>> +{
>>>> + struct bpf_prog *old;
>>>> +
>>>> + old = rtnl_dereference(tun->steering_prog);
>>>> + rcu_assign_pointer(tun->steering_prog, new);
>>>> +
>>>> + if (old) {
>>>> + synchronize_net();
>>>> + bpf_prog_destroy(old);
>>>> + }
>>>> +}
>>>> +
>>> Is this really called under rtnl?
>> Yes it is __tun_chr_ioctl() will call rtnl_lock().
> Is the call from tun_free_netdev under rtnl too?
Looks not, need hold rtnl_lock() in tun_free_netdev in next version.
>
>>> If no then rtnl_dereference
>>> is wrong. If yes I'm not sure you can call synchronize_net
>>> under rtnl.
>>>
>> Are you worrying about the long wait? Looking at synchronize_net(), it does:
>>
>> void synchronize_net(void)
>> {
>> might_sleep();
>> if (rtnl_is_locked())
>> synchronize_rcu_expedited();
>> else
>> synchronize_rcu();
>> }
>> EXPORT_SYMBOL(synchronize_net);
>>
>> Thanks
> Not the wait - expedited is not a good thing to allow unpriveledged
> userspace to do, it interrupts all VMs running on the same box.
Good point.
>
> We could use a callback though the docs warn userspace can use that
> to cause a DOS and needs to be limited.
>
>
Will do this in next version.
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic
2017-11-02 3:45 ` Michael S. Tsirkin
@ 2017-11-02 3:51 ` Jason Wang
2017-11-03 8:49 ` Willem de Bruijn
0 siblings, 1 reply; 21+ messages in thread
From: Jason Wang @ 2017-11-02 3:51 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Willem de Bruijn, Network Development, LKML, Tom Herbert
On 2017年11月02日 11:45, Michael S. Tsirkin wrote:
> On Thu, Nov 02, 2017 at 11:43:48AM +0800, Jason Wang wrote:
>>
>> On 2017年11月02日 09:11, Willem de Bruijn wrote:
>>> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>>>> tun now use flow caches based automatic queue steering method. This
>>>> may not suffice all user cases. To extend it to be able to use more
>>>> flow steering policy, this patch abstracts flow steering logic into
>>>> tun_steering_ops, then we can declare and use different methods in
>>>> the future.
>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>> ---
>>>> drivers/net/tun.c | 85 +++++++++++++++++++++++++++++++++++++++++--------------
>>>> 1 file changed, 63 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index ea29da9..bff6259 100644
>>> The previous RFC enabled support for multiple pluggable steering
>>> policies. But as all can be implemented in BPF and we only plan to
>>> support an eBPF policy besides the legacy one, this patch is no longer
>>> needed. We can save a few indirect function calls.
>> But we should at least support two kinds of steering policy, so this is
>> still needed?
>>
>> And I'm not quite sure we can implement all kinds of policies through BPF
>> e.g RSS or we may want to offload the queue selection to underlayer switch
>> or nic .
>>
>> Thanks
> I think a simple if condition is preferable for now, too. Let's wait
> until we get some 3/4 of these.
>
That's a solution but we may need if in at least four places. If this is
ok, I will do it in next version.
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 1/3] tun: abstract flow steering logic
2017-11-02 3:51 ` Jason Wang
@ 2017-11-03 8:49 ` Willem de Bruijn
0 siblings, 0 replies; 21+ messages in thread
From: Willem de Bruijn @ 2017-11-03 8:49 UTC (permalink / raw)
To: Jason Wang; +Cc: Michael S. Tsirkin, Network Development, LKML, Tom Herbert
On Thu, Nov 2, 2017 at 12:51 PM, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2017年11月02日 11:45, Michael S. Tsirkin wrote:
>>
>> On Thu, Nov 02, 2017 at 11:43:48AM +0800, Jason Wang wrote:
>>>
>>>
>>> On 2017年11月02日 09:11, Willem de Bruijn wrote:
>>>>
>>>> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>>>>>
>>>>> tun now use flow caches based automatic queue steering method. This
>>>>> may not suffice all user cases. To extend it to be able to use more
>>>>> flow steering policy, this patch abstracts flow steering logic into
>>>>> tun_steering_ops, then we can declare and use different methods in
>>>>> the future.
>>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>>> ---
>>>>> drivers/net/tun.c | 85
>>>>> +++++++++++++++++++++++++++++++++++++++++--------------
>>>>> 1 file changed, 63 insertions(+), 22 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>> index ea29da9..bff6259 100644
>>>>
>>>> The previous RFC enabled support for multiple pluggable steering
>>>> policies. But as all can be implemented in BPF and we only plan to
>>>> support an eBPF policy besides the legacy one, this patch is no longer
>>>> needed. We can save a few indirect function calls.
>>>
>>> But we should at least support two kinds of steering policy, so this is
>>> still needed?
>>>
>>> And I'm not quite sure we can implement all kinds of policies through BPF
>>> e.g RSS or we may want to offload the queue selection to underlayer
>>> switch
>>> or nic .
>>>
>>> Thanks
>>
>> I think a simple if condition is preferable for now, too. Let's wait
>> until we get some 3/4 of these.
>>
>
> That's a solution but we may need if in at least four places. If this is ok,
> I will do it in next version.
That sounds good to me.
I only see three places that need to be modified, the callback sites
that this patch introduces. Strictly speaking, it's not even necessary
to forgo the rxhash operations. Though a nice optimization.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-10-31 10:32 ` [PATCH net-next V2 3/3] tun: add eBPF based queue selection method Jason Wang
2017-10-31 16:45 ` Michael S. Tsirkin
@ 2017-11-03 8:56 ` Willem de Bruijn
2017-11-03 23:56 ` Willem de Bruijn
1 sibling, 1 reply; 21+ messages in thread
From: Willem de Bruijn @ 2017-11-03 8:56 UTC (permalink / raw)
To: Jason Wang; +Cc: Network Development, LKML, Michael S. Tsirkin, Tom Herbert
On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces an eBPF based queue selection method based on
> the flow steering policy ops. Userspace could load an eBPF program
> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
> to simple but hard coded policy in kernel.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
> +{
> + struct bpf_prog *prog;
> + u32 fd;
> +
> + if (copy_from_user(&fd, data, sizeof(fd)))
> + return -EFAULT;
> +
> + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
If the idea is to allow guests to pass BPF programs down to the host,
you may want to define a new program type that is more restrictive than
socket filter.
The external functions allowed for socket filters (sk_filter_func_proto)
are relatively few (compared to, say, clsact), but may still leak host
information to a guest. More importantly, guest security considerations
limits how we can extend socket filters later.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-03 8:56 ` Willem de Bruijn
@ 2017-11-03 23:56 ` Willem de Bruijn
2017-11-08 5:28 ` Jason Wang
0 siblings, 1 reply; 21+ messages in thread
From: Willem de Bruijn @ 2017-11-03 23:56 UTC (permalink / raw)
To: Jason Wang; +Cc: Network Development, LKML, Michael S. Tsirkin, Tom Herbert
On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>> This patch introduces an eBPF based queue selection method based on
>> the flow steering policy ops. Userspace could load an eBPF program
>> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
>> to simple but hard coded policy in kernel.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
>> +{
>> + struct bpf_prog *prog;
>> + u32 fd;
>> +
>> + if (copy_from_user(&fd, data, sizeof(fd)))
>> + return -EFAULT;
>> +
>> + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>
> If the idea is to allow guests to pass BPF programs down to the host,
> you may want to define a new program type that is more restrictive than
> socket filter.
>
> The external functions allowed for socket filters (sk_filter_func_proto)
> are relatively few (compared to, say, clsact), but may still leak host
> information to a guest. More importantly, guest security considerations
> limits how we can extend socket filters later.
Unless the idea is for the hypervisor to prepared the BPF based on a
limited set of well defined modes that the guest can configure. Then
socket filters are fine, as the BPF is prepared by a regular host process.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-03 23:56 ` Willem de Bruijn
@ 2017-11-08 5:28 ` Jason Wang
2017-11-08 5:43 ` Michael S. Tsirkin
0 siblings, 1 reply; 21+ messages in thread
From: Jason Wang @ 2017-11-08 5:28 UTC (permalink / raw)
To: Willem de Bruijn
Cc: Network Development, LKML, Michael S. Tsirkin, Tom Herbert
On 2017年11月04日 08:56, Willem de Bruijn wrote:
> On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>>> This patch introduces an eBPF based queue selection method based on
>>> the flow steering policy ops. Userspace could load an eBPF program
>>> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
>>> to simple but hard coded policy in kernel.
>>>
>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> ---
>>> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
>>> +{
>>> + struct bpf_prog *prog;
>>> + u32 fd;
>>> +
>>> + if (copy_from_user(&fd, data, sizeof(fd)))
>>> + return -EFAULT;
>>> +
>>> + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>> If the idea is to allow guests to pass BPF programs down to the host,
>> you may want to define a new program type that is more restrictive than
>> socket filter.
>>
>> The external functions allowed for socket filters (sk_filter_func_proto)
>> are relatively few (compared to, say, clsact), but may still leak host
>> information to a guest. More importantly, guest security considerations
>> limits how we can extend socket filters later.
> Unless the idea is for the hypervisor to prepared the BPF based on a
> limited set of well defined modes that the guest can configure. Then
> socket filters are fine, as the BPF is prepared by a regular host process.
Yes, I think the idea is to let qemu to build a BPF program now.
Passing eBPF program from guest to host is interesting, but an obvious
issue is how to deal with the accessing of map.
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-08 5:28 ` Jason Wang
@ 2017-11-08 5:43 ` Michael S. Tsirkin
2017-11-08 11:13 ` Jason Wang
0 siblings, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2017-11-08 5:43 UTC (permalink / raw)
To: Jason Wang
Cc: Willem de Bruijn, Network Development, LKML, Tom Herbert, Aaron Conole
On Wed, Nov 08, 2017 at 02:28:53PM +0900, Jason Wang wrote:
>
>
> On 2017年11月04日 08:56, Willem de Bruijn wrote:
> > On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > > On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
> > > > This patch introduces an eBPF based queue selection method based on
> > > > the flow steering policy ops. Userspace could load an eBPF program
> > > > through TUNSETSTEERINGEBPF. This gives much more flexibility compare
> > > > to simple but hard coded policy in kernel.
> > > >
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > ---
> > > > +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
> > > > +{
> > > > + struct bpf_prog *prog;
> > > > + u32 fd;
> > > > +
> > > > + if (copy_from_user(&fd, data, sizeof(fd)))
> > > > + return -EFAULT;
> > > > +
> > > > + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
> > > If the idea is to allow guests to pass BPF programs down to the host,
> > > you may want to define a new program type that is more restrictive than
> > > socket filter.
> > >
> > > The external functions allowed for socket filters (sk_filter_func_proto)
> > > are relatively few (compared to, say, clsact), but may still leak host
> > > information to a guest. More importantly, guest security considerations
> > > limits how we can extend socket filters later.
> > Unless the idea is for the hypervisor to prepared the BPF based on a
> > limited set of well defined modes that the guest can configure. Then
> > socket filters are fine, as the BPF is prepared by a regular host process.
>
> Yes, I think the idea is to let qemu to build a BPF program now.
>
> Passing eBPF program from guest to host is interesting, but an obvious issue
> is how to deal with the accessing of map.
>
> Thanks
Fundamentally, I suspect the way to solve it is to allow
the program to specify "should be offloaded to host".
And then it would access the host map rather than the guest map.
Then add some control path API for guest to poke at the host map.
It's not that there's anything special about the host map -
it's just separate from the guest - so if we wanted to
do something that can work on bare-metal we could -
just do something like a namespace and put all host
maps there. But I'm not sure it's worth the complexity.
Cc Aaron who wanted to look at this.
--
MST
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH net-next V2 3/3] tun: add eBPF based queue selection method
2017-11-08 5:43 ` Michael S. Tsirkin
@ 2017-11-08 11:13 ` Jason Wang
0 siblings, 0 replies; 21+ messages in thread
From: Jason Wang @ 2017-11-08 11:13 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Willem de Bruijn, Network Development, LKML, Tom Herbert, Aaron Conole
On 2017年11月08日 14:43, Michael S. Tsirkin wrote:
> On Wed, Nov 08, 2017 at 02:28:53PM +0900, Jason Wang wrote:
>>
>> On 2017年11月04日 08:56, Willem de Bruijn wrote:
>>> On Fri, Nov 3, 2017 at 5:56 PM, Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> On Tue, Oct 31, 2017 at 7:32 PM, Jason Wang <jasowang@redhat.com> wrote:
>>>>> This patch introduces an eBPF based queue selection method based on
>>>>> the flow steering policy ops. Userspace could load an eBPF program
>>>>> through TUNSETSTEERINGEBPF. This gives much more flexibility compare
>>>>> to simple but hard coded policy in kernel.
>>>>>
>>>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>>>> ---
>>>>> +static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
>>>>> +{
>>>>> + struct bpf_prog *prog;
>>>>> + u32 fd;
>>>>> +
>>>>> + if (copy_from_user(&fd, data, sizeof(fd)))
>>>>> + return -EFAULT;
>>>>> +
>>>>> + prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>>>> If the idea is to allow guests to pass BPF programs down to the host,
>>>> you may want to define a new program type that is more restrictive than
>>>> socket filter.
>>>>
>>>> The external functions allowed for socket filters (sk_filter_func_proto)
>>>> are relatively few (compared to, say, clsact), but may still leak host
>>>> information to a guest. More importantly, guest security considerations
>>>> limits how we can extend socket filters later.
>>> Unless the idea is for the hypervisor to prepared the BPF based on a
>>> limited set of well defined modes that the guest can configure. Then
>>> socket filters are fine, as the BPF is prepared by a regular host process.
>> Yes, I think the idea is to let qemu to build a BPF program now.
>>
>> Passing eBPF program from guest to host is interesting, but an obvious issue
>> is how to deal with the accessing of map.
>>
>> Thanks
> Fundamentally, I suspect the way to solve it is to allow
> the program to specify "should be offloaded to host".
>
> And then it would access the host map rather than the guest map.
This looks a big extension.
>
> Then add some control path API for guest to poke at the host map.
Actually, as Willem said, we can even forbid using map through a type,
but this will lose lots of flexibility.
>
> It's not that there's anything special about the host map -
> it's just separate from the guest - so if we wanted to
> do something that can work on bare-metal we could -
> just do something like a namespace and put all host
> maps there. But I'm not sure it's worth the complexity.
>
> Cc Aaron who wanted to look at this.
>
Maybe the first step is to let classic BPF to be passed from guest and
consider eBPF on top.
Thanks
^ permalink raw reply [flat|nested] 21+ messages in thread
end of thread, other threads:[~2017-11-08 11:13 UTC | newest]
Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-10-31 10:32 [PATCH net-next V2 0/3] support changing steering policies in tuntap Jason Wang
2017-10-31 10:32 ` [PATCH net-next V2 1/3] tun: abstract flow steering logic Jason Wang
2017-11-02 1:11 ` Willem de Bruijn
2017-11-02 3:43 ` Jason Wang
2017-11-02 3:45 ` Michael S. Tsirkin
2017-11-02 3:51 ` Jason Wang
2017-11-03 8:49 ` Willem de Bruijn
2017-10-31 10:32 ` [PATCH net-next V2 2/3] tun: introduce ioctls to set and get steering policies Jason Wang
2017-11-02 1:15 ` Willem de Bruijn
2017-10-31 10:32 ` [PATCH net-next V2 3/3] tun: add eBPF based queue selection method Jason Wang
2017-10-31 16:45 ` Michael S. Tsirkin
2017-11-01 13:02 ` Jason Wang
2017-11-01 13:59 ` Michael S. Tsirkin
2017-11-01 19:12 ` Alexei Starovoitov
2017-11-02 3:24 ` Jason Wang
2017-11-02 3:45 ` Jason Wang
2017-11-03 8:56 ` Willem de Bruijn
2017-11-03 23:56 ` Willem de Bruijn
2017-11-08 5:28 ` Jason Wang
2017-11-08 5:43 ` Michael S. Tsirkin
2017-11-08 11:13 ` Jason Wang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).