[net-next,V3] tun: add eBPF based queue selection method
diff mbox series

Message ID 1512379883-11887-1-git-send-email-jasowang@redhat.com
State New, archived
Headers show
Series
  • [net-next,V3] tun: add eBPF based queue selection method
Related show

Commit Message

Jason Wang Dec. 4, 2017, 9:31 a.m. UTC
This patch introduces an eBPF based queue selection method. With this,
the policy could be offloaded to userspace completely through a new
ioctl TUNSETSTEERINGEBPF.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes from V2:
- call rtnl during netdev free
- switch to use call_rcu() to prevent DOS from userspace
- drop the policies setting/getting ioctls and allow detach through
  passing -1 as fd
---
 drivers/net/tun.c           | 145 +++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 123 insertions(+), 23 deletions(-)

Comments

Willem de Bruijn Dec. 5, 2017, 12:16 a.m. UTC | #1
On Mon, Dec 4, 2017 at 4:31 AM, Jason Wang <jasowang@redhat.com> wrote:
> This patch introduces an eBPF based queue selection method. With this,
> the policy could be offloaded to userspace completely through a new
> ioctl TUNSETSTEERINGEBPF.
>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---

> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +       struct tun_steering_prog *prog;
> +       u16 ret = 0;
> +
> +       prog = rcu_dereference(tun->steering_prog);
> +       if (prog)
> +               ret = bpf_prog_run_clear_cb(prog->prog, skb);

This dereferences tun->steering_prog for a second time. It is safe
in this load balancing case to assign a few extra packets to queue 0.
But the issue can also be avoided by replacing the function with a
direct call in tun_net_xmit:

       struct tun_steering_prog *s = rcu_dereference(tun->steering_prog);
       if (s)
               ret = bpf_prog_run_clear_cb(s->prog, skb) % tun->numqueues;

>  /* Net device start xmit */
> -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
>  {
> -       struct tun_struct *tun = netdev_priv(dev);
> -       int txq = skb->queue_mapping;
> -       struct tun_file *tfile;
> -       u32 numqueues = 0;
> -
> -       rcu_read_lock();
> -       tfile = rcu_dereference(tun->tfiles[txq]);
> -       numqueues = READ_ONCE(tun->numqueues);
> -
> -       /* Drop packet if interface is not attached */
> -       if (txq >= numqueues)
> -               goto drop;
> -
>  #ifdef CONFIG_RPS
> -       if (numqueues == 1 && static_key_false(&rps_needed)) {
> +       if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
>                 /* Select queue was not called for the skbuff, so we extract the
>                  * RPS hash and save it into the flow_table here.
>                  */
> @@ -969,6 +986,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>                 }
>         }
>  #endif
> +}
> +
> +/* Net device start xmit */
> +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +       struct tun_struct *tun = netdev_priv(dev);
> +       int txq = skb->queue_mapping;
> +       struct tun_file *tfile;
> +       u32 numqueues = 0;
> +
> +       rcu_read_lock();
> +       tfile = rcu_dereference(tun->tfiles[txq]);
> +       numqueues = READ_ONCE(tun->numqueues);

Now tun->numqueues is read twice, reversing commit fa35864e0bb7
("tuntap: Fix for a race in accessing numqueues"). I don't see anything
left that would cause a divide by zero after the relevant code was
converted from divide to multiply and subsequently even removed.

But if it's safe to read multiple times, might as well remove the READ_ONCE.

> @@ -1551,7 +1588,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>         int copylen;
>         bool zerocopy = false;
>         int err;
> -       u32 rxhash;
> +       u32 rxhash = 0;
>         int skb_xdp = 1;
>         bool frags = tun_napi_frags_enabled(tun);
>
> @@ -1739,7 +1776,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                 rcu_read_unlock();
>         }
>
> -       rxhash = __skb_get_hash_symmetric(skb);
> +       rcu_read_lock();
> +       if (!rcu_dereference(tun->steering_prog))
> +               rxhash = __skb_get_hash_symmetric(skb);
> +       rcu_read_unlock();
>
>         if (frags) {
>                 /* Exercise flow dissector code path. */
> @@ -1783,7 +1823,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>         u64_stats_update_end(&stats->syncp);
>         put_cpu_ptr(stats);
>
> -       tun_flow_update(tun, rxhash, tfile);
> +       if (rxhash)
> +               tun_flow_update(tun, rxhash, tfile);
> +

Nit: isn't zero a valid hash? If so, an int64_t initialized to -1 is the
safer check.
Jason Wang Dec. 5, 2017, 7:29 a.m. UTC | #2
On 2017年12月05日 08:16, Willem de Bruijn wrote:
> On Mon, Dec 4, 2017 at 4:31 AM, Jason Wang <jasowang@redhat.com> wrote:
>> This patch introduces an eBPF based queue selection method. With this,
>> the policy could be offloaded to userspace completely through a new
>> ioctl TUNSETSTEERINGEBPF.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>> +{
>> +       struct tun_steering_prog *prog;
>> +       u16 ret = 0;
>> +
>> +       prog = rcu_dereference(tun->steering_prog);
>> +       if (prog)
>> +               ret = bpf_prog_run_clear_cb(prog->prog, skb);
> This dereferences tun->steering_prog for a second time. It is safe
> in this load balancing case to assign a few extra packets to queue 0.
> But the issue can also be avoided by replacing the function with a
> direct call in tun_net_xmit:
>
>         struct tun_steering_prog *s = rcu_dereference(tun->steering_prog);
>         if (s)
>                 ret = bpf_prog_run_clear_cb(s->prog, skb) % tun->numqueues;

Right.

>
>>   /* Net device start xmit */
>> -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>> +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
>>   {
>> -       struct tun_struct *tun = netdev_priv(dev);
>> -       int txq = skb->queue_mapping;
>> -       struct tun_file *tfile;
>> -       u32 numqueues = 0;
>> -
>> -       rcu_read_lock();
>> -       tfile = rcu_dereference(tun->tfiles[txq]);
>> -       numqueues = READ_ONCE(tun->numqueues);
>> -
>> -       /* Drop packet if interface is not attached */
>> -       if (txq >= numqueues)
>> -               goto drop;
>> -
>>   #ifdef CONFIG_RPS
>> -       if (numqueues == 1 && static_key_false(&rps_needed)) {
>> +       if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
>>                  /* Select queue was not called for the skbuff, so we extract the
>>                   * RPS hash and save it into the flow_table here.
>>                   */
>> @@ -969,6 +986,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>>                  }
>>          }
>>   #endif
>> +}
>> +
>> +/* Net device start xmit */
>> +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>> +{
>> +       struct tun_struct *tun = netdev_priv(dev);
>> +       int txq = skb->queue_mapping;
>> +       struct tun_file *tfile;
>> +       u32 numqueues = 0;
>> +
>> +       rcu_read_lock();
>> +       tfile = rcu_dereference(tun->tfiles[txq]);
>> +       numqueues = READ_ONCE(tun->numqueues);
> Now tun->numqueues is read twice, reversing commit fa35864e0bb7
> ("tuntap: Fix for a race in accessing numqueues"). I don't see anything
> left that would cause a divide by zero after the relevant code was
> converted from divide to multiple and subsequently even removed.
>
> But if it's safe to read multiple times, might as well remove the READ_ONCE.

Good point, but READ_ONCE() is not something new; we'd better change
this in another patch.

>
>> @@ -1551,7 +1588,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>          int copylen;
>>          bool zerocopy = false;
>>          int err;
>> -       u32 rxhash;
>> +       u32 rxhash = 0;
>>          int skb_xdp = 1;
>>          bool frags = tun_napi_frags_enabled(tun);
>>
>> @@ -1739,7 +1776,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>                  rcu_read_unlock();
>>          }
>>
>> -       rxhash = __skb_get_hash_symmetric(skb);
>> +       rcu_read_lock();
>> +       if (!rcu_dereference(tun->steering_prog))
>> +               rxhash = __skb_get_hash_symmetric(skb);
>> +       rcu_read_unlock();
>>
>>          if (frags) {
>>                  /* Exercise flow dissector code path. */
>> @@ -1783,7 +1823,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>>          u64_stats_update_end(&stats->syncp);
>>          put_cpu_ptr(stats);
>>
>> -       tun_flow_update(tun, rxhash, tfile);
>> +       if (rxhash)
>> +               tun_flow_update(tun, rxhash, tfile);
>> +
> Nit: zero is a valid hash? In which case, an int64_t initialized to -1 is the
> safer check.

It looks like it is not. E.g., looking at __flow_hash_from_keys(), it does:

static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
{
     u32 hash;

     __flow_hash_consistentify(keys);

     hash = __flow_hash_words(flow_keys_hash_start(keys),
                  flow_keys_hash_length(keys), keyval);
     if (!hash)
         hash = 1;

     return hash;
}

Thanks
Willem de Bruijn Dec. 5, 2017, 4:13 p.m. UTC | #3
On Tue, Dec 5, 2017 at 2:29 AM, Jason Wang <jasowang@redhat.com> wrote:
>
>
> On 2017年12月05日 08:16, Willem de Bruijn wrote:
>>
>> On Mon, Dec 4, 2017 at 4:31 AM, Jason Wang <jasowang@redhat.com> wrote:
>>>
>>> This patch introduces an eBPF based queue selection method. With this,
>>> the policy could be offloaded to userspace completely through a new
>>> ioctl TUNSETSTEERINGEBPF.
>>>
>>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>>> ---
>>> +static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff
>>> *skb)
>>> +{
>>> +       struct tun_steering_prog *prog;
>>> +       u16 ret = 0;
>>> +
>>> +       prog = rcu_dereference(tun->steering_prog);
>>> +       if (prog)
>>> +               ret = bpf_prog_run_clear_cb(prog->prog, skb);
>>
>> This dereferences tun->steering_prog for a second time. It is safe
>> in this load balancing case to assign a few extra packets to queue 0.
>> But the issue can also be avoided by replacing the function with a
>> direct call in tun_net_xmit:
>>
>>         struct tun_steering_prog *s = rcu_dereference(tun->steering_prog);
>>         if (s)
>>                 ret = bpf_prog_run_clear_cb(s->prog, skb) %
>> tun->numqueues;
>
>
> Right.
>
>
>>
>>>   /* Net device start xmit */
>>> -static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device
>>> *dev)
>>> +static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
>>>   {
>>> -       struct tun_struct *tun = netdev_priv(dev);
>>> -       int txq = skb->queue_mapping;
>>> -       struct tun_file *tfile;
>>> -       u32 numqueues = 0;
>>> -
>>> -       rcu_read_lock();
>>> -       tfile = rcu_dereference(tun->tfiles[txq]);
>>> -       numqueues = READ_ONCE(tun->numqueues);
>>> -
>>> -       /* Drop packet if interface is not attached */
>>> -       if (txq >= numqueues)
>>> -               goto drop;
>>> -
>>>   #ifdef CONFIG_RPS
>>> -       if (numqueues == 1 && static_key_false(&rps_needed)) {
>>> +       if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
>>>                  /* Select queue was not called for the skbuff, so we
>>> extract the
>>>                   * RPS hash and save it into the flow_table here.
>>>                   */
>>> @@ -969,6 +986,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb,
>>> struct net_device *dev)
>>>                  }
>>>          }
>>>   #endif
>>> +}
>>> +
>>> +/* Net device start xmit */
>>> +static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device
>>> *dev)
>>> +{
>>> +       struct tun_struct *tun = netdev_priv(dev);
>>> +       int txq = skb->queue_mapping;
>>> +       struct tun_file *tfile;
>>> +       u32 numqueues = 0;
>>> +
>>> +       rcu_read_lock();
>>> +       tfile = rcu_dereference(tun->tfiles[txq]);
>>> +       numqueues = READ_ONCE(tun->numqueues);
>>
>> Now tun->numqueues is read twice, reversing commit fa35864e0bb7
>> ("tuntap: Fix for a race in accessing numqueues"). I don't see anything
>> left that would cause a divide by zero after the relevant code was
>> converted from divide to multiple and subsequently even removed.
>>
>> But if it's safe to read multiple times, might as well remove the
>> READ_ONCE.
>
>
> Good point, but READ_ONCE() is not something new, we'd better change this in
> another patch.

Sounds good. It's a simple follow-up. I can also send that.
>
>
>>
>>> @@ -1551,7 +1588,7 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>> struct tun_file *tfile,
>>>          int copylen;
>>>          bool zerocopy = false;
>>>          int err;
>>> -       u32 rxhash;
>>> +       u32 rxhash = 0;
>>>          int skb_xdp = 1;
>>>          bool frags = tun_napi_frags_enabled(tun);
>>>
>>> @@ -1739,7 +1776,10 @@ static ssize_t tun_get_user(struct tun_struct
>>> *tun, struct tun_file *tfile,
>>>                  rcu_read_unlock();
>>>          }
>>>
>>> -       rxhash = __skb_get_hash_symmetric(skb);
>>> +       rcu_read_lock();
>>> +       if (!rcu_dereference(tun->steering_prog))
>>> +               rxhash = __skb_get_hash_symmetric(skb);
>>> +       rcu_read_unlock();
>>>
>>>          if (frags) {
>>>                  /* Exercise flow dissector code path. */
>>> @@ -1783,7 +1823,9 @@ static ssize_t tun_get_user(struct tun_struct *tun,
>>> struct tun_file *tfile,
>>>          u64_stats_update_end(&stats->syncp);
>>>          put_cpu_ptr(stats);
>>>
>>> -       tun_flow_update(tun, rxhash, tfile);
>>> +       if (rxhash)
>>> +               tun_flow_update(tun, rxhash, tfile);
>>> +
>>
>> Nit: zero is a valid hash? In which case, an int64_t initialized to -1 is
>> the
>> safer check.
>
>
> Looks not? E.g looking at __flow_hash_from_keys() it did:
>
> static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
> {
>     u32 hash;
>
>     __flow_hash_consistentify(keys);
>
>     hash = __flow_hash_words(flow_keys_hash_start(keys),
>                  flow_keys_hash_length(keys), keyval);
>     if (!hash)
>         hash = 1;
>
>     return hash;
> }
>
> Thanks

Interesting, thanks. In that case

Acked-by: Willem de Bruijn <willemb@google.com>
David Miller Dec. 5, 2017, 5:02 p.m. UTC | #4
From: Jason Wang <jasowang@redhat.com>
Date: Mon,  4 Dec 2017 17:31:23 +0800

> This patch introduces an eBPF based queue selection method. With this,
> the policy could be offloaded to userspace completely through a new
> ioctl TUNSETSTEERINGEBPF.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> Changes from V2:
> - call rtnl during netdev free
> - switch to use call_rcu() to prevent DOS from userspace
> - drop the policies setting/getting ioctls and allow detach through
>   passing -1 as fd

Applied, thanks Jason.

I really wish this driver had newlink/changelink support rather than
us adding all of these ioctls...
Jason Wang Dec. 6, 2017, 2:30 a.m. UTC | #5
On 2017年12月06日 01:02, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Mon,  4 Dec 2017 17:31:23 +0800
>
>> This patch introduces an eBPF based queue selection method. With this,
>> the policy could be offloaded to userspace completely through a new
>> ioctl TUNSETSTEERINGEBPF.
>>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
>> Changes from V2:
>> - call rtnl during netdev free
>> - switch to use call_rcu() to prevent DOS from userspace
>> - drop the policies setting/getting ioctls and allow detach through
>>    passing -1 as fd
> Applied, thanks Jason.
>
> I really wish this driver had newlink/changelink support rather than
> us adding all of these ioctls...

Yes, I will add this to my todo list.

Thanks
Eric Dumazet Dec. 7, 2017, 10:11 p.m. UTC | #6
On Mon, 2017-12-04 at 17:31 +0800, Jason Wang wrote:
> This patch introduces an eBPF based queue selection method. With
> this,
> the policy could be offloaded to userspace completely through a new
> ioctl TUNSETSTEERINGEBPF.

Sorry for the delay, I see this patch was merged already.

...

>  static void tun_free_netdev(struct net_device *dev)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
> @@ -1996,6 +2068,9 @@ static void tun_free_netdev(struct net_device
> *dev)
>  	free_percpu(tun->pcpu_stats);
>  	tun_flow_uninit(tun);
>  	security_tun_dev_free_security(tun->security);
> +	rtnl_lock();
> +	__tun_set_steering_ebpf(tun, NULL);
> +	rtnl_unlock();
>  }

I am pretty sure tun_free_netdev() (aka ->priv_destructor()) can be
called under RTNL (say from register_netdevice())

So this will deadlock badly?
Jason Wang Dec. 8, 2017, 2:25 a.m. UTC | #7
On 2017年12月08日 06:11, Eric Dumazet wrote:
> On Mon, 2017-12-04 at 17:31 +0800, Jason Wang wrote:
>> This patch introduces an eBPF based queue selection method. With
>> this,
>> the policy could be offloaded to userspace completely through a new
>> ioctl TUNSETSTEERINGEBPF.
> Sorry for the delay, I see this patch was merged already.
>
> ...
>
>>   static void tun_free_netdev(struct net_device *dev)
>>   {
>>   	struct tun_struct *tun = netdev_priv(dev);
>> @@ -1996,6 +2068,9 @@ static void tun_free_netdev(struct net_device
>> *dev)
>>   	free_percpu(tun->pcpu_stats);
>>   	tun_flow_uninit(tun);
>>   	security_tun_dev_free_security(tun->security);
>> +	rtnl_lock();
>> +	__tun_set_steering_ebpf(tun, NULL);
>> +	rtnl_unlock();
>>   }
> I am pretty sure tun_free_netdev() (aka ->priv_destructor()) can be
> called under RTNL (say from register_netdevice())
>
> So this will dead lock badly ?
>
>

Unfortunately, yes. I will switch to using a spinlock (tun->lock) to
synchronize here.

Thanks

Patch
diff mbox series

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9574900..f6557e8 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -195,6 +195,11 @@  struct tun_flow_entry {
 
 #define TUN_NUM_FLOW_ENTRIES 1024
 
+struct tun_steering_prog {
+	struct rcu_head rcu;
+	struct bpf_prog *prog;
+};
+
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
  * device, socket filter, sndbuf and vnet header size were restore when the
  * file were attached to a persist device.
@@ -232,6 +237,7 @@  struct tun_struct {
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 	struct bpf_prog __rcu *xdp_prog;
+	struct tun_steering_prog __rcu *steering_prog;
 };
 
 static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -537,15 +543,12 @@  static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
  * different rxq no. here. If we could not get rxhash, then we would
  * hope the rxq no. may help here.
  */
-static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
-			    void *accel_priv, select_queue_fallback_t fallback)
+static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 {
-	struct tun_struct *tun = netdev_priv(dev);
 	struct tun_flow_entry *e;
 	u32 txq = 0;
 	u32 numqueues = 0;
 
-	rcu_read_lock();
 	numqueues = READ_ONCE(tun->numqueues);
 
 	txq = __skb_get_hash_symmetric(skb);
@@ -563,10 +566,37 @@  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 			txq -= numqueues;
 	}
 
-	rcu_read_unlock();
 	return txq;
 }
 
+static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
+{
+	struct tun_steering_prog *prog;
+	u16 ret = 0;
+
+	prog = rcu_dereference(tun->steering_prog);
+	if (prog)
+		ret = bpf_prog_run_clear_cb(prog->prog, skb);
+
+	return ret % tun->numqueues;
+}
+
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
+			    void *accel_priv, select_queue_fallback_t fallback)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	u16 ret;
+
+	rcu_read_lock();
+	if (rcu_dereference(tun->steering_prog))
+		ret = tun_ebpf_select_queue(tun, skb);
+	else
+		ret = tun_automq_select_queue(tun, skb);
+	rcu_read_unlock();
+
+	return ret;
+}
+
 static inline bool tun_not_capable(struct tun_struct *tun)
 {
 	const struct cred *cred = current_cred();
@@ -937,23 +967,10 @@  static int tun_net_close(struct net_device *dev)
 }
 
 /* Net device start xmit */
-static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 {
-	struct tun_struct *tun = netdev_priv(dev);
-	int txq = skb->queue_mapping;
-	struct tun_file *tfile;
-	u32 numqueues = 0;
-
-	rcu_read_lock();
-	tfile = rcu_dereference(tun->tfiles[txq]);
-	numqueues = READ_ONCE(tun->numqueues);
-
-	/* Drop packet if interface is not attached */
-	if (txq >= numqueues)
-		goto drop;
-
 #ifdef CONFIG_RPS
-	if (numqueues == 1 && static_key_false(&rps_needed)) {
+	if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
 		/* Select queue was not called for the skbuff, so we extract the
 		 * RPS hash and save it into the flow_table here.
 		 */
@@ -969,6 +986,26 @@  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 		}
 	}
 #endif
+}
+
+/* Net device start xmit */
+static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+	int txq = skb->queue_mapping;
+	struct tun_file *tfile;
+	u32 numqueues = 0;
+
+	rcu_read_lock();
+	tfile = rcu_dereference(tun->tfiles[txq]);
+	numqueues = READ_ONCE(tun->numqueues);
+
+	/* Drop packet if interface is not attached */
+	if (txq >= numqueues)
+		goto drop;
+
+	if (!rcu_dereference(tun->steering_prog))
+		tun_automq_xmit(tun, skb);
 
 	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 
@@ -1551,7 +1588,7 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int copylen;
 	bool zerocopy = false;
 	int err;
-	u32 rxhash;
+	u32 rxhash = 0;
 	int skb_xdp = 1;
 	bool frags = tun_napi_frags_enabled(tun);
 
@@ -1739,7 +1776,10 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		rcu_read_unlock();
 	}
 
-	rxhash = __skb_get_hash_symmetric(skb);
+	rcu_read_lock();
+	if (!rcu_dereference(tun->steering_prog))
+		rxhash = __skb_get_hash_symmetric(skb);
+	rcu_read_unlock();
 
 	if (frags) {
 		/* Exercise flow dissector code path. */
@@ -1783,7 +1823,9 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	u64_stats_update_end(&stats->syncp);
 	put_cpu_ptr(stats);
 
-	tun_flow_update(tun, rxhash, tfile);
+	if (rxhash)
+		tun_flow_update(tun, rxhash, tfile);
+
 	return total_len;
 }
 
@@ -1988,6 +2030,36 @@  static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
+static void tun_steering_prog_free(struct rcu_head *rcu)
+{
+	struct tun_steering_prog *prog = container_of(rcu,
+					 struct tun_steering_prog, rcu);
+
+	bpf_prog_destroy(prog->prog);
+	kfree(prog);
+}
+
+static int __tun_set_steering_ebpf(struct tun_struct *tun,
+				   struct bpf_prog *prog)
+{
+	struct tun_steering_prog *old, *new = NULL;
+
+	if (prog) {
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->prog = prog;
+	}
+
+	old = rtnl_dereference(tun->steering_prog);
+	rcu_assign_pointer(tun->steering_prog, new);
+
+	if (old)
+		call_rcu(&old->rcu, tun_steering_prog_free);
+
+	return 0;
+}
+
 static void tun_free_netdev(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -1996,6 +2068,9 @@  static void tun_free_netdev(struct net_device *dev)
 	free_percpu(tun->pcpu_stats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
+	rtnl_lock();
+	__tun_set_steering_ebpf(tun, NULL);
+	rtnl_unlock();
 }
 
 static void tun_setup(struct net_device *dev)
@@ -2275,6 +2350,7 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		tun->filter_attached = false;
 		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
 		tun->rx_batched = 0;
+		RCU_INIT_POINTER(tun->steering_prog, NULL);
 
 		tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
 		if (!tun->pcpu_stats) {
@@ -2467,6 +2543,25 @@  static int tun_set_queue(struct file *file, struct ifreq *ifr)
 	return ret;
 }
 
+static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+{
+	struct bpf_prog *prog;
+	int fd;
+
+	if (copy_from_user(&fd, data, sizeof(fd)))
+		return -EFAULT;
+
+	if (fd == -1) {
+		prog = NULL;
+	} else {
+		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	return __tun_set_steering_ebpf(tun, prog);
+}
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg, int ifreq_len)
 {
@@ -2743,6 +2838,10 @@  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = 0;
 		break;
 
+	case TUNSETSTEERINGEBPF:
+		ret = tun_set_steering_ebpf(tun, argp);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 030d3e6..fb38c17 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -57,6 +57,7 @@ 
  */
 #define TUNSETVNETBE _IOW('T', 222, int)
 #define TUNGETVNETBE _IOR('T', 223, int)
+#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001