netdev.vger.kernel.org archive mirror
* [PATCH] net: filter: add insn for loading internal transport header offset
@ 2014-04-30 18:29 Chema Gonzalez
  2014-04-30 22:21 ` Alexei Starovoitov
                   ` (13 more replies)
  0 siblings, 14 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-04-30 18:29 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Alexei Starovoitov; +Cc: Chema Gonzalez, netdev

Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
header offset of a packet ("internal" meaning after decapsulation by the
flow dissector).

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 17 +++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 759abf7..b76ae2b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -224,6 +224,7 @@ enum {
 	BPF_S_ANC_VLAN_TAG_PRESENT,
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
+	BPF_S_ANC_TRA_OFFSET,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 253b4d4..9f1b8f1 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -131,7 +131,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_TRA_OFFSET	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 7c4db3d..e31846a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -601,6 +601,17 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
 	return __skb_get_poff(skb);
 }
 
+static u64 __skb_get_tra_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
+{
+	struct flow_keys keys;
+	struct sk_buff *skb = (struct sk_buff *)(long) ctx;
+
+	if (!skb_flow_dissect(skb, &keys))
+		return 0;
+
+	return keys.thoff;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(long) ctx;
@@ -788,6 +799,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
+	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 		/* arg1 = ctx */
 		insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
 		insn->a_reg = ARG1_REG;
@@ -824,6 +836,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			insn->imm = __get_random_u32 - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+			insn->imm = __skb_get_tra_offset - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1375,6 +1390,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG_PRESENT);
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
+			ANCILLARY(TRA_OFFSET);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1760,6 +1776,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 833a966..4e72934 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -93,6 +93,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_tci") { return K_VLANT; }
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
+"#"?("toff")	{ return K_TOFF; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index e6306c5..ced6949 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -167,6 +167,9 @@ ldb
 	| OP_LDB K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDB K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldh
@@ -218,6 +221,9 @@ ldh
 	| OP_LDH K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDH K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldi
@@ -274,6 +280,9 @@ ld
 	| OP_LD K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LD K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH] net: filter: add insn for loading internal transport header offset
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
@ 2014-04-30 22:21 ` Alexei Starovoitov
  2014-05-01 10:53 ` Daniel Borkmann
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 47+ messages in thread
From: Alexei Starovoitov @ 2014-04-30 22:21 UTC (permalink / raw)
  To: Chema Gonzalez, Daniel Borkmann
  Cc: David S. Miller, Eric Dumazet, Network Development

On Wed, Apr 30, 2014 at 11:29 AM, Chema Gonzalez <chema@google.com> wrote:
> Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
> header offset of a packet ("internal" meaning after decapsulation by the
> flow dissector).
>
> Signed-off-by: Chema Gonzalez <chema@google.com>

Acked-by: Alexei Starovoitov <ast@plumgrid.com>

Looks useful. Some of the tcpdump filters can get much shorter.
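
For example, here is a sketch (illustration only, not part of the patch;
SKF_AD_TRA_OFFSET is the ancillary offset proposed here, everything else
is existing classic BPF) that keeps only packets whose inner-most
transport destination port is 80:

#include <linux/filter.h>

/* sketch: accept packets whose inner-most transport dst port is 80 */
struct sock_filter toff_filter[] = {
	/* A = offset of the inner-most transport header */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_TRA_OFFSET),
	/* X = A */
	BPF_STMT(BPF_MISC | BPF_TAX, 0),
	/* A = halfword at X + 2, i.e. the TCP/UDP destination port */
	BPF_STMT(BPF_LD | BPF_H | BPF_IND, 2),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 80, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* match: keep packet */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* no match: drop */
};

struct sock_fprog toff_prog = {
	.len	= sizeof(toff_filter) / sizeof(toff_filter[0]),
	.filter	= toff_filter,
};

The program would be attached the usual way, i.e. with
setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &toff_prog, sizeof(toff_prog)).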

> ---
>  include/linux/filter.h      |  1 +
>  include/uapi/linux/filter.h |  3 ++-
>  net/core/filter.c           | 17 +++++++++++++++++
>  tools/net/bpf_exp.l         |  1 +
>  tools/net/bpf_exp.y         | 11 ++++++++++-
>  5 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 759abf7..b76ae2b 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -224,6 +224,7 @@ enum {
>         BPF_S_ANC_VLAN_TAG_PRESENT,
>         BPF_S_ANC_PAY_OFFSET,
>         BPF_S_ANC_RANDOM,
> +       BPF_S_ANC_TRA_OFFSET,
>  };
>
>  #endif /* __LINUX_FILTER_H__ */
> diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
> index 253b4d4..9f1b8f1 100644
> --- a/include/uapi/linux/filter.h
> +++ b/include/uapi/linux/filter.h
> @@ -131,7 +131,8 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
>  #define SKF_AD_VLAN_TAG_PRESENT 48
>  #define SKF_AD_PAY_OFFSET      52
>  #define SKF_AD_RANDOM  56
> -#define SKF_AD_MAX     60
> +#define SKF_AD_TRA_OFFSET      60
> +#define SKF_AD_MAX     64
>  #define SKF_NET_OFF   (-0x100000)
>  #define SKF_LL_OFF    (-0x200000)
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 7c4db3d..e31846a 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -601,6 +601,17 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
>         return __skb_get_poff(skb);
>  }
>
> +static u64 __skb_get_tra_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
> +{
> +       struct flow_keys keys;
> +       struct sk_buff *skb = (struct sk_buff *)(long) ctx;

I think kernel style recommends swapping the above two lines,
but that's a nit.
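
I.e., longest declaration first:

	struct sk_buff *skb = (struct sk_buff *)(long) ctx;
	struct flow_keys keys;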

> +
> +       if (!skb_flow_dissect(skb, &keys))
> +               return 0;
> +
> +       return keys.thoff;
> +}
> +
>  static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
>  {
>         struct sk_buff *skb = (struct sk_buff *)(long) ctx;
> @@ -788,6 +799,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
>         case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
>         case SKF_AD_OFF + SKF_AD_CPU:
>         case SKF_AD_OFF + SKF_AD_RANDOM:
> +       case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
>                 /* arg1 = ctx */
>                 insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
>                 insn->a_reg = ARG1_REG;
> @@ -824,6 +836,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
>                 case SKF_AD_OFF + SKF_AD_RANDOM:
>                         insn->imm = __get_random_u32 - __bpf_call_base;
>                         break;
> +               case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
> +                       insn->imm = __skb_get_tra_offset - __bpf_call_base;
> +                       break;
>                 }
>                 break;
>
> @@ -1375,6 +1390,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
>                         ANCILLARY(VLAN_TAG_PRESENT);
>                         ANCILLARY(PAY_OFFSET);
>                         ANCILLARY(RANDOM);
> +                       ANCILLARY(TRA_OFFSET);
>                         }
>
>                         /* ancillary operation unknown or unsupported */
> @@ -1760,6 +1776,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
>                 [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
>                 [BPF_S_ANC_PAY_OFFSET]  = BPF_LD|BPF_B|BPF_ABS,
>                 [BPF_S_ANC_RANDOM]      = BPF_LD|BPF_B|BPF_ABS,
> +               [BPF_S_ANC_TRA_OFFSET]  = BPF_LD|BPF_B|BPF_ABS,
>                 [BPF_S_LD_W_LEN]        = BPF_LD|BPF_W|BPF_LEN,
>                 [BPF_S_LD_W_IND]        = BPF_LD|BPF_W|BPF_IND,
>                 [BPF_S_LD_H_IND]        = BPF_LD|BPF_H|BPF_IND,
> diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
> index 833a966..4e72934 100644
> --- a/tools/net/bpf_exp.l
> +++ b/tools/net/bpf_exp.l
> @@ -93,6 +93,7 @@ extern void yyerror(const char *str);
>  "#"?("vlan_tci") { return K_VLANT; }
>  "#"?("vlan_pr")        { return K_VLANP; }
>  "#"?("rand")   { return K_RAND; }
> +"#"?("toff")   { return K_TOFF; }
>
>  ":"            { return ':'; }
>  ","            { return ','; }
> diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
> index e6306c5..ced6949 100644
> --- a/tools/net/bpf_exp.y
> +++ b/tools/net/bpf_exp.y
> @@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
>  %token OP_LDXI
>
>  %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
> -%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
> +%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
>
>  %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
>
> @@ -167,6 +167,9 @@ ldb
>         | OP_LDB K_RAND {
>                 bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
>                                    SKF_AD_OFF + SKF_AD_RANDOM); }
> +       | OP_LDB K_TOFF {
> +               bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
> +                                  SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
>         ;
>
>  ldh
> @@ -218,6 +221,9 @@ ldh
>         | OP_LDH K_RAND {
>                 bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
>                                    SKF_AD_OFF + SKF_AD_RANDOM); }
> +       | OP_LDH K_TOFF {
> +               bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
> +                                  SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
>         ;
>
>  ldi
> @@ -274,6 +280,9 @@ ld
>         | OP_LD K_RAND {
>                 bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
>                                    SKF_AD_OFF + SKF_AD_RANDOM); }
> +       | OP_LD K_TOFF {
> +               bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
> +                                  SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
>         | OP_LD 'M' '[' number ']' {
>                 bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
>         | OP_LD '[' 'x' '+' number ']' {
> --
> 1.9.1.423.g4596e3a
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] net: filter: add insn for loading internal transport header offset
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
  2014-04-30 22:21 ` Alexei Starovoitov
@ 2014-05-01 10:53 ` Daniel Borkmann
  2014-05-01 10:55   ` Daniel Borkmann
  2014-05-01 18:44 ` [PATCH net-next v2] " Chema Gonzalez
                   ` (11 subsequent siblings)
  13 siblings, 1 reply; 47+ messages in thread
From: Daniel Borkmann @ 2014-05-01 10:53 UTC (permalink / raw)
  To: Chema Gonzalez; +Cc: David S. Miller, Eric Dumazet, Alexei Starovoitov, netdev

[ Chema, please also keep me in cc if possible. ]

On 04/30/2014 08:29 PM, Chema Gonzalez wrote:
> Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
> header offset of a packet ("internal" meaning after decapsulation by the
> flow dissector).
>
> Signed-off-by: Chema Gonzalez <chema@google.com>

Looks good to me. For casting pointers as Dave noted recently, please
use 'unsigned long'. This needs to be fixed in other places as well.
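
I.e., the cast should read:

	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;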

Other than that, you can add my:

Acked-by: Daniel Borkmann <dborkman@redhat.com>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] net: filter: add insn for loading internal transport header offset
  2014-05-01 10:53 ` Daniel Borkmann
@ 2014-05-01 10:55   ` Daniel Borkmann
  2014-05-01 18:44     ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Daniel Borkmann @ 2014-05-01 10:55 UTC (permalink / raw)
  To: Chema Gonzalez; +Cc: David S. Miller, Eric Dumazet, Alexei Starovoitov, netdev

On 05/01/2014 12:53 PM, Daniel Borkmann wrote:
> [ Chema, please also keep me in cc if possible. ]
>
> On 04/30/2014 08:29 PM, Chema Gonzalez wrote:
>> Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
>> header offset of a packet ("internal" meaning after decapsulation by the
>> flow dissector).
>>
>> Signed-off-by: Chema Gonzalez <chema@google.com>
>
> Looks good to me. For casting pointers as Dave noted recently, please
> use 'unsigned long'. This needs to be fixed in other places as well.

Ohh, and please in future include net or net-next into your subject line.

> Other than that, you can add my:
>
> Acked-by: Daniel Borkmann <dborkman@redhat.com>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] net: filter: add insn for loading internal transport header offset
  2014-05-01 10:55   ` Daniel Borkmann
@ 2014-05-01 18:44     ` Chema Gonzalez
  0 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-01 18:44 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: David S. Miller, Eric Dumazet, Alexei Starovoitov, Network Development

On Thu, May 1, 2014 at 3:55 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> [ Chema, please also keep me in cc if possible. ]
Done.

>> Looks good to me. For casting pointers as Dave noted recently, please
>> use 'unsigned long'. This needs to be fixed in other places as well.
Done.

> Ohh, and please in future include net or net-next into your subject line.
Done.

On Wed, Apr 30, 2014 at 3:21 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> +static u64 __skb_get_tra_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
>> +{
>> +       struct flow_keys keys;
>> +       struct sk_buff *skb = (struct sk_buff *)(long) ctx;
>
> I think kernel style recommends swapping the above two lines,
> but that's a nit.
Done.

-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
  2014-04-30 22:21 ` Alexei Starovoitov
  2014-05-01 10:53 ` Daniel Borkmann
@ 2014-05-01 18:44 ` Chema Gonzalez
  2014-05-02 16:21   ` Ben Hutchings
  2014-05-02 21:49   ` David Miller
  2014-05-14 18:42 ` [PATCH v2 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
                   ` (10 subsequent siblings)
  13 siblings, 2 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-01 18:44 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
header offset of a packet ("internal" meaning after decapsulation by the
flow dissector).

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 17 +++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 759abf7..b76ae2b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -224,6 +224,7 @@ enum {
 	BPF_S_ANC_VLAN_TAG_PRESENT,
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
+	BPF_S_ANC_TRA_OFFSET,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 253b4d4..9f1b8f1 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -131,7 +131,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_TRA_OFFSET	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 7c4db3d..8264c79 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -601,6 +601,17 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
 	return __skb_get_poff(skb);
 }
 
+static u64 __skb_get_tra_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
+	struct flow_keys keys;
+
+	if (!skb_flow_dissect(skb, &keys))
+		return 0;
+
+	return keys.thoff;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(long) ctx;
@@ -788,6 +799,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
+	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 		/* arg1 = ctx */
 		insn->code = BPF_ALU64 | BPF_MOV | BPF_X;
 		insn->a_reg = ARG1_REG;
@@ -824,6 +836,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			insn->imm = __get_random_u32 - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+			insn->imm = __skb_get_tra_offset - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1375,6 +1390,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG_PRESENT);
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
+			ANCILLARY(TRA_OFFSET);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1760,6 +1776,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 833a966..4e72934 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -93,6 +93,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_tci") { return K_VLANT; }
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
+"#"?("toff")	{ return K_TOFF; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index e6306c5..ced6949 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -167,6 +167,9 @@ ldb
 	| OP_LDB K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDB K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldh
@@ -218,6 +221,9 @@ ldh
 	| OP_LDH K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDH K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldi
@@ -274,6 +280,9 @@ ld
 	| OP_LD K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LD K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-01 18:44 ` [PATCH net-next v2] " Chema Gonzalez
@ 2014-05-02 16:21   ` Ben Hutchings
  2014-05-02 21:49   ` David Miller
  1 sibling, 0 replies; 47+ messages in thread
From: Ben Hutchings @ 2014-05-02 16:21 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov,
	netdev, Tom Herbert

On Thu, 2014-05-01 at 11:44 -0700, Chema Gonzalez wrote:
> Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
> header offset of a packet ("internal" meaning after decapsulation by the
> flow dissector).
[...]

This turns the behaviour of the flow dissector into a userland ABI,
which must then be stable.  Is it ready for that?

Ben.

-- 
Ben Hutchings
Lowery's Law:
             If it jams, force it. If it breaks, it needed replacing anyway.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-01 18:44 ` [PATCH net-next v2] " Chema Gonzalez
  2014-05-02 16:21   ` Ben Hutchings
@ 2014-05-02 21:49   ` David Miller
  2014-05-03  0:53     ` Chema Gonzalez
  1 sibling, 1 reply; 47+ messages in thread
From: David Miller @ 2014-05-02 21:49 UTC (permalink / raw)
  To: chema; +Cc: edumazet, dborkman, ast, netdev

From: Chema Gonzalez <chema@google.com>
Date: Thu,  1 May 2014 11:44:40 -0700

> Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
> header offset of a packet ("internal" meaning after decapsulation by the
> flow dissector).
> 
> Signed-off-by: Chema Gonzalez <chema@google.com>

I'm not so convinced of this change.

What are the use cases?

If the use cases are to simply pull the ports out of the transport
header for filter matching or hashing, don't just make a half-step
like this.

We can probably add an extension to AF_PACKET which provides the flow
key at the end of the tpacket3_hdr if a certain socket option is set.

That would provide the transport header as well as a side effect, and
be much more powerful and efficient than this particular BPF
instruction.
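
Purely as an illustration of that idea (nothing below exists today, all
of the names are hypothetical), the per-entry data could mirror the
kernel's struct flow_keys, appended behind struct tpacket3_hdr:

#include <linux/types.h>

/* hypothetical sketch only -- not a real uapi structure */
struct tpacket_flow_keys {
	__be32	src;		/* inner-most saddr */
	__be32	dst;		/* inner-most daddr */
	__be16	port16[2];	/* inner-most src/dst ports */
	__u16	thoff;		/* inner-most transport header offset */
	__u8	ip_proto;	/* inner-most L4 protocol */
};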

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-02 21:49   ` David Miller
@ 2014-05-03  0:53     ` Chema Gonzalez
  2014-05-03  2:52       ` David Miller
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-03  0:53 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Daniel Borkmann, Alexei Starovoitov, Network Development

On Fri, May 2, 2014 at 2:49 PM, David Miller <davem@davemloft.net> wrote:
> From: Chema Gonzalez <chema@google.com>
> Date: Thu,  1 May 2014 11:44:40 -0700
>
>> Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
>> header offset of a packet ("internal" meaning after decapsulation by the
>> flow dissector).
>>
>> Signed-off-by: Chema Gonzalez <chema@google.com>
>
> I'm not so convinced of this change.
>
> What are the use cases?
Any BPF filtering that depends on the inner-most L3/L4 header. In
particular, I want to write a BPF filter that depends on some fields
in the inner-most IP/L4 headers (currently the L4 ports, but at some
point I'll need to add the IP ToS field). I want my filter to support
both GRE and VLAN traffic. I started writing the filter
(ethernet-based L2):

start:
ldh [12]
jeq #0x0800, ipv4parsing
jeq #0x86dd, ipv6parsing
jeq #0x88a8, vlanparsing
jeq #0x8100, vlanparsing

ipv4parsing:
ldb [23]
jeq #6, tcpparsing
jeq #17, udpparsing
jeq #47, greparsing
...

vlanparsing:
...get new proto...
jeq #0x0800, ipv4parsing2
jeq #0x86dd, ipv6parsing2
jeq #0x88a8, vlanparsing2
jeq #0x8100, vlanparsing2
...

tcpparsing:
ldh [X+2]  ; dst port
...now we take a decision...

You get the issue: I found myself effectively rewriting the flow
dissector, using BPF syntax (no loops). Not fun. I'd like to use the
actual flow dissector. "ld #poff" already uses it to get the snaplen
it should return. We should be able to access the transport (and
network) headers themselves. Something along the lines of:

ldh #toff  ; now A has the inner-most L4 header offset
ldh [A+2]  ; dst port
...now we take a decision...

ldh #noff  ; now A has the inner-most IP header offset
ldb [A+1]  ; ToS
...now we take a decision...

We probably want the proto (to differentiate IPv4 from IPv6) of the L3 header.

> If the use cases are to simply pull the ports out of the transport
> header for filter matching or hashing, don't just make a half-step
> like this.
I thought about that, but at some point I'll want to use other fields
(the ToS) to make filtering decisions. That means I'll have to do "ld
#noff". I started with #toff because it's straightforward (noff
requires adding the value in flow_keys). But the fact that I need a
field other than the L3 fields that flow_keys exports made me suspect
that somebody will need the same for the L4 fields at some point. I
find that more flexible than just providing the inner-most IP
addrs/ports from flow_keys.

> We can probably add an extension to AF_PACKET which provides the flow
> key at the end of the tpacket3_hdr if a certain socket option is set.
>
> That would provide the transport header as well as a side effect, and
> be much more powerful and efficient than this particular BPF
> instruction.
Once you get the L3 proto and noff/toff, you get everything. The
rationale of adding toff/noff is that it's very similar to poff, which
we already have.

-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-03  0:53     ` Chema Gonzalez
@ 2014-05-03  2:52       ` David Miller
  2014-05-05 18:42         ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: David Miller @ 2014-05-03  2:52 UTC (permalink / raw)
  To: chema; +Cc: edumazet, dborkman, ast, netdev

From: Chema Gonzalez <chema@google.com>
Date: Fri, 2 May 2014 17:53:01 -0700

> On Fri, May 2, 2014 at 2:49 PM, David Miller <davem@davemloft.net> wrote:
>> We can probably add an extension to AF_PACKET which provides the flow
>> key at the end of the tpacket3_hdr if a certain socket option is set.
>>
>> That would provide the transport header as well as a side effect, and
>> be much more powerful and efficient than this particular BPF
>> instruction.
> Once you get the L3 proto and noff/toff, you get everything. The
> rationale of adding toff/noff is that it's very similar to poff, which
> we already have.

But your implementation is calling a function to do all of that work
already, just to get only the transport header offset, and then throwing
away all of the other useful bits that function is calculating, so you'll
end up doing it twice, which is wasted work.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-03  2:52       ` David Miller
@ 2014-05-05 18:42         ` Chema Gonzalez
  2014-05-05 19:12           ` David Miller
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-05 18:42 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Daniel Borkmann, Alexei Starovoitov, Network Development

On Fri, May 2, 2014 at 7:52 PM, David Miller <davem@davemloft.net> wrote:
>> Once you get the L3 proto and noff/toff, you get everything. The
>> rationale of adding toff/noff is that it's very similar to poff, which
>> we already have.
>
> But your implementation is calling a function to do all of that work
> already, just to get only the transport header offset, and then throwing
> away all of the other useful bits that function is calculating, so you'll
> end up doing it twice, which is wasted work.
The problem here is that (classic) BPF works only with 2 registers. We
need at least 2 fields to get a valid point in the packet, namely the
offset and the protocol of the inner-most header (either L3 or L4).
The idea of this patch is to run the flow dissector twice, once to get
the protocol type, and a second time to get the offset (this patch
implements the L4 header offset part). This is 2 calls, but it
provides flow dissection inside BPF.

I guess we could have this call return 2 values, one in A, the other
in X. This is strange in BPF, but we could do it (and then the user
should probably push one of the values right away if she wants to use
it later). Does it sound good?

> We can probably add an extension to AF_PACKET which provides the flow
> key at the end of the tpacket3_hdr if a certain socket option is set.
>
> That would provide the transport header as well as a side effect, and
> be much more powerful and efficient than this particular BPF
> instruction.
I'm not sure whether I follow this. The goal is to be able to access
the inner-most headers inside BPF, not in userland by calling
getsockopt().

-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-05 18:42         ` Chema Gonzalez
@ 2014-05-05 19:12           ` David Miller
  2014-05-14 18:42             ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: David Miller @ 2014-05-05 19:12 UTC (permalink / raw)
  To: chema; +Cc: edumazet, dborkman, ast, netdev

From: Chema Gonzalez <chema@google.com>
Date: Mon, 5 May 2014 11:42:00 -0700

> On Fri, May 2, 2014 at 7:52 PM, David Miller <davem@davemloft.net> wrote:
>> We can probably add an extension to AF_PACKET which provides the flow
>> key at the end of the tpacket3_hdr if a certain socket option is set.
>>
>> That would provide the transport header as well as a side effect, and
>> be much more powerful and efficient than this particular BPF
>> instruction.
> I'm not sure whether I follow this. The goal is to be able to access
> the inner-most headers inside BPF, not in userland by calling
> getsockopt().

You're missing my entire point.

You can use AF_PACKET mmap() rings and in those ring entries all of the
flow dissection information can be put in the ring entry headers before
the packet contents.  Ports, header offsets, everything.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH v2 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (2 preceding siblings ...)
  2014-05-01 18:44 ` [PATCH net-next v2] " Chema Gonzalez
@ 2014-05-14 18:42 ` Chema Gonzalez
  2014-05-14 18:42   ` [PATCH v2 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
  2014-05-14 18:42   ` [PATCH v2 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
  2014-05-14 18:51 ` [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
                   ` (9 subsequent siblings)
  13 siblings, 2 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:42 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

We want multiple calls to __skb_get_poff() in the same filter to only
cause one invocation of the flow dissector. In order to reuse the result
of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
variable in the eBPF runner stack (__sk_run_filter() function), and pass
it as an argument to __skb_get_poff(). __skb_get_poff() inits the variable
the very first time it is called, and reuses the result in any further
invocation.

We also add handy functions to init a flow_keys struct and to check
whether it has been inited.

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/skbuff.h    |  4 +++-
 net/core/filter.c         | 14 ++++++++++++--
 net/core/flow_dissector.c | 28 ++++++++++++++++++++++------
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a9beeb..176ec05 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3065,7 +3065,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
 
-u32 __skb_get_poff(const struct sk_buff *skb);
+void __flow_keys_init(struct flow_keys *flow);
+bool __flow_keys_inited(struct flow_keys *flow);
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow);
 
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
diff --git a/net/core/filter.c b/net/core/filter.c
index c442a0d..b71948b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -66,6 +66,11 @@
 #define CTX	regs[BPF_REG_CTX]
 #define K	insn->imm
 
+struct sk_run_filter_ctx {
+	struct sk_buff *skb;
+	struct flow_keys *flow;
+};
+
 /* No hurry in this branch
  *
  * Exported for the bpf jit load helper.
@@ -252,12 +257,15 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
 	};
 	void *ptr;
 	int off;
+	struct flow_keys flow;
+	struct sk_run_filter_ctx context = { ctx, &flow };
 
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
 
+	memset(&flow, 0, sizeof(flow));
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
+	ARG1 = (u64) (unsigned long) &context;
 
 	/* Register for user BPF programs need to be reset first. */
 	regs[BPF_REG_A] = 0;
@@ -602,7 +610,9 @@ static unsigned int pkt_type_offset(void)
 
 static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
-	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	return __skb_get_poff(context->skb, context->flow);
 }
 
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12..0f6bf73 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -270,21 +270,37 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 
+/* __flow_keys_init() inits a flow_keys struct */
+void __flow_keys_init(struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(struct flow_keys));
+}
+
+/* __flow_keys_inited() checks whether a flow_keys struct is init */
+bool __flow_keys_inited(struct flow_keys *flow)
+{
+	struct flow_keys zero_flow;
+
+	__flow_keys_init(&zero_flow);
+	return memcmp(flow, &zero_flow, sizeof(struct flow_keys));
+}
+
 /* __skb_get_poff() returns the offset to the payload as far as it could
  * be dissected. The main user is currently BPF, so that we can dynamically
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow)
 {
-	struct flow_keys keys;
 	u32 poff = 0;
 
-	if (!skb_flow_dissect(skb, &keys))
-		return 0;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(flow))
+		if (!skb_flow_dissect(skb, flow))
+			return 0;
 
-	poff += keys.thoff;
-	switch (keys.ip_proto) {
+	poff += flow->thoff;
+	switch (flow->ip_proto) {
 	case IPPROTO_TCP: {
 		const struct tcphdr *tcph;
 		struct tcphdr _tcph;
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v2 net-next 2/3] net: filter: add insn for loading internal transport header offset
  2014-05-14 18:42 ` [PATCH v2 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
@ 2014-05-14 18:42   ` Chema Gonzalez
  2014-05-14 18:42   ` [PATCH v2 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
  1 sibling, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:42 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
header offset of a packet ("internal" meaning after decapsulation by the
flow dissector).

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 17 +++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4457b38..72b3e63 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -306,6 +306,7 @@ enum {
 	BPF_S_ANC_VLAN_TAG_PRESENT,
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
+	BPF_S_ANC_TRA_OFFSET,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 253b4d4..9f1b8f1 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -131,7 +131,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_TRA_OFFSET	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index b71948b..c4ba8f9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -615,6 +615,17 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return __skb_get_poff(context->skb, context->flow);
 }
 
+static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(context->flow))
+		if (!skb_flow_dissect(context->skb, context->flow))
+			return 0;
+	return context->flow->thoff;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -781,6 +792,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
+	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -811,6 +823,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			insn->imm = __get_random_u32 - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+			insn->imm = __skb_get_tra_offset - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1348,6 +1363,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG_PRESENT);
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
+			ANCILLARY(TRA_OFFSET);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1733,6 +1749,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 833a966..4e72934 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -93,6 +93,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_tci") { return K_VLANT; }
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
+"#"?("toff")	{ return K_TOFF; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index e6306c5..ced6949 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -167,6 +167,9 @@ ldb
 	| OP_LDB K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDB K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldh
@@ -218,6 +221,9 @@ ldh
 	| OP_LDH K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDH K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldi
@@ -274,6 +280,9 @@ ld
 	| OP_LD K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LD K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v2 net-next 3/3] net: filter: add insn for loading internal transport header proto
  2014-05-14 18:42 ` [PATCH v2 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
  2014-05-14 18:42   ` [PATCH v2 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
@ 2014-05-14 18:42   ` Chema Gonzalez
  1 sibling, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:42 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_PROTOCOL insn that loads the protocol of
the internal transport header of a packet ("internal" meaning after
decapsulation by the flow dissector).
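
For illustration only (this sketch is not part of the diff below), a
classic BPF filter can combine this with SKF_AD_TRA_OFFSET from the
previous patch to match inner-most TCP traffic to port 80:

#include <netinet/in.h>
#include <linux/filter.h>

struct sock_filter tproto_filter[] = {
	/* A = protocol of the inner-most transport header */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_TRA_PROTOCOL),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IPPROTO_TCP, 0, 5),  /* !TCP: drop */
	/* A = offset of the inner-most transport header, then X = A */
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_TRA_OFFSET),
	BPF_STMT(BPF_MISC | BPF_TAX, 0),
	BPF_STMT(BPF_LD | BPF_H | BPF_IND, 2),		/* A = dst port */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 80, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffff),		/* match: keep */
	BPF_STMT(BPF_RET | BPF_K, 0),			/* drop */
};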

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 17 +++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 72b3e63..cc8e333 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -307,6 +307,7 @@ enum {
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
 	BPF_S_ANC_TRA_OFFSET,
+	BPF_S_ANC_TRA_PROTOCOL,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 9f1b8f1..e34d608 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -132,7 +132,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
 #define SKF_AD_TRA_OFFSET	60
-#define SKF_AD_MAX	64
+#define SKF_AD_TRA_PROTOCOL	64
+#define SKF_AD_MAX	68
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index c4ba8f9..c75eba6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -626,6 +626,17 @@ static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return context->flow->thoff;
 }
 
+static u64 __skb_get_tra_protocol(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(context->flow))
+		if (!skb_flow_dissect(context->skb, context->flow))
+			return 0;
+	return context->flow->ip_proto;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -793,6 +804,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
 	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+	case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -826,6 +838,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 			insn->imm = __skb_get_tra_offset - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
+			insn->imm = __skb_get_tra_protocol - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1364,6 +1379,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
 			ANCILLARY(TRA_OFFSET);
+			ANCILLARY(TRA_PROTOCOL);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1750,6 +1766,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 4e72934..c1b5958 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -94,6 +94,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
 "#"?("toff")	{ return K_TOFF; }
+"#"?("tproto")	{ return K_TPROTO; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index ced6949..71fef72 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF K_TPROTO
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -170,6 +170,9 @@ ldb
 	| OP_LDB K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDB K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldh
@@ -224,6 +227,9 @@ ldh
 	| OP_LDH K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDH K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldi
@@ -283,6 +289,9 @@ ld
 	| OP_LD K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LD K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-05 19:12           ` David Miller
@ 2014-05-14 18:42             ` Chema Gonzalez
  2014-05-14 18:51               ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:42 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Daniel Borkmann, Alexei Starovoitov, Network Development

On Mon, May 5, 2014 at 12:12 PM, David Miller <davem@davemloft.net> wrote:
> From: Chema Gonzalez <chema@google.com>
> Date: Mon, 5 May 2014 11:42:00 -0700
>
>> On Fri, May 2, 2014 at 7:52 PM, David Miller <davem@davemloft.net> wrote:
>>> We can probably add an extension to AF_PACKET which provides the flow
>>> key at the end of the tpacket3_hdr if a certain socket option is set.
>>>
>>> That would provide the transport header as well as a side effect, and
>>> be much more powerful and efficient than this particular BPF
>>> instruction.
>> I'm not sure whether I follow this. The goal is to be able to access
>> the inner-most headers inside BPF, not in userland by calling
>> getsockopt().
>
> You're missing my entire point.
>
> You can use AF_PACKET mmap() rings and in those ring entries all of the
> flow dissection information can be put in the ring entry headers before
> the packet contents.  Ports, header offsets, everything.
I added a flow_keys variable on the stack of the packet runner function
(__sk_run_filter()), and modified __skb_get_poff() to reuse the result
from there instead of calling the flow dissector every time it is
invoked. That allows the packet filter to perform only a single call to
the flow dissector per packet.

I reworked the toff patch to use the same flow dissector output
approach (and share it with the poff load), and added a tproto patch.

Patches coming now.

-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH net-next v2] net: filter: add insn for loading internal transport header offset
  2014-05-14 18:42             ` Chema Gonzalez
@ 2014-05-14 18:51               ` Chema Gonzalez
  0 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:51 UTC (permalink / raw)
  To: David Miller
  Cc: Eric Dumazet, Daniel Borkmann, Alexei Starovoitov, Network Development

Just realized flow_keys_init() only zeroes the flow_keys, which is not
the same as it being inited. I renamed the function flow_keys_reset().
Sending v3.

-Chema


On Wed, May 14, 2014 at 11:42 AM, Chema Gonzalez <chema@google.com> wrote:
> On Mon, May 5, 2014 at 12:12 PM, David Miller <davem@davemloft.net> wrote:
>> From: Chema Gonzalez <chema@google.com>
>> Date: Mon, 5 May 2014 11:42:00 -0700
>>
>>> On Fri, May 2, 2014 at 7:52 PM, David Miller <davem@davemloft.net> wrote:
>>>> We can probably add an extension to AF_PACKET which provides the flow
>>>> key at the end of the tpacket3_hdr if a certain socket option is set.
>>>>
>>>> That would provide the transport header as well as a side effect, and
>>>> be much more powerful and efficient than this particular BPF
>>>> instruction.
>>> I'm not sure whether I follow this. The goal is to be able to access
>>> the inner-most headers inside BPF, not in userland by calling
>>> getsockopt().
>>
>> You're missing my entire point.
>>
>> You can use AF_PACKET mmap() rings and in those ring entries all of the
>> flow dissection information can be put in the ring entry headers before
>> the packet contents.  Ports, header offsets, everything.
> I added a flow_keys variable on the stack of the packet runner function
> (__sk_run_filter()), and modified __skb_get_poff() to reuse the result
> from there instead of calling the flow dissector every time it is
> invoked. That allows the packet filter to perform only a single call to
> the flow dissector per packet.
>
> I reworked the toff patch to use the same flow dissector output
> approach (and share it with the poff load), and added a tproto patch.
>
> Patches coming now.
>
> -Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (3 preceding siblings ...)
  2014-05-14 18:42 ` [PATCH v2 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
@ 2014-05-14 18:51 ` Chema Gonzalez
  2014-05-14 20:05   ` Alexei Starovoitov
  2014-05-14 18:51 ` [PATCH v3 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (8 subsequent siblings)
  13 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:51 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

We want multiple calls to __skb_get_poff() in the same filter to only
cause one invocation of the flow dissector. In order to reuse the result
of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
variable in the eBPF runner stack (__sk_run_filter() function), and pass
it as an argument to __skb_get_poff(). __skb_get_poff() inits the variable
the very first time it is called, and reuses the result in any further
invocation.

We also add handy functions to reset a flow_keys struct and to check
whether it has been inited.

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/skbuff.h    |  4 +++-
 net/core/filter.c         | 14 ++++++++++++--
 net/core/flow_dissector.c | 28 ++++++++++++++++++++++------
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a9beeb..858d7af 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3065,7 +3065,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
 
-u32 __skb_get_poff(const struct sk_buff *skb);
+void __flow_keys_reset(struct flow_keys *flow);
+bool __flow_keys_inited(struct flow_keys *flow);
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow);
 
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
diff --git a/net/core/filter.c b/net/core/filter.c
index c442a0d..b71948b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -66,6 +66,11 @@
 #define CTX	regs[BPF_REG_CTX]
 #define K	insn->imm
 
+struct sk_run_filter_ctx {
+	struct sk_buff *skb;
+	struct flow_keys *flow;
+};
+
 /* No hurry in this branch
  *
  * Exported for the bpf jit load helper.
@@ -252,12 +257,15 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
 	};
 	void *ptr;
 	int off;
+	struct flow_keys flow;
+	struct sk_run_filter_ctx context = { ctx, &flow };
 
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
 
+	memset(&flow, 0, sizeof(flow));
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
+	ARG1 = (u64) (unsigned long) &context;
 
 	/* Register for user BPF programs need to be reset first. */
 	regs[BPF_REG_A] = 0;
@@ -602,7 +610,9 @@ static unsigned int pkt_type_offset(void)
 
 static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
-	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	return __skb_get_poff(context->skb, context->flow);
 }
 
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12..4d431f0 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -270,21 +270,37 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 
+/* __flow_keys_reset() zeroes a flow_keys struct */
+void __flow_keys_reset(struct flow_keys *flow)
+{
+	memset(flow, 0, sizeof(struct flow_keys));
+}
+
+/* __flow_keys_inited() checks whether a flow_keys struct is init */
+bool __flow_keys_inited(struct flow_keys *flow)
+{
+	struct flow_keys zero_flow;
+
+	__flow_keys_reset(&zero_flow);
+	return memcmp(flow, &zero_flow, sizeof(struct flow_keys));
+}
+
 /* __skb_get_poff() returns the offset to the payload as far as it could
  * be dissected. The main user is currently BPF, so that we can dynamically
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow)
 {
-	struct flow_keys keys;
 	u32 poff = 0;
 
-	if (!skb_flow_dissect(skb, &keys))
-		return 0;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(flow))
+		if (!skb_flow_dissect(skb, flow))
+			return 0;
 
-	poff += keys.thoff;
-	switch (keys.ip_proto) {
+	poff += flow->thoff;
+	switch (flow->ip_proto) {
 	case IPPROTO_TCP: {
 		const struct tcphdr *tcph;
 		struct tcphdr _tcph;
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v3 net-next 2/3] net: filter: add insn for loading internal transport header offset
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (4 preceding siblings ...)
  2014-05-14 18:51 ` [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
@ 2014-05-14 18:51 ` Chema Gonzalez
  2014-05-14 18:51 ` [PATCH v3 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:51 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
header offset of a packet ("internal" meaning after decapsulation by the
flow dissector).

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 17 +++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4457b38..72b3e63 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -306,6 +306,7 @@ enum {
 	BPF_S_ANC_VLAN_TAG_PRESENT,
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
+	BPF_S_ANC_TRA_OFFSET,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 253b4d4..9f1b8f1 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -131,7 +131,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_TRA_OFFSET	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index b71948b..c4ba8f9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -615,6 +615,17 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return __skb_get_poff(context->skb, context->flow);
 }
 
+static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(context->flow))
+		if (!skb_flow_dissect(context->skb, context->flow))
+			return 0;
+	return context->flow->thoff;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -781,6 +792,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
+	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -811,6 +823,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			insn->imm = __get_random_u32 - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+			insn->imm = __skb_get_tra_offset - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1348,6 +1363,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG_PRESENT);
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
+			ANCILLARY(TRA_OFFSET);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1733,6 +1749,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 833a966..4e72934 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -93,6 +93,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_tci") { return K_VLANT; }
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
+"#"?("toff")	{ return K_TOFF; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index e6306c5..ced6949 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -167,6 +167,9 @@ ldb
 	| OP_LDB K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDB K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldh
@@ -218,6 +221,9 @@ ldh
 	| OP_LDH K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDH K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldi
@@ -274,6 +280,9 @@ ld
 	| OP_LD K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LD K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v3 net-next 3/3] net: filter: add insn for loading internal transport header proto
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (5 preceding siblings ...)
  2014-05-14 18:51 ` [PATCH v3 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
@ 2014-05-14 18:51 ` Chema Gonzalez
  2014-05-16 18:41 ` [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
                   ` (6 subsequent siblings)
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 18:51 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_PROTOCOL insn that loads the protocol of
the internal transport header of a packet ("internal" meaning after
decapsulation by the flow dissector).

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 17 +++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 72b3e63..cc8e333 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -307,6 +307,7 @@ enum {
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
 	BPF_S_ANC_TRA_OFFSET,
+	BPF_S_ANC_TRA_PROTOCOL,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 9f1b8f1..e34d608 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -132,7 +132,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
 #define SKF_AD_TRA_OFFSET	60
-#define SKF_AD_MAX	64
+#define SKF_AD_TRA_PROTOCOL	64
+#define SKF_AD_MAX	68
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index c4ba8f9..c75eba6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -626,6 +626,17 @@ static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return context->flow->thoff;
 }
 
+static u64 __skb_get_tra_protocol(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+	struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
+			(unsigned long) ctx;
+	/* check whether the flow dissector has already been run */
+	if (!__flow_keys_inited(context->flow))
+		if (!skb_flow_dissect(context->skb, context->flow))
+			return 0;
+	return context->flow->ip_proto;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -793,6 +804,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
 	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+	case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -826,6 +838,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 			insn->imm = __skb_get_tra_offset - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
+			insn->imm = __skb_get_tra_protocol - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1364,6 +1379,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
 			ANCILLARY(TRA_OFFSET);
+			ANCILLARY(TRA_PROTOCOL);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1750,6 +1766,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 4e72934..c1b5958 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -94,6 +94,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
 "#"?("toff")	{ return K_TOFF; }
+"#"?("tproto")	{ return K_TPROTO; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index ced6949..71fef72 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF K_TPROTO
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -170,6 +170,9 @@ ldb
 	| OP_LDB K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDB K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldh
@@ -224,6 +227,9 @@ ldh
 	| OP_LDH K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDH K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldi
@@ -283,6 +289,9 @@ ld
 	| OP_LD K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LD K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-14 18:51 ` [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
@ 2014-05-14 20:05   ` Alexei Starovoitov
  2014-05-14 21:51     ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Alexei Starovoitov @ 2014-05-14 20:05 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Network Development

On Wed, May 14, 2014 at 11:51 AM, Chema Gonzalez <chema@google.com> wrote:
> We want multiple calls to __skb_get_poff() in the same filter to cause
> only one invocation of the flow dissector. In order to reuse the result
> of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
> variable in the eBPF runner stack (__sk_run_filter() function), and pass
> it as an argument to __skb_get_poff(). __skb_get_poff() inits the variable
> the very first time it is called, and reuses the result in any further
> invocation.
>
> We also add handy functions to init and check for an inited flow_keys.
>
> Signed-off-by: Chema Gonzalez <chema@google.com>
> ---
>  include/linux/skbuff.h    |  4 +++-
>  net/core/filter.c         | 14 ++++++++++++--
>  net/core/flow_dissector.c | 28 ++++++++++++++++++++++------
>  3 files changed, 37 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 7a9beeb..858d7af 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3065,7 +3065,9 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
>
>  int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
>
> -u32 __skb_get_poff(const struct sk_buff *skb);
> +void __flow_keys_reset(struct flow_keys *flow);
> +bool __flow_keys_inited(struct flow_keys *flow);
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow);
>
>  /**
>   * skb_head_is_locked - Determine if the skb->head is locked down
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c442a0d..b71948b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -66,6 +66,11 @@
>  #define CTX    regs[BPF_REG_CTX]
>  #define K      insn->imm
>
> +struct sk_run_filter_ctx {
> +       struct sk_buff *skb;
> +       struct flow_keys *flow;
> +};
> +
>  /* No hurry in this branch
>   *
>   * Exported for the bpf jit load helper.
> @@ -252,12 +257,15 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
>         };
>         void *ptr;
>         int off;
> +       struct flow_keys flow;
> +       struct sk_run_filter_ctx context = { ctx, &flow };
>
>  #define CONT    ({ insn++; goto select_insn; })
>  #define CONT_JMP ({ insn++; goto select_insn; })
>
> +       memset(&flow, 0, sizeof(flow));
>         FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
> -       ARG1 = (u64) (unsigned long) ctx;
> +       ARG1 = (u64) (unsigned long) &context;

I have a feeling that you didn't test this. Even basic "tcpdump port 22"
will be broken, let alone BPF testsuite. It also breaks seccomp, new JIT
and tracing filter patches I've posted yesterday.
sk_run_filter() is a generic interpreter that should be suitable for
sockets, seccomp, tracing, etc. Adding one use case specific data
structure to interpreter is not desirable.
Doing memset(&flow,0...) every invocation is not performance
friendly either. Majority of filters will suffer.

You can do the same without touching sk_run_filter().
For example, you can tweak sk_convert_filter() to pass
R10(frame pointer) as arg4 to helper functions
(__skb_get_pay_offset() and others)
then inside those functions cast 'arg4 - appropriate offset'
to struct flow_keys and use it.
BPF was extended from 16 spill/fill only slots to generic
stack space exactly for this type of use cases.
Key point: you can add pretty much any feature to classic BPF
by tweaking converter from classic to internal without changing
interpreter and causing trainwreck for non-socket use cases.
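
For illustration, this frame-pointer arithmetic can be sketched in
stand-alone user-space C. The toy types, helper names and values below
are placeholders, not the kernel's; in the kernel the cast would live
inside the existing helpers and use the real struct flow_keys:

#include <stdio.h>
#include <string.h>

#define BPF_MEMWORDS 16

struct toy_flow_keys {			/* stand-in for struct flow_keys */
	unsigned int thoff;
	unsigned int ip_proto;
};

/* every helper gets the frame pointer (r4 in the proposal) and derives
 * the same scratch slot sitting just below the classic mem[0..15] words
 */
static struct toy_flow_keys *scratch_keys(void *fp)
{
	return (struct toy_flow_keys *)
		((char *)fp - BPF_MEMWORDS * 4 - sizeof(struct toy_flow_keys));
}

static unsigned int toy_get_poff(void *fp)
{
	struct toy_flow_keys *keys = scratch_keys(fp);

	keys->thoff = 34;		/* pretend skb_flow_dissect() filled this in */
	keys->ip_proto = 6;
	return keys->thoff + 20;	/* e.g. skip an option-less TCP header */
}

static unsigned int toy_get_toff(void *fp)
{
	/* reuses whatever the previous helper left in the shared slot */
	return scratch_keys(fp)->thoff;
}

int main(void)
{
	unsigned long long stack[64];	/* interpreter stack */
	void *fp = &stack[64];		/* frame pointer = top of the stack */
	unsigned int poff, toff;

	memset(stack, 0, sizeof(stack));
	poff = toy_get_poff(fp);
	toff = toy_get_toff(fp);	/* no second dissection needed */
	printf("poff=%u toff=%u\n", poff, toff);
	return 0;
}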

The idea itself I think is quite interesting. I'm only disliking its
implementation.

>         /* Register for user BPF programs need to be reset first. */
>         regs[BPF_REG_A] = 0;
> @@ -602,7 +610,9 @@ static unsigned int pkt_type_offset(void)
>
>  static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
>  {
> -       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
> +       struct sk_run_filter_ctx *context = (struct sk_run_filter_ctx *)
> +                       (unsigned long) ctx;
> +       return __skb_get_poff(context->skb, context->flow);
>  }
>
>  static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12..4d431f0 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -270,21 +270,37 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
>  }
>  EXPORT_SYMBOL(__skb_tx_hash);
>
> +/* __flow_keys_reset() zeroes a flow_keys struct */
> +void __flow_keys_reset(struct flow_keys *flow)
> +{
> +       memset(flow, 0, sizeof(struct flow_keys));
> +}
> +
> +/* __flow_keys_inited() checks whether a flow_keys struct is init */
> +bool __flow_keys_inited(struct flow_keys *flow)
> +{
> +       struct flow_keys zero_flow;
> +
> +       __flow_keys_reset(&zero_flow);
> +       return memcmp(flow, &zero_flow, sizeof(struct flow_keys));
> +}
> +
>  /* __skb_get_poff() returns the offset to the payload as far as it could
>   * be dissected. The main user is currently BPF, so that we can dynamically
>   * truncate packets without needing to push actual payload to the user
>   * space and can analyze headers only, instead.
>   */
> -u32 __skb_get_poff(const struct sk_buff *skb)
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow)
>  {
> -       struct flow_keys keys;
>         u32 poff = 0;
>
> -       if (!skb_flow_dissect(skb, &keys))
> -               return 0;
> +       /* check whether the flow dissector has already been run */
> +       if (!__flow_keys_inited(flow))
> +               if (!skb_flow_dissect(skb, flow))
> +                       return 0;
>
> -       poff += keys.thoff;
> -       switch (keys.ip_proto) {
> +       poff += flow->thoff;
> +       switch (flow->ip_proto) {
>         case IPPROTO_TCP: {
>                 const struct tcphdr *tcph;
>                 struct tcphdr _tcph;
> --
> 1.9.1.423.g4596e3a
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-14 20:05   ` Alexei Starovoitov
@ 2014-05-14 21:51     ` Chema Gonzalez
  2014-05-14 22:44       ` Alexei Starovoitov
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-14 21:51 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Network Development

Hi,

On Wed, May 14, 2014 at 1:05 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> I have a feeling that you didn't test this. Even basic "tcpdump port 22"
> will be broken,
Not sure whether you understood the use of this. I'm not trying to
change how to compile tcpdump expressions. The only way to access the
new features is using Daniel's bpf_asm code.

I did test the code:

$ cat tools/net/ipv4_tcp_poff2.bpf
ldh [12]
jne #0x800, drop
ldb [23]
jneq #6, drop
ld poff
ld poff
ld poff
ld poff
ld toff
ld toff
ld toff
ld tproto
ld tproto
ld tproto
ret #-1
drop: ret #0
$ ./tools/net/bpf_asm tools/net/ipv4_tcp_poff2.bpf
16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11 6,32 0 0 4294963252,32 0 0
4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963260,32 0
0 4294963260,32 0 0 4294963260,32 0 0 4294963264,32 0 0 4294963264,32
0 0 4294963264,6 0 0 4294967295,6 0 0 0,

And then, in a VM, I ran:

$ tcpdump -n -i eth0 -f "16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11
6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0
4294963252,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963260,32 0
0 4294963264,32 0 0 4294963264,32 0 0 4294963264,6 0 0 4294967295,6 0
0 0,"

This tcpdump is github's tcpdump HEAD with
https://github.com/the-tcpdump-group/libpcap/pull/353.

Adding some labels let me see how the flow dissector was only called
for the first "ld poff":

[    7.003362] --------__skb_get_poff(): checking flow dissector: {0,
0, 0, 0, 0} inited returns 0
[    7.005618] --------__skb_get_poff(): after calling flow dissector:
{-26957632, 23374016, 369113781, 34, 6}
[    7.006821] --------__skb_get_poff(): checking flow dissector:
{-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.008156] --------__skb_get_poff(): checking flow dissector:
{-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.009485] --------__skb_get_poff(): checking flow dissector:
{-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.010827] --------__skb_get_tra_offset(): checking flow
dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.013630] --------__skb_get_tra_offset(): checking flow
dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.015969] --------__skb_get_tra_offset(): checking flow
dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.017357] --------__skb_get_tra_protocol(): checking flow
dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.018749] --------__skb_get_tra_protocol(): checking flow
dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1
[    7.020137] --------__skb_get_tra_protocol(): checking flow
dissector: {-26957632, 23374016, 369113781, 34, 6} inited returns 1

> let alone BPF testsuite. It also breaks seccomp, new JIT
> and tracing filter patches I've posted yesterday.
Note that the flow_keys field is only used by those functions actually
calling the flow dissector (originally __skb_get_poff(), and after
this set of patches __skb_get_tra_offset and __skb_get_tra_protocol).
If any non-packet user is calling any of these functions, you have an
issue there.

Sorry if I broke your patches from yesterday. I didn't see them. I'll
check them.

> sk_run_filter() is a generic interpreter that should be suitable for
> sockets, seccomp, tracing, etc. Adding one use case specific data
> structure to interpreter is not desirable.
That's an interesting point, which you'd agree does apply to "ld poff"
right now. There's a tradeoff here between making the BPF VM as
generic as possible, and having to reimplement in BPF VM code
functions that already exist in the kernel (the flow dissector
itself). I think not having 2 flow dissectors in the kernel (one in C,
one in BPF VM code) is a win.

> Doing memset(&flow,0...) every invocation is not performance
> friendly either. Majority of filters will suffer.
Will fix. In retrospect, I should have just added a flow_keys_is_init
field instead of mapping {0, 0, {0}, 0, 0} to "not inited."

> You can do the same without touching sk_run_filter().
> For example, you can tweak sk_convert_filter() to pass
> R10(frame pointer) as arg4 to helper functions
> (__skb_get_pay_offset() and others)
> then inside those functions cast 'arg4 - appropriate offset'
> to struct flow_keys and use it.
> BPF was extended from 16 spill/fill only slots to generic
> stack space exactly for this type of use cases.
I tried several implementations, including the one you propose:

1. add flow_keys to skb CB (packet_skb_cb)
The problem here is that struct flow_keys (15 bytes) does not fit in
AF_PACKET's 48-byte skb CB. I also tried to create a small_flow_keys
struct that only contains L3 info (nhoff/proto, which is the minimum
output you need from the flow dissector to be able to get the rest
without having to rewalk the packet), or maybe L3 and L4
(thoff/ip_proto). The problem is that the packet CB is completely full
already. packet_skb_cb is 4 (origlen) + 20 (sockaddr_ll) and
MAX_ADDR_LEN=32, so the net/packet/af_packet.c:1822 check is 4+20+32-8
= 48, which is already the size of the sk_buff->CB.

2. add flow_keys using the stack in the 3 BPF filter runners
(original, eBPF, JIT)
  - (-) there are 6x BPF engines (1x non-JIT and 5x JIT)
  - (+) we can just do the non-JIT version, and let JITs implement it
when they want)
  - (+) very simple

3. add FLOW, BPF_REG_FLOW, map it to a register, and then pass it as
R4. This is what you're proposing, IIUC. I found the implementation
way more complicated (in fact I gave up trying to make it not crash
;), but I guess this is my own taste. What really convinced me to
extend the context is that, if you think about it, both the skb and
the flow_keys are part of the context that the bpf_calls use to run.
If we pass flow_keys in R4, and we ever want to add more context
parameters to a call, that means using R5. And then what?

> Key point: you can add pretty much any feature to classic BPF
> by tweaking converter from classic to internal without changing
> interpreter and causing trainwreck for non-socket use cases.
Again, the patch is not breaking the current functionality. We already
provide "ld poff" as a recognition that packet processing is a first
class citizen of the BPF VM. If you run a seccomp filter containing a
"ld poff" right now, you're probably causing issues anyway.

> The idea itself I think is quite interesting. I'm only disliking its
> implementation.
Thanks a lot for the comments.

-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-14 21:51     ` Chema Gonzalez
@ 2014-05-14 22:44       ` Alexei Starovoitov
  2014-05-16 18:41         ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Alexei Starovoitov @ 2014-05-14 22:44 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Network Development

On Wed, May 14, 2014 at 2:51 PM, Chema Gonzalez <chema@google.com> wrote:
> Hi,
>
> On Wed, May 14, 2014 at 1:05 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
>> I have a feeling that you didn't test this. Even basic "tcpdump port 22"
>> will be broken,
> Not sure whether you understood the use of this. I'm not trying to
> change how to compile tcpdump expressions. The only way to access the
> new features is using Daniel's bpf_asm code.
>
> I did test the code:

please try 'modprobe test_bpf'… you'll see most ld_proto, ld_vlan, …
tests failing..

>> sk_run_filter() is a generic interpreter that should be suitable for
>> sockets, seccomp, tracing, etc. Adding one use case specific data
>> structure to interpreter is not desirable.
> That's an interesting point, which you'd agree does apply to "ld poff"
> right now. There's a tradeoff here between making the BPF VM as
> generic as possible, and having to reimplement in BPF VM code
> functions that already exist in the kernel (the flow dissector
> itself). I think not having 2 flow dissectors in the kernel (one in C,
> one in BPF VM code) is a win.

You misunderstood. That's not at all what I said. See the diff below.

>> Doing memset(&flow,0...) every invocation is not performance
>> friendly either. Majority of filters will suffer.
> Will fix. In retrospect, I should have just added a flow_keys_is_init
> field instead of mapping {0, 0, {0}, 0, 0} to "not inited."
>
>> You can do the same without touching sk_run_filter().
>> For example, you can tweak sk_convert_filter() to pass
>> R10(frame pointer) as arg4 to helper functions
>> (__skb_get_pay_offset() and others)
>> then inside those functions cast 'arg4 - appropriate offset'
>> to struct flow_keys and use it.
>> BPF was extended from 16 spill/fill only slots to generic
>> stack space exactly for this type of use cases.
> I tried several implementations, including the one you propose:

nope. see the diff of what I'm proposing.

> 1. add flow_keys to skb CB (packet_skb_cb)
> The problem here is that struct flow_keys (15 bytes) does not fit in
> AF_PACKET's 48-byte skb CB. I also tried to create a small_flow_keys
> struct that only contains L3 info (nhoff/proto, which is the minimum
> output you need from the flow dissector to be able to get the rest
> without having to rewalk the packet), or maybe L3 and L4
> (thoff/ip_proto). The problem is that the packet CB is completely full
> already. packet_skb_cb is 4 (origlen) + 20 (sockaddr_ll) and
> MAX_ADDR_LEN=32, so the net/packet/af_packet.c:1822 check is 4+20+32-8
> = 48, which is already the size of the sk_buff->CB.
>
> 2. add flow_keys using the stack in the 3 BPF filter runners
> (original, eBPF, JIT)
>   - (-) there are 6x BPF engines (1x non-JIT and 5x JIT)
>   - (+) we can just do the non-JIT version, and let JITs implement it
> when they want)
>   - (+) very simple
>
> 3. add FLOW, BPF_REG_FLOW, map it to a register, and then pass it as
>> R4. This is what you're proposing, IIUC. I found the implementation
> way more complicated (in fact I gave up trying to make it not crash
> ;), but I guess this is my own taste. What really convinced me to
> extend the context is that, if you think about it, both the skb and
> the flow_keys are part of the context that the bpf_calls use to run.
> If we pass flow_keys in R4, and we ever want to add more context
> parameters to a call, that means using R5. And then what?
>
>> Key point: you can add pretty much any feature to classic BPF
>> by tweaking converter from classic to internal without changing
>> interpreter and causing trainwreck for non-socket use cases.
> Again, the patch is not breaking the current functionality. We already
> provide "ld poff" as a recognition that packet processing is a first
> class citizen of the BPF VM. If you run a seccomp filter containing a
> "ld poff" right now, you're probably causing issues anyway.

'ld poff' is not a first-class citizen of the interpreter. There is no such eBPF insn.
It's a classic BPF instruction. It is converted into a sequence of eBPF
insns including bpf_call.

Just to reduce the number of back and forth emails here is the diff
of what you want to do that doesn't break things:
(sorry for whitespace mangling)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a9beeb1c458..10f580e44a33 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3065,7 +3065,7 @@ bool skb_partial_csum_set(struct sk_buff *skb,
u16 start, u16 off);

 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);

-u32 __skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *keys);

 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
diff --git a/net/core/filter.c b/net/core/filter.c
index 32c5b44c537e..1179f2fb4c95 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -602,7 +602,12 @@ static unsigned int pkt_type_offset(void)

 static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
-       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+       /* top BPF_MEMWORDS * 4 bytes are used to represent classic BPF
+        * mem[0-15] slots, use next sizeof(struct flow_keys) bytes
+        * of stack to share flow_dissect-ed data
+        */
+       struct flow_keys *keys = (void *) r4 - BPF_MEMWORDS * 4 - sizeof(*keys);
+       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx, keys);
 }

 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
@@ -783,6 +788,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
                *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X);
                insn++;

+               /* arg4 = FP */
+               *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG4, BPF_REG_FP);
+               insn++;
+
                /* Emit call(ctx, arg2=A, arg3=X) */
                insn->code = BPF_JMP | BPF_CALL;
                switch (fp->k) {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12a5323..c8d9f16e4872 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -275,16 +275,15 @@ EXPORT_SYMBOL(__skb_tx_hash);
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *keys)
 {
-       struct flow_keys keys;
        u32 poff = 0;

-       if (!skb_flow_dissect(skb, &keys))
+       if (!skb_flow_dissect(skb, keys))
                return 0;

-       poff += keys.thoff;
-       switch (keys.ip_proto) {
+       poff += keys->thoff;
+       switch (keys->ip_proto) {
        case IPPROTO_TCP: {

now just use the same 'keys' address in your new get_off calls.
First word can be used to indicate whether flow_dissector() was called
or not.

Above is tested with and without JIT with BPF testsuite.
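
A stand-alone sketch of that "first word" idea, again with toy
placeholder types (the real patch is free to lay the scratch area out
differently):

#include <stdio.h>
#include <string.h>

#define BPF_MEMWORDS 16

struct toy_scratch {
	unsigned int inited;	/* "first word": did the dissector already run? */
	unsigned int thoff;	/* toy subset of struct flow_keys */
	unsigned int ip_proto;
};

static unsigned int toy_get_thoff(void *fp)
{
	struct toy_scratch *s = (struct toy_scratch *)
		((char *)fp - BPF_MEMWORDS * 4 - sizeof(*s));

	if (!s->inited) {	/* dissect only on the first call */
		s->thoff = 34;	/* stand-in for skb_flow_dissect() */
		s->ip_proto = 6;
		s->inited = 1;
		puts("dissector ran");
	}
	return s->thoff;
}

int main(void)
{
	unsigned long long stack[64];
	void *fp = &stack[64];

	memset(stack, 0, sizeof(stack));
	printf("%u\n", toy_get_thoff(fp));	/* prints "dissector ran" once */
	printf("%u\n", toy_get_thoff(fp));	/* cached thoff, no re-dissect */
	return 0;
}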

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-14 22:44       ` Alexei Starovoitov
@ 2014-05-16 18:41         ` Chema Gonzalez
  0 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-16 18:41 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Network Development

On Wed, May 14, 2014 at 3:44 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> please try 'modprobe test_bpf'… you'll see most ld_proto, ld_vlan, …
> tests failing..
Fixed.

>>> sk_run_filter() is a generic interpreter that should be suitable for
>>> sockets, seccomp, tracing, etc. Adding one use case specific data
>>> structure to interpreter is not desirable.
>> That's an interesting point, which you'd agree does apply to "ld poff"
>> right now. There's a tradeoff here between making the BPF VM as
>> generic as possible, and having to reimplement in BPF VM code
>> functions that already exist in the kernel (the flow dissector
>> itself). I think not having 2 flow dissectors in the kernel (one in C,
>> one in BPF VM code) is a win.
>
> You misunderstood. That's not at all what I said. See the diff below.
OK. Fixed using the extended eBPF stack (EES). Note that I added an
explicit struct to put stuff in the EES (instead of assuming what's
there).

>>> Doing memset(&flow,0...) every invocation is not performance
>>> friendly either. Majority of filters will suffer.
>> Will fix. In retrospect, I should have just added a flow_keys_is_init
>> field instead of mapping {0, 0, {0}, 0, 0} to "not inited."
Done.

>>> You can do the same without touching sk_run_filter().
>>> For example, you can tweak sk_convert_filter() to pass
>>> R10(frame pointer) as arg4 to helper functions
>>> (__skb_get_pay_offset() and others)
>>> then inside those functions cast 'arg4 - appropriate offset'
>>> to struct flow_keys and use it.
>>> BPF was extended from 16 spill/fill only slots to generic
>>> stack space exactly for this type of use cases.
>> I tried several implementations, including the one you propose:
>
> nope. see the diff of what I'm proposing.
Done.

>
>> 1. add flow_keys to skb CB (packet_skb_cb)
>> The problem here is that struct flow_keys (15 bytes) does not fit in
>> AF_PACKET's 48-byte skb CB. I also tried to create a small_flow_keys
>> struct that only contains L3 info (nhoff/proto, which is the minimum
>> output you need from the flow dissector to be able to get the rest
>> without having to rewalk the packet), or maybe L3 and L4
>> (thoff/ip_proto). The problem is that the packet CB is completely full
>> already. packet_skb_cb is 4 (origlen) + 20 (sockaddr_ll) and
>> MAX_ADDR_LEN=32, so the net/packet/af_packet.c:1822 check is 4+20+32-8
>> = 48, which is already the size of the sk_buff->CB.
>>
>> 2. add flow_keys using the stack in the 3 BPF filter runners
>> (original, eBPF, JIT)
>>   - (-) there are 6x BPF engines (1x non-JIT and 5x JIT)
>>   - (+) we can just do the non-JIT version, and let JITs implement it
>> when they want)
>>   - (+) very simple
>>
>> 3. add FLOW, BPF_REG_FLOW, map it to a register, and then pass it as
>> R4. This is what you're proposing, IIUC. I found the implementation
>> way more complicated (in fact I gave up trying to make it not crash
>> ;), but I guess this is my own taste. What really convinced me to
>> extend the context is that, if you think about it, both the skb and
>> the flow_keys are part of the context that the bpf_calls use to run.
>> If we pass flow_keys in R4, and we ever want to add more context
>> parameters to a call, that means using R5. And then what?
>>
>>> Key point: you can add pretty much any feature to classic BPF
>>> by tweaking converter from classic to internal without changing
>>> interpreter and causing trainwreck for non-socket use cases.
>> Again, the patch is not breaking the current functionality. We already
>> provide "ld poff" as a recognition that packet processing is a first
>> class citizen of the BPF VM. If you run a seccomp filter containing a
>> "ld poff" right now, you're probably causing issues anyway.
>
> 'ld poff' is not a first-class citizen of the interpreter. There is no such eBPF insn.
> It's a classic BPF instruction. It is converted into a sequence of eBPF
> insns including bpf_call.
>
> Just to reduce the number of back and forth emails here is the diff
> of what you want to do that doesn't break things:
> (sorry for whitespace mangling)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 7a9beeb1c458..10f580e44a33 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3065,7 +3065,7 @@ bool skb_partial_csum_set(struct sk_buff *skb,
> u16 start, u16 off);
>
>  int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
>
> -u32 __skb_get_poff(const struct sk_buff *skb);
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *keys);
>
>  /**
>   * skb_head_is_locked - Determine if the skb->head is locked down
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 32c5b44c537e..1179f2fb4c95 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -602,7 +602,12 @@ static unsigned int pkt_type_offset(void)
>
>  static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
>  {
> -       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
> +       /* top BPF_MEMWORDS * 4 bytes are used to represent classic BPF
> +        * mem[0-15] slots, use next sizeof(struct flow_keys) bytes
> +        * of stack to share flow_dissect-ed data
> +        */
> +       struct flow_keys *keys = (void *) r4 - BPF_MEMWORDS * 4 - sizeof(*keys);
> +       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx, keys);
>  }
>
>  static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
> @@ -783,6 +788,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
>                 *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X);
>                 insn++;
>
> +               /* arg4 = FP */
> +               *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG4, BPF_REG_FP);
> +               insn++;
> +
>                 /* Emit call(ctx, arg2=A, arg3=X) */
>                 insn->code = BPF_JMP | BPF_CALL;
>                 switch (fp->k) {
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12a5323..c8d9f16e4872 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -275,16 +275,15 @@ EXPORT_SYMBOL(__skb_tx_hash);
>   * truncate packets without needing to push actual payload to the user
>   * space and can analyze headers only, instead.
>   */
> -u32 __skb_get_poff(const struct sk_buff *skb)
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *keys)
>  {
> -       struct flow_keys keys;
>         u32 poff = 0;
>
> -       if (!skb_flow_dissect(skb, &keys))
> +       if (!skb_flow_dissect(skb, keys))
>                 return 0;
>
> -       poff += keys.thoff;
> -       switch (keys.ip_proto) {
> +       poff += keys->thoff;
> +       switch (keys->ip_proto) {
>         case IPPROTO_TCP: {
>
> now just use the same 'keys' address in your new get_off calls.
> First word can be used to indicate whether flow_dissector() was called
> or not.
>
> Above is tested with and without JIT with BPF testsuite.

Patches now.
-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (6 preceding siblings ...)
  2014-05-14 18:51 ` [PATCH v3 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
@ 2014-05-16 18:41 ` Chema Gonzalez
  2014-05-16 22:00   ` Alexei Starovoitov
  2014-05-16 18:41 ` [PATCH v5 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (5 subsequent siblings)
  13 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-16 18:41 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

We want multiple calls to __skb_get_poff() in the same filter to cause
only one invocation of the flow dissector. In order to reuse the result
of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
variable in the eBPF runner stack (__sk_run_filter() function), and pass
it as an argument to __skb_get_poff(). __skb_get_poff() inits the variable
the very first time it is called, and reuses the result in any further
invocation.

Tested:

$ cat tools/net/ipv4_tcp_poff2.bpf
ldh [12]
jne #0x800, drop
ldb [23]
jneq #6, drop
ld poff
ld poff
ld poff
ld poff
ld toff
ld toff
ld toff
ld tproto
ld tproto
ld tproto
ret #-1
drop: ret #0
$ ./tools/net/bpf_asm tools/net/ipv4_tcp_poff2.bpf
16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963264,32 0 0 4294963264,32 0 0 4294963264,6 0 0 4294967295,6 0 0 0,

And then, in a VM, I ran:

$ tcpdump -n -i eth0 -f "16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11
6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0
4294963252,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963260,32 0
0 4294963264,32 0 0 4294963264,32 0 0 4294963264,6 0 0 4294967295,6 0
0 0,"

This tcpdump is github's tcpdump HEAD with
https://github.com/the-tcpdump-group/libpcap/pull/353.

Adding some labels shows how the flow dissector is only called for
the first "ld poff":

...
[   14.400269] --------__sk_run_filter(): setting flow: {0, 481192, -30720, 1013, 8} is inited? 0
[   14.401528] --------__skb_get_poff(): checking flow dissector: {0, 481192, -30720, 1013, 8} is inited? 0
[   14.403088] --------__skb_get_poff(): before calling flow dissector: {0, 481192, -30720, 1013, 8}
[   14.404068] --------__skb_get_poff(): after calling flow dissector: {23374016, -26957632, -174123520, 34, 6}
[   14.405154] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.406264] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.407412] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.408520] --------__skb_get_tra_offset(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.409673] --------__skb_get_tra_offset(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.410845] --------__skb_get_tra_offset(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.412008] --------__skb_get_tra_protocol(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.413255] --------__skb_get_tra_protocol(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.414437] --------__skb_get_tra_protocol(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
[   14.415888] --------__sk_run_filter(): setting flow: {-1, 399522456, -30720, 1736, 8} is inited? 0
[   14.415929] --------__sk_run_filter(): setting flow: {0, 1400960, -30720, 56016, 7} is inited? 0
[   14.415932] --------__skb_get_poff(): checking flow dissector: {0, 1400960, -30720, 56016, 7} is inited? 0
[   14.415932] --------__skb_get_poff(): before calling flow dissector: {0, 1400960, -30720, 56016, 7}
[   14.415950] --------__skb_get_poff(): after calling flow dissector: {23374016, -26957632, -174123520, 34, 6}
[   14.415952] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
...

$ modprobe test_bpf
[    9.809183] test_bpf: #0 TAX 23 39 39 PASS
[    9.820202] test_bpf: #1 TXA 10 10 11 PASS
[    9.824239] test_bpf: #2 ADD_SUB_MUL_K 13 PASS
[    9.826369] test_bpf: #3 DIV_KX 45 PASS
[    9.831530] test_bpf: #4 AND_OR_LSH_K 15 14 PASS
[    9.835290] test_bpf: #5 LD_IND 11 11 11 PASS
[    9.839567] test_bpf: #6 LD_ABS 10 10 10 PASS
[    9.843381] test_bpf: #7 LD_ABS_LL 18 39 PASS
[    9.849925] test_bpf: #8 LD_IND_LL 18 18 18 PASS
[    9.856191] test_bpf: #9 LD_ABS_NET 15 18 PASS
[    9.860391] test_bpf: #10 LD_IND_NET 15 18 17 PASS
[    9.866310] test_bpf: #11 LD_PKTTYPE 44 47 PASS
[    9.876354] test_bpf: #12 LD_MARK 7 7 PASS
[    9.878626] test_bpf: #13 LD_RXHASH 8 8 PASS
[    9.880990] test_bpf: #14 LD_QUEUE 7 7 PASS
[    9.883251] test_bpf: #15 LD_PROTOCOL 20 20 PASS
[    9.888086] test_bpf: #16 LD_VLAN_TAG 9 9 PASS
[    9.890708] test_bpf: #17 LD_VLAN_TAG_PRESENT 10 11 PASS
[    9.893785] test_bpf: #18 LD_IFINDEX 11 11 PASS
[    9.896924] test_bpf: #19 LD_HATYPE 13 14 PASS
[    9.900458] test_bpf: #20 LD_CPU 43 43 PASS
[    9.909919] test_bpf: #21 LD_NLATTR 18 23 PASS
[    9.914841] test_bpf: #22 LD_NLATTR_NEST 110 155 PASS
[    9.942252] test_bpf: #23 LD_PAYLOAD_OFF 134 93 PASS
[    9.965865] test_bpf: #24 LD_ANC_XOR 9 9 PASS
[    9.968571] test_bpf: #25 SPILL_FILL 26 26 26 PASS
[    9.977303] test_bpf: #26 JEQ 10 10 11 PASS
[    9.981278] test_bpf: #27 JGT 10 11 11 PASS
[    9.985383] test_bpf: #28 JGE 13 18 19 PASS
[    9.991189] test_bpf: #29 JSET 24 29 67 PASS
[   10.004116] test_bpf: #30 tcpdump port 22 9 32 37 PASS
[   10.012935] test_bpf: #31 tcpdump complex 9 28 79 PASS
[   10.025630] test_bpf: #32 RET_A 7 7 PASS
[   10.027799] test_bpf: #33 INT: ADD trivial 12 PASS
[   10.029827] test_bpf: #34 INT: MUL_X 10 PASS
[   10.031588] test_bpf: #35 INT: MUL_X2 12 PASS
[   10.033561] test_bpf: #36 INT: MUL32_X 12 PASS
[   10.035462] test_bpf: #37 INT: ADD 64-bit 583 PASS
[   10.094546] test_bpf: #38 INT: ADD 32-bit 525 PASS
[   10.147935] test_bpf: #39 INT: SUB 386 PASS
[   10.187293] test_bpf: #40 INT: XOR 142 PASS
[   10.202252] test_bpf: #41 INT: MUL 171 PASS
[   10.220148] test_bpf: #42 INT: ALU MIX 33 PASS
[   10.224212] test_bpf: #43 INT: DIV + ABS 24 26 PASS
[   10.230178] test_bpf: #44 INT: DIV by zero 10 7 PASS
[   10.232817] test_bpf: #45 check: missing ret PASS
[   10.233604] test_bpf: #46 check: div_k_0 PASS
[   10.234273] test_bpf: #47 check: unknown insn PASS
[   10.235008] test_bpf: #48 check: out of range spill/fill PASS

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/skbuff.h    |  3 ++-
 net/core/filter.c         | 26 +++++++++++++++++++++++++-
 net/core/flow_dissector.c | 16 ++++++++++------
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a9beeb..5f42eee 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3065,7 +3065,8 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
 
-u32 __skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
+		bool *flow_initted);
 
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
diff --git a/net/core/filter.c b/net/core/filter.c
index 32c5b44..fc20588 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -66,6 +66,11 @@
 #define CTX	regs[BPF_REG_CTX]
 #define K	insn->imm
 
+struct sk_run_filter_ctx {
+	struct flow_keys flow;
+	bool flow_initted;
+};
+
 /* No hurry in this branch
  *
  * Exported for the bpf jit load helper.
@@ -252,6 +257,7 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
 	};
 	void *ptr;
 	int off;
+	struct sk_run_filter_ctx *context;
 
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
@@ -259,6 +265,17 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
 	ARG1 = (u64) (unsigned long) ctx;
 
+	/* init context.
+	 *
+	 * Top (BPF_MEMWORDS * 4) bytes are used to represent classic BPF
+	 * mem[0-15] slots. We use the next sizeof(struct sk_run_filter_ctx)
+	 * bytes of stack to share context data (so far only the flow_keys
+	 * obtained from dissecting the flow, and a bool stating whether
+	 * such field has been inited)
+	 */
+	context = (void *)FP - BPF_MEMWORDS * 4 - sizeof(*context);
+	context->flow_initted = false;
+
 	/* Register for user BPF programs need to be reset first. */
 	regs[BPF_REG_A] = 0;
 	regs[BPF_REG_X] = 0;
@@ -602,7 +619,10 @@ static unsigned int pkt_type_offset(void)
 
 static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
-	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+	struct sk_run_filter_ctx *context = (void *) r4 - BPF_MEMWORDS * 4 -
+			sizeof(*context);
+	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx,
+			&context->flow, &context->flow_initted);
 }
 
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
@@ -783,6 +803,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X);
 		insn++;
 
+		/* arg4 = FP */
+		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG4, BPF_REG_FP);
+		insn++;
+
 		/* Emit call(ctx, arg2=A, arg3=X) */
 		insn->code = BPF_JMP | BPF_CALL;
 		switch (fp->k) {
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12..cefe1d2 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -275,16 +275,20 @@ EXPORT_SYMBOL(__skb_tx_hash);
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
+		bool *flow_initted)
 {
-	struct flow_keys keys;
 	u32 poff = 0;
 
-	if (!skb_flow_dissect(skb, &keys))
-		return 0;
+	/* check whether the flow dissector has already been run */
+	if (!*flow_initted) {
+		if (!skb_flow_dissect(skb, flow))
+			return 0;
+		*flow_initted = true;
+	}
 
-	poff += keys.thoff;
-	switch (keys.ip_proto) {
+	poff += flow->thoff;
+	switch (flow->ip_proto) {
 	case IPPROTO_TCP: {
 		const struct tcphdr *tcph;
 		struct tcphdr _tcph;
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v5 net-next 2/3] net: filter: add insn for loading internal transport header offset
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (7 preceding siblings ...)
  2014-05-16 18:41 ` [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
@ 2014-05-16 18:41 ` Chema Gonzalez
  2014-05-16 18:41 ` [PATCH v5 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-16 18:41 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_OFFSET insn that loads the internal transport
header offset of a packet ("internal" meaning after decapsulation by the
flow dissector).

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 20 ++++++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9d5ae0a..7af9410 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -309,6 +309,7 @@ enum {
 	BPF_S_ANC_VLAN_TAG_PRESENT,
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
+	BPF_S_ANC_TRA_OFFSET,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 253b4d4..9f1b8f1 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -131,7 +131,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_TRA_OFFSET	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index fc20588..edaf03f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -625,6 +625,20 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 			&context->flow, &context->flow_initted);
 }
 
+static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+	struct sk_run_filter_ctx *context = (void *) r4 - BPF_MEMWORDS * 4 -
+			sizeof(*context);
+	/* check whether the flow dissector has already been run */
+	if (!context->flow_initted) {
+		if (!skb_flow_dissect((struct sk_buff *)(unsigned long) ctx,
+				&context->flow))
+			return 0;
+		context->flow_initted = true;
+	}
+	return context->flow.thoff;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -791,6 +805,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
+	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -825,6 +840,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			insn->imm = __get_random_u32 - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+			insn->imm = __skb_get_tra_offset - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1362,6 +1380,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG_PRESENT);
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
+			ANCILLARY(TRA_OFFSET);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1754,6 +1773,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 833a966..4e72934 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -93,6 +93,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_tci") { return K_VLANT; }
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
+"#"?("toff")	{ return K_TOFF; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index e6306c5..ced6949 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -167,6 +167,9 @@ ldb
 	| OP_LDB K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDB K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldh
@@ -218,6 +221,9 @@ ldh
 	| OP_LDH K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDH K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldi
@@ -274,6 +280,9 @@ ld
 	| OP_LD K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LD K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v5 net-next 3/3] net: filter: add insn for loading internal transport header proto
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (8 preceding siblings ...)
  2014-05-16 18:41 ` [PATCH v5 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
@ 2014-05-16 18:41 ` Chema Gonzalez
  2014-05-29 18:55 ` [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF Chema Gonzalez
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-16 18:41 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Alexei Starovoitov, dborkman
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_PROTOCOL insn that loads the protocol of
the internal transport header of a packet ("internal" meaning after
decapsulation by the flow dissector).
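
Combined with the ANC_TRA_OFFSET load from the previous patch, this
allows, for instance, matching the inner-most TCP dst port. A rough
(untested) sketch using the classic BPF macros:

	struct sock_filter code[] = {
		/* A = inner-most transport protocol; drop unless TCP (6) */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 SKF_AD_OFF + SKF_AD_TRA_PROTOCOL),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 6, 0, 5),
		/* X = offset of the inner-most transport header */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 SKF_AD_OFF + SKF_AD_TRA_OFFSET),
		BPF_STMT(BPF_MISC | BPF_TAX, 0),
		/* A = dst port; accept if 80, drop otherwise */
		BPF_STMT(BPF_LD | BPF_H | BPF_IND, 2),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 80, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
		BPF_STMT(BPF_RET | BPF_K, 0),
	};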

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 20 ++++++++++++++++++++
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7af9410..1d7d68e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -310,6 +310,7 @@ enum {
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
 	BPF_S_ANC_TRA_OFFSET,
+	BPF_S_ANC_TRA_PROTOCOL,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 9f1b8f1..e34d608 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -132,7 +132,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
 #define SKF_AD_TRA_OFFSET	60
-#define SKF_AD_MAX	64
+#define SKF_AD_TRA_PROTOCOL	64
+#define SKF_AD_MAX	68
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index edaf03f..7245a0d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -639,6 +639,20 @@ static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return context->flow.thoff;
 }
 
+static u64 __skb_get_tra_protocol(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+{
+	struct sk_run_filter_ctx *context = (void *) r4 - BPF_MEMWORDS * 4 -
+			sizeof(*context);
+	/* check whether the flow dissector has already been run */
+	if (!context->flow_initted) {
+		if (!skb_flow_dissect((struct sk_buff *)(unsigned long) ctx,
+				&context->flow))
+			return 0;
+		context->flow_initted = true;
+	}
+	return context->flow.ip_proto;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -806,6 +820,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
 	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+	case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -843,6 +858,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 			insn->imm = __skb_get_tra_offset - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
+			insn->imm = __skb_get_tra_protocol - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -1381,6 +1399,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
 			ANCILLARY(TRA_OFFSET);
+			ANCILLARY(TRA_PROTOCOL);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1774,6 +1793,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 4e72934..c1b5958 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -94,6 +94,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
 "#"?("toff")	{ return K_TOFF; }
+"#"?("tproto")	{ return K_TPROTO; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index ced6949..71fef72 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF K_TPROTO
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -170,6 +170,9 @@ ldb
 	| OP_LDB K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDB K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldh
@@ -224,6 +227,9 @@ ldh
 	| OP_LDH K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDH K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldi
@@ -283,6 +289,9 @@ ld
 	| OP_LD K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LD K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-16 18:41 ` [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
@ 2014-05-16 22:00   ` Alexei Starovoitov
  2014-05-19 22:23     ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Alexei Starovoitov @ 2014-05-16 22:00 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Network Development

On Fri, May 16, 2014 at 11:41 AM, Chema Gonzalez <chema@google.com> wrote:
> We want multiple calls to __skb_get_poff() in the same filter to only
> cause one invocation to the flow dissector. In order to reuse the result
> of the flow dissector invocation (skb_flow_dissect()), we add a flow_keys
> variable in the eBPF runner stack (__sk_run_filter() function), and pass
> it as an argument to __skb_get_poff(). __skb_get_poff() inits the variable
> the very first time it is called, and reuses the result in any further
> invocation.
>
> Tested:
>
> $ cat tools/net/ipv4_tcp_poff2.bpf
> ldh [12]
> jne #0x800, drop
> ldb [23]
> jneq #6, drop
> ld poff
> ld poff
> ld poff
> ld poff
> ld toff
> ld toff
> ld toff
> ld tproto
> ld tproto
> ld tproto
> ret #-1
> drop: ret #0
> $ ./tools/net/bpf_asm tools/net/ipv4_tcp_poff2.bpf
> 16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963264,32 0 0 4294963264,32 0 0 4294963264,6 0 0 4294967295,6 0 0 0,
>
> And then, in a VM, I ran:
>
> $ tcpdump -n -i eth0 -f "16,40 0 0 12,21 0 13 2048,48 0 0 23,21 0 11
> 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0
> 4294963252,32 0 0 4294963260,32 0 0 4294963260,32 0 0 4294963260,32 0
> 0 4294963264,32 0 0 4294963264,32 0 0 4294963264,6 0 0 4294967295,6 0
> 0 0,"
>
> This tcpdump is github's tcpdump HEAD with
> https://github.com/the-tcpdump-group/libpcap/pull/353.
>
> Adding some labels shows how the flow dissector is only called for
> the first "ld poff":
>
> ...
> [   14.400269] --------__sk_run_filter(): setting flow: {0, 481192, -30720, 1013, 8} is inited? 0
> [   14.401528] --------__skb_get_poff(): checking flow dissector: {0, 481192, -30720, 1013, 8} is inited? 0
> [   14.403088] --------__skb_get_poff(): before calling flow dissector: {0, 481192, -30720, 1013, 8}
> [   14.404068] --------__skb_get_poff(): after calling flow dissector: {23374016, -26957632, -174123520, 34, 6}
> [   14.405154] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.406264] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.407412] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.408520] --------__skb_get_tra_offset(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.409673] --------__skb_get_tra_offset(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.410845] --------__skb_get_tra_offset(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.412008] --------__skb_get_tra_protocol(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.413255] --------__skb_get_tra_protocol(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.414437] --------__skb_get_tra_protocol(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1
> [   14.415888] --------__sk_run_filter(): setting flow: {-1, 399522456, -30720, 1736, 8} is inited? 0
> [   14.415929] --------__sk_run_filter(): setting flow: {0, 1400960, -30720, 56016, 7} is inited? 0
> [   14.415932] --------__skb_get_poff(): checking flow dissector: {0, 1400960, -30720, 56016, 7} is inited? 0
> [   14.415932] --------__skb_get_poff(): before calling flow dissector: {0, 1400960, -30720, 56016, 7}
> [   14.415950] --------__skb_get_poff(): after calling flow dissector: {23374016, -26957632, -174123520, 34, 6}
> [   14.415952] --------__skb_get_poff(): checking flow dissector: {23374016, -26957632, -174123520, 34, 6} is inited? 1

this is way too long for commit log. These details can go after '---' line.

> $ modprobe test_bpf
> [    9.809183] test_bpf: #0 TAX 23 39 39 PASS
> [    9.820202] test_bpf: #1 TXA 10 10 11 PASS
> [    9.824239] test_bpf: #2 ADD_SUB_MUL_K 13 PASS
> [    9.826369] test_bpf: #3 DIV_KX 45 PASS
> [    9.831530] test_bpf: #4 AND_OR_LSH_K 15 14 PASS
> [    9.835290] test_bpf: #5 LD_IND 11 11 11 PASS
> [    9.839567] test_bpf: #6 LD_ABS 10 10 10 PASS
> [    9.843381] test_bpf: #7 LD_ABS_LL 18 39 PASS
> [    9.849925] test_bpf: #8 LD_IND_LL 18 18 18 PASS
> [    9.856191] test_bpf: #9 LD_ABS_NET 15 18 PASS
> [    9.860391] test_bpf: #10 LD_IND_NET 15 18 17 PASS
> [    9.866310] test_bpf: #11 LD_PKTTYPE 44 47 PASS
> [    9.876354] test_bpf: #12 LD_MARK 7 7 PASS
> [    9.878626] test_bpf: #13 LD_RXHASH 8 8 PASS
> [    9.880990] test_bpf: #14 LD_QUEUE 7 7 PASS
> [    9.883251] test_bpf: #15 LD_PROTOCOL 20 20 PASS
> [    9.888086] test_bpf: #16 LD_VLAN_TAG 9 9 PASS
> [    9.890708] test_bpf: #17 LD_VLAN_TAG_PRESENT 10 11 PASS
> [    9.893785] test_bpf: #18 LD_IFINDEX 11 11 PASS
> [    9.896924] test_bpf: #19 LD_HATYPE 13 14 PASS
> [    9.900458] test_bpf: #20 LD_CPU 43 43 PASS
> [    9.909919] test_bpf: #21 LD_NLATTR 18 23 PASS
> [    9.914841] test_bpf: #22 LD_NLATTR_NEST 110 155 PASS
> [    9.942252] test_bpf: #23 LD_PAYLOAD_OFF 134 93 PASS
> [    9.965865] test_bpf: #24 LD_ANC_XOR 9 9 PASS
> [    9.968571] test_bpf: #25 SPILL_FILL 26 26 26 PASS
> [    9.977303] test_bpf: #26 JEQ 10 10 11 PASS
> [    9.981278] test_bpf: #27 JGT 10 11 11 PASS
> [    9.985383] test_bpf: #28 JGE 13 18 19 PASS
> [    9.991189] test_bpf: #29 JSET 24 29 67 PASS
> [   10.004116] test_bpf: #30 tcpdump port 22 9 32 37 PASS
> [   10.012935] test_bpf: #31 tcpdump complex 9 28 79 PASS
> [   10.025630] test_bpf: #32 RET_A 7 7 PASS
> [   10.027799] test_bpf: #33 INT: ADD trivial 12 PASS
> [   10.029827] test_bpf: #34 INT: MUL_X 10 PASS
> [   10.031588] test_bpf: #35 INT: MUL_X2 12 PASS
> [   10.033561] test_bpf: #36 INT: MUL32_X 12 PASS
> [   10.035462] test_bpf: #37 INT: ADD 64-bit 583 PASS
> [   10.094546] test_bpf: #38 INT: ADD 32-bit 525 PASS
> [   10.147935] test_bpf: #39 INT: SUB 386 PASS
> [   10.187293] test_bpf: #40 INT: XOR 142 PASS
> [   10.202252] test_bpf: #41 INT: MUL 171 PASS
> [   10.220148] test_bpf: #42 INT: ALU MIX 33 PASS
> [   10.224212] test_bpf: #43 INT: DIV + ABS 24 26 PASS
> [   10.230178] test_bpf: #44 INT: DIV by zero 10 7 PASS
> [   10.232817] test_bpf: #45 check: missing ret PASS
> [   10.233604] test_bpf: #46 check: div_k_0 PASS
> [   10.234273] test_bpf: #47 check: unknown insn PASS
> [   10.235008] test_bpf: #48 check: out of range spill/fill PASS
>
> Signed-off-by: Chema Gonzalez <chema@google.com>
> ---
>  include/linux/skbuff.h    |  3 ++-
>  net/core/filter.c         | 26 +++++++++++++++++++++++++-
>  net/core/flow_dissector.c | 16 ++++++++++------
>  3 files changed, 37 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 7a9beeb..5f42eee 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3065,7 +3065,8 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
>
>  int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
>
> -u32 __skb_get_poff(const struct sk_buff *skb);
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
> +               bool *flow_initted);
>
>  /**
>   * skb_head_is_locked - Determine if the skb->head is locked down
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 32c5b44..fc20588 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -66,6 +66,11 @@
>  #define CTX    regs[BPF_REG_CTX]
>  #define K      insn->imm
>
> +struct sk_run_filter_ctx {
> +       struct flow_keys flow;
> +       bool flow_initted;
> +};
> +
>  /* No hurry in this branch
>   *
>   * Exported for the bpf jit load helper.
> @@ -252,6 +257,7 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
>         };
>         void *ptr;
>         int off;
> +       struct sk_run_filter_ctx *context;
>
>  #define CONT    ({ insn++; goto select_insn; })
>  #define CONT_JMP ({ insn++; goto select_insn; })
> @@ -259,6 +265,17 @@ unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
>         FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
>         ARG1 = (u64) (unsigned long) ctx;
>
> +       /* init context.
> +        *
> +        * Top (BPF_MEMWORDS * 4) bytes are used to represent classic BPF
> +        * mem[0-15] slots. We use the next sizeof(struct sk_run_filter_ctx)
> +        * bytes of stack to share context data (so far only the flow_keys
> +        * obtained from dissecting the flow, and a bool stating whether
> +        * such field has been inited)
> +        */
> +       context = (void *)FP - BPF_MEMWORDS * 4 - sizeof(*context);
> +       context->flow_initted = false;
> +

Neither this code nor comment belong in interpreter which is striving to
be generic. Both have a meaning only for your new instructions in
socket filters. They do not apply to seccomp and to tracing.
Calling it 'struct sk_run_filter_ctx' is equally misleading.
'struct flow_keys' has nothing to do with seccomp and even with
normal tcpdump filters.

We've spent so much time to generalize it and now you want to take
it back into socket specific territory. As I said in the previous email
it's totally not ok to customize eBPF just for one use case especially
when you're not sharing final usage scenario.
Any uapi change must be strongly justified.

Other than design issue there is technical problem too.
You're not updating eBPF JIT and context->flow_initted will have junk,
so ld_poff will be returning junk too.
JITs and interpreter must match. That was always the case even in
old days of classic BPF. Interpreter initialized A and X to zero,
JITs had to do the same. If you add stuff like this to interpreter,
JITs would have to follow. Fast forward few years and this is not scalable.
You have several JITs around and adding custom stuff to interpreter
may not even be possible to do in JITs.
Therefore I'm strongly against adding custom C code to interpreter.
It's an interpreter of eBPF instructions. Nothing else.
At the same time adding new eBPF instructions would be completely fine.
Like right now we don't have < and <= conditional jumps, which adds
unnecessary complexity to tracing filters. So we may still expand
interpreter and JITs in the future.

Going back to your use case… assuming that you share how user
space will use new anc loads… you can do equivalent initialization
of 'flow_initted' by emitting extra eBPF insn while converting classic
to internal. Don't do it unconditionally though. tcpdump filters should
not suffer. You can scan program for presence of ld_poff, ld_toff,
ld_tproto and if they're present, emit single eBPF 'st [fp - off], 0'
ST_MEM instruction as a first instruction of converted filter. All eBPF
JITs will compile it to native automatically. You'll have good
performance and all existing filters will not regress.
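
Roughly, the emitted prologue could look like this (untested sketch;
variable names and the exact stack offset of the shared state are
illustrative):

	bool uses_flow_dissector = false;
	int i;

	/* scan the classic program for flow-dissector based loads */
	for (i = 0; i < len; i++)
		if (BPF_CLASS(prog[i].code) == BPF_LD &&
		    BPF_MODE(prog[i].code) == BPF_ABS &&
		    (prog[i].k == SKF_AD_OFF + SKF_AD_PAY_OFFSET ||
		     prog[i].k == SKF_AD_OFF + SKF_AD_TRA_OFFSET ||
		     prog[i].k == SKF_AD_OFF + SKF_AD_TRA_PROTOCOL)) {
			uses_flow_dissector = true;
			break;
		}

	/* if found, emit "st [fp - off], 0" before the converted filter */
	if (uses_flow_dissector) {
		new_insn->code = BPF_ST | BPF_MEM | BPF_W;
		new_insn->a_reg = BPF_REG_FP;
		new_insn->x_reg = 0;
		new_insn->off = -flow_state_off;
		new_insn->imm = 0;
		new_insn++;
	}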

>         /* Register for user BPF programs need to be reset first. */
>         regs[BPF_REG_A] = 0;
>         regs[BPF_REG_X] = 0;
> @@ -602,7 +619,10 @@ static unsigned int pkt_type_offset(void)
>
>  static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
>  {
> -       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
> +       struct sk_run_filter_ctx *context = (void *) r4 - BPF_MEMWORDS * 4 -
> +                       sizeof(*context);
> +       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx,
> +                       &context->flow, &context->flow_initted);
>  }
>
>  static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
> @@ -783,6 +803,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
>                 *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X);
>                 insn++;
>
> +               /* arg4 = FP */
> +               *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG4, BPF_REG_FP);
> +               insn++;
> +
>                 /* Emit call(ctx, arg2=A, arg3=X) */
>                 insn->code = BPF_JMP | BPF_CALL;
>                 switch (fp->k) {
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 107ed12..cefe1d2 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -275,16 +275,20 @@ EXPORT_SYMBOL(__skb_tx_hash);
>   * truncate packets without needing to push actual payload to the user
>   * space and can analyze headers only, instead.
>   */
> -u32 __skb_get_poff(const struct sk_buff *skb)
> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
> +               bool *flow_initted)
>  {
> -       struct flow_keys keys;
>         u32 poff = 0;
>
> -       if (!skb_flow_dissect(skb, &keys))
> -               return 0;
> +       /* check whether the flow dissector has already been run */
> +       if (!*flow_initted) {
> +               if (!skb_flow_dissect(skb, flow))
> +                       return 0;
> +               *flow_initted = true;
> +       }
>
> -       poff += keys.thoff;
> -       switch (keys.ip_proto) {
> +       poff += flow->thoff;
> +       switch (flow->ip_proto) {
>         case IPPROTO_TCP: {
>                 const struct tcphdr *tcph;
>                 struct tcphdr _tcph;
> --
> 1.9.1.423.g4596e3a
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-16 22:00   ` Alexei Starovoitov
@ 2014-05-19 22:23     ` Chema Gonzalez
  2014-05-20  9:58       ` Daniel Borkmann
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-19 22:23 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Miller, Eric Dumazet, Daniel Borkmann, Network Development

On Fri, May 16, 2014 at 3:00 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> this is way too long for commit log. These details can go after '---' line.
This can be fixed easily ;). I must admit I'm surprised by the
request: the way I see it, a commit log is not part of the code, but
text that justifies a patch, so being extremely descriptive is a good
thing.

>> +        * such field has been inited)
>> +        */
>> +       context = (void *)FP - BPF_MEMWORDS * 4 - sizeof(*context);
>> +       context->flow_initted = false;
>> +
>
> Neither this code nor comment belong in interpreter which is striving to
> be generic. Both have a meaning only for your new instructions in
> socket filters. They do not apply to seccomp and to tracing.
> Calling it 'struct sk_run_filter_ctx' is equally misleading.
> 'struct flow_keys' has nothing to do with seccomp and even with
> normal tcpdump filters.
I'd argue specificity already pervades the BPF filter. Almost all the
ancillary loads (now bpf_calls) assume an sk_buff in the ctx field (all
but get_cpu and random). If the user's filter has a "ld #nla" insn, the
eBPF filter ends up containing "mov(ctx, arg1); mov(A, arg2); ...;
call(__skb_get_nlattr)", which assumes ctx holds an skb, and runs
"nla_find((struct nlattr *) &skb->data[a], ...)".

Now, this specificity is needed: We're using BPF filters to filter
packets. It would make no sense not to use all the packet-related
functions in the kernel.

> We've spent so much time to generalize it and now you want to take
> it back into socket specific territory. As I said in the previous email
> it's totally not ok to customize eBPF just for one use case especially
> when you're not sharing final usage scenario.
Sorry if I haven't mentioned it before.

Right now I use the following filter to detect some packets I'm
interested in ('ip[1] == 0x03 and tcp and dst port 80' in tcpdump
expression-ish):

$ cat tools/net/tos_and_tcp_and_dport.bpf
ldh [12]
jne #0x800, drop
ldb [15]
jne #0x3, drop
ldb [23]
jne #0x6, drop
ldh [20]
jset #0x1fff, drop
ldxb 4*([14]&0xf)
ldh [x + 16]
jne #0x50, drop
ret #-1
drop: ret #0

Now, I want to do the same filtering for encapsulated traffic (vlan
and gre). I'd like to be able to write something like:

$ cat tools/net/tos_and_tcp_and_dport_generic.bpf
ld #nproto
jne #0x800, drop
ld #nhoff
ldh [x + 1]
jne #0x3, drop
...

This will automatically support all the de-encapsulation performed by
the flow dissector.

> Any uapi change must be strongly justified.
I hope I have managed to convince you with the example above. Still,
my proposal only adds a couple of ancillary loads.

> Other than design issue there is technical problem too.
> You're not updating eBPF JIT and context->flow_initted will have junk,
> so ld_poff will be returning junk too.
> JITs and interpreter must match. That was always the case even in
> old days of classic BPF. Interpreter initialized A and X to zero,
> JITs had to do the same. If you add stuff like this to interpreter,
> JITs would have to follow. Fast forward few years and this is not scalable.
> You have several JITs around and adding custom stuff to interpreter
> may not even be possible to do in JITs.
This is a fair point. In fact my patch is breaking "ld poff."

I think the previous mechanism, where a new insn added to BPF would
cause the BPF JIT engine to refuse to run, was more scalable: new
stuff always went into the non-jitted runner, and people interested in
the jitted version would port it when they needed it. We definitely do
not want to require people adding new features to provide 6 versions
of the code.

> Therefore I'm strongly against adding custom C code to interpreter.
> It's an interpreter of eBPF instructions. Nothing else.
> At the same time adding new eBPF instructions would be completely fine.
> Like right now we don't have < and <= conditional jumps, which adds
> unnecessary complexity to tracing filters. So we may still expand
> interpreter and JITs in the future.
>
> Going back to your use case… assuming that you share how user
> space will use new anc loads… you can do equivalent initialization
> of 'flow_initted' by emitting extra eBPF insn while converting classic
> to internal. Don't do it unconditionally though. tcpdump filters should
> not suffer. You can scan program for presence of ld_poff, ld_toff,
> ld_tproto and if they're present, emit single eBPF 'st [fp - off], 0'
> ST_MEM instruction as a first instruction of converted filter. All eBPF
> JITs will compile it to native automatically. You'll have good
> performance and all existing filters will not regress.
Sounds fair. I'll think of a good mechanism to allow different
instructions to share state among themselves.

-Chema


>
>>         /* Register for user BPF programs need to be reset first. */
>>         regs[BPF_REG_A] = 0;
>>         regs[BPF_REG_X] = 0;
>> @@ -602,7 +619,10 @@ static unsigned int pkt_type_offset(void)
>>
>>  static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
>>  {
>> -       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
>> +       struct sk_run_filter_ctx *context = (void *) r4 - BPF_MEMWORDS * 4 -
>> +                       sizeof(*context);
>> +       return __skb_get_poff((struct sk_buff *)(unsigned long) ctx,
>> +                       &context->flow, &context->flow_initted);
>>  }
>>
>>  static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
>> @@ -783,6 +803,10 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
>>                 *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X);
>>                 insn++;
>>
>> +               /* arg4 = FP */
>> +               *insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG4, BPF_REG_FP);
>> +               insn++;
>> +
>>                 /* Emit call(ctx, arg2=A, arg3=X) */
>>                 insn->code = BPF_JMP | BPF_CALL;
>>                 switch (fp->k) {
>> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>> index 107ed12..cefe1d2 100644
>> --- a/net/core/flow_dissector.c
>> +++ b/net/core/flow_dissector.c
>> @@ -275,16 +275,20 @@ EXPORT_SYMBOL(__skb_tx_hash);
>>   * truncate packets without needing to push actual payload to the user
>>   * space and can analyze headers only, instead.
>>   */
>> -u32 __skb_get_poff(const struct sk_buff *skb)
>> +u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
>> +               bool *flow_initted)
>>  {
>> -       struct flow_keys keys;
>>         u32 poff = 0;
>>
>> -       if (!skb_flow_dissect(skb, &keys))
>> -               return 0;
>> +       /* check whether the flow dissector has already been run */
>> +       if (!*flow_initted) {
>> +               if (!skb_flow_dissect(skb, flow))
>> +                       return 0;
>> +               *flow_initted = true;
>> +       }
>>
>> -       poff += keys.thoff;
>> -       switch (keys.ip_proto) {
>> +       poff += flow->thoff;
>> +       switch (flow->ip_proto) {
>>         case IPPROTO_TCP: {
>>                 const struct tcphdr *tcph;
>>                 struct tcphdr _tcph;
>> --
>> 1.9.1.423.g4596e3a
>>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF
  2014-05-19 22:23     ` Chema Gonzalez
@ 2014-05-20  9:58       ` Daniel Borkmann
  0 siblings, 0 replies; 47+ messages in thread
From: Daniel Borkmann @ 2014-05-20  9:58 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: Alexei Starovoitov, David Miller, Eric Dumazet, Network Development

On 05/20/2014 12:23 AM, Chema Gonzalez wrote:
> On Fri, May 16, 2014 at 3:00 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
...
>> Other than design issue there is technical problem too.
>> You're not updating eBPF JIT and context->flow_initted will have junk,
>> so ld_poff will be returning junk too.
>> JITs and interpreter must match. That was always the case even in
>> old days of classic BPF. Interpreter initialized A and X to zero,
>> JITs had to do the same. If you add stuff like this to interpreter,
>> JITs would have to follow. Fast forward few years and this is not scalable.
>> You have several JITs around and adding custom stuff to interpreter
>> may not even be possible to do in JITs.
> This is a fair point. In fact my patch is breaking "ld poff."
>
> I think the previous mechanism, where a new insn added to BPF would
> cause the BPF JIT engine to refuse to run, was more scalable: new
> stuff always went into the non-jitted runner, and people interested in
> the jitted version would port it when they needed it. We definitely do
> not want to require people adding new features to provide 6 versions
> of the code.

Absolutely, "we definitely not want to require people adding new features
to provide 6 versions of the code". I agree with this statement and that
is exactly one of the goals of the new BPF mechanism. You might have seen
how the internal conversion function performs this task for all current
BPF extensions, and that is exactly so that JITs, once they are in place,
do not need to be upgraded 6 times anymore (in contrast to your previous
statement -- as people who want to use one of the extensions, needed to
port them to each of their arch-specific JITs before).

Best,

Daniel

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (9 preceding siblings ...)
  2014-05-16 18:41 ` [PATCH v5 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
@ 2014-05-29 18:55 ` Chema Gonzalez
  2014-05-29 23:54   ` Daniel Borkmann
  2014-05-29 18:56 ` [PATCH v6 net-next 2/4] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (2 subsequent siblings)
  13 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-29 18:55 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov
  Cc: netdev, Chema Gonzalez

This patch makes __skb_get_pay_offset() (the function called by
the eBPF call generated from "ld #poff") store the output of the flow
dissector (a flow_keys structure) in the eBPF stack. This way, multiple
invocations of this eBPF call in the same BPF filter will only require
one single call to the flow dissector.

Note that other eBPF calls that use the flow dissector can use the same
approach, and share the results, so at the end there is only one single
call to the flow dissector per packet.
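
For filters converted from classic BPF, the resulting stack layout (see
struct classic_bpf_stack_layout in the diff below) can be pictured
roughly as:

	FP ->	+-----------------------------+
		| mem[15]                     |
		| ...                         |  classic BPF M[] scratch slots
		| mem[0]                      |
		+-----------------------------+
		| struct flow_keys flow       |  shared flow dissector output
		+-----------------------------+
		| u32 flow_inited             |  zeroed in the filter prologue
		+-----------------------------+  <- FP - sizeof(struct classic_bpf_stack_layout)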

Tested:

$ cat tools/net/ipv4_tcp_poff2.bpf
ldh [12]
jne #0x800, drop
ldb [23]
jneq #6, drop
ld poff
ld poff
ld poff
ld poff
ret #-1
drop: ret #0
$ ./tools/net/bpf_asm tools/net/ipv4_tcp_poff2.bpf
10,40 0 0 12,21 0 7 2048,48 0 0 23,21 0 5 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,6 0 0 4294967295,6 0 0 0,

And then, in a VM, we ran:

$ tcpdump -n -i eth0 -f "10,40 0 0 12,21 0 7 2048,48 0 0 23,21 0 5 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,6 0 0 4294967295,6 0 0 0,"

This tcpdump is github's tcpdump HEAD with pull request #353, which
allows using raw filters in tcpdump.

Debugging the kernel, we can see that only the first "ld poff" call
invokes the flow dissector. The other calls reuse the value in the
eBPF stack.

Also tested the test_bpf module.

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/skbuff.h    |  3 +-
 lib/test_bpf.c            | 52 ++++++++++++++++++++++++++++
 net/core/filter.c         | 86 ++++++++++++++++++++++++++++++++++++++++++-----
 net/core/flow_dissector.c | 16 +++++----
 4 files changed, 141 insertions(+), 16 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7a9beeb..0214b5a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3065,7 +3065,8 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
 
 int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
 
-u32 __skb_get_poff(const struct sk_buff *skb);
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
+		u32 *flow_inited);
 
 /**
  * skb_head_is_locked - Determine if the skb->head is locked down
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index af677cb..0f8128f 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -446,6 +446,58 @@ static struct bpf_test tests[] = {
 		{ { 30, 0 }, { 100, 42 } },
 	},
 	{
+		"LD_PAYLOAD_OFF_STACK",
+		.u.insns = {
+			BPF_STMT(BPF_LD | BPF_IMM, 0x11111111),
+			BPF_STMT(BPF_ST, 0),
+			BPF_STMT(BPF_LD | BPF_IMM, 0x22222222),
+			BPF_STMT(BPF_ST, 1),
+			BPF_STMT(BPF_LD | BPF_IMM, 0xeeeeeeee),
+			BPF_STMT(BPF_ST, 14),
+			BPF_STMT(BPF_LD | BPF_IMM, 0xffffffff),
+			BPF_STMT(BPF_ST, 15),
+			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			    SKF_AD_OFF + SKF_AD_PAY_OFFSET),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 54, 1, 0),
+			BPF_STMT(BPF_RET | BPF_K, 0),
+			BPF_STMT(BPF_LD | BPF_MEM, 0),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x11111111, 1, 0),
+			BPF_STMT(BPF_RET | BPF_K, 0),
+			BPF_STMT(BPF_LD | BPF_MEM, 1),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x22222222, 1, 0),
+			BPF_STMT(BPF_RET | BPF_K, 0),
+			BPF_STMT(BPF_LD | BPF_MEM, 14),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0xeeeeeeee, 1, 0),
+			BPF_STMT(BPF_RET | BPF_K, 0),
+			BPF_STMT(BPF_LD | BPF_MEM, 15),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0xffffffff, 1, 0),
+			BPF_STMT(BPF_RET | BPF_K, 0),
+			BPF_STMT(BPF_RET | BPF_K, 1)
+		},
+		CLASSIC,
+		/* 01:02:03:04:05:06 < 07:08:09:0a:0b:0c, ethertype IPv4(0x0800)
+		 * length 94: 10.1.1.1.10000 > 10.1.1.2.22: Flags [P.],
+		 * seq 1:21, ack 2, win 14400, length 20
+		 */
+		{ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+		  0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c,
+		  0x08, 0x00,
+		  /* ip header */
+		  0x45, 0x10, 0x00, 0x5e,
+		  0x75, 0xb5, 0x40, 0x00,
+		  0x40, 0x06, 0xad, 0x2e,
+		  0x0a, 0x01, 0x01, 0x01, /* ip src */
+		  0x0a, 0x01, 0x01, 0x02, /* ip dst */
+		  /* tcp header */
+		  0x27, 0x10, 0x00, 0x16, /* tcp src/dst port */
+		  0x00, 0x00, 0x00, 0x01, /* tcp seq# */
+		  0x00, 0x00, 0x00, 0x02, /* tcp ack# */
+			0x50, 0x00, 0x38, 0x40,
+			0x9a, 0x42, 0x00, 0x00,
+		},
+		{ { 100, 1 } },
+	},
+	{
 		"LD_ANC_XOR",
 		.u.insns = {
 			BPF_STMT(BPF_LD | BPF_IMM, 10),
diff --git a/net/core/filter.c b/net/core/filter.c
index 2c2d35d..0c55252 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -66,6 +66,24 @@
 #define CTX	regs[BPF_REG_CTX]
 #define K	insn->imm
 
+/* classic BPF stack layout:
+ *
+ * This is the layout for the stack for eBPF filters generated from
+ * classic BPF filters.
+ *
+ * Top (BPF_MEMWORDS * 4) bytes are used to represent classic BPF
+ * mem[0-15] slots.
+ *
+ * Flow dissector users (poff so far) use the space just below mem[]
+ * to share the flow_keys obtained from dissecting the flow, and a
+ * bool stating whether such field has been inited.
+ */
+struct classic_bpf_stack_layout {
+	u32 flow_inited;
+	struct flow_keys flow;
+	u32 mem[BPF_MEMWORDS];
+};
+
 /* No hurry in this branch
  *
  * Exported for the bpf jit load helper.
@@ -596,9 +614,13 @@ static unsigned int pkt_type_offset(void)
 	return -1;
 }
 
-static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 {
-	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
+	struct classic_bpf_stack_layout *stack_layout =
+	    (void *) fp - sizeof(struct classic_bpf_stack_layout);
+	return __skb_get_poff((struct sk_buff *)(unsigned long) ctx,
+			      &stack_layout->flow,
+			      &stack_layout->flow_inited);
 }
 
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
@@ -779,7 +801,11 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG3, BPF_REG_X);
 		insn++;
 
-		/* Emit call(ctx, arg2=A, arg3=X) */
+		/* arg4 = FP */
+		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG4, BPF_REG_FP);
+		insn++;
+
+		/* Emit call(ctx, arg2=A, arg3=X, arg4=FP) */
 		insn->code = BPF_JMP | BPF_CALL;
 		switch (fp->k) {
 		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
@@ -818,6 +844,45 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	return true;
 }
 
+void __sk_convert_filter_prologue(struct sock_filter *fp, int len,
+		struct sock_filter_int **new_insn)
+{
+	bool use_flow_dissector = false;
+	bool do_write_code = false;
+	int i;
+
+	/* check if there are any insn's that use the flow dissector */
+	for (i = 0; i < len; fp++, i++) {
+		if (BPF_CLASS(fp->code) == BPF_LD &&
+		    BPF_MODE(fp->code) == BPF_ABS &&
+		    fp->k == SKF_AD_OFF + SKF_AD_PAY_OFFSET) {
+			use_flow_dissector = true;
+			break;
+		}
+	}
+
+	do_write_code = (*new_insn != NULL);
+
+	/* first init the stack */
+	if (use_flow_dissector) {
+		/* stack_layout->flow_inited = 0; */
+		if (do_write_code) {
+			(*new_insn)->code = BPF_ST | BPF_MEM | BPF_W;
+			(*new_insn)->a_reg = BPF_REG_FP;
+			(*new_insn)->x_reg = 0;
+			(*new_insn)->off = (s16) (
+			-sizeof(struct classic_bpf_stack_layout) +
+			offsetof(struct classic_bpf_stack_layout, flow_inited));
+			(*new_insn)->imm = 0;
+		}
+		(*new_insn)++;
+	}
+	/* ctx = arg1 */
+	if (do_write_code)
+		**new_insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_CTX, BPF_REG_ARG1);
+	(*new_insn)++;
+}
+
 /**
  *	sk_convert_filter - convert filter program
  *	@prog: the user passed filter program
@@ -867,10 +932,7 @@ do_pass:
 	new_insn = new_prog;
 	fp = prog;
 
-	if (new_insn) {
-		*new_insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_CTX, BPF_REG_ARG1);
-	}
-	new_insn++;
+	__sk_convert_filter_prologue(fp, len, &new_insn);
 
 	for (i = 0; i < len; fp++, i++) {
 		struct sock_filter_int tmp_insns[6] = { };
@@ -1041,7 +1103,10 @@ do_pass:
 			insn->a_reg = BPF_REG_FP;
 			insn->x_reg = fp->code == BPF_ST ?
 				      BPF_REG_A : BPF_REG_X;
-			insn->off = -(BPF_MEMWORDS - fp->k) * 4;
+			insn->off =
+			    -sizeof(struct classic_bpf_stack_layout) +
+			    offsetof(struct classic_bpf_stack_layout,
+				     mem[fp->k]);
 			break;
 
 		/* Load from stack. */
@@ -1051,7 +1116,10 @@ do_pass:
 			insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ?
 				      BPF_REG_A : BPF_REG_X;
 			insn->x_reg = BPF_REG_FP;
-			insn->off = -(BPF_MEMWORDS - fp->k) * 4;
+			insn->off =
+			    -sizeof(struct classic_bpf_stack_layout) +
+			    offsetof(struct classic_bpf_stack_layout,
+				     mem[fp->k]);
 			break;
 
 		/* A = K or X = K */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12..bf3cb99 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -275,16 +275,20 @@ EXPORT_SYMBOL(__skb_tx_hash);
  * truncate packets without needing to push actual payload to the user
  * space and can analyze headers only, instead.
  */
-u32 __skb_get_poff(const struct sk_buff *skb)
+u32 __skb_get_poff(const struct sk_buff *skb, struct flow_keys *flow,
+		u32 *flow_inited)
 {
-	struct flow_keys keys;
 	u32 poff = 0;
 
-	if (!skb_flow_dissect(skb, &keys))
-		return 0;
+	/* check whether the flow dissector has already been run */
+	if (!*flow_inited) {
+		if (!skb_flow_dissect(skb, flow))
+			return 0;
+		*flow_inited = 1;
+	}
 
-	poff += keys.thoff;
-	switch (keys.ip_proto) {
+	poff += flow->thoff;
+	switch (flow->ip_proto) {
 	case IPPROTO_TCP: {
 		const struct tcphdr *tcph;
 		struct tcphdr _tcph;
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v6 net-next 2/4] net: filter: add insn for loading internal transport header offset
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (10 preceding siblings ...)
  2014-05-29 18:55 ` [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF Chema Gonzalez
@ 2014-05-29 18:56 ` Chema Gonzalez
  2014-05-29 18:56 ` [PATCH v6 net-next 3/4] net: filter: add insn for loading internal transport header proto Chema Gonzalez
  2014-05-29 18:56 ` [PATCH v6 net-next 4/4] net: filter: minor BPF cleanups Chema Gonzalez
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-29 18:56 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_OFFSET insn that loads the offset of the internal
transport header of a packet ("internal" meaning after decapsulation by
the flow dissector). The goal is to make it easy to write filters based
on the inner-most header.

For example, the following filter will capture all packets whose
inner-most L4 dst port is 80 ("ldh [X+2]" reads the half-word at
thoff + 2, i.e. the TCP/UDP destination port):

ld #toff
tax
ldh [X+2]
jneq #80, drop
ret #-1
drop: ret #0

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 23 ++++++++++++++++++++++-
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 625f4de..9bc7771 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -304,6 +304,7 @@ enum {
 	BPF_S_ANC_VLAN_TAG_PRESENT,
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
+	BPF_S_ANC_TRA_OFFSET,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 253b4d4..9f1b8f1 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -131,7 +131,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_TRA_OFFSET	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c55252..0d67498 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -623,6 +623,20 @@ static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 			      &stack_layout->flow_inited);
 }
 
+static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
+{
+	struct classic_bpf_stack_layout *stack_layout =
+	    (void *) fp - sizeof(struct classic_bpf_stack_layout);
+	/* check whether the flow dissector has already been run */
+	if (!stack_layout->flow_inited) {
+		if (!skb_flow_dissect((struct sk_buff *)(unsigned long) ctx,
+		    &stack_layout->flow))
+			return 0;
+		stack_layout->flow_inited = 1;
+	}
+	return stack_layout->flow.thoff;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -789,6 +803,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
+	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -823,6 +838,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_RANDOM:
 			insn->imm = __get_random_u32 - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+			insn->imm = __skb_get_tra_offset - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -855,7 +873,8 @@ void __sk_convert_filter_prologue(struct sock_filter *fp, int len,
 	for (i = 0; i < len; fp++, i++) {
 		if (BPF_CLASS(fp->code) == BPF_LD &&
 		    BPF_MODE(fp->code) == BPF_ABS &&
-		    fp->k == SKF_AD_OFF + SKF_AD_PAY_OFFSET) {
+		    (fp->k == SKF_AD_OFF + SKF_AD_PAY_OFFSET ||
+		     fp->k == SKF_AD_OFF + SKF_AD_TRA_OFFSET)) {
 			use_flow_dissector = true;
 			break;
 		}
@@ -1402,6 +1421,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(VLAN_TAG_PRESENT);
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
+			ANCILLARY(TRA_OFFSET);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1815,6 +1835,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 833a966..4e72934 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -93,6 +93,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_tci") { return K_VLANT; }
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
+"#"?("toff")	{ return K_TOFF; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index e6306c5..ced6949 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -167,6 +167,9 @@ ldb
 	| OP_LDB K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDB K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldh
@@ -218,6 +221,9 @@ ldh
 	| OP_LDH K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LDH K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	;
 
 ldi
@@ -274,6 +280,9 @@ ld
 	| OP_LD K_RAND {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_RANDOM); }
+	| OP_LD K_TOFF {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v6 net-next 3/4] net: filter: add insn for loading internal transport header proto
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (11 preceding siblings ...)
  2014-05-29 18:56 ` [PATCH v6 net-next 2/4] net: filter: add insn for loading internal transport header offset Chema Gonzalez
@ 2014-05-29 18:56 ` Chema Gonzalez
  2014-05-29 18:56 ` [PATCH v6 net-next 4/4] net: filter: minor BPF cleanups Chema Gonzalez
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-29 18:56 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov
  Cc: netdev, Chema Gonzalez

Patch adds an ANC_TRA_PROTOCOL insn that loads the protocol of
the internal transport header of a packet ("internal" meaning after
decapsulation by the flow dissector).

For example, the following filter will capture all packets whose
inner-most TCP dst port is 80:

ld #tproto
jneq #6, drop
ld #toff
tax
ldh [X+2]
jneq #80, drop
ret #-1
drop: ret #0

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 include/linux/filter.h      |  1 +
 include/uapi/linux/filter.h |  3 ++-
 net/core/filter.c           | 23 ++++++++++++++++++++++-
 tools/net/bpf_exp.l         |  1 +
 tools/net/bpf_exp.y         | 11 ++++++++++-
 5 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9bc7771..02e6ed8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -305,6 +305,7 @@ enum {
 	BPF_S_ANC_PAY_OFFSET,
 	BPF_S_ANC_RANDOM,
 	BPF_S_ANC_TRA_OFFSET,
+	BPF_S_ANC_TRA_PROTOCOL,
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h
index 9f1b8f1..e34d608 100644
--- a/include/uapi/linux/filter.h
+++ b/include/uapi/linux/filter.h
@@ -132,7 +132,8 @@ struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
 #define SKF_AD_TRA_OFFSET	60
-#define SKF_AD_MAX	64
+#define SKF_AD_TRA_PROTOCOL	64
+#define SKF_AD_MAX	68
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 0d67498..fcc2783 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -637,6 +637,20 @@ static u64 __skb_get_tra_offset(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 	return stack_layout->flow.thoff;
 }
 
+static u64 __skb_get_tra_protocol(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
+{
+	struct classic_bpf_stack_layout *stack_layout =
+	    (void *) fp - sizeof(struct classic_bpf_stack_layout);
+	/* check whether the flow dissector has already been run */
+	if (!stack_layout->flow_inited) {
+		if (!skb_flow_dissect((struct sk_buff *)(unsigned long) ctx,
+		    &stack_layout->flow))
+			return 0;
+		stack_layout->flow_inited = 1;
+	}
+	return stack_layout->flow.ip_proto;
+}
+
 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
@@ -804,6 +818,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 	case SKF_AD_OFF + SKF_AD_CPU:
 	case SKF_AD_OFF + SKF_AD_RANDOM:
 	case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
+	case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
 		/* arg1 = ctx */
 		*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_ARG1, BPF_REG_CTX);
 		insn++;
@@ -841,6 +856,9 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
 		case SKF_AD_OFF + SKF_AD_TRA_OFFSET:
 			insn->imm = __skb_get_tra_offset - __bpf_call_base;
 			break;
+		case SKF_AD_OFF + SKF_AD_TRA_PROTOCOL:
+			insn->imm = __skb_get_tra_protocol - __bpf_call_base;
+			break;
 		}
 		break;
 
@@ -874,7 +892,8 @@ void __sk_convert_filter_prologue(struct sock_filter *fp, int len,
 		if (BPF_CLASS(fp->code) == BPF_LD &&
 		    BPF_MODE(fp->code) == BPF_ABS &&
 		    (fp->k == SKF_AD_OFF + SKF_AD_PAY_OFFSET ||
-		     fp->k == SKF_AD_OFF + SKF_AD_TRA_OFFSET)) {
+		     fp->k == SKF_AD_OFF + SKF_AD_TRA_OFFSET ||
+		     fp->k == SKF_AD_OFF + SKF_AD_TRA_PROTOCOL)) {
 			use_flow_dissector = true;
 			break;
 		}
@@ -1422,6 +1441,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
 			ANCILLARY(PAY_OFFSET);
 			ANCILLARY(RANDOM);
 			ANCILLARY(TRA_OFFSET);
+			ANCILLARY(TRA_PROTOCOL);
 			}
 
 			/* ancillary operation unknown or unsupported */
@@ -1836,6 +1856,7 @@ void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to)
 		[BPF_S_ANC_PAY_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_RANDOM]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_ANC_TRA_OFFSET]	= BPF_LD|BPF_B|BPF_ABS,
+		[BPF_S_ANC_TRA_PROTOCOL]	= BPF_LD|BPF_B|BPF_ABS,
 		[BPF_S_LD_W_LEN]	= BPF_LD|BPF_W|BPF_LEN,
 		[BPF_S_LD_W_IND]	= BPF_LD|BPF_W|BPF_IND,
 		[BPF_S_LD_H_IND]	= BPF_LD|BPF_H|BPF_IND,
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
index 4e72934..c1b5958 100644
--- a/tools/net/bpf_exp.l
+++ b/tools/net/bpf_exp.l
@@ -94,6 +94,7 @@ extern void yyerror(const char *str);
 "#"?("vlan_pr")	{ return K_VLANP; }
 "#"?("rand")	{ return K_RAND; }
 "#"?("toff")	{ return K_TOFF; }
+"#"?("tproto")	{ return K_TPROTO; }
 
 ":"		{ return ':'; }
 ","		{ return ','; }
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
index ced6949..71fef72 100644
--- a/tools/net/bpf_exp.y
+++ b/tools/net/bpf_exp.y
@@ -56,7 +56,7 @@ static void bpf_set_jmp_label(char *label, enum jmp_type type);
 %token OP_LDXI
 
 %token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
-%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND K_TOFF K_TPROTO
 
 %token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
 
@@ -170,6 +170,9 @@ ldb
 	| OP_LDB K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDB K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldh
@@ -224,6 +227,9 @@ ldh
 	| OP_LDH K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LDH K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	;
 
 ldi
@@ -283,6 +289,9 @@ ld
 	| OP_LD K_TOFF {
 		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
 				   SKF_AD_OFF + SKF_AD_TRA_OFFSET); }
+	| OP_LD K_TPROTO {
+		bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+				   SKF_AD_OFF + SKF_AD_TRA_PROTOCOL); }
 	| OP_LD 'M' '[' number ']' {
 		bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
 	| OP_LD '[' 'x' '+' number ']' {
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [PATCH v6 net-next 4/4] net: filter: minor BPF cleanups
  2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
                   ` (12 preceding siblings ...)
  2014-05-29 18:56 ` [PATCH v6 net-next 3/4] net: filter: add insn for loading internal transport header proto Chema Gonzalez
@ 2014-05-29 18:56 ` Chema Gonzalez
  13 siblings, 0 replies; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-29 18:56 UTC (permalink / raw)
  To: David Miller, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov
  Cc: netdev, Chema Gonzalez

Includes:
 - the 4th argument of the BPF_CALL helpers is the FP register, so name it fp
 - unused fields in LD[X]/ST[X] insns should be zero
 - some typo fixes

Signed-off-by: Chema Gonzalez <chema@google.com>
---
 net/core/filter.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index fcc2783..fd9c7bf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -651,7 +651,7 @@ static u64 __skb_get_tra_protocol(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 	return stack_layout->flow.ip_proto;
 }
 
-static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
 	struct nlattr *nla;
@@ -672,7 +672,7 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return 0;
 }
 
-static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
 	struct nlattr *nla;
@@ -697,13 +697,13 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 	return 0;
 }
 
-static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 {
 	return raw_smp_processor_id();
 }
 
 /* note that this only generates 32-bit random numbers */
-static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
+static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
 {
 	return prandom_u32();
 }
@@ -1094,7 +1094,7 @@ do_pass:
 			EMIT_JMP;
 			break;
 
-		/* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
+		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
 		case BPF_LDX | BPF_MSH | BPF_B:
 			/* tmp = A */
 			*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_TMP, BPF_REG_A);
@@ -1120,7 +1120,7 @@ do_pass:
 			*insn = BPF_ALU64_REG(BPF_MOV, BPF_REG_A, BPF_REG_TMP);
 			break;
 
-		/* RET_K, RET_A are remaped into 2 insns. */
+		/* RET_K, RET_A are remapped into 2 insns. */
 		case BPF_RET | BPF_A:
 		case BPF_RET | BPF_K:
 			insn->code = BPF_ALU | BPF_MOV |
@@ -1145,6 +1145,7 @@ do_pass:
 			    -sizeof(struct classic_bpf_stack_layout) +
 			    offsetof(struct classic_bpf_stack_layout,
 				     mem[fp->k]);
+			insn->imm = 0;
 			break;
 
 		/* Load from stack. */
@@ -1158,6 +1159,7 @@ do_pass:
 			    -sizeof(struct classic_bpf_stack_layout) +
 			    offsetof(struct classic_bpf_stack_layout,
 				     mem[fp->k]);
+			insn->imm = 0;
 			break;
 
 		/* A = K or X = K */
-- 
1.9.1.423.g4596e3a

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-05-29 18:55 ` [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF Chema Gonzalez
@ 2014-05-29 23:54   ` Daniel Borkmann
  2014-05-30 17:12     ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Daniel Borkmann @ 2014-05-29 23:54 UTC (permalink / raw)
  To: Chema Gonzalez; +Cc: David Miller, Eric Dumazet, Alexei Starovoitov, netdev

On 05/29/2014 08:55 PM, Chema Gonzalez wrote:
> This patch makes __skb_get_pay_offset() (the function called by
> the eBPF call generated from "ld #poff") store the output of the flow
> dissector (a flow_keys structure) in the eBPF stack. This way, multiple
> invocations of this eBPF call in the same BPF filter will only require
> one single call to the flow dissector.
>
> Note that other eBPF calls that use the flow dissector can use the same
> approach, and share the results, so at the end there is only one single
> call to the flow dissector per packet.
>
> Tested:
>
> $ cat tools/net/ipv4_tcp_poff2.bpf
> ldh [12]
> jne #0x800, drop
> ldb [23]
> jneq #6, drop
> ld poff
> ld poff
> ld poff
> ld poff
> ret #-1
> drop: ret #0
> $ ./tools/net/bpf_asm tools/net/ipv4_tcp_poff2.bpf
> 10,40 0 0 12,21 0 7 2048,48 0 0 23,21 0 5 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,6 0 0 4294967295,6 0 0 0,
>
> And then, in a VM, we ran:
>
> $ tcpdump -n -i eth0 -f "10,40 0 0 12,21 0 7 2048,48 0 0 23,21 0 5 6,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,32 0 0 4294963252,6 0 0 4294967295,6 0 0 0,"
>
> This tcpdump is github's tcpdump HEAD with pull request #353, which
> allows using raw filters in tcpdump.
>
> Debugging the kernel, we can see that only the first "ld poff" call
> actually invokes the flow dissector. The other calls reuse the value in the
> eBPF stack.
>
> Also tested the test_bpf module.
>
> Signed-off-by: Chema Gonzalez <chema@google.com>

I actually liked [1] most ... ;)

Just an idea ...

Have you taken into account the following: since thoff is u16 and ip_proto
is u8 and you clearly seem to want to have the value of these two in your
patch set (or even plus the poff value), why not squashing all these into
one extension e.g. SKF_AD_DISSECT where you call the external skb_flow_dissect()
helper or similar on?

I could imagine that either these offsets are squashed into the return or
stored if you really need them from the struct flow_keys into M[] itself. So
you would end up with one e.g. 'ld #keys' call that e.g. fills out/overwrites
your M[] with the dissected metadata. Then, you'd only have a reason to call
that once and do further processing based on these information. Whether you
also need poff could e.g. be determined if the user does e.g. 'ldx #1' or so
to indicate to the function it eventually calls, that it would further do
the dissect and also give you poff into fixed defined M[] slots back.

Thus if someone really only cares about headers, a use case like 'ld #poff +
ret a' is sufficient, but if you need much more and don't want to do this in
BPF like in your case, you go with 'ld #keys' and process your BPF program
further via M[].

Thanks,

Daniel

  [1] http://patchwork.ozlabs.org/patch/344271/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-05-29 23:54   ` Daniel Borkmann
@ 2014-05-30 17:12     ` Chema Gonzalez
  2014-06-02 12:36       ` Daniel Borkmann
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-05-30 17:12 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: David Miller, Eric Dumazet, Alexei Starovoitov, Network Development

On Thu, May 29, 2014 at 4:54 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
> I actually liked [1] most ... ;)
>
> Just an idea ...
>
> Have you taken into account the following: since thoff is u16 and ip_proto
> is u8 and you clearly seem to want to have the value of these two in your
> patch set (or even plus the poff value), why not squashing all these into
Note that I not only want thoff and ip_proto (the ones in patch #1 and
#2), but also nhoff and nproto. I could use the other flow_keys fields
(saddr, daddr, sport, dport), but they're one proto
comparison+indirect load away from [tn]hoff/[ip_|n]proto, so I don't
think it's worth changing BPF for them. Now, nhoff and proto are not
kept by the flow dissector. We cannot just add them to flow_keys as
the struct is used in some skb CB's that don't have any space left. I
have a patch that replaces thoff/ip_proto with nhoff/proto (as the L4
info is easily obtainable from the L3 one, but not the other way
around), plus a fix of the couple of places where the L4 info is
actually used. I'll post it after these patches (and after your
code/decode cleaning goes through).

> one extension e.g. SKF_AD_DISSECT where you call the external
> skb_flow_dissect()
> helper or similar on?
>
> I could imagine that either these offsets are squashed into the return or
> stored if you really need them from the struct flow_keys into M[] itself. So
> you would end up with one e.g. 'ld #keys' call that e.g. fills
> out/overwrites
> your M[] with the dissected metadata. Then, you'd only have a reason to call
> that once and do further processing based on these information. Whether you
> also need poff could e.g. be determined if the user does e.g. 'ldx #1' or so
> to indicate to the function it eventually calls, that it would further do
> the dissect and also give you poff into fixed defined M[] slots back.

IIUC, you're suggesting to have the classic BPF user writes "ld #keys"
to load flow_keys in the stack and then *explicitly* adds "ld
mem[<offset>]" to access the field she's interested on. Note that if
the "ld mem[]" is implicit, then the user must use "ld #thoff" to let
the compiler know she wants "ld #keys" and then "ld
mem[<thoff_offset>]" (and not "ld mem[<other_offset>]"), which is
equivalent to what we're doing right now*.

The only advantage I can see is that we're only adding one new
ancillary load, which is more elegant. I can see some issues with this
approach:

- usability. The user has to actually know what's the right offset of
the thoff, which is cumbersome and may change with kernels. We'd be
effectively exporting the flow_keys struct layout to the users.
- have to take care that the classic BPF filter does not muck with
mem[] between the "ld #keys" and all of the the "ld* mem[]" that
access to it. Note that we're currently storing the flow_keys struct
in the stack outside of the mem[] words, so this is not a problem
here. (It is only accessible using ebpf insns.)

*Now, if your comment is ebpf-only, that could be ok. That would mean:

- a. the user writes "ld #thoff"
- b. the BPF->eBPF compiler generates one common BPF_CALL
(__skb_get_flow_dissect) plus a "ebpf_ldx stack[right_offset]". This
would not include payload_offset (which needs to run its own function
to get the poff from flow_keys).

Is this what you are proposing?

-Chema


> Thus if someone really only cares about headers, a use case like 'ld #poff +
> ret a' is sufficient, but if you need much more and don't want to do this in
> BPF like in your case, you go with 'ld #keys' and process your BPF program
> further via M[].
>
> Thanks,
>
> Daniel
>
>  [1] http://patchwork.ozlabs.org/patch/344271/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-05-30 17:12     ` Chema Gonzalez
@ 2014-06-02 12:36       ` Daniel Borkmann
  2014-06-02 16:48         ` Alexei Starovoitov
  0 siblings, 1 reply; 47+ messages in thread
From: Daniel Borkmann @ 2014-06-02 12:36 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: David Miller, Eric Dumazet, Alexei Starovoitov, Network Development

On 05/30/2014 07:12 PM, Chema Gonzalez wrote:
> On Thu, May 29, 2014 at 4:54 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> I actually liked [1] most ... ;)
...
>> one extension e.g. SKF_AD_DISSECT where you call the external
>> skb_flow_dissect()
>> helper or similar on?
>>
>> I could imagine that either these offsets are squashed into the return or
>> stored if you really need them from the struct flow_keys into M[] itself. So
>> you would end up with one e.g. 'ld #keys' call that e.g. fills
>> out/overwrites
>> your M[] with the dissected metadata. Then, you'd only have a reason to call
>> that once and do further processing based on these information. Whether you
>> also need poff could e.g. be determined if the user does e.g. 'ldx #1' or so
>> to indicate to the function it eventually calls, that it would further do
>> the dissect and also give you poff into fixed defined M[] slots back.
>
> IIUC, you're suggesting to have the classic BPF user writes "ld #keys"
> to load flow_keys in the stack and then *explicitly* adds "ld
> mem[<offset>]" to access the field she's interested on. Note that if
> the "ld mem[]" is implicit, then the user must use "ld #thoff" to let
> the compiler know she wants "ld #keys" and then "ld
> mem[<thoff_offset>]" (and not "ld mem[<other_offset>]"), which is
> equivalent to what we're doing right now*.

I think there are many ways to design that, but for something possibly
generic, what I mean is something like this:

We would a-priori know per API documentation what slots in M[] are
filled with which information, just for example:

M[0] <-- nhoff  [network header off]
M[1] <-- nproto [network header proto]
M[2] <-- thoff  [transport header off]
M[3] <-- tproto [transport header proto]
M[4] <-- poff   [payload off]

Now a user could load the following pseudo bpf_asm style program:

ld #5     <-- indicates to fill the first 5 slots of M[], so M[0] to M[4]
ld #keys  <-- triggers the extension to fill the M[] slots
ld M[0]   <-- loads nhoff from M[0] into accu
<do sth with it>
ld M[3]   <-- loads tproto into accu, etc
...

A program like:

ld #2
ld #keys
...

Would then on the other hand only fill the first two slots of M[] as
the user does not need all information from the kernel's flow dissector
and thus also does not fully need to dissect the skb.

This also permits in future to add new fields by enabling them with
ld #6 etc, for example.

I think this would fit much more into the design of BPF (although I
don't strictly like referring to the kernel's flow dissector and thus
bypassing BPF's actual purpose; but I understand that it can get quite
complicated with tunnels, etc).

But this idea would allow you to only add 1 new extension and to have
it return dynamically a part or all information you would need in your
program, and thus solves the issue of calling the skb flow dissector
multiple times just because we have ld #thoff, ld #nhoff, etc, we can
avoid making such a stack_layout + filter_prologue hack and thus design
it more cleanly into the BPF architecture.

> The only advantage I can see is that we're only adding one new
> ancillary load, which is more elegant. I can see some issues with this
> approach:
>
> - usability. The user has to actually know what's the right offset of
> the thoff, which is cumbersome and may change with kernels. We'd be
> effectively exporting the flow_keys struct layout to the users.

See above.

> - have to take care that the classic BPF filter does not muck with
> mem[] between the "ld #keys" and all of the the "ld* mem[]" that
> access to it. Note that we're currently storing the flow_keys struct
> in the stack outside of the mem[] words, so this is not a problem
> here. (It is only accessible using ebpf insns.)

Since it is part of the API/ABI, and a new instruction, it is then
known that ld #keys would fill particular slots of M[] to the user.
That however, must be carefully designed, so that in future one
doesn't need to add ld #keys2. The part of not mucking with M[] fields
is then part of the user's task actually, just the same way as a
user shouldn't store something to A resp. X while an ld resp. ldx
knowingly would overwrite that value stored previously. I don't
think it would be any different than that.

> *Now, if your comment is ebpf-only, that could be ok. That would mean:
>
> - a. the user writes "ld #thoff"
> - b. the BPF->eBPF compiler generates one common BPF_CALL
> (__skb_get_flow_dissect) plus a "ebpf_ldx stack[right_offset]". This
> would not include payload_offset (which needs to run its own function
> to get the poff from flow_keys).

Since we're not using eBPF in user space right now, the comment is not
about eBPF. I think if you would use eBPF from user space, you don't
need to add such extensions anymore, but could just write yourself a
minimal flow dissector in 'restricted' C to solve that problem.
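
For illustration only, a rough sketch of such a minimal dissector written
in plain, restricted-C style could look like the code below. Everything in
it (struct/function names, the flat packet buffer) is made up for the
example; a real eBPF program would read packet bytes through its context
instead:

#include <stdint.h>

struct mini_keys {
	uint16_t nhoff;		/* network header offset */
	uint16_t thoff;		/* transport header offset */
	uint8_t  ip_proto;	/* L4 protocol */
};

/* Fills *keys and returns 1 on success, 0 if not plain Ethernet/IPv4. */
static int mini_flow_dissect(const uint8_t *pkt, unsigned int len,
			     struct mini_keys *keys)
{
	uint16_t proto;

	if (len < 14)				/* Ethernet header */
		return 0;
	keys->nhoff = 14;
	proto = (pkt[12] << 8) | pkt[13];	/* EtherType */

	if (proto != 0x0800)			/* IPv4 only in this sketch */
		return 0;
	if (len < keys->nhoff + 20u)		/* minimal IPv4 header */
		return 0;

	keys->ip_proto = pkt[keys->nhoff + 9];	/* protocol field */
	keys->thoff = keys->nhoff + (pkt[keys->nhoff] & 0x0f) * 4; /* IHL * 4 */
	return 1;
}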

Cheers,

Daniel

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-02 12:36       ` Daniel Borkmann
@ 2014-06-02 16:48         ` Alexei Starovoitov
  2014-06-03  8:33           ` Daniel Borkmann
  0 siblings, 1 reply; 47+ messages in thread
From: Alexei Starovoitov @ 2014-06-02 16:48 UTC (permalink / raw)
  To: Daniel Borkmann, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook
  Cc: Chema Gonzalez, David Miller, Eric Dumazet, Network Development, LKML

extending cc-list, since I think this thread is related to bpf split thread.

On Mon, Jun 2, 2014 at 5:36 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 05/30/2014 07:12 PM, Chema Gonzalez wrote:
>>
>> On Thu, May 29, 2014 at 4:54 PM, Daniel Borkmann <dborkman@redhat.com>
>> wrote:
>>>
>>> I actually liked [1] most ... ;)
>
> ...
>
>>> one extension e.g. SKF_AD_DISSECT where you call the external
>>> skb_flow_dissect()
>>> helper or similar on?
>>>
>>> I could imagine that either these offsets are squashed into the return or
>>> stored if you really need them from the struct flow_keys into M[] itself.
>>> So
>>> you would end up with one e.g. 'ld #keys' call that e.g. fills
>>> out/overwrites
>>> your M[] with the dissected metadata. Then, you'd only have a reason to
>>> call
>>> that once and do further processing based on these information. Whether
>>> you
>>> also need poff could e.g. be determined if the user does e.g. 'ldx #1' or
>>> so
>>> to indicate to the function it eventually calls, that it would further do
>>> the dissect and also give you poff into fixed defined M[] slots back.
>>
>>
>> IIUC, you're suggesting to have the classic BPF user writes "ld #keys"
>> to load flow_keys in the stack and then *explicitly* adds "ld
>> mem[<offset>]" to access the field she's interested on. Note that if
>> the "ld mem[]" is implicit, then the user must use "ld #thoff" to let
>> the compiler know she wants "ld #keys" and then "ld
>> mem[<thoff_offset>]" (and not "ld mem[<other_offset>]"), which is
>> equivalent to what we're doing right now*.
>
>
> I think there are many ways to design that, but for something possibly
> generic, what I mean is something like this:
>
> We would a-priori know per API documentation what slots in M[] are
> filled with which information, just for example:
>
> M[0] <-- nhoff  [network header off]
> M[1] <-- nproto [network header proto]
> M[2] <-- thoff  [transport header off]
> M[3] <-- tproto [transport header proto]
> M[4] <-- poff   [payload off]
>
> Now a user could load the following pseudo bpf_asm style program:
>
> ld #5     <-- indicates to fill the first 5 slots of M[], so M[0] to M[4]
> ld #keys  <-- triggers the extension to fill the M[] slots
> ld M[0]   <-- loads nhoff from M[0] into accu
> <do sth with it>
> ld M[3]   <-- loads tproto into accu, etc
> …

imo there are pros and cons in Daniel's and Chema's proposals
for classic BPF extensions.
I like Chema's a bit more, since his proposal doesn't require to
change classic BPF verifier and that's always a delicate area
(seccomp also allows to use M[] slots).
Both still look like a stop gap until next classic extension
appears on horizon.

I think explicit eBPF program solves this particular task cleaner
and it doesn't require changing eBPF.

> A program like:
>
> ld #2
> ld #keys
> ...
>
> Would then on the other hand only fill the first two slots of M[] as
> the user does not need all information from the kernel's flow dissector
> and thus also does not fully need to dissect the skb.
>
> This also permits in future to add new fields by enabling them with
> ld #6 etc, for example.
>
> I think this would fit much more into the design of BPF (although I
> don't strictly like referring to the kernel's flow dissector and thus
> bypassing BPF's actual purpose; but I understand that it can get quite
> complicated with tunnels, etc).
>
> But this idea would allow you to only add 1 new extension and to have
> it return dynamically a part or all information you would need in your
> program, and thus solves the issue of calling the skb flow dissector
> multiple times just because we have ld #thoff, ld #nhoff, etc, we can
> avoid making such a stack_layout + filter_prologue hack and thus design
> it more cleanly into the BPF architecture.
>
>
>> The only advantage I can see is that we're only adding one new
>> ancillary load, which is more elegant. I can see some issues with this
>> approach:
>>
>> - usability. The user has to actually know what's the right offset of
>> the thoff, which is cumbersome and may change with kernels. We'd be
>> effectively exporting the flow_keys struct layout to the users.
>
>
> See above.
>
>
>> - have to take care that the classic BPF filter does not muck with
>> mem[] between the "ld #keys" and all of the the "ld* mem[]" that
>> access to it. Note that we're currently storing the flow_keys struct
>> in the stack outside of the mem[] words, so this is not a problem
>> here. (It is only accessible using ebpf insns.)
>
>
> Since it is part of the API/ABI, and a new instruction, it is then
> known that ld #keys would fill particular slots of M[] to the user.
> That however, must be carefully designed, so that in future one
> doesn't need to add ld #keys2. The part of not mucking with M[] fields
> is then part of the user's task actually, just the same way as a
> user shouldn't store something to A resp. X while an ld resp. ldx
> knowingly would overwrite that value stored previously. I don't
> think it would be any different than that.
>
>
>> *Now, if your comment is ebpf-only, that could be ok. That would mean:
>>
>> - a. the user writes "ld #thoff"
>> - b. the BPF->eBPF compiler generates one common BPF_CALL
>> (__skb_get_flow_dissect) plus a "ebpf_ldx stack[right_offset]". This
>> would not include payload_offset (which needs to run its own function
>> to get the poff from flow_keys).
>
>
> Since we're not using eBPF in user space right now, the comment is not
> about eBPF. I think if you would use eBPF from user space, you don't
> need to add such extensions anymore, but could just write yourself a
> minimal flow dissector in 'restricted' C to solve that problem.

Exactly.
I think exposing eBPF to user space is not a matter of 'if' but 'when'.

I'm not sure how pressing it is now to add another extension to classic,
when the goal of this patch set fits into eBPF model just fine.
yes, eBPF verifier is not in tree yet and it will obviously take longer to
review than Chema's or Daniel's set.

When eBPF is exposed to user space the inner header access can
be done in two ways without changing eBPF instruction set or eBPF
verifier.

eBPF approach #1:
-- code packet parser in restricted C
Pros:
skb_flow_dissect() stays hidden in kernel.
No additional uapi headache which exist if we start reusing
in-kernel skb_flow_dissect() either via classic or eBPF.
Such eBPF program will be just as fast as in-kernel skb_flow_dissect()
(I provided performance numbers before)
Cons:
in-kernel skb_flow_dissect() is not reused, but mostly copy-pasted to userspace.

eBPF approach #2:
-- reuse in-kernel skb_flow_dissect() via bpf_call insn from eBPF program
Pros:
eBPF program becomes much shorter and can be done in asm like:
bpf_mov r2, fp
bpf_sub r2, 64
bpf_call bpf_skb_flow_dissect  // call in-kernel helper function from
eBPF program
bpf_ldx r1, [fp - 64]  // access flow_keys data
bpf_ldx r2, [fp - 60]

Cons:
bpf_skb_flow_dissect() (wrapper on top of skb_flow_dissect()) becomes
uapi visible helper function

imo both eBPF approaches are cleaner than extending classic.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-02 16:48         ` Alexei Starovoitov
@ 2014-06-03  8:33           ` Daniel Borkmann
  2014-06-03 20:15             ` Alexei Starovoitov
  0 siblings, 1 reply; 47+ messages in thread
From: Daniel Borkmann @ 2014-06-03  8:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, Chema Gonzalez,
	David Miller, Eric Dumazet, Network Development, LKML

On 06/02/2014 06:48 PM, Alexei Starovoitov wrote:
> extending cc-list, since I think this thread is related to bpf split thread.
>
> On Mon, Jun 2, 2014 at 5:36 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> On 05/30/2014 07:12 PM, Chema Gonzalez wrote:
>>>
>>> On Thu, May 29, 2014 at 4:54 PM, Daniel Borkmann <dborkman@redhat.com>
>>> wrote:
>>>>
>>>> I actually liked [1] most ... ;)
>>
>> ...
>>
>>>> one extension e.g. SKF_AD_DISSECT where you call the external
>>>> skb_flow_dissect()
>>>> helper or similar on?
>>>>
>>>> I could imagine that either these offsets are squashed into the return or
>>>> stored if you really need them from the struct flow_keys into M[] itself.
>>>> So
>>>> you would end up with one e.g. 'ld #keys' call that e.g. fills
>>>> out/overwrites
>>>> your M[] with the dissected metadata. Then, you'd only have a reason to
>>>> call
>>>> that once and do further processing based on these information. Whether
>>>> you
>>>> also need poff could e.g. be determined if the user does e.g. 'ldx #1' or
>>>> so
>>>> to indicate to the function it eventually calls, that it would further do
>>>> the dissect and also give you poff into fixed defined M[] slots back.
>>>
>>>
>>> IIUC, you're suggesting to have the classic BPF user writes "ld #keys"
>>> to load flow_keys in the stack and then *explicitly* adds "ld
>>> mem[<offset>]" to access the field she's interested on. Note that if
>>> the "ld mem[]" is implicit, then the user must use "ld #thoff" to let
>>> the compiler know she wants "ld #keys" and then "ld
>>> mem[<thoff_offset>]" (and not "ld mem[<other_offset>]"), which is
>>> equivalent to what we're doing right now*.
>>
>>
>> I think there are many ways to design that, but for something possibly
>> generic, what I mean is something like this:
>>
>> We would a-priori know per API documentation what slots in M[] are
>> filled with which information, just for example:
>>
>> M[0] <-- nhoff  [network header off]
>> M[1] <-- nproto [network header proto]
>> M[2] <-- thoff  [transport header off]
>> M[3] <-- tproto [transport header proto]
>> M[4] <-- poff   [payload off]
>>
>> Now a user could load the following pseudo bpf_asm style program:
>>
>> ld #5     <-- indicates to fill the first 5 slots of M[], so M[0] to M[4]
>> ld #keys  <-- triggers the extension to fill the M[] slots
>> ld M[0]   <-- loads nhoff from M[0] into accu
>> <do sth with it>
>> ld M[3]   <-- loads tproto into accu, etc
>> …
>
> imo there are pros and cons in Daniel's and Chema's proposals
> for classic BPF extensions.
> I like Chema's a bit more, since his proposal doesn't require to
> change classic BPF verifier and that's always a delicate area
> (seccomp also allows to use M[] slots).

What bothers me is that you have to *special case* this extension
all over the BPF converter and stack, even you need to introduce a
prologue just to walk the whole filter program and to check if the
extension is present; next to that instead of one extension you
need to add a couple of them to uapi. I understand Chema's proposal
or his need to easily access these data but that's why I proposed
the above if he wants to go for that. Btw, seccomp doesn't allow
for loading BPF extensions, you said so yourself Alexei.

> Both still look like a stop gap until next classic extension
> appears on horizon.
>
> I think explicit eBPF program solves this particular task cleaner
> and it doesn't require changing eBPF.
>
>> A program like:
>>
>> ld #2
>> ld #keys
>> ...
>>
>> Would then on the other hand only fill the first two slots of M[] as
>> the user does not need all information from the kernel's flow dissector
>> and thus also does not fully need to dissect the skb.
>>
>> This also permits in future to add new fields by enabling them with
>> ld #6 etc, for example.
>>
>> I think this would fit much more into the design of BPF (although I
>> don't strictly like referring to the kernel's flow dissector and thus
>> bypassing BPF's actual purpose; but I understand that it can get quite
>> complicated with tunnels, etc).
>>
>> But this idea would allow you to only add 1 new extension and to have
>> it return dynamically a part or all information you would need in your
>> program, and thus solves the issue of calling the skb flow dissector
>> multiple times just because we have ld #thoff, ld #nhoff, etc, we can
>> avoid making such a stack_layout + filter_prologue hack and thus design
>> it more cleanly into the BPF architecture.
>>
>>
>>> The only advantage I can see is that we're only adding one new
>>> ancillary load, which is more elegant. I can see some issues with this
>>> approach:
>>>
>>> - usability. The user has to actually know what's the right offset of
>>> the thoff, which is cumbersome and may change with kernels. We'd be
>>> effectively exporting the flow_keys struct layout to the users.
>>
>>
>> See above.
>>
>>
>>> - have to take care that the classic BPF filter does not muck with
>>> mem[] between the "ld #keys" and all of the the "ld* mem[]" that
>>> access to it. Note that we're currently storing the flow_keys struct
>>> in the stack outside of the mem[] words, so this is not a problem
>>> here. (It is only accessible using ebpf insns.)
>>
>>
>> Since it is part of the API/ABI, and a new instruction, it is then
>> known that ld #keys would fill particular slots of M[] to the user.
>> That however, must be carefully designed, so that in future one
>> doesn't need to add ld #keys2. The part of not mucking with M[] fields
>> is then part of the user's task actually, just the same way as a
>> user shouldn't store something to A resp. X while an ld resp. ldx
>> knowingly would overwrite that value stored previously. I don't
>> think it would be any different than that.
>>
>>
>>> *Now, if your comment is ebpf-only, that could be ok. That would mean:
>>>
>>> - a. the user writes "ld #thoff"
>>> - b. the BPF->eBPF compiler generates one common BPF_CALL
>>> (__skb_get_flow_dissect) plus a "ebpf_ldx stack[right_offset]". This
>>> would not include payload_offset (which needs to run its own function
>>> to get the poff from flow_keys).
>>
>>
>> Since we're not using eBPF in user space right now, the comment is not
>> about eBPF. I think if you would use eBPF from user space, you don't
>> need to add such extensions anymore, but could just write yourself a
>> minimal flow dissector in 'restricted' C to solve that problem.
>
> Exactly.
> I think exposing eBPF to user space is not a matter of 'if' but 'when'.
>
> I'm not sure how pressing it is now to add another extension to classic,
> when the goal of this patch set fits into eBPF model just fine.
> yes, eBPF verifier is not in tree yet and it will obviously take longer to
> review than Chema's or Daniel's set.
>
> When eBPF is exposed to user space the inner header access can
> be done in two ways without changing eBPF instruction set or eBPF
> verifier.
>
> eBPF approach #1:
> -- code packet parser in restricted C
> Pros:
> skb_flow_dissect() stays hidden in kernel.
> No additional uapi headache which exist if we start reusing
> in-kernel skb_flow_dissect() either via classic or eBPF.
> Such eBPF program will be just as fast as in-kernel skb_flow_dissect()
> (I provided performance numbers before)
> Cons:
> in-kernel skb_flow_dissect() is not reused, but mostly copy-pasted to userspace.
>
> eBPF approach #2:
> -- reuse in-kernel skb_flow_dissect() via bpf_call insn from eBPF program
> Pros:
> eBPF program becomes much shorter and can be done in asm like:
> bpf_mov r2, fp
> bpf_sub r2, 64
> bpf_call bpf_skb_flow_dissect  // call in-kernel helper function from
> eBPF program
> bpf_ldx r1, [fp - 64]  // access flow_keys data
> bpf_ldx r2, [fp - 60]
>
> Cons:
> bpf_skb_flow_dissect() (wrapper on top of skb_flow_dissect()) becomes
> uapi visible helper function
>
> imo both eBPF approaches are cleaner than extending classic.
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-03  8:33           ` Daniel Borkmann
@ 2014-06-03 20:15             ` Alexei Starovoitov
  2014-06-03 21:12               ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Alexei Starovoitov @ 2014-06-03 20:15 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, Chema Gonzalez,
	David Miller, Eric Dumazet, Network Development, LKML

On Tue, Jun 3, 2014 at 1:33 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 06/02/2014 06:48 PM, Alexei Starovoitov wrote:
>>
>> imo there are pros and cons in Daniel's and Chema's proposals
>> for classic BPF extensions.
>> I like Chema's a bit more, since his proposal doesn't require to
>> change classic BPF verifier and that's always a delicate area
>> (seccomp also allows to use M[] slots).
>
>
> What bothers me is that you have to *special case* this extension
> all over the BPF converter and stack, even you need to introduce a
> prologue just to walk the whole filter program and to check if the
> extension is present; next to that instead of one extension you
> need to add a couple of them to uapi. I understand Chema's proposal

Agree. The walking part and multiple anc loads are not clean, but in your
approach you'd need to hack sk_chk_filter() to recognize very specific
anc load and do even fancier things in check_load_and_stores()
to make sure that 'ld #5, ld #keys' is safe from M[] usage point of view.

> or his need to easily access these data but that's why I proposed
> the above if he wants to go for that. Btw, seccomp doesn't allow
> for loading BPF extensions, you said so yourself Alexei.

of course, but seccomp does use ld/st M[] which will overlap
with your socket extension and since check_load_and_stores()
will be hacked, seccomp verification needs to be considered as well.
From user api point of view, your approach is cleaner than Chema's,
but from implementation Chema's is much safer and smaller.

Anyway as I said before I'm not excited about either.
I don't think we should be adding classic BPF extensions any more.
The long term headache of supporting classic BPF extensions
outweighs the short term benefits.

Not having to constantly tweak kernel for such cases was the key
goal of eBPF. My two eBPF approaches to solve Chema's need
are both cleaner, since nothing special is being done in eBPF
core to support this new need. Both instruction set and verifier
stay generic and cover tracing and this new socket use at once.
I do realize that I'm talking here about future eBPF verifier that
is not in tree yet and eBPF is not exposed to user space, but
I think we should consider longer term perspective.
If I'm forced to pick between yours or Chema's classic extensions,
I would pick Chema's because it's lesser evil :)

>> I think exposing eBPF to user space is not a matter of 'if' but 'when'.
>>
>> I'm not sure how pressing it is now to add another extension to classic,
>> when the goal of this patch set fits into eBPF model just fine.
>> yes, eBPF verifier is not in tree yet and it will obviously take longer to
>> review than Chema's or Daniel's set.
>>
>> When eBPF is exposed to user space the inner header access can
>> be done in two ways without changing eBPF instruction set or eBPF
>> verifier.
>>
>> eBPF approach #1:
>> -- code packet parser in restricted C
>> Pros:
>> skb_flow_dissect() stays hidden in kernel.
>> No additional uapi headache which exist if we start reusing
>> in-kernel skb_flow_dissect() either via classic or eBPF.
>> Such eBPF program will be just as fast as in-kernel skb_flow_dissect()
>> (I provided performance numbers before)
>> Cons:
>> in-kernel skb_flow_dissect() is not reused, but mostly copy-pasted to
>> userspace.
>>
>> eBPF approach #2:
>> -- reuse in-kernel skb_flow_dissect() via bpf_call insn from eBPF program
>> Pros:
>> eBPF program becomes much shorter and can be done in asm like:
>> bpf_mov r2, fp
>> bpf_sub r2, 64
>> bpf_call bpf_skb_flow_dissect  // call in-kernel helper function from
>> eBPF program
>> bpf_ldx r1, [fp - 64]  // access flow_keys data
>> bpf_ldx r2, [fp - 60]
>>
>> Cons:
>> bpf_skb_flow_dissect() (wrapper on top of skb_flow_dissect()) becomes
>> uapi visible helper function
>>
>> imo both eBPF approaches are cleaner than extending classic.
>>
>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-03 20:15             ` Alexei Starovoitov
@ 2014-06-03 21:12               ` Chema Gonzalez
  2014-06-04  8:51                 ` Daniel Borkmann
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-06-03 21:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, David Miller,
	Eric Dumazet, Network Development, LKML

On Tue, Jun 3, 2014 at 1:15 PM, Alexei Starovoitov <ast@plumgrid.com> wrote:
> On Tue, Jun 3, 2014 at 1:33 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> On 06/02/2014 06:48 PM, Alexei Starovoitov wrote:
>>>
>>> imo there are pros and cons in Daniel's and Chema's proposals
>>> for classic BPF extensions.
>>> I like Chema's a bit more, since his proposal doesn't require to
>>> change classic BPF verifier and that's always a delicate area
>>> (seccomp also allows to use M[] slots).
>>
>>
>> What bothers me is that you have to *special case* this extension
>> all over the BPF converter and stack, even you need to introduce a
>> prologue just to walk the whole filter program and to check if the
>> extension is present; next to that instead of one extension you
>> need to add a couple of them to uapi. I understand Chema's proposal
>
> Agree. The walking part and multiple anc loads are not clean, but in your
> approach you'd need to hack sk_chk_filter() to recognize very specific
> anc load and do even fancier things in check_load_and_stores()
> to make sure that 'ld #5, ld #keys' is safe from M[] usage point of view.
I think you may be confusing the two parts of the patch, namely
whether we implement the code as 1 anc load with N parameters or as N
anc loads, and the existence of a prologue.

- Adding just 1 anc load ("ld #keys" in your pseudo-code) requires
fewer UAPI changes, but effectively exports the flow_keys struct to the
user (which means we cannot change it in the future). The fact that
you're proposing your own version of the flow_keys ( {nhoff, nproto,
thoff, tproto, poff} ) does not make it less of a problem: You're just
exporting an alternative (albeit IMO way better one, as all the
dissected info can be obtained from your 5-tuple) flow_keys. From a
usability PoV, the user may have to either do "ld #0; ld #keys"
(knowing that "#0" refers to "nhoff"), or "ld #nhoff". I definitely
prefer the second, but I can easily rewrite the patch to use the
first.

- Now, the existence of a prologue is a must if you want to ensure the
filter only calls the actual flow dissector once (this is the main
goal of the patch, actually). Having 2 insns separated by N insns that
access data obtained from the same flow dissector call means you have
to both (a) ensure the first caller -- whether it's the first or the
second insn -- does perform the BPF_CALL, and (b) ensure that none of
the N insns in the middle mucks with the result of the first call (my
solution deals with (b) by using the stack outside of M[], while yours
requires verifying the code. I find mine easier).

In order to achieve (a), you need the prologue code: Because the code
can have jumps, you can never know whether the "ld #keys" was actually
called or not. What my solution's prologue does is write a 0 to a
flow_inited u32, which records that the flow dissector hasn't been
called. Now, every call for any of the sub-fields checks this
flow_inited field, and runs the flow dissector iff it's zero (hasn't
been called yet), in which case it also sets flow_inited to 1.
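
In rough pseudo-C, each per-field helper under this scheme then does
something like the following (a simplified sketch, not the literal patch
code; the helper name is illustrative, and the exact stack layout and
error handling are glossed over):

static u64 __skb_get_thoff(u64 ctx, u64 a, u64 x, u64 fp, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct classic_bpf_stack_layout *layout =
		(struct classic_bpf_stack_layout *)(unsigned long)
		(fp - sizeof(struct classic_bpf_stack_layout));

	if (!layout->flow_inited) {
		/* dissect once; on failure keep flow_inited at 0 so a
		 * later load can retry (the real patch may differ here)
		 */
		if (!skb_flow_dissect(skb, &layout->flow))
			return 0;
		layout->flow_inited = 1;
	}
	return layout->flow.thoff;
}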

Your approach needs it too. Citing from your pseudo-code:

> ld #5     <-- indicates to fill the first 5 slots of M[], so M[0] to M[4]
> ld #keys  <-- triggers the extension to fill the M[] slots
> ld M[0]   <-- loads nhoff from M[0] into accu

How does the "ld M[0]" know that the actual flow dissector has already
been called? What if the insn just before the "ld #5" was "jmp +2" ?
In that case, the "ld #keys" would have never been called.

> ld M[0]   <-- loads nhoff from M[0] into accu
> <do sth with it>
> ld M[3]   <-- loads tproto into accu, etc

>> or his need to easily access these data but that's why I proposed
>> the above if he wants to go for that. Btw, seccomp doesn't allow
>> for loading BPF extensions, you said so yourself Alexei.
>
> of course, but seccomp does use ld/st M[] which will overlap
> with your socket extension and since check_load_and_stores()
> will be hacked, seccomp verification needs to be considered as well.
> From user api point of view, your approach is cleaner than Chema's,
> but from implementation Chema's is much safer and smaller.
>
> Anyway as I said before I'm not excited about either.
> I don't think we should be adding classic BPF extensions any more.
> The long term headache of supporting classic BPF extensions
> outweighs the short term benefits.
I see a couple of issues with (effectively) freezing classic BPF
development while waiting for direct eBPF access to happen. The first
one is that the kernel has to accept it. I can see many questions
about this, especially security and usability (I'll send an email
about the "split BPF out of core later"). Now, the main issue is
whether/when the tools will support it. IMO, this is useful iff I can
quickly write/reuse filters and run tcpdump filters based on them. I'm
trying to get upstream libpcap to accept support for raw (classic) BPF
filters, and it's taking a long time. I can imagine how they may be
less receptive about supporting a Linux-only eBPF mechanism. Tools do
matter.

Even if eBPF happens, it's not that the extensions are so hard to port
to eBPF: It's one BPF_CALL per extension. And they have a
straightforward porting path to the C-like code.

-Chema


> Not having to constantly tweak kernel for such cases was the key
> goal of eBPF. My two eBPF approaches to solve Chema's need
> are both cleaner, since nothing special is being done in eBPF
> core to support this new need. Both instruction set and verifier
> stay generic and cover tracing and this new socket use at once.
> I do realize that I'm talking here about future eBPF verifier that
> is not in tree yet and eBPF is not exposed to user space, but
> I think we should consider longer term perspective.
> If I'm forced to pick between yours or Chema's classic extensions,
> I would pick Chema's because it's lesser evil :)
>
>>> I think exposing eBPF to user space is not a matter of 'if' but 'when'.
>>>
>>> I'm not sure how pressing it is now to add another extension to classic,
>>> when the goal of this patch set fits into eBPF model just fine.
>>> yes, eBPF verifier is not in tree yet and it will obviously take longer to
>>> review than Chema's or Daniel's set.
>>>
>>> When eBPF is exposed to user space the inner header access can
>>> be done in two ways without changing eBPF instruction set or eBPF
>>> verifier.
>>>
>>> eBPF approach #1:
>>> -- code packet parser in restricted C
>>> Pros:
>>> skb_flow_dissect() stays hidden in kernel.
>>> No additional uapi headache which exist if we start reusing
>>> in-kernel skb_flow_dissect() either via classic or eBPF.
>>> Such eBPF program will be just as fast as in-kernel skb_flow_dissect()
>>> (I provided performance numbers before)
>>> Cons:
>>> in-kernel skb_flow_dissect() is not reused, but mostly copy-pasted to
>>> userspace.
>>>
>>> eBPF approach #2:
>>> -- reuse in-kernel skb_flow_dissect() via bpf_call insn from eBPF program
>>> Pros:
>>> eBPF program becomes much shorter and can be done in asm like:
>>> bpf_mov r2, fp
>>> bpf_sub r2, 64
>>> bpf_call bpf_skb_flow_dissect  // call in-kernel helper function from
>>> eBPF program
>>> bpf_ldx r1, [fp - 64]  // access flow_keys data
>>> bpf_ldx r2, [fp - 60]
>>>
>>> Cons:
>>> bpf_skb_flow_dissect() (wrapper on top of skb_flow_dissect()) becomes
>>> uapi visible helper function
>>>
>>> imo both eBPF approaches are cleaner than extending classic.
>>>
>>

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-03 21:12               ` Chema Gonzalez
@ 2014-06-04  8:51                 ` Daniel Borkmann
  2014-06-05  6:55                   ` David Miller
  2014-06-20 21:56                   ` Chema Gonzalez
  0 siblings, 2 replies; 47+ messages in thread
From: Daniel Borkmann @ 2014-06-04  8:51 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: Alexei Starovoitov, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, David Miller,
	Eric Dumazet, Network Development, LKML

On 06/03/2014 11:12 PM, Chema Gonzalez wrote:
...
> Your approach needs it too. Citing from your pseudo-code:
>
>> ld #5     <-- indicates to fill the first 5 slots of M[], so M[0] to M[4]
>> ld #keys  <-- triggers the extension to fill the M[] slots
>> ld M[0]   <-- loads nhoff from M[0] into accu
>
> How does the "ld M[0]" know that the actual flow dissector has already
> been called? What if the insn just before the "ld #5" was "jmp +2" ?
> In that case, the "ld #keys" would have never been called.

But that case would be no different from doing something like ...

[...]
   jmp foo
   ldi #42
   st M[0]
foo:
   ld M[0]
[...]

... and would then not pass the checker in check_load_and_stores(),
which, as others have already stated, would need to be extended,
of course. It's one possible approach.

>> Anyway as I said before I'm not excited about either.
>> I don't think we should be adding classic BPF extensions any more.
>> The long term headache of supporting classic BPF extensions
>> outweighs the short term benefits.
...
> I see a couple of issues with (effectively) freezing classic BPF
> development while waiting for direct eBPF access to happen. The first
> one is that the kernel has to accept it. I can see many questions
> about this, especially security and usability (I'll send an email
> about the "split BPF out of core later"). Now, the main issue is
> whether/when the tools will support it. IMO, this is useful iff I can
> quickly write/reuse filters and run tcpdump filters based on them. I'm
> trying to get upstream libpcap to accept support for raw (classic) BPF
> filters, and it's taking a long time. I can imagine how they may be
> less receptive about supporting a Linux-only eBPF mechanism. Tools do
> matter.

Grepping through libpcap code, which tries to be platform independent,
it seems after all the years, the only thing where you can see support
for in their code is SKF_AD_PKTTYPE and SKF_AD_PROTOCOL. Perhaps they
just don't care, perhaps they do, who knows, but it looks to me a bit
that they are reluctant to these improvements, maybe for one reason
that other OSes don't support it. That was also one of the reasons that
led me to start writing bpf_asm (tools/net/) for having a small DSL
for more easily trying out BPF code while having _full_ control over it.

Maybe someone should start a binary-compatible Linux-only version of
libpcap, where tcpdump will transparently make use of these low level
improvements eventually. </rant> ;)

Thanks,

Daniel

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-04  8:51                 ` Daniel Borkmann
@ 2014-06-05  6:55                   ` David Miller
  2014-06-20 21:56                   ` Chema Gonzalez
  1 sibling, 0 replies; 47+ messages in thread
From: David Miller @ 2014-06-05  6:55 UTC (permalink / raw)
  To: dborkman
  Cc: chema, ast, mingo, rostedt, a.p.zijlstra, acme, jolsa, tglx, hpa,
	akpm, keescook, edumazet, netdev, linux-kernel


FWIW, I'm not applying this series.

There is no real consensus and a lot more discussion and time is
needed before anything happens here.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-04  8:51                 ` Daniel Borkmann
  2014-06-05  6:55                   ` David Miller
@ 2014-06-20 21:56                   ` Chema Gonzalez
  2014-06-24  8:14                     ` Daniel Borkmann
  1 sibling, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-06-20 21:56 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, David Miller,
	Eric Dumazet, Network Development, LKML

I'll try to revive the discussion for this patch, in case I can
convince you about its implementation. I rebased it to the latest
HEAD, and I'm ready to re-submit.

On Wed, Jun 4, 2014 at 1:51 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 06/03/2014 11:12 PM, Chema Gonzalez wrote:
> ...
>
>> Your approach needs it too. Citing from your pseudo-code:
>>
>>> ld #5     <-- indicates to fill the first 5 slots of M[], so M[0] to M[4]
>>> ld #keys  <-- triggers the extension to fill the M[] slots
>>> ld M[0]   <-- loads nhoff from M[0] into accu
>>
>>
>> How does the "ld M[0]" know that the actual flow dissector has already
>> been called? What if the insn just before the "ld #5" was "jmp +2" ?
>> In that case, the "ld #keys" would have never been called.
>
>
> But that case would be no different from doing something like ...
>
> [...]
>   jmp foo
>   ldi #42
>   st M[0]
> foo:
>   ld M[0]
> [...]
>
> ... and would then not pass the checker in check_load_and_stores(),
> which, as others have already stated, would need to be extended,
> of course. It's one possible approach.

As Alexei/you have noted, this is moving the complexity to
check_load_and_stores(). It's more complicated than the current
ensure-there's-a-store-preceding-each-load-for-each-M[]-position
check. In particular, your approach requires 2 insns before a memory
position can be considered filled, namely the "ld #5" and the "ld
#keys". That means adding intra-instruction state to the FSM in
check_load_and_stores().

A second issue is that you need to ensure that M[0:5] does not get
polluted between the moment you call "ld #5; ld #keys" and the moment
the code uses the flow-dissector values. This is more complex than
just ensuring the code doesn't access uninit'ed M[].

There are also some concerns about effectively adding a prologue. We
already have one:

$ cat net/core/filter.c
...
int sk_convert_filter(struct sock_filter *prog, int len,
          struct sock_filter_int *new_prog, int *new_len)
{
  ...
  if (new_insn)
    *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
  new_insn++;

  for (i = 0; i < len; fp++, i++) {
    ...

The current prologue is embedded in sk_convert_filter(). The patch
just moves it to its own function, and (potentially) adds another
instruction (which zeroes flow_inited iff there's any call to the flow
dissector).
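
As a sketch (not the literal patch code; the helper name, the BPF_ST_MEM
usage and the flow_inited member are illustrative here), the split-out
prologue amounts to roughly:

static struct sock_filter_int *
convert_filter_prologue(struct sock_filter_int *insn, bool needs_flow)
{
	/* what sk_convert_filter() already emits today: ctx = arg1 */
	*insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);

	/* plus, iff some insn uses the flow dissector: flow_inited = 0 */
	if (needs_flow)
		*insn++ = BPF_ST_MEM(BPF_W, BPF_REG_FP,
				     -(int)sizeof(struct classic_bpf_stack_layout) +
				     offsetof(struct classic_bpf_stack_layout,
					      flow_inited),
				     0);
	return insn;
}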

>>> Anyway as I said before I'm not excited about either.
>>> I don't think we should be adding classic BPF extensions any more.
>>> The long term headache of supporting classic BPF extensions
>>> outweighs the short term benefits.
>> I see a couple of issues with (effectively) freezing classic BPF
>> development while waiting for direct eBPF access to happen. The first
>> one is that the kernel has to accept it. I can see many questions
>> about this, especially security and usability (I'll send an email
>> about the "split BPF out of core later"). Now, the main issue is
>> whether/when the tools will support it. IMO, this is useful iff I can
>> quickly write/reuse filters and run tcpdump filters based on them. I'm
>> trying to get upstream libpcap to accept support for raw (classic) BPF
>> filters, and it's taking a long time. I can imagine how they may be
>> less receptive about supporting a Linux-only eBPF mechanism. Tools do
>> matter.
This is a high-level decision, more than a technical one. Do we want
to freeze classic BPF development in linux, even before we have a
complete eBPF replacement, and zero eBPF tool (libpcap) support?

> Grepping through libpcap code, which tries to be platform independent,
> it seems after all the years, the only thing where you can see support
> for in their code is SKF_AD_PKTTYPE and SKF_AD_PROTOCOL. Perhaps they
Actually they recently added MOD/XOR support. Woo-hoo!

> just don't care, perhaps they do, who knows, but it looks to me a bit
> that they are reluctant to these improvements, maybe for one reason
> that other OSes don't support it.
From the comments in the MOD/XOR patch, the latter seems to be the issue.

> That was also one of the reasons that
> led me to start writing bpf_asm (net/tools/) for having a small DSL
> for more easily trying out BPF code while having _full_ control over it.
>
> Maybe someone should start a binary-compatible Linux-only version of
> libpcap, where tcpdump will transparently make use of these low level
> improvements eventually. </rant> ;)
There's too much code dependent on libpcap to make a replacement possible.

Thanks,
-Chema

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-20 21:56                   ` Chema Gonzalez
@ 2014-06-24  8:14                     ` Daniel Borkmann
  2014-06-25 22:00                       ` Chema Gonzalez
  0 siblings, 1 reply; 47+ messages in thread
From: Daniel Borkmann @ 2014-06-24  8:14 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: Alexei Starovoitov, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, David Miller,
	Eric Dumazet, Network Development, LKML

On 06/20/2014 11:56 PM, Chema Gonzalez wrote:
...
>>>> Anyway as I said before I'm not excited about either.
>>>> I don't think we should be adding classic BPF extensions any more.
>>>> The long term headache of supporting classic BPF extensions
>>>> outweighs the short term benefits.
 >>>
>>> I see a couple of issues with (effectively) freezing classic BPF
>>> development while waiting for direct eBPF access to happen. The first
>>> one is that the kernel has to accept it. I can see many questions
>>> about this, especially security and usability (I'll send an email
>>> about the "split BPF out of core later"). Now, the main issue is
>>> whether/when the tools will support it. IMO, this is useful iff I can
>>> quickly write/reuse filters and run tcpdump filters based on them. I'm
>>> trying to get upstream libpcap to accept support for raw (classic) BPF
>>> filters, and it's taking a long time. I can imagine how they may be
>>> less receptive about supporting a Linux-only eBPF mechanism. Tools do
>>> matter.
 >
> This is a high-level decision, more than a technical one. Do we want
> to freeze classic BPF development in linux, even before we have a
> complete eBPF replacement, and zero eBPF tool (libpcap) support?

In my opinion, I don't think we strictly have to hard-freeze it. The
only concern I see is that conceptually hooking into the flow_dissector
to read out all keys for further processing on top of them 1) sort
of breaks/bypasses the concept of BPF (as it's actually the task of
BPF itself for doing this), 2) effectively freezes any changes to the
flow_dissector as BPF applications making use of it now depend on the
provided offsets for doing further processing on top of them, 3) it
can already be resolved by (re-)writing the kernel's flow dissector
in C-like syntax in user space iff eBPF can be loaded from there with
similar performance. So shouldn't we rather work towards that as a
more generic approach/goal in the mid term and w/o having to maintain
a very short term intermediate solution that we need to special case
along the code and have to carry around forever ...

>> Grepping through libpcap code, which tries to be platform independent,
>> it seems after all the years, the only thing where you can see support
>> for in their code is SKF_AD_PKTTYPE and SKF_AD_PROTOCOL. Perhaps they
 >
> Actually they recently added MOD/XOR support. Woo-hoo!

Great to hear, still quite some things missing, unfortunately. :/

>> just don't care, perhaps they do, who knows, but it looks to me a bit
>> that they are reluctant to these improvements, maybe for one reason
>> that other OSes don't support it.
 >
>  From the comments in the MOD/XOR patch, the latter seem to be the issue.

Yep, that's the pain you need to live with when trying to be multi
OS capable. I assume in its very origin, the [libpcap] compiler was
probably not designed for handling such differences in various
operating systems (likely even ran in user space from libpcap directly).

>> That was also one of the reasons that
>> led me to start writing bpf_asm (net/tools/) for having a small DSL
>> for more easily trying out BPF code while having _full_ control over it.
>>
>> Maybe someone should start a binary-compatible Linux-only version of
>> libpcap, where tcpdump will transparently make use of these low level
>> improvements eventually. </rant> ;)
 >
> There's too much code dependent on libpcap to make a replacement possible.

Well, I wrote binary-compatible, so applications on top of it won't
care much if it could be used as a drop-in replacement. That would perhaps
also allow for fanout and other features to be used ...

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-24  8:14                     ` Daniel Borkmann
@ 2014-06-25 22:00                       ` Chema Gonzalez
  2014-06-27 10:19                         ` Daniel Borkmann
  0 siblings, 1 reply; 47+ messages in thread
From: Chema Gonzalez @ 2014-06-25 22:00 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, David Miller,
	Eric Dumazet, Network Development, LKML

On Tue, Jun 24, 2014 at 1:14 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> This is a high-level decision, more than a technical one. Do we want
>> to freeze classic BPF development in Linux, even before we have a
>> complete eBPF replacement, and zero eBPF tool (libpcap) support?
>
>
> In my opinion, I don't think we strictly have to hard-freeze it. The
> only concern I see is that conceptually hooking into the flow_dissector
> to read out all keys for further processing on top of them 1) sort
> of breaks/bypasses the concept of BPF (as it's actually the task of
> BPF itself to do this),
I don't think we want to do flow dissection using BPF insns. It's not
easy to write BPF insns, and we already have kernel code that does
that. IMO that's what eBPF calls/BPF ancillary loads are for (e.g.
vlan access).
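
To illustrate what I mean by ancillary loads, here is a minimal sketch
of a classic BPF filter that reads the VLAN-tag-present pseudo-field
via SKF_AD_VLAN_TAG_PRESENT and drops tagged frames (instruction array
only; it is attached like any other filter):

#include <linux/filter.h>

static struct sock_filter drop_vlan_tagged[] = {
	/* A = "VLAN tag present" ancillary field */
	BPF_STMT(BPF_LD  | BPF_W   | BPF_ABS,
		 SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT),
	/* A == 0 (untagged) -> accept, otherwise -> drop */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* drop   */
};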

> 2) effectively freezes any changes to the
> flow_dissector as BPF applications making use of it now depend on the
> provided offsets for doing further processing on top of them, 3) it
Remember that my approach does not have (user-visible) offsets. It
uses the eBPF stack to dump the output (struct flow_keys) of the flow
dissector (skb_flow_dissect()). The only dependency we're adding is
that, once we provide a BPF ancillary load to access e.g. thoff, we
have to keep providing it.
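
For reference, the flow dissector output being dumped on the stack is
quite small, roughly this in current kernels (approximate copy of
include/net/flow_keys.h; the exact layout may differ between versions):

struct flow_keys {
	/* (src, dst) must be grouped, same order as in the IP header */
	__be32 src;
	__be32 dst;
	union {
		__be32 ports;
		__be16 port16[2];
	};
	u16 thoff;	/* transport header offset */
	u8  ip_proto;	/* L4 protocol (IPPROTO_*)  */
};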

> can already be resolved by (re-)writing the kernel's flow dissector
> in C-like syntax in user space iff eBPF can be loaded from there with
> similar performance. So shouldn't we rather work towards that as a
> more generic approach/goal in the mid term and w/o having to maintain
> a very short-term intermediate solution that we need to special-case
> throughout the code and have to carry around forever ...
Once (if) we reach the point where we can do eBPF filters in "C-like
syntax," I'd agree with you that it would be nice to be able to reuse
the same function inside the kernel and as an eBPF library. The same
probably applies to other network functions. Now, I'm not sure what
the model for reuse is: Are we planning to re-write (maybe "re-write" is
too strong, as we will probably only need some minor changes) some of
the kernel functions into this "C--" language so that eBPF can use
them? Do other people agree with this vision?

There's still the problem of whether we want to obsolete classic BPF
in the kernel before the tools (libpcap mainly) accept eBPF. This can
take a long time.

Finally, what user-facing CLI interface do you have in mind? Right now,
tcpdump expressions are very handy: I know I can pass "ip[2:2] ==
1500" or "(tcp[13] & 0x03)" to any libpcap-based application. That
makes it easy to log into a machine and quickly run tcpdump to get the
packets I'm interested in. What would be the model for using C-- eBPF
filters in the same manner?
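
For reference, this is roughly the path such an expression takes today
with a stock libpcap: a minimal sketch using the standard
pcap_open_live()/pcap_compile()/pcap_setfilter() APIs ("eth0" is just a
placeholder device name):

#include <stdio.h>
#include <pcap/pcap.h>

int main(void)
{
	char errbuf[PCAP_ERRBUF_SIZE];
	struct bpf_program prog;
	pcap_t *p;

	p = pcap_open_live("eth0", 65535, 1, 1000, errbuf);
	if (!p) {
		fprintf(stderr, "pcap_open_live: %s\n", errbuf);
		return 1;
	}
	/* The familiar filter syntax is compiled to (classic) BPF ... */
	if (pcap_compile(p, &prog, "ip[2:2] == 1500", 1,
			 PCAP_NETMASK_UNKNOWN) < 0 ||
	    pcap_setfilter(p, &prog) < 0) {	/* ... and attached here */
		pcap_perror(p, "filter");
		return 1;
	}
	pcap_freecode(&prog);
	/* pcap_loop(p, ...) now only sees matching packets */
	pcap_close(p);
	return 0;
}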

Thanks again,
-Chema

* Re: [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF
  2014-06-25 22:00                       ` Chema Gonzalez
@ 2014-06-27 10:19                         ` Daniel Borkmann
  0 siblings, 0 replies; 47+ messages in thread
From: Daniel Borkmann @ 2014-06-27 10:19 UTC (permalink / raw)
  To: Chema Gonzalez
  Cc: Alexei Starovoitov, Ingo Molnar, Steven Rostedt, Peter Zijlstra,
	Arnaldo Carvalho de Melo, Jiri Olsa, Thomas Gleixner,
	H. Peter Anvin, Andrew Morton, Kees Cook, David Miller,
	Eric Dumazet, Network Development, LKML

On 06/26/2014 12:00 AM, Chema Gonzalez wrote:
...
> There's still the problem of whether we want to obsolete classic BPF
> in the kernel before the tools (libpcap mainly) accept eBPF. This can
> take a long time.
>
> Finally, what user-facing CLI interface do you have in mind? Right now,
> tcpdump expressions are very handy: I know I can pass "ip[2:2] ==
> 1500" or "(tcp[13] & 0x03)" to any libpcap-based application. That
> makes it easy to log into a machine and quickly run tcpdump to get the
> packets I'm interested in. What would be the model for using C-- eBPF
> filters in the same manner?

Yes, imho, it's a valid question to ask. I think there are a couple
of possibilities for libpcap/tcpdump from a user point of view (note,
I don't strictly think it's the _only_ main user though): 1) iff an
LLVM and/or GCC backend gets merged from the compiler side, one could
add a CLI interface to run the generated opcodes from a file for
advanced filters while perhaps classic BPF continues to be supported
via its high-level filter expressions; 2) there could be a Linux-only
compiler in libpcap that translates and makes use of full eBPF (though
significantly more effort to implement); 3) libpcap continues to use
classic BPF as it's currently doing.
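
As a point of reference for option 1): for classic BPF this already
half exists, since tcpdump can dump its compiled filter as a C array
fragment (-dd) or as raw decimal opcodes (-ddd) that another tool can
load verbatim. Roughly what "tcpdump -dd ip" emits (the exact "accept"
return value depends on the snap length in use):

#include <linux/filter.h>

/* Each entry is { code, jt, jf, k } of one struct sock_filter; the
 * array body is what -dd prints and can be fed to SO_ATTACH_FILTER. */
static struct sock_filter tcpdump_dd_ip[] = {
	{ 0x28, 0, 0, 0x0000000c },	/* ldh [12]   : EtherType */
	{ 0x15, 0, 1, 0x00000800 },	/* jeq #0x800 : IPv4?     */
	{ 0x06, 0, 0, 0x0000ffff },	/* ret #65535 : accept    */
	{ 0x06, 0, 0, 0x00000000 },	/* ret #0     : drop      */
};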

end of thread

Thread overview: 47+ messages
2014-04-30 18:29 [PATCH] net: filter: add insn for loading internal transport header offset Chema Gonzalez
2014-04-30 22:21 ` Alexei Starovoitov
2014-05-01 10:53 ` Daniel Borkmann
2014-05-01 10:55   ` Daniel Borkmann
2014-05-01 18:44     ` Chema Gonzalez
2014-05-01 18:44 ` [PATCH net-next v2] " Chema Gonzalez
2014-05-02 16:21   ` Ben Hutchings
2014-05-02 21:49   ` David Miller
2014-05-03  0:53     ` Chema Gonzalez
2014-05-03  2:52       ` David Miller
2014-05-05 18:42         ` Chema Gonzalez
2014-05-05 19:12           ` David Miller
2014-05-14 18:42             ` Chema Gonzalez
2014-05-14 18:51               ` Chema Gonzalez
2014-05-14 18:42 ` [PATCH v2 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
2014-05-14 18:42   ` [PATCH v2 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
2014-05-14 18:42   ` [PATCH v2 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
2014-05-14 18:51 ` [PATCH v3 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
2014-05-14 20:05   ` Alexei Starovoitov
2014-05-14 21:51     ` Chema Gonzalez
2014-05-14 22:44       ` Alexei Starovoitov
2014-05-16 18:41         ` Chema Gonzalez
2014-05-14 18:51 ` [PATCH v3 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
2014-05-14 18:51 ` [PATCH v3 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
2014-05-16 18:41 ` [PATCH v5 net-next 1/3] net: flow_dissector: avoid multiple calls in BPF Chema Gonzalez
2014-05-16 22:00   ` Alexei Starovoitov
2014-05-19 22:23     ` Chema Gonzalez
2014-05-20  9:58       ` Daniel Borkmann
2014-05-16 18:41 ` [PATCH v5 net-next 2/3] net: filter: add insn for loading internal transport header offset Chema Gonzalez
2014-05-16 18:41 ` [PATCH v5 net-next 3/3] net: filter: add insn for loading internal transport header proto Chema Gonzalez
2014-05-29 18:55 ` [PATCH v6 net-next 1/4] net: flow_dissector: avoid multiple calls in eBPF Chema Gonzalez
2014-05-29 23:54   ` Daniel Borkmann
2014-05-30 17:12     ` Chema Gonzalez
2014-06-02 12:36       ` Daniel Borkmann
2014-06-02 16:48         ` Alexei Starovoitov
2014-06-03  8:33           ` Daniel Borkmann
2014-06-03 20:15             ` Alexei Starovoitov
2014-06-03 21:12               ` Chema Gonzalez
2014-06-04  8:51                 ` Daniel Borkmann
2014-06-05  6:55                   ` David Miller
2014-06-20 21:56                   ` Chema Gonzalez
2014-06-24  8:14                     ` Daniel Borkmann
2014-06-25 22:00                       ` Chema Gonzalez
2014-06-27 10:19                         ` Daniel Borkmann
2014-05-29 18:56 ` [PATCH v6 net-next 2/4] net: filter: add insn for loading internal transport header offset Chema Gonzalez
2014-05-29 18:56 ` [PATCH v6 net-next 3/4] net: filter: add insn for loading internal transport header proto Chema Gonzalez
2014-05-29 18:56 ` [PATCH v6 net-next 4/4] net: filter: minor BPF cleanups Chema Gonzalez
