* [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
2019-09-08 22:02 ` Yonghong Song
2019-09-07 21:40 ` [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs Alan Maguire
` (5 subsequent siblings)
6 siblings, 1 reply; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
bpf_pcap() simplifies packet capture for skb and XDP
BPF programs by creating a BPF perf event containing information
relevant for packet capture (protocol, actual/captured packet
size, time of capture, etc) along with the packet payload itself.
All of this is stored in a "struct bpf_pcap_hdr".
This header information can then be retrieved from the perf
event map and used by packet capture frameworks such as libpcap
to carry out packet capture.
skb and XDP programs currently deal in Ethernet-based traffic
exclusively, so should specify BPF_PCAP_TYPE_ETH or
BPF_PCAP_TYPE_UNSET. The protocol parameter will be used
in a later commit.
Note that libpcap assumes times are relative to the epoch while
we record nanoseconds since boot; as a result any times need
to be normalized with respect to the boot time for libpcap
storage; sysinfo(2) can be used to retrieve boot time to normalize
values appropriately.
Example usage for a tc-bpf program:
struct bpf_map_def SEC("maps") pcap_map = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1024,
};
SEC("cap")
int cap(struct __sk_buff *skb)
{
bpf_pcap(skb, 1514, &pcap_map, BPF_PCAP_TYPE_ETH, 0);
return TC_ACT_OK;
}
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
include/linux/bpf.h | 20 +++++++++++++
include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++-
kernel/bpf/verifier.c | 4 ++-
net/core/filter.c | 67 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 164 insertions(+), 2 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d223..033c9cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1145,4 +1145,24 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
}
#endif /* CONFIG_INET */
+
+static inline int bpf_pcap_prepare(int protocol, u32 cap_len, u32 tot_len,
+ u64 flags, struct bpf_pcap_hdr *pcap)
+{
+ if (protocol < 0 || pcap == NULL)
+ return -EINVAL;
+
+ pcap->magic = BPF_PCAP_MAGIC;
+ pcap->protocol = protocol;
+ pcap->flags = flags;
+
+ if (cap_len == 0 || tot_len < cap_len)
+ cap_len = tot_len;
+ pcap->cap_len = cap_len;
+ pcap->tot_len = tot_len;
+ pcap->ktime_ns = ktime_get_mono_fast_ns();
+
+ return 0;
+}
+
#endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77c6be9..a27e58e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2750,6 +2750,39 @@ struct bpf_stack_build_id {
* **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
*
* **-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
+ * u64 flags)
+ * Description
+ * Write packet data from *data* into a special BPF perf event
+ * held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
+ * perf event has the same attributes as perf events generated
+ * by bpf_perf_event_output. For skb and xdp programs, *data*
+ * is the relevant context.
+ *
+ * Metadata for this event is a **struct bpf_pcap_hdr**; this
+ * contains the capture length, actual packet length and
+ * the starting protocol.
+ *
+ * The max number of bytes of context to store is specified via
+ * *size*.
+ *
+ * The flags value can be used to specify an id value of up
+ * to 48 bits; the id can be used to correlate captured packets
+ * with other trace data, since the passed-in flags value is
+ * stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *
+ * The *protocol* value specifies the protocol type of the start
+ * of the packet so that packet capture can carry out
+ * interpretation. See **pcap-linktype** (7) for details on
+ * the supported values.
+ *
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ * -ENOENT will be returned if the associated perf event
+ * map entry is empty, or the skb is zero-length.
+ * -EINVAL will be returned if the flags value is invalid.
+ *
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2862,7 +2895,8 @@ struct bpf_stack_build_id {
FN(sk_storage_get), \
FN(sk_storage_delete), \
FN(send_signal), \
- FN(tcp_gen_syncookie),
+ FN(tcp_gen_syncookie), \
+ FN(pcap),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -2941,6 +2975,9 @@ enum bpf_func_id {
/* BPF_FUNC_sk_storage_get flags */
#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0)
+/* BPF_FUNC_pcap flags */
+#define BPF_F_PCAP_ID_MASK 0xffffffffffff
+
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
@@ -3613,4 +3650,40 @@ struct bpf_sockopt {
__s32 retval;
};
+/* bpf_pcap_hdr contains information related to a particular packet capture
+ * flow. It specifies
+ *
+ * - a magic number BPF_PCAP_MAGIC which identifies the perf event as
+ * a pcap-related event.
+ * - a starting protocol is the protocol associated with the header
+ * - a flags value, copied from the flags value passed into bpf_pcap().
+ * IDs can be used to correlate packet capture data and other tracing data.
+ *
+ * bpf_pcap_hdr also contains the information relating to the to-be-captured
+ * packet, and closely corresponds to the struct pcap_pkthdr used by
+ * pcap_dump (3PCAP). The bpf_pcap helper sets ktime_ns (nanoseconds since
+ * boot) to the ktime_ns value; to get sensible pcap times this value should
+ * be converted to a struct timeval time since epoch in the struct pcap_pkthdr.
+ *
+ * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
+ * need both information about the particular packet and the protocol
+ * we are capturing.
+ */
+
+#define BPF_PCAP_MAGIC 0xb7fca7
+
+struct bpf_pcap_hdr {
+ __u32 magic;
+ int protocol;
+ __u64 flags;
+ __u64 ktime_ns;
+ __u32 tot_len;
+ __u32 cap_len;
+ __u8 data[0];
+};
+
+#define BPF_PCAP_TYPE_UNSET -1
+#define BPF_PCAP_TYPE_ETH 1
+#define BPF_PCAP_TYPE_IP 12
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3fb5075..a33ed24 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3440,7 +3440,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
if (func_id != BPF_FUNC_perf_event_read &&
func_id != BPF_FUNC_perf_event_output &&
- func_id != BPF_FUNC_perf_event_read_value)
+ func_id != BPF_FUNC_perf_event_read_value &&
+ func_id != BPF_FUNC_pcap)
goto error;
break;
case BPF_MAP_TYPE_STACK_TRACE:
@@ -3527,6 +3528,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
case BPF_FUNC_perf_event_read_value:
+ case BPF_FUNC_pcap:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index ed65636..e0e23ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4158,6 +4158,35 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+BPF_CALL_5(bpf_xdp_pcap, struct xdp_buff *, xdp, u32, size,
+ struct bpf_map *, map, int, protocol, u64, flags)
+{
+ unsigned long len = (unsigned long)(xdp->data_end - xdp->data);
+ struct bpf_pcap_hdr pcap;
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
+ return -EINVAL;
+
+ ret = bpf_pcap_prepare(protocol, size, len, flags, &pcap);
+ if (ret)
+ return ret;
+
+ return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+ xdp->data, pcap.cap_len, bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_pcap_proto = {
+ .func = bpf_xdp_pcap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_MAP_PTR,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
return skb->sk ? sock_gen_cookie(skb->sk) : 0;
@@ -5926,6 +5955,34 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
#endif /* CONFIG_INET */
+BPF_CALL_5(bpf_skb_pcap, struct sk_buff *, skb, u32, size,
+ struct bpf_map *, map, int, protocol, u64, flags)
+{
+ struct bpf_pcap_hdr pcap;
+ int ret;
+
+ if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
+ return -EINVAL;
+
+ ret = bpf_pcap_prepare(protocol, size, skb->len, flags, &pcap);
+ if (ret)
+ return ret;
+
+ return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+ skb, pcap.cap_len, bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_pcap_proto = {
+ .func = bpf_skb_pcap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_MAP_PTR,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
bool bpf_helper_changes_pkt_data(void *func)
{
if (func == bpf_skb_vlan_push ||
@@ -6075,6 +6132,8 @@ bool bpf_helper_changes_pkt_data(void *func)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_perf_event_output:
return &bpf_skb_event_output_proto;
+ case BPF_FUNC_pcap:
+ return &bpf_skb_pcap_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -6216,6 +6275,8 @@ bool bpf_helper_changes_pkt_data(void *func)
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
#endif
+ case BPF_FUNC_pcap:
+ return &bpf_skb_pcap_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -6256,6 +6317,8 @@ bool bpf_helper_changes_pkt_data(void *func)
return &bpf_tcp_check_syncookie_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_pcap:
+ return &bpf_xdp_pcap_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6361,6 +6424,8 @@ bool bpf_helper_changes_pkt_data(void *func)
case BPF_FUNC_skc_lookup_tcp:
return &bpf_skc_lookup_tcp_proto;
#endif
+ case BPF_FUNC_pcap:
+ return &bpf_skb_pcap_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -6399,6 +6464,8 @@ bool bpf_helper_changes_pkt_data(void *func)
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
+ case BPF_FUNC_pcap:
+ return &bpf_skb_pcap_proto;
default:
return bpf_base_func_proto(func_id);
}
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture
2019-09-07 21:40 ` [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture Alan Maguire
@ 2019-09-08 22:02 ` Yonghong Song
0 siblings, 0 replies; 12+ messages in thread
From: Yonghong Song @ 2019-09-08 22:02 UTC (permalink / raw)
To: Alan Maguire, ast, daniel, Martin Lau, Song Liu, davem,
jakub.kicinski, hawk, john.fastabend, rostedt, mingo,
quentin.monnet, Andrey Ignatov, joe, acme, jolsa,
alexey.budankov, gregkh, namhyung, sdf, f.fainelli, shuah, peter,
ivan, Andrii Nakryiko, bhole_prashant_q7, david.calavera,
danieltimlee, Takshak Chahande, netdev, bpf, linux-kselftest
On 9/7/19 2:40 PM, Alan Maguire wrote:
> bpf_pcap() simplifies packet capture for skb and XDP
> BPF programs by creating a BPF perf event containing information
> relevant for packet capture (protocol, actual/captured packet
> size, time of capture, etc) along with the packet payload itself.
> All of this is stored in a "struct bpf_pcap_hdr".
>
> This header information can then be retrieved from the perf
> event map and used by packet capture frameworks such as libpcap
> to carry out packet capture.
>
> skb and XDP programs currently deal in Ethernet-based traffic
> exclusively, so should specify BPF_PCAP_TYPE_ETH or
> BPF_PCAP_TYPE_UNSET. The protocol parameter will be used
> in a later commit.
>
> Note that libpcap assumes times are relative to the epoch while
> we record nanoseconds since boot; as a result any times need
> to be normalized with respect to the boot time for libpcap
> storage; sysinfo(2) can be used to retrieve boot time to normalize
> values appropriately.
>
> Example usage for a tc-bpf program:
>
> struct bpf_map_def SEC("maps") pcap_map = {
> .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> .key_size = sizeof(int),
> .value_size = sizeof(int),
> .max_entries = 1024,
> };
>
> SEC("cap")
> int cap(struct __sk_buff *skb)
> {
> bpf_pcap(skb, 1514, &pcap_map, BPF_PCAP_TYPE_ETH, 0);
>
> return TC_ACT_OK;
> }
>
> Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
> ---
> include/linux/bpf.h | 20 +++++++++++++
> include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++-
> kernel/bpf/verifier.c | 4 ++-
> net/core/filter.c | 67 ++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 164 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5b9d223..033c9cf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1145,4 +1145,24 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
> }
> #endif /* CONFIG_INET */
>
> +
> +static inline int bpf_pcap_prepare(int protocol, u32 cap_len, u32 tot_len,
> + u64 flags, struct bpf_pcap_hdr *pcap)
> +{
> + if (protocol < 0 || pcap == NULL)
> + return -EINVAL;
> +
> + pcap->magic = BPF_PCAP_MAGIC;
> + pcap->protocol = protocol;
> + pcap->flags = flags;
> +
> + if (cap_len == 0 || tot_len < cap_len)
> + cap_len = tot_len;
> + pcap->cap_len = cap_len;
> + pcap->tot_len = tot_len;
> + pcap->ktime_ns = ktime_get_mono_fast_ns();
> +
> + return 0;
> +}
> +
> #endif /* _LINUX_BPF_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 77c6be9..a27e58e 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2750,6 +2750,39 @@ struct bpf_stack_build_id {
> * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
> *
> * **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> + *
> + * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
> + * u64 flags)
> + * Description
> + * Write packet data from *data* into a special BPF perf event
> + * held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
> + * perf event has the same attributes as perf events generated
> + * by bpf_perf_event_output. For skb and xdp programs, *data*
> + * is the relevant context.
> + *
> + * Metadata for this event is a **struct bpf_pcap_hdr**; this
> + * contains the capture length, actual packet length and
> + * the starting protocol.
> + *
> + * The max number of bytes of context to store is specified via
> + * *size*.
> + *
> + * The flags value can be used to specify an id value of up
> + * to 48 bits; the id can be used to correlate captured packets
> + * with other trace data, since the passed-in flags value is
> + * stored in the **struct bpf_pcap_hdr** in the **flags** field.
> + *
> + * The *protocol* value specifies the protocol type of the start
> + * of the packet so that packet capture can carry out
> + * interpretation. See **pcap-linktype** (7) for details on
> + * the supported values.
> + *
> + * Return
> + * 0 on success, or a negative error in case of failure.
> + * -ENOENT will be returned if the associated perf event
> + * map entry is empty, or the skb is zero-length.
> + * -EINVAL will be returned if the flags value is invalid.
The feature itself indeed seems useful for networking community.
I just have some questions what kind of kernel support is needed.
We already have perf_event_output for skb and xdp.
Can we just use the original helpers and do a little bit post
processing in user space to get what you want?
It looks possible to me to generate bpf_pcap_hdr in user space
before sending output to pcap analyzer.
> + *
> */
> #define __BPF_FUNC_MAPPER(FN) \
> FN(unspec), \
> @@ -2862,7 +2895,8 @@ struct bpf_stack_build_id {
> FN(sk_storage_get), \
> FN(sk_storage_delete), \
> FN(send_signal), \
> - FN(tcp_gen_syncookie),
> + FN(tcp_gen_syncookie), \
> + FN(pcap),
>
> /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> * function eBPF program intends to call
> @@ -2941,6 +2975,9 @@ enum bpf_func_id {
> /* BPF_FUNC_sk_storage_get flags */
> #define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0)
>
> +/* BPF_FUNC_pcap flags */
> +#define BPF_F_PCAP_ID_MASK 0xffffffffffff
> +
> /* Mode for BPF_FUNC_skb_adjust_room helper. */
> enum bpf_adj_room_mode {
> BPF_ADJ_ROOM_NET,
> @@ -3613,4 +3650,40 @@ struct bpf_sockopt {
> __s32 retval;
> };
>
> +/* bpf_pcap_hdr contains information related to a particular packet capture
> + * flow. It specifies
> + *
> + * - a magic number BPF_PCAP_MAGIC which identifies the perf event as
> + * a pcap-related event.
> + * - a starting protocol is the protocol associated with the header
> + * - a flags value, copied from the flags value passed into bpf_pcap().
> + * IDs can be used to correlate packet capture data and other tracing data.
> + *
> + * bpf_pcap_hdr also contains the information relating to the to-be-captured
> + * packet, and closely corresponds to the struct pcap_pkthdr used by
> + * pcap_dump (3PCAP). The bpf_pcap helper sets ktime_ns (nanoseconds since
> + * boot) to the ktime_ns value; to get sensible pcap times this value should
> + * be converted to a struct timeval time since epoch in the struct pcap_pkthdr.
> + *
> + * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
> + * need both information about the particular packet and the protocol
> + * we are capturing.
> + */
> +
> +#define BPF_PCAP_MAGIC 0xb7fca7
> +
> +struct bpf_pcap_hdr {
> + __u32 magic;
> + int protocol;
> + __u64 flags;
> + __u64 ktime_ns;
> + __u32 tot_len;
> + __u32 cap_len;
> + __u8 data[0];
> +};
> +
> +#define BPF_PCAP_TYPE_UNSET -1
> +#define BPF_PCAP_TYPE_ETH 1
> +#define BPF_PCAP_TYPE_IP 12
> +
> #endif /* _UAPI__LINUX_BPF_H__ */
[...]
> diff --git a/net/core/filter.c b/net/core/filter.c
> index ed65636..e0e23ee 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4158,6 +4158,35 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
> .arg5_type = ARG_CONST_SIZE_OR_ZERO,
> };
>
> +BPF_CALL_5(bpf_xdp_pcap, struct xdp_buff *, xdp, u32, size,
> + struct bpf_map *, map, int, protocol, u64, flags)
> +{
> + unsigned long len = (unsigned long)(xdp->data_end - xdp->data);
> + struct bpf_pcap_hdr pcap;
> + int ret;
> +
> + if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
> + return -EINVAL;
> +
> + ret = bpf_pcap_prepare(protocol, size, len, flags, &pcap);
> + if (ret)
> + return ret;
> +
> + return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> + xdp->data, pcap.cap_len, bpf_xdp_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_xdp_pcap_proto = {
> + .func = bpf_xdp_pcap,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_CTX,
> + .arg2_type = ARG_ANYTHING,
> + .arg3_type = ARG_CONST_MAP_PTR,
> + .arg4_type = ARG_ANYTHING,
> + .arg5_type = ARG_ANYTHING,
> +};
> +
> BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
> {
> return skb->sk ? sock_gen_cookie(skb->sk) : 0;
> @@ -5926,6 +5955,34 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
>
> #endif /* CONFIG_INET */
>
> +BPF_CALL_5(bpf_skb_pcap, struct sk_buff *, skb, u32, size,
> + struct bpf_map *, map, int, protocol, u64, flags)
> +{
> + struct bpf_pcap_hdr pcap;
> + int ret;
> +
> + if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
> + return -EINVAL;
> +
> + ret = bpf_pcap_prepare(protocol, size, skb->len, flags, &pcap);
> + if (ret)
> + return ret;
> +
> + return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> + skb, pcap.cap_len, bpf_skb_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_skb_pcap_proto = {
> + .func = bpf_skb_pcap,
> + .gpl_only = false,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_PTR_TO_CTX,
> + .arg2_type = ARG_ANYTHING,
> + .arg3_type = ARG_CONST_MAP_PTR,
> + .arg4_type = ARG_ANYTHING,
> + .arg5_type = ARG_ANYTHING,
> +};
> +
> bool bpf_helper_changes_pkt_data(void *func)
> {
> if (func == bpf_skb_vlan_push ||
> @@ -6075,6 +6132,8 @@ bool bpf_helper_changes_pkt_data(void *func)
> return &bpf_get_socket_uid_proto;
> case BPF_FUNC_perf_event_output:
> return &bpf_skb_event_output_proto;
> + case BPF_FUNC_pcap:
> + return &bpf_skb_pcap_proto;
> default:
> return bpf_base_func_proto(func_id);
> }
> @@ -6216,6 +6275,8 @@ bool bpf_helper_changes_pkt_data(void *func)
> case BPF_FUNC_tcp_gen_syncookie:
> return &bpf_tcp_gen_syncookie_proto;
> #endif
> + case BPF_FUNC_pcap:
> + return &bpf_skb_pcap_proto;
> default:
> return bpf_base_func_proto(func_id);
> }
> @@ -6256,6 +6317,8 @@ bool bpf_helper_changes_pkt_data(void *func)
> return &bpf_tcp_check_syncookie_proto;
> case BPF_FUNC_tcp_gen_syncookie:
> return &bpf_tcp_gen_syncookie_proto;
> + case BPF_FUNC_pcap:
> + return &bpf_xdp_pcap_proto;
> #endif
> default:
> return bpf_base_func_proto(func_id);
> @@ -6361,6 +6424,8 @@ bool bpf_helper_changes_pkt_data(void *func)
> case BPF_FUNC_skc_lookup_tcp:
> return &bpf_skc_lookup_tcp_proto;
> #endif
> + case BPF_FUNC_pcap:
> + return &bpf_skb_pcap_proto;
> default:
> return bpf_base_func_proto(func_id);
> }
> @@ -6399,6 +6464,8 @@ bool bpf_helper_changes_pkt_data(void *func)
> return &bpf_get_smp_processor_id_proto;
> case BPF_FUNC_skb_under_cgroup:
> return &bpf_skb_under_cgroup_proto;
> + case BPF_FUNC_pcap:
> + return &bpf_skb_pcap_proto;
> default:
> return bpf_base_func_proto(func_id);
> }
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
2019-09-08 22:18 ` Yonghong Song
2019-09-07 21:40 ` [RFC bpf-next 3/7] bpf: sync tools/include/uapi/linux/bpf.h for pcap support Alan Maguire
` (4 subsequent siblings)
6 siblings, 1 reply; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
packet capture is especially valuable in tracing contexts, so
extend bpf_pcap helper to take a tracing-derived skb pointer
as an argument.
In the case of tracing programs, the starting protocol
(corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for
IP, etc) needs to be specified and should reflect the protocol
type which is pointed to by the skb->data pointer; i.e. the
start of the packet. This can be derived in a limited set of cases,
but should be specified where possible. For skb and xdp programs
this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).
Example usage for a tracing program, where we use a
struct bpf_pcap_hdr array map to pass in preferences for
protocol and max len:
struct bpf_map_def SEC("maps") pcap_conf_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(struct bpf_pcap_hdr),
.max_entries = 1,
};
struct bpf_map_def SEC("maps") pcap_map = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1024,
};
SEC("kprobe/kfree_skb")
int probe_kfree_skb(struct pt_regs *ctx)
{
struct bpf_pcap_hdr *conf;
int key = 0;
conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
if (!conf)
return 0;
bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
conf->protocol, BPF_F_CURRENT_CPU);
return 0;
}
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
include/uapi/linux/bpf.h | 21 ++++-
kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 233 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a27e58e..13f86d3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2758,7 +2758,9 @@ struct bpf_stack_build_id {
* held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
* perf event has the same attributes as perf events generated
* by bpf_perf_event_output. For skb and xdp programs, *data*
- * is the relevant context.
+ * is the relevant context, while for tracing programs,
+ * *data* must be a pointer to a **struct sk_buff** derived
+ * from kprobe or tracepoint arguments.
*
* Metadata for this event is a **struct bpf_pcap_hdr**; this
* contains the capture length, actual packet length and
@@ -2771,6 +2773,14 @@ struct bpf_stack_build_id {
* to 48 bits; the id can be used to correlate captured packets
* with other trace data, since the passed-in flags value is stored
* stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ * Specifying **BPF_F_PCAP_ID_IIFINDEX** and a non-zero value in
+ * the id portion of the flags limits capture events to skbs
+ * with the specified incoming ifindex, allowing limiting of
+ * tracing to the associated interface. Specifying
+ * **BPF_F_PCAP_STRICT_TYPE** will cause *bpf_pcap* to return
+ * -EPROTO and skip capture if a specific protocol is specified
+ * and it does not match the current skb. These additional flags
+ * are only valid (and useful) for tracing programs.
*
* The *protocol* value specifies the protocol type of the start
* of the packet so that packet capture can carry out
@@ -2780,7 +2790,12 @@ struct bpf_stack_build_id {
* Return
* 0 on success, or a negative error in case of failure.
* -ENOENT will be returned if the associated perf event
- * map entry is empty, or the skb is zero-length.
+ * map entry is empty, the skb is zero-length, or the incoming
+ * ifindex was specified and we failed to match.
+ * -EPROTO will be returned if **BPF_PCAP_TYPE_UNSET** is specified
+ * and no protocol can be determined, or if we specify a protocol
+ * along with **BPF_F_PCAP_STRICT_TYPE** and the skb protocol does
+ * not match.
* -EINVAL will be returned if the flags value is invalid.
*
*/
@@ -2977,6 +2992,8 @@ enum bpf_func_id {
/* BPF_FUNC_pcap flags */
#define BPF_F_PCAP_ID_MASK 0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX (1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE (1ULL << 56)
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d..311883b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,8 @@
#include <linux/kprobes.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
#include <asm/tlb.h>
@@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
return __bpf_perf_event_output(regs, map, flags, sd);
}
+/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
+static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
+ unsigned long offset,
+ unsigned long len)
+{
+ const struct sk_buff *frag_iterp, *skb = from;
+ struct skb_shared_info *shinfop, shinfo;
+ struct sk_buff frag_iter;
+ unsigned long copy, start;
+ void *to = tobuf;
+ int i, ret;
+
+ start = skb_headlen(skb);
+
+ copy = start - offset;
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ ret = probe_kernel_read(to, skb->data, copy);
+ if (unlikely(ret < 0))
+ goto out;
+ len -= copy;
+ if (len == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+
+ if (skb->data_len == 0)
+ goto out;
+
+ shinfop = skb_shinfo(skb);
+
+ ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
+ if (unlikely(ret < 0))
+ goto out;
+
+ if (shinfo.nr_frags > MAX_SKB_FRAGS) {
+ ret = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < shinfo.nr_frags; i++) {
+ skb_frag_t *f = &shinfo.frags[i];
+ int end;
+
+ if (start > offset + len) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ end = start + skb_frag_size(f);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 poff, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ skb_frag_foreach_page(f,
+ skb_frag_off(f) + offset - start,
+ copy, p, poff, p_len, copied) {
+
+ vaddr = kmap_atomic(p);
+ ret = probe_kernel_read(to + copied,
+ vaddr + poff, p_len);
+ kunmap_atomic(vaddr);
+
+ if (unlikely(ret < 0))
+ goto out;
+ }
+ len -= copy;
+ if (len == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+
+ for (frag_iterp = shinfo.frag_list; frag_iterp;
+ frag_iterp = frag_iter.next) {
+ int end;
+
+ if (start > offset + len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ ret = probe_kernel_read(&frag_iter, frag_iterp,
+ sizeof(frag_iter));
+ if (ret)
+ goto out;
+
+ end = start + frag_iter.len;
+ copy = end - offset;
+ if (copy > 0) {
+ if (copy > len)
+ copy = len;
+ ret = bpf_trace_skb_copy(to, &frag_iter,
+ offset - start,
+ copy);
+ if (ret)
+ goto out;
+
+ len -= copy;
+ if (len == 0)
+ return 0;
+ offset += copy;
+ to += copy;
+ }
+ start = end;
+ }
+out:
+ if (ret)
+ memset(tobuf, 0, len);
+
+ return ret;
+}
+
+/* Derive protocol for some of the easier cases. For tracing, a probe point
+ * may be dealing with packets in various states. Common cases are IP
+ * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
+ * (_PCAP_TYPE_ETH). For other cases the caller must specify the
+ * protocol they expect. Other heuristics for packet identification
+ * should be added here as needed, since determining the packet type
+ * ensures we do not capture packets that fail to match the desired
+ * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
+ */
+static inline int bpf_skb_protocol_get(struct sk_buff *skb)
+{
+ switch (htons(skb->protocol)) {
+ case ETH_P_IP:
+ case ETH_P_IPV6:
+ if (skb_network_header(skb) == skb->data)
+ return BPF_PCAP_TYPE_IP;
+ else
+ return BPF_PCAP_TYPE_ETH;
+ default:
+ return BPF_PCAP_TYPE_UNSET;
+ }
+}
+
+BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
+ int, protocol_wanted, u64, flags)
+{
+ struct bpf_pcap_hdr pcap;
+ struct sk_buff skb;
+ int protocol;
+ int ret;
+
+ if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
+ BPF_F_PCAP_STRICT_TYPE)))
+ return -EINVAL;
+
+ ret = probe_kernel_read(&skb, data, sizeof(skb));
+ if (unlikely(ret < 0))
+ return ret;
+
+ /* Sanity check skb len in case we get bogus data. */
+ if (unlikely(!skb.len))
+ return -ENOENT;
+ if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
+ return -E2BIG;
+
+ protocol = bpf_skb_protocol_get(&skb);
+
+ if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
+ /* If we cannot determine protocol type, bail. */
+ if (protocol == BPF_PCAP_TYPE_UNSET)
+ return -EPROTO;
+ } else {
+ /* if we determine protocol type, and it's not what we asked
+ * for _and_ we are in strict mode, bail. Otherwise we assume
+ * the packet is the requested protocol type and drive on.
+ */
+ if (flags & BPF_F_PCAP_STRICT_TYPE &&
+ protocol != BPF_PCAP_TYPE_UNSET &&
+ protocol != protocol_wanted)
+ return -EPROTO;
+ protocol = protocol_wanted;
+ }
+
+ /* If we specified a matching incoming ifindex, bail if not a match. */
+ if (flags & BPF_F_PCAP_ID_IIFINDEX) {
+ int iif = flags & BPF_F_PCAP_ID_MASK;
+
+ if (iif && skb.skb_iif != iif)
+ return -ENOENT;
+ }
+
+ ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
+ if (ret)
+ return ret;
+
+ return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+ &skb, pcap.cap_len, bpf_trace_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_trace_pcap_proto = {
+ .func = bpf_trace_pcap,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_CONST_MAP_PTR,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
BPF_CALL_0(bpf_get_current_task)
{
return (long) current;
@@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
#endif
case BPF_FUNC_send_signal:
return &bpf_send_signal_proto;
+ case BPF_FUNC_pcap:
+ return &bpf_trace_pcap_proto;
default:
return NULL;
}
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs
2019-09-07 21:40 ` [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs Alan Maguire
@ 2019-09-08 22:18 ` Yonghong Song
2019-09-09 22:25 ` Alan Maguire
0 siblings, 1 reply; 12+ messages in thread
From: Yonghong Song @ 2019-09-08 22:18 UTC (permalink / raw)
To: Alan Maguire, ast, daniel, Martin Lau, Song Liu, davem,
jakub.kicinski, hawk, john.fastabend, rostedt, mingo,
quentin.monnet, Andrey Ignatov, joe, acme, jolsa,
alexey.budankov, gregkh, namhyung, sdf, f.fainelli, shuah, peter,
ivan, Andrii Nakryiko, bhole_prashant_q7, david.calavera,
danieltimlee, Takshak Chahande, netdev, bpf, linux-kselftest
On 9/7/19 2:40 PM, Alan Maguire wrote:
> packet capture is especially valuable in tracing contexts, so
> extend bpf_pcap helper to take a tracing-derived skb pointer
> as an argument.
>
> In the case of tracing programs, the starting protocol
> (corresponding to libpcap DLT_* values; 1 for Ethernet, 12 for
> IP, etc) needs to be specified and should reflect the protocol
> type which is pointed to by the skb->data pointer; i.e. the
> start of the packet. This can be derived in a limited set of cases,
> but should be specified where possible. For skb and xdp programs
> this protocol will nearly always be 1 (BPF_PCAP_TYPE_ETH).
>
> Example usage for a tracing program, where we use a
> struct bpf_pcap_hdr array map to pass in preferences for
> protocol and max len:
>
> struct bpf_map_def SEC("maps") pcap_conf_map = {
> .type = BPF_MAP_TYPE_ARRAY,
> .key_size = sizeof(int),
> .value_size = sizeof(struct bpf_pcap_hdr),
> .max_entries = 1,
> };
>
> struct bpf_map_def SEC("maps") pcap_map = {
> .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> .key_size = sizeof(int),
> .value_size = sizeof(int),
> .max_entries = 1024,
> };
>
> SEC("kprobe/kfree_skb")
> int probe_kfree_skb(struct pt_regs *ctx)
> {
> struct bpf_pcap_hdr *conf;
> int key = 0;
>
> conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
> if (!conf)
> return 0;
>
> bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len, &pcap_map,
> conf->protocol, BPF_F_CURRENT_CPU);
> return 0;
> }
>
> Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
> ---
[...]
> @@ -2977,6 +2992,8 @@ enum bpf_func_id {
>
> /* BPF_FUNC_pcap flags */
> #define BPF_F_PCAP_ID_MASK 0xffffffffffff
> +#define BPF_F_PCAP_ID_IIFINDEX (1ULL << 48)
> +#define BPF_F_PCAP_STRICT_TYPE (1ULL << 56)
>
> /* Mode for BPF_FUNC_skb_adjust_room helper. */
> enum bpf_adj_room_mode {
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ca1255d..311883b 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -13,6 +13,8 @@
> #include <linux/kprobes.h>
> #include <linux/syscalls.h>
> #include <linux/error-injection.h>
> +#include <linux/skbuff.h>
> +#include <linux/ip.h>
>
> #include <asm/tlb.h>
>
> @@ -530,6 +532,216 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
> return __bpf_perf_event_output(regs, map, flags, sd);
> }
>
> +/* Essentially just skb_copy_bits() using probe_kernel_read() where needed. */
> +static unsigned long bpf_trace_skb_copy(void *tobuf, const void *from,
> + unsigned long offset,
> + unsigned long len)
> +{
> + const struct sk_buff *frag_iterp, *skb = from;
> + struct skb_shared_info *shinfop, shinfo;
> + struct sk_buff frag_iter;
> + unsigned long copy, start;
> + void *to = tobuf;
> + int i, ret;
> +
> + start = skb_headlen(skb);
> +
> + copy = start - offset;
> + if (copy > 0) {
> + if (copy > len)
> + copy = len;
> + ret = probe_kernel_read(to, skb->data, copy);
> + if (unlikely(ret < 0))
> + goto out;
> + len -= copy;
> + if (len == 0)
> + return 0;
> + offset += copy;
> + to += copy;
> + }
> +
> + if (skb->data_len == 0)
> + goto out;
> +
> + shinfop = skb_shinfo(skb);
> +
> + ret = probe_kernel_read(&shinfo, shinfop, sizeof(shinfo));
> + if (unlikely(ret < 0))
> + goto out;
> +
> + if (shinfo.nr_frags > MAX_SKB_FRAGS) {
> + ret = -EINVAL;
> + goto out;
> + }
> + for (i = 0; i < shinfo.nr_frags; i++) {
> + skb_frag_t *f = &shinfo.frags[i];
> + int end;
> +
> + if (start > offset + len) {
> + ret = -E2BIG;
> + goto out;
> + }
> +
> + end = start + skb_frag_size(f);
> + copy = end - offset;
> + if (copy > 0) {
> + u32 poff, p_len, copied;
> + struct page *p;
> + u8 *vaddr;
> +
> + if (copy > len)
> + copy = len;
> +
> + skb_frag_foreach_page(f,
> + skb_frag_off(f) + offset - start,
> + copy, p, poff, p_len, copied) {
> +
> + vaddr = kmap_atomic(p);
> + ret = probe_kernel_read(to + copied,
> + vaddr + poff, p_len);
> + kunmap_atomic(vaddr);
> +
> + if (unlikely(ret < 0))
> + goto out;
> + }
> + len -= copy;
> + if (len == 0)
> + return 0;
> + offset += copy;
> + to += copy;
> + }
> + start = end;
> + }
> +
> + for (frag_iterp = shinfo.frag_list; frag_iterp;
> + frag_iterp = frag_iter.next) {
> + int end;
> +
> + if (start > offset + len) {
> + ret = -E2BIG;
> + goto out;
> + }
> + ret = probe_kernel_read(&frag_iter, frag_iterp,
> + sizeof(frag_iter));
> + if (ret)
> + goto out;
> +
> + end = start + frag_iter.len;
> + copy = end - offset;
> + if (copy > 0) {
> + if (copy > len)
> + copy = len;
> + ret = bpf_trace_skb_copy(to, &frag_iter,
> + offset - start,
> + copy);
> + if (ret)
> + goto out;
> +
> + len -= copy;
> + if (len == 0)
> + return 0;
> + offset += copy;
> + to += copy;
> + }
> + start = end;
> + }
> +out:
> + if (ret)
> + memset(tobuf, 0, len);
> +
> + return ret;
> +}
For net side bpf_perf_event_output, we have
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
unsigned long off, unsigned long len)
{
void *ptr = skb_header_pointer(skb, off, len, dst_buff);
if (unlikely(!ptr))
return len;
if (ptr != dst_buff)
memcpy(dst_buff, ptr, len);
return 0;
}
BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map
*, map,
u64, flags, void *, meta, u64, meta_size)
{
u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
if (unlikely(skb_size > skb->len))
return -EFAULT;
return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
bpf_skb_copy);
}
It does not really consider output all the frags.
I understand that to get truly all packet data, frags should be
considered, but seems we did not do it before? I am wondering
whether we need to do it here.
If we indeed do not need to handle frags here, I think maybe
bpf_probe_read() in existing bpf kprobe function should be
enough, we do not need this helper?
> +
> +/* Derive protocol for some of the easier cases. For tracing, a probe point
> + * may be dealing with packets in various states. Common cases are IP
> + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
> + * (_PCAP_TYPE_ETH). For other cases the caller must specify the
> + * protocol they expect. Other heuristics for packet identification
> + * should be added here as needed, since determining the packet type
> + * ensures we do not capture packets that fail to match the desired
> + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
> + */
> +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
> +{
> + switch (htons(skb->protocol)) {
> + case ETH_P_IP:
> + case ETH_P_IPV6:
> + if (skb_network_header(skb) == skb->data)
> + return BPF_PCAP_TYPE_IP;
> + else
> + return BPF_PCAP_TYPE_ETH;
> + default:
> + return BPF_PCAP_TYPE_UNSET;
> + }
> +}
> +
> +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
> + int, protocol_wanted, u64, flags)
Up to now, for helpers, the verifier has a way to verify it is used
properly with regard to the context. For example, for the xdp version of
perf_event_output, the help prototype,
BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct
bpf_map *, map,
u64, flags, void *, meta, u64, meta_size)
the verifier is able to guarantee that the first parameter
has correct type xdp_buff, not something from type cast.
.arg1_type = ARG_PTR_TO_CTX,
This helper, in the below we have
.arg1_type = ARG_ANYTHING,
So it is not really enforced. Bringing BTF can help, but type
name matching is typically bad.
> +{
> + struct bpf_pcap_hdr pcap;
> + struct sk_buff skb;
> + int protocol;
> + int ret;
> +
> + if (unlikely(flags & ~(BPF_F_PCAP_ID_IIFINDEX | BPF_F_PCAP_ID_MASK |
> + BPF_F_PCAP_STRICT_TYPE)))
> + return -EINVAL;
> +
> + ret = probe_kernel_read(&skb, data, sizeof(skb));
> + if (unlikely(ret < 0))
> + return ret;
> +
> + /* Sanity check skb len in case we get bogus data. */
> + if (unlikely(!skb.len))
> + return -ENOENT;
> + if (unlikely(skb.len > GSO_MAX_SIZE || skb.data_len > skb.len))
> + return -E2BIG;
> +
> + protocol = bpf_skb_protocol_get(&skb);
> +
> + if (protocol_wanted == BPF_PCAP_TYPE_UNSET) {
> + /* If we cannot determine protocol type, bail. */
> + if (protocol == BPF_PCAP_TYPE_UNSET)
> + return -EPROTO;
> + } else {
> + /* if we determine protocol type, and it's not what we asked
> + * for _and_ we are in strict mode, bail. Otherwise we assume
> + * the packet is the requested protocol type and drive on.
> + */
> + if (flags & BPF_F_PCAP_STRICT_TYPE &&
> + protocol != BPF_PCAP_TYPE_UNSET &&
> + protocol != protocol_wanted)
> + return -EPROTO;
> + protocol = protocol_wanted;
> + }
> +
> + /* If we specified a matching incoming ifindex, bail if not a match. */
> + if (flags & BPF_F_PCAP_ID_IIFINDEX) {
> + int iif = flags & BPF_F_PCAP_ID_MASK;
> +
> + if (iif && skb.skb_iif != iif)
> + return -ENOENT;
> + }
> +
> + ret = bpf_pcap_prepare(protocol, size, skb.len, flags, &pcap);
> + if (ret)
> + return ret;
> +
> + return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> + &skb, pcap.cap_len, bpf_trace_skb_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_trace_pcap_proto = {
> + .func = bpf_trace_pcap,
> + .gpl_only = true,
> + .ret_type = RET_INTEGER,
> + .arg1_type = ARG_ANYTHING,
> + .arg2_type = ARG_ANYTHING,
> + .arg3_type = ARG_CONST_MAP_PTR,
> + .arg4_type = ARG_ANYTHING,
> + .arg5_type = ARG_ANYTHING,
> +};
> +
> BPF_CALL_0(bpf_get_current_task)
> {
> return (long) current;
> @@ -709,6 +921,8 @@ static void do_bpf_send_signal(struct irq_work *entry)
> #endif
> case BPF_FUNC_send_signal:
> return &bpf_send_signal_proto;
> + case BPF_FUNC_pcap:
> + return &bpf_trace_pcap_proto;
> default:
> return NULL;
> }
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs
2019-09-08 22:18 ` Yonghong Song
@ 2019-09-09 22:25 ` Alan Maguire
2019-09-10 7:43 ` Yonghong Song
0 siblings, 1 reply; 12+ messages in thread
From: Alan Maguire @ 2019-09-09 22:25 UTC (permalink / raw)
To: Yonghong Song
Cc: Alan Maguire, ast, daniel, Martin Lau, Song Liu, davem,
jakub.kicinski, hawk, john.fastabend, rostedt, mingo,
quentin.monnet, Andrey Ignatov, joe, acme, jolsa,
alexey.budankov, gregkh, namhyung, sdf, f.fainelli, shuah, peter,
ivan, Andrii Nakryiko, bhole_prashant_q7, david.calavera,
danieltimlee, Takshak Chahande, netdev, bpf, linux-kselftest,
toke, jbenc, acme
On Sun, 8 Sep 2019, Yonghong Song wrote:
> For net side bpf_perf_event_output, we have
> static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
> unsigned long off, unsigned long len)
> {
> void *ptr = skb_header_pointer(skb, off, len, dst_buff);
>
> if (unlikely(!ptr))
> return len;
> if (ptr != dst_buff)
> memcpy(dst_buff, ptr, len);
>
> return 0;
> }
>
> BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map
> *, map,
> u64, flags, void *, meta, u64, meta_size)
> {
> u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
>
> if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
> return -EINVAL;
> if (unlikely(skb_size > skb->len))
> return -EFAULT;
>
> return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
> bpf_skb_copy);
> }
>
> It does not really consider output all the frags.
> I understand that to get truly all packet data, frags should be
> considered, but seems we did not do it before? I am wondering
> whether we need to do here.
Thanks for the feedback! In experimenting with packet capture,
my original hope was to keep things simple and avoid fragment parsing
if possible. However if scatter-gather is enabled for the networking
device, or indeed if it's running in a VM it turns out a lot of the
interesting packet data ends up in the fragments on transmit (ssh
headers, http headers etc). So I think it would be worth considering
adding support for fragment traversal. It's not needed as much
in the skb program case - we can always pullup the skb - but in
the tracing situation we probably wouldn't want to do something
that invasive in tracing context.
Fragment traversal might be worth breaking out as a separate patchset,
perhaps triggered by a specific flag to bpf_skb_event_output?
Feedback from folks at Linux Plumbers (I hope I'm summarizing correctly)
seemed to agree with what you mentioned WRT the first patch in this
series. The gist was we probably don't want to force the metadata to be a
specific packet capture type; we'd rather use the existing perf event
mechanisms and if we are indeed doing packet capture, simply specify that
data in the program as metadata.
I'd be happy with that approach myself if I could capture skb
fragments in tracing programs - being able to do that would give
equivalent functionality to what I proposed but without having a packet
capture-specific helper.
>
> If we indeed do not need to handle frags here, I think maybe
> bpf_probe_read() in existing bpf kprobe function should be
> enough, we do not need this helper?
>
Certainly for many use cases, that will get you most of what you need -
particularly if you're just looking at L2 to L4 data. For full packet
capture however I think we may need to think about fragment traversal.
> > +
> > +/* Derive protocol for some of the easier cases. For tracing, a probe point
> > + * may be dealing with packets in various states. Common cases are IP
> > + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
> > + * (_PCAP_TYPE_ETH). For other cases the caller must specify the
> > + * protocol they expect. Other heuristics for packet identification
> > + * should be added here as needed, since determining the packet type
> > + * ensures we do not capture packets that fail to match the desired
> > + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
> > + */
> > +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
> > +{
> > + switch (htons(skb->protocol)) {
> > + case ETH_P_IP:
> > + case ETH_P_IPV6:
> > + if (skb_network_header(skb) == skb->data)
> > + return BPF_PCAP_TYPE_IP;
> > + else
> > + return BPF_PCAP_TYPE_ETH;
> > + default:
> > + return BPF_PCAP_TYPE_UNSET;
> > + }
> > +}
> > +
> > +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
> > + int, protocol_wanted, u64, flags)
>
> Up to now, for helpers, the verifier has a way to verify it is used
> properly with regard to the context. For example, for the xdp version of
> perf_event_output, the help prototype,
> BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct
> bpf_map *, map,
> u64, flags, void *, meta, u64, meta_size)
> the verifier is able to guarantee that the first parameter
> has correct type xdp_buff, not something from type cast.
> .arg1_type = ARG_PTR_TO_CTX,
>
> This helper, in the below we have
> .arg1_type = ARG_ANYTHING,
>
> So it is not really enforced. Bringing BTF can help, but type
> name matching typically bad.
>
>
One thing we were discussing - and I think this is similar to what
you're suggesting - is to investigate if there might be a way to
leverage BTF to provide additional guarantees that the tracing
data we are handling is indeed an skb. Specifically if we
trace a kprobe function argument or a tracepoint function, and
if we had that guarantee, we could perhaps invoke the skb-style
perf event output function (trace both the skb data and the metadata).
The challenge would be how to do that type-based matching; we'd
need the function argument information from BTF _and_ need to
somehow associate it at probe attach time.
Thanks again for looking at the code!
Alan
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs
2019-09-09 22:25 ` Alan Maguire
@ 2019-09-10 7:43 ` Yonghong Song
0 siblings, 0 replies; 12+ messages in thread
From: Yonghong Song @ 2019-09-10 7:43 UTC (permalink / raw)
To: Alan Maguire
Cc: ast, daniel, Martin Lau, Song Liu, davem, jakub.kicinski, hawk,
john.fastabend, rostedt, mingo, quentin.monnet, Andrey Ignatov,
joe, acme, jolsa, alexey.budankov, gregkh, namhyung, sdf,
f.fainelli, shuah, peter, ivan, Andrii Nakryiko,
bhole_prashant_q7, david.calavera, danieltimlee,
Takshak Chahande, netdev, bpf, linux-kselftest, toke, jbenc
On 9/9/19 11:25 PM, Alan Maguire wrote:
> On Sun, 8 Sep 2019, Yonghong Song wrote:
>
>> For net side bpf_perf_event_output, we have
>> static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
>> unsigned long off, unsigned long len)
>> {
>> void *ptr = skb_header_pointer(skb, off, len, dst_buff);
>>
>> if (unlikely(!ptr))
>> return len;
>> if (ptr != dst_buff)
>> memcpy(dst_buff, ptr, len);
>>
>> return 0;
>> }
>>
>> BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map
>> *, map,
>> u64, flags, void *, meta, u64, meta_size)
>> {
>> u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
>>
>> if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
>> return -EINVAL;
>> if (unlikely(skb_size > skb->len))
>> return -EFAULT;
>>
>> return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
>> bpf_skb_copy);
>> }
>>
>> It does not really consider output all the frags.
>> I understand that to get truly all packet data, frags should be
>> considered, but seems we did not do it before? I am wondering
>> whether we need to do here.
>
> Thanks for the feedback! In experimenting with packet capture,
> my original hope was to keep things simple and avoid fragment parsing
> if possible. However if scatter-gather is enabled for the networking
> device, or indeed if it's running in a VM it turns out a lot of the
> interesting packet data ends up in the fragments on transmit (ssh
> headers, http headers etc). So I think it would be worth considering
> adding support for fragment traversal. It's not needed as much
> in the skb program case - we can always pullup the skb - but in
> the tracing situation we probably wouldn't want to do something
> that invasive in tracing context.
Agree that in tracing context, we should avoid push/pull skb. It is
indeed invasive.
>
> Fragment traversal might be worth breaking out as a separate patchset,
> perhaps triggered by a specific flag to bpf_skb_event_output?
This can be done for bpf_skb_event_output as the context is a sk_buff.
And you can just follow the frags to copy the whole thing without
bpf_probe_read().
>
> Feedback from folks at Linux Plumbers (I hope I'm summarizing correctly)
> seemed to agree with what you mentioned WRT the first patch in this
> series. The gist was we probably don't want to force the metadata to be a
> specific packet capture type; we'd rather use the existing perf event
> mechanisms and if we are indeed doing packet capture, simply specify that
> data in the program as metadata.
Agree, you can have whatever metadata you have for bpf_perf_event_output.
>
> I'd be happy with that approach myself if I could capture skb
> fragments in tracing programs - being able to do that would give
> equivalent functionality to what I proposed but without having a packet
> capture-specific helper.
That won't work for a tracing program. A tracing version of packet
copying that is full of bpf_probe_read() calls is not nice either.
We may still need a different helper for tracing programs.
I think we need something like below:
- vmlinux BTF at /sys/kernel/btf/kernel, is loaded into kernel.
(/sys/kernel/btf/kernel is the source of truth)
- For a tracing bpf program, if that function eventually
copy helper
bpf_skb_event_output(..., skb, ...)
the verifier needs to verify skb is indeed a valid skb
by tracing back to one of parameters.
Here, I use skb as an example, maybe it can be extended
to other data structures as well.
With this approach, you can reuse some of functions from
tracing side to deal with frag copying and no bpf_probe_read()
is needed.
Here, I use skb as an example, maybe it can be extended
to other data structures as well if needed.
>>
>> If we indeed do not need to handle frags here, I think maybe
>> bpf_probe_read() in existing bpf kprobe function should be
>> enough, we do not need this helper?
>>
>
> Certainly for many use cases, that will get you most of what you need -
> particularly if you're just looking at L2 to L4 data. For full packet
> capture however I think we may need to think about fragment traversal.
>
>>> +
>>> +/* Derive protocol for some of the easier cases. For tracing, a probe point
>>> + * may be dealing with packets in various states. Common cases are IP
>>> + * packets prior to adding MAC header (_PCAP_TYPE_IP) and a full packet
>>> + * (_PCAP_TYPE_ETH). For other cases the caller must specify the
>>> + * protocol they expect. Other heuristics for packet identification
>>> + * should be added here as needed, since determining the packet type
>>> + * ensures we do not capture packets that fail to match the desired
>>> + * pcap type in BPF_F_PCAP_STRICT_TYPE mode.
>>> + */
>>> +static inline int bpf_skb_protocol_get(struct sk_buff *skb)
>>> +{
>>> + switch (htons(skb->protocol)) {
>>> + case ETH_P_IP:
>>> + case ETH_P_IPV6:
>>> + if (skb_network_header(skb) == skb->data)
>>> + return BPF_PCAP_TYPE_IP;
>>> + else
>>> + return BPF_PCAP_TYPE_ETH;
>>> + default:
>>> + return BPF_PCAP_TYPE_UNSET;
>>> + }
>>> +}
>>> +
>>> +BPF_CALL_5(bpf_trace_pcap, void *, data, u32, size, struct bpf_map *, map,
>>> + int, protocol_wanted, u64, flags)
>>
>> Up to now, for helpers, the verifier has a way to verify it is used
>> properly with regard to the context. For example, for the xdp version of
>> perf_event_output, the help prototype,
>> BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct
>> bpf_map *, map,
>> u64, flags, void *, meta, u64, meta_size)
>> the verifier is able to guarantee that the first parameter
>> has correct type xdp_buff, not something from type cast.
>> .arg1_type = ARG_PTR_TO_CTX,
>>
>> This helper, in the below we have
>> .arg1_type = ARG_ANYTHING,
>>
>> So it is not really enforced. Bringing BTF can help, but type
>> name matching typically bad.
>>
>>
> One thing we were discussing - and I think this is similar to what
> you're suggesting - is to investigate if there might be a way to
> leverage BTF to provide additional guarantees that the tracing
> data we are handling is indeed an skb. Specifically if we
> trace a kprobe function argument or a tracepoint function, and
> if we had that guarantee, we could perhaps invoke the skb-style
> perf event output function (trace both the skb data and the metadata).
> The challenge would be how to do that type-based matching; we'd
> need the function argument information from BTF _and_ need to
> somehow associate it at probe attach time.
>
> Thanks again for looking at the code!
>
> Alan
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [RFC bpf-next 3/7] bpf: sync tools/include/uapi/linux/bpf.h for pcap support
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 2/7] bpf: extend bpf_pcap support to tracing programs Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 4/7] bpf: add libpcap feature test Alan Maguire
` (3 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
sync bpf.h updates for bpf_pcap helper and associated definitions
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
tools/include/uapi/linux/bpf.h | 92 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 91 insertions(+), 1 deletion(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77c6be9..13f86d3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2750,6 +2750,54 @@ struct bpf_stack_build_id {
* **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
*
* **-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
+ * u64 flags)
+ * Description
+ * Write packet data from *data* into a special BPF perf event
+ * held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
+ * perf event has the same attributes as perf events generated
+ * by bpf_perf_event_output. For skb and xdp programs, *data*
+ * is the relevant context, while for tracing programs,
+ * *data* must be a pointer to a **struct sk_buff** derived
+ * from kprobe or tracepoint arguments.
+ *
+ * Metadata for this event is a **struct bpf_pcap_hdr**; this
+ * contains the capture length, actual packet length and
+ * the starting protocol.
+ *
+ * The max number of bytes of context to store is specified via
+ * *size*.
+ *
+ * The flags value can be used to specify an id value of up
+ * to 48 bits; the id can be used to correlate captured packets
+ * with other trace data, since the passed-in flags value is
+ * stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ * Specifying **BPF_F_PCAP_ID_IIFINDEX** and a non-zero value in
+ * the id portion of the flags limits capture events to skbs
+ * with the specified incoming ifindex, allowing limiting of
+ * tracing to the associated interface. Specifying
+ * **BPF_F_PCAP_STRICT_TYPE** will cause *bpf_pcap* to return
+ * -EPROTO and skip capture if a specific protocol is specified
+ * and it does not match the current skb. These additional flags
+ * are only valid (and useful) for tracing programs.
+ *
+ * The *protocol* value specifies the protocol type of the start
+ * of the packet so that packet capture can carry out
+ * interpretation. See **pcap-linktype** (7) for details on
+ * the supported values.
+ *
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ * -ENOENT will be returned if the associated perf event
+ * map entry is empty, the skb is zero-length, or the incoming
+ * ifindex was specified and we failed to match.
+ * -EPROTO will be returned if **BPF_PCAP_TYPE_UNSET** is specified
+ * and no protocol can be determined, or if we specify a protocol
+ * along with **BPF_F_PCAP_STRICT_TYPE** and the skb protocol does
+ * not match.
+ * -EINVAL will be returned if the flags value is invalid.
+ *
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2862,7 +2910,8 @@ struct bpf_stack_build_id {
FN(sk_storage_get), \
FN(sk_storage_delete), \
FN(send_signal), \
- FN(tcp_gen_syncookie),
+ FN(tcp_gen_syncookie), \
+ FN(pcap),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -2941,6 +2990,11 @@ enum bpf_func_id {
/* BPF_FUNC_sk_storage_get flags */
#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0)
+/* BPF_FUNC_pcap flags */
+#define BPF_F_PCAP_ID_MASK 0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX (1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE (1ULL << 56)
+
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
@@ -3613,4 +3667,40 @@ struct bpf_sockopt {
__s32 retval;
};
+/* bpf_pcap_hdr contains information related to a particular packet capture
+ * flow. It specifies
+ *
+ * - a magic number BPF_PCAP_MAGIC which identifies the perf event as
+ * a pcap-related event.
+ * - a starting protocol is the protocol associated with the header
+ * - a flags value, copied from the flags value passed into bpf_pcap().
+ * IDs can be used to correlate packet capture data and other tracing data.
+ *
+ * bpf_pcap_hdr also contains the information relating to the to-be-captured
+ * packet, and closely corresponds to the struct pcap_pkthdr used by
+ * pcap_dump (3PCAP). The bpf_pcap helper sets ktime_ns (nanoseconds since
+ * boot) to the ktime_ns value; to get sensible pcap times this value should
+ * be converted to a struct timeval time since epoch in the struct pcap_pkthdr.
+ *
+ * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
+ * need both information about the particular packet and the protocol
+ * we are capturing.
+ */
+
+#define BPF_PCAP_MAGIC 0xb7fca7
+
+struct bpf_pcap_hdr {
+ __u32 magic;
+ int protocol;
+ __u64 flags;
+ __u64 ktime_ns;
+ __u32 tot_len;
+ __u32 cap_len;
+ __u8 data[0];
+};
+
+#define BPF_PCAP_TYPE_UNSET -1
+#define BPF_PCAP_TYPE_ETH 1
+#define BPF_PCAP_TYPE_IP 12
+
#endif /* _UAPI__LINUX_BPF_H__ */
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [RFC bpf-next 4/7] bpf: add libpcap feature test
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
` (2 preceding siblings ...)
2019-09-07 21:40 ` [RFC bpf-next 3/7] bpf: sync tools/include/uapi/linux/bpf.h for pcap support Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 5/7] bpf: add pcap support to bpftool Alan Maguire
` (2 subsequent siblings)
6 siblings, 0 replies; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
this test will be used when deciding whether to add the pcap
support features in the following patch
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
tools/build/Makefile.feature | 2 ++
tools/build/feature/Makefile | 4 ++++
tools/build/feature/test-libpcap.c | 26 ++++++++++++++++++++++++++
3 files changed, 32 insertions(+)
create mode 100644 tools/build/feature/test-libpcap.c
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index 86b793d..35e65418 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -85,6 +85,7 @@ FEATURE_TESTS_EXTRA := \
libbfd-liberty \
libbfd-liberty-z \
libopencsd \
+ libpcap \
libunwind-x86 \
libunwind-x86_64 \
libunwind-arm \
@@ -113,6 +114,7 @@ FEATURE_DISPLAY ?= \
libelf \
libnuma \
numa_num_possible_cpus \
+ libpcap \
libperl \
libpython \
libcrypto \
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 0658b8c..c7585a1 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -27,6 +27,7 @@ FILES= \
test-libelf-mmap.bin \
test-libnuma.bin \
test-numa_num_possible_cpus.bin \
+ test-libpcap.bin \
test-libperl.bin \
test-libpython.bin \
test-libpython-version.bin \
@@ -209,6 +210,9 @@ FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
$(OUTPUT)test-libperl.bin:
$(BUILD) $(FLAGS_PERL_EMBED)
+$(OUTPUT)test-libpcap.bin:
+ $(BUILD) -lpcap
+
$(OUTPUT)test-libpython.bin:
$(BUILD) $(FLAGS_PYTHON_EMBED)
diff --git a/tools/build/feature/test-libpcap.c b/tools/build/feature/test-libpcap.c
new file mode 100644
index 0000000..7f60eb9
--- /dev/null
+++ b/tools/build/feature/test-libpcap.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <pcap.h>
+
+#define PKTLEN 100
+
+int main(void)
+{
+ char dummy_data[PKTLEN] = { 0 };
+ pcap_dumper_t *pcap_dumper;
+ struct pcap_pkthdr hdr;
+ int proto = 1;
+ pcap_t *pcap;
+
+ pcap = pcap_open_dead(proto, PKTLEN);
+ pcap_dumper = pcap_dump_open(pcap, "-");
+ hdr.caplen = PKTLEN;
+ hdr.len = PKTLEN;
+ hdr.ts.tv_sec = 0;
+ hdr.ts.tv_usec = 0;
+ pcap_dump((u_char *)pcap_dumper, &hdr, (const u_char *)dummy_data);
+ pcap_dump_flush(pcap_dumper);
+ pcap_dump_close(pcap_dumper);
+ pcap_close(pcap);
+
+ return 0;
+}
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [RFC bpf-next 5/7] bpf: add pcap support to bpftool
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
` (3 preceding siblings ...)
2019-09-07 21:40 ` [RFC bpf-next 4/7] bpf: add libpcap feature test Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 6/7] bpf: add documentation for bpftool pcap subcommand Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 7/7] bpf: add tests for bpftool packet capture Alan Maguire
6 siblings, 0 replies; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
bpftool is enhanced to be able to both capture from existing skb/xdp
programs ("bpftool pcap prog") and to load tracing programs - including
built-in simple kprobe/raw_tracepoint programs. The end result is
to have a way of dynamically tapping BPF programs, kernel functions
and tracepoints to capture packet data.
"bpftool pcap" support depends on libpcap library and headers presence,
hence the new feature test is used to check for these.
If present, "bpftool pcap" can be used to capture pcap perf event
data from the perf event map associated with a program. For example,
$ bpftool pcap prog id 32 data_out /tmp/cap
...will capture perf event data from the BPF program with id 32,
storing it in the capture file /tmp/cap.
bpftool looks for a perf event map associated with that program and
then captures packets from it in a loop until Ctrl-C is pressed.
By default stdout is used, so the following also works:
$ bpftool pcap prog id 32 | tcpdump -r -
Configuration can also be passed to pcap programs, provided they
define a single-element BPF_MAP_TYPE_ARRAY with value size
greater than "sizeof struct bpf_pcap_hdr". Options include
data_out FILE packet capture file to use (stdout is default)
proto PROTO DLT_* type as per libpcap; by specifying a type
BPF programs can query the map in-kernel and
capture packets of that type. A string
or numeric value can be used. It is set in the
bpf_pcap_hdr associated with the configuration
map as the "protocol" value.
len MAXLEN maximum capture length in bytes. It is set in
the bpf_pcap_hdr associated with the configuration
map as the "cap_len" value.
dev DEVICE incoming interface. Tracing will be restricted
to skbs which have this incoming interface set.
The flags associated with the bpf_pcap_hdr
in the configuration map are adjusted to record
the associated ifindex to limit tracing.
In addition to capturing from existing programs, it is possible
to load provided programs which trace kprobe entry and raw_tracepoints,
making the first four arguments of each available for tracing.
For example
$ bpftool pcap trace kprobe/ip_rcv proto ip | tcpdump -r -
...will load a provided kprobe program, set the configuration options
in its associated map and capture packets which the bpf_pcap()
helper identifies as of type IPv[4,6]. Similarly for tracepoints,
$ bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -
In this case we explicitly specify an argument (:arg1), but the
default assumption is the first argument is to be captured.
To achieve the built-in tracing capabilities, two BPF objects need
to be delivered with bpftool - bpftool_pcap_kprobe.o and
bpftool_pcap_tracepoint.o. These are accordingly added to the
install target for bpftool. Each contains a separate program for
extracting arg1, arg2, arg3 and arg4. This may seem wasteful -
why not just have the arg number as a map parameter? In practice
tracepoints fail to attach with that approach.
A question arises here. First, if we deliver a kprobe program, won't
it only work for the specific kernel? Just by dumb luck on my part
the program appears to dodge the kernel version check in libbpf by not
passing an explicit program type at load time. That said, the
program does not reference any data structures outside of the context
provided (struct pt_regs *), so maybe there's something else going
on too?
Note that a user-defined tracing program can also be passed in,
and that program will be attached to the target probe in a similar
manner. We first look for programs with "arg[1-4]" in the name
if an argnum is specified, otherwise we fall back to using the
first program.
$ bpftool pcap trace mytraceprog.o tracepoint:net_dev_xmit:arg1 \
data_out /tmp/cap
bpftool looks for a BPF_MAP_TYPE_ARRAY containing one value of
size >= "struct bpf_pcap_hdr", and assumes that configuration
provided to the program should be set in that map. This allows
the user to provide a maximum packet length, starting protocol
etc to tracing programs.
The idea behind providing packet capture/tracing functionality in
bpftool is to simplify developer access to dynamic packet capture.
An alternative approach would be to provide libbpf interfaces, but
this would require linking libbpf with libpcap.
A possible approach would be to take the code from bpftool that
interacts with programs (to retrieve pcap-related maps and
set config) and move it to libbpf, but it may make sense to
start with the functionality in bpftool and see if other
consumers need/want it.
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
tools/bpf/bpftool/Makefile | 39 +-
tools/bpf/bpftool/main.c | 3 +-
tools/bpf/bpftool/main.h | 1 +
tools/bpf/bpftool/pcap.c | 496 ++++++++++++++++++++++
tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c | 80 ++++
tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c | 68 +++
tools/testing/selftests/bpf/bpf_helpers.h | 11 +
7 files changed, 690 insertions(+), 8 deletions(-)
create mode 100644 tools/bpf/bpftool/pcap.c
create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 39bc6f0..16c4104 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
include ../../scripts/Makefile.include
+include ../../scripts/Makefile.arch
include ../../scripts/utilities.mak
ifeq ($(srctree),)
@@ -61,8 +62,8 @@ INSTALL ?= install
RM ?= rm -f
FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
-FEATURE_DISPLAY = libbfd disassembler-four-args zlib
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libpcap
+FEATURE_DISPLAY = libbfd disassembler-four-args zlib libpcap
check_feat := 1
NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
@@ -90,7 +91,14 @@ endif
include $(wildcard $(OUTPUT)*.d)
-all: $(OUTPUT)bpftool
+ifeq ($(feature-libpcap),1)
+ LIBS += -lpcap
+ CFLAGS += -DHAVE_LIBPCAP_SUPPORT
+ BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
+ BPF_SRCS = $(wildcard progs/*.c)
+endif
+
+all: $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES)
BFD_SRCS = jit_disasm.c
@@ -109,6 +117,18 @@ CFLAGS += -DHAVE_LIBBFD_SUPPORT
SRCS += $(BFD_SRCS)
endif
+CLANG ?= clang
+LLC ?= llc
+
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+CLANG_FLAGS = -I. -I$(srctree)/tools/include/uapi \
+ -I$(srctree)/tools/testing/selftests/bpf \
+ $(CLANG_SYS_INCLUDES) \
+ -Wno-compare-distinct-pointer-types \
+ -D__TARGET_ARCH_$(SRCARCH)
+
OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
@@ -122,24 +142,29 @@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
$(OUTPUT)%.o: %.c
$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
+$(OUTPUT)$(BPF_OBJ_FILES): $(BPF_SRCS)
+ ($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm \
+ -c $(patsubst %.o,progs/%.c,$@) -o - || echo "clang failed") | \
+ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+
clean: $(LIBBPF)-clean
$(call QUIET_CLEAN, bpftool)
- $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+ $(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d $(OUTPUT)$(BPF_OBJ_FILES)
$(Q)$(RM) -r -- $(OUTPUT)libbpf/
$(call QUIET_CLEAN, core-gen)
$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
$(Q)$(RM) -r -- $(OUTPUT)feature/
-install: $(OUTPUT)bpftool
+install: $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES)
$(call QUIET_INSTALL, bpftool)
$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/sbin
- $(Q)$(INSTALL) $(OUTPUT)bpftool $(DESTDIR)$(prefix)/sbin/bpftool
+ $(Q)$(INSTALL) $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES) $(DESTDIR)$(prefix)/sbin/
$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir)
$(Q)$(INSTALL) -m 0644 bash-completion/bpftool $(DESTDIR)$(bash_compdir)
uninstall:
$(call QUIET_UNINST, bpftool)
- $(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool
+ $(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool*
$(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool
doc:
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 93d0086..e7c7969 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -58,7 +58,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n"
" %s version\n"
"\n"
- " OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n"
+ " OBJECT := { prog | map | cgroup | perf | net | feature | btf | pcap }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@@ -227,6 +227,7 @@ static int make_args(char *line, char *n_argv[], int maxargs, int cmd_nb)
{ "net", do_net },
{ "feature", do_feature },
{ "btf", do_btf },
+ { "pcap", do_pcap },
{ "version", do_version },
{ 0 }
};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index af9ad56..079409c 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -155,6 +155,7 @@ int cmd_select(const struct cmd *cmds, int argc, char **argv,
int do_tracelog(int argc, char **arg);
int do_feature(int argc, char **argv);
int do_btf(int argc, char **argv);
+int do_pcap(int argc, char **argv);
int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
int prog_parse_fd(int *argc, char ***argv);
diff --git a/tools/bpf/bpftool/pcap.c b/tools/bpf/bpftool/pcap.c
new file mode 100644
index 0000000..ab18d1f
--- /dev/null
+++ b/tools/bpf/bpftool/pcap.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <assert.h>
+#include <bpf.h>
+#include <errno.h>
+#include <libbpf.h>
+#include <libgen.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/perf_event.h>
+#include <linux/sysinfo.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/sysinfo.h>
+#include <time.h>
+
+#include "json_writer.h"
+#include "main.h"
+
+#ifdef HAVE_LIBPCAP_SUPPORT
+
+/* To avoid conflicting definitions of bpf_insn */
+#define PCAP_DONT_INCLUDE_PCAP_BPF_H
+#include <pcap.h>
+
+#include <perf-sys.h>
+
+#define PCAP_MAX_MAPS 8
+#define PCAP_PROTOCOL_DEFAULT BPF_PCAP_TYPE_ETH
+#define PCAP_NUM_PAGES_DEFAULT 16
+#define PCAP_NUM_PAGES_MIN 8
+#define PCAP_MAX_LEN 65536
+#define PCAP_FILE_STDOUT "-"
+#define PCAP_FILE_DEFAULT PCAP_FILE_STDOUT
+#define NANOSEC 1000000000
+
+
+pcap_dumper_t *pcap_dumper;
+static bool flush = true;
+volatile bool stop;
+
+static __u64 boottime; /* seconds since epoch at boot time. */
+
+static unsigned int proto_from_str(const char *proto_str)
+{
+ int proto;
+
+ /* Names for DLT_ ethernet (en10mb) and IP (raw) aren't obvious. */
+ if (strcmp(proto_str, "eth") == 0)
+ proto = BPF_PCAP_TYPE_ETH;
+ else if (strcmp(proto_str, "ip") == 0)
+ proto = BPF_PCAP_TYPE_IP;
+ else {
+ proto = pcap_datalink_name_to_val(proto_str);
+ if (proto == PCAP_ERROR) {
+ proto = strtol(proto_str, NULL, 10);
+ if (errno == ERANGE)
+ proto = -1;
+ }
+ }
+ return proto;
+}
+
+static int verify_map(int map_fd, enum bpf_map_type map_type,
+ __u32 num_entries, __u32 min_value_size)
+{
+ __u32 info_len = sizeof(struct bpf_map_info);
+ struct bpf_map_info info;
+
+ if (!bpf_obj_get_info_by_fd(map_fd, &info, &info_len) &&
+ info.type == map_type &&
+ (!num_entries || info.max_entries == num_entries) &&
+ (!min_value_size || info.value_size >= min_value_size))
+ return 0;
+ return -1;
+}
+
+static void int_exit(int signo)
+{
+ stop = true;
+}
+
+static void handle_pcap_event(void *ctx, int cpu, void *data, __u32 size)
+{
+ struct bpf_pcap_hdr *conf = ctx;
+ struct bpf_pcap_hdr *hdr = data;
+ struct pcap_pkthdr caphdr;
+
+ if (hdr->magic != BPF_PCAP_MAGIC)
+ return;
+
+ /* If we are looking for a specific protocol, and this isn't a
+ * match, ignore.
+ */
+ if (conf->protocol != BPF_PCAP_TYPE_UNSET &&
+ conf->protocol != hdr->protocol)
+ return;
+
+ caphdr.len = hdr->tot_len;
+ caphdr.caplen = hdr->cap_len;
+ caphdr.ts.tv_sec = boottime + (hdr->ktime_ns/NANOSEC);
+ caphdr.ts.tv_usec = (hdr->ktime_ns % NANOSEC) / 1000;
+
+ pcap_dump((u_char *)pcap_dumper, &caphdr, hdr->data);
+ if (flush)
+ pcap_dump_flush(pcap_dumper);
+}
+
+static int handle_pcap(int data_map_fd, int conf_map_fd,
+ struct bpf_pcap_hdr *conf, const char *pcap_filename,
+ int pages, struct bpf_link *trace_link)
+{
+ struct perf_buffer_opts pb_opts = {};
+ struct perf_buffer *pb;
+ struct sysinfo info;
+ pcap_t *pcap;
+ int err;
+
+ if (signal(SIGHUP, int_exit) ||
+ signal(SIGTERM, int_exit)) {
+ perror("signal");
+ return 1;
+ }
+ (void) signal(SIGINT, int_exit);
+
+ /* pcap expects time since epoch and bpf_pcap() records nanoseconds
+ * since boot; get time of boot to add to pcap time to give a (rough)
+ * time since epoch for capture event.
+ */
+ if (sysinfo(&info) == 0)
+ boottime = time(NULL) - info.uptime;
+
+ pcap = pcap_open_dead(conf->protocol, conf->cap_len ?
+ conf->cap_len : PCAP_MAX_LEN);
+ if (!pcap) {
+ perror("pcap_open");
+ return -1;
+ }
+ pcap_dumper = pcap_dump_open(pcap, pcap_filename);
+ if (!pcap_dumper) {
+ perror("pcap_dumper");
+ return -1;
+ }
+
+ pb_opts.sample_cb = handle_pcap_event;
+ pb_opts.ctx = conf;
+ pb = perf_buffer__new(data_map_fd, pages, &pb_opts);
+ if (libbpf_get_error(pb)) {
+ perror("perf_buffer setup failed");
+ return -1;
+ }
+
+ while (!stop) {
+ err = perf_buffer__poll(pb, 1000);
+ if (err < 0 && err != -EINTR) {
+ p_err("perf buffer polling failed: %s (%d)",
+ strerror(err), err);
+ break;
+ }
+ }
+
+ /* detach program if we attached one. */
+ if (trace_link)
+ bpf_link__destroy(trace_link);
+ perf_buffer__free(pb);
+ close(data_map_fd);
+ if (conf_map_fd >= 0)
+ close(conf_map_fd);
+ if (pcap_dumper) {
+ pcap_dump_flush(pcap_dumper);
+ pcap_dump_close(pcap_dumper);
+ }
+ if (pcap)
+ pcap_close(pcap);
+
+ return 0;
+}
+
+static int handle_opts(int argc, char **argv,
+ struct bpf_pcap_hdr *conf,
+ const char **pcap_filename, int *pages)
+{
+ int conf_used = 0;
+
+ while (argc) {
+ if (!REQ_ARGS(2))
+ return -1;
+
+ if (is_prefix(*argv, "data_out")) {
+ NEXT_ARG();
+ *pcap_filename = *argv;
+ /* no need to flush to capture file if not stdout */
+ if (strcmp(*pcap_filename, PCAP_FILE_STDOUT) != 0)
+ flush = false;
+ NEXT_ARG();
+ } else if (is_prefix(*argv, "proto")) {
+ NEXT_ARG();
+ conf->protocol = proto_from_str(*argv);
+ if (conf->protocol == -1) {
+ p_err("unrecognized protocol %s", *argv);
+ return -1;
+ }
+ conf_used = 1;
+ NEXT_ARG();
+ } else if (is_prefix(*argv, "len")) {
+ NEXT_ARG();
+ conf->cap_len = atoi(*argv);
+ conf_used = 1;
+ NEXT_ARG();
+ } else if (is_prefix(*argv, "dev")) {
+ unsigned long iifindex;
+
+ NEXT_ARG();
+ iifindex = if_nametoindex(*argv);
+ if (!iifindex) {
+ p_err("no such device %s", *argv);
+ return -1;
+ }
+ conf->flags |= (BPF_F_PCAP_ID_IIFINDEX |
+ (iifindex & BPF_F_PCAP_ID_MASK));
+ conf_used = 1;
+ NEXT_ARG();
+ } else if (is_prefix(*argv, "pages")) {
+ NEXT_ARG();
+ *pages = atoi(*argv);
+ if (*pages < PCAP_NUM_PAGES_MIN) {
+ p_err("at least %d pages are required",
+ PCAP_NUM_PAGES_MIN);
+ return -1;
+ }
+ NEXT_ARG();
+ } else {
+ p_err("unknown arg %s", *argv);
+ return -1;
+ }
+ }
+ return conf_used;
+}
+
+static int handle_conf_map(int conf_map_fd, struct bpf_pcap_hdr *conf)
+{
+ int key = 0;
+
+ if (bpf_map_update_elem(conf_map_fd, &key, conf, BPF_ANY)) {
+ p_err("could not populate config in map");
+ return -1;
+ }
+ return 0;
+}
+
+/* For the prog specified, the conf map is optional but the data map must
+ * be present to facilitate capture.
+ */
+static int prog_info(int prog_fd, enum bpf_prog_type *type,
+ int *data_map_fd, int *conf_map_fd)
+{
+ __u32 info_len = sizeof(struct bpf_prog_info);
+ struct bpf_prog_info prog_info;
+ __u32 map_ids[PCAP_MAX_MAPS];
+ int map_fd;
+ __u32 i;
+
+ *data_map_fd = -1;
+ *conf_map_fd = -1;
+
+ /* Find data and (optionally) conf map associated with program. */
+ memset(&prog_info, 0, sizeof(prog_info));
+ prog_info.nr_map_ids = PCAP_MAX_MAPS;
+ prog_info.map_ids = ptr_to_u64(map_ids);
+ if (bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len) < 0) {
+ p_err("could not retrieve info for program");
+ return -1;
+ }
+ *type = prog_info.type;
+
+ for (i = 0; i < prog_info.nr_map_ids; i++) {
+ map_fd = bpf_map_get_fd_by_id(map_ids[i]);
+
+ if (!verify_map(map_fd,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY, 0, 0)) {
+ *data_map_fd = map_fd;
+ continue;
+ }
+ if (!verify_map(map_fd,
+ BPF_MAP_TYPE_ARRAY, 1,
+ sizeof(struct bpf_pcap_hdr)))
+ *conf_map_fd = map_fd;
+ }
+
+ /* For the prog specified, the conf map is optional but the data map
+ * must be present to facilitate capture.
+ */
+ if (*data_map_fd == -1) {
+ p_err("no perf event map associated with program");
+ return -1;
+ }
+ return 0;
+}
+
+static struct bpf_link *trace_attach(int prog_fd, enum bpf_prog_type prog_type,
+ struct bpf_program *prog, const char *trace)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_KPROBE:
+ return bpf_program__attach_kprobe(prog, false, trace);
+
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return bpf_program__attach_raw_tracepoint(prog, trace);
+ default:
+ p_err("unexpected type; kprobes, raw tracepoints supported");
+ return NULL;
+ }
+}
+
+static int do_pcap_common(int argc, char **argv, int prog_fd,
+ struct bpf_program *prog, char *trace)
+{
+ struct bpf_pcap_hdr conf = { .protocol = PCAP_PROTOCOL_DEFAULT,
+ .cap_len = 0,
+ .flags = 0 };
+ const char *pcap_filename = PCAP_FILE_DEFAULT;
+ int data_map_fd = -1, conf_map_fd = -1;
+ int pages = PCAP_NUM_PAGES_DEFAULT;
+ struct bpf_link *trace_link = NULL;
+ enum bpf_prog_type prog_type;
+ int conf_used;
+
+ if (prog_info(prog_fd, &prog_type, &data_map_fd, &conf_map_fd) < 0)
+ return -1;
+
+ conf_used = handle_opts(argc, argv, &conf, &pcap_filename, &pages);
+ switch (conf_used) {
+ case 0:
+ break;
+ case 1:
+ if (conf_map_fd < 0) {
+ p_err("no single-element BPF array map to store configuration found");
+ return -1;
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ set_max_rlimit();
+
+ if (conf_map_fd >= 0 && handle_conf_map(conf_map_fd, &conf) < 0)
+ return -1;
+
+ if (trace && !prog) {
+ p_err("to specify trace option, '%s pcap load' must be used",
+ bin_name);
+ return -1;
+ }
+ if (trace) {
+ trace_link = trace_attach(prog_fd, prog_type, prog, trace);
+ if (IS_ERR(trace_link))
+ return -1;
+ }
+
+ return handle_pcap(data_map_fd, conf_map_fd, &conf, pcap_filename,
+ pages, trace_link);
+}
+
+static int do_pcap_trace(int argc, char **argv)
+{
+ char trace_prog[PATH_MAX], trace[PATH_MAX], trace_type[PATH_MAX];
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ int trace_argnum = 1;
+ char argstr[8];
+ int prog_fd;
+
+ if (!REQ_ARGS(1))
+ return -1;
+
+ trace_prog[0] = '\0';
+
+ /* Optional trace program; if not specified we load a builtin program
+ * based on probe prefix (kprobe|tracepoint).
+ */
+ if (strcmp(*argv + strlen(*argv) - 2, ".o") == 0) {
+ strncpy(trace_prog, *argv, sizeof(trace_prog));
+ if (!REQ_ARGS(2))
+ return -1;
+ NEXT_ARG();
+ }
+
+ if (sscanf(*argv, "%[^:]:%[^:]:arg%d", trace_type, trace, &trace_argnum)
+ != 3 &&
+ sscanf(*argv, "%[^:]:%[^:]", trace_type, trace) != 2) {
+ p_err("expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'");
+ return -1;
+ }
+ if (strcmp(trace_type, "kprobe") != 0 &&
+ strcmp(trace_type, "tracepoint") != 0) {
+ p_err("invalid trace type %s, expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'",
+ trace_type);
+ return -1;
+ }
+ if (trace_argnum < 1 || trace_argnum > 4) {
+ p_err("'arg%d' invalid, expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'",
+ trace_argnum);
+ return -1;
+ }
+ NEXT_ARG();
+
+ if (strlen(trace_prog) == 0) {
+ char bin_path[PATH_MAX];
+
+ /* derive path of currently-executing command; BPF programs will
+ * be in the same directory, with suffix based on trace type.
+ */
+ if (readlink("/proc/self/exe", bin_path, sizeof(bin_path)) <= 0)
+ return -1;
+ snprintf(trace_prog, sizeof(trace_prog), "%s_pcap_%s.o",
+ bin_path, trace_type);
+ }
+
+ if (bpf_prog_load(trace_prog, BPF_PROG_TYPE_UNSPEC, &obj, &prog_fd) < 0)
+ return -1;
+
+ snprintf(argstr, sizeof(argstr), "arg%d", trace_argnum);
+
+ bpf_object__for_each_program(prog, obj) {
+ if (strstr(bpf_program__title(prog, false), argstr))
+ break;
+ }
+ /* No argnum-specific program, fall back to first program. */
+ if (!prog)
+ prog = bpf_program__next(NULL, obj);
+ if (!prog) {
+ p_err("could not get program");
+ return -1;
+ }
+
+ return do_pcap_common(argc, argv, prog_fd, prog, trace);
+}
+
+static int do_pcap_prog(int argc, char **argv)
+{
+ int prog_fd;
+
+ prog_fd = prog_parse_fd(&argc, &argv);
+ if (prog_fd == -1)
+ return -1;
+
+ return do_pcap_common(argc, argv, prog_fd, NULL, NULL);
+}
+
+static int do_help(int argc, char **argv)
+{
+ if (json_output) {
+ jsonw_null(json_wtr);
+ return 0;
+ }
+ fprintf(stderr,
+ " %s %s prog {id ID | pinned PATH }\n"
+ " [data_out FILE] [proto PROTOCOL] [len MAXLEN]\n"
+ " [pages NUMPAGES]\n"
+ " %s %s trace [OBJ] {kprobe|tracepoint}:probename[:arg[1-4]]]\n"
+ " [data_out FILE] [proto PROTOCOL] [len MAXLEN]\n"
+ " [dev DEVICE] [pages NUMPAGES]\n"
+ " %s %s help\n",
+ bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+
+ return 0;
+}
+
+static const struct cmd cmds[] = {
+ { "prog", do_pcap_prog },
+ { "trace", do_pcap_trace },
+ { "help", do_help },
+ { 0 }
+};
+
+#endif /* HAVE_LIBPCAP_SUPPORT */
+
+int do_pcap(int argc, char **argv)
+{
+#ifdef HAVE_LIBPCAP_SUPPORT
+ return cmd_select(cmds, argc, argv, do_help);
+#else
+ p_err("pcap support was not compiled into bpftool as libpcap\n"
+ "and associated headers were not available at build time.\n");
+ return -1;
+#endif /* HAVE_LIBPCAP_SUPPORT */
+}
diff --git a/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c b/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
new file mode 100644
index 0000000..00a945d
--- /dev/null
+++ b/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_pcap_hdr),
+ .max_entries = 1,
+};
+
+static __always_inline int kprobe_pcap(struct pt_regs *ctx, int argnum)
+{
+ struct bpf_pcap_hdr *conf;
+ int key = 0;
+
+ conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+ if (!conf)
+ return 0;
+
+ switch (argnum) {
+ case 1:
+ bpf_pcap((void *)PT_REGS_PARM1(ctx), conf->cap_len,
+ &pcap_data_map, conf->protocol, conf->flags);
+ return 0;
+ case 2:
+ bpf_pcap((void *)PT_REGS_PARM2(ctx), conf->cap_len,
+ &pcap_data_map, conf->protocol, conf->flags);
+ return 0;
+ case 3:
+ bpf_pcap((void *)PT_REGS_PARM3(ctx), conf->cap_len,
+ &pcap_data_map, conf->protocol, conf->flags);
+ return 0;
+ case 4:
+ bpf_pcap((void *)PT_REGS_PARM4(ctx), conf->cap_len,
+ &pcap_data_map, conf->protocol, conf->flags);
+ return 0;
+ }
+ return 0;
+}
+
+SEC("kprobe/pcap_arg1")
+int pcap_arg1(struct pt_regs *ctx)
+{
+ return kprobe_pcap(ctx, 1);
+}
+
+SEC("kprobe/pcap_arg2")
+int pcap_arg2(struct pt_regs *ctx)
+{
+ return kprobe_pcap(ctx, 2);
+}
+
+SEC("kprobe/pcap_arg3")
+int pcap_arg3(struct pt_regs *ctx)
+{
+ return kprobe_pcap(ctx, 3);
+}
+
+SEC("kprobe/pcap_arg4")
+int pcap_arg4(struct pt_regs *ctx)
+{
+ return kprobe_pcap(ctx, 4);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c b/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
new file mode 100644
index 0000000..639806a
--- /dev/null
+++ b/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_pcap_hdr),
+ .max_entries = 1,
+};
+
+/* To attach to raw tracepoints, we need one program for each arg choice 1-4.
+ * Otherwise attach fails.
+ */
+static __always_inline int trace_pcap(struct bpf_raw_tracepoint_args *ctx,
+ int argnum)
+{
+ struct bpf_pcap_hdr *conf;
+ int ret, key = 0;
+
+ conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+ if (!conf)
+ return 0;
+
+ bpf_pcap((void *)ctx->args[argnum], conf->cap_len,
+ &pcap_data_map, conf->protocol, conf->flags);
+ return 0;
+}
+
+SEC("raw_tracepoint/pcap_arg1")
+int trace_arg1(struct bpf_raw_tracepoint_args *ctx)
+{
+ return trace_pcap(ctx, 0);
+}
+
+SEC("raw_tracepoint/pcap_arg2")
+int trace_arg2(struct bpf_raw_tracepoint_args *ctx)
+{
+ return trace_pcap(ctx, 1);
+}
+
+SEC("raw_tracepoint/pcap_arg3")
+int trace_arg3(struct bpf_raw_tracepoint_args *ctx)
+{
+ return trace_pcap(ctx, 2);
+}
+
+SEC("raw_tracepoint/pcap_arg4")
+int trace_arg4(struct bpf_raw_tracepoint_args *ctx)
+{
+ return trace_pcap(ctx, 3);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 6c4930b..2a61126 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -231,6 +231,9 @@ static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) =
static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
int ip_len, void *tcp, int tcp_len) =
(void *) BPF_FUNC_tcp_gen_syncookie;
+static int (*bpf_pcap)(void *data, __u32 size, void *map, int protocol,
+ __u64 flags) =
+ (void *) BPF_FUNC_pcap;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
@@ -520,8 +523,16 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode,
* actual field offset, based on target kernel BTF type that matches original
* (local) BTF, used to record relocation.
*/
+#ifdef __builtin_preserve_access_index
#define BPF_CORE_READ(dst, src) \
bpf_probe_read((dst), sizeof(*(src)), \
__builtin_preserve_access_index(src))
+#else
+
+#define BPF_CORE_READ(dst, src) \
+ bpf_probe_read((dst), sizeof(*(src)), src)
+
+#endif /* __builtin_preserve_access_index */
+
#endif
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [RFC bpf-next 6/7] bpf: add documentation for bpftool pcap subcommand
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
` (4 preceding siblings ...)
2019-09-07 21:40 ` [RFC bpf-next 5/7] bpf: add pcap support to bpftool Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
2019-09-07 21:40 ` [RFC bpf-next 7/7] bpf: add tests for bpftool packet capture Alan Maguire
6 siblings, 0 replies; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
Document supported "bpf pcap" subcommands.
"prog" is used to capture packets from already-loaded programs.
"trace" loads/attaches tracing programs to capture packets.
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
tools/bpf/bpftool/Documentation/bpftool-btf.rst | 1 +
tools/bpf/bpftool/Documentation/bpftool-cgroup.rst | 1 +
.../bpf/bpftool/Documentation/bpftool-feature.rst | 1 +
tools/bpf/bpftool/Documentation/bpftool-map.rst | 1 +
tools/bpf/bpftool/Documentation/bpftool-net.rst | 1 +
tools/bpf/bpftool/Documentation/bpftool-pcap.rst | 119 +++++++++++++++++++++
tools/bpf/bpftool/Documentation/bpftool-perf.rst | 1 +
tools/bpf/bpftool/Documentation/bpftool-prog.rst | 1 +
tools/bpf/bpftool/Documentation/bpftool.rst | 1 +
9 files changed, 127 insertions(+)
create mode 100644 tools/bpf/bpftool/Documentation/bpftool-pcap.rst
diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 39615f8..54045f0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -235,4 +235,5 @@ SEE ALSO
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 06a28b0..1df98e1 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -164,5 +164,6 @@ SEE ALSO
**bpftool-map**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index 4d08f35..0f36ad8 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -86,5 +86,6 @@ SEE ALSO
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 1c0f714..8408022 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -271,5 +271,6 @@ SEE ALSO
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 8651b00..6bd24bb 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -198,5 +198,6 @@ SEE ALSO
**bpftool-map**\ (8),
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-pcap.rst b/tools/bpf/bpftool/Documentation/bpftool-pcap.rst
new file mode 100644
index 0000000..53ed226d
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-pcap.rst
@@ -0,0 +1,119 @@
+================
+bpftool-pcap
+================
+-------------------------------------------------------------------------------
+tool for packet capture using eBPF programs
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+ **bpftool** [*OPTIONS*] **pcap** *COMMAND*
+
+ *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
+
+ *COMMANDS* :=
+ { **prog** | **trace** | **help** }
+
+PCAP COMMANDS
+=============
+
+| **bpftool** **pcap** **prog** *PROG* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **pages** *NUMPAGES*}]
+| **bpftool** **pcap** **trace** [*OBJ*] *TRACE* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **dev** *DEV* | **pages** *NUMPAGES*}]
+| **bpftool** **pcap help**
+|
+| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+| *PROTOCOL* := {
+| **eth** | **ip** | **ieee_80211** | ... }
+| *TRACE* := {
+| **kprobe**|**tracepoint**:*probename*[:arg{1-4}] }
+
+
+DESCRIPTION
+===========
+ **bpftool** **pcap** **prog** *PROG* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **pages** *NUMPAGES*}]
+
+ Capture packet data from perf event map associated with
+ program specified. By default capture data is displayed on
+ stdout, but if a capture file is preferred the data_out FILE
+ option can be used. The link type (termed DLT_TYPE in
+ libpcap) is assumed to be Ethernet if not explicitly
+ specified via the **proto** option.
+
+ Maximum capture length can be adjusted via the **len**
+ option.
+
+ To work with bpftool pcap, the associated BPF program must
+ at least define a perf event map, but if config options
+ (protocol, max len) are to be supported it should also
+ provide an array map with a single value of at least
+ *struct bpf_pcap_conf* size.
+
+ **bpftool** **pcap** **trace** [*OBJ*] *TRACE* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **dev** *DEV* | **pages** *NUMPAGES*}]
+
+ Attach the specified program in *OBJ* or load a
+ pre-existing BPF kprobe/tracepoint program capable
+ of capturing packets.
+
+ Trace specification is of the form
+
+ trace_type:probe[:arg]
+
+ For example tracepoint:iwlwifi_dev_tx_tb:arg2 will
+ capture packet data from the second argument to the
+ iwlwifi_dev_tx_tb tracepoint. *DEV* can be used to
+ limit capture to a specific incoming interface.
+
+ **bpftool pcap help**
+ Print short help message.
+
+OPTIONS
+=======
+ -h, --help
+ Print short generic help message (similar to **bpftool help**).
+
+ -V, --version
+ Print version number (similar to **bpftool version**).
+
+ -j, --json
+ Generate JSON output. For commands that cannot produce JSON, this
+ option has no effect.
+
+ -p, --pretty
+ Generate human-readable JSON output. Implies **-j**.
+
+ -f, --bpffs
+ When showing BPF programs, show file names of pinned
+ programs.
+
+ -m, --mapcompat
+ Allow loading maps with unknown map definitions.
+
+ -n, --nomount
+ Do not automatically attempt to mount any virtual file system
+ (such as tracefs or BPF virtual file system) when necessary.
+
+ -d, --debug
+ Print all logs available, even debug-level information. This
+ includes logs from libbpf as well as from the verifier, when
+ attempting to load programs.
+
+EXAMPLES
+========
+**# bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -**
+reading from file -, link-type EN10MB (Ethernet)
+00:16:49.150880 IP 10.11.12.13 > 10.11.12.14: ICMP echo reply, id 10519, seq 1, length 64
+
+SEE ALSO
+========
+ **bpf**\ (2),
+ **bpf-helpers**\ (7),
+ **bpftool**\ (8),
+ **bpftool-map**\ (8),
+ **bpftool-cgroup**\ (8),
+ **bpftool-feature**\ (8),
+ **bpftool-net**\ (8),
+ **bpftool-perf**\ (8),
+ **bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index e252bd0..d618bbd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -90,4 +90,5 @@ SEE ALSO
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 7a374b3..b4dd779 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -311,5 +311,6 @@ SEE ALSO
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 6a9c52e..4126246 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -80,5 +80,6 @@ SEE ALSO
**bpftool-cgroup**\ (8),
**bpftool-feature**\ (8),
**bpftool-net**\ (8),
+ **bpftool-pcap**\ (8),
**bpftool-perf**\ (8),
**bpftool-btf**\ (8)
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [RFC bpf-next 7/7] bpf: add tests for bpftool packet capture
2019-09-07 21:40 [RFC bpf-next 0/7] bpf: packet capture helpers, bpftool support Alan Maguire
` (5 preceding siblings ...)
2019-09-07 21:40 ` [RFC bpf-next 6/7] bpf: add documentation for bpftool pcap subcommand Alan Maguire
@ 2019-09-07 21:40 ` Alan Maguire
6 siblings, 0 replies; 12+ messages in thread
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
danieltimlee, ctakshak, netdev, bpf, linux-kselftest
Cc: Alan Maguire
add tests which verify packet capture works for tracing of
kprobes and raw tracepoints, and for capturing packets from
existing skb/xdp programs.
Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
tools/testing/selftests/bpf/Makefile | 3 +-
.../testing/selftests/bpf/progs/bpftool_pcap_tc.c | 41 +++++++
.../testing/selftests/bpf/progs/bpftool_pcap_xdp.c | 39 ++++++
tools/testing/selftests/bpf/test_bpftool_pcap.sh | 132 +++++++++++++++++++++
4 files changed, 214 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
create mode 100755 tools/testing/selftests/bpf/test_bpftool_pcap.sh
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7f3196a..1e8b68d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -66,7 +66,8 @@ TEST_PROGS := test_kmod.sh \
test_tc_tunnel.sh \
test_tc_edt.sh \
test_xdping.sh \
- test_bpftool_build.sh
+ test_bpftool_build.sh \
+ test_bpftool_pcap.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
diff --git a/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c b/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
new file mode 100644
index 0000000..b51f8fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_pcap_hdr),
+ .max_entries = 1,
+};
+
+SEC("tc_pcap")
+int tc_pcap(struct __sk_buff *skb)
+{
+ struct bpf_pcap_hdr *conf;
+ int key = 0;
+
+ conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+ if (!conf)
+ return 0;
+
+ bpf_pcap(skb, conf->cap_len, &pcap_data_map, conf->protocol,
+ conf->flags);
+
+ return TC_ACT_OK;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c b/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
new file mode 100644
index 0000000..a7d6866
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_pcap_hdr),
+ .max_entries = 1,
+};
+
+SEC("xdp_pcap")
+int xdp_pcap(struct xdp_md *ctx)
+{
+ struct bpf_pcap_hdr *conf;
+ int key = 0;
+
+ conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+ if (!conf)
+ return 0;
+
+ bpf_pcap(ctx, conf->cap_len, &pcap_data_map, conf->protocol,
+ conf->flags);
+
+ return XDP_PASS;
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_pcap.sh b/tools/testing/selftests/bpf/test_bpftool_pcap.sh
new file mode 100755
index 0000000..92b5438
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_pcap.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+
+readonly src="../../../../"
+readonly bpftool="${src}/tools/bpf/bpftool/bpftool"
+readonly capfile="/tmp/cap.$$"
+readonly ns="ns-$$"
+readonly badport="5555"
+readonly addr1="192.168.1.1"
+readonly addr2="192.168.1.2"
+readonly pinpath="/sys/fs/bpf/"
+readonly veth1="${ns}-veth1"
+readonly veth2="${ns}-veth2"
+# 24 bytes for the pcap header
+readonly cap_minsize=24
+readonly caplens="0 8192"
+readonly addrs="127.0.0.1 ::1"
+readonly devs="none lo"
+
+cleanup() {
+ iptables -D INPUT -p tcp --dport $badport -j DROP 2>/dev/null
+ ip6tables -D INPUT -p tcp --dport $badport -j DROP 2>/dev/null
+ ip netns del $ns 2>/dev/null
+ rm -f $capfile
+}
+
+verify_capture() {
+ capsize=$(stat -c '%s' $capfile)
+ if [[ $capsize -le $cap_minsize ]]; then
+ exit 1
+ fi
+ if [[ $no_tcpdump == 0 ]]; then
+ count=$(tcpdump -lnr $capfile $1 2>/dev/null)
+ if [[ -z "$count" ]]; then
+ exit 1
+ fi
+ fi
+}
+
+which tcpdump > /dev/null 2>&1
+no_tcpdump=$?
+
+pcap_supported=$($bpftool pcap help >/dev/null 2>&1)
+if [[ $? -ne 0 ]]; then
+ echo "no pcap support in bpftool, cannot test feature."
+ exit 0
+fi
+
+set -e
+
+trap cleanup EXIT
+
+iptables -A INPUT -p tcp --dport $badport -j DROP
+ip6tables -A INPUT -p tcp --dport $badport -j DROP
+
+# Test "bpftool pcap trace" - kprobe, tracepoint tracing
+for probe in kprobe tracepoint; do
+ for dev in $devs; do
+ devarg=
+ if [[ $dev != "none" ]]; then
+ devarg="dev $dev"
+ fi
+ args="$probe:kfree_skb proto ip data_out $capfile $devarg"
+ echo "Test trace $args"
+ for caplen in $caplens ; do
+ for progname in none $probe ; do
+ progpath=
+ if [[ $progname != "none" ]]; then
+ progpath=${bpftool}_pcap_${probe}.o
+ fi
+ allargs="$progpath $args len $caplen"
+ for addr in $addrs ; do
+ $bpftool pcap trace $allargs &
+ bpftool_pid=$!
+ set +e
+ timeout 2 nc $addr $badport 2>/dev/null
+ kill -TERM $bpftool_pid
+ set -e
+ sleep 1
+ verify_capture "host $addr and port $badport"
+ rm -f $capfile
+ done
+ done
+ done
+ echo "Test trace $args: PASS"
+ done
+done
+
+# Test "bpftool pcap prog" - skb, xdp program tracing
+ip netns add $ns
+ip link add dev $veth2 netns $ns type veth peer name $veth1
+ip link set $veth1 up
+ip addr add ${addr1}/24 dev $veth1
+ip -netns $ns link set $veth2 up
+ip netns exec $ns ip addr add ${addr2}/24 dev $veth2
+
+for prog in tc xdp ; do
+ if [[ $prog == tc ]]; then
+ ip netns exec $ns tc qdisc add dev $veth2 clsact
+ ip netns exec $ns tc filter add dev $veth2 ingress bpf da \
+ obj bpftool_pcap_${prog}.o sec ${prog}_pcap
+ id=$(ip netns exec $ns tc filter show dev $veth2 ingress | \
+ awk '/direct-action/ { for(i=1;i<=NF;i++)if($i=="id")print $(i+1)}')
+ else
+ ip netns exec $ns ip link set dev $veth2 xdp obj bpftool_pcap_${prog}.o \
+ sec ${prog}_pcap
+ id=$(ip netns exec $ns ip link show $veth2 | awk '/prog\/xdp/ { print $3 }')
+ sleep 5
+ fi
+ args="id $id data_out $capfile"
+ echo "Test prog $args"
+ for caplen in $caplens ; do
+ allargs="$args len $caplen"
+ $bpftool pcap prog $allargs &
+ bpftool_pid=$!
+ set +e
+ ping -q -c 5 $addr2 1>/dev/null
+ kill -TERM $bpftool_pid
+ set -e
+ sleep 1
+ verify_capture "host $addr1"
+ rm -f $capfile
+ done
+ if [[ $prog == tc ]]; then
+ ip netns exec $ns tc qdisc del dev $veth2 clsact
+ sleep 1
+ else
+ ip netns exec $ns ip link set dev $veth2 xdp off
+ fi
+ echo "Test prog $args: PASS"
+done
--
1.8.3.1
^ permalink raw reply related [flat|nested] 12+ messages in thread