Re: [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture

From: Yonghong Song <yhs@fb.com>
To: Alan Maguire <alan.maguire@oracle.com>,
	"ast@kernel.org" <ast@kernel.org>,
	"daniel@iogearbox.net" <daniel@iogearbox.net>,
	Martin Lau <kafai@fb.com>, Song Liu <songliubraving@fb.com>,
	"davem@davemloft.net" <davem@davemloft.net>,
	"jakub.kicinski@netronome.com" <jakub.kicinski@netronome.com>,
	"hawk@kernel.org" <hawk@kernel.org>,
	"john.fastabend@gmail.com" <john.fastabend@gmail.com>,
	"rostedt@goodmis.org" <rostedt@goodmis.org>,
	"mingo@redhat.com" <mingo@redhat.com>,
	"quentin.monnet@netronome.com" <quentin.monnet@netronome.com>,
	Andrey Ignatov <rdna@fb.com>, "joe@wand.net.nz" <joe@wand.net.nz>,
	"acme@redhat.com" <acme@redhat.com>,
	"jolsa@kernel.org" <jolsa@kernel.org>,
	"alexey.budankov@linux.intel.com"
	<alexey.budankov@linux.intel.com>,
	"gregkh@linuxfoundation.org" <gregkh@linuxfoundation.org>,
	"namhyung@kernel.org" <namhyung@kernel.org>,
	"sdf@google.com" <sdf@google.com>,
	"f.fainelli@gmail.com" <f.fainelli@gmail.com>,
	"shuah@kernel.org" <shuah@kernel.org>,
	"peter@lekensteyn.nl" <peter@lekensteyn.nl>,
	"ivan@cloudflare.com" <ivan@cloudflare.com>,
	"Andrii Nakryiko" <andriin@fb.com>,
	"bhole_prashant_q7@lab.ntt.co.jp"
	<bhole_prashant_q7@lab.ntt.co.jp>,
	"david.calavera@gmail.com" <david.calavera@gmail.com>,
	"danieltimlee@gmail.com" <danieltimlee@gmail.com>,
	Takshak Chahande <ctakshak@fb.com>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"bpf@vger.kernel.org" <bpf@vger.kernel.org>,
	"linux-kselftest@vger.kernel.org"
	<linux-kselftest@vger.kernel.org>
Subject: Re: [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture
Date: Sun, 8 Sep 2019 22:02:13 +0000	[thread overview]
Message-ID: <d5995641-9ce9-9cad-7a58-999614550963@fb.com> (raw)
In-Reply-To: <1567892444-16344-2-git-send-email-alan.maguire@oracle.com>

On 9/7/19 2:40 PM, Alan Maguire wrote:
> bpf_pcap() simplifies packet capture for skb and XDP
> BPF programs by creating a BPF perf event containing information
> relevant for packet capture (protocol, actual/captured packet
> size, time of capture, etc) along with the packet payload itself.
> All of this is stored in a "struct bpf_pcap_hdr".
> 
> This header information can then be retrieved from the perf
> event map and used by packet capture frameworks such as libpcap
> to carry out packet capture.
> 
> skb and XDP programs currently deal in Ethernet-based traffic
> exclusively, so should specify BPF_PCAP_TYPE_ETH or
> BPF_PCAP_TYPE_UNSET.  The protocol parameter will be used
> in a later commit.
> 
> Note that libpcap assumes times are relative to the epoch while
> we record nanoseconds since boot; as a result any times need
> to be normalized with respect to the boot time for libpcap
> storage; sysinfo(2) can be used to retrieve boot time to normalize
> values appropriately.
> 
> Example usage for a tc-bpf program:
> 
> struct bpf_map_def SEC("maps") pcap_map = {
> 	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
> 	.key_size = sizeof(int),
> 	.value_size = sizeof(int),
> 	.max_entries = 1024,
> };
> 
> SEC("cap")
> int cap(struct __sk_buff *skb)
> {
> 	bpf_pcap(skb, 1514, &pcap_map, BPF_PCAP_TYPE_ETH, 0);
> 
> 	return TC_ACT_OK;
> }
> 
> Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
> ---
>   include/linux/bpf.h      | 20 +++++++++++++
>   include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++-
>   kernel/bpf/verifier.c    |  4 ++-
>   net/core/filter.c        | 67 ++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 164 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 5b9d223..033c9cf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1145,4 +1145,24 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
>   }
>   #endif /* CONFIG_INET */
>   
> +
> +static inline int bpf_pcap_prepare(int protocol, u32 cap_len, u32 tot_len,
> +				   u64 flags, struct bpf_pcap_hdr *pcap)
> +{
> +	if (protocol < 0 || pcap == NULL)
> +		return -EINVAL;
> +
> +	pcap->magic = BPF_PCAP_MAGIC;
> +	pcap->protocol = protocol;
> +	pcap->flags = flags;
> +
> +	if (cap_len == 0 || tot_len < cap_len)
> +		cap_len = tot_len;
> +	pcap->cap_len = cap_len;
> +	pcap->tot_len = tot_len;
> +	pcap->ktime_ns = ktime_get_mono_fast_ns();
> +
> +	return 0;
> +}
> +
>   #endif /* _LINUX_BPF_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 77c6be9..a27e58e 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2750,6 +2750,39 @@ struct bpf_stack_build_id {
>    *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
>    *
>    *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
> + *
> + * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
> + *		u64 flags)
> + *	Description
> + *		Write packet data from *data* into a special BPF perf event
> + *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
> + *		perf event has the same attributes as perf events generated
> + *		by bpf_perf_event_output.  For skb and xdp programs, *data*
> + *		is the relevant context.
> + *
> + *		Metadata for this event is a **struct bpf_pcap_hdr**; this
> + *		contains the capture length, actual packet length and
> + *		the starting protocol.
> + *
> + *		The max number of bytes of context to store is specified via
> + *		*size*.
> + *
> + *		The flags value can be used to specify an id value of up
> + *		to 48 bits; the id can be used to correlate captured packets
> + *		with other trace data, since the passed-in flags value is stored
> + *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
> + *
> + *		The *protocol* value specifies the protocol type of the start
> + *		of the packet so that packet capture can carry out
> + *		interpretation.  See **pcap-linktype** (7) for details on
> + *		the supported values.
> + *
> + *	Return
> + *		0 on success, or a negative error in case of failure.
> + *		-ENOENT will be returned if the associated perf event
> + *		map entry is empty, or the skb is zero-length.
> + *		-EINVAL will be returned if the flags value is invalid.

The feature itself indeed seems useful for networking community.
I just have some questions what kind of kernel support is needed.

We already have perf_event_output for skb and xdp.
Can we just use the original helpers and do a little bit post
processing in user space to get what you want?
It looks possible to me to generate bpf_pcap_hdr in user space
before sending output to pcap analyzer.

> + *
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> @@ -2862,7 +2895,8 @@ struct bpf_stack_build_id {
>   	FN(sk_storage_get),		\
>   	FN(sk_storage_delete),		\
>   	FN(send_signal),		\
> -	FN(tcp_gen_syncookie),
> +	FN(tcp_gen_syncookie),		\
> +	FN(pcap),
>   
>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>    * function eBPF program intends to call
> @@ -2941,6 +2975,9 @@ enum bpf_func_id {
>   /* BPF_FUNC_sk_storage_get flags */
>   #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
>   
> +/* BPF_FUNC_pcap flags */
> +#define	BPF_F_PCAP_ID_MASK		0xffffffffffff
> +
>   /* Mode for BPF_FUNC_skb_adjust_room helper. */
>   enum bpf_adj_room_mode {
>   	BPF_ADJ_ROOM_NET,
> @@ -3613,4 +3650,40 @@ struct bpf_sockopt {
>   	__s32	retval;
>   };
>   
> +/* bpf_pcap_hdr contains information related to a particular packet capture
> + * flow.  It specifies
> + *
> + * - a magic number BPF_PCAP_MAGIC which identifies the perf event as
> + *   a pcap-related event.
> + * - a starting protocol is the protocol associated with the header
> + * - a flags value, copied from the flags value passed into bpf_pcap().
> + *   IDs can be used to correlate packet capture data and other tracing data.
> + *
> + * bpf_pcap_hdr also contains the information relating to the to-be-captured
> + * packet, and closely corresponds to the struct pcap_pkthdr used by
> + * pcap_dump (3PCAP).  The bpf_pcap helper sets ktime_ns (nanoseconds since
> + * boot) to the ktime_ns value; to get sensible pcap times this value should
> + * be converted to a struct timeval time since epoch in the struct pcap_pkthdr.
> + *
> + * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
> + * need both information about the particular packet and the protocol
> + * we are capturing.
> + */
> +
> +#define BPF_PCAP_MAGIC		0xb7fca7
> +
> +struct bpf_pcap_hdr {
> +	__u32			magic;
> +	int			protocol;
> +	__u64			flags;
> +	__u64			ktime_ns;
> +	__u32			tot_len;
> +	__u32			cap_len;
> +	__u8			data[0];
> +};
> +
> +#define BPF_PCAP_TYPE_UNSET	-1
> +#define BPF_PCAP_TYPE_ETH	1
> +#define	BPF_PCAP_TYPE_IP	12
> +
>   #endif /* _UAPI__LINUX_BPF_H__ */
[...]
> diff --git a/net/core/filter.c b/net/core/filter.c
> index ed65636..e0e23ee 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4158,6 +4158,35 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
>   	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
>   };
>   
> +BPF_CALL_5(bpf_xdp_pcap, struct xdp_buff *, xdp, u32, size,
> +	   struct bpf_map *, map, int, protocol, u64, flags)
> +{
> +	unsigned long len = (unsigned long)(xdp->data_end - xdp->data);
> +	struct bpf_pcap_hdr pcap;
> +	int ret;
> +
> +	if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
> +		return -EINVAL;
> +
> +	ret = bpf_pcap_prepare(protocol, size, len, flags, &pcap);
> +	if (ret)
> +		return ret;
> +
> +	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> +				xdp->data, pcap.cap_len, bpf_xdp_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_xdp_pcap_proto = {
> +	.func		= bpf_xdp_pcap,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_CONST_MAP_PTR,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
>   BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
>   {
>   	return skb->sk ? sock_gen_cookie(skb->sk) : 0;
> @@ -5926,6 +5955,34 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
>   
>   #endif /* CONFIG_INET */
>   
> +BPF_CALL_5(bpf_skb_pcap, struct sk_buff *, skb, u32, size,
> +	   struct bpf_map *, map, int, protocol, u64, flags)
> +{
> +	struct bpf_pcap_hdr pcap;
> +	int ret;
> +
> +	if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
> +		return -EINVAL;
> +
> +	ret = bpf_pcap_prepare(protocol, size, skb->len, flags, &pcap);
> +	if (ret)
> +		return ret;
> +
> +	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
> +				skb, pcap.cap_len, bpf_skb_copy);
> +}
> +
> +static const struct bpf_func_proto bpf_skb_pcap_proto = {
> +	.func		= bpf_skb_pcap,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_ANYTHING,
> +	.arg3_type	= ARG_CONST_MAP_PTR,
> +	.arg4_type      = ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
>   bool bpf_helper_changes_pkt_data(void *func)
>   {
>   	if (func == bpf_skb_vlan_push ||
> @@ -6075,6 +6132,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>   		return &bpf_get_socket_uid_proto;
>   	case BPF_FUNC_perf_event_output:
>   		return &bpf_skb_event_output_proto;
> +	case BPF_FUNC_pcap:
> +		return &bpf_skb_pcap_proto;
>   	default:
>   		return bpf_base_func_proto(func_id);
>   	}
> @@ -6216,6 +6275,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>   	case BPF_FUNC_tcp_gen_syncookie:
>   		return &bpf_tcp_gen_syncookie_proto;
>   #endif
> +	case BPF_FUNC_pcap:
> +		return &bpf_skb_pcap_proto;
>   	default:
>   		return bpf_base_func_proto(func_id);
>   	}
> @@ -6256,6 +6317,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>   		return &bpf_tcp_check_syncookie_proto;
>   	case BPF_FUNC_tcp_gen_syncookie:
>   		return &bpf_tcp_gen_syncookie_proto;
> +	case BPF_FUNC_pcap:
> +		return &bpf_xdp_pcap_proto;
>   #endif
>   	default:
>   		return bpf_base_func_proto(func_id);
> @@ -6361,6 +6424,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>   	case BPF_FUNC_skc_lookup_tcp:
>   		return &bpf_skc_lookup_tcp_proto;
>   #endif
> +	case BPF_FUNC_pcap:
> +		return &bpf_skb_pcap_proto;
>   	default:
>   		return bpf_base_func_proto(func_id);
>   	}
> @@ -6399,6 +6464,8 @@ bool bpf_helper_changes_pkt_data(void *func)
>   		return &bpf_get_smp_processor_id_proto;
>   	case BPF_FUNC_skb_under_cgroup:
>   		return &bpf_skb_under_cgroup_proto;
> +	case BPF_FUNC_pcap:
> +		return &bpf_skb_pcap_proto;
>   	default:
>   		return bpf_base_func_proto(func_id);
>   	}
>