All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
@ 2024-02-07 10:12 Jose E. Marchesi
  2024-02-07 21:45 ` Yonghong Song
  0 siblings, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-07 10:12 UTC (permalink / raw)
  To: bpf
  Cc: Jose E . Marchesi, Yonghong Song, Eduard Zingerman,
	Alexei Starovoitov, david.faust, cupertino.miranda

Some BPF tests use loop unrolling compiler pragmas that are clang
specific and not supported by GCC.  These pragmas, along with their
GCC equivalences are:

  #pragma clang loop unroll_count(N)
  #pragma GCC unroll N

  #pragma clang loop unroll(full)
  #pragma GCC unroll 65534

  #pragma clang loop unroll(disable)
  #pragma GCC unroll 1

  #pragma unroll [aka #pragma clang loop unroll(enable)]
  There is no GCC equivalence, and it seems to me that this clang
  pragma may be only useful when building without -funroll-loops to
  enable the optimization in particular loops.  In GCC -funroll-loops
  is enabled with -O2 and higher.  If this is also true in clang,
  perhaps these pragmas in selftests are redundant?

This patch adds a new header progs/bpf_compiler.h that defines the
following macros, which correspond to each pair of compiler-specific
pragmas above:

  __pragma_loop_unroll_count(N)
  __pragma_loop_unroll_full
  __pragma_loop_no_unroll
  __pragma_loop_unroll

The selftests using loop unrolling pragmas are then changed to include
the header and use these macros in place of the explicit pragmas.

Tested in bpf-next master.
No regressions.

Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
Cc: Yonghong Song <yhs@meta.com>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Cc: david.faust@oracle.com
Cc: cupertino.miranda@oracle.com
---
 .../selftests/bpf/progs/bpf_compiler.h        | 33 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/iters.c     |  5 +--
 tools/testing/selftests/bpf/progs/loop4.c     |  4 ++-
 .../selftests/bpf/progs/profiler.inc.h        | 17 +++++-----
 tools/testing/selftests/bpf/progs/pyperf.h    |  7 ++--
 .../testing/selftests/bpf/progs/strobemeta.h  | 18 +++++-----
 .../selftests/bpf/progs/test_cls_redirect.c   |  5 +--
 .../selftests/bpf/progs/test_lwt_seg6local.c  |  6 ++--
 .../selftests/bpf/progs/test_seg6_loop.c      |  4 ++-
 .../selftests/bpf/progs/test_skb_ctx.c        |  4 ++-
 .../selftests/bpf/progs/test_sysctl_loop1.c   |  6 ++--
 .../selftests/bpf/progs/test_sysctl_loop2.c   |  6 ++--
 .../selftests/bpf/progs/test_sysctl_prog.c    |  6 ++--
 .../selftests/bpf/progs/test_tc_tunnel.c      |  4 ++-
 tools/testing/selftests/bpf/progs/test_xdp.c  |  3 +-
 .../selftests/bpf/progs/test_xdp_loop.c       |  3 +-
 .../selftests/bpf/progs/test_xdp_noinline.c   |  5 +--
 .../selftests/bpf/progs/xdp_synproxy_kern.c   |  6 ++--
 .../testing/selftests/bpf/progs/xdping_kern.c |  3 +-
 19 files changed, 103 insertions(+), 42 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h

diff --git a/tools/testing/selftests/bpf/progs/bpf_compiler.h b/tools/testing/selftests/bpf/progs/bpf_compiler.h
new file mode 100644
index 000000000000..a7c343dc82e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_compiler.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_COMPILER_H__
+#define __BPF_COMPILER_H__
+
+#define DO_PRAGMA_(X) _Pragma(#X)
+
+#if __clang__
+#define __pragma_loop_unroll DO_PRAGMA_(clang loop unroll(enable))
+#else
+/* In GCC -funroll-loops, which is enabled with -O2, should have the
+   same impact than the loop-unroll-enable pragma above.  */
+#define __pragma_loop_unroll
+#endif
+
+#if __clang__
+#define __pragma_loop_unroll_count(N) DO_PRAGMA_(clang loop unroll_count(N))
+#else
+#define __pragma_loop_unroll_count(N) DO_PRAGMA_(GCC unroll N)
+#endif
+
+#if __clang__
+#define __pragma_loop_unroll_full DO_PRAGMA_(clang loop unroll(full))
+#else
+#define __pragma_loop_unroll_full DO_PRAGMA_(GCC unroll 65534)
+#endif
+
+#if __clang__
+#define __pragma_loop_no_unroll DO_PRAGMA_(clang loop unroll(disable))
+#else
+#define __pragma_loop_no_unroll DO_PRAGMA_(GCC unroll 1)
+#endif
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
index 225f02dd66d0..3db416606f2f 100644
--- a/tools/testing/selftests/bpf/progs/iters.c
+++ b/tools/testing/selftests/bpf/progs/iters.c
@@ -5,6 +5,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
+#include "bpf_compiler.h"
 
 #define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0]))
 
@@ -183,7 +184,7 @@ int iter_pragma_unroll_loop(const void *ctx)
 	MY_PID_GUARD();
 
 	bpf_iter_num_new(&it, 0, 2);
-#pragma nounroll
+	__pragma_loop_no_unroll
 	for (i = 0; i < 3; i++) {
 		v = bpf_iter_num_next(&it);
 		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
@@ -238,7 +239,7 @@ int iter_multiple_sequential_loops(const void *ctx)
 	bpf_iter_num_destroy(&it);
 
 	bpf_iter_num_new(&it, 0, 2);
-#pragma nounroll
+	__pragma_loop_no_unroll
 	for (i = 0; i < 3; i++) {
 		v = bpf_iter_num_next(&it);
 		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
diff --git a/tools/testing/selftests/bpf/progs/loop4.c b/tools/testing/selftests/bpf/progs/loop4.c
index b35337926d66..0de0357f57cc 100644
--- a/tools/testing/selftests/bpf/progs/loop4.c
+++ b/tools/testing/selftests/bpf/progs/loop4.c
@@ -3,6 +3,8 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 char _license[] SEC("license") = "GPL";
 
 SEC("socket")
@@ -10,7 +12,7 @@ int combinations(volatile struct __sk_buff* skb)
 {
 	int ret = 0, i;
 
-#pragma nounroll
+	__pragma_loop_no_unroll
 	for (i = 0; i < 20; i++)
 		if (skb->len)
 			ret |= 1 << i;
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index de3b6e4e4d0a..6957d9f2805e 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -8,6 +8,7 @@
 #include "profiler.h"
 #include "err.h"
 #include "bpf_experimental.h"
+#include "bpf_compiler.h"
 
 #ifndef NULL
 #define NULL 0
@@ -169,7 +170,7 @@ static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct,
 				     int spid)
 {
 #ifdef UNROLL
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
 		if (arr_struct->array[i].meta.pid == spid)
@@ -185,7 +186,7 @@ static INLINE void populate_ancestors(struct task_struct* task,
 
 	ancestors_data->num_ancestors = 0;
 #ifdef UNROLL
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) {
 		parent = BPF_CORE_READ(parent, real_parent);
@@ -212,7 +213,7 @@ static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node,
 	size_t filepart_length;
 
 #ifdef UNROLL
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
 		filepart_length =
@@ -261,7 +262,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
 		int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local,
 						  pids_cgrp_id___local);
 #ifdef UNROLL
-#pragma unroll
+		__pragma_loop_unroll
 #endif
 		for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys_state* subsys =
@@ -402,7 +403,7 @@ static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
 			if (kill_data == NULL)
 				return 0;
 #ifdef UNROLL
-#pragma unroll
+			__pragma_loop_unroll
 #endif
 			for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
 				if (arr_struct->array[i].meta.pid == 0) {
@@ -482,7 +483,7 @@ read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload)
 	struct dentry* parent_dentry;
 
 #ifdef UNROLL
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (int i = 0; i < MAX_PATH_DEPTH; i++) {
 		filepart_length =
@@ -508,7 +509,7 @@ is_ancestor_in_allowed_inodes(struct dentry* filp_dentry)
 {
 	struct dentry* parent_dentry;
 #ifdef UNROLL
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (int i = 0; i < MAX_PATH_DEPTH; i++) {
 		u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino);
@@ -629,7 +630,7 @@ int raw_tracepoint__sched_process_exit(void* ctx)
 	struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
 
 #ifdef UNROLL
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {
 		struct var_kill_data_t* past_kill_data = &arr_struct->array[i];
diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h
index 026d573ce179..86484f07e1d1 100644
--- a/tools/testing/selftests/bpf/progs/pyperf.h
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -8,6 +8,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include "bpf_misc.h"
+#include "bpf_compiler.h"
 
 #define FUNCTION_NAME_LEN 64
 #define FILE_NAME_LEN 128
@@ -298,11 +299,11 @@ int __on_event(struct bpf_raw_tracepoint_args *ctx)
 #if defined(USE_ITER)
 /* no for loop, no unrolling */
 #elif defined(NO_UNROLL)
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 #elif defined(UNROLL_COUNT)
-#pragma clang loop unroll_count(UNROLL_COUNT)
+	__pragma_loop_unroll_count(UNROLL_COUNT)
 #else
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 #endif /* NO_UNROLL */
 		/* Unwind python stack */
 #ifdef USE_ITER
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
index 40df2cc26eaf..f74459eead26 100644
--- a/tools/testing/selftests/bpf/progs/strobemeta.h
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -10,6 +10,8 @@
 #include <linux/types.h>
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 typedef uint32_t pid_t;
 struct task_struct {};
 
@@ -419,9 +421,9 @@ static __always_inline uint64_t read_map_var(struct strobemeta_cfg *cfg,
 	}
 
 #ifdef NO_UNROLL
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 #else
-#pragma unroll
+	__pragma_loop_unroll
 #endif
 	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
 		if (i >= map.cnt)
@@ -560,25 +562,25 @@ static void *read_strobe_meta(struct task_struct *task,
 		payload_off = sizeof(data->payload);
 #else
 #ifdef NO_UNROLL
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 #else
-#pragma unroll
+	__pragma_loop_unroll
 #endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_INTS; ++i) {
 		read_int_var(cfg, i, tls_base, &value, data);
 	}
 #ifdef NO_UNROLL
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 #else
-#pragma unroll
+	__pragma_loop_unroll
 #endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
 		payload_off = read_str_var(cfg, i, tls_base, &value, data, payload_off);
 	}
 #ifdef NO_UNROLL
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 #else
-#pragma unroll
+	__pragma_loop_unroll
 #endif /* NO_UNROLL */
 	for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
 		payload_off = read_map_var(cfg, i, tls_base, &value, data, payload_off);
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
index 66b304982245..1f7e82649347 100644
--- a/tools/testing/selftests/bpf/progs/test_cls_redirect.c
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
@@ -20,6 +20,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#include "bpf_compiler.h"
 #include "test_cls_redirect.h"
 
 #ifdef SUBPROGS
@@ -267,7 +268,7 @@ static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
 	uint32_t acc = 0;
 	uint16_t *ipw = (uint16_t *)iph;
 
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
 		acc += ipw[i];
 	}
@@ -294,7 +295,7 @@ bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
 	};
 	*is_fragment = false;
 
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (int i = 0; i < 6; i++) {
 		switch (exthdr.next) {
 		case IPPROTO_FRAGMENT:
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
index 48ff2b2ad5e7..fed66f36adb6 100644
--- a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
+++ b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
@@ -6,6 +6,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#include "bpf_compiler.h"
+
 /* Packet parsing state machine helpers. */
 #define cursor_advance(_cursor, _len) \
 	({ void *_tmp = _cursor; _cursor += _len; _tmp; })
@@ -131,7 +133,7 @@ int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh,
 	*pad_off = 0;
 
 	// we can only go as far as ~10 TLVs due to the BPF max stack size
-	#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (int i = 0; i < 10; i++) {
 		struct sr6_tlv_t tlv;
 
@@ -302,7 +304,7 @@ int __encap_srh(struct __sk_buff *skb)
 
 	seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh));
 
-	#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (unsigned long long lo = 0; lo < 4; lo++) {
 		seg->lo = bpf_cpu_to_be64(4 - lo);
 		seg->hi = bpf_cpu_to_be64(hi);
diff --git a/tools/testing/selftests/bpf/progs/test_seg6_loop.c b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
index a7278f064368..5059050f74f6 100644
--- a/tools/testing/selftests/bpf/progs/test_seg6_loop.c
+++ b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
@@ -6,6 +6,8 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#include "bpf_compiler.h"
+
 /* Packet parsing state machine helpers. */
 #define cursor_advance(_cursor, _len) \
 	({ void *_tmp = _cursor; _cursor += _len; _tmp; })
@@ -134,7 +136,7 @@ static __always_inline int is_valid_tlv_boundary(struct __sk_buff *skb,
 	// we can only go as far as ~10 TLVs due to the BPF max stack size
 	// workaround: define induction variable "i" as "long" instead
 	// of "int" to prevent alu32 sub-register spilling.
-	#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 	for (long i = 0; i < 100; i++) {
 		struct sr6_tlv_t tlv;
 
diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c
index c482110cfc95..a724a70c6700 100644
--- a/tools/testing/selftests/bpf/progs/test_skb_ctx.c
+++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c
@@ -3,12 +3,14 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 char _license[] SEC("license") = "GPL";
 
 SEC("tc")
 int process(struct __sk_buff *skb)
 {
-	#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (int i = 0; i < 5; i++) {
 		if (skb->cb[i] != i + 1)
 			return 1;
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c
index 553a282d816a..7f74077d6622 100644
--- a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c
@@ -9,6 +9,8 @@
 
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
@@ -30,7 +32,7 @@ static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx)
 	if (ret < 0 || ret != sizeof(tcp_mem_name) - 1)
 		return 0;
 
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 	for (i = 0; i < sizeof(tcp_mem_name); ++i)
 		if (name[i] != tcp_mem_name[i])
 			return 0;
@@ -59,7 +61,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx)
 	if (ret < 0 || ret >= MAX_VALUE_STR_LEN)
 		return 0;
 
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 	for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) {
 		ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0,
 				  tcp_mem + i);
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c
index 2b64bc563a12..68a75436e8af 100644
--- a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c
@@ -9,6 +9,8 @@
 
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #endif
@@ -30,7 +32,7 @@ static __attribute__((noinline)) int is_tcp_mem(struct bpf_sysctl *ctx)
 	if (ret < 0 || ret != sizeof(tcp_mem_name) - 1)
 		return 0;
 
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 	for (i = 0; i < sizeof(tcp_mem_name); ++i)
 		if (name[i] != tcp_mem_name[i])
 			return 0;
@@ -57,7 +59,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx)
 	if (ret < 0 || ret >= MAX_VALUE_STR_LEN)
 		return 0;
 
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 	for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) {
 		ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0,
 				  tcp_mem + i);
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
index 5489823c83fc..efc3c61f7852 100644
--- a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
@@ -9,6 +9,8 @@
 
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 /* Max supported length of a string with unsigned long in base 10 (pow2 - 1). */
 #define MAX_ULONG_STR_LEN 0xF
 
@@ -31,7 +33,7 @@ static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx)
 	if (ret < 0 || ret != sizeof(tcp_mem_name) - 1)
 		return 0;
 
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (i = 0; i < sizeof(tcp_mem_name); ++i)
 		if (name[i] != tcp_mem_name[i])
 			return 0;
@@ -57,7 +59,7 @@ int sysctl_tcp_mem(struct bpf_sysctl *ctx)
 	if (ret < 0 || ret >= MAX_VALUE_STR_LEN)
 		return 0;
 
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) {
 		ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0,
 				  tcp_mem + i);
diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
index e6e678aa9874..ff5009e3c278 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -20,6 +20,8 @@
 #include <bpf/bpf_endian.h>
 #include <bpf/bpf_helpers.h>
 
+#include "bpf_compiler.h"
+
 static const int cfg_port = 8000;
 
 static const int cfg_udp_src = 20000;
@@ -81,7 +83,7 @@ static __always_inline void set_ipv4_csum(struct iphdr *iph)
 
 	iph->check = 0;
 
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++)
 		csum += *iph16++;
 
diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c
index d7a9a74b7245..8caf58be5818 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp.c
@@ -19,6 +19,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 #include "test_iptunnel_common.h"
+#include "bpf_compiler.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
@@ -137,7 +138,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	iph->ttl = 8;
 
 	next_iph = (__u16 *)iph;
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (i = 0; i < sizeof(*iph) >> 1; i++)
 		csum += *next_iph++;
 
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_loop.c b/tools/testing/selftests/bpf/progs/test_xdp_loop.c
index c98fb44156f0..93267a68825b 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_loop.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_loop.c
@@ -15,6 +15,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 #include "test_iptunnel_common.h"
+#include "bpf_compiler.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
@@ -133,7 +134,7 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	iph->ttl = 8;
 
 	next_iph = (__u16 *)iph;
-#pragma clang loop unroll(disable)
+	__pragma_loop_no_unroll
 	for (i = 0; i < sizeof(*iph) >> 1; i++)
 		csum += *next_iph++;
 
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
index 42c8f6ded0e4..5c7e4758a0ca 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
@@ -15,6 +15,7 @@
 #include <linux/udp.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
+#include "bpf_compiler.h"
 
 static __always_inline __u32 rol32(__u32 word, unsigned int shift)
 {
@@ -362,7 +363,7 @@ bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
 	iph->ttl = 4;
 
 	next_iph_u16 = (__u16 *) iph;
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
 		csum += *next_iph_u16++;
 	iph->check = ~((csum & 0xffff) + (csum >> 16));
@@ -409,7 +410,7 @@ int send_icmp_reply(void *data, void *data_end)
 	iph->saddr = tmp_addr;
 	iph->check = 0;
 	next_iph_u16 = (__u16 *) iph;
-#pragma clang loop unroll(full)
+	__pragma_loop_unroll_full
 	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
 		csum += *next_iph_u16++;
 	iph->check = ~((csum & 0xffff) + (csum >> 16));
diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
index 518329c666e9..7ea9785738b5 100644
--- a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
+++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c
@@ -7,6 +7,8 @@
 #include <bpf/bpf_endian.h>
 #include <asm/errno.h>
 
+#include "bpf_compiler.h"
+
 #define TC_ACT_OK 0
 #define TC_ACT_SHOT 2
 
@@ -151,11 +153,11 @@ static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
 	__u64 sum = csum;
 	int i;
 
-#pragma unroll
+	__pragma_loop_unroll
 	for (i = 0; i < 4; i++)
 		sum += (__u32)saddr->in6_u.u6_addr32[i];
 
-#pragma unroll
+	__pragma_loop_unroll
 	for (i = 0; i < 4; i++)
 		sum += (__u32)daddr->in6_u.u6_addr32[i];
 
diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c
index 54cf1765118b..44e2b0ef23ae 100644
--- a/tools/testing/selftests/bpf/progs/xdping_kern.c
+++ b/tools/testing/selftests/bpf/progs/xdping_kern.c
@@ -15,6 +15,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
 
+#include "bpf_compiler.h"
 #include "xdping.h"
 
 struct {
@@ -116,7 +117,7 @@ int xdping_client(struct xdp_md *ctx)
 		return XDP_PASS;
 
 	if (pinginfo->start) {
-#pragma clang loop unroll(full)
+		__pragma_loop_unroll_full
 		for (i = 0; i < XDPING_MAX_COUNT; i++) {
 			if (pinginfo->times[i] == 0)
 				break;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-07 10:12 [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests Jose E. Marchesi
@ 2024-02-07 21:45 ` Yonghong Song
  2024-02-08 11:32   ` Jose E. Marchesi
  2024-02-08 19:49   ` Yonghong Song
  0 siblings, 2 replies; 18+ messages in thread
From: Yonghong Song @ 2024-02-07 21:45 UTC (permalink / raw)
  To: Jose E. Marchesi, bpf
  Cc: Yonghong Song, Eduard Zingerman, Alexei Starovoitov, david.faust,
	cupertino.miranda


On 2/7/24 2:12 AM, Jose E. Marchesi wrote:
> Some BPF tests use loop unrolling compiler pragmas that are clang
> specific and not supported by GCC.  These pragmas, along with their
> GCC equivalences are:
>
>    #pragma clang loop unroll_count(N)
>    #pragma GCC unroll N
>
>    #pragma clang loop unroll(full)
>    #pragma GCC unroll 65534
>
>    #pragma clang loop unroll(disable)
>    #pragma GCC unroll 1
>
>    #pragma unroll [aka #pragma clang loop unroll(enable)]
>    There is no GCC equivalence, and it seems to me that this clang
>    pragma may be only useful when building without -funroll-loops to
>    enable the optimization in particular loops.  In GCC -funroll-loops
>    is enabled with -O2 and higher.  If this is also true in clang,
>    perhaps these pragmas in selftests are redundant?

You are right, at -O2 level, loop unrolling is enabled by default.
So I think '#pragma unroll' can be removed since gcc also has
loop unrolling enabled by default at -O2.

Your patch has a conflict with latest bpf-next. Please rebase it
on top of bpf-next, remove '#pragma unroll' support and resubmit.
Thanks!

>
> This patch adds a new header progs/bpf_compiler.h that defines the
> following macros, which correspond to each pair of compiler-specific
> pragmas above:
>
>    __pragma_loop_unroll_count(N)
>    __pragma_loop_unroll_full
>    __pragma_loop_no_unroll
>    __pragma_loop_unroll
>
> The selftests using loop unrolling pragmas are then changed to include
> the header and use these macros in place of the explicit pragmas.
>
> Tested in bpf-next master.
> No regressions.
>
> Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
> Cc: Yonghong Song <yhs@meta.com>
> Cc: Eduard Zingerman <eddyz87@gmail.com>
> Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
> Cc: david.faust@oracle.com
> Cc: cupertino.miranda@oracle.com
> ---
>   .../selftests/bpf/progs/bpf_compiler.h        | 33 +++++++++++++++++++
>   tools/testing/selftests/bpf/progs/iters.c     |  5 +--
>   tools/testing/selftests/bpf/progs/loop4.c     |  4 ++-
>   .../selftests/bpf/progs/profiler.inc.h        | 17 +++++-----
>   tools/testing/selftests/bpf/progs/pyperf.h    |  7 ++--
>   .../testing/selftests/bpf/progs/strobemeta.h  | 18 +++++-----
>   .../selftests/bpf/progs/test_cls_redirect.c   |  5 +--
>   .../selftests/bpf/progs/test_lwt_seg6local.c  |  6 ++--
>   .../selftests/bpf/progs/test_seg6_loop.c      |  4 ++-
>   .../selftests/bpf/progs/test_skb_ctx.c        |  4 ++-
>   .../selftests/bpf/progs/test_sysctl_loop1.c   |  6 ++--
>   .../selftests/bpf/progs/test_sysctl_loop2.c   |  6 ++--
>   .../selftests/bpf/progs/test_sysctl_prog.c    |  6 ++--
>   .../selftests/bpf/progs/test_tc_tunnel.c      |  4 ++-
>   tools/testing/selftests/bpf/progs/test_xdp.c  |  3 +-
>   .../selftests/bpf/progs/test_xdp_loop.c       |  3 +-
>   .../selftests/bpf/progs/test_xdp_noinline.c   |  5 +--
>   .../selftests/bpf/progs/xdp_synproxy_kern.c   |  6 ++--
>   .../testing/selftests/bpf/progs/xdping_kern.c |  3 +-
>   19 files changed, 103 insertions(+), 42 deletions(-)
>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h
>
> diff --git a/tools/testing/selftests/bpf/progs/bpf_compiler.h b/tools/testing/selftests/bpf/progs/bpf_compiler.h
> new file mode 100644
> index 000000000000..a7c343dc82e6
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/bpf_compiler.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BPF_COMPILER_H__
> +#define __BPF_COMPILER_H__
> +
> +#define DO_PRAGMA_(X) _Pragma(#X)
> +
> +#if __clang__
> +#define __pragma_loop_unroll DO_PRAGMA_(clang loop unroll(enable))
> +#else
> +/* In GCC -funroll-loops, which is enabled with -O2, should have the
> +   same impact than the loop-unroll-enable pragma above.  */
> +#define __pragma_loop_unroll
> +#endif
> +
> +#if __clang__
> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(clang loop unroll_count(N))
> +#else
> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(GCC unroll N)
> +#endif
> +
> +#if __clang__
> +#define __pragma_loop_unroll_full DO_PRAGMA_(clang loop unroll(full))
> +#else
> +#define __pragma_loop_unroll_full DO_PRAGMA_(GCC unroll 65534)
> +#endif
> +
> +#if __clang__
> +#define __pragma_loop_no_unroll DO_PRAGMA_(clang loop unroll(disable))
> +#else
> +#define __pragma_loop_no_unroll DO_PRAGMA_(GCC unroll 1)
> +#endif
> +
> +#endif
> diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
> index 225f02dd66d0..3db416606f2f 100644
> --- a/tools/testing/selftests/bpf/progs/iters.c
> +++ b/tools/testing/selftests/bpf/progs/iters.c
> @@ -5,6 +5,7 @@
>   #include <linux/bpf.h>
>   #include <bpf/bpf_helpers.h>
>   #include "bpf_misc.h"
> +#include "bpf_compiler.h"
>   
>   #define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0]))
>   
> @@ -183,7 +184,7 @@ int iter_pragma_unroll_loop(const void *ctx)
>   	MY_PID_GUARD();
>   
>   	bpf_iter_num_new(&it, 0, 2);
> -#pragma nounroll
> +	__pragma_loop_no_unroll
>   	for (i = 0; i < 3; i++) {
>   		v = bpf_iter_num_next(&it);
>   		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
> @@ -238,7 +239,7 @@ int iter_multiple_sequential_loops(const void *ctx)
>   	bpf_iter_num_destroy(&it);
>   
>   	bpf_iter_num_new(&it, 0, 2);
> -#pragma nounroll
> +	__pragma_loop_no_unroll
>   	for (i = 0; i < 3; i++) {
>   		v = bpf_iter_num_next(&it);
>   		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);

[...]


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-07 21:45 ` Yonghong Song
@ 2024-02-08 11:32   ` Jose E. Marchesi
  2024-02-08 12:55     ` Jose E. Marchesi
  2024-02-08 19:49   ` Yonghong Song
  1 sibling, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 11:32 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Yonghong Song, Eduard Zingerman, Alexei Starovoitov,
	david.faust, cupertino.miranda


> On 2/7/24 2:12 AM, Jose E. Marchesi wrote:
>> Some BPF tests use loop unrolling compiler pragmas that are clang
>> specific and not supported by GCC.  These pragmas, along with their
>> GCC equivalences are:
>>
>>    #pragma clang loop unroll_count(N)
>>    #pragma GCC unroll N
>>
>>    #pragma clang loop unroll(full)
>>    #pragma GCC unroll 65534
>>
>>    #pragma clang loop unroll(disable)
>>    #pragma GCC unroll 1
>>
>>    #pragma unroll [aka #pragma clang loop unroll(enable)]
>>    There is no GCC equivalence, and it seems to me that this clang
>>    pragma may be only useful when building without -funroll-loops to
>>    enable the optimization in particular loops.  In GCC -funroll-loops
>>    is enabled with -O2 and higher.  If this is also true in clang,
>>    perhaps these pragmas in selftests are redundant?
>
> You are right, at -O2 level, loop unrolling is enabled by default.
> So I think '#pragma unroll' can be removed since gcc also has
> loop unrolling enabled by default at -O2.
>
> Your patch has a conflict with latest bpf-next. Please rebase it
> on top of bpf-next, remove '#pragma unroll' support and resubmit.
> Thanks!

Note profiler.inc.h contains code like:

  #ifdef UNROLL
  	__pragma_loop_unroll
  #endif
 	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {

And then it is inluded by several test programs, which define (or not)
UNROLL:

profiler1.c:

  #define UNROLL
  #include "profiler.inc.h"

profiler2.c:

  /* undef #define UNROLL */
  #include "profiler.inc.h"

In contrast, in pyperf.h or strobemeta.h we find code like:

  #ifdef NO_UNROLL
  	__pragma_loop_no_unroll
  #endif /* NO_UNROLL */
  	for (int i = 0; i < STROBE_MAX_STRS; ++i) {

And then programs including it define NO_UNROLL to disable unrolling.

If -funroll-oops is enabled with -O2 and BPF programs are always built
with -O2, then not defining UNROLL for profiler.inc.h, seems like
basically a no-op to me, because unrolling will still happen. This is
assuming that #pragma unroll in clang doesn't activates more aggressive
inlining.

>>
>> This patch adds a new header progs/bpf_compiler.h that defines the
>> following macros, which correspond to each pair of compiler-specific
>> pragmas above:
>>
>>    __pragma_loop_unroll_count(N)
>>    __pragma_loop_unroll_full
>>    __pragma_loop_no_unroll
>>    __pragma_loop_unroll
>>
>> The selftests using loop unrolling pragmas are then changed to include
>> the header and use these macros in place of the explicit pragmas.
>>
>> Tested in bpf-next master.
>> No regressions.
>>
>> Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
>> Cc: Yonghong Song <yhs@meta.com>
>> Cc: Eduard Zingerman <eddyz87@gmail.com>
>> Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
>> Cc: david.faust@oracle.com
>> Cc: cupertino.miranda@oracle.com
>> ---
>>   .../selftests/bpf/progs/bpf_compiler.h        | 33 +++++++++++++++++++
>>   tools/testing/selftests/bpf/progs/iters.c     |  5 +--
>>   tools/testing/selftests/bpf/progs/loop4.c     |  4 ++-
>>   .../selftests/bpf/progs/profiler.inc.h        | 17 +++++-----
>>   tools/testing/selftests/bpf/progs/pyperf.h    |  7 ++--
>>   .../testing/selftests/bpf/progs/strobemeta.h  | 18 +++++-----
>>   .../selftests/bpf/progs/test_cls_redirect.c   |  5 +--
>>   .../selftests/bpf/progs/test_lwt_seg6local.c  |  6 ++--
>>   .../selftests/bpf/progs/test_seg6_loop.c      |  4 ++-
>>   .../selftests/bpf/progs/test_skb_ctx.c        |  4 ++-
>>   .../selftests/bpf/progs/test_sysctl_loop1.c   |  6 ++--
>>   .../selftests/bpf/progs/test_sysctl_loop2.c   |  6 ++--
>>   .../selftests/bpf/progs/test_sysctl_prog.c    |  6 ++--
>>   .../selftests/bpf/progs/test_tc_tunnel.c      |  4 ++-
>>   tools/testing/selftests/bpf/progs/test_xdp.c  |  3 +-
>>   .../selftests/bpf/progs/test_xdp_loop.c       |  3 +-
>>   .../selftests/bpf/progs/test_xdp_noinline.c   |  5 +--
>>   .../selftests/bpf/progs/xdp_synproxy_kern.c   |  6 ++--
>>   .../testing/selftests/bpf/progs/xdping_kern.c |  3 +-
>>   19 files changed, 103 insertions(+), 42 deletions(-)
>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h
>>
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_compiler.h b/tools/testing/selftests/bpf/progs/bpf_compiler.h
>> new file mode 100644
>> index 000000000000..a7c343dc82e6
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/bpf_compiler.h
>> @@ -0,0 +1,33 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef __BPF_COMPILER_H__
>> +#define __BPF_COMPILER_H__
>> +
>> +#define DO_PRAGMA_(X) _Pragma(#X)
>> +
>> +#if __clang__
>> +#define __pragma_loop_unroll DO_PRAGMA_(clang loop unroll(enable))
>> +#else
>> +/* In GCC -funroll-loops, which is enabled with -O2, should have the
>> +   same impact than the loop-unroll-enable pragma above.  */
>> +#define __pragma_loop_unroll
>> +#endif
>> +
>> +#if __clang__
>> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(clang loop unroll_count(N))
>> +#else
>> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(GCC unroll N)
>> +#endif
>> +
>> +#if __clang__
>> +#define __pragma_loop_unroll_full DO_PRAGMA_(clang loop unroll(full))
>> +#else
>> +#define __pragma_loop_unroll_full DO_PRAGMA_(GCC unroll 65534)
>> +#endif
>> +
>> +#if __clang__
>> +#define __pragma_loop_no_unroll DO_PRAGMA_(clang loop unroll(disable))
>> +#else
>> +#define __pragma_loop_no_unroll DO_PRAGMA_(GCC unroll 1)
>> +#endif
>> +
>> +#endif
>> diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c
>> index 225f02dd66d0..3db416606f2f 100644
>> --- a/tools/testing/selftests/bpf/progs/iters.c
>> +++ b/tools/testing/selftests/bpf/progs/iters.c
>> @@ -5,6 +5,7 @@
>>   #include <linux/bpf.h>
>>   #include <bpf/bpf_helpers.h>
>>   #include "bpf_misc.h"
>> +#include "bpf_compiler.h"
>>     #define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0]))
>>   @@ -183,7 +184,7 @@ int iter_pragma_unroll_loop(const void *ctx)
>>   	MY_PID_GUARD();
>>     	bpf_iter_num_new(&it, 0, 2);
>> -#pragma nounroll
>> +	__pragma_loop_no_unroll
>>   	for (i = 0; i < 3; i++) {
>>   		v = bpf_iter_num_next(&it);
>>   		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
>> @@ -238,7 +239,7 @@ int iter_multiple_sequential_loops(const void *ctx)
>>   	bpf_iter_num_destroy(&it);
>>     	bpf_iter_num_new(&it, 0, 2);
>> -#pragma nounroll
>> +	__pragma_loop_no_unroll
>>   	for (i = 0; i < 3; i++) {
>>   		v = bpf_iter_num_next(&it);
>>   		bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
>
> [...]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 11:32   ` Jose E. Marchesi
@ 2024-02-08 12:55     ` Jose E. Marchesi
  2024-02-08 14:18       ` Eduard Zingerman
  0 siblings, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 12:55 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Yonghong Song, Eduard Zingerman, Alexei Starovoitov,
	david.faust, cupertino.miranda


>> You are right, at -O2 level, loop unrolling is enabled by default.
>> So I think '#pragma unroll' can be removed since gcc also has
>> loop unrolling enabled by default at -O2.
>>
>> Your patch has a conflict with latest bpf-next. Please rebase it
>> on top of bpf-next, remove '#pragma unroll' support and resubmit.
>> Thanks!
>
> Note profiler.inc.h contains code like:
>
>   #ifdef UNROLL
>   	__pragma_loop_unroll
>   #endif
>  	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {
>
> And then it is inluded by several test programs, which define (or not)
> UNROLL:
>
> profiler1.c:
>
>   #define UNROLL
>   #include "profiler.inc.h"
>
> profiler2.c:
>
>   /* undef #define UNROLL */
>   #include "profiler.inc.h"
>
> In contrast, in pyperf.h or strobemeta.h we find code like:
>
>   #ifdef NO_UNROLL
>   	__pragma_loop_no_unroll
>   #endif /* NO_UNROLL */
>   	for (int i = 0; i < STROBE_MAX_STRS; ++i) {
>
> And then programs including it define NO_UNROLL to disable unrolling.
>
> If -funroll-oops is enabled with -O2 and BPF programs are always built
> with -O2, then not defining UNROLL for profiler.inc.h, seems like
> basically a no-op to me, because unrolling will still happen. This is
> assuming that #pragma unroll in clang doesn't activates more aggressive
> inlining.

With the patch below, that basically inverts the logic of these
conditionals in profiler.inc.h, the selftests still pass running
./vmtest.sh -- ./test_progs.

However, it would be good if some clang wizard could confirm what
impact, if any, #pragma unroll (aka #pragma clang loop unroll(enabled))
has over -O2, before ditching these pragmas from the selftests.

diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
index de3b6e4e4d0a..0a30162e53d2 100644
--- a/tools/testing/selftests/bpf/progs/profiler.inc.h
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -168,8 +168,8 @@ probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max)
 static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct,
 				     int spid)
 {
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
 		if (arr_struct->array[i].meta.pid == spid)
@@ -184,8 +184,8 @@ static INLINE void populate_ancestors(struct task_struct* task,
 	u32 num_ancestors, ppid;
 
 	ancestors_data->num_ancestors = 0;
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 	for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) {
 		parent = BPF_CORE_READ(parent, real_parent);
@@ -211,8 +211,8 @@ static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node,
 	void* payload_start = payload;
 	size_t filepart_length;
 
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 	for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
 		filepart_length =
@@ -260,8 +260,8 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
 	if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) {
 		int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local,
 						  pids_cgrp_id___local);
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 		for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys_state* subsys =
@@ -401,8 +401,8 @@ static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
 				get_var_kill_data(ctx, spid, tpid, sig);
 			if (kill_data == NULL)
 				return 0;
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 			for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
 				if (arr_struct->array[i].meta.pid == 0) {
@@ -481,8 +481,8 @@ read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload)
 	size_t filepart_length;
 	struct dentry* parent_dentry;
 
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 	for (int i = 0; i < MAX_PATH_DEPTH; i++) {
 		filepart_length =
@@ -507,8 +507,8 @@ static INLINE bool
 is_ancestor_in_allowed_inodes(struct dentry* filp_dentry)
 {
 	struct dentry* parent_dentry;
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 	for (int i = 0; i < MAX_PATH_DEPTH; i++) {
 		u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino);
@@ -628,8 +628,8 @@ int raw_tracepoint__sched_process_exit(void* ctx)
 	struct task_struct* task = (struct task_struct*)bpf_get_current_task();
 	struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
 
-#ifdef UNROLL
-#pragma unroll
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
 #endif
 	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {
 		struct var_kill_data_t* past_kill_data = &arr_struct->array[i];
diff --git a/tools/testing/selftests/bpf/progs/profiler1.c b/tools/testing/selftests/bpf/progs/profiler1.c
index fb6b13522949..c32783826f36 100644
--- a/tools/testing/selftests/bpf/progs/profiler1.c
+++ b/tools/testing/selftests/bpf/progs/profiler1.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
-#define UNROLL
 #define INLINE __always_inline
 #include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/profiler2.c b/tools/testing/selftests/bpf/progs/profiler2.c
index 0f32a3cbf556..17da6089212b 100644
--- a/tools/testing/selftests/bpf/progs/profiler2.c
+++ b/tools/testing/selftests/bpf/progs/profiler2.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #define barrier_var(var) /**/
-/* undef #define UNROLL */
+#define NO_UNROLL
 #define INLINE /**/
 #include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/profiler3.c b/tools/testing/selftests/bpf/progs/profiler3.c
index 6249fc31ccb0..cc7f9aee6d9e 100644
--- a/tools/testing/selftests/bpf/progs/profiler3.c
+++ b/tools/testing/selftests/bpf/progs/profiler3.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #define barrier_var(var) /**/
-#define UNROLL
 #define INLINE __noinline
 #include "profiler.inc.h"

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 12:55     ` Jose E. Marchesi
@ 2024-02-08 14:18       ` Eduard Zingerman
  2024-02-08 15:05         ` Jose E. Marchesi
  0 siblings, 1 reply; 18+ messages in thread
From: Eduard Zingerman @ 2024-02-08 14:18 UTC (permalink / raw)
  To: Jose E. Marchesi, Yonghong Song
  Cc: bpf, Yonghong Song, Alexei Starovoitov, david.faust, cupertino.miranda

On Thu, 2024-02-08 at 13:55 +0100, Jose E. Marchesi wrote:
[...]

> However, it would be good if some clang wizard could confirm what
> impact, if any, #pragma unroll (aka #pragma clang loop unroll(enabled))
> has over -O2, before ditching these pragmas from the selftests.

I compiled sefltests both with and without this patch,
there are no differences in disassembly of generated BPF object files.
(using current clang main).

[...]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 14:18       ` Eduard Zingerman
@ 2024-02-08 15:05         ` Jose E. Marchesi
  2024-02-08 15:28           ` Eduard Zingerman
  0 siblings, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 15:05 UTC (permalink / raw)
  To: Eduard Zingerman
  Cc: Yonghong Song, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda


> On Thu, 2024-02-08 at 13:55 +0100, Jose E. Marchesi wrote:
> [...]
>
>> However, it would be good if some clang wizard could confirm what
>> impact, if any, #pragma unroll (aka #pragma clang loop unroll(enabled))
>> has over -O2, before ditching these pragmas from the selftests.
>
> I compiled sefltests both with and without this patch,
> there are no differences in disassembly of generated BPF object files.
> (using current clang main).
>
> [...]

Hmm, wouldn't that mean that the loops in profiler.inc.h never get
unrolled regardless of optimization level or pragma? (profiler2.c)


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 15:05         ` Jose E. Marchesi
@ 2024-02-08 15:28           ` Eduard Zingerman
  2024-02-08 15:35             ` Jose E. Marchesi
  0 siblings, 1 reply; 18+ messages in thread
From: Eduard Zingerman @ 2024-02-08 15:28 UTC (permalink / raw)
  To: Jose E. Marchesi
  Cc: Yonghong Song, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda

On Thu, 2024-02-08 at 16:05 +0100, Jose E. Marchesi wrote:
> > On Thu, 2024-02-08 at 13:55 +0100, Jose E. Marchesi wrote:
> > [...]
> > 
> > > However, it would be good if some clang wizard could confirm what
> > > impact, if any, #pragma unroll (aka #pragma clang loop unroll(enabled))
> > > has over -O2, before ditching these pragmas from the selftests.
> > 
> > I compiled sefltests both with and without this patch,
> > there are no differences in disassembly of generated BPF object files.
> > (using current clang main).
> > 
> > [...]
> 
> Hmm, wouldn't that mean that the loops in profiler.inc.h never get
> unrolled regardless of optimization level or pragma? (profiler2.c)
> 

No, the generated code is different between profiler{1,2,3}, e.g.:

$ llvm-objdump -d before/profiler1.bpf.o | wc -l
5356
$ llvm-objdump -d before/profiler2.bpf.o | wc -l
2329
$ llvm-objdump -d before/profiler3.bpf.o | wc -l
1915

What I meant, is that generated code for before/profiler1.bpf.o
and after/profiler1.bpf.o is identical, etc.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 15:28           ` Eduard Zingerman
@ 2024-02-08 15:35             ` Jose E. Marchesi
  2024-02-08 15:53               ` Eduard Zingerman
  0 siblings, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 15:35 UTC (permalink / raw)
  To: Eduard Zingerman
  Cc: Yonghong Song, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda


> On Thu, 2024-02-08 at 16:05 +0100, Jose E. Marchesi wrote:
>> > On Thu, 2024-02-08 at 13:55 +0100, Jose E. Marchesi wrote:
>> > [...]
>> > 
>> > > However, it would be good if some clang wizard could confirm what
>> > > impact, if any, #pragma unroll (aka #pragma clang loop unroll(enabled))
>> > > has over -O2, before ditching these pragmas from the selftests.
>> > 
>> > I compiled sefltests both with and without this patch,
>> > there are no differences in disassembly of generated BPF object files.
>> > (using current clang main).
>> > 
>> > [...]
>> 
>> Hmm, wouldn't that mean that the loops in profiler.inc.h never get
>> unrolled regardless of optimization level or pragma? (profiler2.c)
>> 
>
> No, the generated code is different between profiler{1,2,3}, e.g.:
>
> $ llvm-objdump -d before/profiler1.bpf.o | wc -l
> 5356
> $ llvm-objdump -d before/profiler2.bpf.o | wc -l
> 2329
> $ llvm-objdump -d before/profiler3.bpf.o | wc -l
> 1915
>
> What I meant, is that generated code for before/profiler1.bpf.o
> and after/profiler1.bpf.o is identical, etc.

Right.  But profiler2.c before and after the patch do:

--- Before:

profiler2.c:

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#define barrier_var(var) /**/
/* undef #define UNROLL */
#define INLINE /**/
#include "profiler.inc.h"

profiler.inc.h:

#ifdef UNROLL
#pragma unroll
#endif
       for (WHATEVER) {
         [...]
       }

--- After:

profiler2.c:

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#define barrier_var(var) /**/
#define NO_UNROLL
#define INLINE /**/
#include "profiler.inc.h"

profiler.inc.h:

#ifdef NO_UNROLL
#pragma clang loop unroll(disable)
#endif
	for (WHATEVER) {
          [...]
        }
---

If the compiler generates assembly code the same code for profile2.c for
before and after, that means that the loop does _not_ get unrolled when
profiler.inc.h is built with -O2 but without #pragma unroll.

But what if #pragma unroll is used?  If it unrolls then, that would mean
that the pragma does something more than -funroll-loops/-O2.

Sorry if I am not making sense.  Stuff like this confuses me to no end
;)

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 15:35             ` Jose E. Marchesi
@ 2024-02-08 15:53               ` Eduard Zingerman
  2024-02-08 16:51                 ` Jose E. Marchesi
  0 siblings, 1 reply; 18+ messages in thread
From: Eduard Zingerman @ 2024-02-08 15:53 UTC (permalink / raw)
  To: Jose E. Marchesi
  Cc: Yonghong Song, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda

On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
[...]

> If the compiler generates assembly code the same code for profile2.c for
> before and after, that means that the loop does _not_ get unrolled when
> profiler.inc.h is built with -O2 but without #pragma unroll.
> 
> But what if #pragma unroll is used?  If it unrolls then, that would mean
> that the pragma does something more than -funroll-loops/-O2.
> 
> Sorry if I am not making sense.  Stuff like this confuses me to no end
> ;)

Sorry, I messed up while switching branches :(
Here are the correct stats:

| File            | insn # | insn # |
|                 | before |  after |
|-----------------+--------+--------|
| profiler1.bpf.o |  16716 |   4813 |
| profiler2.bpf.o |   2088 |   2050 |
| profiler3.bpf.o |   4465 |   1690 |

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 15:53               ` Eduard Zingerman
@ 2024-02-08 16:51                 ` Jose E. Marchesi
  2024-02-08 18:04                   ` Yonghong Song
  0 siblings, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 16:51 UTC (permalink / raw)
  To: Eduard Zingerman
  Cc: Yonghong Song, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda


> On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
> [...]
>
>> If the compiler generates assembly code the same code for profile2.c for
>> before and after, that means that the loop does _not_ get unrolled when
>> profiler.inc.h is built with -O2 but without #pragma unroll.
>> 
>> But what if #pragma unroll is used?  If it unrolls then, that would mean
>> that the pragma does something more than -funroll-loops/-O2.
>> 
>> Sorry if I am not making sense.  Stuff like this confuses me to no end
>> ;)
>
> Sorry, I messed up while switching branches :(
> Here are the correct stats:
>
> | File            | insn # | insn # |
> |                 | before |  after |
> |-----------------+--------+--------|
> | profiler1.bpf.o |  16716 |   4813 |

This means:

- With both `#pragma unroll' and -O2 we get 16716 instructions.
- Without `#pragma unroll' and with -O2 we get 4813 instructions.

Weird.

> | profiler2.bpf.o |   2088 |   2050 |

- Without `#pragma unroll' and with -O2 we get 2088 instructions.
- With `#pragma loop unroll(disable)' and with -O2 we get 2050
  instructions.

Also surprising.

> | profiler3.bpf.o |   4465 |   1690 |

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 16:51                 ` Jose E. Marchesi
@ 2024-02-08 18:04                   ` Yonghong Song
  2024-02-08 18:35                     ` Yonghong Song
  0 siblings, 1 reply; 18+ messages in thread
From: Yonghong Song @ 2024-02-08 18:04 UTC (permalink / raw)
  To: Jose E. Marchesi, Eduard Zingerman
  Cc: bpf, Yonghong Song, Alexei Starovoitov, david.faust, cupertino.miranda


On 2/8/24 8:51 AM, Jose E. Marchesi wrote:
>> On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
>> [...]
>>
>>> If the compiler generates assembly code the same code for profile2.c for
>>> before and after, that means that the loop does _not_ get unrolled when
>>> profiler.inc.h is built with -O2 but without #pragma unroll.
>>>
>>> But what if #pragma unroll is used?  If it unrolls then, that would mean
>>> that the pragma does something more than -funroll-loops/-O2.
>>>
>>> Sorry if I am not making sense.  Stuff like this confuses me to no end
>>> ;)
>> Sorry, I messed up while switching branches :(
>> Here are the correct stats:
>>
>> | File            | insn # | insn # |
>> |                 | before |  after |
>> |-----------------+--------+--------|
>> | profiler1.bpf.o |  16716 |   4813 |
> This means:
>
> - With both `#pragma unroll' and -O2 we get 16716 instructions.
> - Without `#pragma unroll' and with -O2 we get 4813 instructions.
>
> Weird.

Thanks for the analysis. I can reproduce with vs. without '#pragma unroll' at -O2
level, the number of generated insns is indeed different, quite dramatically
as the above numbers. I will do some checking in compiler.

>
>> | profiler2.bpf.o |   2088 |   2050 |
> - Without `#pragma unroll' and with -O2 we get 2088 instructions.
> - With `#pragma loop unroll(disable)' and with -O2 we get 2050
>    instructions.
>
> Also surprising.
>
>> | profiler3.bpf.o |   4465 |   1690 |

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 18:04                   ` Yonghong Song
@ 2024-02-08 18:35                     ` Yonghong Song
  2024-02-08 18:59                       ` Jose E. Marchesi
  0 siblings, 1 reply; 18+ messages in thread
From: Yonghong Song @ 2024-02-08 18:35 UTC (permalink / raw)
  To: Jose E. Marchesi, Eduard Zingerman
  Cc: bpf, Yonghong Song, Alexei Starovoitov, david.faust, cupertino.miranda


On 2/8/24 10:04 AM, Yonghong Song wrote:
>
> On 2/8/24 8:51 AM, Jose E. Marchesi wrote:
>>> On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
>>> [...]
>>>
>>>> If the compiler generates assembly code the same code for 
>>>> profile2.c for
>>>> before and after, that means that the loop does _not_ get unrolled 
>>>> when
>>>> profiler.inc.h is built with -O2 but without #pragma unroll.
>>>>
>>>> But what if #pragma unroll is used?  If it unrolls then, that would 
>>>> mean
>>>> that the pragma does something more than -funroll-loops/-O2.
>>>>
>>>> Sorry if I am not making sense.  Stuff like this confuses me to no end
>>>> ;)
>>> Sorry, I messed up while switching branches :(
>>> Here are the correct stats:
>>>
>>> | File            | insn # | insn # |
>>> |                 | before |  after |
>>> |-----------------+--------+--------|
>>> | profiler1.bpf.o |  16716 |   4813 |
>> This means:
>>
>> - With both `#pragma unroll' and -O2 we get 16716 instructions.
>> - Without `#pragma unroll' and with -O2 we get 4813 instructions.
>>
>> Weird.
>
> Thanks for the analysis. I can reproduce with vs. without '#pragma 
> unroll' at -O2
> level, the number of generated insns is indeed different, quite 
> dramatically
> as the above numbers. I will do some checking in compiler.

Okay, a quick checking compiler found that
   - with "#pragma unroll" means no profitability test and do full unroll as instructed
   - without "#pragma unroll" mean compiler will do profitability for full unroll,
     if compiler thinks full unroll is not profitable, there will be no unrolling.

So for gcc, even users saying '#pragma unroll', gcc still do profitability test?

>
>>
>>> | profiler2.bpf.o |   2088 |   2050 |
>> - Without `#pragma unroll' and with -O2 we get 2088 instructions.
>> - With `#pragma loop unroll(disable)' and with -O2 we get 2050
>>    instructions.
>>
>> Also surprising.
>>
>>> | profiler3.bpf.o |   4465 |   1690 |
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 18:35                     ` Yonghong Song
@ 2024-02-08 18:59                       ` Jose E. Marchesi
  2024-02-08 19:03                         ` Jose E. Marchesi
  0 siblings, 1 reply; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 18:59 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Eduard Zingerman, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda


> On 2/8/24 10:04 AM, Yonghong Song wrote:
>>
>> On 2/8/24 8:51 AM, Jose E. Marchesi wrote:
>>>> On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
>>>> [...]
>>>>
>>>>> If the compiler generates assembly code the same code for
>>>>> profile2.c for
>>>>> before and after, that means that the loop does _not_ get
>>>>> unrolled when
>>>>> profiler.inc.h is built with -O2 but without #pragma unroll.
>>>>>
>>>>> But what if #pragma unroll is used?  If it unrolls then, that
>>>>> would mean
>>>>> that the pragma does something more than -funroll-loops/-O2.
>>>>>
>>>>> Sorry if I am not making sense.  Stuff like this confuses me to no end
>>>>> ;)
>>>> Sorry, I messed up while switching branches :(
>>>> Here are the correct stats:
>>>>
>>>> | File            | insn # | insn # |
>>>> |                 | before |  after |
>>>> |-----------------+--------+--------|
>>>> | profiler1.bpf.o |  16716 |   4813 |
>>> This means:
>>>
>>> - With both `#pragma unroll' and -O2 we get 16716 instructions.
>>> - Without `#pragma unroll' and with -O2 we get 4813 instructions.
>>>
>>> Weird.
>>
>> Thanks for the analysis. I can reproduce with vs. without '#pragma
>> unroll' at -O2
>> level, the number of generated insns is indeed different, quite
>> dramatically
>> as the above numbers. I will do some checking in compiler.
>
> Okay, a quick checking compiler found that
>   - with "#pragma unroll" means no profitability test and do full
>    unroll as instructed


I don't think clang's `#pragma unroll' does full unroll.

On one side, AFAIK `pragma unroll' is supposed to be equivalent to
`pragma clang loop(enable)', which is different to `pragma clang loop
unroll(full)'.

On the other, if you replace `pragma unroll' with `pragma clang loop
unroll(full)' in the BPF selftests you will get branch instruction
overflows.

What criteria `pragma unroll' in clang uses in order to determine how
much it unrolls the loop, compared to -O2|-funroll-loops, I don't know.

>   - without "#pragma unroll" mean compiler will do profitability for full unroll,
>     if compiler thinks full unroll is not profitable, there will be no unrolling.
>
> So for gcc, even users saying '#pragma unroll', gcc still do
> profitability test?

GCC doesn't support `#pragma unroll'.

Hence in my original patch the macro __pragma_unroll expands to nothing
with GCC.  That will lead to the compiler perhaps not unrolling the loop
even with -O2|-funroll-loops.

>
>>
>>>
>>>> | profiler2.bpf.o |   2088 |   2050 |
>>> - Without `#pragma unroll' and with -O2 we get 2088 instructions.
>>> - With `#pragma loop unroll(disable)' and with -O2 we get 2050
>>>    instructions.
>>>
>>> Also surprising.
>>>
>>>> | profiler3.bpf.o |   4465 |   1690 |
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 18:59                       ` Jose E. Marchesi
@ 2024-02-08 19:03                         ` Jose E. Marchesi
  2024-02-08 19:34                           ` Eduard Zingerman
  2024-02-08 19:44                           ` Yonghong Song
  0 siblings, 2 replies; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 19:03 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Eduard Zingerman, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda


>> On 2/8/24 10:04 AM, Yonghong Song wrote:
>>>
>>> On 2/8/24 8:51 AM, Jose E. Marchesi wrote:
>>>>> On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
>>>>> [...]
>>>>>
>>>>>> If the compiler generates assembly code the same code for
>>>>>> profile2.c for
>>>>>> before and after, that means that the loop does _not_ get
>>>>>> unrolled when
>>>>>> profiler.inc.h is built with -O2 but without #pragma unroll.
>>>>>>
>>>>>> But what if #pragma unroll is used?  If it unrolls then, that
>>>>>> would mean
>>>>>> that the pragma does something more than -funroll-loops/-O2.
>>>>>>
>>>>>> Sorry if I am not making sense.  Stuff like this confuses me to no end
>>>>>> ;)
>>>>> Sorry, I messed up while switching branches :(
>>>>> Here are the correct stats:
>>>>>
>>>>> | File            | insn # | insn # |
>>>>> |                 | before |  after |
>>>>> |-----------------+--------+--------|
>>>>> | profiler1.bpf.o |  16716 |   4813 |
>>>> This means:
>>>>
>>>> - With both `#pragma unroll' and -O2 we get 16716 instructions.
>>>> - Without `#pragma unroll' and with -O2 we get 4813 instructions.
>>>>
>>>> Weird.
>>>
>>> Thanks for the analysis. I can reproduce with vs. without '#pragma
>>> unroll' at -O2
>>> level, the number of generated insns is indeed different, quite
>>> dramatically
>>> as the above numbers. I will do some checking in compiler.
>>
>> Okay, a quick checking compiler found that
>>   - with "#pragma unroll" means no profitability test and do full
>>    unroll as instructed
>
>
> I don't think clang's `#pragma unroll' does full unroll.
>
> On one side, AFAIK `pragma unroll' is supposed to be equivalent to
> `pragma clang loop(enable)', which is different to `pragma clang loop
> unroll(full)'.
>
> On the other, if you replace `pragma unroll' with `pragma clang loop
> unroll(full)' in the BPF selftests you will get branch instruction
> overflows.
>
> What criteria `pragma unroll' in clang uses in order to determine how
> much it unrolls the loop, compared to -O2|-funroll-loops, I don't know.

This makes me wonder, asking from ignorance: what is the benefit/point
for BPF programs to partially unroll a loop?  I would have said either
we unroll them completely in order to avoid verification problems, or we
don't unroll them because the verifier is supposed to handle it the way
it is written...

>>   - without "#pragma unroll" mean compiler will do profitability for full unroll,
>>     if compiler thinks full unroll is not profitable, there will be no unrolling.
>>
>> So for gcc, even users saying '#pragma unroll', gcc still do
>> profitability test?
>
> GCC doesn't support `#pragma unroll'.
>
> Hence in my original patch the macro __pragma_unroll expands to nothing
> with GCC.  That will lead to the compiler perhaps not unrolling the loop
> even with -O2|-funroll-loops.
>
>>
>>>
>>>>
>>>>> | profiler2.bpf.o |   2088 |   2050 |
>>>> - Without `#pragma unroll' and with -O2 we get 2088 instructions.
>>>> - With `#pragma loop unroll(disable)' and with -O2 we get 2050
>>>>    instructions.
>>>>
>>>> Also surprising.
>>>>
>>>>> | profiler3.bpf.o |   4465 |   1690 |
>>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 19:03                         ` Jose E. Marchesi
@ 2024-02-08 19:34                           ` Eduard Zingerman
  2024-02-08 19:44                           ` Yonghong Song
  1 sibling, 0 replies; 18+ messages in thread
From: Eduard Zingerman @ 2024-02-08 19:34 UTC (permalink / raw)
  To: Jose E. Marchesi, Yonghong Song
  Cc: bpf, Yonghong Song, Alexei Starovoitov, david.faust, cupertino.miranda

On Thu, 2024-02-08 at 20:03 +0100, Jose E. Marchesi wrote:
[...]

> This makes me wonder, asking from ignorance: what is the benefit/point
> for BPF programs to partially unroll a loop?  I would have said either
> we unroll them completely in order to avoid verification problems, or we
> don't unroll them because the verifier is supposed to handle it the way
> it is written...

Generally speaking, I'd agree.
But basing on the git history, this specific test was added to check
if verifier is capable to process some specific pattern generated by clang.
See [0]:

    The main purpose of the profiler test to check different llvm generation
    patterns to make sure the verifier can load these large programs.

I'd say it would be fair to select any reasonable combination
of unroll pragmas for GCC, e.g. use unroll(fully) instead of "unroll".

[0] 03d4d13fab3f ("selftests/bpf: Add profiler test")

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 19:03                         ` Jose E. Marchesi
  2024-02-08 19:34                           ` Eduard Zingerman
@ 2024-02-08 19:44                           ` Yonghong Song
  1 sibling, 0 replies; 18+ messages in thread
From: Yonghong Song @ 2024-02-08 19:44 UTC (permalink / raw)
  To: Jose E. Marchesi
  Cc: Eduard Zingerman, bpf, Yonghong Song, Alexei Starovoitov,
	david.faust, cupertino.miranda


On 2/8/24 11:03 AM, Jose E. Marchesi wrote:
>>> On 2/8/24 10:04 AM, Yonghong Song wrote:
>>>> On 2/8/24 8:51 AM, Jose E. Marchesi wrote:
>>>>>> On Thu, 2024-02-08 at 16:35 +0100, Jose E. Marchesi wrote:
>>>>>> [...]
>>>>>>
>>>>>>> If the compiler generates assembly code the same code for
>>>>>>> profile2.c for
>>>>>>> before and after, that means that the loop does _not_ get
>>>>>>> unrolled when
>>>>>>> profiler.inc.h is built with -O2 but without #pragma unroll.
>>>>>>>
>>>>>>> But what if #pragma unroll is used?  If it unrolls then, that
>>>>>>> would mean
>>>>>>> that the pragma does something more than -funroll-loops/-O2.
>>>>>>>
>>>>>>> Sorry if I am not making sense.  Stuff like this confuses me to no end
>>>>>>> ;)
>>>>>> Sorry, I messed up while switching branches :(
>>>>>> Here are the correct stats:
>>>>>>
>>>>>> | File            | insn # | insn # |
>>>>>> |                 | before |  after |
>>>>>> |-----------------+--------+--------|
>>>>>> | profiler1.bpf.o |  16716 |   4813 |
>>>>> This means:
>>>>>
>>>>> - With both `#pragma unroll' and -O2 we get 16716 instructions.
>>>>> - Without `#pragma unroll' and with -O2 we get 4813 instructions.
>>>>>
>>>>> Weird.
>>>> Thanks for the analysis. I can reproduce with vs. without '#pragma
>>>> unroll' at -O2
>>>> level, the number of generated insns is indeed different, quite
>>>> dramatically
>>>> as the above numbers. I will do some checking in compiler.
>>> Okay, a quick checking compiler found that
>>>    - with "#pragma unroll" means no profitability test and do full
>>>     unroll as instructed
>>
>> I don't think clang's `#pragma unroll' does full unroll.
>>
>> On one side, AFAIK `pragma unroll' is supposed to be equivalent to
>> `pragma clang loop(enable)', which is different to `pragma clang loop
>> unroll(full)'.
>>
>> On the other, if you replace `pragma unroll' with `pragma clang loop
>> unroll(full)' in the BPF selftests you will get branch instruction
>> overflows.

You are correct. I did series of examples, and find with "#pragma unroll",
clang may do:
   - full unroll (smaller body, less trip count), or
     Loop Unroll: F[kprobe__proc_sys_write] Loop %for.body.i
       Loop Size = 40
       Exiting block %if.then23.i: TripCount=0, TripMultiple=1, BreakoutTrip=1
       Exiting block %for.inc.i: TripCount=10, TripMultiple=0, BreakoutTrip=0
     COMPLETELY UNROLLING loop %for.body.i with trip count 10!
   - partial unroll (for a loop with trip count 10000000), or
     Loop Unroll: F[foo] Loop %for.body
       Loop Size = 16
       partially unrolling with count: 1000
       Exiting block %for.body: TripCount=10000000, TripMultiple=0, BreakoutTrip=0
     UNROLLING loop %for.body by 1000!
   - no unrolling (for a loop with huge body) and issue warning like
     t.c:2:5: warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation
     might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]

With '#pragma unroll', the compiler will do (more strict?) profitability analysis and looks like
by default will not do partial inlining:

   Loop Unroll: F[kprobe__proc_sys_write] Loop %for.body.i
     Loop Size = 40
   Starting LoopUnroll profitability analysis...
    Analyzing iteration 0
   Adding cost of instruction (iteration 0):   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %__r15.i) #7, !dbg !30496
   Adding cost of instruction (iteration 0):   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %__t16.i) #7, !dbg !30497
   Adding cost of instruction (iteration 0):   %67 = call ptr @llvm.bpf.passthrough.p0.p0(i32 38, ptr %66)
   Adding cost of instruction (iteration 0):   %66 = getelementptr i8, ptr %17, i64 %65
   Adding cost of instruction (iteration 0):   %65 = load i64, ptr @"llvm.task_struct:0:4656$0:171", align 8
   Can't analyze cost of loop with call
     will not try to unroll partially because -unroll-allow-partial not given

>>
>> What criteria `pragma unroll' in clang uses in order to determine how
>> much it unrolls the loop, compared to -O2|-funroll-loops, I don't know.

There are some heuristics in the compiler. I do not know exact algorithm
either.

> This makes me wonder, asking from ignorance: what is the benefit/point
> for BPF programs to partially unroll a loop?  I would have said either
> we unroll them completely in order to avoid verification problems, or we
> don't unroll them because the verifier is supposed to handle it the way
> it is written...

In early days, partial unrolling probably for better performance but
complete unrolling may exceed insn limit 4K. Later on the insn limit
is increased....

Anyway, I think you patch looks good based on current discussion.
I will ack it.

>
>>>    - without "#pragma unroll" mean compiler will do profitability for full unroll,
>>>      if compiler thinks full unroll is not profitable, there will be no unrolling.
>>>
>>> So for gcc, even users saying '#pragma unroll', gcc still do
>>> profitability test?
>> GCC doesn't support `#pragma unroll'.
>>
>> Hence in my original patch the macro __pragma_unroll expands to nothing
>> with GCC.  That will lead to the compiler perhaps not unrolling the loop
>> even with -O2|-funroll-loops.
>>
>>>>>> | profiler2.bpf.o |   2088 |   2050 |
>>>>> - Without `#pragma unroll' and with -O2 we get 2088 instructions.
>>>>> - With `#pragma loop unroll(disable)' and with -O2 we get 2050
>>>>>     instructions.
>>>>>
>>>>> Also surprising.
>>>>>
>>>>>> | profiler3.bpf.o |   4465 |   1690 |

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-07 21:45 ` Yonghong Song
  2024-02-08 11:32   ` Jose E. Marchesi
@ 2024-02-08 19:49   ` Yonghong Song
  2024-02-08 20:06     ` Jose E. Marchesi
  1 sibling, 1 reply; 18+ messages in thread
From: Yonghong Song @ 2024-02-08 19:49 UTC (permalink / raw)
  To: Jose E. Marchesi, bpf
  Cc: Yonghong Song, Eduard Zingerman, Alexei Starovoitov, david.faust,
	cupertino.miranda


On 2/7/24 1:45 PM, Yonghong Song wrote:
>
> On 2/7/24 2:12 AM, Jose E. Marchesi wrote:
>> Some BPF tests use loop unrolling compiler pragmas that are clang
>> specific and not supported by GCC.  These pragmas, along with their
>> GCC equivalences are:
>>
>>    #pragma clang loop unroll_count(N)
>>    #pragma GCC unroll N
>>
>>    #pragma clang loop unroll(full)
>>    #pragma GCC unroll 65534
>>
>>    #pragma clang loop unroll(disable)
>>    #pragma GCC unroll 1
>>
>>    #pragma unroll [aka #pragma clang loop unroll(enable)]
>>    There is no GCC equivalence, and it seems to me that this clang
>>    pragma may be only useful when building without -funroll-loops to
>>    enable the optimization in particular loops.  In GCC -funroll-loops
>>    is enabled with -O2 and higher.  If this is also true in clang,
>>    perhaps these pragmas in selftests are redundant?
>
> You are right, at -O2 level, loop unrolling is enabled by default.
> So I think '#pragma unroll' can be removed since gcc also has
> loop unrolling enabled by default at -O2.

My comment in the above is not correct. In clang,
at -O2 level, with and without "#pragma unroll", the generated
code could be different. Basically "#pragma unroll" seems
more aggressive in inlining compared to without it.

So the current patch LGTM.

Acked-by: Yonghong Song <yonghong.song@linux.dev>

>
> Your patch has a conflict with latest bpf-next. Please rebase it
> on top of bpf-next, remove '#pragma unroll' support and resubmit.
> Thanks!
>
>>
>> This patch adds a new header progs/bpf_compiler.h that defines the
>> following macros, which correspond to each pair of compiler-specific
>> pragmas above:
>>
>>    __pragma_loop_unroll_count(N)
>>    __pragma_loop_unroll_full
>>    __pragma_loop_no_unroll
>>    __pragma_loop_unroll
>>
>> The selftests using loop unrolling pragmas are then changed to include
>> the header and use these macros in place of the explicit pragmas.
>>
>> Tested in bpf-next master.
>> No regressions.
>>
>> Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
>> Cc: Yonghong Song <yhs@meta.com>
>> Cc: Eduard Zingerman <eddyz87@gmail.com>
>> Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
>> Cc: david.faust@oracle.com
>> Cc: cupertino.miranda@oracle.com
>> ---
>>   .../selftests/bpf/progs/bpf_compiler.h        | 33 +++++++++++++++++++
>>   tools/testing/selftests/bpf/progs/iters.c     |  5 +--
>>   tools/testing/selftests/bpf/progs/loop4.c     |  4 ++-
>>   .../selftests/bpf/progs/profiler.inc.h        | 17 +++++-----
>>   tools/testing/selftests/bpf/progs/pyperf.h    |  7 ++--
>>   .../testing/selftests/bpf/progs/strobemeta.h  | 18 +++++-----
>>   .../selftests/bpf/progs/test_cls_redirect.c   |  5 +--
>>   .../selftests/bpf/progs/test_lwt_seg6local.c  |  6 ++--
>>   .../selftests/bpf/progs/test_seg6_loop.c      |  4 ++-
>>   .../selftests/bpf/progs/test_skb_ctx.c        |  4 ++-
>>   .../selftests/bpf/progs/test_sysctl_loop1.c   |  6 ++--
>>   .../selftests/bpf/progs/test_sysctl_loop2.c   |  6 ++--
>>   .../selftests/bpf/progs/test_sysctl_prog.c    |  6 ++--
>>   .../selftests/bpf/progs/test_tc_tunnel.c      |  4 ++-
>>   tools/testing/selftests/bpf/progs/test_xdp.c  |  3 +-
>>   .../selftests/bpf/progs/test_xdp_loop.c       |  3 +-
>>   .../selftests/bpf/progs/test_xdp_noinline.c   |  5 +--
>>   .../selftests/bpf/progs/xdp_synproxy_kern.c   |  6 ++--
>>   .../testing/selftests/bpf/progs/xdping_kern.c |  3 +-
>>   19 files changed, 103 insertions(+), 42 deletions(-)
>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h
>>
>> diff --git a/tools/testing/selftests/bpf/progs/bpf_compiler.h 
>> b/tools/testing/selftests/bpf/progs/bpf_compiler.h
>> new file mode 100644
>> index 000000000000..a7c343dc82e6
>> --- /dev/null
>> +++ b/tools/testing/selftests/bpf/progs/bpf_compiler.h
>> @@ -0,0 +1,33 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef __BPF_COMPILER_H__
>> +#define __BPF_COMPILER_H__
>> +
>> +#define DO_PRAGMA_(X) _Pragma(#X)
>> +
>> +#if __clang__
>> +#define __pragma_loop_unroll DO_PRAGMA_(clang loop unroll(enable))
>> +#else
>> +/* In GCC -funroll-loops, which is enabled with -O2, should have the
>> +   same impact than the loop-unroll-enable pragma above.  */
>> +#define __pragma_loop_unroll
>> +#endif
>> +
>> +#if __clang__
>> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(clang loop 
>> unroll_count(N))
>> +#else
>> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(GCC unroll N)
>> +#endif
>> +
>> +#if __clang__
>> +#define __pragma_loop_unroll_full DO_PRAGMA_(clang loop unroll(full))
>> +#else
>> +#define __pragma_loop_unroll_full DO_PRAGMA_(GCC unroll 65534)
>> +#endif
>> +
>> +#if __clang__
>> +#define __pragma_loop_no_unroll DO_PRAGMA_(clang loop unroll(disable))
>> +#else
>> +#define __pragma_loop_no_unroll DO_PRAGMA_(GCC unroll 1)
>> +#endif
>> +
>> +#endif
>> diff --git a/tools/testing/selftests/bpf/progs/iters.c 
>> b/tools/testing/selftests/bpf/progs/iters.c
>> index 225f02dd66d0..3db416606f2f 100644
>> --- a/tools/testing/selftests/bpf/progs/iters.c
>> +++ b/tools/testing/selftests/bpf/progs/iters.c
>> @@ -5,6 +5,7 @@
>>   #include <linux/bpf.h>
>>   #include <bpf/bpf_helpers.h>
>>   #include "bpf_misc.h"
>> +#include "bpf_compiler.h"
>>     #define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0]))
>>   @@ -183,7 +184,7 @@ int iter_pragma_unroll_loop(const void *ctx)
>>       MY_PID_GUARD();
>>         bpf_iter_num_new(&it, 0, 2);
>> -#pragma nounroll
>> +    __pragma_loop_no_unroll
>>       for (i = 0; i < 3; i++) {
>>           v = bpf_iter_num_next(&it);
>>           bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
>> @@ -238,7 +239,7 @@ int iter_multiple_sequential_loops(const void *ctx)
>>       bpf_iter_num_destroy(&it);
>>         bpf_iter_num_new(&it, 0, 2);
>> -#pragma nounroll
>> +    __pragma_loop_no_unroll
>>       for (i = 0; i < 3; i++) {
>>           v = bpf_iter_num_next(&it);
>>           bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
>
> [...]
>
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests
  2024-02-08 19:49   ` Yonghong Song
@ 2024-02-08 20:06     ` Jose E. Marchesi
  0 siblings, 0 replies; 18+ messages in thread
From: Jose E. Marchesi @ 2024-02-08 20:06 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Yonghong Song, Eduard Zingerman, Alexei Starovoitov,
	david.faust, cupertino.miranda


> On 2/7/24 1:45 PM, Yonghong Song wrote:
>>
>> On 2/7/24 2:12 AM, Jose E. Marchesi wrote:
>>> Some BPF tests use loop unrolling compiler pragmas that are clang
>>> specific and not supported by GCC.  These pragmas, along with their
>>> GCC equivalences are:
>>>
>>>    #pragma clang loop unroll_count(N)
>>>    #pragma GCC unroll N
>>>
>>>    #pragma clang loop unroll(full)
>>>    #pragma GCC unroll 65534
>>>
>>>    #pragma clang loop unroll(disable)
>>>    #pragma GCC unroll 1
>>>
>>>    #pragma unroll [aka #pragma clang loop unroll(enable)]
>>>    There is no GCC equivalence, and it seems to me that this clang
>>>    pragma may be only useful when building without -funroll-loops to
>>>    enable the optimization in particular loops.  In GCC -funroll-loops
>>>    is enabled with -O2 and higher.  If this is also true in clang,
>>>    perhaps these pragmas in selftests are redundant?
>>
>> You are right, at -O2 level, loop unrolling is enabled by default.
>> So I think '#pragma unroll' can be removed since gcc also has
>> loop unrolling enabled by default at -O2.
>
> My comment in the above is not correct. In clang,
> at -O2 level, with and without "#pragma unroll", the generated
> code could be different. Basically "#pragma unroll" seems
> more aggressive in inlining compared to without it.
>
> So the current patch LGTM.
>
> Acked-by: Yonghong Song <yonghong.song@linux.dev>

I need to send a v2 with the conflict resolved.

>
>>
>> Your patch has a conflict with latest bpf-next. Please rebase it
>> on top of bpf-next, remove '#pragma unroll' support and resubmit.
>> Thanks!
>>
>>>
>>> This patch adds a new header progs/bpf_compiler.h that defines the
>>> following macros, which correspond to each pair of compiler-specific
>>> pragmas above:
>>>
>>>    __pragma_loop_unroll_count(N)
>>>    __pragma_loop_unroll_full
>>>    __pragma_loop_no_unroll
>>>    __pragma_loop_unroll
>>>
>>> The selftests using loop unrolling pragmas are then changed to include
>>> the header and use these macros in place of the explicit pragmas.
>>>
>>> Tested in bpf-next master.
>>> No regressions.
>>>
>>> Signed-off-by: Jose E. Marchesi <jose.marchesi@oracle.com>
>>> Cc: Yonghong Song <yhs@meta.com>
>>> Cc: Eduard Zingerman <eddyz87@gmail.com>
>>> Cc: Alexei Starovoitov <alexei.starovoitov@gmail.com>
>>> Cc: david.faust@oracle.com
>>> Cc: cupertino.miranda@oracle.com
>>> ---
>>>   .../selftests/bpf/progs/bpf_compiler.h        | 33 +++++++++++++++++++
>>>   tools/testing/selftests/bpf/progs/iters.c     |  5 +--
>>>   tools/testing/selftests/bpf/progs/loop4.c     |  4 ++-
>>>   .../selftests/bpf/progs/profiler.inc.h        | 17 +++++-----
>>>   tools/testing/selftests/bpf/progs/pyperf.h    |  7 ++--
>>>   .../testing/selftests/bpf/progs/strobemeta.h  | 18 +++++-----
>>>   .../selftests/bpf/progs/test_cls_redirect.c   |  5 +--
>>>   .../selftests/bpf/progs/test_lwt_seg6local.c  |  6 ++--
>>>   .../selftests/bpf/progs/test_seg6_loop.c      |  4 ++-
>>>   .../selftests/bpf/progs/test_skb_ctx.c        |  4 ++-
>>>   .../selftests/bpf/progs/test_sysctl_loop1.c   |  6 ++--
>>>   .../selftests/bpf/progs/test_sysctl_loop2.c   |  6 ++--
>>>   .../selftests/bpf/progs/test_sysctl_prog.c    |  6 ++--
>>>   .../selftests/bpf/progs/test_tc_tunnel.c      |  4 ++-
>>>   tools/testing/selftests/bpf/progs/test_xdp.c  |  3 +-
>>>   .../selftests/bpf/progs/test_xdp_loop.c       |  3 +-
>>>   .../selftests/bpf/progs/test_xdp_noinline.c   |  5 +--
>>>   .../selftests/bpf/progs/xdp_synproxy_kern.c   |  6 ++--
>>>   .../testing/selftests/bpf/progs/xdping_kern.c |  3 +-
>>>   19 files changed, 103 insertions(+), 42 deletions(-)
>>>   create mode 100644 tools/testing/selftests/bpf/progs/bpf_compiler.h
>>>
>>> diff --git a/tools/testing/selftests/bpf/progs/bpf_compiler.h
>>> b/tools/testing/selftests/bpf/progs/bpf_compiler.h
>>> new file mode 100644
>>> index 000000000000..a7c343dc82e6
>>> --- /dev/null
>>> +++ b/tools/testing/selftests/bpf/progs/bpf_compiler.h
>>> @@ -0,0 +1,33 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#ifndef __BPF_COMPILER_H__
>>> +#define __BPF_COMPILER_H__
>>> +
>>> +#define DO_PRAGMA_(X) _Pragma(#X)
>>> +
>>> +#if __clang__
>>> +#define __pragma_loop_unroll DO_PRAGMA_(clang loop unroll(enable))
>>> +#else
>>> +/* In GCC -funroll-loops, which is enabled with -O2, should have the
>>> +   same impact than the loop-unroll-enable pragma above.  */
>>> +#define __pragma_loop_unroll
>>> +#endif
>>> +
>>> +#if __clang__
>>> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(clang loop
>>> unroll_count(N))
>>> +#else
>>> +#define __pragma_loop_unroll_count(N) DO_PRAGMA_(GCC unroll N)
>>> +#endif
>>> +
>>> +#if __clang__
>>> +#define __pragma_loop_unroll_full DO_PRAGMA_(clang loop unroll(full))
>>> +#else
>>> +#define __pragma_loop_unroll_full DO_PRAGMA_(GCC unroll 65534)
>>> +#endif
>>> +
>>> +#if __clang__
>>> +#define __pragma_loop_no_unroll DO_PRAGMA_(clang loop unroll(disable))
>>> +#else
>>> +#define __pragma_loop_no_unroll DO_PRAGMA_(GCC unroll 1)
>>> +#endif
>>> +
>>> +#endif
>>> diff --git a/tools/testing/selftests/bpf/progs/iters.c
>>> b/tools/testing/selftests/bpf/progs/iters.c
>>> index 225f02dd66d0..3db416606f2f 100644
>>> --- a/tools/testing/selftests/bpf/progs/iters.c
>>> +++ b/tools/testing/selftests/bpf/progs/iters.c
>>> @@ -5,6 +5,7 @@
>>>   #include <linux/bpf.h>
>>>   #include <bpf/bpf_helpers.h>
>>>   #include "bpf_misc.h"
>>> +#include "bpf_compiler.h"
>>>     #define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof((x)[0]))
>>>   @@ -183,7 +184,7 @@ int iter_pragma_unroll_loop(const void *ctx)
>>>       MY_PID_GUARD();
>>>         bpf_iter_num_new(&it, 0, 2);
>>> -#pragma nounroll
>>> +    __pragma_loop_no_unroll
>>>       for (i = 0; i < 3; i++) {
>>>           v = bpf_iter_num_next(&it);
>>>           bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
>>> @@ -238,7 +239,7 @@ int iter_multiple_sequential_loops(const void *ctx)
>>>       bpf_iter_num_destroy(&it);
>>>         bpf_iter_num_new(&it, 0, 2);
>>> -#pragma nounroll
>>> +    __pragma_loop_no_unroll
>>>       for (i = 0; i < 3; i++) {
>>>           v = bpf_iter_num_next(&it);
>>>           bpf_printk("ITER_BASIC: E3 VAL: i=%d v=%d", i, v ? *v : -1);
>>
>> [...]
>>
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2024-02-08 20:06 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-02-07 10:12 [PATCH bpf-next] bpf: abstract loop unrolling pragmas in BPF selftests Jose E. Marchesi
2024-02-07 21:45 ` Yonghong Song
2024-02-08 11:32   ` Jose E. Marchesi
2024-02-08 12:55     ` Jose E. Marchesi
2024-02-08 14:18       ` Eduard Zingerman
2024-02-08 15:05         ` Jose E. Marchesi
2024-02-08 15:28           ` Eduard Zingerman
2024-02-08 15:35             ` Jose E. Marchesi
2024-02-08 15:53               ` Eduard Zingerman
2024-02-08 16:51                 ` Jose E. Marchesi
2024-02-08 18:04                   ` Yonghong Song
2024-02-08 18:35                     ` Yonghong Song
2024-02-08 18:59                       ` Jose E. Marchesi
2024-02-08 19:03                         ` Jose E. Marchesi
2024-02-08 19:34                           ` Eduard Zingerman
2024-02-08 19:44                           ` Yonghong Song
2024-02-08 19:49   ` Yonghong Song
2024-02-08 20:06     ` Jose E. Marchesi

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.