bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH bpf-next 0/2] selftests/bpf: fix flaky send_signal test
@ 2021-08-17 17:19 Yonghong Song
  2021-08-17 17:20 ` [PATCH bpf-next 1/2] selftests/bpf: replace CHECK with ASSERT_* macros in send_signal.c Yonghong Song
  2021-08-17 17:20 ` [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test Yonghong Song
  0 siblings, 2 replies; 6+ messages in thread
From: Yonghong Song @ 2021-08-17 17:19 UTC (permalink / raw)
  To: bpf; +Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team

The bpf selftest send_signal() is flaky for its subtests trying to
send signals in softirq/nmi context. To reduce flakiness, the
signal-targetted process priority is boosted, which should minimize
preemption of that process and improve the possibility that
the underlying task in softirq/nmi context is the bpf_send_signal()
wanted task.

Patch #1 did a refactoring to use ASSERT_* instead of old CHECK macros.
Patch #2 did actual change of boosting priority.

Yonghong Song (2):
  selftests/bpf: replace CHECK with ASSERT_* macros in send_signal.c
  selftests/bpf: fix flaky send_signal test

 .../selftests/bpf/prog_tests/send_signal.c    | 68 ++++++++++++-------
 .../bpf/progs/test_send_signal_kern.c         |  3 +-
 2 files changed, 43 insertions(+), 28 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH bpf-next 1/2] selftests/bpf: replace CHECK with ASSERT_* macros in send_signal.c
  2021-08-17 17:19 [PATCH bpf-next 0/2] selftests/bpf: fix flaky send_signal test Yonghong Song
@ 2021-08-17 17:20 ` Yonghong Song
  2021-08-17 17:20 ` [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test Yonghong Song
  1 sibling, 0 replies; 6+ messages in thread
From: Yonghong Song @ 2021-08-17 17:20 UTC (permalink / raw)
  To: bpf; +Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team

Replace CHECK in send_signal.c with ASSERT_* macros as
ASSERT_* macros are generally preferred. There is no
funcitonality change.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/prog_tests/send_signal.c    | 45 +++++++++----------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 023cc532992d..41e158ae888e 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -10,29 +10,25 @@ static void sigusr1_handler(int signum)
 }
 
 static void test_send_signal_common(struct perf_event_attr *attr,
-				    bool signal_thread,
-				    const char *test_name)
+				    bool signal_thread)
 {
 	struct test_send_signal_kern *skel;
 	int pipe_c2p[2], pipe_p2c[2];
 	int err = -1, pmu_fd = -1;
-	__u32 duration = 0;
 	char buf[256];
 	pid_t pid;
 
-	if (CHECK(pipe(pipe_c2p), test_name,
-		  "pipe pipe_c2p error: %s\n", strerror(errno)))
+	if (!ASSERT_OK(pipe(pipe_c2p), "pipe_c2p"))
 		return;
 
-	if (CHECK(pipe(pipe_p2c), test_name,
-		  "pipe pipe_p2c error: %s\n", strerror(errno))) {
+	if (!ASSERT_OK(pipe(pipe_p2c), "pipe_p2c")) {
 		close(pipe_c2p[0]);
 		close(pipe_c2p[1]);
 		return;
 	}
 
 	pid = fork();
-	if (CHECK(pid < 0, test_name, "fork error: %s\n", strerror(errno))) {
+	if (!ASSERT_GE(pid, 0, "fork")) {
 		close(pipe_c2p[0]);
 		close(pipe_c2p[1]);
 		close(pipe_p2c[0]);
@@ -48,19 +44,19 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 		close(pipe_p2c[1]); /* close write */
 
 		/* notify parent signal handler is installed */
-		CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+		ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
 		/* make sure parent enabled bpf program to send_signal */
-		CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+		ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
 
 		/* wait a little for signal handler */
 		sleep(1);
 
 		buf[0] = sigusr1_received ? '2' : '0';
-		CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+		ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
 		/* wait for parent notification and exit */
-		CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+		ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
 
 		close(pipe_c2p[1]);
 		close(pipe_p2c[0]);
@@ -71,20 +67,19 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	close(pipe_p2c[0]); /* close read */
 
 	skel = test_send_signal_kern__open_and_load();
-	if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n"))
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
 		goto skel_open_load_failure;
 
 	if (!attr) {
 		err = test_send_signal_kern__attach(skel);
-		if (CHECK(err, "skel_attach", "skeleton attach failed\n")) {
+		if (!ASSERT_OK(err, "skel_attach")) {
 			err = -1;
 			goto destroy_skel;
 		}
 	} else {
 		pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1,
 				 -1 /* group id */, 0 /* flags */);
-		if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n",
-			strerror(errno))) {
+		if (!ASSERT_GE(pmu_fd, 0, "perf_event_open")) {
 			err = -1;
 			goto destroy_skel;
 		}
@@ -96,7 +91,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	}
 
 	/* wait until child signal handler installed */
-	CHECK(read(pipe_c2p[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+	ASSERT_EQ(read(pipe_c2p[0], buf, 1), 1, "pipe_read");
 
 	/* trigger the bpf send_signal */
 	skel->bss->pid = pid;
@@ -104,21 +99,21 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	skel->bss->signal_thread = signal_thread;
 
 	/* notify child that bpf program can send_signal now */
-	CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
 	/* wait for result */
 	err = read(pipe_c2p[0], buf, 1);
-	if (CHECK(err < 0, test_name, "reading pipe error: %s\n", strerror(errno)))
+	if (!ASSERT_GE(err, 0, "reading pipe"))
 		goto disable_pmu;
-	if (CHECK(err == 0, test_name, "reading pipe error: size 0\n")) {
+	if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) {
 		err = -1;
 		goto disable_pmu;
 	}
 
-	CHECK(buf[0] != '2', test_name, "incorrect result\n");
+	ASSERT_EQ(buf[0], '2', "incorrect result");
 
 	/* notify child safe to exit */
-	CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
 disable_pmu:
 	close(pmu_fd);
@@ -132,7 +127,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 
 static void test_send_signal_tracepoint(bool signal_thread)
 {
-	test_send_signal_common(NULL, signal_thread, "tracepoint");
+	test_send_signal_common(NULL, signal_thread);
 }
 
 static void test_send_signal_perf(bool signal_thread)
@@ -143,7 +138,7 @@ static void test_send_signal_perf(bool signal_thread)
 		.config = PERF_COUNT_SW_CPU_CLOCK,
 	};
 
-	test_send_signal_common(&attr, signal_thread, "perf_sw_event");
+	test_send_signal_common(&attr, signal_thread);
 }
 
 static void test_send_signal_nmi(bool signal_thread)
@@ -172,7 +167,7 @@ static void test_send_signal_nmi(bool signal_thread)
 		close(pmu_fd);
 	}
 
-	test_send_signal_common(&attr, signal_thread, "perf_hw_event");
+	test_send_signal_common(&attr, signal_thread);
 }
 
 void test_send_signal(void)
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test
  2021-08-17 17:19 [PATCH bpf-next 0/2] selftests/bpf: fix flaky send_signal test Yonghong Song
  2021-08-17 17:20 ` [PATCH bpf-next 1/2] selftests/bpf: replace CHECK with ASSERT_* macros in send_signal.c Yonghong Song
@ 2021-08-17 17:20 ` Yonghong Song
  2021-08-17 18:45   ` Andrii Nakryiko
  1 sibling, 1 reply; 6+ messages in thread
From: Yonghong Song @ 2021-08-17 17:20 UTC (permalink / raw)
  To: bpf; +Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, kernel-team

libbpf CI has reported send_signal test is flaky although
I am not able to reproduce it in my local environment.
But I am able to reproduce with on-demand libbpf CI ([1]).

Through code analysis, the following is possible reason.
The failed subtest runs bpf program in softirq environment.
Since bpf_send_signal() only sends to a fork of "test_progs"
process. If the underlying current task is
not "test_progs", bpf_send_signal() will not be triggered
and the subtest will fail.

To reduce the chances where the underlying process is not
the intended one, this patch boosted scheduling priority to
-20 (highest allowed by setpriority() call). And I did
10 runs with on-demand libbpf CI with this patch and I
didn't observe any failures.

 [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
 .../bpf/progs/test_send_signal_kern.c         |  3 +-
 2 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
index 41e158ae888e..0701c97456da 100644
--- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
+#include <sys/time.h>
+#include <sys/resource.h>
 #include "test_send_signal_kern.skel.h"
 
 int sigusr1_received = 0;
@@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
 }
 
 static void test_send_signal_common(struct perf_event_attr *attr,
-				    bool signal_thread)
+				    bool signal_thread, bool allow_skip)
 {
 	struct test_send_signal_kern *skel;
 	int pipe_c2p[2], pipe_p2c[2];
@@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 	}
 
 	if (pid == 0) {
+		int old_prio;
+
 		/* install signal handler and notify parent */
 		signal(SIGUSR1, sigusr1_handler);
 
 		close(pipe_c2p[0]); /* close read */
 		close(pipe_p2c[1]); /* close write */
 
+		/* boost with a high priority so we got a higher chance
+		 * that if an interrupt happens, the underlying task
+		 * is this process.
+		 */
+		errno = 0;
+		old_prio = getpriority(PRIO_PROCESS, 0);
+		ASSERT_OK(errno, "getpriority");
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+
 		/* notify parent signal handler is installed */
 		ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
 
@@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 		/* wait for parent notification and exit */
 		ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
 
+		/* restore the old priority */
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
+
 		close(pipe_c2p[1]);
 		close(pipe_p2c[0]);
 		exit(0);
@@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 		goto disable_pmu;
 	}
 
-	ASSERT_EQ(buf[0], '2', "incorrect result");
-
 	/* notify child safe to exit */
 	ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
 
+	if (skel->bss->status == 0 && allow_skip) {
+		printf("%s:SKIP\n", __func__);
+		test__skip();
+	} else if (skel->bss->status != 1) {
+		ASSERT_EQ(buf[0], '2', "incorrect result");
+	}
+
 disable_pmu:
 	close(pmu_fd);
 destroy_skel:
@@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
 
 static void test_send_signal_tracepoint(bool signal_thread)
 {
-	test_send_signal_common(NULL, signal_thread);
+	test_send_signal_common(NULL, signal_thread, false);
 }
 
 static void test_send_signal_perf(bool signal_thread)
@@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
 		.config = PERF_COUNT_SW_CPU_CLOCK,
 	};
 
-	test_send_signal_common(&attr, signal_thread);
+	test_send_signal_common(&attr, signal_thread, true);
 }
 
 static void test_send_signal_nmi(bool signal_thread)
@@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
 		close(pmu_fd);
 	}
 
-	test_send_signal_common(&attr, signal_thread);
+	test_send_signal_common(&attr, signal_thread, true);
 }
 
 void test_send_signal(void)
diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
index b4233d3efac2..59c05c422bbd 100644
--- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
@@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
 			ret = bpf_send_signal_thread(sig);
 		else
 			ret = bpf_send_signal(sig);
-		if (ret == 0)
-			status = 1;
+		status = (ret == 0) ? 1 : 2;
 	}
 
 	return 0;
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test
  2021-08-17 17:20 ` [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test Yonghong Song
@ 2021-08-17 18:45   ` Andrii Nakryiko
  2021-08-17 19:01     ` Yonghong Song
  0 siblings, 1 reply; 6+ messages in thread
From: Andrii Nakryiko @ 2021-08-17 18:45 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Kernel Team

On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote:
>
> libbpf CI has reported send_signal test is flaky although
> I am not able to reproduce it in my local environment.
> But I am able to reproduce with on-demand libbpf CI ([1]).
>
> Through code analysis, the following is possible reason.
> The failed subtest runs bpf program in softirq environment.
> Since bpf_send_signal() only sends to a fork of "test_progs"
> process. If the underlying current task is
> not "test_progs", bpf_send_signal() will not be triggered
> and the subtest will fail.
>
> To reduce the chances where the underlying process is not
> the intended one, this patch boosted scheduling priority to
> -20 (highest allowed by setpriority() call). And I did
> 10 runs with on-demand libbpf CI with this patch and I
> didn't observe any failures.
>
>  [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
>  .../bpf/progs/test_send_signal_kern.c         |  3 +-
>  2 files changed, 28 insertions(+), 8 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> index 41e158ae888e..0701c97456da 100644
> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> @@ -1,5 +1,7 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <test_progs.h>
> +#include <sys/time.h>
> +#include <sys/resource.h>
>  #include "test_send_signal_kern.skel.h"
>
>  int sigusr1_received = 0;
> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
>  }
>
>  static void test_send_signal_common(struct perf_event_attr *attr,
> -                                   bool signal_thread)
> +                                   bool signal_thread, bool allow_skip)
>  {
>         struct test_send_signal_kern *skel;
>         int pipe_c2p[2], pipe_p2c[2];
> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>         }
>
>         if (pid == 0) {
> +               int old_prio;
> +
>                 /* install signal handler and notify parent */
>                 signal(SIGUSR1, sigusr1_handler);
>
>                 close(pipe_c2p[0]); /* close read */
>                 close(pipe_p2c[1]); /* close write */
>
> +               /* boost with a high priority so we got a higher chance
> +                * that if an interrupt happens, the underlying task
> +                * is this process.
> +                */
> +               errno = 0;
> +               old_prio = getpriority(PRIO_PROCESS, 0);
> +               ASSERT_OK(errno, "getpriority");
> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
> +
>                 /* notify parent signal handler is installed */
>                 ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
>
> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>                 /* wait for parent notification and exit */
>                 ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
>
> +               /* restore the old priority */
> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
> +
>                 close(pipe_c2p[1]);
>                 close(pipe_p2c[0]);
>                 exit(0);
> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>                 goto disable_pmu;
>         }
>
> -       ASSERT_EQ(buf[0], '2', "incorrect result");
> -
>         /* notify child safe to exit */
>         ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>
> +       if (skel->bss->status == 0 && allow_skip) {
> +               printf("%s:SKIP\n", __func__);
> +               test__skip();
> +       } else if (skel->bss->status != 1) {
> +               ASSERT_EQ(buf[0], '2', "incorrect result");
> +       }
> +
>  disable_pmu:
>         close(pmu_fd);
>  destroy_skel:
> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>
>  static void test_send_signal_tracepoint(bool signal_thread)
>  {
> -       test_send_signal_common(NULL, signal_thread);
> +       test_send_signal_common(NULL, signal_thread, false);
>  }
>
>  static void test_send_signal_perf(bool signal_thread)
> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
>                 .config = PERF_COUNT_SW_CPU_CLOCK,
>         };
>
> -       test_send_signal_common(&attr, signal_thread);
> +       test_send_signal_common(&attr, signal_thread, true);
>  }
>
>  static void test_send_signal_nmi(bool signal_thread)
> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
>                 close(pmu_fd);
>         }
>
> -       test_send_signal_common(&attr, signal_thread);
> +       test_send_signal_common(&attr, signal_thread, true);
>  }
>
>  void test_send_signal(void)
> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> index b4233d3efac2..59c05c422bbd 100644
> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
>                         ret = bpf_send_signal_thread(sig);
>                 else
>                         ret = bpf_send_signal(sig);
> -               if (ret == 0)
> -                       status = 1;
> +               status = (ret == 0) ? 1 : 2;

This doesn't make sense to me. status == 0 is the default value, it
will stay 0 even if nothing is triggered, no BPF program is called,
etc.

If we are doing the skipping of the test logic (which I'd honestly
just not do right now to see if we actually fixed the test), then I'd
set status = 3 for the case when signal was triggered, but the current
task is not test_progs. And only skip test if we get status 3. That
is, status 0 and status 2 are bad (either not triggered, or some error
when sending signal), 1 is OK, 3 is SKIP.

But really, skipping a test that we couldn't randomly run doesn't feel
good. Can you please leave the priority boosting part and drop the
skipping part for now?

>         }
>
>         return 0;
> --
> 2.30.2
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test
  2021-08-17 18:45   ` Andrii Nakryiko
@ 2021-08-17 19:01     ` Yonghong Song
  2021-08-17 19:06       ` Andrii Nakryiko
  0 siblings, 1 reply; 6+ messages in thread
From: Yonghong Song @ 2021-08-17 19:01 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Kernel Team



On 8/17/21 11:45 AM, Andrii Nakryiko wrote:
> On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote:
>>
>> libbpf CI has reported send_signal test is flaky although
>> I am not able to reproduce it in my local environment.
>> But I am able to reproduce with on-demand libbpf CI ([1]).
>>
>> Through code analysis, the following is possible reason.
>> The failed subtest runs bpf program in softirq environment.
>> Since bpf_send_signal() only sends to a fork of "test_progs"
>> process. If the underlying current task is
>> not "test_progs", bpf_send_signal() will not be triggered
>> and the subtest will fail.
>>
>> To reduce the chances where the underlying process is not
>> the intended one, this patch boosted scheduling priority to
>> -20 (highest allowed by setpriority() call). And I did
>> 10 runs with on-demand libbpf CI with this patch and I
>> didn't observe any failures.
>>
>>   [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
>>   .../bpf/progs/test_send_signal_kern.c         |  3 +-
>>   2 files changed, 28 insertions(+), 8 deletions(-)
>>
>> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> index 41e158ae888e..0701c97456da 100644
>> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
>> @@ -1,5 +1,7 @@
>>   // SPDX-License-Identifier: GPL-2.0
>>   #include <test_progs.h>
>> +#include <sys/time.h>
>> +#include <sys/resource.h>
>>   #include "test_send_signal_kern.skel.h"
>>
>>   int sigusr1_received = 0;
>> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
>>   }
>>
>>   static void test_send_signal_common(struct perf_event_attr *attr,
>> -                                   bool signal_thread)
>> +                                   bool signal_thread, bool allow_skip)
>>   {
>>          struct test_send_signal_kern *skel;
>>          int pipe_c2p[2], pipe_p2c[2];
>> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>          }
>>
>>          if (pid == 0) {
>> +               int old_prio;
>> +
>>                  /* install signal handler and notify parent */
>>                  signal(SIGUSR1, sigusr1_handler);
>>
>>                  close(pipe_c2p[0]); /* close read */
>>                  close(pipe_p2c[1]); /* close write */
>>
>> +               /* boost with a high priority so we got a higher chance
>> +                * that if an interrupt happens, the underlying task
>> +                * is this process.
>> +                */
>> +               errno = 0;
>> +               old_prio = getpriority(PRIO_PROCESS, 0);
>> +               ASSERT_OK(errno, "getpriority");
>> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
>> +
>>                  /* notify parent signal handler is installed */
>>                  ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
>>
>> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>                  /* wait for parent notification and exit */
>>                  ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
>>
>> +               /* restore the old priority */
>> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
>> +
>>                  close(pipe_c2p[1]);
>>                  close(pipe_p2c[0]);
>>                  exit(0);
>> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>                  goto disable_pmu;
>>          }
>>
>> -       ASSERT_EQ(buf[0], '2', "incorrect result");
>> -
>>          /* notify child safe to exit */
>>          ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
>>
>> +       if (skel->bss->status == 0 && allow_skip) {
>> +               printf("%s:SKIP\n", __func__);
>> +               test__skip();
>> +       } else if (skel->bss->status != 1) {
>> +               ASSERT_EQ(buf[0], '2', "incorrect result");
>> +       }
>> +
>>   disable_pmu:
>>          close(pmu_fd);
>>   destroy_skel:
>> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>>
>>   static void test_send_signal_tracepoint(bool signal_thread)
>>   {
>> -       test_send_signal_common(NULL, signal_thread);
>> +       test_send_signal_common(NULL, signal_thread, false);
>>   }
>>
>>   static void test_send_signal_perf(bool signal_thread)
>> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
>>                  .config = PERF_COUNT_SW_CPU_CLOCK,
>>          };
>>
>> -       test_send_signal_common(&attr, signal_thread);
>> +       test_send_signal_common(&attr, signal_thread, true);
>>   }
>>
>>   static void test_send_signal_nmi(bool signal_thread)
>> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
>>                  close(pmu_fd);
>>          }
>>
>> -       test_send_signal_common(&attr, signal_thread);
>> +       test_send_signal_common(&attr, signal_thread, true);
>>   }
>>
>>   void test_send_signal(void)
>> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> index b4233d3efac2..59c05c422bbd 100644
>> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
>> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
>>                          ret = bpf_send_signal_thread(sig);
>>                  else
>>                          ret = bpf_send_signal(sig);
>> -               if (ret == 0)
>> -                       status = 1;
>> +               status = (ret == 0) ? 1 : 2;
> 
> This doesn't make sense to me. status == 0 is the default value, it
> will stay 0 even if nothing is triggered, no BPF program is called,
> etc.

that is true.

> 
> If we are doing the skipping of the test logic (which I'd honestly
> just not do right now to see if we actually fixed the test), then I'd
> set status = 3 for the case when signal was triggered, but the current
> task is not test_progs. And only skip test if we get status 3. That
> is, status 0 and status 2 are bad (either not triggered, or some error
> when sending signal), 1 is OK, 3 is SKIP.

Here, we *assume* bpf program always got called which should be the case 
unless softirq/nmi logic goes wrong. so status = 0 means
pid doesn't match, and status = 1 means good bpf_send_signal happens,
status = 2 means bpf_send_signal helper fails.

> 
> But really, skipping a test that we couldn't randomly run doesn't feel
> good. Can you please leave the priority boosting part and drop the
> skipping part for now?

Sure. Let me drop skipping part. With the patch, I am expecting in 
*most* cases, we should not observe flakiness.

> 
>>          }
>>
>>          return 0;
>> --
>> 2.30.2
>>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test
  2021-08-17 19:01     ` Yonghong Song
@ 2021-08-17 19:06       ` Andrii Nakryiko
  0 siblings, 0 replies; 6+ messages in thread
From: Andrii Nakryiko @ 2021-08-17 19:06 UTC (permalink / raw)
  To: Yonghong Song
  Cc: bpf, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Kernel Team

On Tue, Aug 17, 2021 at 12:01 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 8/17/21 11:45 AM, Andrii Nakryiko wrote:
> > On Tue, Aug 17, 2021 at 10:20 AM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> libbpf CI has reported send_signal test is flaky although
> >> I am not able to reproduce it in my local environment.
> >> But I am able to reproduce with on-demand libbpf CI ([1]).
> >>
> >> Through code analysis, the following is possible reason.
> >> The failed subtest runs bpf program in softirq environment.
> >> Since bpf_send_signal() only sends to a fork of "test_progs"
> >> process. If the underlying current task is
> >> not "test_progs", bpf_send_signal() will not be triggered
> >> and the subtest will fail.
> >>
> >> To reduce the chances where the underlying process is not
> >> the intended one, this patch boosted scheduling priority to
> >> -20 (highest allowed by setpriority() call). And I did
> >> 10 runs with on-demand libbpf CI with this patch and I
> >> didn't observe any failures.
> >>
> >>   [1] https://github.com/libbpf/libbpf/actions/workflows/ondemand.yml
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   .../selftests/bpf/prog_tests/send_signal.c    | 33 +++++++++++++++----
> >>   .../bpf/progs/test_send_signal_kern.c         |  3 +-
> >>   2 files changed, 28 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> >> index 41e158ae888e..0701c97456da 100644
> >> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
> >> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> >> @@ -1,5 +1,7 @@
> >>   // SPDX-License-Identifier: GPL-2.0
> >>   #include <test_progs.h>
> >> +#include <sys/time.h>
> >> +#include <sys/resource.h>
> >>   #include "test_send_signal_kern.skel.h"
> >>
> >>   int sigusr1_received = 0;
> >> @@ -10,7 +12,7 @@ static void sigusr1_handler(int signum)
> >>   }
> >>
> >>   static void test_send_signal_common(struct perf_event_attr *attr,
> >> -                                   bool signal_thread)
> >> +                                   bool signal_thread, bool allow_skip)
> >>   {
> >>          struct test_send_signal_kern *skel;
> >>          int pipe_c2p[2], pipe_p2c[2];
> >> @@ -37,12 +39,23 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>          }
> >>
> >>          if (pid == 0) {
> >> +               int old_prio;
> >> +
> >>                  /* install signal handler and notify parent */
> >>                  signal(SIGUSR1, sigusr1_handler);
> >>
> >>                  close(pipe_c2p[0]); /* close read */
> >>                  close(pipe_p2c[1]); /* close write */
> >>
> >> +               /* boost with a high priority so we got a higher chance
> >> +                * that if an interrupt happens, the underlying task
> >> +                * is this process.
> >> +                */
> >> +               errno = 0;
> >> +               old_prio = getpriority(PRIO_PROCESS, 0);
> >> +               ASSERT_OK(errno, "getpriority");
> >> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
> >> +
> >>                  /* notify parent signal handler is installed */
> >>                  ASSERT_EQ(write(pipe_c2p[1], buf, 1), 1, "pipe_write");
> >>
> >> @@ -58,6 +71,9 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>                  /* wait for parent notification and exit */
> >>                  ASSERT_EQ(read(pipe_p2c[0], buf, 1), 1, "pipe_read");
> >>
> >> +               /* restore the old priority */
> >> +               ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
> >> +
> >>                  close(pipe_c2p[1]);
> >>                  close(pipe_p2c[0]);
> >>                  exit(0);
> >> @@ -110,11 +126,16 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>                  goto disable_pmu;
> >>          }
> >>
> >> -       ASSERT_EQ(buf[0], '2', "incorrect result");
> >> -
> >>          /* notify child safe to exit */
> >>          ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
> >>
> >> +       if (skel->bss->status == 0 && allow_skip) {
> >> +               printf("%s:SKIP\n", __func__);
> >> +               test__skip();
> >> +       } else if (skel->bss->status != 1) {
> >> +               ASSERT_EQ(buf[0], '2', "incorrect result");
> >> +       }
> >> +
> >>   disable_pmu:
> >>          close(pmu_fd);
> >>   destroy_skel:
> >> @@ -127,7 +148,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
> >>
> >>   static void test_send_signal_tracepoint(bool signal_thread)
> >>   {
> >> -       test_send_signal_common(NULL, signal_thread);
> >> +       test_send_signal_common(NULL, signal_thread, false);
> >>   }
> >>
> >>   static void test_send_signal_perf(bool signal_thread)
> >> @@ -138,7 +159,7 @@ static void test_send_signal_perf(bool signal_thread)
> >>                  .config = PERF_COUNT_SW_CPU_CLOCK,
> >>          };
> >>
> >> -       test_send_signal_common(&attr, signal_thread);
> >> +       test_send_signal_common(&attr, signal_thread, true);
> >>   }
> >>
> >>   static void test_send_signal_nmi(bool signal_thread)
> >> @@ -167,7 +188,7 @@ static void test_send_signal_nmi(bool signal_thread)
> >>                  close(pmu_fd);
> >>          }
> >>
> >> -       test_send_signal_common(&attr, signal_thread);
> >> +       test_send_signal_common(&attr, signal_thread, true);
> >>   }
> >>
> >>   void test_send_signal(void)
> >> diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> >> index b4233d3efac2..59c05c422bbd 100644
> >> --- a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> >> +++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
> >> @@ -18,8 +18,7 @@ static __always_inline int bpf_send_signal_test(void *ctx)
> >>                          ret = bpf_send_signal_thread(sig);
> >>                  else
> >>                          ret = bpf_send_signal(sig);
> >> -               if (ret == 0)
> >> -                       status = 1;
> >> +               status = (ret == 0) ? 1 : 2;
> >
> > This doesn't make sense to me. status == 0 is the default value, it
> > will stay 0 even if nothing is triggered, no BPF program is called,
> > etc.
>
> that is true.
>
> >
> > If we are doing the skipping of the test logic (which I'd honestly
> > just not do right now to see if we actually fixed the test), then I'd
> > set status = 3 for the case when signal was triggered, but the current
> > task is not test_progs. And only skip test if we get status 3. That
> > is, status 0 and status 2 are bad (either not triggered, or some error
> > when sending signal), 1 is OK, 3 is SKIP.
>
> Here, we *assume* bpf program always got called which should be the case
> unless softirq/nmi logic goes wrong. so status = 0 means
> pid doesn't match, and status = 1 means good bpf_send_signal happens,
> status = 2 means bpf_send_signal helper fails.

Sorry, I didn't make my point clear. I meant that test shouldn't just
assume that BPF program ran, so I'd add

if ((bpf_get_current_pid_tgid() >> 32) == pid) {
   ...
} else {
    status = 3;
}

Just to capture that we did get bpf_send_signal_test() called, but we
didn't have correct current.

But it doesn't matter for now, I'd like to see if prio games get us to
stable tests with no skipping first.


>
> >
> > But really, skipping a test that we couldn't randomly run doesn't feel
> > good. Can you please leave the priority boosting part and drop the
> > skipping part for now?
>
> Sure. Let me drop skipping part. With the patch, I am expecting in
> *most* cases, we should not observe flakiness.

Yep, thanks!

>
> >
> >>          }
> >>
> >>          return 0;
> >> --
> >> 2.30.2
> >>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2021-08-17 19:06 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-17 17:19 [PATCH bpf-next 0/2] selftests/bpf: fix flaky send_signal test Yonghong Song
2021-08-17 17:20 ` [PATCH bpf-next 1/2] selftests/bpf: replace CHECK with ASSERT_* macros in send_signal.c Yonghong Song
2021-08-17 17:20 ` [PATCH bpf-next 2/2] selftests/bpf: fix flaky send_signal test Yonghong Song
2021-08-17 18:45   ` Andrii Nakryiko
2021-08-17 19:01     ` Yonghong Song
2021-08-17 19:06       ` Andrii Nakryiko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).