All of lore.kernel.org
 help / color / mirror / Atom feed
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
To: linux-block@vger.kernel.org, io-uring@vger.kernel.org,
	bpf@vger.kernel.org
Cc: ming.lei@redhat.com, axboe@kernel.dk, asml.silence@gmail.com,
	ZiyangZhang@linux.alibaba.com
Subject: [RFC v2 1/4] bpf: add UBLK program type
Date: Wed, 22 Feb 2023 21:25:31 +0800	[thread overview]
Message-ID: <20230222132534.114574-2-xiaoguang.wang@linux.alibaba.com> (raw)
In-Reply-To: <20230222132534.114574-1-xiaoguang.wang@linux.alibaba.com>

Normally, userspace block device implementations need to copy data between
kernel block layer's io requests and userspace block device's userspace
daemon. For example, ublk and tcmu both have similar logic, but this
operation will consume cpu resources obviously, especially for large io.

There are methods trying to reduce these cpu overheads, then userspace
block device's io performance will be improved further. These methods
contain: 1) use special hardware to do memory copy, but seems not all
architectures have these special hardware; 2) software methods, such as
mmap kernel block layer's io requests's data to userspace daemon [1],
but it has page table's map/unmap, tlb flush overhead, security issue,
etc, and it maybe only friendly to large io.

To solve this problem, I'd propose a new method, which will combine the
respective advantages of io_uring and ebpf. Add a new program type
BPF_PROG_TYPE_UBLK for ublk, userspace block device daemon process will
register ebpf progs, which will use bpf helper offered by ublk bpf prog
type to submit io requests on behalf of daemon process in kernel, note
io requests will use kernel block layer io reqeusts's pages to do io,
then the memory copy overhead will be gone.

[1] https://lore.kernel.org/all/20220318095531.15479-1-xiaoguang.wang@linux.alibaba.com/

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
---
 drivers/block/ublk_drv.c       | 23 +++++++++++++++++++++++
 include/linux/bpf_types.h      |  2 ++
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  1 +
 kernel/bpf/verifier.c          |  9 +++++++--
 tools/include/uapi/linux/bpf.h |  1 +
 tools/lib/bpf/libbpf.c         |  1 +
 7 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6368b56eacf1..b628e9eaefa6 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -43,6 +43,8 @@
 #include <asm/page.h>
 #include <linux/task_work.h>
 #include <uapi/linux/ublk_cmd.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
 
 #define UBLK_MINORS		(1U << MINORBITS)
 
@@ -187,6 +189,27 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+static const struct bpf_func_proto *
+ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id);
+}
+
+static bool ublk_bpf_is_valid_access(int off, int size,
+			enum bpf_access_type type,
+			const struct bpf_prog *prog,
+			struct bpf_insn_access_aux *info)
+{
+	return false;
+}
+
+const struct bpf_prog_ops bpf_ublk_prog_ops = {};
+
+const struct bpf_verifier_ops bpf_ublk_verifier_ops = {
+	.get_func_proto		= ublk_bpf_func_proto,
+	.is_valid_access	= ublk_bpf_is_valid_access,
+};
+
 static void ublk_dev_param_basic_apply(struct ublk_device *ub)
 {
 	struct request_queue *q = ub->ub_disk->queue;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d4ee3ccd3753..4ef0bc0251b7 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -79,6 +79,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 #endif
 BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 	      void *, void *)
+BPF_PROG_TYPE(BPF_PROG_TYPE_UBLK, bpf_ublk,
+	      void *, void *)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 464ca3f01fe7..515b7b995b3a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_UBLK,
 };
 
 enum bpf_attach_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ecca9366c7a6..eb1752243f4f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2432,6 +2432,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_UBLK:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
 		return true;
 	case BPF_PROG_TYPE_CGROUP_SKB:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7ee218827259..1e5bc89aea36 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12235,6 +12235,10 @@ static int check_return_code(struct bpf_verifier_env *env)
 		}
 		break;
 
+	case BPF_PROG_TYPE_UBLK:
+		range = tnum_const(0);
+		break;
+
 	case BPF_PROG_TYPE_EXT:
 		/* freplace program can return anything as its return value
 		 * depends on the to-be-replaced kernel func or bpf program.
@@ -16770,8 +16774,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	}
 
 	if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
-	    prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
-		verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
+	    prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE &&
+	    prog->type != BPF_PROG_TYPE_UBLK) {
+		verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe, ublk programs can be sleepable\n");
 		return -EINVAL;
 	}
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 464ca3f01fe7..515b7b995b3a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_UBLK,
 };
 
 enum bpf_attach_type {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2a82f49ce16f..891ae1830ac7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8606,6 +8606,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("cgroup/dev",		CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),
 	SEC_DEF("struct_ops+",		STRUCT_OPS, 0, SEC_NONE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
+	SEC_DEF("ublk.s/",		UBLK, 0, SEC_SLEEPABLE),
 };
 
 static size_t custom_sec_def_cnt;
-- 
2.31.1


  reply	other threads:[~2023-02-22 13:26 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-22 13:25 [RFC v2 0/4] Add io_uring & ebpf based methods to implement zero-copy for ublk Xiaoguang Wang
2023-02-22 13:25 ` Xiaoguang Wang [this message]
2023-02-23  1:21   ` [RFC v2 1/4] bpf: add UBLK program type kernel test robot
2023-02-22 13:25 ` [RFC v2 2/4] io_uring: enable io_uring to submit sqes located in kernel Xiaoguang Wang
2023-02-23  0:39   ` kernel test robot
2023-02-23  1:31   ` kernel test robot
2023-02-22 13:25 ` [RFC v2 3/4] io_uring: introduce IORING_URING_CMD_UNLOCK flag Xiaoguang Wang
2023-02-22 13:25 ` [RFC v2 4/4] ublk_drv: add ebpf support Xiaoguang Wang
2023-02-22 19:25   ` Alexei Starovoitov
2023-02-23 14:01     ` Xiaoguang Wang
2023-02-23  0:59   ` kernel test robot
2023-02-22 13:27 ` [PATCH] Add " Xiaoguang Wang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230222132534.114574-2-xiaoguang.wang@linux.alibaba.com \
    --to=xiaoguang.wang@linux.alibaba.com \
    --cc=ZiyangZhang@linux.alibaba.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=bpf@vger.kernel.org \
    --cc=io-uring@vger.kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.