io-uring.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 00/10] liburing: buffer registration enhancements
@ 2021-01-22 22:54 Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 01/10] liburing: support buffer registration updates Bijan Mottahedeh
                   ` (9 more replies)
  0 siblings, 10 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

v2:

- manpage updates
- additional sharing test cases

This patchset is the liburing changes for buffer registration enhancements.

Patch 1-2 implement the actual liburing changes

Patch 3-4 are the buffer-registration/update tests, copied from
          corresponding file tests and adapted for buffers

Patch 5-7 are the buffer-sharing tests.

Patch 8-10 are the man page changes.

Bijan Mottahedeh (10):
  liburing: support buffer registration updates
  liburing: support buffer registration sharing
  test/buffer-register: add buffer registration test
  test/buffer-update: add buffer registration update test
  test/buffer-share: add buffer registration sharing test
  test/buffer-share: add private memory option
  test/buffer-share: add interruptible deadlock test
  man/io_uring_setup.2: document buffer registration sharing
  man/io_uring_register.2: document buffer registration updates
  man/io_uring_enter.2: document IORING_OP_BUFFERS_UPDATE

 .gitignore                      |    3 +
 man/io_uring_enter.2            |   16 +
 man/io_uring_register.2         |   29 +-
 man/io_uring_setup.2            |   12 +
 src/include/liburing.h          |   12 +
 src/include/liburing/io_uring.h |   11 +
 src/register.c                  |   29 +-
 test/Makefile                   |    7 +
 test/buffer-register.c          |  701 +++++++++++++++++++++
 test/buffer-share.c             | 1282 +++++++++++++++++++++++++++++++++++++++
 test/buffer-update.c            |  165 +++++
 11 files changed, 2260 insertions(+), 7 deletions(-)
 create mode 100644 test/buffer-register.c
 create mode 100644 test/buffer-share.c
 create mode 100644 test/buffer-update.c

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 01/10] liburing: support buffer registration updates
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 02/10] liburing: support buffer registration sharing Bijan Mottahedeh
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 src/include/liburing.h          | 12 ++++++++++++
 src/include/liburing/io_uring.h |  9 +++++++++
 src/register.c                  | 29 +++++++++++++++++++++++++++--
 3 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/src/include/liburing.h b/src/include/liburing.h
index 90403bc..c1e01ab 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -122,6 +122,9 @@ extern int io_uring_register_buffers(struct io_uring *ring,
 					const struct iovec *iovecs,
 					unsigned nr_iovecs);
 extern int io_uring_unregister_buffers(struct io_uring *ring);
+extern int io_uring_register_buffers_update(struct io_uring *ring, unsigned off,
+					struct iovec *iovecs,
+					unsigned nr_iovecs);
 extern int io_uring_register_files(struct io_uring *ring, const int *files,
 					unsigned nr_files);
 extern int io_uring_unregister_files(struct io_uring *ring);
@@ -382,6 +385,15 @@ static inline void io_uring_prep_files_update(struct io_uring_sqe *sqe,
 	io_uring_prep_rw(IORING_OP_FILES_UPDATE, sqe, -1, fds, nr_fds, offset);
 }
 
+static inline void io_uring_prep_buffers_update(struct io_uring_sqe *sqe,
+						struct iovec *iovs,
+						unsigned nr_iovs,
+						int offset)
+{
+	io_uring_prep_rw(IORING_OP_BUFFERS_UPDATE, sqe, -1, iovs, nr_iovs,
+			 offset);
+}
+
 static inline void io_uring_prep_fallocate(struct io_uring_sqe *sqe, int fd,
 					   int mode, off_t offset, off_t len)
 {
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 0bb55b0..1ee9a0f 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -141,6 +141,7 @@ enum {
 	IORING_OP_SHUTDOWN,
 	IORING_OP_RENAMEAT,
 	IORING_OP_UNLINKAT,
+	IORING_OP_BUFFERS_UPDATE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -284,17 +285,25 @@ enum {
 	IORING_UNREGISTER_PERSONALITY		= 10,
 	IORING_REGISTER_RESTRICTIONS		= 11,
 	IORING_REGISTER_ENABLE_RINGS		= 12,
+	IORING_REGISTER_BUFFERS_UPDATE		= 13,
 
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
 
+/* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
 	__u32 offset;
 	__u32 resv;
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+struct io_uring_rsrc_update {
+	__u32 offset;
+	__u32 resv;
+	__aligned_u64 data;
+};
+
 #define IO_URING_OP_SUPPORTED	(1U << 0)
 
 struct io_uring_probe_op {
diff --git a/src/register.c b/src/register.c
index 994aaff..45cf114 100644
--- a/src/register.c
+++ b/src/register.c
@@ -14,6 +14,31 @@
 
 #include "syscall.h"
 
+/*
+ * Register an update for an existing buffer set. The updates will start at
+ * 'off' in the original array, and 'nr_iovecs' is the number of buffers we'll
+ * update.
+ *
+ * Returns number of buffers updated on success, -ERROR on failure.
+ */
+int io_uring_register_buffers_update(struct io_uring *ring, unsigned off,
+				     struct iovec *iovecs, unsigned nr_iovecs)
+{
+	struct io_uring_rsrc_update up = {
+		.offset	= off,
+		.data	= (unsigned long) iovecs,
+	};
+	int ret;
+
+	ret = __sys_io_uring_register(ring->ring_fd,
+					IORING_REGISTER_BUFFERS_UPDATE, &up,
+					nr_iovecs);
+	if (ret < 0)
+		return -errno;
+
+	return ret;
+}
+
 int io_uring_register_buffers(struct io_uring *ring, const struct iovec *iovecs,
 			      unsigned nr_iovecs)
 {
@@ -49,9 +74,9 @@ int io_uring_unregister_buffers(struct io_uring *ring)
 int io_uring_register_files_update(struct io_uring *ring, unsigned off,
 				   int *files, unsigned nr_files)
 {
-	struct io_uring_files_update up = {
+	struct io_uring_rsrc_update up = {
 		.offset	= off,
-		.fds	= (unsigned long) files,
+		.data	= (unsigned long) files,
 	};
 	int ret;
 
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 02/10] liburing: support buffer registration sharing
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 01/10] liburing: support buffer registration updates Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 03/10] test/buffer-register: add buffer registration test Bijan Mottahedeh
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 src/include/liburing/io_uring.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 1ee9a0f..c85ab40 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -102,6 +102,8 @@ enum {
 #define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
+#define IORING_SETUP_SHARE_BUF	(1U << 7)	/* share buffer registration */
+#define IORING_SETUP_ATTACH_BUF	(1U << 8)	/* attach buffer registration */
 
 enum {
 	IORING_OP_NOP,
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 03/10] test/buffer-register: add buffer registration test
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 01/10] liburing: support buffer registration updates Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 02/10] liburing: support buffer registration sharing Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 04/10] test/buffer-update: add buffer registration update test Bijan Mottahedeh
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 .gitignore             |   1 +
 test/Makefile          |   2 +
 test/buffer-register.c | 701 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 704 insertions(+)
 create mode 100644 test/buffer-register.c

diff --git a/.gitignore b/.gitignore
index 360064a..9d30cf7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,7 @@
 /test/across-fork
 /test/b19062a56726-test
 /test/b5837bd5311d-test
+/test/buffer-register
 /test/ce593a6c480a-test
 /test/close-opath
 /test/config.local
diff --git a/test/Makefile b/test/Makefile
index ce5d9e3..4e1529d 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -29,6 +29,7 @@ test_targets += \
 	across-fork splice \
 	b19062a56726-test \
 	b5837bd5311d-test \
+	buffer-register \
 	ce593a6c480a-test \
 	close-opath \
 	connect \
@@ -152,6 +153,7 @@ test_srcs := \
 	across-fork.c \
 	b19062a56726-test.c \
 	b5837bd5311d-test.c \
+	buffer-register.c \
 	ce593a6c480a-test.c \
 	close-opath.c \
 	connect.c \
diff --git a/test/buffer-register.c b/test/buffer-register.c
new file mode 100644
index 0000000..35176a1
--- /dev/null
+++ b/test/buffer-register.c
@@ -0,0 +1,701 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various buffer registration tests
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+
+static int pagesize;
+
+static void free_bufs(struct iovec *iovs, int nr_bufs)
+{
+	int i;
+
+	if (!iovs)
+		return;
+
+	for (i = 0; i < nr_bufs; i++)
+		if (iovs[i].iov_base)
+			free(iovs[i].iov_base);
+
+	free(iovs);
+}
+
+static struct iovec *alloc_bufs(int nr_bufs, int extra)
+{
+	struct iovec *iovs;
+	void *buf;
+	int i;
+
+	iovs = calloc(nr_bufs + extra, sizeof(struct iovec));
+	if (!iovs) {
+		perror("malloc");
+		return NULL;
+	}
+
+	for (i = 0; i < nr_bufs; i++) {
+		buf = malloc(pagesize);
+		if (!buf) {
+			perror("malloc");
+			break;
+		}
+		iovs[i].iov_base = buf;
+		iovs[i].iov_len = pagesize;
+	}
+
+	/* extra buffers already set to zero */
+
+	if (i < nr_bufs) {
+		free_bufs(iovs, nr_bufs);
+		iovs = NULL;
+	}
+
+	return iovs;
+}
+
+static int test_shrink(struct io_uring *ring)
+{
+	int ret, off;
+	struct iovec *iovs, iov = {0};
+
+	iovs = alloc_bufs(50, 0);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 50);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	off = 0;
+	do {
+		ret = io_uring_register_buffers_update(ring, off, &iov, 1);
+		if (ret != 1) {
+			if (off == 50 && ret == -EINVAL)
+				break;
+			fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__,
+				ret);
+			break;
+		}
+		off++;
+	} while (1);
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 50);
+	return 0;
+err:
+	free_bufs(iovs, 50);
+	return 1;
+}
+
+static int test_grow(struct io_uring *ring)
+{
+	int ret, off, i;
+	struct iovec *iovs, *ups = NULL;
+
+	iovs = alloc_bufs(50, 250);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 300);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ups = alloc_bufs(250, 0);
+	if (!ups)
+		goto err;
+	i = 0;
+	off = 50;
+	do {
+		ret = io_uring_register_buffers_update(ring, off, ups + i, 1);
+		if (ret != 1) {
+			if (off == 300 && ret == -EINVAL)
+				break;
+			fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__,
+				ret);
+			break;
+		}
+		if (off >= 300) {
+			fprintf(stderr, "%s: Succeeded beyond end-of-list?\n",
+				__FUNCTION__);
+			goto err;
+		}
+		off++;
+		i++;
+	} while (1);
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 300);
+	free_bufs(ups, 250);
+	return 0;
+err:
+	free_bufs(iovs, 300);
+	free_bufs(ups, 250);
+	return 1;
+}
+
+static int test_replace_all(struct io_uring *ring)
+{
+	struct iovec *iovs, *ups = NULL;
+	int ret;
+
+	iovs = alloc_bufs(100, 0);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 100);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ups = alloc_bufs(0, 100);
+
+	ret = io_uring_register_buffers_update(ring, 0, ups, 100);
+	if (ret != 100) {
+		fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 100);
+	free_bufs(ups, 100);
+	return 0;
+err:
+	free_bufs(iovs, 100);
+	free_bufs(ups, 100);
+	return 1;
+}
+
+static int test_replace(struct io_uring *ring)
+{
+	struct iovec *iovs, *ups = NULL;
+	int ret;
+
+	iovs = alloc_bufs(100, 0);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 100);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ups = alloc_bufs(10, 0);
+	if (!ups)
+		goto err;
+	ret = io_uring_register_buffers_update(ring, 90, ups, 10);
+	if (ret != 10) {
+		fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 100);
+	free_bufs(ups, 10);
+	return 0;
+err:
+	free_bufs(iovs, 100);
+	free_bufs(ups, 10);
+	return 1;
+}
+
+static int test_removals(struct io_uring *ring)
+{
+	struct iovec *iovs, *ups = NULL;
+	int ret;
+
+	iovs = alloc_bufs(100, 0);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 100);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ups = alloc_bufs(0, 10);
+	if (!ups)
+		goto err;
+	ret = io_uring_register_buffers_update(ring, 50, ups, 10);
+	if (ret != 10) {
+		fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 100);
+	free_bufs(ups, 0);
+	return 0;
+err:
+	free_bufs(iovs, 100);
+	free_bufs(ups, 0);
+	return 1;
+}
+
+static int test_additions(struct io_uring *ring)
+{
+	struct iovec *iovs, *ups = NULL;
+	int ret;
+
+	iovs = alloc_bufs(100, 100);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 200);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ups = alloc_bufs(2, 0);
+	if (!ups)
+		goto err;
+	ret = io_uring_register_buffers_update(ring, 100, ups, 2);
+	if (ret != 2) {
+		fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 100);
+	free_bufs(ups, 2);
+	return 0;
+err:
+	free_bufs(iovs, 100);
+	free_bufs(ups, 2);
+	return 1;
+}
+
+static int test_sparse(struct io_uring *ring)
+{
+	struct iovec *iovs;
+	int ret;
+
+	iovs = alloc_bufs(100, 100);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 200);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+	free_bufs(iovs, 100);
+	return 0;
+err:
+	free_bufs(iovs, 100);
+	return 1;
+}
+
+static int test_basic_many(struct io_uring *ring)
+{
+	struct iovec *iovs;
+	int ret;
+
+	iovs = alloc_bufs(1024, 0);
+	ret = io_uring_register_buffers(ring, iovs, 768);
+	if (ret) {
+		fprintf(stderr, "%s: register %d\n", __FUNCTION__, ret);
+		goto err;
+	}
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister %d\n", __FUNCTION__, ret);
+		goto err;
+	}
+	free_bufs(iovs, 1024);
+	return 0;
+err:
+	free_bufs(iovs, 1024);
+	return 1;
+}
+
+static int test_basic(struct io_uring *ring)
+{
+	struct iovec *iovs;
+	int ret;
+
+	iovs = alloc_bufs(100, 0);
+	ret = io_uring_register_buffers(ring, iovs, 100);
+	if (ret) {
+		fprintf(stderr, "%s: register %d\n", __FUNCTION__, ret);
+		goto err;
+	}
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister %d\n", __FUNCTION__, ret);
+		goto err;
+	}
+	free_bufs(iovs, 100);
+	return 0;
+err:
+	free_bufs(iovs, 100);
+	return 1;
+}
+
+/*
+ * Register 10 empty slots, then fill slot 0.  Returns 0 on success, else 1.
+ */
+static int test_zero(struct io_uring *ring)
+{
+	struct iovec *iovs, *ups = NULL;
+	int ret;
+
+	iovs = alloc_bufs(0, 10);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, 10);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ups = alloc_bufs(1, 0);
+	if (!ups)
+		goto err;	/* iovs is already allocated: don't leak it */
+	ret = io_uring_register_buffers_update(ring, 0, ups, 1);
+	if (ret != 1) {
+		fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto err;
+	}
+
+	free_bufs(iovs, 0);
+	free_bufs(ups, 1);
+	return 0;
+err:
+	free_bufs(iovs, 0);
+	free_bufs(ups, 1);
+	return 1;
+}
+
+static int test_fixed_read_write(struct io_uring *ring, int fd,
+				 struct iovec *iovs, int index)
+{
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+	struct iovec *iov;
+	int ret;
+
+	sqe = io_uring_get_sqe(ring);
+	if (!sqe) {
+		fprintf(stderr, "%s: failed to get sqe\n", __FUNCTION__);
+		return 1;
+	}
+	iov = &iovs[index];
+	io_uring_prep_write_fixed(sqe, fd, iov->iov_base, iov->iov_len, 0,
+				  index);
+	sqe->user_data = 1;
+
+	ret = io_uring_submit(ring);
+	if (ret != 1) {
+		fprintf(stderr, "%s: got %d, wanted 1\n", __FUNCTION__, ret);
+		return 1;
+	}
+
+	ret = io_uring_wait_cqe(ring, &cqe);
+	if (ret < 0) {
+		fprintf(stderr, "%s: io_uring_wait_cqe=%d\n", __FUNCTION__,
+			ret);
+		return 1;
+	}
+	if (cqe->res != pagesize) {
+		fprintf(stderr, "%s: write cqe->res=%d\n", __FUNCTION__,
+			cqe->res);
+		return 1;
+	}
+	io_uring_cqe_seen(ring, cqe);
+
+	sqe = io_uring_get_sqe(ring);
+	if (!sqe) {
+		fprintf(stderr, "%s: failed to get sqe\n", __FUNCTION__);
+		return 1;
+	}
+	iov = &iovs[index + 1];
+	io_uring_prep_read_fixed(sqe, fd, iov->iov_base, iov->iov_len, 0,
+				  index + 1);
+	sqe->user_data = 2;
+
+	ret = io_uring_submit(ring);
+	if (ret != 1) {
+		fprintf(stderr, "%s: got %d, wanted 1\n", __FUNCTION__, ret);
+		return 1;
+	}
+
+	ret = io_uring_wait_cqe(ring, &cqe);
+	if (ret < 0) {
+		fprintf(stderr, "%s: io_uring_wait_cqe=%d\n", __FUNCTION__,
+			ret);
+		return 1;
+	}
+	if (cqe->res != pagesize) {
+		fprintf(stderr, "%s: read cqe->res=%d\n", __FUNCTION__,
+			cqe->res);
+		return 1;
+	}
+	io_uring_cqe_seen(ring, cqe);
+
+	return 0;
+}
+
+/*
+ * Register UIO_MAXIOV sparse buffer slots, install two real buffers at
+ * slots 768/769, then write from one and read into the other through a
+ * scratch file and compare.  Returns 0 on success, 1 on failure.
+ */
+static int test_huge(struct io_uring *ring)
+{
+	struct iovec *iovs, *ups = NULL;
+	int ret, fd = -1, failed = 1;
+
+	iovs = alloc_bufs(0, UIO_MAXIOV);
+	if (!iovs)
+		return 1;
+	ret = io_uring_register_buffers(ring, iovs, UIO_MAXIOV);
+	if (ret) {
+		fprintf(stderr, "%s: register ret=%d\n", __FUNCTION__, ret);
+		goto out;
+	}
+	fd = open(".reg.768", O_RDWR | O_CREAT, 0644);
+	if (fd < 0) {
+		fprintf(stderr, "%s: open=%d\n", __FUNCTION__, errno);
+		goto out;
+	}
+
+	ups = alloc_bufs(2, 0);
+	if (!ups) {
+		fprintf(stderr, "%s: malloc=%d\n", __FUNCTION__, errno);
+		goto out;
+	}
+	memset(ups[0].iov_base, 0x5a, pagesize);
+
+	ret = io_uring_register_buffers_update(ring, 768, ups, 2);
+	if (ret != 2) {
+		fprintf(stderr, "%s: update ret=%d\n", __FUNCTION__, ret);
+		goto out;
+	}
+
+	iovs[768].iov_base = ups[0].iov_base;
+	iovs[768].iov_len = pagesize;
+	iovs[769].iov_base = ups[1].iov_base;
+	iovs[769].iov_len = pagesize;
+
+	if (test_fixed_read_write(ring, fd, iovs, 768))
+		goto out;
+
+	if (memcmp(ups[0].iov_base, ups[1].iov_base, pagesize)) {
+		fprintf(stderr, "%s: data mismatch\n", __FUNCTION__);
+		goto out;
+	}
+
+	ret = io_uring_unregister_buffers(ring);
+	if (ret) {
+		fprintf(stderr, "%s: unregister ret=%d\n", __FUNCTION__, ret);
+		goto out;
+	}
+
+	failed = 0;
+out:
+	/* shared exit: fd is still -1 if we bailed out before open() */
+	if (fd != -1) {
+		close(fd);
+		unlink(".reg.768");
+	}
+	free_bufs(iovs, 0);
+	free_bufs(ups, 2);
+	return failed;
+}
+
+static int test_sparse_updates(void)
+{
+	struct iovec *iovs, *ups = NULL;
+	struct io_uring ring;
+	int ret, i;
+
+	ret = io_uring_queue_init(8, &ring, 0);
+	if (ret) {
+		fprintf(stderr, "queue_init: %d\n", ret);
+		return ret;
+	}
+
+	iovs = alloc_bufs(0, 256);
+
+	ret = io_uring_register_buffers(&ring, iovs, 256);
+	if (ret) {
+		fprintf(stderr, "buffer_register: %d\n", ret);
+		return ret;
+	}
+
+	ups = alloc_bufs(1, 0);
+	for (i = 0; i < 256; i++) {
+		ret = io_uring_register_buffers_update(&ring, i, ups, 1);
+		if (ret != 1) {
+			fprintf(stderr, "buffer_update: %d\n", ret);
+			return ret;
+		}
+	}
+	io_uring_unregister_buffers(&ring);
+
+	for (i = 0; i < 256; i++)
+		iovs[i] = ups[0];
+
+	ret = io_uring_register_buffers(&ring, iovs, 256);
+	if (ret) {
+		fprintf(stderr, "buffer_register: %d\n", ret);
+		return ret;
+	}
+
+	free_bufs(ups, 1);
+	ups = alloc_bufs(0, 1);
+	for (i = 0; i < 256; i++) {
+		ret = io_uring_register_buffers_update(&ring, i, ups, 1);
+		if (ret != 1) {
+			fprintf(stderr, "buffer_update: %d\n", ret);
+			goto done;
+		}
+	}
+	ret = 0;
+done:
+	free_bufs(ups, 0);
+	io_uring_unregister_buffers(&ring);
+
+	io_uring_queue_exit(&ring);
+	return ret;
+}
+
+int main(int argc, char *argv[])
+{
+	struct io_uring ring;
+	int ret;
+
+	if (argc > 1)
+		return 0;
+
+	pagesize = getpagesize();
+
+	ret = io_uring_queue_init(8, &ring, 0);
+	if (ret) {
+		printf("ring setup failed\n");
+		return 1;
+	}
+
+	ret = test_basic(&ring);
+	if (ret) {
+		printf("test_basic failed\n");
+		return ret;
+	}
+
+	ret = test_basic_many(&ring);
+	if (ret) {
+		printf("test_basic_many failed\n");
+		return ret;
+	}
+
+	ret = test_sparse(&ring);
+	if (ret) {
+		printf("test_sparse failed\n");
+		return ret;
+	}
+
+	ret = test_additions(&ring);
+	if (ret) {
+		printf("test_additions failed\n");
+		return ret;
+	}
+
+	ret = test_removals(&ring);
+	if (ret) {
+		printf("test_removals failed\n");
+		return ret;
+	}
+
+	ret = test_replace(&ring);
+	if (ret) {
+		printf("test_replace failed\n");
+		return ret;
+	}
+
+	ret = test_replace_all(&ring);
+	if (ret) {
+		printf("test_replace_all failed\n");
+		return ret;
+	}
+
+	ret = test_grow(&ring);
+	if (ret) {
+		printf("test_grow failed\n");
+		return ret;
+	}
+
+	ret = test_shrink(&ring);
+	if (ret) {
+		printf("test_shrink failed\n");
+		return ret;
+	}
+
+	ret = test_zero(&ring);
+	if (ret) {
+		printf("test_zero failed\n");
+		return ret;
+	}
+
+	ret = test_huge(&ring);
+	if (ret) {
+		printf("test_huge failed\n");
+		return ret;
+	}
+
+	ret = test_sparse_updates();
+	if (ret) {
+		printf("test_sparse_updates failed\n");
+		return ret;
+	}
+
+	return 0;
+}
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 04/10] test/buffer-update: add buffer registration update test
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (2 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 03/10] test/buffer-register: add buffer registration test Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 05/10] test/buffer-share: add buffer registration sharing test Bijan Mottahedeh
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 .gitignore           |   1 +
 test/Makefile        |   2 +
 test/buffer-update.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 168 insertions(+)
 create mode 100644 test/buffer-update.c

diff --git a/.gitignore b/.gitignore
index 9d30cf7..b44560e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@
 /test/b19062a56726-test
 /test/b5837bd5311d-test
 /test/buffer-register
+/test/buffer-update
 /test/ce593a6c480a-test
 /test/close-opath
 /test/config.local
diff --git a/test/Makefile b/test/Makefile
index 4e1529d..79e3e00 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -30,6 +30,7 @@ test_targets += \
 	b19062a56726-test \
 	b5837bd5311d-test \
 	buffer-register \
+	buffer-update \
 	ce593a6c480a-test \
 	close-opath \
 	connect \
@@ -154,6 +155,7 @@ test_srcs := \
 	b19062a56726-test.c \
 	b5837bd5311d-test.c \
 	buffer-register.c \
+	buffer-update.c \
 	ce593a6c480a-test.c \
 	close-opath.c \
 	connect.c \
diff --git a/test/buffer-update.c b/test/buffer-update.c
new file mode 100644
index 0000000..5f926aa
--- /dev/null
+++ b/test/buffer-update.c
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: run various buffer registration update tests
+ *
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include "liburing.h"
+
+#define NBUFS	10
+
+static int pagesize;
+
+/* Free nr_bufs buffers and the iovec array itself; a NULL iovs is a no-op. */
+static void free_bufs(struct iovec *iovs, int nr_bufs)
+{
+	int i;
+
+	for (i = 0; iovs && i < nr_bufs; i++)
+		free(iovs[i].iov_base);
+	free(iovs);
+}
+
+/* Allocate nr_bufs pagesize-byte buffers; NULL on OOM, with nothing leaked. */
+static struct iovec *alloc_bufs(int nr_bufs)
+{
+	struct iovec *iovs;
+	int i;
+
+	iovs = calloc(nr_bufs, sizeof(struct iovec));
+	if (!iovs) {
+		perror("malloc");
+		return NULL;
+	}
+
+	for (i = 0; i < nr_bufs; i++) {
+		iovs[i].iov_base = malloc(pagesize);
+		if (!iovs[i].iov_base) {
+			perror("malloc");
+			free_bufs(iovs, i);
+			return NULL;
+		}
+		iovs[i].iov_len = pagesize;
+	}
+	return iovs;
+}
+
+static int test_update_multiring(struct io_uring *r1, struct io_uring *r2,
+				 struct io_uring *r3, int do_unreg)
+{
+	struct iovec *iovs, *newiovs;
+
+	iovs = alloc_bufs(NBUFS);
+	newiovs = alloc_bufs(NBUFS);
+
+	if (io_uring_register_buffers(r1, iovs, NBUFS) ||
+	    io_uring_register_buffers(r2, iovs, NBUFS) ||
+	    io_uring_register_buffers(r3, iovs, NBUFS)) {
+		fprintf(stderr, "%s: register buffers failed\n", __FUNCTION__);
+		goto err;
+	}
+
+	if (io_uring_register_buffers_update(r1, 0, newiovs, NBUFS) != NBUFS ||
+	    io_uring_register_buffers_update(r2, 0, newiovs, NBUFS) != NBUFS ||
+	    io_uring_register_buffers_update(r3, 0, newiovs, NBUFS) != NBUFS) {
+		fprintf(stderr, "%s: update buffers failed\n", __FUNCTION__);
+		goto err;
+	}
+
+	if (!do_unreg)
+		goto done;
+
+	if (io_uring_unregister_buffers(r1) ||
+	    io_uring_unregister_buffers(r2) ||
+	    io_uring_unregister_buffers(r3)) {
+		fprintf(stderr, "%s: unregister buffers failed\n",
+			__FUNCTION__);
+		goto err;
+	}
+
+done:
+	free_bufs(iovs, NBUFS);
+	free_bufs(newiovs, NBUFS);
+	return 0;
+err:
+	free_bufs(iovs, NBUFS);
+	free_bufs(newiovs, NBUFS);
+	return 1;
+}
+
+/* Send NBUFS zeroed iovecs via an update SQE; 0 if ok/unsupported, else 1 */
+static int test_sqe_update(struct io_uring *ring)
+{
+	/* static: zero-filled, nothing to free, outlives any early return */
+	static struct iovec iovs[NBUFS];
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+	int ret;
+
+	sqe = io_uring_get_sqe(ring);
+	io_uring_prep_buffers_update(sqe, iovs, NBUFS, 0);
+	ret = io_uring_submit(ring);
+	if (ret != 1) {
+		fprintf(stderr, "submit: %d\n", ret);
+		return 1;
+	}
+
+	ret = io_uring_wait_cqe(ring, &cqe);
+	if (ret) {
+		fprintf(stderr, "wait: %d\n", ret);
+		return 1;
+	}
+
+	ret = cqe->res;
+	io_uring_cqe_seen(ring, cqe);
+	if (ret == -EINVAL) {
+		fprintf(stdout, "IORING_OP_BUFFERS_UPDATE not supported" \
+			", skipping\n");
+		return 0;
+	}
+	return ret != NBUFS;
+}
+
+int main(int argc, char *argv[])
+{
+	struct io_uring r1, r2, r3;
+	int ret;
+
+	if (argc > 1)
+		return 0;
+
+	pagesize = getpagesize();
+
+	if (io_uring_queue_init(8, &r1, 0) ||
+	    io_uring_queue_init(8, &r2, 0) ||
+	    io_uring_queue_init(8, &r3, 0)) {
+		fprintf(stderr, "ring setup failed\n");
+		return 1;
+	}
+
+	ret = test_update_multiring(&r1, &r2, &r3, 1);
+	if (ret) {
+		fprintf(stderr, "test_update_multiring w/unreg\n");
+		return ret;
+	}
+
+	ret = test_update_multiring(&r1, &r2, &r3, 0);
+	if (ret) {
+		fprintf(stderr, "test_update_multiring wo/unreg\n");
+		return ret;
+	}
+
+	ret = test_sqe_update(&r1);
+	if (ret) {
+		fprintf(stderr, "test_sqe_update failed\n");
+		return ret;
+	}
+
+	return 0;
+}
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 05/10] test/buffer-share: add buffer registration sharing test
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (3 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 04/10] test/buffer-update: add buffer registration update test Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 06/10] test/buffer-share: add private memory option Bijan Mottahedeh
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 .gitignore          |    1 +
 test/Makefile       |    3 +
 test/buffer-share.c | 1184 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1188 insertions(+)
 create mode 100644 test/buffer-share.c

diff --git a/.gitignore b/.gitignore
index b44560e..4e09f1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@
 /test/b19062a56726-test
 /test/b5837bd5311d-test
 /test/buffer-register
+/test/buffer-share
 /test/buffer-update
 /test/ce593a6c480a-test
 /test/close-opath
diff --git a/test/Makefile b/test/Makefile
index 79e3e00..c9db730 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -30,6 +30,7 @@ test_targets += \
 	b19062a56726-test \
 	b5837bd5311d-test \
 	buffer-register \
+	buffer-share \
 	buffer-update \
 	ce593a6c480a-test \
 	close-opath \
@@ -155,6 +156,7 @@ test_srcs := \
 	b19062a56726-test.c \
 	b5837bd5311d-test.c \
 	buffer-register.c \
+	buffer-share.c \
 	buffer-update.c \
 	ce593a6c480a-test.c \
 	close-opath.c \
@@ -255,6 +257,7 @@ wakeup-hang: XCFLAGS = -lpthread
 pipe-eof: XCFLAGS = -lpthread
 timeout-new: XCFLAGS = -lpthread
 thread-exit: XCFLAGS = -lpthread
+buffer-share: XCFLAGS = -lrt
 
 install: $(test_targets) runtests.sh runtests-loop.sh
 	$(INSTALL) -D -d -m 755 $(datadir)/liburing-test/
diff --git a/test/buffer-share.c b/test/buffer-share.c
new file mode 100644
index 0000000..81e0537
--- /dev/null
+++ b/test/buffer-share.c
@@ -0,0 +1,1184 @@
+/* SPDX-License-Identifier: MIT */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <linux/mman.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/fs.h>
+
+#include "liburing.h"
+
+/*
+ * The main (grandparent) process creates a shared memory status segment for
+ * all tests and then creates a parent process which drives test execution.
+ * The grandparent then waits to reap the status of grandchildren test
+ * processes, for the final test case which involves running IO tests after
+ * the parent exits.
+ * 
+ * The parent process creates a shared memory data segment where all IO
+ * buffers reside. The parent then runs all test cases.  For each test
+ * the parent creates the specified number of test processes, and waits
+ * for the children to execute the actual tests.  The parent controls
+ * children's execution with ptrace().
+ *
+ * The tests are organized as follows:
+ *
+ * test           ring        registration   test       expected
+ * category       sharing     type           case       result
+ * --------       -------     ---            ----       ------
+ * registration   none        none           various    pass
+ * IO             fd based    register/pin              failure errno
+ *                            attach
+ *
+ * The IO test has been adapted largely unchanged from test/read-write.c.
+ */
+
+#define MAXID		128	/* max number of tester processes */
+int pids[MAXID];
+int status[MAXID];
+int nids = 1;
+
+#define SHM_ST_SZ	(MAXID * 4)
+#define SHM_ST_NONE	-1
+
+#define FILE_SIZE	(128 * 1024)
+#define BS		4096
+#define BUFFERS		(FILE_SIZE / BS)
+
+#define	SZ_2M		0x200000
+
+#define SQ_THREAD_IDLE  2000
+
+char *shmbuf;
+char *shmstat;
+
+struct iovec vecs[UIO_MAXIOV];
+unsigned long buf_cnt = BUFFERS;
+unsigned long shm_sz = SZ_2M;
+
+int no_read;
+int warned;
+int verbose = 0;
+
+enum {
+	REG_NOP = 0,
+	REG_PIN,
+	REG_ATT,
+};
+
+char *r2s[] = {
+	"nop",
+	"pin",
+	"att",
+};
+
+enum {
+	TEST_DFLT = 0,
+	TEST_INVFD,
+	TEST_BADFD,
+	TEST_UNREG,
+	TEST_BUSY,
+	TEST_EXIT,
+	TEST_FIN,
+};
+
+char *t2s[] = {
+	"dflt",
+	"invfd",
+	"badfd",
+	"unreg",
+	"busy",
+	"exit",
+	"fin",
+};
+
+enum {
+	TEST_REG = 0,
+	TEST_IO,
+};
+
+char *i2s[] = {
+	"reg",
+	"io",
+};
+
+enum {
+	NO,
+	FD,
+};
+
+char *s2s[] = {
+	"nos",
+	"fd",
+};
+
+enum {
+	PASS,
+	FAIL,
+	NONE,
+};
+
+enum {
+	V0 = 0,
+	V1,
+	V2,
+	V3,
+	V4,
+};
+
+struct test_desc {
+	int id;
+	int idx;
+	int fd;
+	int sp[2];
+	int share;
+	int reg;
+	int preg;
+	int test;
+	int expect;
+};
+struct test_desc tdcs[MAXID];
+
+static int test_reg(struct test_desc *);
+static int test_io(struct test_desc *);
+
+int (*test_fn[])(struct test_desc *) = {
+	test_reg,
+	test_io,
+};
+
+static void
+vinfo(int level, char *fmt, ...)
+{
+	va_list args;
+
+	if (verbose < level)
+		return;
+
+	fprintf(stderr, "%-5d ", getpid());
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+	fflush(stderr);
+}
+
+static void
+verror(char *msg)
+{
+        if (!verbose)
+                return;
+
+        fprintf(stderr, "%-5d %s: %s\n", getpid(), msg, strerror(errno));
+        fflush(stderr);
+}
+
+static void
+send_fd(int socket, int fd)
+{
+	char buf[CMSG_SPACE(sizeof(fd))];
+	struct cmsghdr *cmsg;
+	struct msghdr msg;
+
+	memset(buf, 0, sizeof(buf));
+	memset(&msg, 0, sizeof(msg));
+
+	msg.msg_control = buf;
+	msg.msg_controllen = sizeof(buf);
+
+	cmsg = CMSG_FIRSTHDR(&msg);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
+
+	memmove(CMSG_DATA(cmsg), &fd, sizeof(fd));
+
+	msg.msg_controllen = CMSG_SPACE(sizeof(fd));
+
+	if (sendmsg(socket, &msg, 0) < 0) {
+		verror("sendmsg");
+		exit(1);
+	}
+}
+
+static int
+recv_fd(int socket)
+{
+	int *fdp, fd = -1, ret;
+	char buf[CMSG_SPACE(sizeof(fd))];
+	struct cmsghdr *cmsg;
+	struct msghdr msg;
+
+	memset(buf, 0, sizeof(buf));
+	memset(&msg, 0, sizeof(msg));
+
+	msg.msg_control = buf;
+	msg.msg_controllen = sizeof(buf);
+
+	ret = recvmsg(socket, &msg, 0);
+	if (ret < 0) {
+		verror("recvmsg");
+		exit(1);
+	}
+
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
+	     cmsg = CMSG_NXTHDR(&msg,cmsg)) {
+		if (cmsg->cmsg_level == SOL_SOCKET &&
+		    cmsg->cmsg_type == SCM_RIGHTS) {
+        		fdp = (int *)CMSG_DATA(cmsg);
+        		fd = *fdp;
+    		}
+	}
+
+	return fd;
+}
+
+static int
+shm_create(void)
+{
+	int id;
+
+	id = shmget(2, shm_sz, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+	if (id < 0) {
+		fprintf(stderr, "Unable to map a huge page. "
+			"Increase /proc/sys/vm/nr_hugepages by at least 1.\n");
+		verror("shmget");
+		exit(1);
+	}
+	vinfo(V3, "shm_create shmid: 0x%x\n", id);
+
+	return id;
+}
+
+static char *
+shm_attach(int id)
+{
+	char *addr;
+
+	addr = shmat(id, 0, 0);
+	if (addr == MAP_FAILED) {
+		verror("shmat");
+		shmctl(id, IPC_RMID, NULL);
+	}
+	vinfo(V3, "shm_attach shmid: 0x%x shmbuf: %p\n", id, addr);
+
+	return addr;
+}
+
+static void
+shm_destroy(int id, char *addr)
+{
+	int ret;
+
+	vinfo(V3, "shm_destroy shmid: 0x%x\n", id);
+
+	ret = shmdt((const void *)addr);
+	if (ret)
+		verror("shmdt");
+
+	shmctl(id, IPC_RMID, NULL);
+
+	if (ret)
+		exit(1);
+}
+
+static void *
+shmstat_create(void)
+{
+	int res;
+	int fd;
+	char fname[32];
+	void *addr = NULL;
+
+	snprintf(fname, sizeof(fname), "shmstat.%d", getpid());
+
+	/* get shared memory file descriptor (NOT a file) */
+	fd = shm_open(fname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
+	if (fd == -1) {
+		verror("open");
+		goto done;
+	}
+
+	/* extend shared memory object as by default it's set to size 0 */
+	res = ftruncate(fd, SHM_ST_SZ);
+	if (res == -1) {
+		verror("ftruncate");
+		goto done;
+	}
+
+	/* map shared memory to process address space */
+	addr = mmap(NULL, SHM_ST_SZ, PROT_WRITE, MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED) {
+		verror("mmap");
+		goto done;
+	}
+	memset(addr, 0, SHM_ST_SZ);
+done:
+	return addr;
+}
+
+static void
+shmstat_store(struct test_desc *tdc, int data)
+{
+	vinfo(V4, "stat_store %i %d\n", tdc->idx, data);
+	shmstat[tdc->idx] = data;
+}
+
+static int
+shmstat_check(int expect)
+{
+	int i;
+
+	for (i = 0; i < nids; i++) {
+		if (shmstat[i] != expect) {
+			vinfo(V3, "check_shmstat child %d: expect %d got %d\n",
+			      pids[i], expect, shmstat[i]);
+			return shmstat[i];
+		}
+	}
+
+	return 0;
+}
+
+static int
+queue_init(struct test_desc *tdc, int qd, struct io_uring *ring,
+	   struct io_uring_params *p, int reg)
+{
+	int ret;
+
+	if (tdc->share == FD) {
+		if (reg == REG_ATT) {
+			p->flags |= IORING_SETUP_ATTACH_BUF;
+			if (tdc->test == TEST_BADFD)
+				p->wq_fd = -1;
+			else if (tdc->test == TEST_INVFD)
+				p->wq_fd = 0;
+			else
+				p->wq_fd = tdc->fd;
+		} else
+			p->flags |= IORING_SETUP_SHARE_BUF;
+	}
+
+	ret = io_uring_queue_init_params(qd, ring, p);
+	if (ret)
+		verror("queue_init");
+	vinfo(V2, "queue_init %d\n", ring->ring_fd);
+
+	return ret;
+}
+
+static void
+queue_exit(struct io_uring *ring)
+{
+	vinfo(V2, "queue_exit %d\n", ring->ring_fd);
+	io_uring_queue_exit(ring);
+}
+
+static void
+create_buffers(char *adr)
+{
+	int i;
+
+	if (buf_cnt > UIO_MAXIOV) {
+		fprintf(stderr, "invalid buffer count: %ld\n", buf_cnt);
+		exit(1);
+	}
+
+	for (i = 0; i < buf_cnt; i++) {
+		vecs[i].iov_base = adr;
+		vecs[i].iov_len = BS;
+		adr += BS;
+	}
+}
+
+static int
+register_buffers(struct test_desc *tdc, struct io_uring *ring)
+{
+	if (tdc->reg == REG_ATT)
+		return io_uring_register_buffers(ring, 0, 0);
+
+	return io_uring_register_buffers(ring, vecs, buf_cnt);
+}
+
+static void
+stop(void)
+{
+	vinfo(V2, "raise SIGSTOP\n");
+	raise(SIGSTOP);
+	vinfo(V2, "resume SIGSTOP\n");
+}
+
+static int
+test_reg(struct test_desc *tdc)
+{
+	int ret;
+	struct io_uring_params p = {0};
+	struct io_uring ring;
+
+	ret = queue_init(tdc, 1, &ring, &p, tdc->reg);
+	if (ret)
+		return ret;
+
+	ret = register_buffers(tdc, &ring);
+	if (ret)
+		verror("register test_reg");
+
+	if (tdc->test == TEST_BUSY) {
+		vinfo(V2, "delay\n");
+		sleep(5);
+	} else if (tdc->test == TEST_EXIT) {
+		stop();
+	}
+
+	queue_exit(&ring);
+
+	return ret;
+}
+
+static int
+create_file(const char *file)
+{
+	ssize_t ret;
+	char *buf;
+	int fd;
+
+	buf = malloc(FILE_SIZE);
+	memset(buf, 0xaa, FILE_SIZE);
+
+	fd = open(file, O_WRONLY | O_CREAT, 0644);
+	if (fd < 0) {
+		verror("open");
+		return 1;
+	}
+	ret = write(fd, buf, FILE_SIZE);
+	close(fd);
+	return ret != FILE_SIZE;
+}
+
+static int
+do_io(struct test_desc *tdc, const char *file,
+      struct io_uring *ring, int write, int buffered,
+      int sqthread, int fixed, int nonvec, int seq, int exp_len)
+{
+	struct io_uring_sqe *sqe;
+	struct io_uring_cqe *cqe;
+	int open_flags;
+	int i, fd, ret = 0;
+	off_t offset;
+
+	vinfo(V1, "%s: %d/%d/%d/%d/%d\n",
+	     __FUNCTION__, write, buffered, sqthread, fixed, nonvec);
+
+	if (sqthread && geteuid()) {
+		vinfo(V1, "SKIPPED (not root)\n");
+		return 0;
+	}
+
+	if (write)
+		open_flags = O_WRONLY;
+	else
+		open_flags = O_RDONLY;
+	if (!buffered)
+		open_flags |= O_DIRECT;
+
+	fd = open(file, open_flags);
+	if (fd < 0) {
+		verror("open");
+		goto err;
+	}
+
+	if (fixed) {
+		ret = register_buffers(tdc, ring);
+		if (ret) {
+			vinfo(V1, "buffer reg failed: %d\n", ret);
+			goto err;
+		}
+	}
+	if (fixed || sqthread) {
+		ret = io_uring_register_files(ring, &fd, 1);
+		if (ret) {
+			vinfo(V1, "file reg failed: %d\n", ret);
+			goto err;
+		}
+	}
+
+	offset = 0;
+	for (i = 0; i < BUFFERS; i++) {
+		sqe = io_uring_get_sqe(ring);
+		if (!sqe) {
+			vinfo(V1, "sqe get failed\n");
+			goto err;
+		}
+		if (!seq)
+			offset = BS * (rand() % BUFFERS);
+		if (write) {
+			int use_fd = fd;
+
+			if (fixed || sqthread)
+				use_fd = 0;
+
+			if (fixed) {
+				io_uring_prep_write_fixed(sqe, use_fd,
+							  vecs[i].iov_base,
+							  vecs[i].iov_len,
+							  offset, i);
+			} else if (nonvec) {
+				io_uring_prep_write(sqe, use_fd,
+						    vecs[i].iov_base,
+						    vecs[i].iov_len, offset);
+			} else {
+				io_uring_prep_writev(sqe, use_fd, &vecs[i], 1,
+						     offset);
+			}
+		} else {
+			int use_fd = fd;
+
+			if (fixed || sqthread)
+				use_fd = 0;
+
+			if (fixed) {
+				io_uring_prep_read_fixed(sqe, use_fd,
+							 vecs[i].iov_base,
+							 vecs[i].iov_len,
+							 offset, i);
+			} else if (nonvec) {
+				io_uring_prep_read(sqe, use_fd,
+						   vecs[i].iov_base,
+						   vecs[i].iov_len, offset);
+			} else {
+				io_uring_prep_readv(sqe, use_fd, &vecs[i], 1,
+						    offset);
+			}
+
+		}
+		if (fixed || sqthread)
+			sqe->flags |= IOSQE_FIXED_FILE;
+		if (seq)
+			offset += BS;
+	}
+
+	ret = io_uring_submit(ring);
+	if (ret != BUFFERS) {
+		vinfo(V1, "submit got %d, wanted %d\n", ret, BUFFERS);
+		goto err;
+	}
+
+	for (i = 0; i < BUFFERS; i++) {
+		ret = io_uring_wait_cqe(ring, &cqe);
+		if (ret) {
+			vinfo(V1, "wait_cqe %d\n", ret);
+			goto err;
+		}
+		if (cqe->res == -EINVAL && nonvec) {
+			if (!warned) {
+				vinfo(V1, "Non-vectored IO not "
+				      "supported, skipping\n");
+				warned = 1;
+				no_read = 1;
+			}
+		} else if (cqe->res != exp_len) {
+			vinfo(V1, "cqe res %d, wanted %d\n", cqe->res, exp_len);
+			ret = cqe->res;
+			goto err;
+		}
+		io_uring_cqe_seen(ring, cqe);
+	}
+
+	if (fixed) {
+		ret = io_uring_unregister_buffers(ring);
+		if (ret) {
+			vinfo(V1, "buffer unreg failed: %d\n", ret);
+			goto err;
+		}
+	}
+	if (fixed || sqthread) {
+		ret = io_uring_unregister_files(ring);
+		if (ret) {
+			vinfo(V1, "file unreg failed: %d\n", ret);
+			goto err;
+		}
+	}
+
+	close(fd);
+	vinfo(V1, "%s: %d/%d/%d/%d/%d: PASS\n",
+	     __FUNCTION__, write, buffered, sqthread, fixed, nonvec);
+	return 0;
+err:
+	(void)io_uring_unregister_buffers(ring);
+	if (fd != -1)
+		close(fd);
+	return ret;
+}
+
+static int
+run_test_io(struct test_desc *tdc, const char *file,
+	      int write, int buffered, int sqthread, int fixed, int nonvec)
+{
+	struct io_uring_params p = {0};
+	struct io_uring ring;
+	int share, ret;
+
+	share = tdc->share;
+
+	if (!fixed)
+		tdc->share = NO;
+
+	if (sqthread) {
+		if (geteuid()) {
+			if (!warned) {
+				fprintf(stderr,
+					"SQPOLL requires root, skipping\n");
+				warned = 1;
+			}
+			return 0;
+		}
+		p.flags = IORING_SETUP_SQPOLL;
+		p.sq_thread_idle = nids * SQ_THREAD_IDLE;
+	}
+	ret = queue_init(tdc, 64, &ring, &p, tdc->reg);
+
+	if (ret)
+		goto done;
+
+	ret = do_io(tdc, file, &ring, write, buffered, sqthread, fixed,
+		    nonvec, 0, BS);
+
+	queue_exit(&ring);
+done:
+	tdc->share = share;
+	return ret;
+}
+
+
+static int
+has_nonvec_read(void)
+{
+	struct io_uring_probe *p;
+	struct io_uring ring;
+	int ret;
+
+	ret = io_uring_queue_init(1, &ring, 0);
+	vinfo(V1, "io_uring_queue_init %d\n", ring.ring_fd);
+	if (ret) {
+		verror("io_uring_queue_init");
+		exit(ret);
+	}
+
+	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
+	ret = io_uring_register_probe(&ring, p, 256);
+	/* if we don't have PROBE_REGISTER, we don't have OP_READ/WRITE */
+	if (ret == -EINVAL) {
+out:
+		queue_exit(&ring);
+		return 0;
+	} else if (ret) {
+		fprintf(stderr, "register_probe: %d\n", ret);
+		goto out;
+	}
+
+	if (p->ops_len <= IORING_OP_READ)
+		goto out;
+	if (!(p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED))
+		goto out;
+	queue_exit(&ring);
+	return 1;
+}
+
+static int
+test_io(struct test_desc *tdc)
+{
+	int i, ret, nr;
+	char fname[32];
+
+	snprintf(fname, sizeof(fname), ".basic-rw.%d", getpid());
+	if (create_file(fname)) {
+		fprintf(stderr, "file creation failed\n");
+		ret = 1;
+		goto err;
+	}
+
+	if (tdc->test == TEST_EXIT)
+		stop();
+
+	/* if we don't have nonvec read, skip testing that */
+	if (has_nonvec_read())
+		nr = 64;
+	else
+		nr = 32;
+
+	for (i = 0; i < nr; i++) {
+		int v1, v2, v3, v4, v5;
+
+		v1 = (i & 1) != 0;
+		v2 = (i & 2) != 0;
+		v3 = (i & 4) != 0;
+		v4 = (i & 8) != 0;
+		v5 = (i & 16) != 0;
+		ret = run_test_io(tdc, fname, v1, v2, v3, v4, v5);
+		if (ret) {
+			vinfo(V1, "test_io failed %d/%d/%d/%d/%d\n",
+			      v1, v2, v3, v4, v5);
+			goto err;
+		}
+	}
+err:
+	unlink(fname);
+	return ret;
+}
+
+static int
+get_socketpair(int sp[])
+{
+	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sp) != 0) {
+		verror("socketpair\n");
+		return errno;
+	}
+	return 0;
+}
+
+static void
+waitstoppid(int pid)
+{
+	int ret;
+	int status;
+
+	while (1) {
+		vinfo(V2, "waitstop %d\n", pid);
+
+		ret = waitpid(pid, &status, WNOHANG | WUNTRACED | WCONTINUED);
+		if (ret != pid) {
+			sleep(1);
+			continue;
+		}
+		if (WIFSTOPPED(status))
+			return;
+
+		break;
+	}
+
+	vinfo(V2, "pid %d was not stopped!\n", pid);
+	exit(1);
+}
+
+static void
+waitstop(void)
+{
+	int i;
+
+	for (i = 0; i < nids; i++)
+		waitstoppid(pids[i]);
+}
+		
+static void
+waitexitpid(int pid, int *status)
+{
+	int ret, xs;
+
+	while (1) {
+		vinfo(V2, "waitexit %d\n", pid);
+
+		ret = waitpid(pid, status, WNOHANG | WUNTRACED | WCONTINUED);
+		if (ret != pid) {
+			sleep(1);
+			continue;
+		}
+		xs = *status;
+		if (WIFEXITED(xs)) {
+			vinfo(2, "child %d exited: %d\n", pid, WEXITSTATUS(xs));
+			*status =  WEXITSTATUS(xs);
+			return;
+		}
+
+		vinfo(V2, "waitexit 0x%x e:%d s:%d d:%d t:%d c:%d\n", xs,
+		      WIFEXITED(xs), WIFSIGNALED(xs), WCOREDUMP(xs),
+		      WIFSTOPPED(xs), WIFCONTINUED(xs));
+		break;
+	}
+
+	vinfo(V2, "pid %d didn't exit!\n", pid);
+	exit(1);
+}
+
+static void
+waitexit(void)
+{
+	int i;
+
+	for (i = 0; i < nids; i++) {
+		status[i] = 0;
+		waitexitpid(pids[i], &status[i]);
+	}
+}
+
+static void
+contpid(int pid)
+{
+	int ret;
+
+	vinfo(V2, "cont %d\n", pid);
+
+	ret = ptrace(PTRACE_CONT, pid, NULL, NULL);
+	if (ret < 0) {
+		verror("ptrace");
+		exit(1);
+	}
+}
+
+static void
+cont(void)
+{
+	int i;
+
+	for (i = 0; i < nids; i++)
+		contpid(pids[i]);
+}
+
+static void
+send_ring_fd(int fd)
+{
+	int i;
+	struct test_desc *tdc;
+
+	for (i = 0; i < nids; i++) {
+		tdc = &tdcs[i];
+		vinfo(V3, "send_fd %d %d: %d\n", pids[i], tdc->sp[0], fd);
+		send_fd(tdc->sp[0], fd);
+	}
+}
+
+static void
+close_sockets(void)
+{
+	int i;
+
+	for (i = 0; i < nids; i++) {
+		shutdown(tdcs[i].sp[0], SHUT_RDWR);
+		shutdown(tdcs[i].sp[1], SHUT_RDWR);
+	}
+}
+
+static int
+run_parent(struct test_desc *tdc)
+{
+	int ret;
+	struct io_uring_params p = {0};
+	struct io_uring ring;
+
+	waitstop();
+
+	ret = queue_init(tdc, 1, &ring, &p, tdc->preg);
+	if (ret)
+		return ret;
+
+	if (tdc->preg == REG_NOP)
+		goto send;
+
+	ret = io_uring_register_buffers(&ring, vecs, buf_cnt);
+	if (ret) {
+		verror("register run_parent");
+		goto done;
+	}
+
+	if (tdc->test == TEST_UNREG) {
+		ret = io_uring_unregister_buffers(&ring);
+		if (ret) {
+			verror("unregister run_parent");
+			goto done;
+		}
+	}
+send:
+	if (tdc->share == FD)
+		send_ring_fd(ring.ring_fd);
+
+	cont();
+
+	if (tdc->test == TEST_BUSY) {
+		ret = io_uring_unregister_buffers(&ring);
+		if (ret)
+			verror("unregister run_parent busy");
+		if (-ret == tdc->expect)
+			ret = -ret;
+	} else if (tdc->test == TEST_EXIT) {
+		waitstop();
+		vinfo(2, "parent exiting\n");
+		exit(2);
+	}
+
+	waitexit();
+	if (!ret && tdc->test == TEST_FIN) {
+		ret = io_uring_unregister_buffers(&ring);
+		if (ret)
+			verror("unregister run_parent fin");
+	}
+
+done:
+	queue_exit(&ring);
+
+	if (ret)
+		cont();
+
+	return ret;
+}
+
+static void
+run_child(struct test_desc *tdc)
+{
+	int from_fd, ret = ECONNABORTED;
+	int (*fn)(struct test_desc *);
+
+	vinfo(V1, "child stop\n");
+
+	ptrace(PTRACE_TRACEME, 0, NULL, NULL);
+	raise(SIGSTOP);
+
+	if (tdc->share == FD) {
+		vinfo(V3, "recv_fd %d\n", tdc->sp[1]);
+		from_fd = recv_fd(tdc->sp[1]);
+		if (from_fd < 0)
+			goto done;
+		vinfo(V3, "recv_fd %d: %d\n", tdc->sp[1], from_fd);
+		tdc->fd = from_fd;
+	}
+
+	fn = test_fn[tdc->id];
+	ret = fn(tdc);
+
+	if (tdc->share == FD)
+		close_sockets();
+
+done:
+	vinfo(V2, "child exiting %d\n", ret);
+	/*
+	 * The parent exits for the final test before waiting for children
+	 * so save the test status.
+	 */
+	shmstat_store(tdc, -ret);
+	exit(-ret);
+}
+
+static int
+run_test(struct test_desc *proto)
+{
+	int i, pid, ret;
+	struct test_desc *tdc = NULL;
+
+	ret = pid = 0;
+
+	for (i = 0; i < nids; i++) {
+		tdcs[i] = *proto;
+		tdc = & tdcs[i];
+		tdc->idx = i;
+		if (tdc->share == FD) {
+			ret = get_socketpair(tdc->sp);
+			if (ret)
+				return -ret;
+		}
+		pid = fork();
+		if (pid) {
+			pids[i] = pid;
+			continue;
+		}
+		run_child(tdc);
+	}
+
+	ret = run_parent(proto);
+
+	if (proto->share == FD)
+		close_sockets();
+
+	return ret;
+}
+
+static int
+check_exitstat(struct test_desc *tdc, int *stat)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < nids; i++) {
+		if (status[i] != tdc->expect) {
+			vinfo(V1, "check_exitstat child %d: expect %d got %d\n",
+			      pids[i], tdc->expect, status[i]);
+			if (!ret) {
+				ret = 1;
+				*stat = status[i];
+			}
+		}
+	}
+
+	if (!ret)
+		*stat = status[0];
+
+	return ret;
+}
+
+static int
+test(int id, int share, int preg, int reg, int test, int expect)
+{
+	int ret, stat;
+	char *res;
+
+	struct test_desc tdc = {
+		.id = id,
+		.fd = -1,
+		.sp = {-1, -1},
+		.share = share,
+		.preg = preg,
+		.reg = reg,
+		.test = test,
+		.expect = expect,
+	};
+
+	vinfo(V1, "test(%s %s %s %s %s)\n", i2s[tdc.id], s2s[tdc.share],
+	      r2s[tdc.preg], r2s[tdc.reg], t2s[tdc.test]);
+
+	memset(tdcs, 0, sizeof(tdcs));
+	memset(pids, 0, sizeof(pids));
+	memset(status, 0, sizeof(status));
+	memset(shmstat, SHM_ST_NONE, SHM_ST_SZ);
+
+	ret = run_test(&tdc);
+	stat = ret;
+
+	if (ret) {
+		ret = 1;
+		res = "FAIL RUN";
+	} else {
+		ret = check_exitstat(&tdc, &stat);
+		res = ret ? "FAIL" : "PASS";
+	}
+
+	vinfo(V1, "test(%s %s %s %s %s: %d %d) %s\n",
+	      i2s[tdc.id], s2s[tdc.share], r2s[tdc.preg],
+	      r2s[tdc.reg], t2s[tdc.test], tdc.expect, stat, res);
+
+	return ret;
+}
+
+static int
+test_exit(int expect)
+{
+	int status, i, ret = 0;
+
+	vinfo(V1, "grandparent pid\n");
+
+	wait(&status);
+
+	vinfo(V2, "parent exited: %d\n", WEXITSTATUS(status));
+
+	for (i = 0; i < 60; i++) {
+		ret = shmstat_check(expect);
+		if (ret >= 0)
+			break;
+		sleep(1);
+	}
+
+	if (ret) {
+		vinfo(V1, "Bad test_exit stat %d\n", ret);
+		ret = 1;
+	}
+
+	vinfo(V1, "Test exit %d\n", ret);
+	exit(ret);
+}
+
+/*
+ * main()
+ * -> shm_create()
+ * -> create_buffers()
+ * parent:
+ * -> test()
+ *    -> run_test()
+ *       -> run_parent()
+ *          -> io_uring_register_buffers()
+ *       -> run_child()
+ *          -> test_reg()
+ *             -> register_buffers()
+ *          -> test_io()
+ *             -> run_test_io()
+ *                -> do_io()
+ *                   -> register_buffers()
+ *
+ * grandparent:
+ * -> test_exit()
+ */
+
+int
+main(int argc, char *argv[])
+{
+	int shmid, pid, ret = 0;
+	char c;
+
+	while ((c = getopt(argc, argv, "s:v:")) != -1)
+		switch (c) {
+		case 's':
+			shm_sz = atoi(optarg) * SZ_2M;
+			break;
+		case 'v':
+			verbose = atoi(optarg);
+			break;
+		default:
+			exit(1);
+		}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc)
+		nids = atoi(argv[0]);
+
+	if (!nids || nids > MAXID)
+		exit(1);
+
+	shmstat = shmstat_create();
+	if (shmstat == MAP_FAILED)
+		exit(1);
+
+	pid = fork();
+	if (pid)
+		exit(test_exit(PASS));
+
+	vinfo(V1, "parent pid\n");
+
+	shmid = shm_create();
+	if (shmid < 0)
+		exit(1);
+
+	shmbuf = shm_attach(shmid);
+	if (shmbuf == MAP_FAILED) {
+		shm_destroy(shmid, shmbuf);
+		exit(1);
+	}
+
+	create_buffers(shmbuf);
+
+	ret |= test(TEST_REG, NO, REG_PIN, REG_PIN, TEST_DFLT,  PASS);
+	ret |= test(TEST_REG, NO, REG_NOP, REG_NOP, TEST_DFLT,  PASS);
+	ret |= test(TEST_REG, NO, REG_NOP, REG_ATT, TEST_DFLT,  EINVAL);
+	ret |= test(TEST_REG, FD, REG_PIN, REG_ATT, TEST_DFLT,  PASS);
+	ret |= test(TEST_REG, FD, REG_NOP, REG_ATT, TEST_DFLT,  PASS);
+	ret |= test(TEST_REG, FD, REG_PIN, REG_ATT, TEST_BUSY,  PASS);
+	ret |= test(TEST_REG, FD, REG_PIN, REG_ATT, TEST_FIN,   PASS);
+	ret |= test(TEST_REG, FD, REG_PIN, REG_ATT, TEST_BADFD, EBADF);
+	ret |= test(TEST_REG, FD, REG_PIN, REG_ATT, TEST_INVFD, EINVAL);
+	ret |= test(TEST_REG, FD, REG_PIN, REG_ATT, TEST_UNREG, EINVAL);
+	ret |= test(TEST_IO,  FD, REG_PIN, REG_PIN, TEST_DFLT,  PASS);
+	ret |= test(TEST_IO,  FD, REG_PIN, REG_ATT, TEST_DFLT,  PASS);
+	ret |= test(TEST_IO,  FD, REG_NOP, REG_ATT, TEST_DFLT,  EFAULT);
+	ret |= test(TEST_IO,  FD, REG_PIN, REG_ATT, TEST_EXIT,  NONE);
+
+	shm_destroy(shmid, shmbuf);
+
+	return ret;
+}
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 06/10] test/buffer-share: add private memory option
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (4 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 05/10] test/buffer-share: add buffer registration sharing test Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 07/10] test/buffer-share: add interruptible deadlock test Bijan Mottahedeh
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 test/buffer-share.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/test/buffer-share.c b/test/buffer-share.c
index 81e0537..cbfbfb2 100644
--- a/test/buffer-share.c
+++ b/test/buffer-share.c
@@ -18,6 +18,7 @@
 #include <unistd.h>
 #include <string.h>
 #include <linux/fs.h>
+#include <malloc.h>
 
 #include "liburing.h"
 
@@ -71,7 +72,8 @@ unsigned long shm_sz = SZ_2M;
 
 int no_read;
 int warned;
-int verbose = 0;
+int private;
+int verbose;
 
 enum {
 	REG_NOP = 0,
@@ -1120,8 +1122,11 @@ main(int argc, char *argv[])
 	int shmid, pid, ret = 0;
 	char c;
 
-	while ((c = getopt(argc, argv, "s:v:")) != -1)
+	while ((c = getopt(argc, argv, "ps:v:")) != -1)
 		switch (c) {
+		case 'p':
+			private++;
+			break;
 		case 's':
 			shm_sz = atoi(optarg) * SZ_2M;
 			break;
@@ -1155,10 +1160,19 @@ main(int argc, char *argv[])
 	if (shmid < 0)
 		exit(1);
 
-	shmbuf = shm_attach(shmid);
-	if (shmbuf == MAP_FAILED) {
-		shm_destroy(shmid, shmbuf);
-		exit(1);
+	if (private) {
+		printf("private\n");
+		shmbuf = memalign(4096, shm_sz);
+		if (shmbuf == NULL) {
+			perror("memalign");
+			exit(1);
+		}
+	} else {
+		shmbuf = shm_attach(shmid);
+		if (shmbuf == MAP_FAILED) {
+			shm_destroy(shmid, shmbuf);
+			exit(1);
+		}
 	}
 
 	create_buffers(shmbuf);
@@ -1178,7 +1192,8 @@ main(int argc, char *argv[])
 	ret |= test(TEST_IO,  FD, REG_NOP, REG_ATT, TEST_DFLT,  EFAULT);
 	ret |= test(TEST_IO,  FD, REG_PIN, REG_ATT, TEST_EXIT,  NONE);
 
-	shm_destroy(shmid, shmbuf);
+	if (!private)
+		shm_destroy(shmid, shmbuf);
 
 	return ret;
 }
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 07/10] test/buffer-share: add interruptible deadlock test
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (5 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 06/10] test/buffer-share: add private memory option Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 08/10] man/io_uring_setup.2: document buffer registration sharing Bijan Mottahedeh
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 test/buffer-share.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/test/buffer-share.c b/test/buffer-share.c
index cbfbfb2..2997333 100644
--- a/test/buffer-share.c
+++ b/test/buffer-share.c
@@ -72,6 +72,7 @@ unsigned long shm_sz = SZ_2M;
 
 int no_read;
 int warned;
+int deadlock;
 int private;
 int verbose;
 
@@ -1095,6 +1096,82 @@ test_exit(int expect)
 	exit(ret);
 }
 
+
+/*
+ * Test for deadlock scenario raised by Pavel.
+ * Test hangs but is interruptible.
+ *
+ * [Pavel's comments]
+ *
+ * Ok, now the original io_uring instance will wait until the attached
+ * ones get rid of their references. That's a versatile ground to have
+ * in-kernel deadlocks.
+ * 
+ * task1: uring1 = create()
+ * task2: uring2 = create()
+ * task1: uring3 = create(share=uring2);
+ * task2: uring4 = create(share=uring1);
+ * 
+ * task1: io_sqe_buffers_unregister(uring1)
+ * task2: io_sqe_buffers_unregister(uring2)
+ * 
+ * If I skimmed through the code right, that should hang unkillably.
+ */
+static int test_deadlock(void)
+{
+	int i, pid, ret, fd0, fd1;
+	struct io_uring rings[4];
+	struct io_uring_params p;
+
+	for (i = 0; i < 2; i++) {
+		memset(&p, 0, sizeof(p));
+		p.flags = IORING_SETUP_SHARE_BUF;
+		ret = io_uring_queue_init_params(1, &rings[i], &p);
+		if (ret) {
+			verror("queue_init share");
+			return ret;
+		}
+	}
+
+	fd0 = rings[0].ring_fd;
+	fd1 = rings[1].ring_fd;
+
+	pid = fork();
+
+	memset(&p, 0, sizeof(p));
+	p.flags = IORING_SETUP_ATTACH_BUF;
+
+	if (pid) {
+		p.wq_fd = fd1;
+		ret = io_uring_queue_init_params(1, &rings[2], &p);
+	} else {
+		p.wq_fd = fd0;
+		ret = io_uring_queue_init_params(1, &rings[3], &p);
+	}
+
+	if (ret) {
+		verror("queue_init attach");
+		return ret;
+	}
+
+	vinfo(V1, "unregister\n");
+
+	if (pid) {
+		close(fd1);
+		ret = io_uring_unregister_buffers(&rings[0]);
+	} else {
+		close(fd0);
+		ret = io_uring_unregister_buffers(&rings[1]);
+	}
+	
+	vinfo(V1, "unregister done\n");
+
+	if (ret)
+		verror("unregister");
+
+	return ret;
+}
+
 /*
  * main()
  * -> shm_create()
@@ -1122,8 +1199,11 @@ main(int argc, char *argv[])
 	int shmid, pid, ret = 0;
 	char c;
 
-	while ((c = getopt(argc, argv, "ps:v:")) != -1)
+	while ((c = getopt(argc, argv, "dps:v:")) != -1)
 		switch (c) {
+		case 'd':
+			deadlock++;
+			break;
 		case 'p':
 			private++;
 			break;
@@ -1146,6 +1226,9 @@ main(int argc, char *argv[])
 	if (!nids || nids > MAXID)
 		exit(1);
 
+	if (deadlock)
+		exit(test_deadlock());
+
 	shmstat = shmstat_create();
 	if (shmstat == MAP_FAILED)
 		exit(1);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 08/10] man/io_uring_setup.2: document buffer registration sharing
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (6 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 07/10] test/buffer-share: add interruptible deadlock test Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 09/10] man/io_uring_register.2: document buffer registration updates Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 10/10] man/io_uring_enter.2: document IORING_OP_BUFFERS_UPDATE Bijan Mottahedeh
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Document IORING_SETUP_SHARE_BUF and IORING_SETUP_ATTACH_BUF

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 man/io_uring_setup.2 | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/man/io_uring_setup.2 b/man/io_uring_setup.2
index 3122313..0297bf1 100644
--- a/man/io_uring_setup.2
+++ b/man/io_uring_setup.2
@@ -166,6 +166,18 @@ exceeds
 then it will be clamped at
 .B IORING_MAX_CQ_ENTRIES .
 .TP
+.B IORING_SETUP_SHARE_BUF
+If this flag is specified, the io_uring ring allows its buffer
+registrations to be shared with other io_uring ring instances that have
+access to this ring's file descriptor.
+.TP
+.B IORING_SETUP_ATTACH_BUF
+This flag should be set in conjunction with 
+.IR "struct io_uring_params.wq_fd"
+being set to an existing io_uring ring file descriptor. When set, the
+io_uring instance being created will share the buffer registrations 
+of the specified io_uring ring.
+.TP
 .B IORING_SETUP_ATTACH_WQ
 This flag should be set in conjunction with 
 .IR "struct io_uring_params.wq_fd"
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 09/10] man/io_uring_register.2: document buffer registration updates
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (7 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 08/10] man/io_uring_setup.2: document buffer registration sharing Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  2021-01-22 22:54 ` [PATCH v2 10/10] man/io_uring_enter.2: document IORING_OP_BUFFERS_UPDATE Bijan Mottahedeh
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Document sparse buffer registrations and IORING_REGISTER_BUFFERS_UPDATE.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 man/io_uring_register.2 | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/man/io_uring_register.2 b/man/io_uring_register.2
index 225e461..5c7a39e 100644
--- a/man/io_uring_register.2
+++ b/man/io_uring_register.2
@@ -82,11 +82,13 @@ It is perfectly valid to setup a large buffer and then only use part
 of it for an I/O, as long as the range is within the originally mapped
 region.
 
-An application can increase or decrease the size or number of
-registered buffers by first unregistering the existing buffers, and
-then issuing a new call to
-.BR io_uring_register ()
-with the new buffers.
+The buffer set may be sparse, meaning that the
+.B iovec
+address and size fields in the array may be set to
+.B 0.
+See
+.B IORING_REGISTER_BUFFERS_UPDATE
+for how to update buffers in place.
 
 Note that registering buffers will wait for the ring to idle. If the application
 currently has requests in-flight, the registration will wait for those to
@@ -103,6 +105,20 @@ must be passed as NULL.  All previously registered buffers associated
 with the io_uring instance will be released. Available since 5.1.
 
 .TP
+.B IORING_REGISTER_BUFFERS_UPDATE
+This operation replaces existing iovecs in the registered buffer set with
+new ones, either turning a sparse entry (one where iovec base and len
+are equal to 0) into a real one, removing an existing entry (new one is
+set to 0), or replacing an existing entry with a new entry.
+.I arg
+must contain a pointer to a struct io_uring_rsrc_update, which contains
+an offset on which to start the update, and an array of iovecs to
+use for the update.
+.I nr_args
+must contain the number of iovecs in the passed in array. Available
+since 5.12.
+
+.TP
 .B IORING_REGISTER_FILES
 Register files for I/O.
 .I arg
@@ -152,6 +168,9 @@ use for the update.
 must contain the number of descriptors in the passed in array. Available
 since 5.5.
 
+Note that struct io_uring_files_update is deprecated since 5.12 and that
+io_uring_rsrc_update must be used instead.
+
 .TP
 .B IORING_UNREGISTER_FILES
 This operation requires no argument, and
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v2 10/10] man/io_uring_enter.2: document IORING_OP_BUFFERS_UPDATE
  2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
                   ` (8 preceding siblings ...)
  2021-01-22 22:54 ` [PATCH v2 09/10] man/io_uring_register.2: document buffer registration updates Bijan Mottahedeh
@ 2021-01-22 22:54 ` Bijan Mottahedeh
  9 siblings, 0 replies; 11+ messages in thread
From: Bijan Mottahedeh @ 2021-01-22 22:54 UTC (permalink / raw)
  To: axboe, io-uring

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
---
 man/io_uring_enter.2 | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/man/io_uring_enter.2 b/man/io_uring_enter.2
index da20451..99f6756 100644
--- a/man/io_uring_enter.2
+++ b/man/io_uring_enter.2
@@ -636,6 +636,22 @@ See also
 for the general description of the related system call. Available since 5.8.
 
 .TP
+.B IORING_OP_BUFFERS_UPDATE
+This command is an alternative to using
+.B IORING_REGISTER_BUFFERS_UPDATE
+which then works in an async fashion, like the rest of the io_uring commands.
+The arguments passed in are the same.
+.I addr
+must contain a pointer to the array of iovecs,
+.I len
+must contain the length of the array, and
+.I off
+must contain the offset at which to operate. Note that the array of iovecs
+pointed to in
+.I addr
+must remain valid until this operation has completed. Available since 5.12.
+
+.TP
 .B IORING_OP_FILES_UPDATE
 This command is an alternative to using
 .B IORING_REGISTER_FILES_UPDATE
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-01-22 22:58 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-01-22 22:54 [PATCH v2 00/10] liburing: buffer registration enhancements Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 01/10] liburing: support buffer registration updates Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 02/10] liburing: support buffer registration sharing Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 03/10] test/buffer-register: add buffer registration test Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 04/10] test/buffer-update: add buffer registration update test Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 05/10] test/buffer-share: add buffer registration sharing test Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 06/10] test/buffer-share: add private memory option Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 07/10] test/buffer-share: add interruptible deadlock test Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 08/10] man/io_uring_setup.2: document buffer registration sharing Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 09/10] man/io_uring_register.2: document buffer registration updates Bijan Mottahedeh
2021-01-22 22:54 ` [PATCH v2 10/10] man/io_uring_enter.2: document IORING_OP_BUFFERS_UPDATE Bijan Mottahedeh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).