* [PATCH fio] engines: Add Network Block Device (NBD) support.
2019-04-18 14:09 [PATCH fio] engines: Add Network Block Device (NBD) support Richard W.M. Jones
@ 2019-04-18 14:09 ` Richard W.M. Jones
2019-04-19 15:28 ` Elliott, Robert (Servers)
2019-04-19 9:27 ` Sitsofe Wheeler
1 sibling, 1 reply; 6+ messages in thread
From: Richard W.M. Jones @ 2019-04-18 14:09 UTC (permalink / raw)
To: fio; +Cc: eblake, nbd
Signed-off-by: Richard W.M. Jones <rjones@redhat.com>
---
Makefile | 2 +-
engines/nbd.c | 570 +++++++++++++++++++++++++++++++++++++++++++++++
examples/nbd.fio | 9 +
optgroup.c | 4 +
optgroup.h | 2 +
options.c | 3 +
6 files changed, 589 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index fd138dd2..ef8db418 100644
--- a/Makefile
+++ b/Makefile
@@ -45,7 +45,7 @@ SOURCE := $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
pshared.c options.c \
smalloc.c filehash.c profile.c debug.c engines/cpu.c \
engines/mmap.c engines/sync.c engines/null.c engines/net.c \
- engines/ftruncate.c engines/filecreate.c \
+ engines/ftruncate.c engines/filecreate.c engines/nbd.c \
server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
gettime-thread.c helpers.c json.c idletime.c td_error.c \
profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
diff --git a/engines/nbd.c b/engines/nbd.c
new file mode 100644
index 00000000..86e394a1
--- /dev/null
+++ b/engines/nbd.c
@@ -0,0 +1,570 @@
+/*
+ * NBD engine
+ *
+ * IO engine that talks to a newstyle-fixed NBD server over a TCP or
+ * Unix domain socket.
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ * Written by Richard W.M. Jones <rjones@redhat.com>
+ * Distributed under the GNU GPL v2 license.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+/*
+ * New-style handshake: the first message nbd_handshake reads from the
+ * server.  The multi-byte fields are converted from big-endian with
+ * be64_to_cpu/be16_to_cpu after reading.
+ */
+struct nbd_new_handshake {
+ char nbdmagic[8]; /* "NBDMAGIC" */
+ uint64_t version; /* NEW_VERSION */
+ uint16_t gflags; /* global flags */
+} __attribute__((packed));
+
+/* Expected value of the version field; the bytes spell "IHAVEOPT" in ASCII. */
+#define NEW_VERSION UINT64_C(0x49484156454F5054)
+
+/* Bits of gflags.  nbd_handshake requires the server to offer both. */
+#define NBD_FLAG_FIXED_NEWSTYLE 1
+#define NBD_FLAG_NO_ZEROES 2
+
+/*
+ * Header of an option sent by the client during the handshake.
+ * nbd_handshake fills every field in big-endian byte order and writes
+ * the option data immediately after the header.
+ */
+struct nbd_new_option {
+ uint64_t version; /* NEW_VERSION */
+ uint32_t option; /* NBD_OPT_* */
+ uint32_t optlen; /* option data length */
+ /* option data follows */
+} __attribute__((packed));
+
+/* The only option this engine sends; its data is the export name. */
+#define NBD_OPT_EXPORT_NAME UINT32_C(1)
+
+/*
+ * Read from the server after NBD_OPT_EXPORT_NAME has been sent; this
+ * completes the handshake.  Both fields are converted from big-endian
+ * by nbd_handshake.
+ */
+struct nbd_new_handshake_finish {
+ uint64_t exportsize;
+ uint16_t eflags; /* per-export flags */
+} __attribute__((packed));
+
+/*
+ * Per-export flags (bits of eflags).  nbd_handshake records the value
+ * but nothing else in this engine consults these bits.
+ */
+#define NBD_FLAG_HAS_FLAGS (1 << 0)
+#define NBD_FLAG_READ_ONLY (1 << 1)
+#define NBD_FLAG_SEND_FLUSH (1 << 2)
+#define NBD_FLAG_SEND_FUA (1 << 3)
+#define NBD_FLAG_ROTATIONAL (1 << 4)
+#define NBD_FLAG_SEND_TRIM (1 << 5)
+#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6)
+#define NBD_FLAG_SEND_DF (1 << 7)
+#define NBD_FLAG_CAN_MULTI_CONN (1 << 8)
+
+/*
+ * Request header written by nbd_queue for every command.  All fields
+ * are stored big-endian before sending; for a write the data payload
+ * is sent straight after this header.
+ */
+struct nbd_request {
+ uint32_t magic; /* NBD_REQUEST_MAGIC. */
+ uint16_t flags; /* NBD_CMD_FLAG_* */
+ uint16_t type; /* NBD_CMD_* */
+ uint64_t handle; /* Opaque handle. */
+ uint64_t offset; /* Request offset. */
+ uint32_t count; /* Request length. */
+} __attribute__((packed));
+
+/*
+ * Reply header read by nbd_queue after each request.  Fields arrive
+ * big-endian; for a successful read the data payload follows.
+ */
+struct nbd_reply {
+ uint32_t magic; /* NBD_SIMPLE_REPLY_MAGIC. */
+ uint32_t error; /* NBD_SUCCESS or one of NBD_E*. */
+ uint64_t handle; /* Opaque handle. */
+} __attribute__((packed));
+
+/* Magic numbers identifying a request header and a simple reply header. */
+#define NBD_REQUEST_MAGIC UINT32_C(0x25609513)
+#define NBD_SIMPLE_REPLY_MAGIC UINT32_C(0x67446698)
+
+/*
+ * Command types (nbd_request.type).  nbd_queue issues only READ,
+ * WRITE, FLUSH and TRIM.
+ */
+#define NBD_CMD_READ 0
+#define NBD_CMD_WRITE 1
+#define NBD_CMD_DISC 2
+#define NBD_CMD_FLUSH 3
+#define NBD_CMD_TRIM 4
+#define NBD_CMD_WRITE_ZEROES 6
+
+/* Request flags (nbd_request.flags); nbd_queue always sends 0. */
+#define NBD_CMD_FLAG_FUA (1<<0)
+#define NBD_CMD_FLAG_NO_HOLE (1<<1)
+
+/* Error codes carried in nbd_reply.error; nbd_queue maps them to errno values. */
+#define NBD_SUCCESS 0
+#define NBD_EPERM 1
+#define NBD_EIO 5
+#define NBD_ENOMEM 12
+#define NBD_EINVAL 22
+#define NBD_ENOSPC 28
+#define NBD_ESHUTDOWN 108
+
+/*
+ * Largest read/write transfer nbd_queue accepts (enforced with an
+ * assert).  Actually this differs across servers, but for nbdkit ...
+ */
+#define NBD_MAX_REQUEST_SIZE (64 * 1024 * 1024)
+
+/*
+ * Export size and per-export flags reported by the server; written by
+ * nbd_handshake.  NOTE(review): these are file-scope rather than part
+ * of nbd_data, so they are not per-connection -- confirm this is
+ * acceptable when several jobs use the engine at once.
+ */
+static uint64_t exportsize;
+static uint16_t eflags;
+
+/* Engine state kept in td->io_ops_data (allocated by nbd_setup). */
+struct nbd_data {
+ int fd; /* socket connected to the NBD server */
+};
+
+/*
+ * Engine-specific options, filled in by fio's option parser through the
+ * offsets listed in options[] and reached via td->eo.
+ */
+struct nbd_options {
+ void *padding; /* NOTE(review): presumably keeps the first real option at a non-zero offset -- confirm against fio's option parser */
+ char *hostname; /* "hostname" option: TCP server to connect to */
+ char *port; /* "port" option: port or service name (nbd_init defaults to "10809") */
+ char *sockname; /* "sockname" option: Unix domain socket path */
+ char *exportname; /* "exportname" option (nbd_init defaults to "") */
+ unsigned long long size; /* "nbdsize" option: size used for the pseudo file */
+};
+
+/*
+ * Option table for the engine.  Each entry stores its value at the
+ * given offset inside struct nbd_options; the entry with a NULL name
+ * terminates the table.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "hostname",
+ .lname = "NBD remote hostname",
+ .help = "Hostname of remote NBD server",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_NBD,
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct nbd_options, hostname),
+ },
+ {
+ .name = "port",
+ .lname = "NBD port",
+ .help = "Port or service name to use for NBD connection",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_NBD,
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct nbd_options, port),
+ },
+ {
+ .name = "sockname",
+ .lname = "NBD Unix domain socket",
+ .help = "Name of Unix domain socket",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_NBD,
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct nbd_options, sockname),
+ },
+ {
+ .name = "exportname",
+ .lname = "NBD export name",
+ .help = "Name of NBD export",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_NBD,
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct nbd_options, exportname),
+ },
+ {
+ /* Named "nbdsize"; nbd_setup copies it into the pseudo file's size. */
+ .name = "nbdsize",
+ .lname = "Size of block device",
+ .help = "Size of NBD export to use",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_NBD,
+ .type = FIO_OPT_ULL,
+ .off1 = offsetof(struct nbd_options, size),
+ .minval = 512,
+ .interval = 512,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * Allocates nbd_data and registers the single pseudo file that stands
+ * for the NBD export.
+ *
+ * The socket is not opened here (that happens in nbd_init), so the
+ * descriptor is initialised to -1.  Leaving it at the zero that calloc
+ * provides would make nbd_cleanup close fd 0 (stdin) whenever the
+ * connection was never established.
+ *
+ * Returns 0 on success, 1 if the allocation fails.
+ */
+static int nbd_setup(struct thread_data *td)
+{
+	struct nbd_data *nbd_data;
+	struct nbd_options *o = td->eo;
+	struct fio_file *f;
+
+	if (!td->io_ops_data) {
+		nbd_data = calloc(1, sizeof(*nbd_data));
+		if (!nbd_data)
+			return 1;
+		nbd_data->fd = -1;	/* not connected yet */
+		td->io_ops_data = nbd_data;
+	}
+
+	/* Pretend to deal with files.  See engines/rbd.c */
+	if (!td->files_index) {
+		add_file(td, "nbd", 0, 0);
+		if (!td->o.nr_files)
+			td->o.nr_files = 1;
+		td->o.open_files++;
+	}
+	f = td->files[0];
+
+	/* XXX We could read the size from NBD, but that's awkward as
+	 * we haven't connected to the server.  In any case nothing
+	 * bad happens (except an error) if the size is too large.
+	 */
+	f->real_file_size = o->size;
+
+	return 0;
+}
+
+/*
+ * Counterpart of nbd_setup: closes the socket if its descriptor is
+ * non-negative and releases nbd_data.  Does nothing when no nbd_data
+ * was ever attached to the thread.
+ */
+static void nbd_cleanup(struct thread_data *td)
+{
+	struct nbd_data *data = td->io_ops_data;
+
+	if (!data)
+		return;
+
+	if (data->fd >= 0)
+		close(data->fd);
+	free(data);
+}
+
+/*
+ * Enable TCP_NODELAY on a connected TCP socket.  Compiled to a no-op
+ * when CONFIG_TCP_NODELAY is not defined.
+ *
+ * Returns 0 on success (or when disabled), 1 if setsockopt fails.
+ */
+static int set_nodelay(int fd)
+{
+#ifdef CONFIG_TCP_NODELAY
+	const int enable = 1;
+	int rc;
+
+	rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+			(void *)&enable, sizeof(enable));
+	if (rc == -1) {
+		log_err("fio: nbd: cannot set TCP_NODELAY option: %s\n",
+			strerror(errno));
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Connect to the NBD server over TCP.
+ *
+ * Resolves hostname/port with getaddrinfo and tries each returned
+ * address in turn until one connects, then enables TCP_NODELAY.
+ *
+ * On success the connected socket is stored in nbd_data->fd and 0 is
+ * returned.  On every failure path nbd_data->fd is left at -1, so
+ * nbd_cleanup never closes a stale or already-closed descriptor, and 1
+ * is returned.  The errno reported is that of the last failed
+ * socket()/connect() call, captured before close() or freeaddrinfo()
+ * can overwrite it.
+ */
+static int nbd_connect_inet(struct thread_data *td,
+			    const char *hostname, const char *port)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+	struct addrinfo hints;
+	struct addrinfo *result, *rp;
+	int saved_errno = 0;
+	int fd = -1;
+	int r;
+
+	nbd_data->fd = -1;
+
+	memset(&hints, 0, sizeof hints);
+	hints.ai_family = AF_UNSPEC;
+	hints.ai_socktype = SOCK_STREAM;
+	hints.ai_flags = 0;
+	hints.ai_protocol = 0;
+
+	r = getaddrinfo(hostname, port, &hints, &result);
+	if (r != 0) {
+		log_err("fio: nbd: getaddrinfo: %s\n", gai_strerror(r));
+		return 1;
+	}
+
+	for (rp = result; rp; rp = rp->ai_next) {
+		fd = socket(rp->ai_family, rp->ai_socktype,
+			    rp->ai_protocol);
+		if (fd == -1) {
+			saved_errno = errno;
+			continue;
+		}
+		if (connect(fd, rp->ai_addr, rp->ai_addrlen) != -1)
+			break;
+		saved_errno = errno;
+		close(fd);
+		fd = -1;
+	}
+	freeaddrinfo(result);
+
+	/* fd is only left open when the loop broke out after a connect. */
+	if (fd == -1) {
+		td_verror(td, saved_errno, "connect");
+		return 1;
+	}
+
+	if (set_nodelay(fd)) {
+		close(fd);
+		return 1;
+	}
+
+	nbd_data->fd = fd;
+	return 0;
+}
+
+/*
+ * Connect to the NBD server over a Unix domain socket.
+ *
+ * A socket name that does not fit in sun_path (including its
+ * terminating NUL) is rejected rather than silently truncated, since a
+ * truncated path would name a different socket.
+ *
+ * On success the connected socket is stored in nbd_data->fd and 0 is
+ * returned.  On failure nbd_data->fd is left at -1, so nbd_cleanup does
+ * not close the descriptor a second time, and 1 is returned.
+ */
+static int nbd_connect_unix(struct thread_data *td,
+			    const char *sockname)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+	struct sockaddr_un addr;
+	const size_t namelen = strlen(sockname);
+	socklen_t len;
+	int fd;
+
+	nbd_data->fd = -1;
+
+	if (namelen >= sizeof(addr.sun_path)) {
+		log_err("fio: nbd: sockname is too long for a "
+			"Unix domain socket address\n");
+		return 1;
+	}
+
+	fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd == -1) {
+		td_verror(td, errno, "socket");
+		return 1;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	memcpy(addr.sun_path, sockname, namelen);
+
+	/* Address length: everything before sun_path, the path, and its NUL. */
+	len = offsetof(struct sockaddr_un, sun_path) + namelen + 1;
+	if (connect(fd, (struct sockaddr *) &addr, len) == -1) {
+		td_verror(td, errno, "connect");
+		close(fd);
+		return 1;
+	}
+
+	nbd_data->fd = fd;
+	return 0;
+}
+
+/* Like read(2) except that it always reads the full amount requested
+ * or fails if that isn't possible.  An early EOF is a failure too.
+ *
+ * Returns 0 for success or 1 for failure.  On failure errno is left
+ * set for the caller: to the error reported by read(2), or to EIO when
+ * the server closed the connection early.  The value is restored after
+ * the reporting calls so they cannot overwrite it.
+ */
+static int full_read(struct thread_data *td,
+		     int fd, void *buf, size_t count)
+{
+	char *p = buf;	/* byte pointer: arithmetic on void * is not standard C */
+	ssize_t r;
+
+	while (count > 0) {
+		r = read(fd, p, count);
+		if (r == -1) {
+			const int err = errno;
+
+			td_verror(td, err, "read");
+			errno = err;
+			return 1;
+		}
+		if (r == 0) {
+			log_err("fio: nbd: unexpected end of file "
+				"reading from server\n");
+			errno = EIO;
+			return 1;
+		}
+		p += r;
+		count -= r;
+	}
+
+	return 0;
+}
+
+/* Like write(2) except that it always writes the full amount requested
+ * or fails if that isn't possible.
+ *
+ * Returns 0 for success or 1 for failure.  On failure errno holds the
+ * error reported by write(2); it is restored after the error has been
+ * reported so the caller can rely on it.
+ */
+static int full_write(struct thread_data *td,
+		      int fd, const void *buf, size_t count)
+{
+	const char *p = buf;	/* byte pointer: arithmetic on void * is not standard C */
+	ssize_t r;
+
+	while (count > 0) {
+		r = write(fd, p, count);
+		if (r == -1) {
+			const int err = errno;
+
+			td_verror(td, err, "write");
+			errno = err;
+			return 1;
+		}
+		p += r;
+		count -= r;
+	}
+
+	return 0;
+}
+
+/*
+ * Perform the fixed newstyle handshake on the already-connected socket
+ * and select the export called exportname.
+ *
+ * Sequence: read the server greeting and check its magic, version and
+ * global flags (both NBD_FLAG_FIXED_NEWSTYLE and NBD_FLAG_NO_ZEROES
+ * must be offered); echo those flags back; send NBD_OPT_EXPORT_NAME
+ * with the export name; read the export size and per-export flags.
+ *
+ * The size and flags are stored in the file-scope exportsize/eflags.
+ * The handshake fails if the export is smaller than the size requested
+ * with the nbdsize option.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+static int nbd_handshake(struct thread_data *td,
+			 const char *exportname)
+{
+	struct nbd_data *nbd_data = td->io_ops_data;
+	struct nbd_options *o = td->eo;
+	struct nbd_new_handshake handshake;
+	const int expected_gflags = NBD_FLAG_FIXED_NEWSTYLE|NBD_FLAG_NO_ZEROES;
+	uint32_t cflags;
+	struct nbd_new_option option;
+	const uint32_t exportname_len = strlen(exportname);
+	struct nbd_new_handshake_finish finish;
+
+	/* Expect the fixed newstyle handshake. */
+	if (full_read(td, nbd_data->fd, &handshake, sizeof handshake))
+		return 1;
+	handshake.version = be64_to_cpu(handshake.version);
+	handshake.gflags = be16_to_cpu(handshake.gflags);
+	if (memcmp(handshake.nbdmagic, "NBDMAGIC", 8) != 0 ||
+	    handshake.version != NEW_VERSION ||
+	    (handshake.gflags & expected_gflags) != expected_gflags) {
+		log_err("fio: nbd: handshake failed: server is not a "
+			"fixed newstyle NBD server\n");
+		return 1;
+	}
+
+	/* Send 32 bit flags word back to the server containing only
+	 * the global flags we understand.
+	 */
+	cflags = handshake.gflags & expected_gflags;
+	cflags = cpu_to_be32(cflags);
+	if (full_write(td, nbd_data->fd, &cflags, sizeof cflags))
+		return 1;
+
+	/* Send NBD_OPT_EXPORT_NAME.  If we send any other options
+	 * they must go before this one, since this option terminates
+	 * the option phase.
+	 */
+	option.version = cpu_to_be64(NEW_VERSION);
+	option.option = cpu_to_be32(NBD_OPT_EXPORT_NAME);
+	option.optlen = cpu_to_be32(exportname_len);
+	if (full_write(td, nbd_data->fd, &option, sizeof option) ||
+	    full_write(td, nbd_data->fd, exportname, exportname_len))
+		return 1;
+	/* Note the server does not send an option reply to
+	 * NBD_OPT_EXPORT_NAME.
+	 */
+
+	/* Server sends handshake finish. */
+	if (full_read(td, nbd_data->fd, &finish, sizeof finish))
+		return 1;
+	finish.exportsize = be64_to_cpu(finish.exportsize);
+	finish.eflags = be16_to_cpu(finish.eflags);
+
+	exportsize = finish.exportsize;
+	eflags = finish.eflags;
+
+	/* o->size is the nbdsize option; it must fit inside the export. */
+	if (exportsize < o->size) {
+		log_err("fio: nbd: NBD export size is smaller than the "
+			"requested size: reduce the nbdsize= parameter "
+			"to at most %" PRIu64 "\n",
+			exportsize);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Engine init hook: validates the connection options, connects to the
+ * server (Unix domain socket when sockname is given, TCP otherwise)
+ * and runs the NBD handshake.
+ *
+ * The port defaults to "10809" and the export name to "" when they are
+ * not set.  Returns 0 on success, 1 on any failure.
+ */
+static int nbd_init(struct thread_data *td)
+{
+	struct nbd_options *opts = td->eo;
+	const char *exportname;
+	int failed;
+
+	if (opts->sockname) {
+		/* Unix domain socket: hostname/port make no sense here. */
+		if (opts->hostname || opts->port) {
+			log_err("fio: nbd: if sockname is specified, "
+				"hostname and port cannot be used\n");
+			return 1;
+		}
+		failed = nbd_connect_unix(td, opts->sockname);
+	} else if (opts->hostname) {
+		/* TCP connection to a (usually) remote server. */
+		const char *port = opts->port ? opts->port : "10809";
+
+		failed = nbd_connect_inet(td, opts->hostname, port);
+	} else {
+		log_err("fio: nbd: you must specify either "
+			"hostname or sockname\n");
+		return 1;
+	}
+	if (failed)
+		return 1;
+
+	log_info("fio: connected to NBD server\n");
+
+	exportname = opts->exportname ? opts->exportname : "";
+	if (nbd_handshake(td, exportname))
+		return 1;
+
+	log_info("fio: handshake with NBD server completed\n");
+	return 0;
+}
+
+/* Read or write request (handled synchronously in this engine). */
+static enum fio_q_status nbd_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct nbd_data *nbd_data = td->io_ops_data;
+ struct nbd_request rq;
+ uint16_t cmd;
+ struct nbd_reply rp;
+
+ fio_ro_check(td, io_u);
+
+ if (io_u->ddir == DDIR_WRITE || io_u->ddir == DDIR_READ)
+ assert(io_u->xfer_buflen <= NBD_MAX_REQUEST_SIZE);
+
+ rq.magic = cpu_to_be32(NBD_REQUEST_MAGIC);
+ rq.flags = 0;
+ rq.handle = 0; /* XXX If we do non-sync, need to set this. */
+ rq.offset = cpu_to_be64((uint64_t) io_u->offset);
+ rq.count = cpu_to_be32((uint32_t) io_u->xfer_buflen);
+
+ switch (io_u->ddir) {
+ case DDIR_WRITE: cmd = NBD_CMD_WRITE; break;
+ case DDIR_READ: cmd = NBD_CMD_READ; break;
+ case DDIR_TRIM: cmd = NBD_CMD_TRIM; break;
+ /* XXX We could probably also handle
+ * DDIR_SYNC_FILE_RANGE with a bit of effort.
+ */
+ case DDIR_SYNC: cmd = NBD_CMD_FLUSH; break;
+ default:
+ io_u->error = EINVAL;
+ return FIO_Q_COMPLETED;
+ }
+ rq.type = cpu_to_be16(cmd);
+
+ /* Send the request + optional write payload. */
+ if (full_write(td, nbd_data->fd, &rq, sizeof rq)) {
+ io_u->error = errno;
+ return FIO_Q_COMPLETED;
+ }
+ if (io_u->ddir == DDIR_WRITE) {
+ if(full_write(td, nbd_data->fd,
+ io_u->xfer_buf, io_u->xfer_buflen)) {
+ io_u->error = errno;
+ return FIO_Q_COMPLETED;
+ }
+ }
+
+ /* Read the reply + optional read payload. */
+ if (full_read(td, nbd_data->fd, &rp, sizeof rp)) {
+ io_u->error = errno;
+ return FIO_Q_COMPLETED;
+ }
+ rp.magic = be32_to_cpu(rp.magic);
+ rp.error = be32_to_cpu(rp.error);
+ rp.handle = be64_to_cpu(rp.handle);
+ if (rp.magic != NBD_SIMPLE_REPLY_MAGIC) {
+ log_err("fio: nbd: invalid magic in reply message\n");
+ io_u->error = EINVAL;
+ return FIO_Q_COMPLETED;
+ }
+
+ if (rp.error != NBD_SUCCESS) {
+ switch (rp.error) {
+ case NBD_EPERM: io_u->error = EPERM; break;
+ case NBD_EIO: io_u->error = EIO; break;
+ case NBD_ENOMEM: io_u->error = ENOMEM; break;
+ case NBD_ENOSPC: io_u->error = ENOSPC; break;
+ case NBD_ESHUTDOWN: io_u->error = ESHUTDOWN; break;
+ case NBD_EINVAL: default: io_u->error = EINVAL; break;
+ }
+ return FIO_Q_COMPLETED;
+ }
+
+ if (io_u->ddir == DDIR_READ) {
+ if (full_read(td, nbd_data->fd,
+ io_u->xfer_buf, io_u->xfer_buflen)) {
+ io_u->error = errno;
+ return FIO_Q_COMPLETED;
+ }
+ }
+
+ io_u->error = 0;
+ return FIO_Q_COMPLETED;
+}
+
+
+/*
+ * Open hook: nothing to do, always returns 0.  The connection to the
+ * server is made once in nbd_init rather than per file.
+ */
+static int nbd_open_file(struct thread_data *td, struct fio_file *f)
+{
+ return 0;
+}
+
+/*
+ * Invalidate hook: a no-op that always returns 0.
+ * NOTE(review): presumably because the engine keeps no local cache to
+ * drop -- confirm.
+ */
+static int nbd_invalidate(struct thread_data *td, struct fio_file *f)
+{
+ return 0;
+}
+
+/*
+ * Engine description handed to fio: the name used with ioengine=, the
+ * option table and its backing struct size, and the hooks defined
+ * above.
+ */
+static struct ioengine_ops ioengine = {
+ .name = "nbd",
+ .version = FIO_IOOPS_VERSION,
+ .options = options,
+ .option_struct_size = sizeof(struct nbd_options),
+ .flags = FIO_SYNCIO | FIO_NOEXTEND,
+
+ .setup = nbd_setup,
+ .init = nbd_init,
+ .cleanup = nbd_cleanup,
+ .queue = nbd_queue,
+ .open_file = nbd_open_file,
+ .invalidate = nbd_invalidate,
+};
+
+/* Registers the nbd engine with fio (marked fio_init). */
+static void fio_init fio_nbd_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+/* Removes the nbd engine from fio's registry (marked fio_exit). */
+static void fio_exit fio_nbd_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
diff --git a/examples/nbd.fio b/examples/nbd.fio
new file mode 100644
index 00000000..0e02f7f3
--- /dev/null
+++ b/examples/nbd.fio
@@ -0,0 +1,9 @@
+[test]
+ioengine=nbd
+sockname=/tmp/socket
+rw=randrw
+size=64m
+nbdsize=64m
+iodepth=4
+time_based
+runtime=120
diff --git a/optgroup.c b/optgroup.c
index 04ceec7e..c228ff29 100644
--- a/optgroup.c
+++ b/optgroup.c
@@ -169,6 +169,10 @@ static const struct opt_group fio_opt_cat_groups[] = {
.name = "libhdfs I/O engine", /* libhdfs */
.mask = FIO_OPT_G_HDFS,
},
+ {
+ .name = "NBD I/O engine", /* NBD */
+ .mask = FIO_OPT_G_NBD,
+ },
{
.name = NULL,
},
diff --git a/optgroup.h b/optgroup.h
index adf4d09b..bf5c9735 100644
--- a/optgroup.h
+++ b/optgroup.h
@@ -61,6 +61,7 @@ enum opt_category_group {
__FIO_OPT_G_MTD,
__FIO_OPT_G_HDFS,
__FIO_OPT_G_SG,
+ __FIO_OPT_G_NBD,
__FIO_OPT_G_NR,
FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE),
@@ -97,6 +98,7 @@ enum opt_category_group {
FIO_OPT_G_MTD = (1ULL << __FIO_OPT_G_MTD),
FIO_OPT_G_HDFS = (1ULL << __FIO_OPT_G_HDFS),
FIO_OPT_G_SG = (1ULL << __FIO_OPT_G_SG),
+ FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD),
FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR),
};
diff --git a/options.c b/options.c
index 95086074..f4c9bedf 100644
--- a/options.c
+++ b/options.c
@@ -1899,6 +1899,9 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
.help = "HTTP (WebDAV/S3) IO engine",
},
#endif
+ { .ival = "nbd",
+ .help = "Network Block Device (NBD) IO engine"
+ },
},
},
{
--
2.20.1
^ permalink raw reply related [flat|nested] 6+ messages in thread