All of lore.kernel.org
 help / color / mirror / Atom feed
* Recent changes (master)
@ 2015-12-18 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2015-12-18 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit dde7b2361bf5b052a9c5c727bb2b062c604c7d42:

  gclient: don't free pdu on iolog return (2015-12-16 15:05:54 -0700)

are available in the git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to dfc8e76c8d438da9861acfcdc96c46afe4339148:

  server: code cleanups (2015-12-17 15:23:48 -0700)

----------------------------------------------------------------
Jens Axboe (7):
      Ensure that command line options also end up in json output
      client/server: various bug fixes
      Fix compile warning for !zlib
      client/server: transparent handling of storing compressed logs
      configure: fix zlib typo
      server: remove leftover debug statement
      server: code cleanups

 client.c  |  65 +++++++++++++++++++++++++----
 client.h  |   3 --
 gclient.c |   6 ---
 iolog.c   |  31 ++++++++++----
 iolog.h   |   9 ++++
 options.c |   9 ++--
 parse.c   |  44 ++++++++++++--------
 parse.h   |   2 +-
 server.c  | 139 ++++++++++++++++++++++++++++++++++++++++++++------------------
 server.h  |   5 +++
 10 files changed, 227 insertions(+), 86 deletions(-)

---

Diff of recent changes:

diff --git a/client.c b/client.c
index f4b95d3..27a764d 100644
--- a/client.c
+++ b/client.c
@@ -70,6 +70,8 @@ static int error_clients;
 #define FIO_CLIENT_HASH_MASK	(FIO_CLIENT_HASH_SZ - 1)
 static struct flist_head client_hash[FIO_CLIENT_HASH_SZ];
 
+static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *, bool *);
+
 static void fio_client_add_hash(struct fio_client *client)
 {
 	int bucket = hash_long(client->fd, FIO_CLIENT_HASH_BITS);
@@ -1224,6 +1226,46 @@ static void handle_eta(struct fio_client *client, struct fio_net_cmd *cmd)
 	fio_client_dec_jobs_eta(eta, client->ops->eta);
 }
 
+void fio_client_handle_iolog(struct fio_client *client, struct fio_net_cmd *cmd)
+{
+	struct cmd_iolog_pdu *pdu;
+	bool store_direct;
+
+	pdu = convert_iolog(cmd, &store_direct);
+	if (!pdu)
+		return;
+
+	if (store_direct) {
+		ssize_t ret;
+		size_t sz;
+		int fd;
+
+		fd = open((const char *) pdu->name,
+				O_WRONLY | O_CREAT | O_TRUNC, 0644);
+		if (fd < 0) {
+			perror("open log");
+			return;
+		}
+		sz = cmd->pdu_len - sizeof(*pdu);
+		ret = write(fd, pdu->samples, sz);
+		if (ret != sz)
+			log_err("fio: short write on compressed log\n");
+		close(fd);
+	} else {
+		FILE *f;
+
+		f = fopen((const char *) pdu->name, "w");
+		if (!f) {
+			perror("fopen log");
+			return;
+		}
+
+		flush_samples(f, pdu->samples,
+				pdu->nr_samples * sizeof(struct io_sample));
+		fclose(f);
+	}
+}
+
 static void handle_probe(struct fio_client *client, struct fio_net_cmd *cmd)
 {
 	struct cmd_probe_reply_pdu *probe = (struct cmd_probe_reply_pdu *) cmd->payload;
@@ -1364,27 +1406,36 @@ err:
  * This has been compressed on the server side, since it can be big.
  * Uncompress here.
  */
-static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd)
+static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
+					   bool *store_direct)
 {
 	struct cmd_iolog_pdu *pdu = (struct cmd_iolog_pdu *) cmd->payload;
 	struct cmd_iolog_pdu *ret;
 	uint64_t i;
+	int compressed;
 	void *samples;
 
+	*store_direct = false;
+
 	/*
 	 * Convert if compressed and we support it. If it's not
 	 * compressed, we need not do anything.
 	 */
-	if (le32_to_cpu(pdu->compressed)) {
+	compressed = le32_to_cpu(pdu->compressed);
+	if (compressed == XMIT_COMPRESSED) {
 #ifndef CONFIG_ZLIB
 		log_err("fio: server sent compressed data by mistake\n");
 		return NULL;
 #endif
 		ret = convert_iolog_gz(cmd, pdu);
+		printf("compressed iolog, %p\n", ret);
 		if (!ret) {
 			log_err("fio: failed decompressing log\n");
 			return NULL;
 		}
+	} else if (compressed == STORE_COMPRESSED) {
+		*store_direct = true;
+		ret = pdu;
 	} else
 		ret = pdu;
 
@@ -1394,6 +1445,9 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd)
 	ret->compressed		= le32_to_cpu(ret->compressed);
 	ret->log_offset		= le32_to_cpu(ret->log_offset);
 
+	if (*store_direct)
+		return ret;
+
 	samples = &ret->samples[0];
 	for (i = 0; i < ret->nr_samples; i++) {
 		struct io_sample *s;
@@ -1550,12 +1604,7 @@ int fio_handle_client(struct fio_client *client)
 		break;
 		}
 	case FIO_NET_CMD_IOLOG:
-		if (ops->iolog) {
-			struct cmd_iolog_pdu *pdu;
-
-			pdu = convert_iolog(cmd);
-			ops->iolog(client, pdu);
-		}
+		fio_client_handle_iolog(client, cmd);
 		break;
 	case FIO_NET_CMD_UPDATE_JOB:
 		ops->update_job(client, cmd);
diff --git a/client.h b/client.h
index 035e606..7fe09d1 100644
--- a/client.h
+++ b/client.h
@@ -76,12 +76,10 @@ struct fio_client {
 	unsigned int nr_files;
 };
 
-struct cmd_iolog_pdu;
 typedef void (client_cmd_op)(struct fio_client *, struct fio_net_cmd *);
 typedef void (client_eta_op)(struct jobs_eta *je);
 typedef void (client_timed_out_op)(struct fio_client *);
 typedef void (client_jobs_eta_op)(struct fio_client *client, struct jobs_eta *je);
-typedef void (client_iolog_op)(struct fio_client *client, struct cmd_iolog_pdu *);
 
 struct client_ops {
 	client_cmd_op		*text;
@@ -98,7 +96,6 @@ struct client_ops {
 	client_cmd_op		*stop;
 	client_cmd_op		*start;
 	client_cmd_op		*job_start;
-	client_iolog_op		*iolog;
 	client_timed_out_op	*removed;
 
 	unsigned int eta_msec;
diff --git a/gclient.c b/gclient.c
index 949ad42..9c32474 100644
--- a/gclient.c
+++ b/gclient.c
@@ -693,11 +693,6 @@ static void gfio_client_job_start(struct fio_client *client, struct fio_net_cmd
 	gdk_threads_leave();
 }
 
-static void gfio_client_iolog(struct fio_client *client, struct cmd_iolog_pdu *pdu)
-{
-	printf("got iolog: name=%s, type=%u, entries=%lu\n", pdu->name, pdu->log_type, (unsigned long) pdu->nr_samples);
-}
-
 static void gfio_add_total_depths_tree(GtkListStore *model,
 				       struct thread_stat *ts, unsigned int len)
 {
@@ -1393,7 +1388,6 @@ struct client_ops gfio_client_ops = {
 	.stop			= gfio_client_stop,
 	.start			= gfio_client_start,
 	.job_start		= gfio_client_job_start,
-	.iolog			= gfio_client_iolog,
 	.removed		= gfio_client_removed,
 	.eta_msec		= FIO_CLIENT_DEF_ETA_MSEC,
 	.stay_connected		= 1,
diff --git a/iolog.c b/iolog.c
index d4a1017..feda9ed 100644
--- a/iolog.c
+++ b/iolog.c
@@ -634,7 +634,7 @@ void free_log(struct io_log *log)
 	free(log);
 }
 
-static void flush_samples(FILE *f, void *samples, uint64_t sample_size)
+void flush_samples(FILE *f, void *samples, uint64_t sample_size)
 {
 	struct io_sample *s;
 	int log_offset;
@@ -682,13 +682,6 @@ struct iolog_flush_data {
 	uint64_t nr_samples;
 };
 
-struct iolog_compress {
-	struct flist_head list;
-	void *buf;
-	size_t len;
-	unsigned int seq;
-};
-
 #define GZ_CHUNK	131072
 
 static struct iolog_compress *get_new_chunk(unsigned int seq)
@@ -984,7 +977,7 @@ static int finish_log(struct thread_data *td, struct io_log *log, int trylock)
 	} else
 		fio_lock_file(log->filename);
 
-	if (td->client_type == FIO_CLIENT_TYPE_GUI)
+	if (td->client_type == FIO_CLIENT_TYPE_GUI || is_backend)
 		fio_send_iolog(td, log, log->filename);
 	else
 		flush_log(log, !td->o.per_job_logs);
@@ -994,6 +987,26 @@ static int finish_log(struct thread_data *td, struct io_log *log, int trylock)
 	return 0;
 }
 
+size_t log_chunk_sizes(struct io_log *log)
+{
+	struct flist_head *entry;
+	size_t ret;
+
+	if (flist_empty(&log->chunk_list))
+		return 0;
+
+	ret = 0;
+	pthread_mutex_lock(&log->chunk_lock);
+	flist_for_each(entry, &log->chunk_list) {
+		struct iolog_compress *c;
+
+		c = flist_entry(entry, struct iolog_compress, list);
+		ret += c->len;
+	}
+	pthread_mutex_unlock(&log->chunk_lock);
+	return ret;
+}
+
 #ifdef CONFIG_ZLIB
 
 static void drop_data_unlock(struct iolog_flush_data *data)
diff --git a/iolog.h b/iolog.h
index b99329a..297daf5 100644
--- a/iolog.h
+++ b/iolog.h
@@ -186,6 +186,7 @@ extern void prune_io_piece_log(struct thread_data *);
 extern void write_iolog_close(struct thread_data *);
 extern int iolog_compress_init(struct thread_data *, struct sk_out *);
 extern void iolog_compress_exit(struct thread_data *);
+extern size_t log_chunk_sizes(struct io_log *);
 
 #ifdef CONFIG_ZLIB
 extern int iolog_file_inflate(const char *);
@@ -207,6 +208,7 @@ struct log_params {
 extern void finalize_logs(struct thread_data *td);
 extern void setup_log(struct io_log **, struct log_params *, const char *);
 extern void flush_log(struct io_log *, int);
+extern void flush_samples(FILE *, void *, uint64_t);
 extern void free_log(struct io_log *);
 extern void fio_writeout_logs(struct thread_data *);
 extern int iolog_flush(struct io_log *, int);
@@ -217,4 +219,11 @@ static inline void init_ipo(struct io_piece *ipo)
 	INIT_FLIST_HEAD(&ipo->trim_list);
 }
 
+struct iolog_compress {
+	struct flist_head list;
+	void *buf;
+	size_t len;
+	unsigned int seq;
+};
+
 #endif
diff --git a/options.c b/options.c
index 964e263..45726aa 100644
--- a/options.c
+++ b/options.c
@@ -536,6 +536,7 @@ static int str_verify_cpus_allowed_cb(void *data, const char *input)
 	return set_cpus_allowed(td, &td->o.verify_cpumask, input);
 }
 
+#ifdef CONFIG_ZLIB
 static int str_log_cpus_allowed_cb(void *data, const char *input)
 {
 	struct thread_data *td = data;
@@ -545,8 +546,9 @@ static int str_log_cpus_allowed_cb(void *data, const char *input)
 
 	return set_cpus_allowed(td, &td->o.log_gz_cpumask, input);
 }
+#endif /* CONFIG_ZLIB */
 
-#endif
+#endif /* FIO_HAVE_CPU_AFFINITY */
 
 #ifdef CONFIG_LIBNUMA
 static int str_numa_cpunodes_cb(void *data, char *input)
@@ -4143,7 +4145,7 @@ int fio_cmd_option_parse(struct thread_data *td, const char *opt, char *val)
 {
 	int ret;
 
-	ret = parse_cmd_option(opt, val, fio_options, td);
+	ret = parse_cmd_option(opt, val, fio_options, td, &td->opt_list);
 	if (!ret) {
 		struct fio_option *o;
 
@@ -4158,7 +4160,8 @@ int fio_cmd_option_parse(struct thread_data *td, const char *opt, char *val)
 int fio_cmd_ioengine_option_parse(struct thread_data *td, const char *opt,
 				char *val)
 {
-	return parse_cmd_option(opt, val, td->io_ops->options, td->eo);
+	return parse_cmd_option(opt, val, td->io_ops->options, td->eo,
+					&td->opt_list);
 }
 
 void fio_fill_default_options(struct thread_data *td)
diff --git a/parse.c b/parse.c
index 0ef00b8..ac1bee9 100644
--- a/parse.c
+++ b/parse.c
@@ -960,8 +960,27 @@ void sort_options(char **opts, struct fio_option *options, int num_opts)
 	__fio_options = NULL;
 }
 
+static void add_to_dump_list(struct fio_option *o, struct flist_head *dump_list,
+			     const char *post)
+{
+	struct print_option *p;
+
+	if (!dump_list)
+		return;
+
+	p = malloc(sizeof(*p));
+	p->name = strdup(o->name);
+	if (post)
+		p->value = strdup(post);
+	else
+		p->value = NULL;
+
+	flist_add_tail(&p->list, dump_list);
+}
+
 int parse_cmd_option(const char *opt, const char *val,
-		     struct fio_option *options, void *data)
+		     struct fio_option *options, void *data,
+		     struct flist_head *dump_list)
 {
 	struct fio_option *o;
 
@@ -971,11 +990,13 @@ int parse_cmd_option(const char *opt, const char *val,
 		return 1;
 	}
 
-	if (!handle_option(o, val, data))
-		return 0;
+	if (handle_option(o, val, data)) {
+		log_err("fio: failed parsing %s=%s\n", opt, val);
+		return 1;
+	}
 
-	log_err("fio: failed parsing %s=%s\n", opt, val);
-	return 1;
+	add_to_dump_list(o, dump_list, val);
+	return 0;
 }
 
 int parse_option(char *opt, const char *input,
@@ -1006,18 +1027,7 @@ int parse_option(char *opt, const char *input,
 		return 1;
 	}
 
-	if (dump_list) {
-		struct print_option *p = malloc(sizeof(*p));
-
-		p->name = strdup((*o)->name);
-		if (post)
-			p->value = strdup(post);
-		else
-			p->value = NULL;
-
-		flist_add_tail(&p->list, dump_list);
-	}
-
+	add_to_dump_list(*o, dump_list, post);
 	return 0;
 }
 
diff --git a/parse.h b/parse.h
index 1882810..3ba8047 100644
--- a/parse.h
+++ b/parse.h
@@ -82,7 +82,7 @@ typedef int (str_cb_fn)(void *, char *);
 
 extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *, struct flist_head *);
 extern void sort_options(char **, struct fio_option *, int);
-extern int parse_cmd_option(const char *t, const char *l, struct fio_option *, void *);
+extern int parse_cmd_option(const char *t, const char *l, struct fio_option *, void *, struct flist_head *);
 extern int show_cmd_help(struct fio_option *, const char *);
 extern void fill_default_options(void *, struct fio_option *);
 extern void option_init(struct fio_option *);
diff --git a/server.c b/server.c
index f11e972..f53e2c8 100644
--- a/server.c
+++ b/server.c
@@ -37,6 +37,7 @@ enum {
 	SK_F_COPY	= 2,
 	SK_F_SIMPLE	= 4,
 	SK_F_VEC	= 8,
+	SK_F_INLINE	= 16,
 };
 
 struct sk_entry {
@@ -54,9 +55,10 @@ struct sk_out {
 				 * protected by below ->lock */
 
 	int sk;			/* socket fd to talk to client */
-	struct fio_mutex *lock;	/* protects ref and below list */
+	struct fio_mutex lock;	/* protects ref and below list */
 	struct flist_head list;	/* list of pending transmit work */
-	struct fio_mutex *wait;	/* wake backend when items added to list */
+	struct fio_mutex wait;	/* wake backend when items added to list */
+	struct fio_mutex xmit;	/* held while sending data */
 };
 
 static char *fio_server_arg;
@@ -116,12 +118,12 @@ static const char *fio_server_ops[FIO_NET_CMD_NR] = {
 
 static void sk_lock(struct sk_out *sk_out)
 {
-	fio_mutex_down(sk_out->lock);
+	fio_mutex_down(&sk_out->lock);
 }
 
 static void sk_unlock(struct sk_out *sk_out)
 {
-	fio_mutex_up(sk_out->lock);
+	fio_mutex_up(&sk_out->lock);
 }
 
 void sk_out_assign(struct sk_out *sk_out)
@@ -137,8 +139,9 @@ void sk_out_assign(struct sk_out *sk_out)
 
 static void sk_out_free(struct sk_out *sk_out)
 {
-	fio_mutex_remove(sk_out->lock);
-	fio_mutex_remove(sk_out->wait);
+	__fio_mutex_remove(&sk_out->lock);
+	__fio_mutex_remove(&sk_out->wait);
+	__fio_mutex_remove(&sk_out->xmit);
 	sfree(sk_out);
 }
 
@@ -517,8 +520,9 @@ int fio_net_send_cmd(int fd, uint16_t opcode, const void *buf, off_t size,
 	return ret;
 }
 
-static struct sk_entry *fio_net_prep_cmd(uint16_t opcode, void *buf, off_t size,
-					 uint64_t *tagptr, int flags)
+static struct sk_entry *fio_net_prep_cmd(uint16_t opcode, void *buf,
+					 size_t size, uint64_t *tagptr,
+					 int flags)
 {
 	struct sk_entry *entry;
 
@@ -530,22 +534,28 @@ static struct sk_entry *fio_net_prep_cmd(uint16_t opcode, void *buf, off_t size,
 		memcpy(entry->buf, buf, size);
 	} else
 		entry->buf = buf;
+
 	entry->size = size;
 	entry->tagptr = tagptr;
 	entry->flags = flags;
-
 	return entry;
 }
 
+static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry);
+
 static void fio_net_queue_entry(struct sk_entry *entry)
 {
 	struct sk_out *sk_out = pthread_getspecific(sk_out_key);
 
-	sk_lock(sk_out);
-	flist_add_tail(&entry->list, &sk_out->list);
-	sk_unlock(sk_out);
+	if (entry->flags & SK_F_INLINE)
+		handle_sk_entry(sk_out, entry);
+	else {
+		sk_lock(sk_out);
+		flist_add_tail(&entry->list, &sk_out->list);
+		sk_unlock(sk_out);
 
-	fio_mutex_up(sk_out->wait);
+		fio_mutex_up(&sk_out->wait);
+	}
 }
 
 static int fio_net_queue_cmd(uint16_t opcode, void *buf, off_t size,
@@ -1102,17 +1112,24 @@ static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry)
 {
 	int ret;
 
+	fio_mutex_down(&sk_out->xmit);
+
 	if (entry->flags & SK_F_VEC)
 		ret = send_vec_entry(sk_out, entry);
-	if (entry->flags & SK_F_SIMPLE) {
+	else if (entry->flags & SK_F_SIMPLE) {
 		uint64_t tag = 0;
 
 		if (entry->tagptr)
 			tag = *entry->tagptr;
 
-		ret = fio_net_send_simple_cmd(sk_out->sk, entry->opcode, tag, NULL);
-	} else
-		ret = fio_net_send_cmd(sk_out->sk, entry->opcode, entry->buf, entry->size, entry->tagptr, NULL);
+		ret = fio_net_send_simple_cmd(sk_out->sk, entry->opcode, tag,
+						NULL);
+	} else {
+		ret = fio_net_send_cmd(sk_out->sk, entry->opcode, entry->buf,
+					entry->size, entry->tagptr, NULL);
+	}
+
+	fio_mutex_up(&sk_out->xmit);
 
 	if (ret)
 		log_err("fio: failed handling cmd %s\n", fio_server_op(entry->opcode));
@@ -1177,7 +1194,7 @@ static int handle_connection(struct sk_out *sk_out)
 				break;
 			} else if (!ret) {
 				fio_server_check_jobs(&job_list);
-				fio_mutex_down_timeout(sk_out->wait, timeout);
+				fio_mutex_down_timeout(&sk_out->wait, timeout);
 				continue;
 			}
 
@@ -1323,8 +1340,9 @@ static int accept_loop(int listen_sk)
 		sk_out = smalloc(sizeof(*sk_out));
 		sk_out->sk = sk;
 		INIT_FLIST_HEAD(&sk_out->list);
-		sk_out->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
-		sk_out->wait = fio_mutex_init(FIO_MUTEX_LOCKED);
+		__fio_mutex_init(&sk_out->lock, FIO_MUTEX_UNLOCKED);
+		__fio_mutex_init(&sk_out->wait, FIO_MUTEX_LOCKED);
+		__fio_mutex_init(&sk_out->xmit, FIO_MUTEX_UNLOCKED);
 
 		pid = fork();
 		if (pid) {
@@ -1609,7 +1627,7 @@ void fio_server_send_du(void)
 	}
 }
 
-static int fio_send_iolog_gz(struct sk_entry *first, struct io_log *log)
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
 {
 	int ret = 0;
 #ifdef CONFIG_ZLIB
@@ -1649,7 +1667,8 @@ static int fio_send_iolog_gz(struct sk_entry *first, struct io_log *log)
 		this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream.avail_out;
 
 		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
-						NULL, SK_F_FREE | SK_F_VEC);
+						NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
+		out_pdu = NULL;
 		flist_add_tail(&entry->list, &first->next);
 	} while (stream.avail_in);
 
@@ -1661,6 +1680,36 @@ err:
 	return ret;
 }
 
+static int fio_append_gz_chunks(struct sk_entry *first, struct io_log *log)
+{
+	struct sk_entry *entry;
+	struct flist_head *node;
+
+	pthread_mutex_lock(&log->chunk_lock);
+	flist_for_each(node, &log->chunk_list) {
+		struct iolog_compress *c;
+
+		c = flist_entry(node, struct iolog_compress, list);
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, c->buf, c->len,
+						NULL, SK_F_VEC | SK_F_INLINE);
+		flist_add_tail(&entry->list, &first->next);
+	}
+	pthread_mutex_unlock(&log->chunk_lock);
+
+	return 0;
+}
+
+static int fio_append_text_log(struct sk_entry *first, struct io_log *log)
+{
+	struct sk_entry *entry;
+	size_t size = log->nr_samples * log_entry_sz(log);
+
+	entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, log->log, size,
+					NULL, SK_F_VEC | SK_F_INLINE);
+	flist_add_tail(&entry->list, &first->next);
+	return 0;
+}
+
 int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
 {
 	struct cmd_iolog_pdu pdu;
@@ -1670,11 +1719,21 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
 	pdu.nr_samples = cpu_to_le64(log->nr_samples);
 	pdu.thread_number = cpu_to_le32(td->thread_number);
 	pdu.log_type = cpu_to_le32(log->log_type);
-	pdu.compressed = cpu_to_le32(use_zlib);
+
+	if (!flist_empty(&log->chunk_list))
+		pdu.compressed = __cpu_to_le32(STORE_COMPRESSED);
+	else if (use_zlib)
+		pdu.compressed = __cpu_to_le32(XMIT_COMPRESSED);
+	else
+		pdu.compressed = 0;
 
 	strncpy((char *) pdu.name, name, FIO_NET_NAME_MAX);
 	pdu.name[FIO_NET_NAME_MAX - 1] = '\0';
 
+	/*
+	 * We can't do this for a pre-compressed log, but for that case,
+	 * log->nr_samples is zero anyway.
+	 */
 	for (i = 0; i < log->nr_samples; i++) {
 		struct io_sample *s = get_sample(log, i);
 
@@ -1693,23 +1752,22 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
 	/*
 	 * Assemble header entry first
 	 */
-	first = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, &pdu, sizeof(pdu), NULL, SK_F_COPY | SK_F_VEC);
+	first = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, &pdu, sizeof(pdu), NULL, SK_F_VEC | SK_F_INLINE | SK_F_COPY);
 
 	/*
-	 * Now append actual log entries. Compress if we can, otherwise just
-	 * plain text output.
+	 * Now append actual log entries. If log compression was enabled on
+	 * the job, just send out the compressed chunks directly. If we
+	 * have a plain log, compress if we can, then send. Otherwise, send
+	 * the plain text output.
 	 */
-	if (use_zlib)
-		ret = fio_send_iolog_gz(first, log);
-	else {
-		struct sk_entry *entry;
-
-		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, log->log,
-					log->nr_samples * log_entry_sz(log),
-					NULL, SK_F_FREE | SK_F_VEC);
-		flist_add_tail(&entry->list, &first->next);
-	}
+	if (!flist_empty(&log->chunk_list))
+		ret = fio_append_gz_chunks(first, log);
+	else if (use_zlib)
+		ret = fio_append_iolog_gz(first, log);
+	else
+		ret = fio_append_text_log(first, log);
 
+	fio_net_queue_entry(first);
 	return ret;
 }
 
@@ -1722,7 +1780,8 @@ void fio_server_send_add_job(struct thread_data *td)
 	pdu.groupid = cpu_to_le32(td->groupid);
 	convert_thread_options_to_net(&pdu.top, &td->o);
 
-	fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL, SK_F_COPY);
+	fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL,
+				SK_F_COPY);
 }
 
 void fio_server_send_start(struct thread_data *td)
@@ -1758,7 +1817,8 @@ int fio_server_get_verify_state(const char *name, int threadnumber,
 	verify_state_gen_name((char *) out.path, sizeof(out.path), name, me,
 				threadnumber);
 	tag = (uint64_t) (uintptr_t) rep;
-	fio_net_queue_cmd(FIO_NET_CMD_SENDFILE, &out, sizeof(out), &tag, SK_F_COPY);
+	fio_net_queue_cmd(FIO_NET_CMD_SENDFILE, &out, sizeof(out), &tag,
+				SK_F_COPY);
 
 	/*
 	 * Wait for the backend to receive the reply
@@ -1769,7 +1829,8 @@ int fio_server_get_verify_state(const char *name, int threadnumber,
 	}
 
 	if (rep->error) {
-		log_err("fio: failure on receiving state file: %s\n", strerror(rep->error));
+		log_err("fio: failure on receiving state file: %s\n",
+				strerror(rep->error));
 fail:
 		*datap = NULL;
 		sfree(rep);
diff --git a/server.h b/server.h
index dc4a419..5a59d07 100644
--- a/server.h
+++ b/server.h
@@ -172,6 +172,11 @@ struct cmd_text_pdu {
 	uint8_t buf[0];
 };
 
+enum {
+	XMIT_COMPRESSED		= 1U,
+	STORE_COMPRESSED	= 2U,
+};
+
 struct cmd_iolog_pdu {
 	uint64_t nr_samples;
 	uint32_t thread_number;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-12 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-12 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 9dc528b1638b625b5e167983a74de4e85c5859ea:

  lib/rand: get rid of unused MAX_SEED_BUCKETS (2022-08-10 09:51:49 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 7a7bcae0610d872951bc22dc310105c7ec1157af:

  Merge branch 's3_crypto' of github.com:hualongfeng/fio (2022-08-11 15:39:02 -0400)

----------------------------------------------------------------
Feng, Hualong (3):
      engines/http: Add storage class option for s3
      engines/http: Add s3 crypto options for s3
      doc: Add usage and example about s3 storage class and crypto

Friendy.Su@sony.com (1):
      ioengines: merge filecreate, filestat, filedelete engines to fileoperations.c

Vincent Fu (1):
      Merge branch 's3_crypto' of github.com:hualongfeng/fio

 HOWTO.rst                          |  14 ++
 Makefile                           |   2 +-
 engines/filecreate.c               | 118 --------------
 engines/filedelete.c               | 115 --------------
 engines/fileoperations.c           | 318 +++++++++++++++++++++++++++++++++++++
 engines/filestat.c                 | 190 ----------------------
 engines/http.c                     | 178 ++++++++++++++++++---
 examples/http-s3-crypto.fio        |  38 +++++
 examples/http-s3-storage-class.fio |  37 +++++
 fio.1                              |   9 ++
 10 files changed, 577 insertions(+), 442 deletions(-)
 delete mode 100644 engines/filecreate.c
 delete mode 100644 engines/filedelete.c
 create mode 100644 engines/fileoperations.c
 delete mode 100644 engines/filestat.c
 create mode 100644 examples/http-s3-crypto.fio
 create mode 100644 examples/http-s3-storage-class.fio

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 104cce2d..05fc117f 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -2692,6 +2692,20 @@ with the caveat that when used on the command line, they must come after the
 
 	The S3 key/access id.
 
+.. option:: http_s3_sse_customer_key=str : [http]
+
+        The encryption customer key in SSE server side.
+
+.. option:: http_s3_sse_customer_algorithm=str : [http]
+
+        The encryption customer algorithm in SSE server side.
+        Default is **AES256**
+
+.. option:: http_s3_storage_class=str : [http]
+
+        Which storage class to access. User-customizable settings.
+        Default is **STANDARD**
+
 .. option:: http_swift_auth_token=str : [http]
 
 	The Swift auth token. See the example configuration file on how
diff --git a/Makefile b/Makefile
index 188a74d7..634d2c93 100644
--- a/Makefile
+++ b/Makefile
@@ -56,7 +56,7 @@ SOURCE :=	$(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
 		pshared.c options.c \
 		smalloc.c filehash.c profile.c debug.c engines/cpu.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
-		engines/ftruncate.c engines/filecreate.c engines/filestat.c engines/filedelete.c \
+		engines/ftruncate.c engines/fileoperations.c \
 		engines/exec.c \
 		server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
 		gettime-thread.c helpers.c json.c idletime.c td_error.c \
diff --git a/engines/filecreate.c b/engines/filecreate.c
deleted file mode 100644
index 7884752d..00000000
--- a/engines/filecreate.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * filecreate engine
- *
- * IO engine that doesn't do any IO, just creates files and tracks the latency
- * of the file creation.
- */
-#include <stdio.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include "../fio.h"
-
-struct fc_data {
-	enum fio_ddir stat_ddir;
-};
-
-static int open_file(struct thread_data *td, struct fio_file *f)
-{
-	struct timespec start;
-	int do_lat = !td->o.disable_lat;
-
-	dprint(FD_FILE, "fd open %s\n", f->file_name);
-
-	if (f->filetype != FIO_TYPE_FILE) {
-		log_err("fio: only files are supported\n");
-		return 1;
-	}
-	if (!strcmp(f->file_name, "-")) {
-		log_err("fio: can't read/write to stdin/out\n");
-		return 1;
-	}
-
-	if (do_lat)
-		fio_gettime(&start, NULL);
-
-	f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600);
-
-	if (f->fd == -1) {
-		char buf[FIO_VERROR_SIZE];
-		int e = errno;
-
-		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
-		td_verror(td, e, buf);
-		return 1;
-	}
-
-	if (do_lat) {
-		struct fc_data *data = td->io_ops_data;
-		uint64_t nsec;
-
-		nsec = ntime_since_now(&start);
-		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
-	}
-
-	return 0;
-}
-
-static enum fio_q_status queue_io(struct thread_data *td,
-				  struct io_u fio_unused *io_u)
-{
-	return FIO_Q_COMPLETED;
-}
-
-/*
- * Ensure that we at least have a block size worth of IO to do for each
- * file. If the job file has td->o.size < nr_files * block_size, then
- * fio won't do anything.
- */
-static int get_file_size(struct thread_data *td, struct fio_file *f)
-{
-	f->real_file_size = td_min_bs(td);
-	return 0;
-}
-
-static int init(struct thread_data *td)
-{
-	struct fc_data *data;
-
-	data = calloc(1, sizeof(*data));
-
-	if (td_read(td))
-		data->stat_ddir = DDIR_READ;
-	else if (td_write(td))
-		data->stat_ddir = DDIR_WRITE;
-
-	td->io_ops_data = data;
-	return 0;
-}
-
-static void cleanup(struct thread_data *td)
-{
-	struct fc_data *data = td->io_ops_data;
-
-	free(data);
-}
-
-static struct ioengine_ops ioengine = {
-	.name		= "filecreate",
-	.version	= FIO_IOOPS_VERSION,
-	.init		= init,
-	.cleanup	= cleanup,
-	.queue		= queue_io,
-	.get_file_size	= get_file_size,
-	.open_file	= open_file,
-	.close_file	= generic_close_file,
-	.flags		= FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
-				FIO_NOSTATS | FIO_NOFILEHASH,
-};
-
-static void fio_init fio_filecreate_register(void)
-{
-	register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_filecreate_unregister(void)
-{
-	unregister_ioengine(&ioengine);
-}
diff --git a/engines/filedelete.c b/engines/filedelete.c
deleted file mode 100644
index df388ac9..00000000
--- a/engines/filedelete.c
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * file delete engine
- *
- * IO engine that doesn't do any IO, just delete files and track the latency
- * of the file deletion.
- */
-#include <stdio.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include "../fio.h"
-
-struct fc_data {
-	enum fio_ddir stat_ddir;
-};
-
-static int delete_file(struct thread_data *td, struct fio_file *f)
-{
-	struct timespec start;
-	int do_lat = !td->o.disable_lat;
-	int ret;
-
-	dprint(FD_FILE, "fd delete %s\n", f->file_name);
-
-	if (f->filetype != FIO_TYPE_FILE) {
-		log_err("fio: only files are supported\n");
-		return 1;
-	}
-	if (!strcmp(f->file_name, "-")) {
-		log_err("fio: can't read/write to stdin/out\n");
-		return 1;
-	}
-
-	if (do_lat)
-		fio_gettime(&start, NULL);
-
-	ret = unlink(f->file_name);
-
-	if (ret == -1) {
-		char buf[FIO_VERROR_SIZE];
-		int e = errno;
-
-		snprintf(buf, sizeof(buf), "delete(%s)", f->file_name);
-		td_verror(td, e, buf);
-		return 1;
-	}
-
-	if (do_lat) {
-		struct fc_data *data = td->io_ops_data;
-		uint64_t nsec;
-
-		nsec = ntime_since_now(&start);
-		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
-	}
-
-	return 0;
-}
-
-
-static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u)
-{
-	return FIO_Q_COMPLETED;
-}
-
-static int init(struct thread_data *td)
-{
-	struct fc_data *data;
-
-	data = calloc(1, sizeof(*data));
-
-	if (td_read(td))
-		data->stat_ddir = DDIR_READ;
-	else if (td_write(td))
-		data->stat_ddir = DDIR_WRITE;
-
-	td->io_ops_data = data;
-	return 0;
-}
-
-static int delete_invalidate(struct thread_data *td, struct fio_file *f)
-{
-    /* do nothing because file not opened */
-    return 0;
-}
-
-static void cleanup(struct thread_data *td)
-{
-	struct fc_data *data = td->io_ops_data;
-
-	free(data);
-}
-
-static struct ioengine_ops ioengine = {
-	.name		= "filedelete",
-	.version	= FIO_IOOPS_VERSION,
-	.init		= init,
-	.invalidate	= delete_invalidate,
-	.cleanup	= cleanup,
-	.queue		= queue_io,
-	.get_file_size	= generic_get_file_size,
-	.open_file	= delete_file,
-	.flags		=  FIO_SYNCIO | FIO_FAKEIO |
-				FIO_NOSTATS | FIO_NOFILEHASH,
-};
-
-static void fio_init fio_filedelete_register(void)
-{
-	register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_filedelete_unregister(void)
-{
-	unregister_ioengine(&ioengine);
-}
diff --git a/engines/fileoperations.c b/engines/fileoperations.c
new file mode 100644
index 00000000..1db60da1
--- /dev/null
+++ b/engines/fileoperations.c
@@ -0,0 +1,318 @@
+/*
+ * fileoperations engine
+ *
+ * IO engine that doesn't do any IO, just operates on files and tracks the
+ * latency of each file operation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "../fio.h"
+#include "../optgroup.h"
+#include "../oslib/statx.h"
+
+
+struct fc_data {
+	enum fio_ddir stat_ddir;
+};
+
+struct filestat_options {
+	void *pad;
+	unsigned int stat_type;
+};
+
+enum {
+	FIO_FILESTAT_STAT	= 1,
+	FIO_FILESTAT_LSTAT	= 2,
+	FIO_FILESTAT_STATX	= 3,
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "stat_type",
+		.lname	= "stat_type",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct filestat_options, stat_type),
+		.help	= "Specify stat system call type to measure lookup/getattr performance",
+		.def	= "stat",
+		.posval = {
+			  { .ival = "stat",
+			    .oval = FIO_FILESTAT_STAT,
+			    .help = "Use stat(2)",
+			  },
+			  { .ival = "lstat",
+			    .oval = FIO_FILESTAT_LSTAT,
+			    .help = "Use lstat(2)",
+			  },
+			  { .ival = "statx",
+			    .oval = FIO_FILESTAT_STATX,
+			    .help = "Use statx(2) if exists",
+			  },
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_FILESTAT,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+
+static int open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct timespec start;
+	int do_lat = !td->o.disable_lat;
+
+	dprint(FD_FILE, "fd open %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported\n");
+		return 1;
+	}
+	if (!strcmp(f->file_name, "-")) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+	if (do_lat)
+		fio_gettime(&start, NULL);
+
+	f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600);
+
+	if (f->fd == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int e = errno;
+
+		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
+		td_verror(td, e, buf);
+		return 1;
+	}
+
+	if (do_lat) {
+		struct fc_data *data = td->io_ops_data;
+		uint64_t nsec;
+
+		nsec = ntime_since_now(&start);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
+	}
+
+	return 0;
+}
+
+static int stat_file(struct thread_data *td, struct fio_file *f)
+{
+	struct filestat_options *o = td->eo;
+	struct timespec start;
+	int do_lat = !td->o.disable_lat;
+	struct stat statbuf;
+#ifndef WIN32
+	struct statx statxbuf;
+	char *abspath;
+#endif
+	int ret;
+
+	dprint(FD_FILE, "fd stat %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported\n");
+		return 1;
+	}
+	if (!strcmp(f->file_name, "-")) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+	if (do_lat)
+		fio_gettime(&start, NULL);
+
+	switch (o->stat_type) {
+	case FIO_FILESTAT_STAT:
+		ret = stat(f->file_name, &statbuf);
+		break;
+	case FIO_FILESTAT_LSTAT:
+		ret = lstat(f->file_name, &statbuf);
+		break;
+	case FIO_FILESTAT_STATX:
+#ifndef WIN32
+		abspath = realpath(f->file_name, NULL);
+		if (abspath) {
+			ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf);
+			free(abspath);
+		} else
+			ret = -1;
+#else
+		ret = -1;
+#endif
+		break;
+	default:
+		ret = -1;
+		break;
+	}
+
+	if (ret == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int e = errno;
+
+		snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name,
+			o->stat_type);
+		td_verror(td, e, buf);
+		return 1;
+	}
+
+	if (do_lat) {
+		struct fc_data *data = td->io_ops_data;
+		uint64_t nsec;
+
+		nsec = ntime_since_now(&start);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
+	}
+
+	return 0;
+}
+
+
+static int delete_file(struct thread_data *td, struct fio_file *f)
+{
+	struct timespec start;
+	int do_lat = !td->o.disable_lat;
+	int ret;
+
+	dprint(FD_FILE, "fd delete %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported\n");
+		return 1;
+	}
+	if (!strcmp(f->file_name, "-")) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+	if (do_lat)
+		fio_gettime(&start, NULL);
+
+	ret = unlink(f->file_name);
+
+	if (ret == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int e = errno;
+
+		snprintf(buf, sizeof(buf), "delete(%s)", f->file_name);
+		td_verror(td, e, buf);
+		return 1;
+	}
+
+	if (do_lat) {
+		struct fc_data *data = td->io_ops_data;
+		uint64_t nsec;
+
+		nsec = ntime_since_now(&start);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
+	}
+
+	return 0;
+}
+
+static int invalidate_do_nothing(struct thread_data *td, struct fio_file *f)
+{
+	/* do nothing because file not opened */
+	return 0;
+}
+
+static enum fio_q_status queue_io(struct thread_data *td, struct io_u *io_u)
+{
+	return FIO_Q_COMPLETED;
+}
+
+/*
+ * Ensure that we at least have a block size worth of IO to do for each
+ * file. If the job file has td->o.size < nr_files * block_size, then
+ * fio won't do anything.
+ */
+static int get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	f->real_file_size = td_min_bs(td);
+	return 0;
+}
+
+static int init(struct thread_data *td)
+{
+	struct fc_data *data;
+
+	data = calloc(1, sizeof(*data));
+
+	if (td_read(td))
+		data->stat_ddir = DDIR_READ;
+	else if (td_write(td))
+		data->stat_ddir = DDIR_WRITE;
+
+	td->io_ops_data = data;
+	return 0;
+}
+
+static void cleanup(struct thread_data *td)
+{
+	struct fc_data *data = td->io_ops_data;
+
+	free(data);
+}
+
+static struct ioengine_ops ioengine_filecreate = {
+	.name		= "filecreate",
+	.version	= FIO_IOOPS_VERSION,
+	.init		= init,
+	.cleanup	= cleanup,
+	.queue		= queue_io,
+	.get_file_size	= get_file_size,
+	.open_file	= open_file,
+	.close_file	= generic_close_file,
+	.flags		= FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
+				FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+static struct ioengine_ops ioengine_filestat = {
+	.name		= "filestat",
+	.version	= FIO_IOOPS_VERSION,
+	.init		= init,
+	.cleanup	= cleanup,
+	.queue		= queue_io,
+	.invalidate	= invalidate_do_nothing,
+	.get_file_size	= generic_get_file_size,
+	.open_file	= stat_file,
+	.flags		=  FIO_SYNCIO | FIO_FAKEIO |
+				FIO_NOSTATS | FIO_NOFILEHASH,
+	.options	= options,
+	.option_struct_size = sizeof(struct filestat_options),
+};
+
+static struct ioengine_ops ioengine_filedelete = {
+	.name		= "filedelete",
+	.version	= FIO_IOOPS_VERSION,
+	.init		= init,
+	.invalidate	= invalidate_do_nothing,
+	.cleanup	= cleanup,
+	.queue		= queue_io,
+	.get_file_size	= generic_get_file_size,
+	.open_file	= delete_file,
+	.flags		=  FIO_SYNCIO | FIO_FAKEIO |
+				FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+
+static void fio_init fio_fileoperations_register(void)
+{
+	register_ioengine(&ioengine_filecreate);
+	register_ioengine(&ioengine_filestat);
+	register_ioengine(&ioengine_filedelete);
+}
+
+static void fio_exit fio_fileoperations_unregister(void)
+{
+	unregister_ioengine(&ioengine_filecreate);
+	unregister_ioengine(&ioengine_filestat);
+	unregister_ioengine(&ioengine_filedelete);
+}
diff --git a/engines/filestat.c b/engines/filestat.c
deleted file mode 100644
index e587eb54..00000000
--- a/engines/filestat.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * filestat engine
- *
- * IO engine that doesn't do any IO, just stat files and tracks the latency
- * of the file stat.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include "../fio.h"
-#include "../optgroup.h"
-#include "../oslib/statx.h"
-
-struct fc_data {
-	enum fio_ddir stat_ddir;
-};
-
-struct filestat_options {
-	void *pad;
-	unsigned int stat_type;
-};
-
-enum {
-	FIO_FILESTAT_STAT	= 1,
-	FIO_FILESTAT_LSTAT	= 2,
-	FIO_FILESTAT_STATX	= 3,
-};
-
-static struct fio_option options[] = {
-	{
-		.name	= "stat_type",
-		.lname	= "stat_type",
-		.type	= FIO_OPT_STR,
-		.off1	= offsetof(struct filestat_options, stat_type),
-		.help	= "Specify stat system call type to measure lookup/getattr performance",
-		.def	= "stat",
-		.posval = {
-			  { .ival = "stat",
-			    .oval = FIO_FILESTAT_STAT,
-			    .help = "Use stat(2)",
-			  },
-			  { .ival = "lstat",
-			    .oval = FIO_FILESTAT_LSTAT,
-			    .help = "Use lstat(2)",
-			  },
-			  { .ival = "statx",
-			    .oval = FIO_FILESTAT_STATX,
-			    .help = "Use statx(2) if exists",
-			  },
-		},
-		.category = FIO_OPT_C_ENGINE,
-		.group	= FIO_OPT_G_FILESTAT,
-	},
-	{
-		.name	= NULL,
-	},
-};
-
-static int stat_file(struct thread_data *td, struct fio_file *f)
-{
-	struct filestat_options *o = td->eo;
-	struct timespec start;
-	int do_lat = !td->o.disable_lat;
-	struct stat statbuf;
-#ifndef WIN32
-	struct statx statxbuf;
-	char *abspath;
-#endif
-	int ret;
-
-	dprint(FD_FILE, "fd stat %s\n", f->file_name);
-
-	if (f->filetype != FIO_TYPE_FILE) {
-		log_err("fio: only files are supported\n");
-		return 1;
-	}
-	if (!strcmp(f->file_name, "-")) {
-		log_err("fio: can't read/write to stdin/out\n");
-		return 1;
-	}
-
-	if (do_lat)
-		fio_gettime(&start, NULL);
-
-	switch (o->stat_type){
-	case FIO_FILESTAT_STAT:
-		ret = stat(f->file_name, &statbuf);
-		break;
-	case FIO_FILESTAT_LSTAT:
-		ret = lstat(f->file_name, &statbuf);
-		break;
-	case FIO_FILESTAT_STATX:
-#ifndef WIN32
-		abspath = realpath(f->file_name, NULL);
-		if (abspath) {
-			ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf);
-			free(abspath);
-		} else
-			ret = -1;
-#else
-		ret = -1;
-#endif
-		break;
-	default:
-		ret = -1;
-		break;
-	}
-
-	if (ret == -1) {
-		char buf[FIO_VERROR_SIZE];
-		int e = errno;
-
-		snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name,
-			o->stat_type);
-		td_verror(td, e, buf);
-		return 1;
-	}
-
-	if (do_lat) {
-		struct fc_data *data = td->io_ops_data;
-		uint64_t nsec;
-
-		nsec = ntime_since_now(&start);
-		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
-	}
-
-	return 0;
-}
-
-static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u)
-{
-	return FIO_Q_COMPLETED;
-}
-
-static int init(struct thread_data *td)
-{
-	struct fc_data *data;
-
-	data = calloc(1, sizeof(*data));
-
-	if (td_read(td))
-		data->stat_ddir = DDIR_READ;
-	else if (td_write(td))
-		data->stat_ddir = DDIR_WRITE;
-
-	td->io_ops_data = data;
-	return 0;
-}
-
-static void cleanup(struct thread_data *td)
-{
-	struct fc_data *data = td->io_ops_data;
-
-	free(data);
-}
-
-static int stat_invalidate(struct thread_data *td, struct fio_file *f)
-{
-	/* do nothing because file not opened */
-	return 0;
-}
-
-static struct ioengine_ops ioengine = {
-	.name		= "filestat",
-	.version	= FIO_IOOPS_VERSION,
-	.init		= init,
-	.cleanup	= cleanup,
-	.queue		= queue_io,
-	.invalidate	= stat_invalidate,
-	.get_file_size	= generic_get_file_size,
-	.open_file	= stat_file,
-	.flags		=  FIO_SYNCIO | FIO_FAKEIO |
-				FIO_NOSTATS | FIO_NOFILEHASH,
-	.options	= options,
-	.option_struct_size = sizeof(struct filestat_options),
-};
-
-static void fio_init fio_filestat_register(void)
-{
-	register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_filestat_unregister(void)
-{
-	unregister_ioengine(&ioengine);
-}
diff --git a/engines/http.c b/engines/http.c
index 1de9e66c..56dc7d1b 100644
--- a/engines/http.c
+++ b/engines/http.c
@@ -57,6 +57,9 @@ struct http_options {
 	char *s3_key;
 	char *s3_keyid;
 	char *s3_region;
+	char *s3_sse_customer_key;
+	char *s3_sse_customer_algorithm;
+	char *s3_storage_class;
 	char *swift_auth_token;
 	int verbose;
 	unsigned int mode;
@@ -161,6 +164,36 @@ static struct fio_option options[] = {
 		.category = FIO_OPT_C_ENGINE,
 		.group    = FIO_OPT_G_HTTP,
 	},
+	{
+		.name     = "http_s3_sse_customer_key",
+		.lname    = "SSE Customer Key",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "S3 SSE Customer Key",
+		.off1     = offsetof(struct http_options, s3_sse_customer_key),
+		.def	  = "",
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_HTTP,
+	},
+	{
+		.name     = "http_s3_sse_customer_algorithm",
+		.lname    = "SSE Customer Algorithm",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "S3 SSE Customer Algorithm",
+		.off1     = offsetof(struct http_options, s3_sse_customer_algorithm),
+		.def	  = "AES256",
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_HTTP,
+	},
+	{
+		.name     = "http_s3_storage_class",
+		.lname    = "S3 Storage class",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "S3 Storage Class",
+		.off1     = offsetof(struct http_options, s3_storage_class),
+		.def	  = "STANDARD",
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_HTTP,
+	},
 	{
 		.name     = "http_mode",
 		.lname    = "Request mode to use",
@@ -266,6 +299,54 @@ static char *_gen_hex_md5(const char *p, size_t len)
 	return _conv_hex(hash, MD5_DIGEST_LENGTH);
 }
 
+static char *_conv_base64_encode(const unsigned char *p, size_t len)
+{
+	char *r, *ret;
+	int i;
+	static const char sEncodingTable[] = {
+		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+		'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+		'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+		'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+		'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+		'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+		'w', 'x', 'y', 'z', '0', '1', '2', '3',
+		'4', '5', '6', '7', '8', '9', '+', '/'
+	};
+
+	size_t out_len = 4 * ((len + 2) / 3);
+	ret = r = malloc(out_len + 1);
+
+	for (i = 0; i < len - 2; i += 3) {
+		*r++ = sEncodingTable[(p[i] >> 2) & 0x3F];
+		*r++ = sEncodingTable[((p[i] & 0x3) << 4) | ((int) (p[i + 1] & 0xF0) >> 4)];
+		*r++ = sEncodingTable[((p[i + 1] & 0xF) << 2) | ((int) (p[i + 2] & 0xC0) >> 6)];
+		*r++ = sEncodingTable[p[i + 2] & 0x3F];
+	}
+
+	if (i < len) {
+		*r++ = sEncodingTable[(p[i] >> 2) & 0x3F];
+		if (i == (len - 1)) {
+			*r++ = sEncodingTable[((p[i] & 0x3) << 4)];
+			*r++ = '=';
+		} else {
+			*r++ = sEncodingTable[((p[i] & 0x3) << 4) | ((int) (p[i + 1] & 0xF0) >> 4)];
+			*r++ = sEncodingTable[((p[i + 1] & 0xF) << 2)];
+		}
+		*r++ = '=';
+	}
+
+	ret[out_len]=0;
+	return ret;
+}
+
+static char *_gen_base64_md5(const unsigned char *p, size_t len)
+{
+	unsigned char hash[MD5_DIGEST_LENGTH];
+	MD5((unsigned char*)p, len, hash);
+	return _conv_base64_encode(hash, MD5_DIGEST_LENGTH);
+}
+
 static void _hmac(unsigned char *md, void *key, int key_len, char *data) {
 #ifndef CONFIG_HAVE_OPAQUE_HMAC_CTX
 	HMAC_CTX _ctx;
@@ -335,8 +416,8 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 	char date_iso[32];
 	char method[8];
 	char dkey[128];
-	char creq[512];
-	char sts[256];
+	char creq[4096];
+	char sts[512];
 	char s[512];
 	char *uri_encoded = NULL;
 	char *dsha = NULL;
@@ -345,6 +426,9 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 	const char *service = "s3";
 	const char *aws = "aws4_request";
 	unsigned char md[SHA256_DIGEST_LENGTH];
+	unsigned char sse_key[33] = {0};
+	char *sse_key_base64 = NULL;
+	char *sse_key_md5_base64 = NULL;
 
 	time_t t = time(NULL);
 	struct tm *gtm = gmtime(&t);
@@ -353,6 +437,9 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 	strftime (date_iso, sizeof(date_iso), "%Y%m%dT%H%M%SZ", gtm);
 	uri_encoded = _aws_uriencode(uri);
 
+	if (o->s3_sse_customer_key != NULL)
+		strncpy((char*)sse_key, o->s3_sse_customer_key, sizeof(sse_key) - 1);
+
 	if (op == DDIR_WRITE) {
 		dsha = _gen_hex_sha256(buf, len);
 		sprintf(method, "PUT");
@@ -366,22 +453,50 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 	}
 
 	/* Create the canonical request first */
-	snprintf(creq, sizeof(creq),
-	"%s\n"
-	"%s\n"
-	"\n"
-	"host:%s\n"
-	"x-amz-content-sha256:%s\n"
-	"x-amz-date:%s\n"
-	"\n"
-	"host;x-amz-content-sha256;x-amz-date\n"
-	"%s"
-	, method
-	, uri_encoded, o->host, dsha, date_iso, dsha);
+	if (sse_key[0] != '\0') {
+		sse_key_base64 = _conv_base64_encode(sse_key, sizeof(sse_key) - 1);
+		sse_key_md5_base64 = _gen_base64_md5(sse_key, sizeof(sse_key) - 1);
+		snprintf(creq, sizeof(creq),
+			"%s\n"
+			"%s\n"
+			"\n"
+			"host:%s\n"
+			"x-amz-content-sha256:%s\n"
+			"x-amz-date:%s\n"
+			"x-amz-server-side-encryption-customer-algorithm:%s\n"
+			"x-amz-server-side-encryption-customer-key:%s\n"
+			"x-amz-server-side-encryption-customer-key-md5:%s\n"
+			"x-amz-storage-class:%s\n"
+			"\n"
+			"host;x-amz-content-sha256;x-amz-date;"
+			"x-amz-server-side-encryption-customer-algorithm;"
+			"x-amz-server-side-encryption-customer-key;"
+			"x-amz-server-side-encryption-customer-key-md5;"
+			"x-amz-storage-class\n"
+			"%s"
+			, method
+			, uri_encoded, o->host, dsha, date_iso
+			, o->s3_sse_customer_algorithm, sse_key_base64
+			, sse_key_md5_base64, o->s3_storage_class, dsha);
+	} else {
+		snprintf(creq, sizeof(creq),
+			"%s\n"
+			"%s\n"
+			"\n"
+			"host:%s\n"
+			"x-amz-content-sha256:%s\n"
+			"x-amz-date:%s\n"
+			"x-amz-storage-class:%s\n"
+			"\n"
+			"host;x-amz-content-sha256;x-amz-date;x-amz-storage-class\n"
+			"%s"
+			, method
+			, uri_encoded, o->host, dsha, date_iso, o->s3_storage_class, dsha);
+	}
 
 	csha = _gen_hex_sha256(creq, strlen(creq));
 	snprintf(sts, sizeof(sts), "AWS4-HMAC-SHA256\n%s\n%s/%s/%s/%s\n%s",
-		date_iso, date_short, o->s3_region, service, aws, csha);
+			date_iso, date_short, o->s3_region, service, aws, csha);
 
 	snprintf((char *)dkey, sizeof(dkey), "AWS4%s", o->s3_key);
 	_hmac(md, dkey, strlen(dkey), date_short);
@@ -401,9 +516,32 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 	snprintf(s, sizeof(s), "x-amz-date: %s", date_iso);
 	slist = curl_slist_append(slist, s);
 
-	snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
-	"SignedHeaders=host;x-amz-content-sha256;x-amz-date,Signature=%s",
-	o->s3_keyid, date_short, o->s3_region, signature);
+	if (sse_key[0] != '\0') {
+		snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-algorithm: %s", o->s3_sse_customer_algorithm);
+		slist = curl_slist_append(slist, s);
+		snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-key: %s", sse_key_base64);
+		slist = curl_slist_append(slist, s);
+		snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-key-md5: %s", sse_key_md5_base64);
+		slist = curl_slist_append(slist, s);
+	}
+
+	snprintf(s, sizeof(s), "x-amz-storage-class: %s", o->s3_storage_class);
+	slist = curl_slist_append(slist, s);
+
+	if (sse_key[0] != '\0') {
+		snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
+			"SignedHeaders=host;x-amz-content-sha256;"
+			"x-amz-date;x-amz-server-side-encryption-customer-algorithm;"
+			"x-amz-server-side-encryption-customer-key;"
+			"x-amz-server-side-encryption-customer-key-md5;"
+			"x-amz-storage-class,"
+			"Signature=%s",
+		o->s3_keyid, date_short, o->s3_region, signature);
+	} else {
+		snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
+			"SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-storage-class,Signature=%s",
+			o->s3_keyid, date_short, o->s3_region, signature);
+	}
 	slist = curl_slist_append(slist, s);
 
 	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);
@@ -412,6 +550,10 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 	free(csha);
 	free(dsha);
 	free(signature);
+	if (sse_key_base64 != NULL) {
+		free(sse_key_base64);
+		free(sse_key_md5_base64);
+	}
 }
 
 static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_options *o,
diff --git a/examples/http-s3-crypto.fio b/examples/http-s3-crypto.fio
new file mode 100644
index 00000000..2403746e
--- /dev/null
+++ b/examples/http-s3-crypto.fio
@@ -0,0 +1,38 @@
+# Example test for the HTTP engine's S3 support against Amazon AWS.
+# Obviously, you have to adjust the S3 credentials; for this example,
+# they're passed in via the environment.
+# And you can set the SSE Customer Key and Algorithm to test Server
+# Side Encryption.
+#
+
+[global]
+ioengine=http
+name=test
+direct=1
+filename=/larsmb-fio-test/object
+http_verbose=0
+https=on
+http_mode=s3
+http_s3_key=${S3_KEY}
+http_s3_keyid=${S3_ID}
+http_host=s3.eu-central-1.amazonaws.com
+http_s3_region=eu-central-1
+http_s3_sse_customer_key=${SSE_KEY}
+http_s3_sse_customer_algorithm=AES256
+group_reporting
+
+# With verify, this both writes and reads the object
+[create]
+rw=write
+bs=4k
+size=64k
+io_size=4k
+verify=sha256
+
+[trim]
+stonewall
+rw=trim
+bs=4k
+size=64k
+io_size=4k
+
diff --git a/examples/http-s3-storage-class.fio b/examples/http-s3-storage-class.fio
new file mode 100644
index 00000000..9ee23837
--- /dev/null
+++ b/examples/http-s3-storage-class.fio
@@ -0,0 +1,37 @@
+# Example test for the HTTP engine's S3 support against Amazon AWS.
+# Obviously, you have to adjust the S3 credentials; for this example,
+# they're passed in via the environment.
+# This example also sets the storage class parameter: use STANDARD for a
+# normal test, or another storage class for e.g. a compression test.
+#
+
+[global]
+ioengine=http
+name=test
+direct=1
+filename=/larsmb-fio-test/object
+http_verbose=0
+https=on
+http_mode=s3
+http_s3_key=${S3_KEY}
+http_s3_keyid=${S3_ID}
+http_host=s3.eu-central-1.amazonaws.com
+http_s3_region=eu-central-1
+http_s3_storage_class=${STORAGE_CLASS}
+group_reporting
+
+# With verify, this both writes and reads the object
+[create]
+rw=write
+bs=4k
+size=64k
+io_size=4k
+verify=sha256
+
+[trim]
+stonewall
+rw=trim
+bs=4k
+size=64k
+io_size=4k
+
diff --git a/fio.1 b/fio.1
index ce9bf3ef..6630525f 100644
--- a/fio.1
+++ b/fio.1
@@ -2308,6 +2308,15 @@ The S3 secret key.
 .BI (http)http_s3_keyid \fR=\fPstr
 The S3 key/access id.
 .TP
+.BI (http)http_s3_sse_customer_key \fR=\fPstr
+The customer-provided encryption key used for S3 server-side encryption (SSE).
+.TP
+.BI (http)http_s3_sse_customer_algorithm \fR=\fPstr
+The customer encryption algorithm used for S3 server-side encryption (SSE). Default is \fBAES256\fR.
+.TP
+.BI (http)http_s3_storage_class \fR=\fPstr
+The S3 storage class to use. This is a user-customizable setting. Default is \fBSTANDARD\fR.
+.TP
 .BI (http)http_swift_auth_token \fR=\fPstr
 The Swift auth token. See the example configuration file on how to
 retrieve this.

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-11 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-11 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6cafe8445fd1e04e5f7d67bbc73029a538d1b253:

  Fio 3.31 (2022-08-09 14:41:25 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 9dc528b1638b625b5e167983a74de4e85c5859ea:

  lib/rand: get rid of unused MAX_SEED_BUCKETS (2022-08-10 09:51:49 -0600)

----------------------------------------------------------------
Jens Axboe (2):
      Merge branch 'multi_seed_refill' of https://github.com/sungup/fio
      lib/rand: get rid of unused MAX_SEED_BUCKETS

Sungup Moon (1):
      lib/rand: Enhance __fill_random_buf using the multi random seed

 configure  | 17 +++++++++++++++++
 lib/rand.c | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index 36450df8..a2b9bd4c 100755
--- a/configure
+++ b/configure
@@ -116,6 +116,10 @@ has() {
   type "$1" >/dev/null 2>&1
 }
 
+num() {
+  echo "$1" | grep -P -q "^[0-9]+$"
+}
+
 check_define() {
   cat > $TMPC <<EOF
 #if !defined($1)
@@ -174,6 +178,7 @@ libnfs=""
 xnvme=""
 libzbc=""
 dfs=""
+seed_buckets=""
 dynamic_engines="no"
 prefix=/usr/local
 
@@ -255,6 +260,8 @@ for opt do
   ;;
   --enable-asan) asan="yes"
   ;;
+  --seed-buckets=*) seed_buckets="$optarg"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -302,6 +309,7 @@ if test "$show_help" = "yes" ; then
   echo "--dynamic-libengines    Lib-based ioengines as dynamic libraries"
   echo "--disable-dfs           Disable DAOS File System support even if found"
   echo "--enable-asan           Enable address sanitizer"
+  echo "--seed-buckets=         Number of seed buckets for the refill-buffer"
   exit $exit_val
 fi
 
@@ -3273,6 +3281,15 @@ if test "$disable_tcmalloc" != "yes"; then
   fi
 fi
 print_config "TCMalloc support" "$tcmalloc"
+if ! num "$seed_buckets"; then
+  seed_buckets=4
+elif test "$seed_buckets" -lt 2; then
+  seed_buckets=2
+elif test "$seed_buckets" -gt 16; then
+  seed_buckets=16
+fi
+echo "#define CONFIG_SEED_BUCKETS $seed_buckets" >> $config_host_h
+print_config "seed_buckets" "$seed_buckets"
 
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
diff --git a/lib/rand.c b/lib/rand.c
index 1e669116..0e787a62 100644
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -95,7 +95,7 @@ void init_rand_seed(struct frand_state *state, uint64_t seed, bool use64)
 		__init_rand64(&state->state64, seed);
 }
 
-void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
+void __fill_random_buf_small(void *buf, unsigned int len, uint64_t seed)
 {
 	uint64_t *b = buf;
 	uint64_t *e = b  + len / sizeof(*b);
@@ -110,6 +110,37 @@ void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
 		__builtin_memcpy(e, &seed, rest);
 }
 
+void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
+{
+	static uint64_t prime[] = {1, 2, 3, 5, 7, 11, 13, 17,
+				   19, 23, 29, 31, 37, 41, 43, 47};
+	uint64_t *b, *e, s[CONFIG_SEED_BUCKETS];
+	unsigned int rest;
+	int p;
+
+	/*
+	 * Calculate the largest index that is a multiple of the seed bucket count.
+	 */
+	rest = (len / sizeof(*b) / CONFIG_SEED_BUCKETS) * CONFIG_SEED_BUCKETS;
+
+	b = buf;
+	e = b + rest;
+
+	rest = len - (rest * sizeof(*b));
+
+	for (p = 0; p < CONFIG_SEED_BUCKETS; p++)
+		s[p] = seed * prime[p];
+
+	for (; b != e; b += CONFIG_SEED_BUCKETS) {
+		for (p = 0; p < CONFIG_SEED_BUCKETS; ++p) {
+			b[p] = s[p];
+			s[p] = __hash_u64(s[p]);
+		}
+	}
+
+	__fill_random_buf_small(b, rest, s[0]);
+}
+
 uint64_t fill_random_buf(struct frand_state *fs, void *buf,
 			 unsigned int len)
 {

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-10 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-10 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit de31fe9ab3dd6115cd0d5c77354f67f06595570d:

  testing: add test for slat + clat = tlat (2022-08-07 12:27:55 -0400)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6cafe8445fd1e04e5f7d67bbc73029a538d1b253:

  Fio 3.31 (2022-08-09 14:41:25 -0600)

----------------------------------------------------------------
Jens Axboe (2):
      Merge branch 'master' of ssh://git.kernel.dk/data/git/fio
      Fio 3.31

Vincent Fu (2):
      ci: upload tagged AppVeyor installers as GitHub releases
      ci: drop master branch requirement for AppVeyor releases

 .appveyor.yml   | 12 ++++++++++++
 FIO-VERSION-GEN |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/.appveyor.yml b/.appveyor.yml
index b94eefe3..92301ca9 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -50,5 +50,17 @@ after_build:
 test_script:
   - python.exe t/run-fio-tests.py --artifact-root test-artifacts --debug
 
+deploy:
+  - provider: GitHub
+    description: fio Windows installer
+    auth_token:                      # encrypted token from GitHub
+      secure: Tjj+xRQEV25P6dQgboUblTCKx/LtUOUav2bvzSCtwMhHMAxrrn2adod6nlTf0ItV
+    artifact: fio.msi                # upload installer to release assets
+    draft: false
+    prerelease: false
+    on:
+      APPVEYOR_REPO_TAG: true        # deploy on tag push only
+      DISTRO: cygwin
+
 on_finish:
   - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && [ -d test-artifacts ] && 7z a -t7z test-artifacts.7z test-artifacts -xr!foo.0.0 -xr!latency.?.0 -xr!fio_jsonplus_clat2csv.test && appveyor PushArtifact test-artifacts.7z'
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index fa64f50f..72630dd0 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.30
+DEF_VER=fio-3.31
 
 LF='
 '

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-08 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-08 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit c08f9533042e909d4b4b12fdb8d14f1bc8e23dff:

  filesetup: use correct random seed for non-uniform distributions (2022-08-03 16:18:53 -0400)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to de31fe9ab3dd6115cd0d5c77354f67f06595570d:

  testing: add test for slat + clat = tlat (2022-08-07 12:27:55 -0400)

----------------------------------------------------------------
Vincent Fu (3):
      testing: add test for slat + clat = tlat
      engines/null: add FIO_ASYNCIO_SETS_ISSUE_TIME flag
      testing: add test for slat + clat = tlat

 engines/null.c            |  2 ++
 t/jobs/t0015-e78980ff.fio |  7 +++++++
 t/jobs/t0016-259ebc00.fio |  7 +++++++
 t/run-fio-tests.py        | 41 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+)
 create mode 100644 t/jobs/t0015-e78980ff.fio
 create mode 100644 t/jobs/t0016-259ebc00.fio

---

Diff of recent changes:

diff --git a/engines/null.c b/engines/null.c
index 2df56718..68759c26 100644
--- a/engines/null.c
+++ b/engines/null.c
@@ -113,9 +113,11 @@ static struct null_data *null_init(struct thread_data *td)
 	if (td->o.iodepth != 1) {
 		nd->io_us = (struct io_u **) malloc(td->o.iodepth * sizeof(struct io_u *));
 		memset(nd->io_us, 0, td->o.iodepth * sizeof(struct io_u *));
+		td->io_ops->flags |= FIO_ASYNCIO_SETS_ISSUE_TIME;
 	} else
 		td->io_ops->flags |= FIO_SYNCIO;
 
+	td_set_ioengine_flags(td);
 	return nd;
 }
 
diff --git a/t/jobs/t0015-e78980ff.fio b/t/jobs/t0015-e78980ff.fio
new file mode 100644
index 00000000..c650c0b2
--- /dev/null
+++ b/t/jobs/t0015-e78980ff.fio
@@ -0,0 +1,7 @@
+# Expected result: mean(slat) + mean(clat) = mean(lat)
+# Buggy result: equality does not hold
+
+[test]
+ioengine=libaio
+size=1M
+iodepth=16
diff --git a/t/jobs/t0016-259ebc00.fio b/t/jobs/t0016-259ebc00.fio
new file mode 100644
index 00000000..1b418e7c
--- /dev/null
+++ b/t/jobs/t0016-259ebc00.fio
@@ -0,0 +1,7 @@
+# Expected result: mean(slat) + mean(clat) = mean(lat)
+# Buggy result: equality does not hold
+
+[test]
+ioengine=null
+size=1M
+iodepth=16
diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py
index 32cdbc19..d77f20e0 100755
--- a/t/run-fio-tests.py
+++ b/t/run-fio-tests.py
@@ -527,6 +527,27 @@ class FioJobTest_t0014(FioJobTest):
             return
 
 
+class FioJobTest_t0015(FioJobTest):
+    """Test consists of fio test jobs t0015 and t0016
+    Confirm that mean(slat) + mean(clat) = mean(tlat)"""
+
+    def check_result(self):
+        super(FioJobTest_t0015, self).check_result()
+
+        if not self.passed:
+            return
+
+        slat = self.json_data['jobs'][0]['read']['slat_ns']['mean']
+        clat = self.json_data['jobs'][0]['read']['clat_ns']['mean']
+        tlat = self.json_data['jobs'][0]['read']['lat_ns']['mean']
+        logging.debug('Test %d: slat %f, clat %f, tlat %f', self.testnum, slat, clat, tlat)
+
+        if abs(slat + clat - tlat) > 1:
+            self.failure_reason = "{0} slat {1} + clat {2} = {3} != tlat {4},".format(
+                self.failure_reason, slat, clat, slat+clat, tlat)
+            self.passed = False
+
+
 class FioJobTest_iops_rate(FioJobTest):
     """Test consists of fio test job t0009
     Confirm that job0 iops == 1000
@@ -816,6 +837,26 @@ TEST_LIST = [
         'output_format':    'json',
         'requirements':     [],
     },
+    {
+        'test_id':          15,
+        'test_class':       FioJobTest_t0015,
+        'job':              't0015-e78980ff.fio',
+        'success':          SUCCESS_DEFAULT,
+        'pre_job':          None,
+        'pre_success':      None,
+        'output_format':    'json',
+        'requirements':     [Requirements.linux, Requirements.libaio],
+    },
+    {
+        'test_id':          16,
+        'test_class':       FioJobTest_t0015,
+        'job':              't0016-259ebc00.fio',
+        'success':          SUCCESS_DEFAULT,
+        'pre_job':          None,
+        'pre_success':      None,
+        'output_format':    'json',
+        'requirements':     [],
+    },
     {
         'test_id':          1000,
         'test_class':       FioExeTest,

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-04 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-04 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 7006d70c7c8b9a39cf3dfdd839d1975295c10527:

  Merge branch 'io_uring-numa' (2022-08-02 10:20:31 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to c08f9533042e909d4b4b12fdb8d14f1bc8e23dff:

  filesetup: use correct random seed for non-uniform distributions (2022-08-03 16:18:53 -0400)

----------------------------------------------------------------
Vincent Fu (3):
      examples: fix ioengine in zbd-rand-write.fio
      engines/null: fill issue_time during commit
      filesetup: use correct random seed for non-uniform distributions

 engines/null.c              | 19 +++++++++++++++++++
 examples/zbd-rand-write.fio |  2 +-
 filesetup.c                 |  2 +-
 3 files changed, 21 insertions(+), 2 deletions(-)

---

Diff of recent changes:

diff --git a/engines/null.c b/engines/null.c
index 8dcd1b21..2df56718 100644
--- a/engines/null.c
+++ b/engines/null.c
@@ -44,9 +44,28 @@ static int null_getevents(struct null_data *nd, unsigned int min_events,
 	return ret;
 }
 
+static void null_queued(struct thread_data *td, struct null_data *nd)
+{
+	struct timespec now;
+
+	if (!fio_fill_issue_time(td))
+		return;
+
+	fio_gettime(&now, NULL);
+
+	for (int i = 0; i < nd->queued; i++) {
+		struct io_u *io_u = nd->io_us[i];
+
+		memcpy(&io_u->issue_time, &now, sizeof(now));
+		io_u_queued(td, io_u);
+	}
+}
+
 static int null_commit(struct thread_data *td, struct null_data *nd)
 {
 	if (!nd->events) {
+		null_queued(td, nd);
+
 #ifndef FIO_EXTERNAL_ENGINE
 		io_u_mark_submit(td, nd->queued);
 #endif
diff --git a/examples/zbd-rand-write.fio b/examples/zbd-rand-write.fio
index 46cddd06..9494a583 100644
--- a/examples/zbd-rand-write.fio
+++ b/examples/zbd-rand-write.fio
@@ -1,4 +1,4 @@
-; Using the libaio ioengine, random write to a (zoned) block device,
+; Using the psync ioengine, random write to a (zoned) block device,
 ; writing at most 32 zones at a time. Target zones are chosen randomly
 ; and writes directed at the write pointer of the chosen zones
 
diff --git a/filesetup.c b/filesetup.c
index e0592209..3e2ccf9b 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -1495,7 +1495,7 @@ static void __init_rand_distribution(struct thread_data *td, struct fio_file *f)
 
 	seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number;
 	if (!td->o.rand_repeatable)
-		seed = td->rand_seeds[4];
+		seed = td->rand_seeds[FIO_RAND_BLOCK_OFF];
 
 	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
 		zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, td->o.random_center.u.f, seed);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-03 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-03 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 55037c4839c65612fa388ae937e63661d8192ed9:

  t/io_uring: switch to GiB/sec if numbers get large (2022-07-31 12:06:12 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 7006d70c7c8b9a39cf3dfdd839d1975295c10527:

  Merge branch 'io_uring-numa' (2022-08-02 10:20:31 -0600)

----------------------------------------------------------------
Jens Axboe (2):
      t/io_uring: support NUMA placement
      Merge branch 'io_uring-numa'

 t/io_uring.c | 446 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 252 insertions(+), 194 deletions(-)

---

Diff of recent changes:

diff --git a/t/io_uring.c b/t/io_uring.c
index 335a06ed..35bf1956 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -11,6 +11,10 @@
 #include <libaio.h>
 #endif
 
+#ifdef CONFIG_LIBNUMA
+#include <numa.h>
+#endif
+
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/ioctl.h>
@@ -100,6 +104,9 @@ struct submitter {
 	io_context_t aio_ctx;
 #endif
 
+	int numa_node;
+	const char *filename;
+
 	struct file files[MAX_FDS];
 	unsigned nr_files;
 	unsigned cur_file;
@@ -110,6 +117,7 @@ static struct submitter *submitter;
 static volatile int finish;
 static int stats_running;
 static unsigned long max_iops;
+static long page_size;
 
 static int depth = DEPTH;
 static int batch_submit = BATCH_SUBMIT;
@@ -130,6 +138,7 @@ static int runtime = 0;		/* runtime */
 static int random_io = 1;	/* random or sequential IO */
 static int register_ring = 1;	/* register ring */
 static int use_sync = 0;	/* use preadv2 */
+static int numa_placement = 0;	/* set to node of device */
 
 static unsigned long tsc_rate;
 
@@ -611,12 +620,191 @@ static int reap_events_uring(struct submitter *s)
 	return reaped;
 }
 
+static void set_affinity(struct submitter *s)
+{
+#ifdef CONFIG_LIBNUMA
+	struct bitmask *mask;
+
+	if (s->numa_node == -1)
+		return;
+
+	numa_set_preferred(s->numa_node);
+
+	mask = numa_allocate_cpumask();
+	numa_node_to_cpus(s->numa_node, mask);
+	numa_sched_setaffinity(s->tid, mask);
+#endif
+}
+
+static int detect_node(struct submitter *s, const char *name)
+{
+#ifdef CONFIG_LIBNUMA
+	const char *base = basename(name);
+	char str[128];
+	int ret, fd, node;
+
+	sprintf(str, "/sys/block/%s/device/numa_node", base);
+	fd = open(str, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	ret = read(fd, str, sizeof(str));
+	if (ret < 0) {
+		close(fd);
+		return -1;
+	}
+	node = atoi(str);
+	s->numa_node = node;
+	close(fd);
+#else
+	s->numa_node = -1;
+#endif
+	return 0;
+}
+
+static int setup_aio(struct submitter *s)
+{
+#ifdef CONFIG_LIBAIO
+	if (polled) {
+		fprintf(stderr, "aio does not support polled IO\n");
+		polled = 0;
+	}
+	if (sq_thread_poll) {
+		fprintf(stderr, "aio does not support SQPOLL IO\n");
+		sq_thread_poll = 0;
+	}
+	if (do_nop) {
+		fprintf(stderr, "aio does not support polled IO\n");
+		do_nop = 0;
+	}
+	if (fixedbufs || register_files) {
+		fprintf(stderr, "aio does not support registered files or buffers\n");
+		fixedbufs = register_files = 0;
+	}
+
+	return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
+#else
+	fprintf(stderr, "Legacy AIO not available on this system/build\n");
+	errno = EINVAL;
+	return -1;
+#endif
+}
+
+static int setup_ring(struct submitter *s)
+{
+	struct io_sq_ring *sring = &s->sq_ring;
+	struct io_cq_ring *cring = &s->cq_ring;
+	struct io_uring_params p;
+	int ret, fd;
+	void *ptr;
+
+	memset(&p, 0, sizeof(p));
+
+	if (polled && !do_nop)
+		p.flags |= IORING_SETUP_IOPOLL;
+	if (sq_thread_poll) {
+		p.flags |= IORING_SETUP_SQPOLL;
+		if (sq_thread_cpu != -1) {
+			p.flags |= IORING_SETUP_SQ_AFF;
+			p.sq_thread_cpu = sq_thread_cpu;
+		}
+	}
+
+	fd = io_uring_setup(depth, &p);
+	if (fd < 0) {
+		perror("io_uring_setup");
+		return 1;
+	}
+	s->ring_fd = s->enter_ring_fd = fd;
+
+	io_uring_probe(fd);
+
+	if (fixedbufs) {
+		struct rlimit rlim;
+
+		rlim.rlim_cur = RLIM_INFINITY;
+		rlim.rlim_max = RLIM_INFINITY;
+		/* ignore potential error, not needed on newer kernels */
+		setrlimit(RLIMIT_MEMLOCK, &rlim);
+
+		ret = io_uring_register_buffers(s);
+		if (ret < 0) {
+			perror("io_uring_register_buffers");
+			return 1;
+		}
+
+		if (dma_map) {
+			ret = io_uring_map_buffers(s);
+			if (ret < 0) {
+				perror("io_uring_map_buffers");
+				return 1;
+			}
+		}
+	}
+
+	if (register_files) {
+		ret = io_uring_register_files(s);
+		if (ret < 0) {
+			perror("io_uring_register_files");
+			return 1;
+		}
+	}
+
+	ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+			IORING_OFF_SQ_RING);
+	sring->head = ptr + p.sq_off.head;
+	sring->tail = ptr + p.sq_off.tail;
+	sring->ring_mask = ptr + p.sq_off.ring_mask;
+	sring->ring_entries = ptr + p.sq_off.ring_entries;
+	sring->flags = ptr + p.sq_off.flags;
+	sring->array = ptr + p.sq_off.array;
+	sq_ring_mask = *sring->ring_mask;
+
+	s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+			IORING_OFF_SQES);
+
+	ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
+			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
+			IORING_OFF_CQ_RING);
+	cring->head = ptr + p.cq_off.head;
+	cring->tail = ptr + p.cq_off.tail;
+	cring->ring_mask = ptr + p.cq_off.ring_mask;
+	cring->ring_entries = ptr + p.cq_off.ring_entries;
+	cring->cqes = ptr + p.cq_off.cqes;
+	cq_ring_mask = *cring->ring_mask;
+	return 0;
+}
+
+static void *allocate_mem(struct submitter *s, int size)
+{
+	void *buf;
+
+#ifdef CONFIG_LIBNUMA
+	if (s->numa_node != -1)
+		return numa_alloc_onnode(size, s->numa_node);
+#endif
+
+	if (posix_memalign(&buf, page_size, bs)) {
+		printf("failed alloc\n");
+		return NULL;
+	}
+
+	return buf;
+}
+
 static int submitter_init(struct submitter *s)
 {
-	int i, nr_batch;
+	int i, nr_batch, err;
+	static int init_printed;
+	char buf[80];
 
 	s->tid = gettid();
-	printf("submitter=%d, tid=%d\n", s->index, s->tid);
+	printf("submitter=%d, tid=%d, file=%s, node=%d\n", s->index, s->tid,
+							s->filename, s->numa_node);
+
+	set_affinity(s);
 
 	__init_rand64(&s->rand_state, pthread_self());
 	srand48(pthread_self());
@@ -624,6 +812,37 @@ static int submitter_init(struct submitter *s)
 	for (i = 0; i < MAX_FDS; i++)
 		s->files[i].fileno = i;
 
+	for (i = 0; i < roundup_pow2(depth); i++) {
+		void *buf;
+
+		buf = allocate_mem(s, bs);
+		if (!buf)
+			return 1;
+		s->iovecs[i].iov_base = buf;
+		s->iovecs[i].iov_len = bs;
+	}
+
+	if (use_sync) {
+		sprintf(buf, "Engine=preadv2\n");
+		err = 0;
+	} else if (!aio) {
+		err = setup_ring(s);
+		sprintf(buf, "Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+	} else {
+		sprintf(buf, "Engine=aio\n");
+		err = setup_aio(s);
+	}
+	if (err) {
+		printf("queue setup failed: %s, %d\n", strerror(errno), err);
+		return 1;
+	}
+
+	if (!init_printed) {
+		printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
+		printf("%s", buf);
+		init_printed = 1;
+	}
+
 	if (stats) {
 		nr_batch = roundup_pow2(depth / batch_submit);
 		if (nr_batch < 2)
@@ -1026,15 +1245,21 @@ static struct submitter *get_submitter(int offset)
 static void do_finish(const char *reason)
 {
 	int j;
+
 	printf("Exiting on %s\n", reason);
 	for (j = 0; j < nthreads; j++) {
 		struct submitter *s = get_submitter(j);
 		s->finish = 1;
 	}
-	if (max_iops > 100000)
-		printf("Maximum IOPS=%luK\n", max_iops / 1000);
-	else if (max_iops)
+	if (max_iops > 1000000) {
+		double miops = (double) max_iops / 1000000.0;
+		printf("Maximum IOPS=%.2fM\n", miops);
+	} else if (max_iops > 100000) {
+		double kiops = (double) max_iops / 1000.0;
+		printf("Maximum IOPS=%.2fK\n", kiops);
+	} else {
 		printf("Maximum IOPS=%lu\n", max_iops);
+	}
 	finish = 1;
 }
 
@@ -1058,144 +1283,6 @@ static void arm_sig_int(void)
 #endif
 }
 
-static int setup_aio(struct submitter *s)
-{
-#ifdef CONFIG_LIBAIO
-	if (polled) {
-		fprintf(stderr, "aio does not support polled IO\n");
-		polled = 0;
-	}
-	if (sq_thread_poll) {
-		fprintf(stderr, "aio does not support SQPOLL IO\n");
-		sq_thread_poll = 0;
-	}
-	if (do_nop) {
-		fprintf(stderr, "aio does not support polled IO\n");
-		do_nop = 0;
-	}
-	if (fixedbufs || register_files) {
-		fprintf(stderr, "aio does not support registered files or buffers\n");
-		fixedbufs = register_files = 0;
-	}
-
-	return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
-#else
-	fprintf(stderr, "Legacy AIO not available on this system/build\n");
-	errno = EINVAL;
-	return -1;
-#endif
-}
-
-static int setup_ring(struct submitter *s)
-{
-	struct io_sq_ring *sring = &s->sq_ring;
-	struct io_cq_ring *cring = &s->cq_ring;
-	struct io_uring_params p;
-	int ret, fd;
-	void *ptr;
-
-	memset(&p, 0, sizeof(p));
-
-	if (polled && !do_nop)
-		p.flags |= IORING_SETUP_IOPOLL;
-	if (sq_thread_poll) {
-		p.flags |= IORING_SETUP_SQPOLL;
-		if (sq_thread_cpu != -1) {
-			p.flags |= IORING_SETUP_SQ_AFF;
-			p.sq_thread_cpu = sq_thread_cpu;
-		}
-	}
-
-	fd = io_uring_setup(depth, &p);
-	if (fd < 0) {
-		perror("io_uring_setup");
-		return 1;
-	}
-	s->ring_fd = s->enter_ring_fd = fd;
-
-	io_uring_probe(fd);
-
-	if (fixedbufs) {
-		struct rlimit rlim;
-
-		rlim.rlim_cur = RLIM_INFINITY;
-		rlim.rlim_max = RLIM_INFINITY;
-		/* ignore potential error, not needed on newer kernels */
-		setrlimit(RLIMIT_MEMLOCK, &rlim);
-
-		ret = io_uring_register_buffers(s);
-		if (ret < 0) {
-			perror("io_uring_register_buffers");
-			return 1;
-		}
-
-		if (dma_map) {
-			ret = io_uring_map_buffers(s);
-			if (ret < 0) {
-				perror("io_uring_map_buffers");
-				return 1;
-			}
-		}
-	}
-
-	if (register_files) {
-		ret = io_uring_register_files(s);
-		if (ret < 0) {
-			perror("io_uring_register_files");
-			return 1;
-		}
-	}
-
-	ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
-			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
-			IORING_OFF_SQ_RING);
-	sring->head = ptr + p.sq_off.head;
-	sring->tail = ptr + p.sq_off.tail;
-	sring->ring_mask = ptr + p.sq_off.ring_mask;
-	sring->ring_entries = ptr + p.sq_off.ring_entries;
-	sring->flags = ptr + p.sq_off.flags;
-	sring->array = ptr + p.sq_off.array;
-	sq_ring_mask = *sring->ring_mask;
-
-	s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
-			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
-			IORING_OFF_SQES);
-
-	ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
-			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
-			IORING_OFF_CQ_RING);
-	cring->head = ptr + p.cq_off.head;
-	cring->tail = ptr + p.cq_off.tail;
-	cring->ring_mask = ptr + p.cq_off.ring_mask;
-	cring->ring_entries = ptr + p.cq_off.ring_entries;
-	cring->cqes = ptr + p.cq_off.cqes;
-	cq_ring_mask = *cring->ring_mask;
-	return 0;
-}
-
-static void file_depths(char *buf)
-{
-	bool prev = false;
-	char *p;
-	int i, j;
-
-	buf[0] = '\0';
-	p = buf;
-	for (j = 0; j < nthreads; j++) {
-		struct submitter *s = get_submitter(j);
-
-		for (i = 0; i < s->nr_files; i++) {
-			struct file *f = &s->files[i];
-
-			if (prev)
-				p += sprintf(p, " %d", f->pending_ios);
-			else
-				p += sprintf(p, "%d", f->pending_ios);
-			prev = true;
-		}
-	}
-}
-
 static void usage(char *argv, int status)
 {
 	char runtime_str[16];
@@ -1218,11 +1305,12 @@ static void usage(char *argv, int status)
 		" -R <bool> : Use random IO, default %d\n"
 		" -a <bool> : Use legacy aio, default %d\n"
 		" -S <bool> : Use sync IO (preadv2), default %d\n"
-		" -X <bool> : Use registered ring %d\n",
+		" -X <bool> : Use registered ring %d\n"
+		" -P <bool> : Automatically place on device home node %d\n",
 		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
 		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
 		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio,
-		use_sync, register_ring);
+		use_sync, register_ring, numa_placement);
 	exit(status);
 }
 
@@ -1274,16 +1362,14 @@ int main(int argc, char *argv[])
 {
 	struct submitter *s;
 	unsigned long done, calls, reap;
-	int err, i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
-	long page_size;
+	int i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
 	struct file f;
-	char *fdepths;
 	void *ret;
 
 	if (!do_nop && argc < 2)
 		usage(argv[0], 1);
 
-	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:h?")) != -1) {
+	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:h?")) != -1) {
 		switch (opt) {
 		case 'a':
 			aio = !!atoi(optarg);
@@ -1361,6 +1447,9 @@ int main(int argc, char *argv[])
 			exit(1);
 #endif
 			break;
+		case 'P':
+			numa_placement = !!atoi(optarg);
+			break;
 		case 'h':
 		case '?':
 		default:
@@ -1383,6 +1472,7 @@ int main(int argc, char *argv[])
 				roundup_pow2(depth) * sizeof(struct iovec));
 	for (j = 0; j < nthreads; j++) {
 		s = get_submitter(j);
+		s->numa_node = -1;
 		s->index = j;
 		s->done = s->calls = s->reaps = 0;
 	}
@@ -1440,7 +1530,10 @@ int main(int argc, char *argv[])
 
 			memcpy(&s->files[s->nr_files], &f, sizeof(f));
 
-			printf("Added file %s (submitter %d)\n", argv[i], s->index);
+			if (numa_placement)
+				detect_node(s, argv[i]);
+
+			s->filename = argv[i];
 			s->nr_files++;
 		}
 		threads_rem--;
@@ -1454,43 +1547,6 @@ int main(int argc, char *argv[])
 	if (page_size < 0)
 		page_size = 4096;
 
-	for (j = 0; j < nthreads; j++) {
-		s = get_submitter(j);
-		for (i = 0; i < roundup_pow2(depth); i++) {
-			void *buf;
-
-			if (posix_memalign(&buf, page_size, bs)) {
-				printf("failed alloc\n");
-				return 1;
-			}
-			s->iovecs[i].iov_base = buf;
-			s->iovecs[i].iov_len = bs;
-		}
-	}
-
-	for (j = 0; j < nthreads; j++) {
-		s = get_submitter(j);
-
-		if (use_sync)
-			continue;
-		else if (!aio)
-			err = setup_ring(s);
-		else
-			err = setup_aio(s);
-		if (err) {
-			printf("ring setup failed: %s, %d\n", strerror(errno), err);
-			return 1;
-		}
-	}
-	s = get_submitter(0);
-	printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
-	if (use_sync)
-		printf("Engine=preadv2\n");
-	else if (!aio)
-		printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
-	else
-		printf("Engine=aio\n");
-
 	for (j = 0; j < nthreads; j++) {
 		s = get_submitter(j);
 		if (use_sync)
@@ -1503,7 +1559,6 @@ int main(int argc, char *argv[])
 #endif
 	}
 
-	fdepths = malloc(8 * s->nr_files * nthreads);
 	reap = calls = done = 0;
 	do {
 		unsigned long this_done = 0;
@@ -1535,16 +1590,20 @@ int main(int argc, char *argv[])
 			ipc = (this_reap - reap) / (this_call - calls);
 		} else
 			rpc = ipc = -1;
-		file_depths(fdepths);
 		iops = this_done - done;
 		if (bs > 1048576)
 			bw = iops * (bs / 1048576);
 		else
 			bw = iops / (1048576 / bs);
-		if (iops > 100000)
-			printf("IOPS=%luK, ", iops / 1000);
-		else
+		if (iops > 1000000) {
+			double miops = (double) iops / 1000000.0;
+			printf("IOPS=%.2fM, ", miops);
+		} else if (iops > 100000) {
+			double kiops = (double) iops / 1000.0;
+			printf("IOPS=%.2fK, ", kiops);
+		} else {
 			printf("IOPS=%lu, ", iops);
+		}
 		max_iops = max(max_iops, iops);
 		if (!do_nop) {
 			if (bw > 2000) {
@@ -1555,7 +1614,7 @@ int main(int argc, char *argv[])
 				printf("BW=%luMiB/s, ", bw);
 			}
 		}
-		printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths);
+		printf("IOS/call=%ld/%ld\n", rpc, ipc);
 		done = this_done;
 		calls = this_call;
 		reap = this_reap;
@@ -1578,7 +1637,6 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	free(fdepths);
 	free(submitter);
 	return 0;
 }

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-08-01 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-08-01 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 3e1d3f2fc4a5f09174f0d6d70d036285d69f17c2:

  .github: add pull request template (2022-07-28 11:00:04 -0400)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 55037c4839c65612fa388ae937e63661d8192ed9:

  t/io_uring: switch to GiB/sec if numbers get large (2022-07-31 12:06:12 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      t/io_uring: switch to GiB/sec if numbers get large

 t/io_uring.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

---

Diff of recent changes:

diff --git a/t/io_uring.c b/t/io_uring.c
index 10035912..335a06ed 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -1546,8 +1546,15 @@ int main(int argc, char *argv[])
 		else
 			printf("IOPS=%lu, ", iops);
 		max_iops = max(max_iops, iops);
-		if (!do_nop)
-			printf("BW=%luMiB/s, ", bw);
+		if (!do_nop) {
+			if (bw > 2000) {
+				double bw_g = (double) bw / 1000.0;
+
+				printf("BW=%.2fGiB/s, ", bw_g);
+			} else {
+				printf("BW=%luMiB/s, ", bw);
+			}
+		}
 		printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths);
 		done = this_done;
 		calls = this_call;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-29 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-29 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 5b99196735a245224ec9321f796a9da30654ae6c:

  README: add maintainer section (2022-07-27 21:04:31 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 3e1d3f2fc4a5f09174f0d6d70d036285d69f17c2:

  .github: add pull request template (2022-07-28 11:00:04 -0400)

----------------------------------------------------------------
Vincent Fu (1):
      .github: add pull request template

 .github/PULL_REQUEST_TEMPLATE.md | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 .github/PULL_REQUEST_TEMPLATE.md

---

Diff of recent changes:

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 00000000..4d98a694
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,8 @@
+Please confirm that your commit message(s) follow these guidelines:
+
+1. First line is a commit title, a descriptive one-liner for the change
+2. Empty second line
+3. Commit message body that explains why the change is useful. Break lines that
+   aren't something like a URL at 72-74 chars.
+4. Empty line
+5. Signed-off-by: Real Name <real@email.com>

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-28 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-28 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit dff32ddb97f2257975b6047474d665a5de7f7bbc:

  ci: install libnfs for linux and macos builds (2022-07-22 15:57:27 -0400)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 5b99196735a245224ec9321f796a9da30654ae6c:

  README: add maintainer section (2022-07-27 21:04:31 -0600)

----------------------------------------------------------------
Chris Weber (1):
      Fix multithread issues when operating on a single shared file

Jens Axboe (3):
      Merge branch 'proposed_fix' of https://github.com/weberc-ntap/fio
      Minor style fixups
      README: add maintainer section

 README.rst  | 11 +++++++++++
 backend.c   | 19 ++++++++++++++++++-
 file.h      |  1 +
 filesetup.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 73 insertions(+), 3 deletions(-)

---

Diff of recent changes:

diff --git a/README.rst b/README.rst
index 4d736eaf..67420903 100644
--- a/README.rst
+++ b/README.rst
@@ -81,6 +81,17 @@ benchmark/test tools out there weren't flexible enough to do what he wanted.
 Jens Axboe <axboe@kernel.dk> 20060905
 
 
+Maintainers
+-----------
+
+Fio is maintained by Jens Axboe <axboe@kernel.dk> and
+Vincent Fu <vincentfu@gmail.com> - however, for reporting bugs please use
+the fio reflector or the GitHub page rather than email any of them
+directly. By using the public resources, others will be able to learn from
+the responses too. Chances are also good that other members will be able to
+help with your inquiry as well.
+
+
 Binary packages
 ---------------
 
diff --git a/backend.c b/backend.c
index e5bb4e25..5159b60d 100644
--- a/backend.c
+++ b/backend.c
@@ -2314,8 +2314,25 @@ static void run_threads(struct sk_out *sk_out)
 	for_each_td(td, i) {
 		print_status_init(td->thread_number - 1);
 
-		if (!td->o.create_serialize)
+		if (!td->o.create_serialize) {
+			/*
+			 *  When operating on a single file in parallel,
+			 *  perform single-threaded early setup so that
+			 *  setup_files() does not run into issues
+			 *  later.
+			*/
+			if (!i && td->o.nr_files == 1) {
+				if (setup_shared_file(td)) {
+					exit_value++;
+					if (td->error)
+						log_err("fio: pid=%d, err=%d/%s\n",
+							(int) td->pid, td->error, td->verror);
+					td_set_runstate(td, TD_REAPED);
+					todo--;
+				}
+			}
 			continue;
+		}
 
 		if (fio_verify_load_state(td))
 			goto reap;
diff --git a/file.h b/file.h
index da1b8947..e646cf22 100644
--- a/file.h
+++ b/file.h
@@ -201,6 +201,7 @@ struct thread_data;
 extern void close_files(struct thread_data *);
 extern void close_and_free_files(struct thread_data *);
 extern uint64_t get_start_offset(struct thread_data *, struct fio_file *);
+extern int __must_check setup_shared_file(struct thread_data *);
 extern int __must_check setup_files(struct thread_data *);
 extern int __must_check file_invalidate_cache(struct thread_data *, struct fio_file *);
 #ifdef __cplusplus
diff --git a/filesetup.c b/filesetup.c
index ab6c488b..e0592209 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -143,7 +143,7 @@ static int extend_file(struct thread_data *td, struct fio_file *f)
 	if (unlink_file || new_layout) {
 		int ret;
 
-		dprint(FD_FILE, "layout unlink %s\n", f->file_name);
+		dprint(FD_FILE, "layout %d unlink %d %s\n", new_layout, unlink_file, f->file_name);
 
 		ret = td_io_unlink_file(td, f);
 		if (ret != 0 && ret != ENOENT) {
@@ -198,6 +198,9 @@ static int extend_file(struct thread_data *td, struct fio_file *f)
 		}
 	}
 
+
+	dprint(FD_FILE, "fill file %s, size %llu\n", f->file_name, (unsigned long long) f->real_file_size);
+
 	left = f->real_file_size;
 	bs = td->o.max_bs[DDIR_WRITE];
 	if (bs > left)
@@ -1078,6 +1081,44 @@ static bool create_work_dirs(struct thread_data *td, const char *fname)
 	return true;
 }
 
+int setup_shared_file(struct thread_data *td)
+{
+	struct fio_file *f;
+	uint64_t file_size;
+	int err = 0;
+
+	if (td->o.nr_files > 1) {
+		log_err("fio: shared file setup called for multiple files\n");
+		return -1;
+	}
+
+	get_file_sizes(td);
+
+	f = td->files[0];
+
+	if (f == NULL) {
+		log_err("fio: NULL shared file\n");
+		return -1;
+	}
+
+	file_size = thread_number * td->o.size;
+	dprint(FD_FILE, "shared setup %s real_file_size=%llu, desired=%llu\n", 
+			f->file_name, (unsigned long long)f->real_file_size, (unsigned long long)file_size);
+
+	if (f->real_file_size < file_size) {
+		dprint(FD_FILE, "fio: extending shared file\n");
+		f->real_file_size = file_size;
+		err = extend_file(td, f);
+		if (!err)
+			err = __file_invalidate_cache(td, f, 0, f->real_file_size);
+		get_file_sizes(td);
+		dprint(FD_FILE, "shared setup new real_file_size=%llu\n", 
+				(unsigned long long)f->real_file_size);
+	}
+
+	return err;
+}
+
 /*
  * Open the files and setup files sizes, creating files if necessary.
  */
@@ -1092,7 +1133,7 @@ int setup_files(struct thread_data *td)
 	const unsigned long long bs = td_min_bs(td);
 	uint64_t fs = 0;
 
-	dprint(FD_FILE, "setup files\n");
+	dprint(FD_FILE, "setup files (thread_number=%d, subjob_number=%d)\n", td->thread_number, td->subjob_number);
 
 	old_state = td_bump_runstate(td, TD_SETTING_UP);
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-23 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-23 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 02a36caa69f5675f7144fbeddb7a32e1d35ce0c7:

  docs: clarify write_iolog description (2022-07-21 15:18:18 -0400)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to dff32ddb97f2257975b6047474d665a5de7f7bbc:

  ci: install libnfs for linux and macos builds (2022-07-22 15:57:27 -0400)

----------------------------------------------------------------
Vincent Fu (3):
      configure: cleanups for nfs ioengine
      engines/nfs: remove commit hook
      ci: install libnfs for linux and macos builds

 ci/actions-install.sh |  3 ++-
 configure             | 16 +++++++---------
 engines/nfs.c         |  9 ---------
 options.c             |  2 +-
 4 files changed, 10 insertions(+), 20 deletions(-)

---

Diff of recent changes:

diff --git a/ci/actions-install.sh b/ci/actions-install.sh
index ff514926..b5c4198f 100755
--- a/ci/actions-install.sh
+++ b/ci/actions-install.sh
@@ -26,6 +26,7 @@ DPKGCFG
         libibverbs-dev
         libnuma-dev
         librdmacm-dev
+	libnfs-dev
         valgrind
     )
     case "${CI_TARGET_ARCH}" in
@@ -78,7 +79,7 @@ install_macos() {
     #echo "Updating homebrew..."
     #brew update >/dev/null 2>&1
     echo "Installing packages..."
-    HOMEBREW_NO_AUTO_UPDATE=1 brew install cunit
+    HOMEBREW_NO_AUTO_UPDATE=1 brew install cunit libnfs
     pip3 install scipy six sphinx
 }
 
diff --git a/configure b/configure
index 7965f0b0..36450df8 100755
--- a/configure
+++ b/configure
@@ -170,7 +170,7 @@ disable_native="no"
 march_set="no"
 libiscsi="no"
 libnbd="no"
-libnfs="no"
+libnfs=""
 xnvme=""
 libzbc=""
 dfs=""
@@ -245,6 +245,8 @@ for opt do
   ;;
   --disable-tcmalloc) disable_tcmalloc="yes"
   ;;
+  --disable-libnfs) libnfs="no"
+  ;;
   --enable-libnfs) libnfs="yes"
   ;;
   --dynamic-libengines) dynamic_engines="yes"
@@ -282,6 +284,7 @@ if test "$show_help" = "yes" ; then
   echo "--disable-gfapi         Disable gfapi"
   echo "--enable-libhdfs        Enable hdfs support"
   echo "--enable-libnfs         Enable nfs support"
+  echo "--disable-libnfs        Disable nfs support"
   echo "--disable-lex           Disable use of lex/yacc for math"
   echo "--disable-pmem          Disable pmem based engines even if found"
   echo "--enable-lex            Enable use of lex/yacc for math"
@@ -2313,15 +2316,14 @@ print_config "DAOS File System (dfs) Engine" "$dfs"
 
 ##########################################
 # Check if we have libnfs (for userspace nfs support).
-if test "$libnfs" = "yes" ; then
+if test "$libnfs" != "no" ; then
   if $(pkg-config libnfs > /dev/null 2>&1); then
     libnfs="yes"
     libnfs_cflags=$(pkg-config --cflags libnfs)
-    # libnfs_libs=$(pkg-config --libs libnfs)
-    libnfs_libs=/usr/local/lib/libnfs.a
+    libnfs_libs=$(pkg-config --libs libnfs)
   else
     if test "$libnfs" = "yes" ; then
-      echo "libnfs" "Install libnfs"
+      feature_not_found "libnfs" "libnfs"
     fi
     libnfs="no"
   fi
@@ -3190,9 +3192,6 @@ fi
 if test "$dfs" = "yes" ; then
   output_sym "CONFIG_DFS"
 fi
-if test "$libnfs" = "yes" ; then
-  output_sym "CONFIG_NFS"
-fi
 if test "$march_set" = "no" && test "$build_native" = "yes" ; then
   output_sym "CONFIG_BUILD_NATIVE"
 fi
@@ -3234,7 +3233,6 @@ if test "$libnbd" = "yes" ; then
 fi
 if test "$libnfs" = "yes" ; then
   output_sym "CONFIG_LIBNFS"
-  echo "CONFIG_LIBNFS=m" >> $config_host_mak
   echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak
   echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak
 fi
diff --git a/engines/nfs.c b/engines/nfs.c
index 21be8833..7031769d 100644
--- a/engines/nfs.c
+++ b/engines/nfs.c
@@ -279,14 +279,6 @@ static int fio_libnfs_close(struct thread_data *td, struct fio_file *f)
 	return ret;
 }
 
-/*
- * Hook for writing out outstanding data.
- */
-static int fio_libnfs_commit(struct thread_data *td) {
-	nfs_event_loop(td, true);
-	return 0;
-}
-
 struct ioengine_ops ioengine = {
 	.name		= "nfs",
 	.version	= FIO_IOOPS_VERSION,
@@ -297,7 +289,6 @@ struct ioengine_ops ioengine = {
 	.cleanup	= fio_libnfs_cleanup,
 	.open_file	= fio_libnfs_open,
 	.close_file	= fio_libnfs_close,
-	.commit     = fio_libnfs_commit,
 	.flags      = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
 	.options	= options,
 	.option_struct_size	= sizeof(struct fio_libnfs_options),
diff --git a/options.c b/options.c
index 2b183c60..5d3daedf 100644
--- a/options.c
+++ b/options.c
@@ -2140,7 +2140,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 			    .help = "DAOS File System (dfs) IO engine",
 			  },
 #endif
-#ifdef CONFIG_NFS
+#ifdef CONFIG_LIBNFS
 			  { .ival = "nfs",
 			    .help = "NFS IO engine",
 			  },

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-22 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-22 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 9c1c1a8d6a4f30eba9595da951d18db1685c03d8:

  engines/http: silence openssl 3.0 deprecation warnings (2022-07-19 13:21:19 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 02a36caa69f5675f7144fbeddb7a32e1d35ce0c7:

  docs: clarify write_iolog description (2022-07-21 15:18:18 -0400)

----------------------------------------------------------------
Vincent Fu (1):
      docs: clarify write_iolog description

 HOWTO.rst | 3 ++-
 fio.1     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 470777e2..104cce2d 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -3049,7 +3049,8 @@ I/O replay
 
 	Write the issued I/O patterns to the specified file. See
 	:option:`read_iolog`.  Specify a separate file for each job, otherwise the
-	iologs will be interspersed and the file may be corrupt.
+        iologs will be interspersed and the file may be corrupt. This file will
+        be opened in append mode.
 
 .. option:: read_iolog=str
 
diff --git a/fio.1 b/fio.1
index 948c01f9..ce9bf3ef 100644
--- a/fio.1
+++ b/fio.1
@@ -2793,7 +2793,8 @@ of milliseconds. Defaults to 1000.
 .BI write_iolog \fR=\fPstr
 Write the issued I/O patterns to the specified file. See
 \fBread_iolog\fR. Specify a separate file for each job, otherwise the
-iologs will be interspersed and the file may be corrupt.
+iologs will be interspersed and the file may be corrupt. This file will be
+opened in append mode.
 .TP
 .BI read_iolog \fR=\fPstr
 Open an iolog with the specified filename and replay the I/O patterns it

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-20 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-20 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit d6225c1550827077c0c0f9e1b8816b4f35cd5304:

  Update README.rst to specify secure protocols where possible (2022-07-11 07:53:29 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 9c1c1a8d6a4f30eba9595da951d18db1685c03d8:

  engines/http: silence openssl 3.0 deprecation warnings (2022-07-19 13:21:19 -0600)

----------------------------------------------------------------
Giuseppe Baccini (1):
      Fixed misplaced goto in http.c

Jens Axboe (1):
      engines/http: silence openssl 3.0 deprecation warnings

Vincent Fu (1):
      Merge branch 'giubacc-misplaced-goto'

 engines/http.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/engines/http.c b/engines/http.c
index 696febe1..1de9e66c 100644
--- a/engines/http.c
+++ b/engines/http.c
@@ -29,6 +29,10 @@
 #include "fio.h"
 #include "../optgroup.h"
 
+/*
+ * Silence OpenSSL 3.0 deprecated function warnings
+ */
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 enum {
 	FIO_HTTP_WEBDAV	    = 0,
@@ -526,8 +530,8 @@ static enum fio_q_status fio_http_queue(struct thread_data *td,
 			if (status == 100 || (status >= 200 && status <= 204))
 				goto out;
 			log_err("DDIR_WRITE failed with HTTP status code %ld\n", status);
-			goto err;
 		}
+		goto err;
 	} else if (io_u->ddir == DDIR_READ) {
 		curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL);
 		curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, &_curl_stream);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-12 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-12 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 30568e0ed9366a810dfcf90a903ecfbff1a6196c:

  Merge branch 'client-hist-le64' of https://github.com/tuan-hoang1/fio (2022-07-07 06:33:25 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to d6225c1550827077c0c0f9e1b8816b4f35cd5304:

  Update README.rst to specify secure protocols where possible (2022-07-11 07:53:29 -0600)

----------------------------------------------------------------
Rebecca Cran (1):
      Update README.rst to specify secure protocols where possible

 README.rst | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

---

Diff of recent changes:

diff --git a/README.rst b/README.rst
index 527f33ab..4d736eaf 100644
--- a/README.rst
+++ b/README.rst
@@ -27,31 +27,20 @@ Source
 
 Fio resides in a git repo, the canonical place is:
 
-	git://git.kernel.dk/fio.git
-
-When inside a corporate firewall, git:// URL sometimes does not work.
-If git:// does not work, use the http protocol instead:
-
-	http://git.kernel.dk/fio.git
+	https://git.kernel.dk/cgit/fio/
 
 Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git
 meta data as well. Other tarballs are archives of official fio releases.
 Snapshots can download from:
 
-	http://brick.kernel.dk/snaps/
+	https://brick.kernel.dk/snaps/
 
 There are also two official mirrors. Both of these are automatically synced with
 the main repository, when changes are pushed. If the main repo is down for some
 reason, either one of these is safe to use as a backup:
 
-	git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
-
 	https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
 
-or
-
-	git://github.com/axboe/fio.git
-
 	https://github.com/axboe/fio.git
 
 
@@ -70,7 +59,7 @@ email to majordomo@vger.kernel.org with
 
 in the body of the email. Archives can be found here:
 
-	http://www.spinics.net/lists/fio/
+	https://www.spinics.net/lists/fio/
 
 or here:
 
@@ -97,12 +86,12 @@ Binary packages
 
 Debian:
 	Starting with Debian "Squeeze", fio packages are part of the official
-	Debian repository. http://packages.debian.org/search?keywords=fio .
+	Debian repository. https://packages.debian.org/search?keywords=fio .
 
 Ubuntu:
 	Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
 	of the Ubuntu "universe" repository.
-	http://packages.ubuntu.com/search?keywords=fio .
+	https://packages.ubuntu.com/search?keywords=fio .
 
 Red Hat, Fedora, CentOS & Co:
 	Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
@@ -176,7 +165,7 @@ directory.
 
 How to compile fio on 64-bit Windows:
 
- 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
+ 1. Install Cygwin (https://www.cygwin.com/). Install **make** and all
     packages starting with **mingw64-x86_64**. Ensure
     **mingw64-x86_64-zlib** are installed if you wish
     to enable fio's log compression functionality.
@@ -205,8 +194,8 @@ browser to :file:`./doc/output/html/index.html`.  To build manual page run
 ``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
 output formats are supported run ``make -C doc help``.
 
-.. _reStructuredText: http://www.sphinx-doc.org/rest.html
-.. _Sphinx: http://www.sphinx-doc.org
+.. _reStructuredText: https://www.sphinx-doc.org/rest.html
+.. _Sphinx: https://www.sphinx-doc.org
 
 
 Platforms

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-08 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-08 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 1f43cc2e7b2f3ac7461f8ea66bb9b32cb03075c3:

  Merge branch 'server-hist-le64' of https://github.com/tuan-hoang1/fio (2022-07-06 16:38:07 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 30568e0ed9366a810dfcf90a903ecfbff1a6196c:

  Merge branch 'client-hist-le64' of https://github.com/tuan-hoang1/fio (2022-07-07 06:33:25 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'client-hist-le64' of https://github.com/tuan-hoang1/fio

Tuan Hoang (1):
      client: only do le64_to_cpu() on io_sample_data member if iolog is histogram

 client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/client.c b/client.c
index 605a3ce5..37da74bc 100644
--- a/client.c
+++ b/client.c
@@ -1702,7 +1702,8 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
 			s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i);
 
 		s->time		= le64_to_cpu(s->time);
-		s->data.val	= le64_to_cpu(s->data.val);
+		if (ret->log_type != IO_LOG_TYPE_HIST)
+			s->data.val	= le64_to_cpu(s->data.val);
 		s->__ddir	= __le32_to_cpu(s->__ddir);
 		s->bs		= le64_to_cpu(s->bs);
 		s->priority	= le16_to_cpu(s->priority);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-07 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-07 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 1eb5ca76ee17ff80dd06a0c2d22498ab720ec76f:

  configure: revert NFS configure change (2022-07-05 07:19:39 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 1f43cc2e7b2f3ac7461f8ea66bb9b32cb03075c3:

  Merge branch 'server-hist-le64' of https://github.com/tuan-hoang1/fio (2022-07-06 16:38:07 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'server-hist-le64' of https://github.com/tuan-hoang1/fio

Tuan Hoang (1):
      server: only do cpu_to_le64() on io_sample_data member if iolog is histogram

 server.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/server.c b/server.c
index 4c71bd44..b453be5f 100644
--- a/server.c
+++ b/server.c
@@ -2284,7 +2284,8 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
 			struct io_sample *s = get_sample(log, cur_log, i);
 
 			s->time		= cpu_to_le64(s->time);
-			s->data.val	= cpu_to_le64(s->data.val);
+			if (log->log_type != IO_LOG_TYPE_HIST)
+				s->data.val	= cpu_to_le64(s->data.val);
 			s->__ddir	= __cpu_to_le32(s->__ddir);
 			s->bs		= cpu_to_le64(s->bs);
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-06 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-06 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit dc4729e3ef6a9116d7cd30e96e4f5863883e5bd7:

  hash: cleanups (2022-07-01 15:03:39 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 1eb5ca76ee17ff80dd06a0c2d22498ab720ec76f:

  configure: revert NFS configure change (2022-07-05 07:19:39 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      configure: revert NFS configure change

 configure | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index 04a1d0e2..7965f0b0 100755
--- a/configure
+++ b/configure
@@ -245,7 +245,7 @@ for opt do
   ;;
   --disable-tcmalloc) disable_tcmalloc="yes"
   ;;
-  --disable-nfs) disable_nfs="yes"
+  --enable-libnfs) libnfs="yes"
   ;;
   --dynamic-libengines) dynamic_engines="yes"
   ;;
@@ -279,7 +279,6 @@ if test "$show_help" = "yes" ; then
   echo "--disable-rados         Disable Rados support even if found"
   echo "--disable-rbd           Disable Rados Block Device even if found"
   echo "--disable-http          Disable HTTP support even if found"
-  echo "--disable-nfs           Disable userspace NFS support even if found"
   echo "--disable-gfapi         Disable gfapi"
   echo "--enable-libhdfs        Enable hdfs support"
   echo "--enable-libnfs         Enable nfs support"
@@ -2314,15 +2313,17 @@ print_config "DAOS File System (dfs) Engine" "$dfs"
 
 ##########################################
 # Check if we have libnfs (for userspace nfs support).
-if test "$disable_nfs" != "yes"; then
+if test "$libnfs" = "yes" ; then
   if $(pkg-config libnfs > /dev/null 2>&1); then
     libnfs="yes"
     libnfs_cflags=$(pkg-config --cflags libnfs)
-    libnfs_libs=$(pkg-config --libs libnfs)
+    # libnfs_libs=$(pkg-config --libs libnfs)
+    libnfs_libs=/usr/local/lib/libnfs.a
   else
     if test "$libnfs" = "yes" ; then
       echo "libnfs" "Install libnfs"
     fi
+    libnfs="no"
   fi
 fi
 print_config "NFS engine" "$libnfs"

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-07-02 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-07-02 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 660879102e32a0ed3d3225afaebcc0d46625a4a6:

  Merge branch 'master' of https://github.com/bvanassche/fio (2022-06-23 08:20:22 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to dc4729e3ef6a9116d7cd30e96e4f5863883e5bd7:

  hash: cleanups (2022-07-01 15:03:39 -0600)

----------------------------------------------------------------
Georg Sauthoff (1):
      Simplify and optimize __fill_random_buf

Jens Axboe (3):
      Merge branch 'fill-random-smaller' of https://github.com/gsauthof/fio
      lib/rand: improve __fill_random_buf()
      hash: cleanups

 engines/rdma.c |  2 +-
 hash.h         | 26 --------------------------
 lib/rand.c     | 30 +++++++++---------------------
 3 files changed, 10 insertions(+), 48 deletions(-)

---

Diff of recent changes:

diff --git a/engines/rdma.c b/engines/rdma.c
index e3bb2567..fcb41068 100644
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -1389,7 +1389,7 @@ static int fio_rdmaio_setup(struct thread_data *td)
 		rd = malloc(sizeof(*rd));
 
 		memset(rd, 0, sizeof(*rd));
-		init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0);
+		init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_64, 0);
 		td->io_ops_data = rd;
 	}
 
diff --git a/hash.h b/hash.h
index f7596a56..51f0706e 100644
--- a/hash.h
+++ b/hash.h
@@ -9,32 +9,6 @@
    (C) 2002 William Lee Irwin III, IBM */
 
 /*
- * Knuth recommends primes in approximately golden ratio to the maximum
- * integer representable by a machine word for multiplicative hashing.
- * Chuck Lever verified the effectiveness of this technique:
- * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
- *
- * These primes are chosen to be bit-sparse, that is operations on
- * them can use shifts and additions instead of multiplications for
- * machines where multiplications are slow.
- */
-
-#if BITS_PER_LONG == 32
-/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define GOLDEN_RATIO_PRIME 0x9e370001UL
-#elif BITS_PER_LONG == 64
-/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
-#else
-#error Define GOLDEN_RATIO_PRIME for your wordsize.
-#endif
-
-/*
- * The above primes are actively bad for hashing, since they are
- * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
- * real problems. Besides, the "prime" part is pointless for the
- * multiplicative hash.
- *
  * Although a random odd number will do, it turns out that the golden
  * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
  * properties.
diff --git a/lib/rand.c b/lib/rand.c
index 6e893e80..1e669116 100644
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -97,29 +97,17 @@ void init_rand_seed(struct frand_state *state, uint64_t seed, bool use64)
 
 void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
 {
-	void *ptr = buf;
+	uint64_t *b = buf;
+	uint64_t *e = b  + len / sizeof(*b);
+	unsigned int rest = len % sizeof(*b);
 
-	while (len) {
-		int this_len;
-
-		if (len >= sizeof(int64_t)) {
-			*((int64_t *) ptr) = seed;
-			this_len = sizeof(int64_t);
-		} else if (len >= sizeof(int32_t)) {
-			*((int32_t *) ptr) = seed;
-			this_len = sizeof(int32_t);
-		} else if (len >= sizeof(int16_t)) {
-			*((int16_t *) ptr) = seed;
-			this_len = sizeof(int16_t);
-		} else {
-			*((int8_t *) ptr) = seed;
-			this_len = sizeof(int8_t);
-		}
-		ptr += this_len;
-		len -= this_len;
-		seed *= GOLDEN_RATIO_PRIME;
-		seed >>= 3;
+	for (; b != e; ++b) {
+		*b = seed;
+		seed = __hash_u64(seed);
 	}
+
+	if (fio_unlikely(rest))
+		__builtin_memcpy(e, &seed, rest);
 }
 
 uint64_t fill_random_buf(struct frand_state *fs, void *buf,

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-24 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-24 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6aaebfbe7269f95164ac83a04505869f96f5f83a:

  configure: add option to disable xnvme build (2022-06-22 11:45:32 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 660879102e32a0ed3d3225afaebcc0d46625a4a6:

  Merge branch 'master' of https://github.com/bvanassche/fio (2022-06-23 08:20:22 -0600)

----------------------------------------------------------------
Bart Van Assche (2):
      ci/travis-*: Fix shellcheck warnings
      ci: Verify the Android build

Jens Axboe (1):
      Merge branch 'master' of https://github.com/bvanassche/fio

 .github/workflows/ci.yml     |  5 +++++
 ci/actions-build.sh          | 19 +++++++++++++++++--
 ci/actions-full-test.sh      |  2 ++
 ci/actions-install.sh        |  7 +++++++
 ci/actions-smoke-test.sh     |  2 ++
 ci/travis-install-librpma.sh |  6 +++---
 ci/travis-install-pmdk.sh    |  9 +++++----
 7 files changed, 41 insertions(+), 9 deletions(-)

---

Diff of recent changes:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cd8ce142..650366b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,6 +15,7 @@ jobs:
         - linux-clang
         - macos
         - linux-i686-gcc
+        - android
         include:
         - build: linux-gcc
           os: ubuntu-20.04
@@ -27,8 +28,12 @@ jobs:
         - build: linux-i686-gcc
           os: ubuntu-20.04
           arch: i686
+        - build: android
+          os: ubuntu-20.04
+          arch: aarch64-linux-android32
 
     env:
+      CI_TARGET_BUILD: ${{ matrix.build }}
       CI_TARGET_ARCH: ${{ matrix.arch }}
       CC: ${{ matrix.cc }}
 
diff --git a/ci/actions-build.sh b/ci/actions-build.sh
index 74a6fdcb..2b3de8e3 100755
--- a/ci/actions-build.sh
+++ b/ci/actions-build.sh
@@ -11,8 +11,23 @@ main() {
     local configure_flags=()
 
     set_ci_target_os
-    case "${CI_TARGET_OS}" in
-        "linux")
+    case "${CI_TARGET_BUILD}/${CI_TARGET_OS}" in
+        android/*)
+            export UNAME=Android
+            if [ -z "${CI_TARGET_ARCH}" ]; then
+                echo "Error: CI_TARGET_ARCH has not been set"
+                return 1
+            fi
+            NDK=$PWD/android-ndk-r24/toolchains/llvm/prebuilt/linux-x86_64/bin
+            export PATH="${NDK}:${PATH}"
+            export LIBS="-landroid"
+            CC=${NDK}/${CI_TARGET_ARCH}-clang
+            if [ ! -e "${CC}" ]; then
+                echo "Error: could not find ${CC}"
+                return 1
+            fi
+            ;;
+        */linux)
             case "${CI_TARGET_ARCH}" in
                 "i686")
                     extra_cflags="${extra_cflags} -m32"
diff --git a/ci/actions-full-test.sh b/ci/actions-full-test.sh
index 8282002f..d1675f6e 100755
--- a/ci/actions-full-test.sh
+++ b/ci/actions-full-test.sh
@@ -3,6 +3,8 @@
 set -eu
 
 main() {
+    [ "${CI_TARGET_BUILD}" = android ] && return 0
+
     echo "Running long running tests..."
     export PYTHONUNBUFFERED="TRUE"
     if [[ "${CI_TARGET_ARCH}" == "arm64" ]]; then
diff --git a/ci/actions-install.sh b/ci/actions-install.sh
index 0e472717..ff514926 100755
--- a/ci/actions-install.sh
+++ b/ci/actions-install.sh
@@ -83,6 +83,13 @@ install_macos() {
 }
 
 main() {
+    if [ "${CI_TARGET_BUILD}" = "android" ]; then
+	echo "Installing Android NDK..."
+	wget --quiet https://dl.google.com/android/repository/android-ndk-r24-linux.zip
+	unzip -q android-ndk-r24-linux.zip
+	return 0
+    fi
+
     set_ci_target_os
 
     install_function="install_${CI_TARGET_OS}"
diff --git a/ci/actions-smoke-test.sh b/ci/actions-smoke-test.sh
index c129c89f..3196f6a1 100755
--- a/ci/actions-smoke-test.sh
+++ b/ci/actions-smoke-test.sh
@@ -3,6 +3,8 @@
 set -eu
 
 main() {
+    [ "${CI_TARGET_BUILD}" = "android" ] && return 0
+
     echo "Running smoke tests..."
     make test
 }
diff --git a/ci/travis-install-librpma.sh b/ci/travis-install-librpma.sh
index b127f3f5..4e5ed21d 100755
--- a/ci/travis-install-librpma.sh
+++ b/ci/travis-install-librpma.sh
@@ -16,7 +16,7 @@ cmake .. -DCMAKE_BUILD_TYPE=Release \
 	-DBUILD_DOC=OFF \
 	-DBUILD_EXAMPLES=OFF \
 	-DBUILD_TESTS=OFF
-make -j$(nproc)
-sudo make -j$(nproc) install
-cd $WORKDIR
+make -j"$(nproc)"
+sudo make -j"$(nproc)" install
+cd "$WORKDIR"
 rm -rf $ZIP_FILE rpma-${LIBRPMA_VERSION}
diff --git a/ci/travis-install-pmdk.sh b/ci/travis-install-pmdk.sh
index 3b0b5bbc..7bde9fd0 100755
--- a/ci/travis-install-pmdk.sh
+++ b/ci/travis-install-pmdk.sh
@@ -12,7 +12,8 @@ WORKDIR=$(pwd)
 #    /bin/sh: 1: clang: not found
 # if CC is not set to the full path of clang.
 #
-export CC=$(type -P $CC)
+CC=$(type -P "$CC")
+export CC
 
 # Install PMDK libraries, because PMDK's libpmem
 # is a dependency of the librpma fio engine.
@@ -22,7 +23,7 @@ export CC=$(type -P $CC)
 wget https://github.com/pmem/pmdk/releases/download/${PMDK_VERSION}/pmdk-${PMDK_VERSION}.tar.gz
 tar -xzf pmdk-${PMDK_VERSION}.tar.gz
 cd pmdk-${PMDK_VERSION}
-make -j$(nproc) NDCTL_ENABLE=n
-sudo make -j$(nproc) install prefix=/usr NDCTL_ENABLE=n
-cd $WORKDIR
+make -j"$(nproc)" NDCTL_ENABLE=n
+sudo make -j"$(nproc)" install prefix=/usr NDCTL_ENABLE=n
+cd "$WORKDIR"
 rm -rf pmdk-${PMDK_VERSION}

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-23 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-23 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit d4bf5e6193b97c5e5490fdb93b069d149a38777c:

  gettime: fix whitespace damage (2022-06-19 12:04:19 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6aaebfbe7269f95164ac83a04505869f96f5f83a:

  configure: add option to disable xnvme build (2022-06-22 11:45:32 -0600)

----------------------------------------------------------------
Ankit Kumar (1):
      configure: add option to disable xnvme build

 configure | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index 510af424..04a1d0e2 100755
--- a/configure
+++ b/configure
@@ -171,7 +171,7 @@ march_set="no"
 libiscsi="no"
 libnbd="no"
 libnfs="no"
-xnvme="no"
+xnvme=""
 libzbc=""
 dfs=""
 dynamic_engines="no"
@@ -241,7 +241,7 @@ for opt do
   ;;
   --disable-libzbc) libzbc="no"
   ;;
-  --enable-xnvme) xnvme="yes"
+  --disable-xnvme) xnvme="no"
   ;;
   --disable-tcmalloc) disable_tcmalloc="yes"
   ;;
@@ -294,7 +294,7 @@ if test "$show_help" = "yes" ; then
   echo "--with-ime=             Install path for DDN's Infinite Memory Engine"
   echo "--enable-libiscsi       Enable iscsi support"
   echo "--enable-libnbd         Enable libnbd (NBD engine) support"
-  echo "--enable-xnvme          Enable xnvme support"
+  echo "--disable-xnvme         Disable xnvme support even if found"
   echo "--disable-libzbc        Disable libzbc even if found"
   echo "--disable-tcmalloc      Disable tcmalloc support"
   echo "--dynamic-libengines    Lib-based ioengines as dynamic libraries"
@@ -2619,7 +2619,7 @@ fi
 
 ##########################################
 # Check if we have xnvme
-if test "$xnvme" != "yes" ; then
+if test "$xnvme" != "no" ; then
   if check_min_lib_version xnvme 0.2.0; then
     xnvme="yes"
     xnvme_cflags=$(pkg-config --cflags xnvme)

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-20 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-20 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit e4d384755e4831cf5bbaa97e0c5b79a3598efbc4:

  Merge branch 'master' of https://github.com/useche/fio (2022-06-15 18:38:41 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to d4bf5e6193b97c5e5490fdb93b069d149a38777c:

  gettime: fix whitespace damage (2022-06-19 12:04:19 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      gettime: fix whitespace damage

 gettime.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

---

Diff of recent changes:

diff --git a/gettime.c b/gettime.c
index 099e9d9f..14462420 100644
--- a/gettime.c
+++ b/gettime.c
@@ -431,22 +431,22 @@ void fio_clock_init(void)
 
 uint64_t ntime_since(const struct timespec *s, const struct timespec *e)
 {
-       int64_t sec, nsec;
+	int64_t sec, nsec;
 
-       sec = e->tv_sec - s->tv_sec;
-       nsec = e->tv_nsec - s->tv_nsec;
-       if (sec > 0 && nsec < 0) {
-	       sec--;
-	       nsec += 1000000000LL;
-       }
+	sec = e->tv_sec - s->tv_sec;
+	nsec = e->tv_nsec - s->tv_nsec;
+	if (sec > 0 && nsec < 0) {
+		sec--;
+		nsec += 1000000000LL;
+	}
 
        /*
 	* time warp bug on some kernels?
 	*/
-       if (sec < 0 || (sec == 0 && nsec < 0))
-	       return 0;
+	if (sec < 0 || (sec == 0 && nsec < 0))
+		return 0;
 
-       return nsec + (sec * 1000000000LL);
+	return nsec + (sec * 1000000000LL);
 }
 
 uint64_t ntime_since_now(const struct timespec *s)

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-16 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-16 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit b5f3adf9e1e40c7bdb76a9e433aa580f7eead740:

  Merge branch 'master' of https://github.com/bvanassche/fio (2022-06-13 18:14:26 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to e4d384755e4831cf5bbaa97e0c5b79a3598efbc4:

  Merge branch 'master' of https://github.com/useche/fio (2022-06-15 18:38:41 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'master' of https://github.com/useche/fio

Luis Useche (1):
      Init file_cache to invalid (maj, min)

Vincent Fu (5):
      ioengines: add helper for trims with async ioengines
      ioengines: don't record issue_time if ioengines already do it
      HOWTO: improve description of latency measures
      ioengines: update last_issue if we set issue_time
      ioengines: clean up latency accounting for 3 ioengines

 HOWTO.rst               | 29 ++++++++++++++++++-----------
 blktrace.c              |  5 ++++-
 engines/io_uring.c      | 13 +++++++++++--
 engines/libaio.c        |  9 ++++++++-
 engines/librpma_apm.c   |  2 +-
 engines/librpma_fio.c   |  9 ++++++++-
 engines/librpma_gpspm.c |  2 +-
 engines/rdma.c          |  9 ++++++++-
 ioengines.c             | 44 ++++++++++++++++++++++++++------------------
 ioengines.h             |  2 ++
 10 files changed, 87 insertions(+), 37 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 28ac2b7c..470777e2 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -4165,24 +4165,31 @@ writes in the example above).  In the order listed, they denote:
 **slat**
 		Submission latency (**min** being the minimum, **max** being the
 		maximum, **avg** being the average, **stdev** being the standard
-		deviation).  This is the time it took to submit the I/O.  For
-		sync I/O this row is not displayed as the slat is really the
-		completion latency (since queue/complete is one operation there).
-		This value can be in nanoseconds, microseconds or milliseconds ---
-		fio will choose the most appropriate base and print that (in the
-		example above nanoseconds was the best scale).  Note: in :option:`--minimal` mode
-		latencies are always expressed in microseconds.
+                deviation).  This is the time from when fio initialized the I/O
+                to submission.  For synchronous ioengines this includes the time
+                up until just before the ioengine's queue function is called.
+                For asynchronous ioengines this includes the time up through the
+                completion of the ioengine's queue function (and commit function
+                if it is defined). For sync I/O this row is not displayed as the
+                slat is negligible.  This value can be in nanoseconds,
+                microseconds or milliseconds --- fio will choose the most
+                appropriate base and print that (in the example above
+                nanoseconds was the best scale).  Note: in :option:`--minimal`
+                mode latencies are always expressed in microseconds.
 
 **clat**
 		Completion latency. Same names as slat, this denotes the time from
-		submission to completion of the I/O pieces. For sync I/O, clat will
-		usually be equal (or very close) to 0, as the time from submit to
-		complete is basically just CPU time (I/O has already been done, see slat
-		explanation).
+                submission to completion of the I/O pieces. For sync I/O, this
+                represents the time from when the I/O was submitted to the
+                operating system to when it was completed. For asynchronous
+                ioengines this is the time from when the ioengine's queue (and
+                commit if available) functions were completed to when the I/O's
+                completion was reaped by fio.
 
 **lat**
 		Total latency. Same names as slat and clat, this denotes the time from
 		when fio created the I/O unit to completion of the I/O operation.
+                It is the sum of submission and completion latency.
 
 **bw**
 		Bandwidth statistics based on samples. Same names as the xlat stats,
diff --git a/blktrace.c b/blktrace.c
index 619121c7..00e5f9a9 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -442,7 +442,10 @@ err:
 bool read_blktrace(struct thread_data* td)
 {
 	struct blk_io_trace t;
-	struct file_cache cache = { };
+	struct file_cache cache = {
+		.maj = ~0U,
+		.min = ~0U,
+	};
 	unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
 	unsigned long long rw_bs[DDIR_RWDIR_CNT] = { };
 	unsigned long skipped_writes;
diff --git a/engines/io_uring.c b/engines/io_uring.c
index cceafe69..cffc7371 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -608,6 +608,12 @@ static void fio_ioring_queued(struct thread_data *td, int start, int nr)
 
 		start++;
 	}
+
+	/*
+	 * only used for iolog
+	 */
+	if (td->o.read_iolog_file)
+		memcpy(&td->last_issue, &now, sizeof(now));
 }
 
 static int fio_ioring_commit(struct thread_data *td)
@@ -1191,7 +1197,8 @@ static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
 static struct ioengine_ops ioengine_uring = {
 	.name			= "io_uring",
 	.version		= FIO_IOOPS_VERSION,
-	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD,
+	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD |
+					FIO_ASYNCIO_SETS_ISSUE_TIME,
 	.init			= fio_ioring_init,
 	.post_init		= fio_ioring_post_init,
 	.io_u_init		= fio_ioring_io_u_init,
@@ -1211,7 +1218,9 @@ static struct ioengine_ops ioengine_uring = {
 static struct ioengine_ops ioengine_uring_cmd = {
 	.name			= "io_uring_cmd",
 	.version		= FIO_IOOPS_VERSION,
-	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO,
+	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD |
+					FIO_MEMALIGN | FIO_RAWIO |
+					FIO_ASYNCIO_SETS_ISSUE_TIME,
 	.init			= fio_ioring_init,
 	.post_init		= fio_ioring_cmd_post_init,
 	.io_u_init		= fio_ioring_io_u_init,
diff --git a/engines/libaio.c b/engines/libaio.c
index 9c278d06..33b8c12f 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -368,6 +368,12 @@ static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
 		memcpy(&io_u->issue_time, &now, sizeof(now));
 		io_u_queued(td, io_u);
 	}
+
+	/*
+	 * only used for iolog
+	 */
+	if (td->o.read_iolog_file)
+		memcpy(&td->last_issue, &now, sizeof(now));
 }
 
 static int fio_libaio_commit(struct thread_data *td)
@@ -511,7 +517,8 @@ static int fio_libaio_init(struct thread_data *td)
 FIO_STATIC struct ioengine_ops ioengine = {
 	.name			= "libaio",
 	.version		= FIO_IOOPS_VERSION,
-	.flags			= FIO_ASYNCIO_SYNC_TRIM,
+	.flags			= FIO_ASYNCIO_SYNC_TRIM |
+					FIO_ASYNCIO_SETS_ISSUE_TIME,
 	.init			= fio_libaio_init,
 	.post_init		= fio_libaio_post_init,
 	.prep			= fio_libaio_prep,
diff --git a/engines/librpma_apm.c b/engines/librpma_apm.c
index d1166ad8..896240dd 100644
--- a/engines/librpma_apm.c
+++ b/engines/librpma_apm.c
@@ -208,7 +208,7 @@ FIO_STATIC struct ioengine_ops ioengine_client = {
 	.errdetails		= librpma_fio_client_errdetails,
 	.close_file		= librpma_fio_file_nop,
 	.cleanup		= client_cleanup,
-	.flags			= FIO_DISKLESSIO,
+	.flags			= FIO_DISKLESSIO | FIO_ASYNCIO_SETS_ISSUE_TIME,
 	.options		= librpma_fio_options,
 	.option_struct_size	= sizeof(struct librpma_fio_options_values),
 };
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c
index 34818904..a78a1e57 100644
--- a/engines/librpma_fio.c
+++ b/engines/librpma_fio.c
@@ -621,9 +621,16 @@ int librpma_fio_client_commit(struct thread_data *td)
 		}
 	}
 
-	if ((fill_time = fio_fill_issue_time(td)))
+	if ((fill_time = fio_fill_issue_time(td))) {
 		fio_gettime(&now, NULL);
 
+		/*
+		 * only used for iolog
+		 */
+		if (td->o.read_iolog_file)
+			memcpy(&td->last_issue, &now, sizeof(now));
+
+	}
 	/* move executed io_us from queued[] to flight[] */
 	for (i = 0; i < ccd->io_u_queued_nr; i++) {
 		struct io_u *io_u = ccd->io_us_queued[i];
diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c
index 5cf97472..f00717a7 100644
--- a/engines/librpma_gpspm.c
+++ b/engines/librpma_gpspm.c
@@ -352,7 +352,7 @@ FIO_STATIC struct ioengine_ops ioengine_client = {
 	.errdetails		= librpma_fio_client_errdetails,
 	.close_file		= librpma_fio_file_nop,
 	.cleanup		= client_cleanup,
-	.flags			= FIO_DISKLESSIO,
+	.flags			= FIO_DISKLESSIO | FIO_ASYNCIO_SETS_ISSUE_TIME,
 	.options		= librpma_fio_options,
 	.option_struct_size	= sizeof(struct librpma_fio_options_values),
 };
diff --git a/engines/rdma.c b/engines/rdma.c
index 4eb86652..e3bb2567 100644
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -832,6 +832,12 @@ static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us,
 		memcpy(&io_u->issue_time, &now, sizeof(now));
 		io_u_queued(td, io_u);
 	}
+
+	/*
+	 * only used for iolog
+	 */
+	if (td->o.read_iolog_file)
+		memcpy(&td->last_issue, &now, sizeof(now));
 }
 
 static int fio_rdmaio_commit(struct thread_data *td)
@@ -1404,7 +1410,8 @@ FIO_STATIC struct ioengine_ops ioengine = {
 	.cleanup		= fio_rdmaio_cleanup,
 	.open_file		= fio_rdmaio_open_file,
 	.close_file		= fio_rdmaio_close_file,
-	.flags			= FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
+	.flags			= FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO |
+					FIO_ASYNCIO_SETS_ISSUE_TIME,
 	.options		= options,
 	.option_struct_size	= sizeof(struct rdmaio_options),
 };
diff --git a/ioengines.c b/ioengines.c
index 68f307e5..e2316ee4 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -24,6 +24,13 @@
 
 static FLIST_HEAD(engine_list);
 
+static inline bool async_ioengine_sync_trim(struct thread_data *td,
+					    struct io_u	*io_u)
+{
+	return td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) &&
+		io_u->ddir == DDIR_TRIM;
+}
+
 static bool check_engine_ops(struct thread_data *td, struct ioengine_ops *ops)
 {
 	if (ops->version != FIO_IOOPS_VERSION) {
@@ -350,17 +357,17 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
 	io_u->resid = 0;
 
 	if (td_ioengine_flagged(td, FIO_SYNCIO) ||
-		(td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) && 
-		io_u->ddir == DDIR_TRIM)) {
-		if (fio_fill_issue_time(td))
+		async_ioengine_sync_trim(td, io_u)) {
+		if (fio_fill_issue_time(td)) {
 			fio_gettime(&io_u->issue_time, NULL);
 
-		/*
-		 * only used for iolog
-		 */
-		if (td->o.read_iolog_file)
-			memcpy(&td->last_issue, &io_u->issue_time,
-					sizeof(io_u->issue_time));
+			/*
+			 * only used for iolog
+			 */
+			if (td->o.read_iolog_file)
+				memcpy(&td->last_issue, &io_u->issue_time,
+						sizeof(io_u->issue_time));
+		}
 	}
 
 
@@ -435,17 +442,18 @@ enum fio_q_status td_io_queue(struct thread_data *td, struct io_u *io_u)
 	}
 
 	if (!td_ioengine_flagged(td, FIO_SYNCIO) &&
-		(!td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) ||
-		 io_u->ddir != DDIR_TRIM)) {
-		if (fio_fill_issue_time(td))
+		!async_ioengine_sync_trim(td, io_u)) {
+		if (fio_fill_issue_time(td) &&
+			!td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME)) {
 			fio_gettime(&io_u->issue_time, NULL);
 
-		/*
-		 * only used for iolog
-		 */
-		if (td->o.read_iolog_file)
-			memcpy(&td->last_issue, &io_u->issue_time,
-					sizeof(io_u->issue_time));
+			/*
+			 * only used for iolog
+			 */
+			if (td->o.read_iolog_file)
+				memcpy(&td->last_issue, &io_u->issue_time,
+						sizeof(io_u->issue_time));
+		}
 	}
 
 	return ret;
diff --git a/ioengines.h b/ioengines.h
index acdb0071..fafa1e48 100644
--- a/ioengines.h
+++ b/ioengines.h
@@ -83,6 +83,8 @@ enum fio_ioengine_flags {
 	FIO_ASYNCIO_SYNC_TRIM
 			= 1 << 14,	/* io engine has async ->queue except for trim */
 	FIO_NO_OFFLOAD	= 1 << 15,	/* no async offload */
+	FIO_ASYNCIO_SETS_ISSUE_TIME
+			= 1 << 16,	/* async ioengine with commit function that sets issue_time */
 };
 
 /*

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-14 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-14 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 26faead0f3c6e7608b89a51373f1455b91377fcb:

  t/zbd: skip test case #13 when max_open_zones is too small (2022-06-02 03:58:31 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to b5f3adf9e1e40c7bdb76a9e433aa580f7eead740:

  Merge branch 'master' of https://github.com/bvanassche/fio (2022-06-13 18:14:26 -0600)

----------------------------------------------------------------
Bart Van Assche (2):
      configure: Support gcc 12
      configure: Fix libzbc detection on SUSE Linux

Jens Axboe (1):
      Merge branch 'master' of https://github.com/bvanassche/fio

 configure | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index 8182322b..510af424 100755
--- a/configure
+++ b/configure
@@ -1128,7 +1128,8 @@ cat > $TMPC << EOF
 #include <sched.h>
 int main(int argc, char **argv)
 {
-  cpu_set_t mask;
+  cpu_set_t mask = { };
+
   return sched_setaffinity(0, sizeof(mask), &mask);
 }
 EOF
@@ -1139,7 +1140,8 @@ else
 #include <sched.h>
 int main(int argc, char **argv)
 {
-  cpu_set_t mask;
+  cpu_set_t mask = { };
+
   return sched_setaffinity(0, &mask);
 }
 EOF
@@ -1621,7 +1623,8 @@ cat > $TMPC << EOF
 #include <sched.h>
 int main(int argc, char **argv)
 {
-  struct sched_param p;
+  struct sched_param p = { };
+
   return sched_setscheduler(0, SCHED_IDLE, &p);
 }
 EOF
@@ -1743,7 +1746,9 @@ cat > $TMPC << EOF
 #include <sys/uio.h>
 int main(int argc, char **argv)
 {
-  return pwritev(0, NULL, 1, 0) + preadv(0, NULL, 1, 0);
+  struct iovec iov[1] = { };
+
+  return pwritev(0, iov, 1, 0) + preadv(0, iov, 1, 0);
 }
 EOF
 if compile_prog "" "" "pwritev"; then
@@ -1761,7 +1766,9 @@ cat > $TMPC << EOF
 #include <sys/uio.h>
 int main(int argc, char **argv)
 {
-  return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0);
+  struct iovec iov[1] = { };
+
+  return pwritev2(0, iov, 1, 0, 0) + preadv2(0, iov, 1, 0, 0);
 }
 EOF
 if compile_prog "" "" "pwritev2"; then
@@ -1787,14 +1794,14 @@ cat > $TMPC << EOF
 #include <stdio.h>
 int main(int argc, char **argv)
 {
-  struct addrinfo hints;
-  struct in6_addr addr;
+  struct addrinfo hints = { };
+  struct in6_addr addr = in6addr_any;
   int ret;
 
   ret = getaddrinfo(NULL, NULL, &hints, NULL);
   freeaddrinfo(NULL);
-  printf("%s\n", gai_strerror(ret));
-  addr = in6addr_any;
+  printf("%s %d\n", gai_strerror(ret), addr.s6_addr[0]);
+
   return 0;
 }
 EOF
@@ -2155,9 +2162,7 @@ cat > $TMPC << EOF
 #include <stdlib.h>
 int main(int argc, char **argv)
 {
-  int rc;
-  rc = pmem_is_pmem(NULL, 0);
-  return 0;
+  return pmem_is_pmem(NULL, 0);
 }
 EOF
 if compile_prog "" "-lpmem" "libpmem"; then
@@ -2176,7 +2181,7 @@ if test "$libpmem" = "yes"; then
 #include <stdlib.h>
 int main(int argc, char **argv)
 {
-  pmem_memcpy(NULL, NULL, NULL, NULL);
+  pmem_memcpy(NULL, NULL, 0, 0);
   return 0;
 }
 EOF
@@ -2392,7 +2397,7 @@ int main(int argc, char **argv)
   FILE *mtab = setmntent(NULL, "r");
   struct mntent *mnt = getmntent(mtab);
   endmntent(mtab);
-  return 0;
+  return mnt != NULL;
 }
 EOF
 if compile_prog "" "" "getmntent"; then
@@ -2573,6 +2578,10 @@ int main(int argc, char **argv)
 }
 EOF
 if test "$libzbc" != "no" ; then
+  if [ -e /usr/include/libzbc/libzbc ]; then
+    # SUSE Linux.
+    CFLAGS="$CFLAGS -I/usr/include/libzbc"
+  fi
   if compile_prog "" "-lzbc" "libzbc"; then
     libzbc="yes"
     if ! check_min_lib_version libzbc 5; then

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-02 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-02 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 5ceed0be62f3ce8903d5747674f9f70f44e736d6:

  docs: update language setting for Sphinx build (2022-05-31 20:58:00 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 26faead0f3c6e7608b89a51373f1455b91377fcb:

  t/zbd: skip test case #13 when max_open_zones is too small (2022-06-02 03:58:31 -0600)

----------------------------------------------------------------
Ankit Kumar (5):
      configure: check nvme uring command support
      nvme: add nvme opcodes, structures and helper functions
      docs: document options for io_uring_cmd I/O engine
      zbd: Check for direct flag only if its block device
      engines/io_uring: Enable zone device support for io_uring_cmd I/O engine

Anuj Gupta (4):
      io_uring.h: add IORING_SETUP_SQE128 and IORING_SETUP_CQE32
      init: return error incase an invalid value is passed as option
      engines/io_uring: add new I/O engine for uring passthrough support
      examples: add 2 example job file for io_uring_cmd engine

Jens Axboe (3):
      engines/io_uring: cleanup supported case
      engines/nvme: fix 'fd' leak in error handling
      engines/nvme: ioctl return value is an int

Shin'ichiro Kawasaki (1):
      t/zbd: skip test case #13 when max_open_zones is too small

 HOWTO.rst                    |  41 +++--
 Makefile                     |   4 +-
 configure                    |  21 +++
 engines/io_uring.c           | 346 +++++++++++++++++++++++++++++++++++++++++-
 engines/nvme.c               | 347 +++++++++++++++++++++++++++++++++++++++++++
 engines/nvme.h               | 214 ++++++++++++++++++++++++++
 examples/uring-cmd-ng.fio    |  25 ++++
 examples/uring-cmd-zoned.fio |  31 ++++
 file.h                       |  12 +-
 fio.1                        |  33 +++-
 init.c                       |   9 ++
 os/linux/io_uring.h          |  45 +++++-
 t/zbd/test-zbd-support       |  23 ++-
 zbd.c                        |   4 +-
 14 files changed, 1123 insertions(+), 32 deletions(-)
 create mode 100644 engines/nvme.c
 create mode 100644 engines/nvme.h
 create mode 100644 examples/uring-cmd-ng.fio
 create mode 100644 examples/uring-cmd-zoned.fio

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 8ab3ac4b..28ac2b7c 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -1952,6 +1952,10 @@ I/O engine
 			for both direct and buffered IO.
 			This engine defines engine specific options.
 
+		**io_uring_cmd**
+			Fast Linux native asynchronous I/O for passthrough commands.
+			This engine defines engine specific options.
+
 		**libaio**
 			Linux native asynchronous I/O. Note that Linux may only support
 			queued behavior with non-buffered I/O (set ``direct=1`` or
@@ -2255,22 +2259,34 @@ with the caveat that when used on the command line, they must come after the
 	values for trim IOs are ignored. This option is mutually exclusive with
 	the :option:`cmdprio_percentage` option.
 
-.. option:: fixedbufs : [io_uring]
+.. option:: fixedbufs : [io_uring] [io_uring_cmd]
+
+	If fio is asked to do direct IO, then Linux will map pages for each
+	IO call, and release them when IO is done. If this option is set, the
+	pages are pre-mapped before IO is started. This eliminates the need to
+	map and release for each IO. This is more efficient, and reduces the
+	IO latency as well.
+
+.. option:: nonvectored : [io_uring] [io_uring_cmd]
 
-    If fio is asked to do direct IO, then Linux will map pages for each
-    IO call, and release them when IO is done. If this option is set, the
-    pages are pre-mapped before IO is started. This eliminates the need to
-    map and release for each IO. This is more efficient, and reduces the
-    IO latency as well.
+	With this option, fio will use non-vectored read/write commands, where
+	the address field must contain the buffer address directly. Default is -1.
 
-.. option:: registerfiles : [io_uring]
+.. option:: force_async=int : [io_uring] [io_uring_cmd]
+
+	Normal operation for io_uring is to try and issue an sqe as
+	non-blocking first, and if that fails, execute it in an async manner.
+	With this option set to N, fio will ask for the sqe of every Nth
+	request to be issued in an async manner. Default is 0.
+
+.. option:: registerfiles : [io_uring] [io_uring_cmd]
 
 	With this option, fio registers the set of files being used with the
 	kernel. This avoids the overhead of managing file counts in the kernel,
 	making the submission and completion part more lightweight. Required
 	for the below :option:`sqthread_poll` option.
 
-.. option:: sqthread_poll : [io_uring] [xnvme]
+.. option:: sqthread_poll : [io_uring] [io_uring_cmd] [xnvme]
 
 	Normally fio will submit IO by issuing a system call to notify the
 	kernel of available items in the SQ ring. If this option is set, the
@@ -2278,14 +2294,19 @@ with the caveat that when used on the command line, they must come after the
 	This frees up cycles for fio, at the cost of using more CPU in the
 	system.
 
-.. option:: sqthread_poll_cpu : [io_uring]
+.. option:: sqthread_poll_cpu : [io_uring] [io_uring_cmd]
 
 	When :option:`sqthread_poll` is set, this option provides a way to
 	define which CPU should be used for the polling thread.
 
+.. option:: cmd_type=str : [io_uring_cmd]
+
+	Specifies the type of uring passthrough command to be used. Supported
+	value is nvme. Default is nvme.
+
 .. option:: hipri
 
-   [io_uring], [xnvme]
+   [io_uring] [io_uring_cmd] [xnvme]
 
         If this option is set, fio will attempt to use polled IO completions.
         Normal IO completions generate interrupts to signal the completion of
diff --git a/Makefile b/Makefile
index ed66305a..188a74d7 100644
--- a/Makefile
+++ b/Makefile
@@ -231,7 +231,7 @@ ifdef CONFIG_LIBXNVME
 endif
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
-		oslib/linux-dev-lookup.c engines/io_uring.c
+		oslib/linux-dev-lookup.c engines/io_uring.c engines/nvme.c
   cmdprio_SRCS = engines/cmdprio.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
@@ -241,7 +241,7 @@ endif
 endif
 ifeq ($(CONFIG_TARGET_OS), Android)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
-		oslib/linux-dev-lookup.c engines/io_uring.c
+		oslib/linux-dev-lookup.c engines/io_uring.c engines/nvme.c
   cmdprio_SRCS = engines/cmdprio.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
diff --git a/configure b/configure
index 4ee536a0..8182322b 100755
--- a/configure
+++ b/configure
@@ -2587,6 +2587,27 @@ if test "$libzbc" != "no" ; then
 fi
 print_config "libzbc engine" "$libzbc"
 
+if test "$targetos" = "Linux" ; then
+##########################################
+# Check NVME_URING_CMD support
+cat > $TMPC << EOF
+#include <linux/nvme_ioctl.h>
+int main(void)
+{
+  struct nvme_uring_cmd *cmd;
+
+  return sizeof(struct nvme_uring_cmd);
+}
+EOF
+if compile_prog "" "" "nvme uring cmd"; then
+  output_sym "CONFIG_NVME_URING_CMD"
+  nvme_uring_cmd="yes"
+else
+  nvme_uring_cmd="no"
+fi
+print_config "NVMe uring command support" "$nvme_uring_cmd"
+fi
+
 ##########################################
 # Check if we have xnvme
 if test "$xnvme" != "yes" ; then
diff --git a/engines/io_uring.c b/engines/io_uring.c
index 1e15647e..cceafe69 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -24,6 +24,13 @@
 #include "../lib/types.h"
 #include "../os/linux/io_uring.h"
 #include "cmdprio.h"
+#include "nvme.h"
+
+#include <sys/stat.h>
+
+enum uring_cmd_type {
+	FIO_URING_CMD_NVME = 1,
+};
 
 struct io_sq_ring {
 	unsigned *head;
@@ -85,6 +92,7 @@ struct ioring_options {
 	unsigned int uncached;
 	unsigned int nowait;
 	unsigned int force_async;
+	enum uring_cmd_type cmd_type;
 };
 
 static const int ddir_to_op[2][2] = {
@@ -270,6 +278,22 @@ static struct fio_option options[] = {
 		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_IOURING,
 	},
+	{
+		.name	= "cmd_type",
+		.lname	= "Uring cmd type",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct ioring_options, cmd_type),
+		.help	= "Specify uring-cmd type",
+		.def	= "nvme",
+		.posval = {
+			  { .ival = "nvme",
+			    .oval = FIO_URING_CMD_NVME,
+			    .help = "Issue nvme-uring-cmd",
+			  },
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_IOURING,
+	},
 	{
 		.name	= NULL,
 	},
@@ -373,6 +397,48 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
 	return 0;
 }
 
+static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	struct fio_file *f = io_u->file;
+	struct nvme_uring_cmd *cmd;
+	struct io_uring_sqe *sqe;
+
+	/* only supports nvme_uring_cmd */
+	if (o->cmd_type != FIO_URING_CMD_NVME)
+		return -EINVAL;
+
+	sqe = &ld->sqes[(io_u->index) << 1];
+
+	if (o->registerfiles) {
+		sqe->fd = f->engine_pos;
+		sqe->flags = IOSQE_FIXED_FILE;
+	} else {
+		sqe->fd = f->fd;
+	}
+	sqe->rw_flags = 0;
+	if (!td->o.odirect && o->uncached)
+		sqe->rw_flags |= RWF_UNCACHED;
+	if (o->nowait)
+		sqe->rw_flags |= RWF_NOWAIT;
+
+	sqe->opcode = IORING_OP_URING_CMD;
+	sqe->user_data = (unsigned long) io_u;
+	if (o->nonvectored)
+		sqe->cmd_op = NVME_URING_CMD_IO;
+	else
+		sqe->cmd_op = NVME_URING_CMD_IO_VEC;
+	if (o->force_async && ++ld->prepped == o->force_async) {
+		ld->prepped = 0;
+		sqe->flags |= IOSQE_ASYNC;
+	}
+
+	cmd = (struct nvme_uring_cmd *)sqe->cmd;
+	return fio_nvme_uring_cmd_prep(cmd, io_u,
+			o->nonvectored ? NULL : &ld->iovecs[io_u->index]);
+}
+
 static struct io_u *fio_ioring_event(struct thread_data *td, int event)
 {
 	struct ioring_data *ld = td->io_ops_data;
@@ -396,6 +462,29 @@ static struct io_u *fio_ioring_event(struct thread_data *td, int event)
 	return io_u;
 }
 
+static struct io_u *fio_ioring_cmd_event(struct thread_data *td, int event)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	struct io_uring_cqe *cqe;
+	struct io_u *io_u;
+	unsigned index;
+
+	index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
+	if (o->cmd_type == FIO_URING_CMD_NVME)
+		index <<= 1;
+
+	cqe = &ld->cq_ring.cqes[index];
+	io_u = (struct io_u *) (uintptr_t) cqe->user_data;
+
+	if (cqe->res != 0)
+		io_u->error = -cqe->res;
+	else
+		io_u->error = 0;
+
+	return io_u;
+}
+
 static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events,
 				   unsigned int max)
 {
@@ -622,14 +711,22 @@ static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p)
 	sring->array = ptr + p->sq_off.array;
 	ld->sq_ring_mask = *sring->ring_mask;
 
-	ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
+	if (p->flags & IORING_SETUP_SQE128)
+		ld->mmap[1].len = 2 * p->sq_entries * sizeof(struct io_uring_sqe);
+	else
+		ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
 	ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, ld->ring_fd,
 				IORING_OFF_SQES);
 	ld->mmap[1].ptr = ld->sqes;
 
-	ld->mmap[2].len = p->cq_off.cqes +
-				p->cq_entries * sizeof(struct io_uring_cqe);
+	if (p->flags & IORING_SETUP_CQE32) {
+		ld->mmap[2].len = p->cq_off.cqes +
+					2 * p->cq_entries * sizeof(struct io_uring_cqe);
+	} else {
+		ld->mmap[2].len = p->cq_off.cqes +
+					p->cq_entries * sizeof(struct io_uring_cqe);
+	}
 	ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
 			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
 			IORING_OFF_CQ_RING);
@@ -728,6 +825,61 @@ retry:
 	return fio_ioring_mmap(ld, &p);
 }
 
+static int fio_ioring_cmd_queue_init(struct thread_data *td)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	int depth = td->o.iodepth;
+	struct io_uring_params p;
+	int ret;
+
+	memset(&p, 0, sizeof(p));
+
+	if (o->hipri)
+		p.flags |= IORING_SETUP_IOPOLL;
+	if (o->sqpoll_thread) {
+		p.flags |= IORING_SETUP_SQPOLL;
+		if (o->sqpoll_set) {
+			p.flags |= IORING_SETUP_SQ_AFF;
+			p.sq_thread_cpu = o->sqpoll_cpu;
+		}
+	}
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		p.flags |= IORING_SETUP_SQE128;
+		p.flags |= IORING_SETUP_CQE32;
+	}
+
+	/*
+	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
+	 * than that.
+	 */
+	p.flags |= IORING_SETUP_CQSIZE;
+	p.cq_entries = depth;
+
+retry:
+	ret = syscall(__NR_io_uring_setup, depth, &p);
+	if (ret < 0) {
+		if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
+			p.flags &= ~IORING_SETUP_CQSIZE;
+			goto retry;
+		}
+		return ret;
+	}
+
+	ld->ring_fd = ret;
+
+	fio_ioring_probe(td);
+
+	if (o->fixedbufs) {
+		ret = syscall(__NR_io_uring_register, ld->ring_fd,
+				IORING_REGISTER_BUFFERS, ld->iovecs, depth);
+		if (ret < 0)
+			return ret;
+	}
+
+	return fio_ioring_mmap(ld, &p);
+}
+
 static int fio_ioring_register_files(struct thread_data *td)
 {
 	struct ioring_data *ld = td->io_ops_data;
@@ -811,6 +963,52 @@ static int fio_ioring_post_init(struct thread_data *td)
 	return 0;
 }
 
+static int fio_ioring_cmd_post_init(struct thread_data *td)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	struct io_u *io_u;
+	int err, i;
+
+	for (i = 0; i < td->o.iodepth; i++) {
+		struct iovec *iov = &ld->iovecs[i];
+
+		io_u = ld->io_u_index[i];
+		iov->iov_base = io_u->buf;
+		iov->iov_len = td_max_bs(td);
+	}
+
+	err = fio_ioring_cmd_queue_init(td);
+	if (err) {
+		int init_err = errno;
+
+		td_verror(td, init_err, "io_queue_init");
+		return 1;
+	}
+
+	for (i = 0; i < td->o.iodepth; i++) {
+		struct io_uring_sqe *sqe;
+
+		if (o->cmd_type == FIO_URING_CMD_NVME) {
+			sqe = &ld->sqes[i << 1];
+			memset(sqe, 0, 2 * sizeof(*sqe));
+		} else {
+			sqe = &ld->sqes[i];
+			memset(sqe, 0, sizeof(*sqe));
+		}
+	}
+
+	if (o->registerfiles) {
+		err = fio_ioring_register_files(td);
+		if (err) {
+			td_verror(td, errno, "ioring_register_files");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 static int fio_ioring_init(struct thread_data *td)
 {
 	struct ioring_options *o = td->eo;
@@ -868,6 +1066,38 @@ static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f)
 	return 0;
 }
 
+static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		struct nvme_data *data = NULL;
+		unsigned int nsid, lba_size = 0;
+		unsigned long long nlba = 0;
+		int ret;
+
+		/* Store the namespace-id and lba size. */
+		data = FILE_ENG_DATA(f);
+		if (data == NULL) {
+			ret = fio_nvme_get_info(f, &nsid, &lba_size, &nlba);
+			if (ret)
+				return ret;
+
+			data = calloc(1, sizeof(struct nvme_data));
+			data->nsid = nsid;
+			data->lba_shift = ilog2(lba_size);
+
+			FILE_SET_ENG_DATA(f, data);
+		}
+	}
+	if (!ld || !o->registerfiles)
+		return generic_open_file(td, f);
+
+	f->fd = ld->fds[f->engine_pos];
+	return 0;
+}
+
 static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
 {
 	struct ioring_data *ld = td->io_ops_data;
@@ -880,7 +1110,85 @@ static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
 	return 0;
 }
 
-static struct ioengine_ops ioengine = {
+static int fio_ioring_cmd_close_file(struct thread_data *td,
+				     struct fio_file *f)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		struct nvme_data *data = FILE_ENG_DATA(f);
+
+		FILE_SET_ENG_DATA(f, NULL);
+		free(data);
+	}
+	if (!ld || !o->registerfiles)
+		return generic_close_file(td, f);
+
+	f->fd = -1;
+	return 0;
+}
+
+static int fio_ioring_cmd_get_file_size(struct thread_data *td,
+					struct fio_file *f)
+{
+	struct ioring_options *o = td->eo;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		struct nvme_data *data = NULL;
+		unsigned int nsid, lba_size = 0;
+		unsigned long long nlba = 0;
+		int ret;
+
+		ret = fio_nvme_get_info(f, &nsid, &lba_size, &nlba);
+		if (ret)
+			return ret;
+
+		data = calloc(1, sizeof(struct nvme_data));
+		data->nsid = nsid;
+		data->lba_shift = ilog2(lba_size);
+
+		f->real_file_size = lba_size * nlba;
+		fio_file_set_size_known(f);
+
+		FILE_SET_ENG_DATA(f, data);
+		return 0;
+	}
+	return generic_get_file_size(td, f);
+}
+
+static int fio_ioring_cmd_get_zoned_model(struct thread_data *td,
+					  struct fio_file *f,
+					  enum zbd_zoned_model *model)
+{
+	return fio_nvme_get_zoned_model(td, f, model);
+}
+
+static int fio_ioring_cmd_report_zones(struct thread_data *td,
+				       struct fio_file *f, uint64_t offset,
+				       struct zbd_zone *zbdz,
+				       unsigned int nr_zones)
+{
+	return fio_nvme_report_zones(td, f, offset, zbdz, nr_zones);
+}
+
+static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f,
+				   uint64_t offset, uint64_t length)
+{
+	return fio_nvme_reset_wp(td, f, offset, length);
+}
+
+static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
+					     struct fio_file *f,
+					     unsigned int *max_open_zones)
+{
+	return fio_nvme_get_max_open_zones(td, f, max_open_zones);
+}
+
+static struct ioengine_ops ioengine_uring = {
 	.name			= "io_uring",
 	.version		= FIO_IOOPS_VERSION,
 	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD,
@@ -900,13 +1208,39 @@ static struct ioengine_ops ioengine = {
 	.option_struct_size	= sizeof(struct ioring_options),
 };
 
+static struct ioengine_ops ioengine_uring_cmd = {
+	.name			= "io_uring_cmd",
+	.version		= FIO_IOOPS_VERSION,
+	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO,
+	.init			= fio_ioring_init,
+	.post_init		= fio_ioring_cmd_post_init,
+	.io_u_init		= fio_ioring_io_u_init,
+	.prep			= fio_ioring_cmd_prep,
+	.queue			= fio_ioring_queue,
+	.commit			= fio_ioring_commit,
+	.getevents		= fio_ioring_getevents,
+	.event			= fio_ioring_cmd_event,
+	.cleanup		= fio_ioring_cleanup,
+	.open_file		= fio_ioring_cmd_open_file,
+	.close_file		= fio_ioring_cmd_close_file,
+	.get_file_size		= fio_ioring_cmd_get_file_size,
+	.get_zoned_model	= fio_ioring_cmd_get_zoned_model,
+	.report_zones		= fio_ioring_cmd_report_zones,
+	.reset_wp		= fio_ioring_cmd_reset_wp,
+	.get_max_open_zones	= fio_ioring_cmd_get_max_open_zones,
+	.options		= options,
+	.option_struct_size	= sizeof(struct ioring_options),
+};
+
 static void fio_init fio_ioring_register(void)
 {
-	register_ioengine(&ioengine);
+	register_ioengine(&ioengine_uring);
+	register_ioengine(&ioengine_uring_cmd);
 }
 
 static void fio_exit fio_ioring_unregister(void)
 {
-	unregister_ioengine(&ioengine);
+	unregister_ioengine(&ioengine_uring);
+	unregister_ioengine(&ioengine_uring_cmd);
 }
 #endif
diff --git a/engines/nvme.c b/engines/nvme.c
new file mode 100644
index 00000000..9ffc5303
--- /dev/null
+++ b/engines/nvme.c
@@ -0,0 +1,347 @@
+/*
+ * nvme structure declarations and helper functions for the
+ * io_uring_cmd engine.
+ */
+
+#include "nvme.h"
+
+/*
+ * Fill in *cmd as an NVMe read or write command for io_u.
+ *
+ * The command is zeroed first. The starting LBA and the block count minus
+ * one are derived from io_u->offset and io_u->xfer_buflen using the
+ * lba_shift kept in the file's engine data. When iov is non-NULL the
+ * transfer buffer is described through that iovec and cmd->addr points at
+ * the iovec (data_len = 1); otherwise cmd->addr points at the buffer
+ * itself and data_len is its length in bytes.
+ *
+ * Returns 0 on success, -ENOTSUP for any direction other than read/write.
+ */
+int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+			    struct iovec *iov)
+{
+	struct nvme_data *nvme = FILE_ENG_DATA(io_u->file);
+	__u64 start_lba;
+	__u32 lba_count;
+
+	memset(cmd, 0, sizeof(*cmd));
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		cmd->opcode = nvme_cmd_read;
+		break;
+	case DDIR_WRITE:
+		cmd->opcode = nvme_cmd_write;
+		break;
+	default:
+		return -ENOTSUP;
+	}
+
+	cmd->nsid = nvme->nsid;
+
+	/* cdw10/cdw11 carry the low/high half of the starting lba */
+	start_lba = io_u->offset >> nvme->lba_shift;
+	cmd->cdw10 = start_lba & 0xffffffff;
+	cmd->cdw11 = start_lba >> 32;
+
+	/* cdw12 carries the number of lba's for read/write, minus one */
+	lba_count = (io_u->xfer_buflen >> nvme->lba_shift) - 1;
+	cmd->cdw12 = lba_count;
+
+	if (!iov) {
+		cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf;
+		cmd->data_len = io_u->xfer_buflen;
+		return 0;
+	}
+
+	/* vectored form: addr references a single iovec entry */
+	iov->iov_base = io_u->xfer_buf;
+	iov->iov_len = io_u->xfer_buflen;
+	cmd->addr = (__u64)(uintptr_t)iov;
+	cmd->data_len = 1;
+	return 0;
+}
+
+/*
+ * Issue an NVMe identify admin command through NVME_IOCTL_ADMIN_CMD.
+ *
+ * cns is placed in cdw10 and csi, shifted by NVME_IDENTIFY_CSI_SHIFT, in
+ * cdw11. data_len is fixed to NVME_IDENTIFY_DATA_SIZE, so `data` is
+ * expected to be at least that large.
+ *
+ * Returns the ioctl() result unchanged.
+ */
+static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
+			 enum nvme_csi csi, void *data)
+{
+	struct nvme_passthru_cmd identify;
+
+	memset(&identify, 0, sizeof(identify));
+	identify.opcode = nvme_admin_identify;
+	identify.nsid = nsid;
+	identify.addr = (__u64)(uintptr_t)data;
+	identify.data_len = NVME_IDENTIFY_DATA_SIZE;
+	identify.cdw10 = cns;
+	identify.cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT;
+	identify.timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT;
+
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &identify);
+}
+
+/*
+ * Look up the namespace behind an NVMe generic char device.
+ *
+ * Opens f->file_name read-only, obtains the namespace id through
+ * NVME_IOCTL_ID and issues an identify namespace command. On success the
+ * namespace id is stored in *nsid, the LBA data size (1 << ds of the
+ * selected LBA format) in *lba_sz and the namespace size in LBAs in *nlba.
+ *
+ * Returns 0 on success, 1 if the file is not a character device, -errno
+ * if open() or NVME_IOCTL_ID fails, or the non-zero nvme_identify()
+ * result.
+ */
+int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz,
+		      __u64 *nlba)
+{
+	struct nvme_id_ns ns;
+	int namespace_id;
+	int fd, err;
+
+	if (f->filetype != FIO_TYPE_CHAR) {
+		log_err("ioengine io_uring_cmd only works with nvme ns "
+			"generic char devices (/dev/ngXnY)\n");
+		return 1;
+	}
+
+	fd = open(f->file_name, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	namespace_id = ioctl(fd, NVME_IOCTL_ID);
+	if (namespace_id < 0) {
+		/* Save errno first: log_err() and close() may overwrite it. */
+		err = -errno;
+		log_err("failed to fetch namespace-id\n");
+		close(fd);
+		return err;
+	}
+
+	/*
+	 * Identify namespace to get namespace-id, namespace size in LBA's
+	 * and LBA data size.
+	 */
+	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
+				NVME_CSI_NVM, &ns);
+	if (err) {
+		log_err("failed to fetch identify namespace\n");
+		close(fd);
+		return err;
+	}
+
+	*nsid = namespace_id;
+	/* The low 4 bits of flbas select the LBA format in use. */
+	*lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds;
+	*nlba = ns.nsze;
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Determine the zoned model of an NVMe generic char device.
+ *
+ * Two ZNS identify commands are issued (CNS_CSI_CTRL, then CNS_CSI_NS).
+ * *model is set to ZBD_HOST_MANAGED only when both succeed, and to
+ * ZBD_NONE as soon as one of them fails.
+ *
+ * Returns -EINVAL if f is not a character device, -errno if the device
+ * cannot be opened, and 0 otherwise (a failed identify is not reported
+ * as an error).
+ */
+int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
+			     enum zbd_zoned_model *model)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_id_ns ns;
+	int fd, ret = 0;
+
+	if (f->filetype != FIO_TYPE_CHAR)
+		return -EINVAL;
+
+	/* File is not yet opened */
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	/* Using nvme_id_ns for data as sizes are same */
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL,
+				NVME_CSI_ZNS, &ns);
+	if (ret) {
+		*model = ZBD_NONE;
+		goto out;
+	}
+
+	/* Using nvme_id_ns for data as sizes are same */
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+				NVME_CSI_ZNS, &ns);
+	if (ret) {
+		*model = ZBD_NONE;
+		goto out;
+	}
+
+	*model = ZBD_HOST_MANAGED;
+out:
+	close(fd);
+	return 0;
+}
+
+/*
+ * Issue a zone management receive command (report zones action) through
+ * NVME_IOCTL_IO_CMD.
+ *
+ * slba is split across cdw10/cdw11, cdw12 holds (data_len >> 2) - 1 and
+ * cdw13 combines NVME_ZNS_ZRA_REPORT_ZONES with zras_feat. The report is
+ * written to `data`, which is data_len bytes long.
+ *
+ * Returns the ioctl() result unchanged.
+ */
+static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat,
+			     __u32 data_len, void *data)
+{
+	struct nvme_passthru_cmd recv;
+
+	memset(&recv, 0, sizeof(recv));
+	recv.opcode = nvme_zns_cmd_mgmt_recv;
+	recv.nsid = nsid;
+	recv.addr = (__u64)(uintptr_t)data;
+	recv.data_len = data_len;
+	recv.cdw10 = slba & 0xffffffff;
+	recv.cdw11 = slba >> 32;
+	recv.cdw12 = (data_len >> 2) - 1;
+	recv.cdw13 = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat;
+	recv.timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT;
+
+	return ioctl(fd, NVME_IOCTL_IO_CMD, &recv);
+}
+
+/* Turn a failed NVMe passthrough ioctl result into a negative errno. */
+static int nvme_ioctl_err(int ret)
+{
+	/*
+	 * A negative result means the ioctl itself failed and errno is set.
+	 * NOTE(review): a positive result is taken to be a device status
+	 * code; it is reported as -EIO so that callers never mistake it
+	 * for a zone count.
+	 */
+	return ret < 0 ? -errno : -EIO;
+}
+
+/*
+ * Fill zbdz[] with up to nr_zones zone descriptions, starting at byte
+ * offset `offset` of the device behind f.
+ *
+ * The device is opened separately for this call. The zone size is taken
+ * from the ZNS identify namespace data, and nr_zones is clamped to the
+ * number of zones between offset and f->real_file_size. Zones are fetched
+ * in chunks of at most 1024 descriptors.
+ *
+ * Returns the number of zones stored in zbdz[] on success, or a negative
+ * errno value on failure (-ENOMEM, -EINVAL for a zero zone size, -EIO for
+ * an unknown zone type or a device-reported error).
+ */
+int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
+			  uint64_t offset, struct zbd_zone *zbdz,
+			  unsigned int nr_zones)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_zone_report *zr;
+	struct nvme_zns_id_ns zns_ns;
+	struct nvme_id_ns ns;
+	unsigned int i = 0, j, zones_fetched = 0;
+	unsigned int max_zones, zones_chunks = 1024;
+	unsigned int reported;
+	int fd, err, ret = 0;
+	__u32 zr_len;
+	__u64 zlen;
+
+	/* File is not yet opened */
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
+	zr = calloc(1, zr_len);
+	if (!zr) {
+		close(fd);
+		return -ENOMEM;
+	}
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS,
+				NVME_CSI_NVM, &ns);
+	if (ret) {
+		/* convert before log_err() gets a chance to touch errno */
+		err = nvme_ioctl_err(ret);
+		log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name,
+			ret);
+		ret = err;
+		goto out;
+	}
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+				NVME_CSI_ZNS, &zns_ns);
+	if (ret) {
+		err = nvme_ioctl_err(ret);
+		log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
+			f->file_name, ret);
+		ret = err;
+		goto out;
+	}
+	zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift;
+	if (!zlen) {
+		/* a zero zone size would divide by zero below */
+		log_err("%s: device reports a zone size of 0\n", f->file_name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	max_zones = (f->real_file_size - offset) / zlen;
+	if (max_zones < nr_zones)
+		nr_zones = max_zones;
+
+	if (nr_zones < zones_chunks)
+		zones_chunks = nr_zones;
+
+	while (zones_fetched < nr_zones) {
+		if (zones_fetched + zones_chunks >= nr_zones) {
+			zones_chunks = nr_zones - zones_fetched;
+			zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
+		}
+		ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift,
+					NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr);
+		if (ret) {
+			err = nvme_ioctl_err(ret);
+			log_err("%s: nvme_zns_report_zones failed, err=%d\n",
+				f->file_name, ret);
+			ret = err;
+			goto out;
+		}
+
+		/*
+		 * Never walk past the descriptors that fit in the buffer we
+		 * supplied, and stop instead of spinning forever if the
+		 * device returns an empty report.
+		 */
+		reported = zones_chunks;
+		if (zr->nr_zones < reported)
+			reported = zr->nr_zones;
+		if (!reported)
+			break;
+
+		/* Transform the zone-report */
+		for (j = 0; j < reported; j++, i++) {
+			struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]);
+
+			zbdz[i].start = desc->zslba << data->lba_shift;
+			zbdz[i].len = zlen;
+			zbdz[i].wp = desc->wp << data->lba_shift;
+			zbdz[i].capacity = desc->zcap << data->lba_shift;
+
+			/* Zone Type is stored in first 4 bits. */
+			switch (desc->zt & 0x0f) {
+			case NVME_ZONE_TYPE_SEQWRITE_REQ:
+				zbdz[i].type = ZBD_ZONE_TYPE_SWR;
+				break;
+			default:
+				log_err("%s: invalid type for zone at offset %llu.\n",
+					f->file_name, desc->zslba);
+				ret = -EIO;
+				goto out;
+			}
+
+			/* Zone State is stored in last 4 bits. */
+			switch (desc->zs >> 4) {
+			case NVME_ZNS_ZS_EMPTY:
+				zbdz[i].cond = ZBD_ZONE_COND_EMPTY;
+				break;
+			case NVME_ZNS_ZS_IMPL_OPEN:
+				zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN;
+				break;
+			case NVME_ZNS_ZS_EXPL_OPEN:
+				zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN;
+				break;
+			case NVME_ZNS_ZS_CLOSED:
+				zbdz[i].cond = ZBD_ZONE_COND_CLOSED;
+				break;
+			case NVME_ZNS_ZS_FULL:
+				zbdz[i].cond = ZBD_ZONE_COND_FULL;
+				break;
+			case NVME_ZNS_ZS_READ_ONLY:
+			case NVME_ZNS_ZS_OFFLINE:
+			default:
+				/* Treat all these conditions as offline (don't use!) */
+				zbdz[i].cond = ZBD_ZONE_COND_OFFLINE;
+				zbdz[i].wp = zbdz[i].start;
+			}
+		}
+		zones_fetched += reported;
+		offset += reported * zlen;
+	}
+
+	ret = zones_fetched;
+out:
+	free(zr);
+	close(fd);
+
+	return ret;
+}
+
+/*
+ * Reset the write pointer of every zone in [offset, offset + length).
+ *
+ * One zone management send command (reset action) is issued per zone,
+ * stepping by td->o.zone_size. f->fd is used when the file is already
+ * open; otherwise the device is opened for the duration of the call.
+ *
+ * Returns 0 on success or a negative errno value. Processing stops at the
+ * first zone that fails, so the failure cannot be hidden by a later
+ * successful reset.
+ */
+int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
+		      uint64_t offset, uint64_t length)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	unsigned int nr_zones, i;
+	unsigned long long zslba;
+	int fd, ret = 0;
+
+	/* If the file is not yet opened, open it for this function. */
+	fd = f->fd;
+	if (fd < 0) {
+		fd = open(f->file_name, O_RDWR | O_LARGEFILE);
+		if (fd < 0)
+			return -errno;
+	}
+
+	zslba = offset >> data->lba_shift;
+	nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;
+
+	for (i = 0; i < nr_zones; i++, zslba += (td->o.zone_size >> data->lba_shift)) {
+		struct nvme_passthru_cmd cmd = {
+			.opcode         = nvme_zns_cmd_mgmt_send,
+			.nsid           = data->nsid,
+			.cdw10          = zslba & 0xffffffff,
+			.cdw11          = zslba >> 32,
+			.cdw13          = NVME_ZNS_ZSA_RESET,
+			.addr           = (__u64)(uintptr_t)NULL,
+			.data_len       = 0,
+			.timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
+		};
+
+		ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
+		if (ret) {
+			/*
+			 * Negative: the ioctl failed and errno is set.
+			 * NOTE(review): a positive value is taken to be a
+			 * device status code and is reported as -EIO.
+			 */
+			ret = ret < 0 ? -errno : -EIO;
+			break;
+		}
+	}
+
+	if (f->fd < 0)
+		close(fd);
+	return ret;
+}
+
+/*
+ * Read the open zone limit of the device behind f.
+ *
+ * The device is opened read-only and a ZNS identify namespace command is
+ * issued; on success *max_open_zones is set to the reported mor field
+ * plus one.
+ *
+ * Returns 0 on success or a negative errno value, in which case
+ * *max_open_zones is left untouched.
+ */
+int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+				unsigned int *max_open_zones)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_zns_id_ns zns_ns;
+	int fd, err, ret = 0;
+
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+				NVME_CSI_ZNS, &zns_ns);
+	if (ret) {
+		/*
+		 * Convert before log_err() can touch errno. Negative: the
+		 * ioctl failed and errno is set. NOTE(review): a positive
+		 * value is taken to be a device status code and is
+		 * reported as -EIO so it is not read as success.
+		 */
+		err = ret < 0 ? -errno : -EIO;
+		log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
+			f->file_name, ret);
+		ret = err;
+		goto out;
+	}
+
+	*max_open_zones = zns_ns.mor + 1;
+out:
+	close(fd);
+	return ret;
+}
diff --git a/engines/nvme.h b/engines/nvme.h
new file mode 100644
index 00000000..70a89b74
--- /dev/null
+++ b/engines/nvme.h
@@ -0,0 +1,214 @@
+/*
+ * nvme structure declarations and helper functions for the
+ * io_uring_cmd engine.
+ */
+
+#ifndef FIO_NVME_H
+#define FIO_NVME_H
+
+#include <linux/nvme_ioctl.h>
+#include "../fio.h"
+
+/*
+ * If the uapi headers installed on the system lacks nvme uring command
+ * support, use the local version to prevent compilation issues.
+ */
+#ifndef CONFIG_NVME_URING_CMD
+struct nvme_uring_cmd {	/* field notes describe how fio_nvme_uring_cmd_prep() fills it */
+	__u8	opcode;	/* nvme_cmd_read or nvme_cmd_write */
+	__u8	flags;
+	__u16	rsvd1;
+	__u32	nsid;	/* copied from nvme_data.nsid */
+	__u32	cdw2;
+	__u32	cdw3;
+	__u64	metadata;
+	__u64	addr;	/* transfer buffer, or the iovec when one is passed */
+	__u32	metadata_len;
+	__u32	data_len;	/* buffer length in bytes, or 1 when addr is an iovec */
+	__u32	cdw10;	/* starting LBA, low 32 bits */
+	__u32	cdw11;	/* starting LBA, high 32 bits */
+	__u32	cdw12;	/* number of LBAs minus one */
+	__u32	cdw13;
+	__u32	cdw14;
+	__u32	cdw15;
+	__u32	timeout_ms;
+	__u32   rsvd2;
+};
+
+#define NVME_URING_CMD_IO	_IOWR('N', 0x80, struct nvme_uring_cmd)
+#define NVME_URING_CMD_IO_VEC	_IOWR('N', 0x81, struct nvme_uring_cmd)
+#endif /* CONFIG_NVME_URING_CMD */
+
+#define NVME_DEFAULT_IOCTL_TIMEOUT 0
+#define NVME_IDENTIFY_DATA_SIZE 4096
+#define NVME_IDENTIFY_CSI_SHIFT 24
+
+#define NVME_ZNS_ZRA_REPORT_ZONES 0
+#define NVME_ZNS_ZRAS_FEAT_ERZ (1 << 16)
+#define NVME_ZNS_ZSA_RESET 0x4
+#define NVME_ZONE_TYPE_SEQWRITE_REQ 0x2
+
+enum nvme_identify_cns {	/* cns argument of nvme_identify(); sent in cdw10 */
+	NVME_IDENTIFY_CNS_NS		= 0x00,
+	NVME_IDENTIFY_CNS_CSI_NS	= 0x05,
+	NVME_IDENTIFY_CNS_CSI_CTRL	= 0x06,
+};
+
+enum nvme_csi {	/* csi argument of nvme_identify(); shifted by NVME_IDENTIFY_CSI_SHIFT into cdw11 */
+	NVME_CSI_NVM			= 0,
+	NVME_CSI_KV			= 1,
+	NVME_CSI_ZNS			= 2,
+};
+
+enum nvme_admin_opcode {	/* opcodes sent through NVME_IOCTL_ADMIN_CMD */
+	nvme_admin_identify		= 0x06,
+};
+
+enum nvme_io_opcode {	/* opcodes placed in I/O (non-admin) commands */
+	nvme_cmd_write			= 0x01,
+	nvme_cmd_read			= 0x02,
+	nvme_zns_cmd_mgmt_send		= 0x79,	/* used by fio_nvme_reset_wp() */
+	nvme_zns_cmd_mgmt_recv		= 0x7a,	/* used by nvme_report_zones() */
+};
+
+enum nvme_zns_zs {	/* matched against (zs >> 4) of a zone descriptor in fio_nvme_report_zones() */
+	NVME_ZNS_ZS_EMPTY		= 0x1,
+	NVME_ZNS_ZS_IMPL_OPEN		= 0x2,
+	NVME_ZNS_ZS_EXPL_OPEN		= 0x3,
+	NVME_ZNS_ZS_CLOSED		= 0x4,
+	NVME_ZNS_ZS_READ_ONLY		= 0xd,	/* mapped to ZBD_ZONE_COND_OFFLINE */
+	NVME_ZNS_ZS_FULL		= 0xe,
+	NVME_ZNS_ZS_OFFLINE		= 0xf,
+};
+
+struct nvme_data {	/* per-file data obtained through FILE_ENG_DATA() */
+	__u32 nsid;	/* namespace id placed in every command */
+	__u32 lba_shift;	/* shift converting between byte offsets and LBAs */
+};
+
+struct nvme_lbaf {	/* one entry of nvme_id_ns.lbaf[] */
+	__le16			ms;
+	__u8			ds;	/* fio_nvme_get_info() computes the LBA size as 1 << ds */
+	__u8			rp;
+};
+
+struct nvme_id_ns {	/* data buffer handed to nvme_identify() */
+	__le64			nsze;	/* returned as the LBA count by fio_nvme_get_info() */
+	__le64			ncap;
+	__le64			nuse;
+	__u8			nsfeat;
+	__u8			nlbaf;
+	__u8			flbas;	/* low 4 bits index lbaf[] (and nvme_zns_id_ns.lbafe[]) */
+	__u8			mc;
+	__u8			dpc;
+	__u8			dps;
+	__u8			nmic;
+	__u8			rescap;
+	__u8			fpi;
+	__u8			dlfeat;
+	__le16			nawun;
+	__le16			nawupf;
+	__le16			nacwu;
+	__le16			nabsn;
+	__le16			nabo;
+	__le16			nabspf;
+	__le16			noiob;
+	__u8			nvmcap[16];
+	__le16			npwg;
+	__le16			npwa;
+	__le16			npdg;
+	__le16			npda;
+	__le16			nows;
+	__le16			mssrl;
+	__le32			mcl;
+	__u8			msrc;
+	__u8			rsvd81[11];
+	__le32			anagrpid;
+	__u8			rsvd96[3];
+	__u8			nsattr;
+	__le16			nvmsetid;
+	__le16			endgid;
+	__u8			nguid[16];
+	__u8			eui64[8];
+	struct nvme_lbaf	lbaf[16];	/* LBA formats; entry selected via flbas */
+	__u8			rsvd192[192];
+	__u8			vs[3712];
+};
+
+/*
+ * Integer base-2 logarithm: index of the highest set bit of i.
+ * Returns -1 when i is 0.
+ */
+static inline int ilog2(uint32_t i)
+{
+	int bit;
+
+	/* one increment per shift needed to clear the value */
+	for (bit = -1; i != 0; i >>= 1)
+		bit++;
+
+	return bit;
+}
+
+struct nvme_zns_lbafe {	/* one entry of nvme_zns_id_ns.lbafe[] */
+	__le64	zsze;	/* zone size; fio_nvme_report_zones() shifts it by lba_shift to get zlen */
+	__u8	zdes;
+	__u8	rsvd9[7];
+};
+
+struct nvme_zns_id_ns {	/* data buffer for nvme_identify() with NVME_CSI_ZNS */
+	__le16			zoc;
+	__le16			ozcs;
+	__le32			mar;
+	__le32			mor;	/* fio_nvme_get_max_open_zones() reports mor + 1 */
+	__le32			rrl;
+	__le32			frl;
+	__le32			rrl1;
+	__le32			rrl2;
+	__le32			rrl3;
+	__le32			frl1;
+	__le32			frl2;
+	__le32			frl3;
+	__le32			numzrwa;
+	__le16			zrwafg;
+	__le16			zrwasz;
+	__u8			zrwacap;
+	__u8			rsvd53[2763];
+	struct nvme_zns_lbafe	lbafe[64];	/* indexed with nvme_id_ns.flbas & 0x0f */
+	__u8			vs[256];
+};
+
+struct nvme_zns_desc {	/* one zone entry of struct nvme_zone_report */
+	__u8	zt;	/* zone type in the low 4 bits */
+	__u8	zs;	/* zone state in the high 4 bits */
+	__u8	za;
+	__u8	zai;
+	__u8	rsvd4[4];
+	__le64	zcap;	/* shifted by lba_shift to fill zbd_zone.capacity */
+	__le64	zslba;	/* shifted by lba_shift to fill zbd_zone.start */
+	__le64	wp;	/* shifted by lba_shift to fill zbd_zone.wp */
+	__u8	rsvd32[32];
+};
+
+struct nvme_zone_report {	/* buffer filled by nvme_report_zones() */
+	__le64			nr_zones;	/* entries[] count walked by fio_nvme_report_zones() */
+	__u8			rsvd8[56];
+	struct nvme_zns_desc	entries[];
+};
+
+int fio_nvme_get_info(struct fio_file *f, __u32 *nsid, __u32 *lba_sz,
+		      __u64 *nlba);
+
+int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+			    struct iovec *iov);
+
+int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
+			     enum zbd_zoned_model *model);
+
+int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
+			  uint64_t offset, struct zbd_zone *zbdz,
+			  unsigned int nr_zones);
+
+int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
+		      uint64_t offset, uint64_t length);
+
+int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+				unsigned int *max_open_zones);
+
+#endif
diff --git a/examples/uring-cmd-ng.fio b/examples/uring-cmd-ng.fio
new file mode 100644
index 00000000..b2888a00
--- /dev/null
+++ b/examples/uring-cmd-ng.fio
@@ -0,0 +1,25 @@
+# io_uring_cmd I/O engine for nvme-ns generic character device
+
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+size=1G
+iodepth=32
+bs=4K
+thread=1
+stonewall=1
+
+[rand-write]
+rw=randwrite
+sqthread_poll=1
+
+[rand-read]
+rw=randread
+
+[write-opts]
+rw=write
+sqthread_poll=1
+sqthread_poll_cpu=0
+nonvectored=1
+registerfiles=1
diff --git a/examples/uring-cmd-zoned.fio b/examples/uring-cmd-zoned.fio
new file mode 100644
index 00000000..58e8f79e
--- /dev/null
+++ b/examples/uring-cmd-zoned.fio
@@ -0,0 +1,31 @@
+# io_uring_cmd I/O engine for nvme-ns generic zoned character device
+#
+# NOTE: with write workload iodepth must be set to 1 as there is no IO
+# scheduler.
+
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+zonemode=zbd
+size=1G
+iodepth=1
+bs=256K
+verify=crc32c
+stonewall=1
+
+[rand-write]
+rw=randwrite
+
+[write-opts]
+rw=write
+registerfiles=1
+sqthread_poll=1
+sqthread_poll_cpu=0
+
+[randwrite-opts]
+rw=randwrite
+sqthread_poll=1
+sqthread_poll_cpu=0
+nonvectored=1
+registerfiles=1
diff --git a/file.h b/file.h
index faf65a2a..da1b8947 100644
--- a/file.h
+++ b/file.h
@@ -126,12 +126,14 @@ struct fio_file {
 	unsigned int last_write_idx;
 
 	/*
-	 * For use by the io engine for offset or private data storage
+	 * For use by the io engine to store offset
 	 */
-	union {
-		uint64_t engine_pos;
-		void *engine_data;
-	};
+	uint64_t engine_pos;
+
+	/*
+	 * For use by the io engine for private data storage
+	 */
+	void *engine_data;
 
 	/*
 	 * if io is protected by a semaphore, this is set
diff --git a/fio.1 b/fio.1
index bdba3142..948c01f9 100644
--- a/fio.1
+++ b/fio.1
@@ -1739,6 +1739,15 @@ Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
 .B pvsync2
 Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
 .TP
+.B io_uring
+Fast Linux native asynchronous I/O. Supports async IO
+for both direct and buffered IO.
+This engine defines engine specific options.
+.TP
+.B io_uring_cmd
+Fast Linux native asynchronous I/O for passthrough commands.
+This engine defines engine specific options.
+.TP
 .B libaio
 Linux native asynchronous I/O. Note that Linux may only support
 queued behavior with non-buffered I/O (set `direct=1' or
@@ -2040,35 +2049,49 @@ for trim IOs are ignored. This option is mutually exclusive with the
 \fBcmdprio_percentage\fR option.
 .RE
 .TP
-.BI (io_uring)fixedbufs
+.BI (io_uring,io_uring_cmd)fixedbufs
 If fio is asked to do direct IO, then Linux will map pages for each IO call, and
 release them when IO is done. If this option is set, the pages are pre-mapped
 before IO is started. This eliminates the need to map and release for each IO.
 This is more efficient, and reduces the IO latency as well.
 .TP
-.BI (io_uring,xnvme)hipri
+.BI (io_uring,io_uring_cmd)nonvectored
+With this option, fio will use non-vectored read/write commands, where address
+must contain the address directly. Default is -1.
+.TP
+.BI (io_uring,io_uring_cmd)force_async
+Normal operation for io_uring is to try and issue an sqe as non-blocking first,
+and if that fails, execute it in an async manner. With this option set to N,
+fio will ask for every Nth sqe to be issued in an async manner. Default
+is 0.
+.TP
+.BI (io_uring,io_uring_cmd,xnvme)hipri
 If this option is set, fio will attempt to use polled IO completions. Normal IO
 completions generate interrupts to signal the completion of IO, polled
 completions do not. Hence they are require active reaping by the application.
 The benefits are more efficient IO for high IOPS scenarios, and lower latencies
 for low queue depth IO.
 .TP
-.BI (io_uring)registerfiles
+.BI (io_uring,io_uring_cmd)registerfiles
 With this option, fio registers the set of files being used with the kernel.
 This avoids the overhead of managing file counts in the kernel, making the
 submission and completion part more lightweight. Required for the below
 sqthread_poll option.
 .TP
-.BI (io_uring,xnvme)sqthread_poll
+.BI (io_uring,io_uring_cmd,xnvme)sqthread_poll
 Normally fio will submit IO by issuing a system call to notify the kernel of
 available items in the SQ ring. If this option is set, the act of submitting IO
 will be done by a polling thread in the kernel. This frees up cycles for fio, at
 the cost of using more CPU in the system.
 .TP
-.BI (io_uring)sqthread_poll_cpu
+.BI (io_uring,io_uring_cmd)sqthread_poll_cpu
 When `sqthread_poll` is set, this option provides a way to define which CPU
 should be used for the polling thread.
 .TP
+.BI (io_uring_cmd)cmd_type \fR=\fPstr
+Specifies the type of uring passthrough command to be used. Supported
+value is nvme. Default is nvme.
+.TP
 .BI (libaio)userspace_reap
 Normally, with the libaio engine in use, fio will use the
 \fBio_getevents\fR\|(3) system call to reap newly returned events. With
diff --git a/init.c b/init.c
index f7d702f8..da800776 100644
--- a/init.c
+++ b/init.c
@@ -2810,6 +2810,15 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
 				break;
 
 			ret = fio_cmd_ioengine_option_parse(td, opt, val);
+
+			if (ret) {
+				if (td) {
+					put_job(td);
+					td = NULL;
+				}
+				do_exit++;
+				exit_val = 1;
+			}
 			break;
 		}
 		case 'w':
diff --git a/os/linux/io_uring.h b/os/linux/io_uring.h
index 42b2fe84..929997f8 100644
--- a/os/linux/io_uring.h
+++ b/os/linux/io_uring.h
@@ -22,6 +22,7 @@ struct io_uring_sqe {
 	union {
 		__u64	off;	/* offset into file */
 		__u64	addr2;
+		__u32	cmd_op;
 	};
 	union {
 		__u64	addr;	/* pointer to buffer or iovecs */
@@ -60,7 +61,17 @@ struct io_uring_sqe {
 		__s32	splice_fd_in;
 		__u32	file_index;
 	};
-	__u64	__pad2[2];
+	union {
+		struct {
+			__u64	addr3;
+			__u64	__pad2[1];
+		};
+		/*
+		 * If the ring is initialized with IORING_SETUP_SQE128, then
+		 * this field is used for 80 bytes of arbitrary command data
+		 */
+		__u8	cmd[0];
+	};
 };
 
 enum {
@@ -101,6 +112,24 @@ enum {
 #define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
+#define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Only valid with COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
+
+#define IORING_SETUP_SQE128		(1U << 10) /* SQEs are 128 byte */
+#define IORING_SETUP_CQE32		(1U << 11) /* CQEs are 32 byte */
 
 enum {
 	IORING_OP_NOP,
@@ -143,6 +172,14 @@ enum {
 	IORING_OP_MKDIRAT,
 	IORING_OP_SYMLINKAT,
 	IORING_OP_LINKAT,
+	IORING_OP_MSG_RING,
+	IORING_OP_FSETXATTR,
+	IORING_OP_SETXATTR,
+	IORING_OP_FGETXATTR,
+	IORING_OP_GETXATTR,
+	IORING_OP_SOCKET,
+	IORING_OP_URING_CMD,
+
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -192,6 +229,12 @@ struct io_uring_cqe {
 	__u64	user_data;	/* sqe->data submission passed back */
 	__s32	res;		/* result code for this event */
 	__u32	flags;
+
+	/*
+	 * If the ring is initialized with IORING_SETUP_CQE32, then this field
+	 * contains 16-bytes of padding, doubling the size of the CQE.
+	 */
+	__u64 big_cqe[];
 };
 
 /*
diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support
index 7e2fff00..d4aaa813 100755
--- a/t/zbd/test-zbd-support
+++ b/t/zbd/test-zbd-support
@@ -229,6 +229,14 @@ require_regular_block_dev() {
 	return 0
 }
 
+require_block_dev() {	# returns 0 if $realdev is a block device; else sets SKIP_REASON and returns 1
+	if [[ -b "$realdev" ]]; then
+		return 0
+	fi
+	SKIP_REASON="$dev is not a block device"
+	return 1
+}
+
 require_seq_zones() {
 	local req_seq_zones=${1}
 	local seq_bytes=$((disk_size - first_sequential_zone_sector * 512))
@@ -251,8 +259,19 @@ require_conv_zones() {
 	return 0
 }
 
-# Check whether buffered writes are refused.
+require_max_open_zones() {	# $1: minimum max_open_zones the calling test needs
+	local min=${1}
+
+	if ((max_open_zones !=0 && max_open_zones < min)); then	# a value of 0 never causes a skip
+		SKIP_REASON="max_open_zones of $dev is smaller than $min"
+		return 1
+	fi
+	return 0
+}
+
+# Check whether buffered writes are refused for block devices.
 test1() {
+    require_block_dev || return $SKIP_TESTCASE
     run_fio --name=job1 --filename="$dev" --rw=write --direct=0 --bs=4K	\
 	    "$(ioengine "psync")" --size="${zone_size}" --thread=1	\
 	    --zonemode=zbd --zonesize="${zone_size}" 2>&1 |
@@ -453,6 +472,8 @@ test12() {
 test13() {
     local size off capacity
 
+    require_max_open_zones 4 || return $SKIP_TESTCASE
+
     prep_write
     size=$((8 * zone_size))
     off=$((first_sequential_zone_sector * 512))
diff --git a/zbd.c b/zbd.c
index b1fd6b4b..627fb968 100644
--- a/zbd.c
+++ b/zbd.c
@@ -466,7 +466,7 @@ out:
 	return res;
 }
 
-/* Verify whether direct I/O is used for all host-managed zoned drives. */
+/* Verify whether direct I/O is used for all host-managed zoned block drives. */
 static bool zbd_using_direct_io(void)
 {
 	struct thread_data *td;
@@ -477,7 +477,7 @@ static bool zbd_using_direct_io(void)
 		if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE))
 			continue;
 		for_each_file(td, f, j) {
-			if (f->zbd_info &&
+			if (f->zbd_info && f->filetype == FIO_TYPE_BLOCK &&
 			    f->zbd_info->model == ZBD_HOST_MANAGED)
 				return false;
 		}

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-06-01 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-06-01 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit e1aeff3ac96a51128b0493377f405e38bdc83500:

  Merge branch 'wip-lmy-rados' of https://github.com/liangmingyuanneo/fio (2022-05-29 09:32:18 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 5ceed0be62f3ce8903d5747674f9f70f44e736d6:

  docs: update language setting for Sphinx build (2022-05-31 20:58:00 -0600)

----------------------------------------------------------------
Vincent Fu (1):
      docs: update language setting for Sphinx build

 doc/conf.py | 7 -------
 1 file changed, 7 deletions(-)

---

Diff of recent changes:

diff --git a/doc/conf.py b/doc/conf.py
index 10b72ecb..844f951a 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -85,13 +85,6 @@ def fio_version():
 
 version, release = fio_version()
 
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
 #

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-05-30 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-05-30 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit a2840331c3cae5b2b0a13f99e58ae18375e2e40d:

  Merge branch 'master' of https://github.com/guoanwu/fio (2022-05-25 06:30:06 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to e1aeff3ac96a51128b0493377f405e38bdc83500:

  Merge branch 'wip-lmy-rados' of https://github.com/liangmingyuanneo/fio (2022-05-29 09:32:18 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'wip-lmy-rados' of https://github.com/liangmingyuanneo/fio

Vincent Fu (5):
      steadystate: delete incorrect comment
      configure: refer to zlib1g-dev package for zlib support
      HOWTO: add blank line for prettier formatting
      t/run-fio-tests: improve json data decoding
      docs: update discussion of huge page sizes

liangmingyuan (1):
      engines/ceph: add option for setting config file path

 HOWTO.rst          | 31 ++++++++++++++++++++-----------
 configure          |  2 +-
 engines/rados.c    | 13 ++++++++++++-
 examples/rados.fio |  1 +
 fio.1              | 23 ++++++++++++++---------
 steadystate.c      |  7 -------
 t/run-fio-tests.py | 20 +++++++-------------
 7 files changed, 55 insertions(+), 42 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 84bea5c5..8ab3ac4b 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -1064,6 +1064,7 @@ Target file/device
 	thread/process.
 
 .. option:: ignore_zone_limits=bool
+
 	If this option is used, fio will ignore the maximum number of open
 	zones limit of the zoned block device in use, thus allowing the
 	option :option:`max_open_zones` value to be larger than the device
@@ -1822,13 +1823,14 @@ Buffers and memory
 	**mmaphuge** to work, the system must have free huge pages allocated. This
 	can normally be checked and set by reading/writing
 	:file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page
-	is 4MiB in size. So to calculate the number of huge pages you need for a
-	given job file, add up the I/O depth of all jobs (normally one unless
-	:option:`iodepth` is used) and multiply by the maximum bs set. Then divide
-	that number by the huge page size. You can see the size of the huge pages in
-	:file:`/proc/meminfo`. If no huge pages are allocated by having a non-zero
-	number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also
-	see :option:`hugepage-size`.
+        is 2 or 4MiB in size depending on the platform. So to calculate the
+        number of huge pages you need for a given job file, add up the I/O
+        depth of all jobs (normally one unless :option:`iodepth` is used) and
+        multiply by the maximum bs set. Then divide that number by the huge
+        page size. You can see the size of the huge pages in
+        :file:`/proc/meminfo`. If no huge pages are allocated by having a
+        non-zero number in `nr_hugepages`, using **mmaphuge** or **shmhuge**
+        will fail. Also see :option:`hugepage-size`.
 
 	**mmaphuge** also needs to have hugetlbfs mounted and the file location
 	should point there. So if it's mounted in :file:`/huge`, you would use
@@ -1847,10 +1849,12 @@ Buffers and memory
 
 .. option:: hugepage-size=int
 
-	Defines the size of a huge page. Must at least be equal to the system
-	setting, see :file:`/proc/meminfo`. Defaults to 4MiB.  Should probably
-	always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the
-	preferred way to set this to avoid setting a non-pow-2 bad value.
+        Defines the size of a huge page. Must at least be equal to the system
+        setting, see :file:`/proc/meminfo` and
+        :file:`/sys/kernel/mm/hugepages/`. Defaults to 2 or 4MiB depending on
+        the platform.  Should probably always be a multiple of megabytes, so
+        using ``hugepage-size=Xm`` is the preferred way to set this to avoid
+        setting a non-pow-2 bad value.
 
 .. option:: lockmem=int
 
@@ -2491,6 +2495,11 @@ with the caveat that when used on the command line, they must come after the
 	the full *type.id* string. If no type. prefix is given, fio will add
 	'client.' by default.
 
+.. option:: conf=str : [rados]
+
+    Specifies the configuration path of ceph cluster, so conf file does not
+    have to be /etc/ceph/ceph.conf.
+
 .. option:: busy_poll=bool : [rbd,rados]
 
         Poll store instead of waiting for completion. Usually this provides better
diff --git a/configure b/configure
index 95b60bb7..4ee536a0 100755
--- a/configure
+++ b/configure
@@ -3142,7 +3142,7 @@ if test "$libzbc" = "yes" ; then
   output_sym "CONFIG_LIBZBC"
 fi
 if test "$zlib" = "no" ; then
-  echo "Consider installing zlib-dev (zlib-devel, some fio features depend on it."
+  echo "Consider installing zlib1g-dev (zlib-devel) as some fio features depend on it."
   if test "$build_static" = "yes"; then
     echo "Note that some distros have separate packages for static libraries."
   fi
diff --git a/engines/rados.c b/engines/rados.c
index 976f9229..d0d15c5b 100644
--- a/engines/rados.c
+++ b/engines/rados.c
@@ -37,6 +37,7 @@ struct rados_options {
 	char *cluster_name;
 	char *pool_name;
 	char *client_name;
+	char *conf;
 	int busy_poll;
 	int touch_objects;
 };
@@ -69,6 +70,16 @@ static struct fio_option options[] = {
 		.category = FIO_OPT_C_ENGINE,
 		.group    = FIO_OPT_G_RBD,
 	},
+	{
+		.name     = "conf",
+		.lname    = "ceph configuration file path",
+		.type     = FIO_OPT_STR_STORE,
+		.help     = "Path of the ceph configuration file",
+		.off1     = offsetof(struct rados_options, conf),
+		.def      = "/etc/ceph/ceph.conf",
+		.category = FIO_OPT_C_ENGINE,
+		.group    = FIO_OPT_G_RBD,
+	},
 	{
 		.name     = "busy_poll",
 		.lname    = "busy poll mode",
@@ -184,7 +195,7 @@ static int _fio_rados_connect(struct thread_data *td)
 		goto failed_early;
 	}
 
-	r = rados_conf_read_file(rados->cluster, NULL);
+	r = rados_conf_read_file(rados->cluster, o->conf);
 	if (r < 0) {
 		log_err("rados_conf_read_file failed.\n");
 		goto failed_early;
diff --git a/examples/rados.fio b/examples/rados.fio
index 035cbff4..dd86f354 100644
--- a/examples/rados.fio
+++ b/examples/rados.fio
@@ -14,6 +14,7 @@
 ioengine=rados
 clientname=admin
 pool=rados
+conf=/etc/ceph/ceph.conf
 busy_poll=0
 rw=randwrite
 bs=4k
diff --git a/fio.1 b/fio.1
index ded7bbfc..bdba3142 100644
--- a/fio.1
+++ b/fio.1
@@ -1631,11 +1631,11 @@ multiplied by the I/O depth given. Note that for \fBshmhuge\fR and
 \fBmmaphuge\fR to work, the system must have free huge pages allocated. This
 can normally be checked and set by reading/writing
 `/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page
-is 4MiB in size. So to calculate the number of huge pages you need for a
-given job file, add up the I/O depth of all jobs (normally one unless
-\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide
-that number by the huge page size. You can see the size of the huge pages in
-`/proc/meminfo'. If no huge pages are allocated by having a non-zero
+is 2 or 4MiB in size depending on the platform. So to calculate the number of
+huge pages you need for a given job file, add up the I/O depth of all jobs
+(normally one unless \fBiodepth\fR is used) and multiply by the maximum bs set.
+Then divide that number by the huge page size. You can see the size of the huge
+pages in `/proc/meminfo'. If no huge pages are allocated by having a non-zero
 number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also
 see \fBhugepage\-size\fR.
 .P
@@ -1655,10 +1655,11 @@ of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and
 \fBbs\fR used.
 .TP
 .BI hugepage\-size \fR=\fPint
-Defines the size of a huge page. Must at least be equal to the system
-setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably
-always be a multiple of megabytes, so using `hugepage\-size=Xm' is the
-preferred way to set this to avoid setting a non-pow-2 bad value.
+Defines the size of a huge page. Must at least be equal to the system setting,
+see `/proc/meminfo' and `/sys/kernel/mm/hugepages/'. Defaults to 2 or 4MiB
+depending on the platform. Should probably always be a multiple of megabytes,
+so using `hugepage\-size=Xm' is the preferred way to set this to avoid setting
+a non-pow-2 bad value.
 .TP
 .BI lockmem \fR=\fPint
 Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
@@ -2243,6 +2244,10 @@ Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall
 the full *type.id* string. If no type. prefix is given, fio will add 'client.'
 by default.
 .TP
+.BI (rados)conf \fR=\fPstr
+Specifies the path of the Ceph cluster configuration file, so that the conf
+file does not have to be /etc/ceph/ceph.conf.
+.TP
 .BI (rbd,rados)busy_poll \fR=\fPbool
 Poll store instead of waiting for completion. Usually this provides better
 throughput at cost of higher(up to 100%) CPU utilization.
diff --git a/steadystate.c b/steadystate.c
index 2e3da1db..ad19318c 100644
--- a/steadystate.c
+++ b/steadystate.c
@@ -250,13 +250,6 @@ int steadystate_check(void)
 		rate_time = mtime_since(&ss->prev_time, &now);
 		memcpy(&ss->prev_time, &now, sizeof(now));
 
-		/*
-		 * Begin monitoring when job starts but don't actually use
-		 * data in checking stopping criterion until ss->ramp_time is
-		 * over. This ensures that we will have a sane value in
-		 * prev_iops/bw the first time through after ss->ramp_time
-		 * is done.
-		 */
 		if (ss->state & FIO_SS_RAMP_OVER) {
 			group_bw += 1000 * (td_bytes - ss->prev_bytes) / rate_time;
 			group_iops += 1000 * (td_iops - ss->prev_iops) / rate_time;
diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py
index ecceb67e..32cdbc19 100755
--- a/t/run-fio-tests.py
+++ b/t/run-fio-tests.py
@@ -311,21 +311,15 @@ class FioJobTest(FioExeTest):
         #
         # Sometimes fio informational messages are included at the top of the
         # JSON output, especially under Windows. Try to decode output as JSON
-        # data, lopping off up to the first four lines
+        # data, skipping everything until the first {
         #
         lines = file_data.splitlines()
-        for i in range(5):
-            file_data = '\n'.join(lines[i:])
-            try:
-                self.json_data = json.loads(file_data)
-            except json.JSONDecodeError:
-                continue
-            else:
-                logging.debug("Test %d: skipped %d lines decoding JSON data", self.testnum, i)
-                return
-
-        self.failure_reason = "{0} unable to decode JSON data,".format(self.failure_reason)
-        self.passed = False
+        file_data = '\n'.join(lines[lines.index("{"):])
+        try:
+            self.json_data = json.loads(file_data)
+        except json.JSONDecodeError:
+            self.failure_reason = "{0} unable to decode JSON data,".format(self.failure_reason)
+            self.passed = False
 
 
 class FioJobTest_t0005(FioJobTest):

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-05-26 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-05-26 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6f1a24593c227a4f392f454698aca20e95f0006c:

  Makefile: Suppress `-Wimplicit-fallthrough` when compiling `lex.yy` (2022-05-12 11:02:55 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a2840331c3cae5b2b0a13f99e58ae18375e2e40d:

  Merge branch 'master' of https://github.com/guoanwu/fio (2022-05-25 06:30:06 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'master' of https://github.com/guoanwu/fio

dennis.wu (1):
      pmemblk.c: fix one logic bug - read always with write

 engines/pmemblk.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

---

Diff of recent changes:

diff --git a/engines/pmemblk.c b/engines/pmemblk.c
index fc6358e8..849d8a15 100644
--- a/engines/pmemblk.c
+++ b/engines/pmemblk.c
@@ -375,10 +375,11 @@ static enum fio_q_status fio_pmemblk_queue(struct thread_data *td,
 		off /= pmb->pmb_bsize;
 		len /= pmb->pmb_bsize;
 		while (0 < len) {
-			if (io_u->ddir == DDIR_READ &&
-			   0 != pmemblk_read(pmb->pmb_pool, buf, off)) {
-				io_u->error = errno;
-				break;
+			if (io_u->ddir == DDIR_READ) {
+				if (0 != pmemblk_read(pmb->pmb_pool, buf, off)) {
+					io_u->error = errno;
+					break;
+				}
 			} else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) {
 				io_u->error = errno;
 				break;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-05-13 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-05-13 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 12db6deb8b767ac89dd73e34dbc6f06905441e07:

  Merge branch 'patch-1' of https://github.com/ferdnyc/fio (2022-05-01 07:29:05 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6f1a24593c227a4f392f454698aca20e95f0006c:

  Makefile: Suppress `-Wimplicit-fallthrough` when compiling `lex.yy` (2022-05-12 11:02:55 -0600)

----------------------------------------------------------------
Ammar Faizi (2):
      backend: Fix indentation
      Makefile: Suppress `-Wimplicit-fallthrough` when compiling `lex.yy`

Ankit Kumar (3):
      engines/xnvme: add xnvme engine
      docs: documentation for xnvme ioengine
      examples: add example job file for xnvme engine usage

 HOWTO.rst                  |  55 ++-
 Makefile                   |  13 +-
 backend.c                  |   2 +-
 configure                  |  22 +
 engines/xnvme.c            | 981 +++++++++++++++++++++++++++++++++++++++++++++
 examples/xnvme-compare.fio |  72 ++++
 examples/xnvme-zoned.fio   |  87 ++++
 fio.1                      |  70 +++-
 optgroup.h                 |   2 +
 options.c                  |   5 +
 10 files changed, 1302 insertions(+), 7 deletions(-)
 create mode 100644 engines/xnvme.c
 create mode 100644 examples/xnvme-compare.fio
 create mode 100644 examples/xnvme-zoned.fio

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 6a3e09f5..84bea5c5 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -2171,6 +2171,12 @@ I/O engine
 		**exec**
 			Execute 3rd party tools. Could be used to perform monitoring during jobs runtime.
 
+		**xnvme**
+			I/O engine using the xNVMe C API, for NVMe devices. The xnvme engine provides
+			flexibility to access GNU/Linux Kernel NVMe driver via libaio, IOCTLs, io_uring,
+			the SPDK NVMe driver, or your own custom NVMe driver. The xnvme engine includes
+			engine specific options. (See https://xnvme.io).
+
 I/O engine specific parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2260,7 +2266,7 @@ with the caveat that when used on the command line, they must come after the
 	making the submission and completion part more lightweight. Required
 	for the below :option:`sqthread_poll` option.
 
-.. option:: sqthread_poll : [io_uring]
+.. option:: sqthread_poll : [io_uring] [xnvme]
 
 	Normally fio will submit IO by issuing a system call to notify the
 	kernel of available items in the SQ ring. If this option is set, the
@@ -2275,7 +2281,7 @@ with the caveat that when used on the command line, they must come after the
 
 .. option:: hipri
 
-   [io_uring]
+   [io_uring], [xnvme]
 
         If this option is set, fio will attempt to use polled IO completions.
         Normal IO completions generate interrupts to signal the completion of
@@ -2725,6 +2731,51 @@ with the caveat that when used on the command line, they must come after the
 
 	If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
 
+.. option:: xnvme_async=str : [xnvme]
+
+	Select the xnvme async command interface. This can take these values.
+
+	**emu**
+		This is default and used to emulate asynchronous I/O.
+	**thrpool**
+		Use thread pool for Asynchronous I/O.
+	**io_uring**
+		Use Linux io_uring/liburing for Asynchronous I/O.
+	**libaio**
+		Use Linux aio for Asynchronous I/O.
+	**posix**
+		Use POSIX aio for Asynchronous I/O.
+	**nil**
+		Use nil-io; For introspective perf. evaluation
+
+.. option:: xnvme_sync=str : [xnvme]
+
+	Select the xnvme synchronous command interface. This can take these values.
+
+	**nvme**
+		This is default and uses Linux NVMe Driver ioctl() for synchronous I/O.
+	**psync**
+		Use pread()/pwrite() for synchronous I/O.
+
+.. option:: xnvme_admin=str : [xnvme]
+
+	Select the xnvme admin command interface. This can take these values.
+
+	**nvme**
+		This is default and uses Linux NVMe Driver ioctl() for admin commands.
+	**block**
+		Use Linux Block Layer ioctl() and sysfs for admin commands.
+	**file_as_ns**
+		Use file-stat to construct NVMe idfy responses.
+
+.. option:: xnvme_dev_nsid=int : [xnvme]
+
+	xnvme namespace identifier, for userspace NVMe driver.
+
+.. option:: xnvme_iovec=int : [xnvme]
+
+	If this option is set, xnvme will use vectored read/write commands.
+
 I/O depth
 ~~~~~~~~~
 
diff --git a/Makefile b/Makefile
index e670c1f2..ed66305a 100644
--- a/Makefile
+++ b/Makefile
@@ -223,7 +223,12 @@ ifdef CONFIG_LIBZBC
   libzbc_LIBS = -lzbc
   ENGINES += libzbc
 endif
-
+ifdef CONFIG_LIBXNVME
+  xnvme_SRCS = engines/xnvme.c
+  xnvme_LIBS = $(LIBXNVME_LIBS)
+  xnvme_CFLAGS = $(LIBXNVME_CFLAGS)
+  ENGINES += xnvme
+endif
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
 		oslib/linux-dev-lookup.c engines/io_uring.c
@@ -530,8 +535,12 @@ else
 	$(QUIET_LEX)$(LEX) $<
 endif
 
+ifneq (,$(findstring -Wimplicit-fallthrough,$(CFLAGS)))
+LEX_YY_CFLAGS := -Wno-implicit-fallthrough
+endif
+
 lex.yy.o: lex.yy.c y.tab.h
-	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) $(LEX_YY_CFLAGS) -c $<
 
 y.tab.o: y.tab.c y.tab.h
 	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
diff --git a/backend.c b/backend.c
index ffbb7e2a..e5bb4e25 100644
--- a/backend.c
+++ b/backend.c
@@ -2021,7 +2021,7 @@ static void reap_threads(unsigned int *nr_running, uint64_t *t_rate,
 	for_each_td(td, i) {
 		int flags = 0;
 
-		 if (!strcmp(td->o.ioengine, "cpuio"))
+		if (!strcmp(td->o.ioengine, "cpuio"))
 			cputhreads++;
 		else
 			realthreads++;
diff --git a/configure b/configure
index d327d2ca..95b60bb7 100755
--- a/configure
+++ b/configure
@@ -171,6 +171,7 @@ march_set="no"
 libiscsi="no"
 libnbd="no"
 libnfs="no"
+xnvme="no"
 libzbc=""
 dfs=""
 dynamic_engines="no"
@@ -240,6 +241,8 @@ for opt do
   ;;
   --disable-libzbc) libzbc="no"
   ;;
+  --enable-xnvme) xnvme="yes"
+  ;;
   --disable-tcmalloc) disable_tcmalloc="yes"
   ;;
   --disable-nfs) disable_nfs="yes"
@@ -291,6 +294,7 @@ if test "$show_help" = "yes" ; then
   echo "--with-ime=             Install path for DDN's Infinite Memory Engine"
   echo "--enable-libiscsi       Enable iscsi support"
   echo "--enable-libnbd         Enable libnbd (NBD engine) support"
+  echo "--enable-xnvme          Enable xnvme support"
   echo "--disable-libzbc        Disable libzbc even if found"
   echo "--disable-tcmalloc      Disable tcmalloc support"
   echo "--dynamic-libengines    Lib-based ioengines as dynamic libraries"
@@ -2583,6 +2587,19 @@ if test "$libzbc" != "no" ; then
 fi
 print_config "libzbc engine" "$libzbc"
 
+##########################################
+# Check if we have xnvme
+# Always probe for the library, also when --enable-xnvme was given, so that
+# xnvme_cflags/xnvme_libs are populated whenever xnvme ends up as "yes".
+if check_min_lib_version xnvme 0.2.0; then
+  xnvme="yes"
+  xnvme_cflags=$(pkg-config --cflags xnvme)
+  xnvme_libs=$(pkg-config --libs xnvme)
+else
+  xnvme="no"
+fi
+print_config "xnvme engine" "$xnvme"
+
 ##########################################
 # check march=armv8-a+crc+crypto
 if test "$march_armv8_a_crc_crypto" != "yes" ; then
@@ -3190,6 +3207,11 @@ if test "$libnfs" = "yes" ; then
   echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak
   echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak
 fi
+if test "$xnvme" = "yes" ; then
+  output_sym "CONFIG_LIBXNVME"
+  echo "LIBXNVME_CFLAGS=$xnvme_cflags" >> $config_host_mak
+  echo "LIBXNVME_LIBS=$xnvme_libs" >> $config_host_mak
+fi
 if test "$dynamic_engines" = "yes" ; then
   output_sym "CONFIG_DYNAMIC_ENGINES"
 fi
diff --git a/engines/xnvme.c b/engines/xnvme.c
new file mode 100644
index 00000000..c11b33a8
--- /dev/null
+++ b/engines/xnvme.c
@@ -0,0 +1,981 @@
+/*
+ * fio xNVMe IO Engine
+ *
+ * IO engine using the xNVMe C API.
+ *
+ * See: http://xnvme.io/
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stdlib.h>
+#include <assert.h>
+#include <libxnvme.h>
+#include <libxnvme_libconf.h>
+#include <libxnvme_nvm.h>
+#include <libxnvme_znd.h>
+#include <libxnvme_spec_fs.h>
+#include "fio.h"
+#include "zbd_types.h"
+#include "optgroup.h"
+
+static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Per-file wrapper: ties one fio file to the xNVMe device handle, geometry and
+ * command queue opened for it. The struct size is pinned to 64 bytes by the
+ * static assert below.
+ */
+struct xnvme_fioe_fwrap {
+	/* fio file representation */
+	struct fio_file *fio_file;
+
+	/* xNVMe device handle */
+	struct xnvme_dev *dev;
+	/* xNVMe device geometry */
+	const struct xnvme_geo *geo;
+
+	/* xNVMe command queue, set up by xnvme_queue_init() in _dev_open() */
+	struct xnvme_queue *queue;
+
+	/* Shift applied to byte offsets/lengths to obtain LBAs (see queue()) */
+	uint32_t ssw;
+	/* Copy of geo->lba_nbytes taken when the device is opened */
+	uint32_t lba_nbytes;
+
+	/* NOTE(review): presumably fills the struct up to the asserted size */
+	uint8_t _pad[24];
+};
+XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size")
+
+/*
+ * Per-thread engine state, stored in td->io_ops_data by init(). The trailing
+ * flexible array holds one wrapper per fio file; the size of the fixed part is
+ * pinned to 64 bytes by the static assert below.
+ */
+struct xnvme_fioe_data {
+	/* I/O completion queue */
+	struct io_u **iocq;
+
+	/* # of iocq entries; incremented via getevents()/cb_pool() */
+	uint64_t completed;
+
+	/*
+	 *  # of errors; incremented when observed on completion via
+	 *  getevents()/cb_pool()
+	 */
+	uint64_t ecount;
+
+	/*
+	 * Controls which device/file getevents() polls: prev is the index the
+	 * previous call returned from (-1 until then), cur the index being
+	 * polled
+	 */
+	int32_t prev;
+	int32_t cur;
+
+	/* Number of devices/files for which open() has been called */
+	int64_t nopen;
+	/* Number of devices/files allocated in files[] */
+	uint64_t nallocated;
+
+	/* iovec array indexed by io_u->index; used by queue() for vectored I/O */
+	struct iovec *iovec;
+
+	uint8_t _pad[8];
+
+	struct xnvme_fioe_fwrap files[];
+};
+XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size")
+
+/* Storage for the engine-specific options; accessed through td->eo */
+struct xnvme_fioe_options {
+	/* NOTE(review): presumably keeps real options off offset 0 — confirm against fio's option parser */
+	void *padding;
+	/* Copied to xnvme_opts.poll_io when a device is opened */
+	unsigned int hipri;
+	/* Copied to xnvme_opts.poll_sq when a device is opened */
+	unsigned int sqpoll_thread;
+	/* Namespace identifier, copied to xnvme_opts.nsid */
+	unsigned int xnvme_dev_nsid;
+	/* Non-zero makes queue() submit through xnvme_cmd_passv() */
+	unsigned int xnvme_iovec;
+	/* Backend and command-interface selections, passed through to xnvme_opts */
+	char *xnvme_be;
+	char *xnvme_async;
+	char *xnvme_sync;
+	char *xnvme_admin;
+};
+
+/*
+ * Engine-specific option table. Each entry maps an option name to a field of
+ * struct xnvme_fioe_options via offsetof(); the table ends with an entry whose
+ * name is NULL.
+ */
+static struct fio_option options[] = {
+	{
+		.name = "hipri",
+		.lname = "High Priority",
+		.type = FIO_OPT_STR_SET,
+		.off1 = offsetof(struct xnvme_fioe_options, hipri),
+		.help = "Use polled IO completions",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "sqthread_poll",
+		.lname = "Kernel SQ thread polling",
+		.type = FIO_OPT_STR_SET,
+		.off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread),
+		.help = "Offload submission/completion to kernel thread",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "xnvme_be",
+		.lname = "xNVMe Backend",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct xnvme_fioe_options, xnvme_be),
+		.help = "Select xNVMe backend [spdk,linux,fbsd]",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "xnvme_async",
+		.lname = "xNVMe Asynchronous command-interface",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct xnvme_fioe_options, xnvme_async),
+		.help = "Select xNVMe async. interface: [emu,thrpool,io_uring,libaio,posix,nil]",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "xnvme_sync",
+		.lname = "xNVMe Synchronous. command-interface",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct xnvme_fioe_options, xnvme_sync),
+		.help = "Select xNVMe sync. interface: [nvme,psync]",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "xnvme_admin",
+		.lname = "xNVMe Admin command-interface",
+		.type = FIO_OPT_STR_STORE,
+		.off1 = offsetof(struct xnvme_fioe_options, xnvme_admin),
+		.help = "Select xNVMe admin. cmd-interface: [nvme,block,file_as_ns]",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "xnvme_dev_nsid",
+		.lname = "xNVMe Namespace-Identifier, for user-space NVMe driver",
+		.type = FIO_OPT_INT,
+		.off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid),
+		.help = "xNVMe Namespace-Identifier, for user-space NVMe driver",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+	{
+		.name = "xnvme_iovec",
+		.lname = "Vectored IOs",
+		.type = FIO_OPT_STR_SET,
+		.off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec),
+		.help = "Send vectored IOs",
+		.category = FIO_OPT_C_ENGINE,
+		.group = FIO_OPT_G_XNVME,
+	},
+
+	/* Terminating entry */
+	{
+		.name = NULL,
+	},
+};
+
+/*
+ * Completion callback installed on every queue by _dev_open().
+ *
+ * `cb_arg` is the io_u that queue() attached to the command. A non-zero
+ * completion status is printed, counted in xd->ecount and reported to fio as
+ * EIO. The io_u is then appended to the completion array and the command
+ * context is handed back to its queue.
+ */
+static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg)
+{
+	struct io_u *done = cb_arg;
+	struct xnvme_fioe_data *xd = done->mmap_data;
+	uint64_t slot = xd->completed;
+
+	if (xnvme_cmd_ctx_cpl_status(ctx) != 0) {
+		xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
+		++xd->ecount;
+		done->error = EIO;
+	}
+
+	xd->iocq[slot] = done;
+	xd->completed = slot + 1;
+
+	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+}
+
+/*
+ * Builds the xNVMe open-options for a thread: starts from
+ * xnvme_opts_default() and overlays the engine options found in td->eo plus
+ * fio's own odirect setting.
+ */
+static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td)
+{
+	struct xnvme_fioe_options *eo = td->eo;
+	struct xnvme_opts result = xnvme_opts_default();
+
+	/* Backend and command-interface selection */
+	result.be = eo->xnvme_be;
+	result.admin = eo->xnvme_admin;
+	result.sync = eo->xnvme_sync;
+	result.async = eo->xnvme_async;
+
+	/* Namespace, polling and direct-I/O settings */
+	result.nsid = eo->xnvme_dev_nsid;
+	result.poll_sq = eo->sqpoll_thread;
+	result.poll_io = eo->hipri;
+	result.direct = td->o.odirect;
+
+	return result;
+}
+
+/*
+ * Releases the queue and device handle held by `fwrap` and zeroes the wrapper.
+ * The queue is only terminated when a device handle is present; `td` is not
+ * used.
+ */
+static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap)
+{
+	struct xnvme_dev *dev = fwrap->dev;
+
+	if (dev != NULL)
+		xnvme_queue_term(fwrap->queue);
+	xnvme_dev_close(dev);
+
+	memset(fwrap, 0, sizeof(*fwrap));
+}
+
+/*
+ * Engine cleanup hook: closes every device counted in xd->nallocated and
+ * frees the per-thread state.
+ *
+ * A failure to take g_serialize is logged but does not abort the teardown;
+ * the mutex is only released again if it was actually acquired.
+ */
+static void xnvme_fioe_cleanup(struct thread_data *td)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	int err;
+
+	/* init() returns before assigning io_ops_data on its early failures */
+	if (!xd)
+		return;
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		/* NOTE: not returning here */
+		log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", err);
+	}
+
+	for (uint64_t i = 0; i < xd->nallocated; ++i)
+		_dev_close(td, &xd->files[i]);
+
+	if (!err) {
+		err = pthread_mutex_unlock(&g_serialize);
+		if (err)
+			log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", err);
+	}
+
+	free(xd->iocq);
+	free(xd->iovec);
+	free(xd);
+	td->io_ops_data = NULL;
+}
+
+/**
+ * Helper function setting up device handles as addressed by the naming
+ * convention of the given `fio_file` filename.
+ *
+ * Checks thread-options for explicit control of asynchronous implementation via
+ * the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}``.
+ *
+ * Under g_serialize, opens the device and a command queue of depth
+ * td->o.iodepth, installs cb_pool() as the queue callback and marks the fio
+ * file as a block device whose size is the geometry's tbytes.
+ *
+ * Returns 0 on success and non-zero on failure. On a failure after the lock
+ * was taken, the wrapper slot is zeroed so that no closed handle stays
+ * reachable through it.
+ */
+static int _dev_open(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *fwrap;
+	int flags = 0;
+	int err;
+
+	if (f->fileno > (int)xd->nallocated) {
+		log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name);
+		return 1;
+	}
+
+	fwrap = &xd->files[f->fileno];
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
+			err);
+		return -err;
+	}
+
+	fwrap->dev = xnvme_dev_open(f->file_name, &opts);
+	if (!fwrap->dev) {
+		log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno);
+		goto failure;
+	}
+	fwrap->geo = xnvme_dev_get_geo(fwrap->dev);
+
+	err = xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue));
+	if (err) {
+		/* Report the actual error code instead of a placeholder */
+		log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(%d)\n", f->file_name,
+			err);
+		goto failure;
+	}
+	xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL);
+
+	fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev);
+	fwrap->lba_nbytes = fwrap->geo->lba_nbytes;
+
+	fwrap->fio_file = f;
+	fwrap->fio_file->filetype = FIO_TYPE_BLOCK;
+	fwrap->fio_file->real_file_size = fwrap->geo->tbytes;
+	fio_file_set_size_known(fwrap->fio_file);
+
+	err = pthread_mutex_unlock(&g_serialize);
+	if (err)
+		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
+			err);
+
+	return 0;
+
+failure:
+	xnvme_queue_term(fwrap->queue);
+	xnvme_dev_close(fwrap->dev);
+	/* Do not leave pointers to the released queue/device in the slot */
+	memset(fwrap, 0, sizeof(*fwrap));
+
+	err = pthread_mutex_unlock(&g_serialize);
+	if (err)
+		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
+			err);
+
+	return 1;
+}
+
+/*
+ * Engine init hook: allocates the per-thread state (with one wrapper slot per
+ * file), the completion array and the iovec array (both td->o.iodepth entries),
+ * then opens every file of the job through _dev_open().
+ *
+ * Requires --thread=1. Returns 0 on success and 1 on failure. Allocations made
+ * before td->io_ops_data is assigned are freed here on failure; once it is
+ * assigned, the state is released by cleanup().
+ */
+static int xnvme_fioe_init(struct thread_data *td)
+{
+	struct xnvme_fioe_data *xd = NULL;
+	struct fio_file *f;
+	unsigned int i;
+
+	if (!td->o.use_thread) {
+		log_err("ioeng->init(): --thread=1 is required\n");
+		return 1;
+	}
+
+	/* Allocate xd and iocq */
+	xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files);
+	if (!xd) {
+		log_err("ioeng->init(): !calloc(), err(%d)\n", errno);
+		return 1;
+	}
+
+	xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!xd->iocq) {
+		log_err("ioeng->init(): !calloc(), err(%d)\n", errno);
+		free(xd);
+		return 1;
+	}
+
+	xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec));
+	if (!xd->iovec) {
+		log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno);
+		free(xd->iocq);
+		free(xd);
+		return 1;
+	}
+
+	xd->prev = -1;
+	td->io_ops_data = xd;
+
+	for_each_file(td, f, i)
+	{
+		if (_dev_open(td, f)) {
+			log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name);
+			return 1;
+		}
+
+		++(xd->nallocated);
+	}
+
+	if (xd->nallocated != td->o.nr_files) {
+		log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocates the I/O buffer of `total_mem` bytes with xnvme_buf_alloc() and
+ * stores it in td->orig_buffer. Returns 0 on success, 1 on failure.
+ *
+ * NOTE: using the first device for buffer-allocators
+ */
+static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_dev *dev = xd->files[0].dev;
+
+	if (dev == NULL) {
+		log_err("ioeng->iomem_alloc(): failed; no dev-handle\n");
+		return 1;
+	}
+
+	td->orig_buffer = xnvme_buf_alloc(dev, total_mem);
+	if (td->orig_buffer == NULL)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Releases td->orig_buffer with xnvme_buf_free(). Does nothing when the
+ * engine state is missing, and logs an error when the first device has no
+ * handle.
+ *
+ * NOTE: using the first device for buffer-allocators
+ */
+static void xnvme_fioe_iomem_free(struct thread_data *td)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *fwrap;
+
+	/* Without engine state there is no device to free the buffer through */
+	if (!xd)
+		return;
+
+	fwrap = &xd->files[0];
+	if (!fwrap->dev) {
+		log_err("ioeng->iomem_free(): failed no dev-handle\n");
+		return;
+	}
+
+	xnvme_buf_free(fwrap->dev, td->orig_buffer);
+}
+
+/*
+ * Stores the engine state pointer in the io_u so that cb_pool() can reach it
+ * from the completion callback. Always returns 0.
+ */
+static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+	io_u->mmap_data = td->io_ops_data;
+
+	return 0;
+}
+
+/* Clears the engine state pointer that io_u_init() stored in the io_u */
+static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	io_u->mmap_data = NULL;
+}
+
+/*
+ * Returns the io_u stored at index `event` of the completion array. The
+ * index must be non-negative and below xd->completed; both are asserted.
+ */
+static struct io_u *xnvme_fioe_event(struct thread_data *td, int event)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+
+	assert(event >= 0);
+	assert((unsigned)event < xd->completed);
+
+	return xd->iocq[event];
+}
+
+/*
+ * Polls the per-file queues until at least `min` completions have been
+ * collected, then returns the number collected.
+ *
+ * Polling starts at the file after the one the previous call returned from
+ * and wraps around to file 0. Completions are recorded by cb_pool(), which
+ * xnvme_queue_poke() invokes and which advances xd->completed. `t` is not
+ * used.
+ *
+ * -EBUSY/-EAGAIN from a queue are transient: the thread sleeps briefly and
+ * moves on to the next file. The error is not carried into the loop
+ * condition, so polling always resumes afterwards. Any other error is logged
+ * and 0 is returned.
+ */
+static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max,
+				const struct timespec *t)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *fwrap = NULL;
+	int nfiles = xd->nallocated;
+	int err;
+
+	if (xd->prev != -1 && ++xd->prev < nfiles) {
+		fwrap = &xd->files[xd->prev];
+		xd->cur = xd->prev;
+	}
+
+	xd->completed = 0;
+	for (;;) {
+		/* First pass without a resume point, or wrapped past the last file */
+		if (fwrap == NULL || xd->cur == nfiles) {
+			fwrap = &xd->files[0];
+			xd->cur = 0;
+		}
+
+		while (xd->cur < nfiles) {
+			err = xnvme_queue_poke(fwrap->queue, max - xd->completed);
+			if (err < 0) {
+				switch (err) {
+				case -EBUSY:
+				case -EAGAIN:
+					usleep(1);
+					break;
+
+				default:
+					log_err("ioeng->getevents(): unhandled IO error\n");
+					assert(false);
+					return 0;
+				}
+			}
+			if (xd->completed >= min) {
+				xd->prev = xd->cur;
+				return xd->completed;
+			}
+			xd->cur++;
+			fwrap = &xd->files[xd->cur];
+		}
+	}
+}
+
+/*
+ * Submits one io_u as an NVM read or write command on the queue of its file.
+ *
+ * The start LBA and the zero-based LBA count are derived from the io_u
+ * offset and transfer length using the device's shift width. With the
+ * xnvme_iovec option set, the buffer is passed as a single-element iovec.
+ *
+ * Returns FIO_Q_QUEUED when the command was accepted, FIO_Q_BUSY on
+ * -EBUSY/-EAGAIN, and FIO_Q_COMPLETED with io_u->error set for an
+ * unsupported data direction or any other submission error. The command
+ * context is returned to its queue on every path except FIO_Q_QUEUED.
+ */
+static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *fwrap;
+	struct xnvme_cmd_ctx *ctx;
+	uint32_t nsid;
+	uint64_t slba;
+	uint16_t nlb;
+	int err;
+	bool vectored_io = ((struct xnvme_fioe_options *)td->eo)->xnvme_iovec;
+
+	fio_ro_check(td, io_u);
+
+	fwrap = &xd->files[io_u->file->fileno];
+	nsid = xnvme_dev_get_nsid(fwrap->dev);
+
+	slba = io_u->offset >> fwrap->ssw;
+	nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1;
+
+	ctx = xnvme_queue_get_cmd_ctx(fwrap->queue);
+	ctx->async.cb_arg = io_u;
+
+	ctx->cmd.common.nsid = nsid;
+	ctx->cmd.nvm.slba = slba;
+	ctx->cmd.nvm.nlb = nlb;
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
+		break;
+
+	case DDIR_WRITE:
+		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
+		break;
+
+	default:
+		log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir);
+
+		/* Never submit a command whose opcode was not set */
+		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+
+		io_u->error = ENOSYS;
+		assert(false);
+		return FIO_Q_COMPLETED;
+	}
+
+	if (vectored_io) {
+		xd->iovec[io_u->index].iov_base = io_u->xfer_buf;
+		xd->iovec[io_u->index].iov_len = io_u->xfer_buflen;
+
+		err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen, NULL, 0,
+				      0);
+	} else {
+		err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0);
+	}
+	switch (err) {
+	case 0:
+		return FIO_Q_QUEUED;
+
+	case -EBUSY:
+	case -EAGAIN:
+		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+		return FIO_Q_BUSY;
+
+	default:
+		log_err("ioeng->queue(): err: '%d'\n", err);
+
+		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+
+		io_u->error = abs(err);
+		assert(false);
+		return FIO_Q_COMPLETED;
+	}
+}
+
+/*
+ * Engine close hook: only decrements the open counter. The device handles
+ * themselves are released by cleanup(). Always returns 0.
+ */
+static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+
+	/* nopen is int64_t; cast so the argument matches the %ld conversion */
+	dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, (long)xd->nopen);
+
+	--(xd->nopen);
+
+	return 0;
+}
+
+/*
+ * Engine open hook: verifies that init() already set up a wrapper for `f` and
+ * increments the open counter. The devices themselves are opened in init().
+ *
+ * Returns 0 on success, 1 when the file index is not below xd->nallocated or
+ * the wrapper at that index belongs to a different fio file.
+ */
+static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+
+	/* nopen is int64_t; cast so the argument matches the %ld conversion */
+	dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, (long)xd->nopen);
+
+	/* Valid slots are 0..nallocated-1; index nallocated is past the end */
+	if (f->fileno >= (int)xd->nallocated) {
+		log_err("ioeng->open(): f->fileno > xd->nallocated; invalid assumption\n");
+		return 1;
+	}
+	if (xd->files[f->fileno].fio_file != f) {
+		log_err("ioeng->open(): fio_file != f; invalid assumption\n");
+		return 1;
+	}
+
+	++(xd->nopen);
+
+	return 0;
+}
+
+/* Engine invalidate hook: does nothing and always returns 0 */
+static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f)
+{
+	/* Consider only doing this with be:spdk */
+	return 0;
+}
+
+/*
+ * Reports the maximum number of open zones of the device behind `f` through
+ * `max_open_zones`.
+ *
+ * A temporary device handle is opened from the file name under g_serialize
+ * and closed again before returning. Files that are not of type
+ * file/block/char are ignored: 0 is returned and the output is left
+ * untouched.
+ *
+ * Returns 0 on success and a negative errno value on failure (-EINVAL when
+ * the device geometry is not zoned).
+ */
+static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+					 unsigned int *max_open_zones)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	const struct xnvme_spec_znd_idfy_ns *zns;
+	int err = 0, err_lock;
+
+	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
+	    f->filetype != FIO_TYPE_CHAR) {
+		log_info("ioeng->get_max_open_zoned(): ignoring filetype: %d\n", f->filetype);
+		return 0;
+	}
+	err_lock = pthread_mutex_lock(&g_serialize);
+	if (err_lock) {
+		log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock);
+		return -err_lock;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging; report it rather than the lock status */
+		err = -errno;
+		log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", -err);
+		goto exit;
+	}
+	if (xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) {
+		errno = EINVAL;
+		err = -errno;
+		goto exit;
+	}
+
+	zns = (void *)xnvme_dev_get_ns_css(dev);
+	if (!zns) {
+		/* Capture errno before logging can overwrite it */
+		err = -errno;
+		log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", -err);
+		goto exit;
+	}
+
+	/*
+	 * intentional overflow as the value is zero-based and NVMe
+	 * defines 0xFFFFFFFF as unlimited thus overflowing to 0 which
+	 * is how fio indicates unlimited and otherwise just converting
+	 * to one-based.
+	 */
+	*max_open_zones = zns->mor + 1;
+
+exit:
+	xnvme_dev_close(dev);
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n",
+			err_lock);
+
+	return err;
+}
+
+/**
+ * Determines the zoned model of the device behind `f` and stores it in
+ * `model`: ZBD_HOST_MANAGED for a zoned geometry, ZBD_NONE otherwise.
+ *
+ * Currently, this function is called before I/O engine initialization, so
+ * we cannot consult the file-wrapping done when 'fioe' initializes.
+ * Instead we just open based on the given filename.
+ *
+ * Returns 0 on success and a negative errno value on failure; -EINVAL is
+ * returned for an ignored file type and for an unrecognized geometry type.
+ *
+ * TODO: unify the different setup methods, consider keeping the handle around,
+ * and consider how to support the --be option in this usecase
+ */
+static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f,
+				      enum zbd_zoned_model *model)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	int err = 0, err_lock;
+
+	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
+	    f->filetype != FIO_TYPE_CHAR) {
+		log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype);
+		return -EINVAL;
+	}
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err);
+		return -err;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging can overwrite it */
+		err = -errno;
+		log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n",
+			f->file_name, -err);
+		goto exit;
+	}
+
+	switch (xnvme_dev_get_geo(dev)->type) {
+	case XNVME_GEO_UNKNOWN:
+		dprint(FD_ZBD, "%s: got 'unknown', assigning ZBD_NONE\n", f->file_name);
+		*model = ZBD_NONE;
+		break;
+
+	case XNVME_GEO_CONVENTIONAL:
+		dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name);
+		*model = ZBD_NONE;
+		break;
+
+	case XNVME_GEO_ZONED:
+		dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name);
+		*model = ZBD_HOST_MANAGED;
+		break;
+
+	default:
+		dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name);
+		*model = ZBD_NONE;
+		errno = EINVAL;
+		err = -errno;
+		break;
+	}
+
+exit:
+	xnvme_dev_close(dev);
+
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock);
+
+	return err;
+}
+
+/**
+ * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors.
+ *
+ * The implementation converts the NVMe Zoned Command Set log-pages for Zone
+ * descriptors into the Linux Kernel Zoned Block Report format.
+ *
+ * NOTE: This function is called before I/O engine initialization, that is,
+ * before ``_dev_open`` has been called and file-wrapping is setup. Thus it has
+ * to do the ``_dev_open`` itself, and shut it down again once it is done
+ * retrieving the log-pages and converting them to the report format.
+ *
+ * TODO: unify the different setup methods, consider keeping the handle around,
+ * and consider how to support the --async option in this usecase
+ */
+static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
+				   struct zbd_zone *zbdz, unsigned int nr_zones)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL;
+	struct xnvme_dev *dev = NULL;
+	const struct xnvme_geo *geo = NULL;
+	struct xnvme_znd_report *rprt = NULL;
+	uint32_t ssw;
+	uint64_t slba;
+	unsigned int limit = 0;
+	int err = 0, err_lock;
+
+	dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset,
+	       nr_zones);
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
+			err);
+		return -err;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name,
+			errno);
+		goto exit;
+	}
+
+	geo = xnvme_dev_get_geo(dev);
+	ssw = xnvme_dev_get_ssw(dev);
+	lbafe = xnvme_znd_dev_get_lbafe(dev);
+
+	limit = nr_zones > geo->nzone ? geo->nzone : nr_zones;
+
+	dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit);
+
+	slba = ((offset >> ssw) / geo->nsect) * geo->nsect;
+
+	rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0);
+	if (!rprt) {
+		log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n",
+			f->file_name, errno);
+		err = -errno;
+		goto exit;
+	}
+	if (rprt->nentries != limit) {
+		log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name);
+		err = 1;
+		goto exit;
+	}
+	if (offset > geo->tbytes) {
+		log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name);
+		goto exit;
+	}
+
+	/* Transform the zone-report */
+	for (uint32_t idx = 0; idx < rprt->nentries; ++idx) {
+		struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx);
+
+		zbdz[idx].start = descr->zslba << ssw;
+		zbdz[idx].len = lbafe->zsze << ssw;
+		zbdz[idx].capacity = descr->zcap << ssw;
+		zbdz[idx].wp = descr->wp << ssw;
+
+		switch (descr->zt) {
+		case XNVME_SPEC_ZND_TYPE_SEQWR:
+			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
+			break;
+
+		default:
+			log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n",
+				f->file_name, zbdz[idx].start);
+			err = -EIO;
+			goto exit;
+		}
+
+		switch (descr->zs) {
+		case XNVME_SPEC_ZND_STATE_EMPTY:
+			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
+			break;
+		case XNVME_SPEC_ZND_STATE_IOPEN:
+			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
+			break;
+		case XNVME_SPEC_ZND_STATE_EOPEN:
+			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
+			break;
+		case XNVME_SPEC_ZND_STATE_CLOSED:
+			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
+			break;
+		case XNVME_SPEC_ZND_STATE_FULL:
+			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
+			break;
+
+		case XNVME_SPEC_ZND_STATE_RONLY:
+		case XNVME_SPEC_ZND_STATE_OFFLINE:
+		default:
+			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
+			break;
+		}
+	}
+
+exit:
+	xnvme_buf_virt_free(rprt);
+
+	xnvme_dev_close(dev);
+
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock);
+
+	dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones);
+
+	return err ? err : (int)limit;
+}
+
+/**
+ * Resets the write pointer of the zones covering a byte range.
+ *
+ * A Zone Management Send (reset) is issued for each zone start address, from
+ * the zone containing ``offset`` up to, but not including, the zone containing
+ * ``offset + length``.
+ *
+ * NOTE: This function may get called before I/O engine initialization, that is,
+ * before ``_dev_open`` has been called and file-wrapping is setup. In such
+ * case it has to do ``_dev_open`` itself, and shut it down again once it is
+ * done resetting write pointer of zones.
+ *
+ * @param td Thread data; when ``td->io_ops_data`` is set, the already opened
+ * device for ``f`` is used, otherwise the device is opened temporarily
+ * @param f File identifying the device
+ * @param offset Byte offset of the start of the range
+ * @param length Length of the range in bytes
+ *
+ * @return 0 on success. On failure, a negative errno-style value for lock and
+ * open errors, the value returned by ``xnvme_znd_mgmt_send()``, or ``-EIO``
+ * when the command completes with a non-zero status.
+ */
+static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset,
+			       uint64_t length)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_fioe_data *xd = NULL;
+	struct xnvme_fioe_fwrap *fwrap = NULL;
+	struct xnvme_dev *dev = NULL;
+	const struct xnvme_geo *geo = NULL;
+	uint64_t first, last;
+	uint32_t ssw;
+	uint32_t nsid;
+	int err = 0, err_lock;
+
+	if (td->io_ops_data) {
+		/* Engine is initialized: reuse the handle wrapped for this file */
+		xd = td->io_ops_data;
+		fwrap = &xd->files[f->fileno];
+
+		assert(fwrap->dev);
+		assert(fwrap->geo);
+
+		dev = fwrap->dev;
+		geo = fwrap->geo;
+		ssw = fwrap->ssw;
+	} else {
+		/* Engine is not initialized: open a temporary handle under the global lock */
+		err = pthread_mutex_lock(&g_serialize);
+		if (err) {
+			log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err);
+			return -err;
+		}
+
+		dev = xnvme_dev_open(f->file_name, &opts);
+		if (!dev) {
+			/*
+			 * Record the failure before logging: leaving err at 0
+			 * made a failed open look like a successful reset.
+			 */
+			err = -errno;
+			log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n",
+				f->file_name, -err);
+			goto exit;
+		}
+		geo = xnvme_dev_get_geo(dev);
+		ssw = xnvme_dev_get_ssw(dev);
+	}
+
+	nsid = xnvme_dev_get_nsid(dev);
+
+	/* Sector addresses, rounded down to a multiple of the zone size (geo->nsect) */
+	first = ((offset >> ssw) / geo->nsect) * geo->nsect;
+	last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect;
+	dprint(FD_ZBD, "first: 0x%llx, last: 0x%llx\n", (unsigned long long)first,
+	       (unsigned long long)last);
+
+	for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) {
+		struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev);
+
+		/*
+		 * NOTE(review): a range running past the last zone is logged but
+		 * still reported as success; kept as-is, confirm this is intended.
+		 */
+		if (zslba >= (geo->nsect * geo->nzone)) {
+			log_err("ioeng->reset_wp(): out-of-bounds\n");
+			err = 0;
+			break;
+		}
+
+		err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false,
+					  XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL);
+		if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
+			err = err ? err : -EIO;
+			log_err("ioeng->reset_wp(): err(%d), sc(%d)\n", err, ctx.cpl.status.sc);
+			goto exit;
+		}
+	}
+
+exit:
+	/* Only tear down what this call set up itself */
+	if (!td->io_ops_data) {
+		xnvme_dev_close(dev);
+
+		err_lock = pthread_mutex_unlock(&g_serialize);
+		if (err_lock)
+			log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock);
+	}
+
+	return err;
+}
+
+/**
+ * Determines the size of the device behind ``f``, unless it is already known.
+ *
+ * The device is opened temporarily under the global ``g_serialize`` lock. On
+ * success ``f->real_file_size`` is set to the device's ``tbytes``, the size is
+ * marked as known and ``f->filetype`` is set to ``FIO_TYPE_BLOCK``.
+ *
+ * @param td Thread data, only used to derive the xNVMe open-options
+ * @param f File to update
+ *
+ * @return 0 on success or when the size was already known, otherwise a
+ * negative errno-style value.
+ */
+static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	int ret = 0, err;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	ret = pthread_mutex_lock(&g_serialize);
+	if (ret) {
+		log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret);
+		return -ret;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging, since logging may overwrite it */
+		ret = -errno;
+		log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, -ret);
+		goto exit;
+	}
+
+	f->real_file_size = xnvme_dev_get_geo(dev)->tbytes;
+	fio_file_set_size_known(f);
+	f->filetype = FIO_TYPE_BLOCK;
+
+exit:
+	/* dev is NULL here when the open failed */
+	xnvme_dev_close(dev);
+	err = pthread_mutex_unlock(&g_serialize);
+	if (err)
+		log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err);
+
+	return ret;
+}
+
+/*
+ * Operations table for the "xnvme" I/O engine; this is the object handed to
+ * register_ioengine() / unregister_ioengine() below.
+ */
+FIO_STATIC struct ioengine_ops ioengine = {
+	/* Identity, capabilities and engine-specific options */
+	.name = "xnvme",
+	.version = FIO_IOOPS_VERSION,
+	.flags = FIO_RAWIO | FIO_MEMALIGN | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_DISKLESSIO,
+	.option_struct_size = sizeof(struct xnvme_fioe_options),
+	.options = options,
+
+	/* Engine setup and teardown */
+	.init = xnvme_fioe_init,
+	.cleanup = xnvme_fioe_cleanup,
+
+	/* I/O buffer memory */
+	.iomem_alloc = xnvme_fioe_iomem_alloc,
+	.iomem_free = xnvme_fioe_iomem_free,
+
+	/* Per-io_u setup and teardown */
+	.io_u_init = xnvme_fioe_io_u_init,
+	.io_u_free = xnvme_fioe_io_u_free,
+
+	/* Submission and completion */
+	.queue = xnvme_fioe_queue,
+	.getevents = xnvme_fioe_getevents,
+	.event = xnvme_fioe_event,
+
+	/* File handling */
+	.open_file = xnvme_fioe_open,
+	.close_file = xnvme_fioe_close,
+	.get_file_size = xnvme_fioe_get_file_size,
+	.invalidate = xnvme_fioe_invalidate,
+
+	/* Zoned device support */
+	.get_zoned_model = xnvme_fioe_get_zoned_model,
+	.get_max_open_zones = xnvme_fioe_get_max_open_zones,
+	.report_zones = xnvme_fioe_report_zones,
+	.reset_wp = xnvme_fioe_reset_wp,
+};
+
+/*
+ * Hands the xnvme operations table to register_ioengine().
+ * NOTE(review): fio_init presumably makes this run at program/library load
+ * time -- confirm against fio's definition of the macro.
+ */
+static void fio_init fio_xnvme_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+/*
+ * Hands the xnvme operations table to unregister_ioengine(), undoing the
+ * registration above.
+ * NOTE(review): fio_exit presumably makes this run at exit/unload time --
+ * confirm against fio's definition of the macro.
+ */
+static void fio_exit fio_xnvme_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/examples/xnvme-compare.fio b/examples/xnvme-compare.fio
new file mode 100644
index 00000000..b89dfdf4
--- /dev/null
+++ b/examples/xnvme-compare.fio
@@ -0,0 +1,72 @@
+; Compare fio IO engines with a random-read workload using BS=4k at QD=1
+;
+; README
+;
+; This job-file is intended to be used as:
+;
+; # Use the built-in io_uring engine to get baseline numbers
+; fio examples/xnvme-compare.fio \
+;   --section=default \
+;   --ioengine=io_uring \
+;   --sqthread_poll=1 \
+;   --filename=/dev/nvme0n1
+;
+; # Use the xNVMe I/O engine with Linux backend and io_uring async. impl.
+; fio examples/xnvme-compare.fio \
+;   --section=default \
+;   --ioengine=xnvme \
+;   --sqthread_poll=1 \
+;   --xnvme_async=io_uring \
+;   --filename=/dev/nvme0n1
+;
+; # Use the xNVMe I/O engine with Linux backend and libaio async. impl.
+; fio examples/xnvme-compare.fio \
+;   --section=default \
+;   --ioengine=xnvme \
+;   --xnvme_async=libaio \
+;   --filename=/dev/nvme0n1
+;
+; # Use the xNVMe I/O engine with SPDK backend; note that you have to set the Namespace-id
+; fio examples/xnvme-compare.fio \
+;   --section=default \
+;   --ioengine=xnvme \
+;   --xnvme_dev_nsid=1 \
+;   --filename=0000\\:01\\:00.0
+;
+; NOTE: The URI encoded in the filename above, the ":" must be escaped.
+;
+; On the command-line using two "\\":
+;
+; --filename=0000\\:01\\:00.0
+;
+; Within a fio-script using a single "\":
+;
+; filename=0000\:01\:00.0
+;
+; NOTE: If you want to override the default bs, iodepth, and workload, then
+; invoke it as:
+;
+; FIO_BS="512" FIO_RW="verify" FIO_IODEPTH=16 fio examples/xnvme-compare.fio \
+;   --section=override
+;
+[global]
+rw=randread
+size=12G
+iodepth=1
+bs=4K
+direct=1
+thread=1
+time_based=1
+runtime=7
+ramp_time=3
+norandommap=1
+
+; Avoid accidentally creating device files; e.g. "/dev/nvme0n1", "/dev/nullb0"
+allow_file_create=0
+
+[default]
+
+[override]
+rw=${FIO_RW}
+iodepth=${FIO_IODEPTH}
+bs=${FIO_BS}
diff --git a/examples/xnvme-zoned.fio b/examples/xnvme-zoned.fio
new file mode 100644
index 00000000..1344f9a1
--- /dev/null
+++ b/examples/xnvme-zoned.fio
@@ -0,0 +1,87 @@
+; Running xNVMe/fio on a Zoned Device
+;
+; Writes 1GB at QD1 using 4K BS and verifies it.
+;
+; README
+;
+; This job-file is intended to be used as:
+;
+; # Use the built-in io_uring engine to get baseline numbers
+; fio examples/xnvme-zoned.fio \
+;   --section=default \
+;   --ioengine=io_uring \
+;   --sqthread_poll=1 \
+;   --filename=/dev/nvme0n1
+;
+; # Use the xNVMe I/O engine with Linux backend and io_uring async. impl.
+; fio examples/xnvme-zoned.fio \
+;   --section=default \
+;   --ioengine=xnvme \
+;   --sqthread_poll=1 \
+;   --xnvme_async=io_uring \
+;   --filename=/dev/nvme0n1
+;
+; # Use the xNVMe I/O engine with Linux backend and libaio async. impl.
+; fio examples/xnvme-zoned.fio \
+;   --section=default \
+;   --ioengine=xnvme \
+;   --xnvme_async=libaio \
+;   --filename=/dev/nvme0n1
+;
+; # Use the xNVMe I/O engine with SPDK backend; note that you have to set the Namespace-id
+; fio examples/xnvme-zoned.fio \
+;   --section=default \
+;   --ioengine=xnvme \
+;   --xnvme_dev_nsid=1 \
+;   --filename=0000\\:01\\:00.0
+;
+; NOTE: The URI encoded in the filename above, the ":" must be escaped.
+;
+; On the command-line using two "\\":
+;
+; --filename=0000\\:01\\:00.0
+;
+; Within a fio-script using a single "\":
+;
+; filename=0000\:01\:00.0
+;
+; NOTE: If you want to override the default bs, iodepth, and workload, then
+; invoke it as:
+;
+; FIO_BS="512" FIO_RW="verify" FIO_IODEPTH=16 fio examples/xnvme-zoned.fio \
+;   --section=override
+;
+; To reset all zones on the device to EMPTY state aka. wipe the entire device.
+;
+; # zoned mgmt-reset /dev/nvme0n2 --slba 0x0 --all
+;
+[global]
+zonemode=zbd
+rw=write
+size=1G
+iodepth=1
+bs=4K
+direct=1
+thread=1
+ramp_time=1
+norandommap=1
+verify=crc32c
+; Avoid accidentally creating device files; e.g. "/dev/nvme0n1", "/dev/nullb0"
+allow_file_create=0
+;
+; NOTE: If fio complains about zone-size, then run:
+;
+; # zoned info /dev/nvme0n1
+;
+; The command will provide the values you need, then in the fio-script define:
+;
+; zonesize=nsect * nbytes
+;
+;zonesize=
+
+[default]
+
+[override]
+rw=${FIO_RW}
+iodepth=${FIO_IODEPTH}
+bs=${FIO_BS}
diff --git a/fio.1 b/fio.1
index 609947dc..ded7bbfc 100644
--- a/fio.1
+++ b/fio.1
@@ -1965,6 +1965,12 @@ via kernel NFS.
 .TP
 .B exec
 Execute 3rd party tools. Could be used to perform monitoring during jobs runtime.
+.TP
+.B xnvme
+I/O engine using the xNVMe C API, for NVMe devices. The xnvme engine provides
+flexibility to access GNU/Linux Kernel NVMe driver via libaio, IOCTLs, io_uring,
+the SPDK NVMe driver, or your own custom NVMe driver. The xnvme engine includes
+engine specific options. (See \fIhttps://xnvme.io/\fR).
 .SS "I/O engine specific parameters"
 In addition, there are some parameters which are only valid when a specific
 \fBioengine\fR is in use. These are used identically to normal parameters,
@@ -2039,7 +2045,7 @@ release them when IO is done. If this option is set, the pages are pre-mapped
 before IO is started. This eliminates the need to map and release for each IO.
 This is more efficient, and reduces the IO latency as well.
 .TP
-.BI (io_uring)hipri
+.BI (io_uring,xnvme)hipri
 If this option is set, fio will attempt to use polled IO completions. Normal IO
 completions generate interrupts to signal the completion of IO, polled
 completions do not. Hence they are require active reaping by the application.
@@ -2052,7 +2058,7 @@ This avoids the overhead of managing file counts in the kernel, making the
 submission and completion part more lightweight. Required for the below
 sqthread_poll option.
 .TP
-.BI (io_uring)sqthread_poll
+.BI (io_uring,xnvme)sqthread_poll
 Normally fio will submit IO by issuing a system call to notify the kernel of
 available items in the SQ ring. If this option is set, the act of submitting IO
 will be done by a polling thread in the kernel. This frees up cycles for fio, at
@@ -2480,6 +2486,66 @@ Defines the time between the SIGTERM and SIGKILL signals. Default is 1 second.
 .TP
 .BI (exec)std_redirect\fR=\fbool
 If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
+.TP
+.BI (xnvme)xnvme_async\fR=\fPstr
+Select the xnvme async command interface. This can take these values.
+.RS
+.RS
+.TP
+.B emu
+This is the default; it emulates asynchronous I/O
+.TP
+.BI thrpool
+Use thread pool for Asynchronous I/O
+.TP
+.BI io_uring
+Use Linux io_uring/liburing for Asynchronous I/O
+.TP
+.BI libaio
+Use Linux aio for Asynchronous I/O
+.TP
+.BI posix
+Use POSIX aio for Asynchronous I/O
+.TP
+.BI nil
+Use nil-io; For introspective perf. evaluation
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_sync\fR=\fPstr
+Select the xnvme synchronous command interface. This can take these values.
+.RS
+.RS
+.TP
+.B nvme
+This is the default; it uses the Linux NVMe Driver ioctl() for synchronous I/O
+.TP
+.BI psync
+Use pread()/pwrite() for synchronous I/O
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_admin\fR=\fPstr
+Select the xnvme admin command interface. This can take these values.
+.RS
+.RS
+.TP
+.B nvme
+This is the default; it uses the Linux NVMe Driver ioctl() for admin commands
+.TP
+.BI block
+Use Linux Block Layer ioctl() and sysfs for admin commands
+.TP
+.BI file_as_ns
+Use file-stat to construct NVMe idfy responses
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_dev_nsid\fR=\fPint
+xnvme namespace identifier, for userspace NVMe driver.
+.TP
+.BI (xnvme)xnvme_iovec
+If this option is set, xnvme will use vectored read/write commands.
 .SS "I/O depth"
 .TP
 .BI iodepth \fR=\fPint
diff --git a/optgroup.h b/optgroup.h
index 3ac8f62a..dc73c8f3 100644
--- a/optgroup.h
+++ b/optgroup.h
@@ -72,6 +72,7 @@ enum opt_category_group {
 	__FIO_OPT_G_DFS,
 	__FIO_OPT_G_NFS,
 	__FIO_OPT_G_WINDOWSAIO,
+	__FIO_OPT_G_XNVME,
 
 	FIO_OPT_G_RATE		= (1ULL << __FIO_OPT_G_RATE),
 	FIO_OPT_G_ZONE		= (1ULL << __FIO_OPT_G_ZONE),
@@ -118,6 +119,7 @@ enum opt_category_group {
 	FIO_OPT_G_LIBCUFILE	= (1ULL << __FIO_OPT_G_LIBCUFILE),
 	FIO_OPT_G_DFS		= (1ULL << __FIO_OPT_G_DFS),
 	FIO_OPT_G_WINDOWSAIO	= (1ULL << __FIO_OPT_G_WINDOWSAIO),
+	FIO_OPT_G_XNVME         = (1ULL << __FIO_OPT_G_XNVME),
 };
 
 extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
diff --git a/options.c b/options.c
index 3b83573b..2b183c60 100644
--- a/options.c
+++ b/options.c
@@ -2144,6 +2144,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 			  { .ival = "nfs",
 			    .help = "NFS IO engine",
 			  },
+#endif
+#ifdef CONFIG_LIBXNVME
+			  { .ival = "xnvme",
+			    .help = "XNVME IO engine",
+			  },
 #endif
 		},
 	},

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-05-02 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-05-02 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6e594a2fa8388892dffb2ffc9b865689e2d67833:

  Merge branch 'global_dedup' of https://github.com/bardavid/fio (2022-04-29 16:30:50 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 12db6deb8b767ac89dd73e34dbc6f06905441e07:

  Merge branch 'patch-1' of https://github.com/ferdnyc/fio (2022-05-01 07:29:05 -0600)

----------------------------------------------------------------
Frank Dana (1):
      README: Update Fedora pkg URL

Jens Axboe (1):
      Merge branch 'patch-1' of https://github.com/ferdnyc/fio

 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/README.rst b/README.rst
index d566fae3..527f33ab 100644
--- a/README.rst
+++ b/README.rst
@@ -107,7 +107,7 @@ Ubuntu:
 Red Hat, Fedora, CentOS & Co:
 	Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
 	packages are part of the Fedora/EPEL repositories.
-	https://apps.fedoraproject.org/packages/fio .
+	https://packages.fedoraproject.org/pkgs/fio/ .
 
 Mandriva:
 	Mandriva has integrated fio into their package repository, so installing

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-04-30 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-04-30 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 5f2d43188c2d65674aaba6280e2a87107e5d7099:

  Merge branch 'fix/json/strdup_memory_leak' of https://github.com/dpronin/fio (2022-04-17 16:47:22 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6e594a2fa8388892dffb2ffc9b865689e2d67833:

  Merge branch 'global_dedup' of https://github.com/bardavid/fio (2022-04-29 16:30:50 -0600)

----------------------------------------------------------------
Bar David (2):
      Introducing support for generation of dedup buffers across jobs. The dedup buffers are spread evenly between the jobs that enabled the dedupe_global option
      adding an example for dedupe_global usage and DRR testing

Jens Axboe (1):
      Merge branch 'global_dedup' of https://github.com/bardavid/fio

 HOWTO.rst                  |  6 +++++
 backend.c                  |  5 ++++
 cconv.c                    |  2 ++
 dedupe.c                   | 46 +++++++++++++++++++++++++++++++++----
 dedupe.h                   |  3 ++-
 examples/dedupe-global.fio | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 fio.1                      |  9 ++++++++
 init.c                     |  2 +-
 options.c                  | 10 ++++++++
 server.h                   |  2 +-
 thread_options.h           |  3 +++
 11 files changed, 138 insertions(+), 7 deletions(-)
 create mode 100644 examples/dedupe-global.fio

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index a5fa432e..6a3e09f5 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -1749,6 +1749,12 @@ Buffers and memory
 	Note that size needs to be explicitly provided and only 1 file per
 	job is supported
 
+.. option:: dedupe_global=bool
+
+	This controls whether the deduplication buffers will be shared amongst
+	all jobs that have this option set. The buffers are spread evenly between
+	participating jobs.
+
 .. option:: invalidate=bool
 
 	Invalidate the buffer/page cache parts of the files to be used prior to
diff --git a/backend.c b/backend.c
index 317e4f6c..ffbb7e2a 100644
--- a/backend.c
+++ b/backend.c
@@ -2570,6 +2570,11 @@ int fio_backend(struct sk_out *sk_out)
 		setup_log(&agg_io_log[DDIR_TRIM], &p, "agg-trim_bw.log");
 	}
 
+	if (init_global_dedupe_working_set_seeds()) {
+		log_err("fio: failed to initialize global dedupe working set\n");
+		return 1;
+	}
+
 	startup_sem = fio_sem_init(FIO_SEM_LOCKED);
 	if (!sk_out)
 		is_local_backend = true;
diff --git a/cconv.c b/cconv.c
index 62d02e36..6c36afb7 100644
--- a/cconv.c
+++ b/cconv.c
@@ -305,6 +305,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
 	o->dedupe_mode = le32_to_cpu(top->dedupe_mode);
 	o->dedupe_working_set_percentage = le32_to_cpu(top->dedupe_working_set_percentage);
+	o->dedupe_global = le32_to_cpu(top->dedupe_global);
 	o->block_error_hist = le32_to_cpu(top->block_error_hist);
 	o->replay_align = le32_to_cpu(top->replay_align);
 	o->replay_scale = le32_to_cpu(top->replay_scale);
@@ -513,6 +514,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
 	top->dedupe_mode = cpu_to_le32(o->dedupe_mode);
 	top->dedupe_working_set_percentage = cpu_to_le32(o->dedupe_working_set_percentage);
+	top->dedupe_global = cpu_to_le32(o->dedupe_global);
 	top->block_error_hist = cpu_to_le32(o->block_error_hist);
 	top->replay_align = cpu_to_le32(o->replay_align);
 	top->replay_scale = cpu_to_le32(o->replay_scale);
diff --git a/dedupe.c b/dedupe.c
index fd116dfb..8214a786 100644
--- a/dedupe.c
+++ b/dedupe.c
@@ -1,13 +1,37 @@
 #include "fio.h"
 
-int init_dedupe_working_set_seeds(struct thread_data *td)
+/**
+ * initializes the global dedup workset.
+ * this needs to be called after all jobs' seeds
+ * have been initialized
+ */
+int init_global_dedupe_working_set_seeds(void)
 {
-	unsigned long long i, j, num_seed_advancements;
+	int i;
+	struct thread_data *td;
+
+	for_each_td(td, i) {
+		if (!td->o.dedupe_global)
+			continue;
+
+		if (init_dedupe_working_set_seeds(td, 1))
+			return 1;
+	}
+
+	return 0;
+}
+
+int init_dedupe_working_set_seeds(struct thread_data *td, bool global_dedup)
+{
+	int tindex;
+	struct thread_data *td_seed;
+	unsigned long long i, j, num_seed_advancements, pages_per_seed;
 	struct frand_state dedupe_working_set_state = {0};
 
 	if (!td->o.dedupe_percentage || !(td->o.dedupe_mode == DEDUPE_MODE_WORKING_SET))
 		return 0;
 
+	tindex = td->thread_number - 1;
 	num_seed_advancements = td->o.min_bs[DDIR_WRITE] /
 		min_not_zero(td->o.min_bs[DDIR_WRITE], (unsigned long long) td->o.compress_chunk);
 	/*
@@ -20,9 +44,11 @@ int init_dedupe_working_set_seeds(struct thread_data *td)
 		log_err("fio: could not allocate dedupe working set\n");
 		return 1;
 	}
+
 	frand_copy(&dedupe_working_set_state, &td->buf_state);
-	for (i = 0; i < td->num_unique_pages; i++) {
-		frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state);
+	frand_copy(&td->dedupe_working_set_states[0], &dedupe_working_set_state);
+	pages_per_seed = max(td->num_unique_pages / thread_number, 1ull);
+	for (i = 1; i < td->num_unique_pages; i++) {
 		/*
 		 * When compression is used the seed is advanced multiple times to
 		 * generate the buffer. We want to regenerate the same buffer when
@@ -30,6 +56,18 @@ int init_dedupe_working_set_seeds(struct thread_data *td)
 		 */
 		for (j = 0; j < num_seed_advancements; j++)
 			__get_next_seed(&dedupe_working_set_state);
+
+		/*
+		 * When global dedup is used, we rotate the seeds to allow
+		 * generating same buffers across different jobs. Deduplication buffers
+		 * are spread evenly across jobs participating in global dedupe
+		 */
+		if (global_dedup && i % pages_per_seed == 0) {
+			td_seed = tnumber_to_td(++tindex % thread_number);
+			frand_copy(&dedupe_working_set_state, &td_seed->buf_state);
+		}
+
+		frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state);
 	}
 
 	return 0;
diff --git a/dedupe.h b/dedupe.h
index d4c4dc37..bd1f9c0c 100644
--- a/dedupe.h
+++ b/dedupe.h
@@ -1,6 +1,7 @@
 #ifndef DEDUPE_H
 #define DEDUPE_H
 
-int init_dedupe_working_set_seeds(struct thread_data *td);
+int init_dedupe_working_set_seeds(struct thread_data *td, bool global_dedupe);
+int init_global_dedupe_working_set_seeds(void);
 
 #endif
diff --git a/examples/dedupe-global.fio b/examples/dedupe-global.fio
new file mode 100644
index 00000000..edaaad55
--- /dev/null
+++ b/examples/dedupe-global.fio
@@ -0,0 +1,57 @@
+# Writing to 2 files that share the duplicate blocks.
+# The dedupe working set is spread uniformly such that when
+# each of the jobs choose to perform a dedup operation they will
+# regenerate a buffer from the global space.
+# If you test the dedup ratio on either file by itself the result
+# is likely lower than if you test the ratio of the two files combined.
+#
+# Use `./t/fio-dedupe <file> -C 1 -c 1 -b 4096` to test the total
+# data reduction ratio.
+#
+#
+# Full example of test:
+# $ ./fio ./examples/dedupe-global.fio
+#
+# Checking ratio on a and b individually:
+# $ ./t/fio-dedupe a.0.0 -C 1 -c 1 -b 4096
+#
+# $ Extents=25600, Unique extents=16817 Duplicated extents=5735
+# $ De-dupe ratio: 1:0.52
+# $ De-dupe working set at least: 22.40%
+# $ Fio setting: dedupe_percentage=34
+# $ Unique capacity 33MB
+#
+# ./t/fio-dedupe b.0.0 -C 1 -c 1 -b 4096
+# $ Extents=25600, Unique extents=17009 Duplicated extents=5636
+# $ De-dupe ratio: 1:0.51
+# $ De-dupe working set at least: 22.02%
+# $ Fio setting: dedupe_percentage=34
+# $ Unique capacity 34MB
+#
+# Combining files:
+# $ cat a.0.0 > c.0.0
+# $ cat b.0.0 >> c.0.0
+#
+# Checking data reduction ratio on combined file:
+# $ ./t/fio-dedupe c.0.0 -C 1 -c 1 -b 4096
+# $ Extents=51200, Unique extents=25747 Duplicated extents=11028
+# $ De-dupe ratio: 1:0.99
+# $ De-dupe working set at least: 21.54%
+# $ Fio setting: dedupe_percentage=50
+# $ Unique capacity 51MB
+#
+[global]
+ioengine=libaio
+iodepth=256
+size=100m
+dedupe_mode=working_set
+dedupe_global=1
+dedupe_percentage=50
+blocksize=4k
+rw=write
+buffer_compress_percentage=50
+dedupe_working_set_percentage=50
+
+[a]
+
+[b]
diff --git a/fio.1 b/fio.1
index a2ec836f..609947dc 100644
--- a/fio.1
+++ b/fio.1
@@ -1553,6 +1553,15 @@ Note that \fBsize\fR needs to be explicitly provided and only 1 file
 per job is supported
 .RE
 .TP
+.BI dedupe_global \fR=\fPbool
+This controls whether the deduplication buffers will be shared amongst
+all jobs that have this option set. The buffers are spread evenly between
+participating jobs.
+.P
+.RS
+Note that \fBdedupe_mode\fR must be set to \fBworking_set\fR for this to work.
+Can be used in combination with compression.
+.RE
+.TP
 .BI invalidate \fR=\fPbool
 Invalidate the buffer/page cache parts of the files to be used prior to
 starting I/O if the platform and file type support it. Defaults to true.
diff --git a/init.c b/init.c
index 6f186051..f7d702f8 100644
--- a/init.c
+++ b/init.c
@@ -1541,7 +1541,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
 	if (fixup_options(td))
 		goto err;
 
-	if (init_dedupe_working_set_seeds(td))
+	if (!td->o.dedupe_global && init_dedupe_working_set_seeds(td, 0))
 		goto err;
 
 	/*
diff --git a/options.c b/options.c
index e06d9b66..3b83573b 100644
--- a/options.c
+++ b/options.c
@@ -4665,6 +4665,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BUF,
 	},
+	{
+		.name	= "dedupe_global",
+		.lname	= "Global deduplication",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, dedupe_global),
+		.help	= "Share deduplication buffers across jobs",
+		.def	= "0",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BUF,
+	},
 	{
 		.name	= "dedupe_mode",
 		.lname	= "Dedupe mode",
diff --git a/server.h b/server.h
index 0e62b6df..b0c5e2df 100644
--- a/server.h
+++ b/server.h
@@ -51,7 +51,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-	FIO_SERVER_VER			= 96,
+	FIO_SERVER_VER			= 97,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
diff --git a/thread_options.h b/thread_options.h
index 4162c42f..634070af 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -263,6 +263,7 @@ struct thread_options {
 	unsigned int dedupe_percentage;
 	unsigned int dedupe_mode;
 	unsigned int dedupe_working_set_percentage;
+	unsigned int dedupe_global;
 	unsigned int time_based;
 	unsigned int disable_lat;
 	unsigned int disable_clat;
@@ -578,6 +579,7 @@ struct thread_options_pack {
 	uint32_t dedupe_percentage;
 	uint32_t dedupe_mode;
 	uint32_t dedupe_working_set_percentage;
+	uint32_t dedupe_global;
 	uint32_t time_based;
 	uint32_t disable_lat;
 	uint32_t disable_clat;
@@ -596,6 +598,7 @@ struct thread_options_pack {
 	uint32_t lat_percentiles;
 	uint32_t slat_percentiles;
 	uint32_t percentile_precision;
+	uint32_t pad5;
 	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
 
 	uint8_t read_iolog_file[FIO_TOP_STR_MAX];

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-04-18 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-04-18 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit d684bb2839d1fa010fba1e64f9b0c16240d8bdae:

  Merge branch 'fix/remove-sudo-in-test-script' of https://github.com/dpronin/fio (2022-04-10 15:18:42 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 5f2d43188c2d65674aaba6280e2a87107e5d7099:

  Merge branch 'fix/json/strdup_memory_leak' of https://github.com/dpronin/fio (2022-04-17 16:47:22 -0600)

----------------------------------------------------------------
Denis Pronin (5):
      fixed possible and actual memory leaks
      fixed memory leak of not freed jobs_eta in several cases
      use flist_first_entry instead of flist_entry applied to 'next' list item
      fixed bunch of memory leaks in json constructor
      updated logging of iops1, iops2, ratio in FioJobTest_iops_rate

Jens Axboe (3):
      Merge branch 'fix/memory-leak' of https://github.com/dpronin/fio
      Merge branch 'fix/jobs_eta_memory_leak' of https://github.com/dpronin/fio
      Merge branch 'fix/json/strdup_memory_leak' of https://github.com/dpronin/fio

 backend.c          | 3 +++
 eta.c              | 7 ++++---
 ioengines.c        | 2 ++
 json.h             | 7 ++++++-
 server.c           | 2 +-
 stat.c             | 2 ++
 t/run-fio-tests.py | 3 ++-
 7 files changed, 20 insertions(+), 6 deletions(-)

---

Diff of recent changes:

diff --git a/backend.c b/backend.c
index 001b2b96..317e4f6c 100644
--- a/backend.c
+++ b/backend.c
@@ -2433,8 +2433,10 @@ reap:
 			} else {
 				pid_t pid;
 				struct fio_file **files;
+				void *eo;
 				dprint(FD_PROCESS, "will fork\n");
 				files = td->files;
+				eo = td->eo;
 				read_barrier();
 				pid = fork();
 				if (!pid) {
@@ -2447,6 +2449,7 @@ reap:
 				// freeing previously allocated memory for files
 				// this memory freed MUST NOT be shared between processes, only the pointer itself may be shared within TD
 				free(files);
+				free(eo);
 				free(fd);
 				fd = NULL;
 			}
diff --git a/eta.c b/eta.c
index 17970c78..6017ca31 100644
--- a/eta.c
+++ b/eta.c
@@ -3,6 +3,7 @@
  */
 #include <unistd.h>
 #include <string.h>
+#include <stdlib.h>
 #ifdef CONFIG_VALGRIND_DEV
 #include <valgrind/drd.h>
 #else
@@ -707,10 +708,10 @@ void print_thread_status(void)
 	size_t size;
 
 	je = get_jobs_eta(false, &size);
-	if (je)
+	if (je) {
 		display_thread_status(je);
-
-	free(je);
+		free(je);
+	}
 }
 
 void print_status_init(int thr_number)
diff --git a/ioengines.c b/ioengines.c
index d08a511a..68f307e5 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -223,6 +223,8 @@ struct ioengine_ops *load_ioengine(struct thread_data *td)
  */
 void free_ioengine(struct thread_data *td)
 {
+	assert(td != NULL && td->io_ops != NULL);
+
 	dprint(FD_IO, "free ioengine %s\n", td->io_ops->name);
 
 	if (td->eo && td->io_ops->options) {
diff --git a/json.h b/json.h
index d9824263..66bb06b1 100644
--- a/json.h
+++ b/json.h
@@ -81,8 +81,13 @@ static inline int json_object_add_value_string(struct json_object *obj,
 	struct json_value arg = {
 		.type = JSON_TYPE_STRING,
 	};
+	union {
+		const char *a;
+		char *b;
+	} string;
 
-	arg.string = strdup(val ? : "");
+	string.a = val ? val : "";
+	arg.string = string.b;
 	return json_object_add_value_type(obj, name, &arg);
 }
 
diff --git a/server.c b/server.c
index 914a8c74..4c71bd44 100644
--- a/server.c
+++ b/server.c
@@ -1323,7 +1323,7 @@ static int handle_xmits(struct sk_out *sk_out)
 	sk_unlock(sk_out);
 
 	while (!flist_empty(&list)) {
-		entry = flist_entry(list.next, struct sk_entry, list);
+		entry = flist_first_entry(&list, struct sk_entry, list);
 		flist_del(&entry->list);
 		ret += handle_sk_entry(sk_out, entry);
 	}
diff --git a/stat.c b/stat.c
index 356083e2..949af5ed 100644
--- a/stat.c
+++ b/stat.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include <string.h>
+#include <stdlib.h>
 #include <sys/time.h>
 #include <sys/stat.h>
 #include <math.h>
@@ -1698,6 +1699,7 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
 	if (je) {
 		json_object_add_value_int(root, "eta", je->eta_sec);
 		json_object_add_value_int(root, "elapsed", je->elapsed_sec);
+		free(je);
 	}
 
 	if (opt_list)
diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py
index 612e50ca..ecceb67e 100755
--- a/t/run-fio-tests.py
+++ b/t/run-fio-tests.py
@@ -546,9 +546,10 @@ class FioJobTest_iops_rate(FioJobTest):
             return
 
         iops1 = self.json_data['jobs'][0]['read']['iops']
+        logging.debug("Test %d: iops1: %f", self.testnum, iops1)
         iops2 = self.json_data['jobs'][1]['read']['iops']
+        logging.debug("Test %d: iops2: %f", self.testnum, iops2)
         ratio = iops2 / iops1
-        logging.debug("Test %d: iops1: %f", self.testnum, iops1)
         logging.debug("Test %d: ratio: %f", self.testnum, ratio)
 
         if iops1 < 950 or iops1 > 1050:

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-04-11 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-04-11 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6d01ac19170fadaf46a6db6b4cc347f1b389f422:

  iolog: Use %llu for 64-bit (2022-04-08 12:46:44 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to d684bb2839d1fa010fba1e64f9b0c16240d8bdae:

  Merge branch 'fix/remove-sudo-in-test-script' of https://github.com/dpronin/fio (2022-04-10 15:18:42 -0600)

----------------------------------------------------------------
Denis Pronin (1):
      actions-full-test.sh, removed sudo from the script

Jens Axboe (1):
      Merge branch 'fix/remove-sudo-in-test-script' of https://github.com/dpronin/fio

 ci/actions-full-test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

---

Diff of recent changes:

diff --git a/ci/actions-full-test.sh b/ci/actions-full-test.sh
index 91790664..8282002f 100755
--- a/ci/actions-full-test.sh
+++ b/ci/actions-full-test.sh
@@ -6,9 +6,9 @@ main() {
     echo "Running long running tests..."
     export PYTHONUNBUFFERED="TRUE"
     if [[ "${CI_TARGET_ARCH}" == "arm64" ]]; then
-        sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
+        python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
     else
-        sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
+        python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
     fi
     make -C doc html
 }

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-04-09 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-04-09 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit a3e48f483db27d20e02cbd81e3a8f18c6c5c50f5:

  Fio 3.30 (2022-04-06 17:10:00 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6d01ac19170fadaf46a6db6b4cc347f1b389f422:

  iolog: Use %llu for 64-bit (2022-04-08 12:46:44 -0600)

----------------------------------------------------------------
Jens Axboe (2):
      iolog: fix warning for 32-bit compilation
      iolog: Use %llu for 64-bit

Mohamad Gebai (3):
      iolog: add version 3 to support timestamp-based replay
      iolog: add iolog_write for version 3
      iolog: update man page for version 3

 HOWTO.rst  |  29 +++++++++++++++-
 blktrace.c |  17 ++--------
 fio.1      |  35 +++++++++++++++++++-
 fio.h      |   4 ++-
 iolog.c    | 109 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
 iolog.h    |   8 ++---
 6 files changed, 158 insertions(+), 44 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index 0978879c..a5fa432e 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -4398,7 +4398,9 @@ given in bytes. The `action` can be one of these:
 
 **wait**
 	   Wait for `offset` microseconds. Everything below 100 is discarded.
-	   The time is relative to the previous `wait` statement.
+	   The time is relative to the previous `wait` statement. Note that
+	   action `wait` is not allowed as of version 3, as the same behavior
+	   can be achieved using timestamps.
 **read**
 	   Read `length` bytes beginning from `offset`.
 **write**
@@ -4411,6 +4413,31 @@ given in bytes. The `action` can be one of these:
 	   Trim the given file from the given `offset` for `length` bytes.
 
 
+Trace file format v3
+~~~~~~~~~~~~~~~~~~~~
+
+The third version of the trace file format was added in fio version 3.31. It
+forces each action to have a timestamp associated with it.
+
+The first line of the trace file has to be::
+
+    fio version 3 iolog
+
+Following this can be lines in two different formats, which are described below.
+
+The file management format::
+
+    timestamp filename action
+
+The file I/O action format::
+
+    timestamp filename action offset length
+
+The `timestamp` is relative to the beginning of the run (i.e., it starts at 0).
+The `filename`, `action`, `offset` and `length` are identical to version 2,
+except that version 3 does not allow the `wait` action.
+
+
 I/O Replay - Merging Traces
 ---------------------------
 
diff --git a/blktrace.c b/blktrace.c
index ead60130..619121c7 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -313,25 +313,14 @@ static bool queue_trace(struct thread_data *td, struct blk_io_trace *t,
 			 unsigned long *ios, unsigned long long *bs,
 			 struct file_cache *cache)
 {
-	unsigned long long *last_ttime = &td->io_log_blktrace_last_ttime;
+	unsigned long long *last_ttime = &td->io_log_last_ttime;
 	unsigned long long delay = 0;
 
 	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
 		return false;
 
 	if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
-		if (!*last_ttime || td->o.no_stall || t->time < *last_ttime)
-			delay = 0;
-		else if (td->o.replay_time_scale == 100)
-			delay = t->time - *last_ttime;
-		else {
-			double tmp = t->time - *last_ttime;
-			double scale;
-
-			scale = (double) 100.0 / (double) td->o.replay_time_scale;
-			tmp *= scale;
-			delay = tmp;
-		}
+		delay = delay_since_ttime(td, t->time);
 		*last_ttime = t->time;
 	}
 
@@ -422,7 +411,7 @@ bool init_blktrace_read(struct thread_data *td, const char *filename, int need_s
 		goto err;
 	}
 	td->io_log_blktrace_swap = need_swap;
-	td->io_log_blktrace_last_ttime = 0;
+	td->io_log_last_ttime = 0;
 	td->o.size = 0;
 
 	free_release_files(td);
diff --git a/fio.1 b/fio.1
index 98410655..a2ec836f 100644
--- a/fio.1
+++ b/fio.1
@@ -4117,7 +4117,9 @@ given in bytes. The `action' can be one of these:
 .TP
 .B wait
 Wait for `offset' microseconds. Everything below 100 is discarded.
-The time is relative to the previous `wait' statement.
+The time is relative to the previous `wait' statement. Note that action `wait`
+is not allowed as of version 3, as the same behavior can be achieved using
+timestamps.
 .TP
 .B read
 Read `length' bytes beginning from `offset'.
@@ -4135,6 +4137,37 @@ Write `length' bytes beginning from `offset'.
 Trim the given file from the given `offset' for `length' bytes.
 .RE
 .RE
+.RE
+.TP
+.B Trace file format v3
+The third version of the trace file format was added in fio version 3.31. It
+forces each action to have a timestamp associated with it.
+.RS
+.P
+The first line of the trace file has to be:
+.RS
+.P
+"fio version 3 iolog"
+.RE
+.P
+Following this can be lines in two different formats, which are described below.
+.P
+.B
+The file management format:
+.RS
+timestamp filename action
+.P
+.RE
+.B
+The file I/O action format:
+.RS
+timestamp filename action offset length
+.P
+The `timestamp` is relative to the beginning of the run (i.e., it starts at 0).
+The `filename`, `action`, `offset` and `length` are identical to version 2,
+except that version 3 does not allow the `wait` action.
+.RE
+.RE
 .SH I/O REPLAY \- MERGING TRACES
 Colocation is a common practice used to get the most out of a machine.
 Knowing which workloads play nicely with each other and which ones don't is
diff --git a/fio.h b/fio.h
index 776fb51f..de7eca79 100644
--- a/fio.h
+++ b/fio.h
@@ -431,10 +431,12 @@ struct thread_data {
 	FILE *io_log_rfile;
 	unsigned int io_log_blktrace;
 	unsigned int io_log_blktrace_swap;
-	unsigned long long io_log_blktrace_last_ttime;
+	unsigned long long io_log_last_ttime;
+	struct timespec io_log_start_time;
 	unsigned int io_log_current;
 	unsigned int io_log_checkmark;
 	unsigned int io_log_highmark;
+	unsigned int io_log_version;
 	struct timespec io_log_highmark_time;
 
 	/*
diff --git a/iolog.c b/iolog.c
index 724ec1fe..37e799a1 100644
--- a/iolog.c
+++ b/iolog.c
@@ -31,6 +31,7 @@
 static int iolog_flush(struct io_log *log);
 
 static const char iolog_ver2[] = "fio version 2 iolog";
+static const char iolog_ver3[] = "fio version 3 iolog";
 
 void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
 {
@@ -40,18 +41,24 @@ void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
 
 void log_io_u(const struct thread_data *td, const struct io_u *io_u)
 {
+	struct timespec now;
+
 	if (!td->o.write_iolog_file)
 		return;
 
-	fprintf(td->iolog_f, "%s %s %llu %llu\n", io_u->file->file_name,
-						io_ddir_name(io_u->ddir),
-						io_u->offset, io_u->buflen);
+	fio_gettime(&now, NULL);
+	fprintf(td->iolog_f, "%llu %s %s %llu %llu\n",
+		(unsigned long long) utime_since_now(&td->io_log_start_time),
+		io_u->file->file_name, io_ddir_name(io_u->ddir), io_u->offset,
+		io_u->buflen);
+
 }
 
 void log_file(struct thread_data *td, struct fio_file *f,
 	      enum file_log_act what)
 {
 	const char *act[] = { "add", "open", "close" };
+	struct timespec now;
 
 	assert(what < 3);
 
@@ -65,7 +72,10 @@ void log_file(struct thread_data *td, struct fio_file *f,
 	if (!td->iolog_f)
 		return;
 
-	fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]);
+	fio_gettime(&now, NULL);
+	fprintf(td->iolog_f, "%llu %s %s\n",
+		(unsigned long long) utime_since_now(&td->io_log_start_time),
+		f->file_name, act[what]);
 }
 
 static void iolog_delay(struct thread_data *td, unsigned long delay)
@@ -116,6 +126,10 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo)
 
 	f = td->files[ipo->fileno];
 
+	if (ipo->delay)
+		iolog_delay(td, ipo->delay);
+	if (fio_fill_issue_time(td))
+		fio_gettime(&td->last_issue, NULL);
 	switch (ipo->file_action) {
 	case FIO_LOG_OPEN_FILE:
 		if (td->o.replay_redirect && fio_file_open(f)) {
@@ -134,6 +148,11 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo)
 	case FIO_LOG_UNLINK_FILE:
 		td_io_unlink_file(td, f);
 		break;
+	case FIO_LOG_ADD_FILE:
+		/*
+		 * Nothing to do
+		 */
+		break;
 	default:
 		log_err("fio: bad file action %d\n", ipo->file_action);
 		break;
@@ -142,7 +161,25 @@ static int ipo_special(struct thread_data *td, struct io_piece *ipo)
 	return 1;
 }
 
-static bool read_iolog2(struct thread_data *td);
+static bool read_iolog(struct thread_data *td);
+
+unsigned long long delay_since_ttime(const struct thread_data *td,
+	       unsigned long long time)
+{
+	double tmp;
+	double scale;
+	const unsigned long long *last_ttime = &td->io_log_last_ttime;
+
+	if (!*last_ttime || td->o.no_stall || time < *last_ttime)
+		return 0;
+	else if (td->o.replay_time_scale == 100)
+		return time - *last_ttime;
+
+
+	scale = (double) 100.0 / (double) td->o.replay_time_scale;
+	tmp = time - *last_ttime;
+	return tmp * scale;
+}
 
 int read_iolog_get(struct thread_data *td, struct io_u *io_u)
 {
@@ -158,7 +195,7 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
 					if (!read_blktrace(td))
 						return 1;
 				} else {
-					if (!read_iolog2(td))
+					if (!read_iolog(td))
 						return 1;
 				}
 			}
@@ -388,14 +425,20 @@ int64_t iolog_items_to_fetch(struct thread_data *td)
 	return items_to_fetch;
 }
 
+#define io_act(_td, _r) (((_td)->io_log_version == 3 && (r) == 5) || \
+					((_td)->io_log_version == 2 && (r) == 4))
+#define file_act(_td, _r) (((_td)->io_log_version == 3 && (r) == 3) || \
+					((_td)->io_log_version == 2 && (r) == 2))
+
 /*
- * Read version 2 iolog data. It is enhanced to include per-file logging,
+ * Read version 2 and 3 iolog data. It is enhanced to include per-file logging,
  * syncs, etc.
  */
-static bool read_iolog2(struct thread_data *td)
+static bool read_iolog(struct thread_data *td)
 {
 	unsigned long long offset;
 	unsigned int bytes;
+	unsigned long long delay = 0;
 	int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */
 	char *rfname, *fname, *act;
 	char *str, *p;
@@ -422,14 +465,28 @@ static bool read_iolog2(struct thread_data *td)
 	while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) {
 		struct io_piece *ipo;
 		int r;
+		unsigned long long ttime;
 
-		r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset,
-									&bytes);
+		if (td->io_log_version == 3) {
+			r = sscanf(p, "%llu %256s %256s %llu %u", &ttime, rfname, act,
+							&offset, &bytes);
+			delay = delay_since_ttime(td, ttime);
+			td->io_log_last_ttime = ttime;
+			/*
+			 * "wait" is not allowed with version 3
+			 */
+			if (!strcmp(act, "wait")) {
+				log_err("iolog: ignoring wait command with"
+					" version 3 for file %s\n", fname);
+				continue;
+			}
+		} else /* version 2 */
+			r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, &bytes);
 
 		if (td->o.replay_redirect)
 			fname = td->o.replay_redirect;
 
-		if (r == 4) {
+		if (io_act(td, r)) {
 			/*
 			 * Check action first
 			 */
@@ -451,7 +508,7 @@ static bool read_iolog2(struct thread_data *td)
 				continue;
 			}
 			fileno = get_fileno(td, fname);
-		} else if (r == 2) {
+		} else if (file_act(td, r)) {
 			rw = DDIR_INVAL;
 			if (!strcmp(act, "add")) {
 				if (td->o.replay_redirect &&
@@ -462,7 +519,6 @@ static bool read_iolog2(struct thread_data *td)
 					fileno = add_file(td, fname, td->subjob_number, 1);
 					file_action = FIO_LOG_ADD_FILE;
 				}
-				continue;
 			} else if (!strcmp(act, "open")) {
 				fileno = get_fileno(td, fname);
 				file_action = FIO_LOG_OPEN_FILE;
@@ -475,7 +531,7 @@ static bool read_iolog2(struct thread_data *td)
 				continue;
 			}
 		} else {
-			log_err("bad iolog2: %s\n", p);
+			log_err("bad iolog%d: %s\n", td->io_log_version, p);
 			continue;
 		}
 
@@ -506,6 +562,8 @@ static bool read_iolog2(struct thread_data *td)
 		ipo = calloc(1, sizeof(*ipo));
 		init_ipo(ipo);
 		ipo->ddir = rw;
+		if (td->io_log_version == 3)
+			ipo->delay = delay;
 		if (rw == DDIR_WAIT) {
 			ipo->delay = offset;
 		} else {
@@ -650,18 +708,22 @@ static bool init_iolog_read(struct thread_data *td, char *fname)
 	}
 
 	/*
-	 * version 2 of the iolog stores a specific string as the
+	 * versions 2 and 3 of the iolog store a specific string as the
 	 * first line, check for that
 	 */
-	if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) {
-		free_release_files(td);
-		td->io_log_rfile = f;
-		return read_iolog2(td);
+	if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2)))
+		td->io_log_version = 2;
+	else if (!strncmp(iolog_ver3, buffer, strlen(iolog_ver3)))
+		td->io_log_version = 3;
+	else {
+		log_err("fio: iolog version 1 is no longer supported\n");
+		fclose(f);
+		return false;
 	}
 
-	log_err("fio: iolog version 1 is no longer supported\n");
-	fclose(f);
-	return false;
+	free_release_files(td);
+	td->io_log_rfile = f;
+	return read_iolog(td);
 }
 
 /*
@@ -685,11 +747,12 @@ static bool init_iolog_write(struct thread_data *td)
 	td->iolog_f = f;
 	td->iolog_buf = malloc(8192);
 	setvbuf(f, td->iolog_buf, _IOFBF, 8192);
+	fio_gettime(&td->io_log_start_time, NULL);
 
 	/*
 	 * write our version line
 	 */
-	if (fprintf(f, "%s\n", iolog_ver2) < 0) {
+	if (fprintf(f, "%s\n", iolog_ver3) < 0) {
 		perror("iolog init\n");
 		return false;
 	}
diff --git a/iolog.h b/iolog.h
index a3986309..62cbd1b0 100644
--- a/iolog.h
+++ b/iolog.h
@@ -227,10 +227,8 @@ struct io_piece {
 	unsigned long len;
 	unsigned int flags;
 	enum fio_ddir ddir;
-	union {
-		unsigned long delay;
-		unsigned int file_action;
-	};
+	unsigned long delay;
+	unsigned int file_action;
 };
 
 /*
@@ -259,6 +257,8 @@ extern int iolog_compress_init(struct thread_data *, struct sk_out *);
 extern void iolog_compress_exit(struct thread_data *);
 extern size_t log_chunk_sizes(struct io_log *);
 extern int init_io_u_buffers(struct thread_data *);
+extern unsigned long long delay_since_ttime(const struct thread_data *,
+					     unsigned long long);
 
 #ifdef CONFIG_ZLIB
 extern int iolog_file_inflate(const char *);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-04-07 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-04-07 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 06bbdc1cb857a11e6d1b7c089126397daca904fe:

  smalloc: fix ptr address in redzone error message (2022-04-05 11:47:35 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a3e48f483db27d20e02cbd81e3a8f18c6c5c50f5:

  Fio 3.30 (2022-04-06 17:10:00 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Fio 3.30

 FIO-VERSION-GEN | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index 60f7bb21..fa64f50f 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.29
+DEF_VER=fio-3.30
 
 LF='
 '

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-04-06 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-04-06 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 87933e32e356b15b85c6d9775d5e840994080a4f:

  Rename 'fallthrough' attribute to 'fio_fallthrough' (2022-03-30 17:31:36 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 06bbdc1cb857a11e6d1b7c089126397daca904fe:

  smalloc: fix ptr address in redzone error message (2022-04-05 11:47:35 -0600)

----------------------------------------------------------------
Vincent Fu (1):
      smalloc: fix ptr address in redzone error message

 smalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

---

Diff of recent changes:

diff --git a/smalloc.c b/smalloc.c
index fa00f0ee..23243054 100644
--- a/smalloc.c
+++ b/smalloc.c
@@ -283,13 +283,13 @@ static void sfree_check_redzone(struct block_hdr *hdr)
 	if (hdr->prered != SMALLOC_PRE_RED) {
 		log_err("smalloc pre redzone destroyed!\n"
 			" ptr=%p, prered=%x, expected %x\n",
-				hdr, hdr->prered, SMALLOC_PRE_RED);
+				hdr+1, hdr->prered, SMALLOC_PRE_RED);
 		assert(0);
 	}
 	if (*postred != SMALLOC_POST_RED) {
 		log_err("smalloc post redzone destroyed!\n"
 			"  ptr=%p, postred=%x, expected %x\n",
-				hdr, *postred, SMALLOC_POST_RED);
+				hdr+1, *postred, SMALLOC_POST_RED);
 		assert(0);
 	}
 }

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-31 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-31 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 5e644771eb91e91dd0fa32f4b51f90c44853a2b1:

  Merge branch 'status-interval-finished-jobs' of https://github.com/mmkayPL/fio (2022-03-29 06:30:44 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 87933e32e356b15b85c6d9775d5e840994080a4f:

  Rename 'fallthrough' attribute to 'fio_fallthrough' (2022-03-30 17:31:36 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Rename 'fallthrough' attribute to 'fio_fallthrough'

 compiler/compiler.h |  4 ++--
 crc/murmur3.c       |  4 ++--
 engines/http.c      |  2 +-
 hash.h              | 24 ++++++++++++------------
 init.c              |  2 +-
 io_u.c              | 10 +++++-----
 lib/lfsr.c          | 32 ++++++++++++++++----------------
 parse.c             |  4 ++--
 t/lfsr-test.c       |  6 +++---
 9 files changed, 44 insertions(+), 44 deletions(-)

---

Diff of recent changes:

diff --git a/compiler/compiler.h b/compiler/compiler.h
index 3fd0822f..fefadeaa 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -72,9 +72,9 @@
 #endif
 
 #if __has_attribute(__fallthrough__)
-#define fallthrough	 __attribute__((__fallthrough__))
+#define fio_fallthrough	 __attribute__((__fallthrough__))
 #else
-#define fallthrough	do {} while (0)  /* fallthrough */
+#define fio_fallthrough	do {} while (0)  /* fallthrough */
 #endif
 
 #endif
diff --git a/crc/murmur3.c b/crc/murmur3.c
index ba408a9e..08660bc8 100644
--- a/crc/murmur3.c
+++ b/crc/murmur3.c
@@ -30,10 +30,10 @@ static uint32_t murmur3_tail(const uint8_t *data, const int nblocks,
 	switch (len & 3) {
 	case 3:
 		k1 ^= tail[2] << 16;
-		fallthrough;
+		fio_fallthrough;
 	case 2:
 		k1 ^= tail[1] << 8;
-		fallthrough;
+		fio_fallthrough;
 	case 1:
 		k1 ^= tail[0];
 		k1 *= c1;
diff --git a/engines/http.c b/engines/http.c
index 57d4967d..696febe1 100644
--- a/engines/http.c
+++ b/engines/http.c
@@ -297,7 +297,7 @@ static int _curl_trace(CURL *handle, curl_infotype type,
 	switch (type) {
 	case CURLINFO_TEXT:
 		fprintf(stderr, "== Info: %s", data);
-		fallthrough;
+		fio_fallthrough;
 	default:
 	case CURLINFO_SSL_DATA_OUT:
 	case CURLINFO_SSL_DATA_IN:
diff --git a/hash.h b/hash.h
index 2c04bc29..f7596a56 100644
--- a/hash.h
+++ b/hash.h
@@ -142,20 +142,20 @@ static inline uint32_t jhash(const void *key, uint32_t length, uint32_t initval)
 	/* Last block: affect all 32 bits of (c) */
 	/* All the case statements fall through */
 	switch (length) {
-	case 12: c += (uint32_t) k[11] << 24;	fallthrough;
-	case 11: c += (uint32_t) k[10] << 16;	fallthrough;
-	case 10: c += (uint32_t) k[9] << 8;	fallthrough;
-	case 9:  c += k[8];			fallthrough;
-	case 8:  b += (uint32_t) k[7] << 24;	fallthrough;
-	case 7:  b += (uint32_t) k[6] << 16;	fallthrough;
-	case 6:  b += (uint32_t) k[5] << 8;	fallthrough;
-	case 5:  b += k[4];			fallthrough;
-	case 4:  a += (uint32_t) k[3] << 24;	fallthrough;
-	case 3:  a += (uint32_t) k[2] << 16;	fallthrough;
-	case 2:  a += (uint32_t) k[1] << 8;	fallthrough;
+	case 12: c += (uint32_t) k[11] << 24;	fio_fallthrough;
+	case 11: c += (uint32_t) k[10] << 16;	fio_fallthrough;
+	case 10: c += (uint32_t) k[9] << 8;	fio_fallthrough;
+	case 9:  c += k[8];			fio_fallthrough;
+	case 8:  b += (uint32_t) k[7] << 24;	fio_fallthrough;
+	case 7:  b += (uint32_t) k[6] << 16;	fio_fallthrough;
+	case 6:  b += (uint32_t) k[5] << 8;	fio_fallthrough;
+	case 5:  b += k[4];			fio_fallthrough;
+	case 4:  a += (uint32_t) k[3] << 24;	fio_fallthrough;
+	case 3:  a += (uint32_t) k[2] << 16;	fio_fallthrough;
+	case 2:  a += (uint32_t) k[1] << 8;	fio_fallthrough;
 	case 1:  a += k[0];
 		 __jhash_final(a, b, c);
-		 fallthrough;
+		 fio_fallthrough;
 	case 0: /* Nothing left to add */
 		break;
 	}
diff --git a/init.c b/init.c
index b7f866e6..6f186051 100644
--- a/init.c
+++ b/init.c
@@ -2990,7 +2990,7 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
 			log_err("%s: unrecognized option '%s'\n", argv[0],
 							argv[optind - 1]);
 			show_closest_option(argv[optind - 1]);
-			fallthrough;
+			fio_fallthrough;
 		default:
 			do_exit++;
 			exit_val = 1;
diff --git a/io_u.c b/io_u.c
index 50197a4b..eec378dd 100644
--- a/io_u.c
+++ b/io_u.c
@@ -993,7 +993,7 @@ static void __io_u_mark_map(uint64_t *map, unsigned int nr)
 		break;
 	case 1 ... 4:
 		idx = 1;
-		fallthrough;
+		fio_fallthrough;
 	case 0:
 		break;
 	}
@@ -1035,7 +1035,7 @@ void io_u_mark_depth(struct thread_data *td, unsigned int nr)
 		break;
 	case 2 ... 3:
 		idx = 1;
-		fallthrough;
+		fio_fallthrough;
 	case 1:
 		break;
 	}
@@ -1076,7 +1076,7 @@ static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec)
 		break;
 	case 2 ... 3:
 		idx = 1;
-		fallthrough;
+		fio_fallthrough;
 	case 0 ... 1:
 		break;
 	}
@@ -1118,7 +1118,7 @@ static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec)
 		break;
 	case 2 ... 3:
 		idx = 1;
-		fallthrough;
+		fio_fallthrough;
 	case 0 ... 1:
 		break;
 	}
@@ -1166,7 +1166,7 @@ static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec)
 		break;
 	case 2 ... 3:
 		idx = 1;
-		fallthrough;
+		fio_fallthrough;
 	case 0 ... 1:
 		break;
 	}
diff --git a/lib/lfsr.c b/lib/lfsr.c
index a32e850a..e86086c4 100644
--- a/lib/lfsr.c
+++ b/lib/lfsr.c
@@ -88,37 +88,37 @@ static inline void __lfsr_next(struct fio_lfsr *fl, unsigned int spin)
 	 */
 	switch (spin) {
 		case 15: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case 14: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case 13: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case 12: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case 11: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case 10: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  9: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  8: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  7: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  6: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  5: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  4: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  3: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  2: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  1: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		case  0: __LFSR_NEXT(fl, fl->last_val);
-		fallthrough;
+		fio_fallthrough;
 		default: break;
 	}
 }
diff --git a/parse.c b/parse.c
index e0bee004..656a5025 100644
--- a/parse.c
+++ b/parse.c
@@ -601,7 +601,7 @@ static int __handle_option(const struct fio_option *o, const char *ptr,
 	}
 	case FIO_OPT_STR_VAL_TIME:
 		is_time = 1;
-		fallthrough;
+		fio_fallthrough;
 	case FIO_OPT_ULL:
 	case FIO_OPT_INT:
 	case FIO_OPT_STR_VAL:
@@ -980,7 +980,7 @@ store_option_value:
 	}
 	case FIO_OPT_DEPRECATED:
 		ret = 1;
-		fallthrough;
+		fio_fallthrough;
 	case FIO_OPT_SOFT_DEPRECATED:
 		log_info("Option %s is deprecated\n", o->name);
 		break;
diff --git a/t/lfsr-test.c b/t/lfsr-test.c
index 279e07f0..4b255e19 100644
--- a/t/lfsr-test.c
+++ b/t/lfsr-test.c
@@ -41,11 +41,11 @@ int main(int argc, char *argv[])
 	switch (argc) {
 		case 5: if (strncmp(argv[4], "verify", 7) == 0)
 				verify = 1;
-			fallthrough;
+			fio_fallthrough;
 		case 4: spin = atoi(argv[3]);
-			fallthrough;
+			fio_fallthrough;
 		case 3: seed = atol(argv[2]);
-			fallthrough;
+			fio_fallthrough;
 		case 2: numbers = strtol(argv[1], NULL, 16);
 				break;
 		default: usage();

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-30 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-30 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit a57d3fdce796f1bb516c74db95d016bb6db170c1:

  Merge branch 'master' of https://github.com/cccheng/fio (2022-03-28 06:43:56 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 5e644771eb91e91dd0fa32f4b51f90c44853a2b1:

  Merge branch 'status-interval-finished-jobs' of https://github.com/mmkayPL/fio (2022-03-29 06:30:44 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'status-interval-finished-jobs' of https://github.com/mmkayPL/fio

Kozlowski Mateusz (1):
      Handle finished jobs when using status-interval

 stat.c | 6 ++++++
 1 file changed, 6 insertions(+)

---

Diff of recent changes:

diff --git a/stat.c b/stat.c
index 7947edb4..356083e2 100644
--- a/stat.c
+++ b/stat.c
@@ -2731,6 +2731,9 @@ int __show_running_run_stats(void)
 	fio_gettime(&ts, NULL);
 
 	for_each_td(td, i) {
+		if (td->runstate >= TD_EXITED)
+			continue;
+
 		td->update_rusage = 1;
 		for_each_rw_ddir(ddir) {
 			td->ts.io_bytes[ddir] = td->io_bytes[ddir];
@@ -2759,6 +2762,9 @@ int __show_running_run_stats(void)
 	__show_run_stats();
 
 	for_each_td(td, i) {
+		if (td->runstate >= TD_EXITED)
+			continue;
+
 		if (td_read(td) && td->ts.io_bytes[DDIR_READ])
 			td->ts.runtime[DDIR_READ] -= rt[i];
 		if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-29 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-29 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit e3de2e7fe2889942d46699e72ac06b96eab09e27:

  Merge branch 'github-1372' of https://github.com/vincentkfu/fio (2022-03-24 10:11:34 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a57d3fdce796f1bb516c74db95d016bb6db170c1:

  Merge branch 'master' of https://github.com/cccheng/fio (2022-03-28 06:43:56 -0600)

----------------------------------------------------------------
Chung-Chiang Cheng (1):
      Fix compile error of GCC 4

Jens Axboe (1):
      Merge branch 'master' of https://github.com/cccheng/fio

 compiler/compiler.h | 1 +
 1 file changed, 1 insertion(+)

---

Diff of recent changes:

diff --git a/compiler/compiler.h b/compiler/compiler.h
index 44fa87b9..3fd0822f 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -67,6 +67,7 @@
 #endif
 
 #ifndef __has_attribute
+#define __has_attribute(x) __GCC4_has_attribute_##x
 #define __GCC4_has_attribute___fallthrough__	0
 #endif
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-25 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-25 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit c822572d68e326384ce179b9484de0e4abf3d514:

  engines/null: use correct -include (2022-03-20 09:31:20 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to e3de2e7fe2889942d46699e72ac06b96eab09e27:

  Merge branch 'github-1372' of https://github.com/vincentkfu/fio (2022-03-24 10:11:34 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'github-1372' of https://github.com/vincentkfu/fio

Vincent Fu (1):
      io_u: produce bad offsets for some time_based jobs

 io_u.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/io_u.c b/io_u.c
index 806ceb77..50197a4b 100644
--- a/io_u.c
+++ b/io_u.c
@@ -355,7 +355,7 @@ static int get_next_seq_offset(struct thread_data *td, struct fio_file *f,
 	 * and invalidate the cache, if we need to.
 	 */
 	if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) &&
-	    o->time_based) {
+	    o->time_based && o->nr_files == 1) {
 		f->last_pos[ddir] = f->file_offset;
 		loop_cache_invalidate(td, f);
 	}

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-21 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-21 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 1953e1adb5a28ed21370e85991d7f5c3cdc699f3:

  Merge branch 'flags-fix' of https://github.com/albertofaria/fio (2022-03-15 17:21:41 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to c822572d68e326384ce179b9484de0e4abf3d514:

  engines/null: use correct -include (2022-03-20 09:31:20 -0600)

----------------------------------------------------------------
Jens Axboe (3):
      engines/null: update external engine compilation
      Merge branch 'master' of https://github.com/jnoc/fio
      engines/null: use correct -include

Jonathon Carter (1):
      Added citation.cff for easy APA/BibTeX citation directly from the Github repository

 CITATION.cff   | 11 +++++++++++
 engines/null.c |  7 ++++---
 2 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 CITATION.cff

---

Diff of recent changes:

diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 00000000..3df315e5
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,11 @@
+cff-version: 1.2.0
+preferred-citation:
+  type: software
+  authors:
+  - family-names: "Axboe"
+    given-names: "Jens"
+    email: axboe@kernel.dk
+  title: "Flexible I/O Tester"
+  year: 2022
+  url: "https://github.com/axboe/fio"
+licence: GNU GPL v2.0
diff --git a/engines/null.c b/engines/null.c
index 4cc0102b..8dcd1b21 100644
--- a/engines/null.c
+++ b/engines/null.c
@@ -6,7 +6,8 @@
  *
  * It also can act as external C++ engine - compiled with:
  *
- * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c -DFIO_EXTERNAL_ENGINE
+ * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c \
+ *	-include ../config-host.h -DFIO_EXTERNAL_ENGINE
  *
  * to test it execute:
  *
@@ -201,7 +202,7 @@ struct NullData {
 		return null_commit(td, impl_);
 	}
 
-	int fio_null_queue(struct thread_data *td, struct io_u *io_u)
+	fio_q_status fio_null_queue(struct thread_data *td, struct io_u *io_u)
 	{
 		return null_queue(td, impl_, io_u);
 	}
@@ -233,7 +234,7 @@ static int fio_null_commit(struct thread_data *td)
 	return NullData::get(td)->fio_null_commit(td);
 }
 
-static int fio_null_queue(struct thread_data *td, struct io_u *io_u)
+static fio_q_status fio_null_queue(struct thread_data *td, struct io_u *io_u)
 {
 	return NullData::get(td)->fio_null_queue(td, io_u);
 }

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-16 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-16 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 1fe261a24794f60bf374cd1852e09ec56997a20a:

  t/dedupe: ensure that 'ret' is initialized (2022-03-11 06:15:53 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 1953e1adb5a28ed21370e85991d7f5c3cdc699f3:

  Merge branch 'flags-fix' of https://github.com/albertofaria/fio (2022-03-15 17:21:41 -0600)

----------------------------------------------------------------
Alberto Faria (1):
      Properly encode engine flags in thread_data::flags

Jens Axboe (1):
      Merge branch 'flags-fix' of https://github.com/albertofaria/fio

 fio.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

---

Diff of recent changes:

diff --git a/fio.h b/fio.h
index c314f0a8..776fb51f 100644
--- a/fio.h
+++ b/fio.h
@@ -184,7 +184,7 @@ struct zone_split_index {
  */
 struct thread_data {
 	struct flist_head opt_list;
-	unsigned long flags;
+	unsigned long long flags;
 	struct thread_options o;
 	void *eo;
 	pthread_t thread;
@@ -681,12 +681,12 @@ enum {
 };
 
 #define TD_ENG_FLAG_SHIFT	18
-#define TD_ENG_FLAG_MASK	((1U << 18) - 1)
+#define TD_ENG_FLAG_MASK	((1ULL << 18) - 1)
 
 static inline void td_set_ioengine_flags(struct thread_data *td)
 {
 	td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) |
-		    (td->io_ops->flags << TD_ENG_FLAG_SHIFT);
+		    ((unsigned long long)td->io_ops->flags << TD_ENG_FLAG_SHIFT);
 }
 
 static inline bool td_ioengine_flagged(struct thread_data *td,

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-12 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-12 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 16b1e24562347d371d6d62e0bb9a03ad4e2a8a96:

  t/dedupe: handle errors more gracefully (2022-03-11 05:09:20 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 1fe261a24794f60bf374cd1852e09ec56997a20a:

  t/dedupe: ensure that 'ret' is initialized (2022-03-11 06:15:53 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      t/dedupe: ensure that 'ret' is initialized

 t/dedupe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/t/dedupe.c b/t/dedupe.c
index 561aa08d..d21e96f4 100644
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -280,7 +280,7 @@ static int insert_chunks(struct item *items, unsigned int nitems,
 			 uint64_t *ndupes, uint64_t *unique_capacity,
 			 struct zlib_ctrl *zc)
 {
-	int i, ret;
+	int i, ret = 0;
 
 	fio_sem_down(rb_lock);
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-11 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-11 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit df0ab55ff9e28f4b85c199e207aec904f8a76440:

  Merge branch 'master' of https://github.com/dpronin/fio (2022-03-09 06:20:31 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 16b1e24562347d371d6d62e0bb9a03ad4e2a8a96:

  t/dedupe: handle errors more gracefully (2022-03-11 05:09:20 -0700)

----------------------------------------------------------------
Denis Pronin (4):
      configure script refactoring
      improvements in dup_files function
      fixed memory leak detected by ASAN
      ASAN enabling when configuring

Jens Axboe (7):
      Merge branch 'master' of https://github.com/dpronin/fio
      Merge branch 'refactoring/configure' of https://github.com/dpronin/fio
      Merge branch 'improvement/prevent-sigsegv-when-dup-files' of https://github.com/dpronin/fio
      Merge branch 'improvement/enable-asan' of https://github.com/dpronin/fio
      t/io_uring: only enable sync if we have preadv2
      Merge branch 'fuzz-cleanup' of https://github.com/vincentkfu/fio
      t/dedupe: handle errors more gracefully

Vincent Fu (1):
      fuzz: avoid building t/fuzz/parse_ini by default

 Makefile     |  8 +++++++-
 backend.c    |  6 ++++++
 configure    | 14 ++++++++++----
 filesetup.c  |  3 ++-
 t/dedupe.c   | 57 +++++++++++++++++++++++++++++++++++----------------------
 t/io_uring.c | 13 +++++++++++++
 6 files changed, 73 insertions(+), 28 deletions(-)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index 6ffd3d13..e670c1f2 100644
--- a/Makefile
+++ b/Makefile
@@ -385,14 +385,16 @@ T_MEMLOCK_PROGS = t/memlock
 T_TT_OBJS = t/time-test.o
 T_TT_PROGS = t/time-test
 
+ifneq (,$(findstring -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION,$(CFLAGS)))
 T_FUZZ_OBJS = t/fuzz/fuzz_parseini.o
 T_FUZZ_OBJS += $(OBJS)
 ifdef CONFIG_ARITHMETIC
 T_FUZZ_OBJS += lex.yy.o y.tab.o
 endif
+# For proper fio code teardown CFLAGS needs to include -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
 # in case there is no fuzz driver defined by environment variable LIB_FUZZING_ENGINE, use a simple one
 # For instance, with compiler clang, address sanitizer and libFuzzer as a fuzzing engine, you should define
-# export CFLAGS="-fsanitize=address,fuzzer-no-link"
+# export CFLAGS="-fsanitize=address,fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION"
 # export LIB_FUZZING_ENGINE="-fsanitize=address"
 # export CC=clang
 # before running configure && make
@@ -401,6 +403,10 @@ ifndef LIB_FUZZING_ENGINE
 T_FUZZ_OBJS += t/fuzz/onefile.o
 endif
 T_FUZZ_PROGS = t/fuzz/fuzz_parseini
+else	# CFLAGS includes -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+T_FUZZ_OBJS =
+T_FUZZ_PROGS =
+endif
 
 T_OBJS = $(T_SMALLOC_OBJS)
 T_OBJS += $(T_IEEE_OBJS)
diff --git a/backend.c b/backend.c
index cd7f4e5f..001b2b96 100644
--- a/backend.c
+++ b/backend.c
@@ -2432,7 +2432,10 @@ reap:
 							strerror(ret));
 			} else {
 				pid_t pid;
+				struct fio_file **files;
 				dprint(FD_PROCESS, "will fork\n");
+				files = td->files;
+				read_barrier();
 				pid = fork();
 				if (!pid) {
 					int ret;
@@ -2441,6 +2444,9 @@ reap:
 					_exit(ret);
 				} else if (i == fio_debug_jobno)
 					*fio_debug_jobp = pid;
+				// freeing previously allocated memory for files
+				// this memory freed MUST NOT be shared between processes, only the pointer itself may be shared within TD
+				free(files);
 				free(fd);
 				fd = NULL;
 			}
diff --git a/configure b/configure
index 67e5d535..d327d2ca 100755
--- a/configure
+++ b/configure
@@ -248,6 +248,8 @@ for opt do
   ;;
   --disable-dfs) dfs="no"
   ;;
+  --enable-asan) asan="yes"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -290,9 +292,10 @@ if test "$show_help" = "yes" ; then
   echo "--enable-libiscsi       Enable iscsi support"
   echo "--enable-libnbd         Enable libnbd (NBD engine) support"
   echo "--disable-libzbc        Disable libzbc even if found"
-  echo "--disable-tcmalloc	Disable tcmalloc support"
-  echo "--dynamic-libengines	Lib-based ioengines as dynamic libraries"
-  echo "--disable-dfs		Disable DAOS File System support even if found"
+  echo "--disable-tcmalloc      Disable tcmalloc support"
+  echo "--dynamic-libengines    Lib-based ioengines as dynamic libraries"
+  echo "--disable-dfs           Disable DAOS File System support even if found"
+  echo "--enable-asan           Enable address sanitizer"
   exit $exit_val
 fi
 
@@ -3196,7 +3199,10 @@ fi
 if test "$fcntl_sync" = "yes" ; then
   output_sym "CONFIG_FCNTL_SYNC"
 fi
-
+if test "$asan" = "yes"; then
+  CFLAGS="$CFLAGS -fsanitize=address"
+  LDFLAGS="$LDFLAGS -fsanitize=address"
+fi
 print_config "Lib-based ioengines dynamic" "$dynamic_engines"
 cat > $TMPC << EOF
 int main(int argc, char **argv)
diff --git a/filesetup.c b/filesetup.c
index 7c32d0af..ab6c488b 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -2031,11 +2031,12 @@ void dup_files(struct thread_data *td, struct thread_data *org)
 	if (!org->files)
 		return;
 
-	td->files = malloc(org->files_index * sizeof(f));
+	td->files = calloc(org->files_index, sizeof(f));
 
 	if (td->o.file_lock_mode != FILE_LOCK_NONE)
 		td->file_locks = malloc(org->files_index);
 
+	assert(org->files_index >= org->o.nr_files);
 	for_each_file(org, f, i) {
 		struct fio_file *__f;
 
diff --git a/t/dedupe.c b/t/dedupe.c
index 109ea1af..561aa08d 100644
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -143,15 +143,15 @@ static int read_block(int fd, void *buf, off_t offset)
 	return __read_block(fd, buf, offset, blocksize);
 }
 
-static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
-				    struct zlib_ctrl *zc)
+static int account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
+				   struct zlib_ctrl *zc)
 {
 	z_stream *stream = &zc->stream;
 	unsigned int compressed_len;
 	int ret;
 
 	if (read_block(file.fd, zc->buf_in, offset))
-		return;
+		return 1;
 
 	stream->next_in = zc->buf_in;
 	stream->avail_in = blocksize;
@@ -159,7 +159,8 @@ static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
 	stream->next_out = zc->buf_out;
 
 	ret = deflate(stream, Z_FINISH);
-	assert(ret != Z_STREAM_ERROR);
+	if (ret == Z_STREAM_ERROR)
+		return 1;
 	compressed_len = blocksize - stream->avail_out;
 
 	if (dump_output)
@@ -169,6 +170,7 @@ static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
 
 	*unique_capacity += compressed_len;
 	deflateReset(stream);
+	return 0;
 }
 
 static void add_item(struct chunk *c, struct item *i)
@@ -225,12 +227,12 @@ static struct chunk *alloc_chunk(void)
 	return c;
 }
 
-static void insert_chunk(struct item *i, uint64_t *unique_capacity,
-			 struct zlib_ctrl *zc)
+static int insert_chunk(struct item *i, uint64_t *unique_capacity,
+			struct zlib_ctrl *zc)
 {
 	struct fio_rb_node **p, *parent;
 	struct chunk *c;
-	int diff;
+	int ret, diff;
 
 	p = &rb_root.rb_node;
 	parent = NULL;
@@ -244,8 +246,6 @@ static void insert_chunk(struct item *i, uint64_t *unique_capacity,
 		} else if (diff > 0) {
 			p = &(*p)->rb_right;
 		} else {
-			int ret;
-
 			if (!collision_check)
 				goto add;
 
@@ -266,17 +266,21 @@ static void insert_chunk(struct item *i, uint64_t *unique_capacity,
 	memcpy(c->hash, i->hash, sizeof(i->hash));
 	rb_link_node(&c->rb_node, parent, p);
 	rb_insert_color(&c->rb_node, &rb_root);
-	if (compression)
-		account_unique_capacity(i->offset, unique_capacity, zc);
+	if (compression) {
+		ret = account_unique_capacity(i->offset, unique_capacity, zc);
+		if (ret)
+			return ret;
+	}
 add:
 	add_item(c, i);
+	return 0;
 }
 
-static void insert_chunks(struct item *items, unsigned int nitems,
-			  uint64_t *ndupes, uint64_t *unique_capacity,
-			  struct zlib_ctrl *zc)
+static int insert_chunks(struct item *items, unsigned int nitems,
+			 uint64_t *ndupes, uint64_t *unique_capacity,
+			 struct zlib_ctrl *zc)
 {
-	int i;
+	int i, ret;
 
 	fio_sem_down(rb_lock);
 
@@ -288,11 +292,15 @@ static void insert_chunks(struct item *items, unsigned int nitems,
 			s = sizeof(items[i].hash) / sizeof(uint32_t);
 			r = bloom_set(bloom, items[i].hash, s);
 			*ndupes += r;
-		} else
-			insert_chunk(&items[i], unique_capacity, zc);
+		} else {
+			ret = insert_chunk(&items[i], unique_capacity, zc);
+			if (ret)
+				break;
+		}
 	}
 
 	fio_sem_up(rb_lock);
+	return ret;
 }
 
 static void crc_buf(void *buf, uint32_t *hash)
@@ -320,6 +328,7 @@ static int do_work(struct worker_thread *thread, void *buf)
 	uint64_t ndupes = 0;
 	uint64_t unique_capacity = 0;
 	struct item *items;
+	int ret;
 
 	offset = thread->cur_offset;
 
@@ -339,13 +348,17 @@ static int do_work(struct worker_thread *thread, void *buf)
 		nitems++;
 	}
 
-	insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
+	ret = insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
 
 	free(items);
-	thread->items += nitems;
-	thread->dupes += ndupes;
-	thread->unique_capacity += unique_capacity;
-	return 0;
+	if (!ret) {
+		thread->items += nitems;
+		thread->dupes += ndupes;
+		thread->unique_capacity += unique_capacity;
+		return 0;
+	}
+
+	return ret;
 }
 
 static void thread_init_zlib_control(struct worker_thread *thread)
diff --git a/t/io_uring.c b/t/io_uring.c
index 157eea9e..10035912 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -939,6 +939,7 @@ submit:
 	return NULL;
 }
 
+#ifdef CONFIG_PWRITEV2
 static void *submitter_sync_fn(void *data)
 {
 	struct submitter *s = data;
@@ -1004,6 +1005,13 @@ static void *submitter_sync_fn(void *data)
 	finish = 1;
 	return NULL;
 }
+#else
+static void *submitter_sync_fn(void *data)
+{
+	finish = 1;
+	return NULL;
+}
+#endif
 
 static struct submitter *get_submitter(int offset)
 {
@@ -1346,7 +1354,12 @@ int main(int argc, char *argv[])
 			register_ring = !!atoi(optarg);
 			break;
 		case 'S':
+#ifdef CONFIG_PWRITEV2
 			use_sync = !!atoi(optarg);
+#else
+			fprintf(stderr, "preadv2 not supported\n");
+			exit(1);
+#endif
 			break;
 		case 'h':
 		case '?':

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-10 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-10 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit a24ef2702e2c1b948df37080eb3f18cca60d414b:

  Merge branch 'master' of https://github.com/dpronin/fio (2022-03-08 16:42:37 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to df0ab55ff9e28f4b85c199e207aec904f8a76440:

  Merge branch 'master' of https://github.com/dpronin/fio (2022-03-09 06:20:31 -0700)

----------------------------------------------------------------
Denis Pronin (3):
      - freeing job_sections array of strings upon freeing each its item in init.c
      - fixed memory leak, which is happening when parsing options, claimed by ASAN
      - fixed memory leak in parent process detected by ASAN when forking and not freeing memory in the parent process allocated for fork_data

Jens Axboe (3):
      Merge branch 'fix/asan-memleak' of https://github.com/dpronin/fio
      Merge branch 'fix/asan-memleak-forkdata' of https://github.com/dpronin/fio
      Merge branch 'master' of https://github.com/dpronin/fio

 backend.c | 2 ++
 init.c    | 4 ++++
 parse.c   | 2 ++
 3 files changed, 8 insertions(+)

---

Diff of recent changes:

diff --git a/backend.c b/backend.c
index a21dfef6..cd7f4e5f 100644
--- a/backend.c
+++ b/backend.c
@@ -2441,6 +2441,8 @@ reap:
 					_exit(ret);
 				} else if (i == fio_debug_jobno)
 					*fio_debug_jobp = pid;
+				free(fd);
+				fd = NULL;
 			}
 			dprint(FD_MUTEX, "wait on startup_sem\n");
 			if (fio_sem_down_timeout(startup_sem, 10000)) {
diff --git a/init.c b/init.c
index 81c30f8c..b7f866e6 100644
--- a/init.c
+++ b/init.c
@@ -2185,6 +2185,10 @@ static int __parse_jobs_ini(struct thread_data *td,
 		i++;
 	}
 
+	free(job_sections);
+	job_sections = NULL;
+	nr_job_sections = 0;
+
 	free(opts);
 out:
 	free(string);
diff --git a/parse.c b/parse.c
index d086ee48..e0bee004 100644
--- a/parse.c
+++ b/parse.c
@@ -817,6 +817,8 @@ store_option_value:
 
 		if (o->off1) {
 			cp = td_var(data, o, o->off1);
+			if (*cp)
+				free(*cp);
 			*cp = strdup(ptr);
 			if (strlen(ptr) > o->maxlen - 1) {
 				log_err("value exceeds max length of %d\n",

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-09 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-09 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit dc44588f2e445edd7a4ca7dc9bf05bb3b4b2789e:

  Makefile: get rid of fortify source (2022-03-07 09:16:39 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a24ef2702e2c1b948df37080eb3f18cca60d414b:

  Merge branch 'master' of https://github.com/dpronin/fio (2022-03-08 16:42:37 -0700)

----------------------------------------------------------------
Denis Pronin (1):
      - fixed typo in configure script

Jens Axboe (1):
      Merge branch 'master' of https://github.com/dpronin/fio

 configure | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index be4605f9..67e5d535 100755
--- a/configure
+++ b/configure
@@ -2098,7 +2098,7 @@ if test "$libhdfs" = "yes" ; then
     hdfs_conf_error=1
   fi
   if test "$FIO_LIBHDFS_INCLUDE" = "" ; then
-    echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs inlude path"
+    echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs include path"
     hdfs_conf_error=1
   fi
   if test "$FIO_LIBHDFS_LIB" = "" ; then

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-03-08 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-03-08 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit c3773c171dffb79f771d213d94249cefc4b9b6de:

  windowsaio: open file for write if we have syncs (2022-02-26 10:43:20 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to dc44588f2e445edd7a4ca7dc9bf05bb3b4b2789e:

  Makefile: get rid of fortify source (2022-03-07 09:16:39 -0700)

----------------------------------------------------------------
Jens Axboe (7):
      t/io_uring: change map buffers registration opcode
      t/io_uring: change fatal map buffers condition with multiple files
      io_uring.h: sync with 5.18 kernel bits
      t/io_uring: add support for registering the ring fd
      t/io_uring: support using preadv2
      t/io_uring: add missing CR
      Makefile: get rid of fortify source

 Makefile            |   2 +-
 os/linux/io_uring.h |  17 ++++--
 t/io_uring.c        | 148 ++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 147 insertions(+), 20 deletions(-)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index 0ab4f82c..6ffd3d13 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,7 @@ PROGS	= fio
 SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/hist/fio-histo-log-pctiles.py tools/fio_jsonplus_clat2csv)
 
 ifndef CONFIG_FIO_NO_OPT
-  FIO_CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2
+  FIO_CFLAGS += -O3
 endif
 ifdef CONFIG_BUILD_NATIVE
   FIO_CFLAGS += -march=native
diff --git a/os/linux/io_uring.h b/os/linux/io_uring.h
index c45b5e9a..42b2fe84 100644
--- a/os/linux/io_uring.h
+++ b/os/linux/io_uring.h
@@ -70,6 +70,7 @@ enum {
 	IOSQE_IO_HARDLINK_BIT,
 	IOSQE_ASYNC_BIT,
 	IOSQE_BUFFER_SELECT_BIT,
+	IOSQE_CQE_SKIP_SUCCESS_BIT,
 };
 
 /*
@@ -87,6 +88,8 @@ enum {
 #define IOSQE_ASYNC		(1U << IOSQE_ASYNC_BIT)
 /* select buffer from sqe->buf_group */
 #define IOSQE_BUFFER_SELECT	(1U << IOSQE_BUFFER_SELECT_BIT)
+/* don't post CQE if request succeeded */
+#define IOSQE_CQE_SKIP_SUCCESS	(1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
 
 /*
  * io_uring_setup() flags
@@ -254,10 +257,11 @@ struct io_cqring_offsets {
 /*
  * io_uring_enter(2) flags
  */
-#define IORING_ENTER_GETEVENTS	(1U << 0)
-#define IORING_ENTER_SQ_WAKEUP	(1U << 1)
-#define IORING_ENTER_SQ_WAIT	(1U << 2)
-#define IORING_ENTER_EXT_ARG	(1U << 3)
+#define IORING_ENTER_GETEVENTS		(1U << 0)
+#define IORING_ENTER_SQ_WAKEUP		(1U << 1)
+#define IORING_ENTER_SQ_WAIT		(1U << 2)
+#define IORING_ENTER_EXT_ARG		(1U << 3)
+#define IORING_ENTER_REGISTERED_RING	(1U << 4)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -289,6 +293,7 @@ struct io_uring_params {
 #define IORING_FEAT_EXT_ARG		(1U << 8)
 #define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
 #define IORING_FEAT_RSRC_TAGS		(1U << 10)
+#define IORING_FEAT_CQE_SKIP		(1U << 11)
 
 /*
  * io_uring_register(2) opcodes and arguments
@@ -321,6 +326,10 @@ enum {
 	/* set/get max number of io-wq workers */
 	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* register/unregister io_uring fd with the ring */
+	IORING_REGISTER_RING_FDS		= 20,
+	IORING_UNREGISTER_RING_FDS		= 21,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
diff --git a/t/io_uring.c b/t/io_uring.c
index b8fcffe8..157eea9e 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -76,6 +76,7 @@ struct file {
 struct submitter {
 	pthread_t thread;
 	int ring_fd;
+	int enter_ring_fd;
 	int index;
 	struct io_sq_ring sq_ring;
 	struct io_uring_sqe *sqes;
@@ -127,6 +128,8 @@ static int stats = 0;		/* generate IO stats */
 static int aio = 0;		/* use libaio */
 static int runtime = 0;		/* runtime */
 static int random_io = 1;	/* random or sequential IO */
+static int register_ring = 1;	/* register ring */
+static int use_sync = 0;	/* use preadv2 */
 
 static unsigned long tsc_rate;
 
@@ -139,7 +142,7 @@ static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
 static int plist_len = 17;
 
 #ifndef IORING_REGISTER_MAP_BUFFERS
-#define IORING_REGISTER_MAP_BUFFERS	20
+#define IORING_REGISTER_MAP_BUFFERS	22
 struct io_uring_map_buffers {
 	__s32	fd;
 	__u32	buf_start;
@@ -349,10 +352,8 @@ static int io_uring_map_buffers(struct submitter *s)
 
 	if (do_nop)
 		return 0;
-	if (s->nr_files > 1) {
-		fprintf(stderr, "Can't map buffers with multiple files\n");
-		return -1;
-	}
+	if (s->nr_files > 1)
+		fprintf(stdout, "Mapping buffers may not work with multiple files\n");
 
 	return syscall(__NR_io_uring_register, s->ring_fd,
 			IORING_REGISTER_MAP_BUFFERS, &map, 1);
@@ -422,12 +423,14 @@ out:
 static int io_uring_enter(struct submitter *s, unsigned int to_submit,
 			  unsigned int min_complete, unsigned int flags)
 {
+	if (register_ring)
+		flags |= IORING_ENTER_REGISTERED_RING;
 #ifdef FIO_ARCH_HAS_SYSCALL
-	return __do_syscall6(__NR_io_uring_enter, s->ring_fd, to_submit,
+	return __do_syscall6(__NR_io_uring_enter, s->enter_ring_fd, to_submit,
 				min_complete, flags, NULL, 0);
 #else
-	return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete,
-			flags, NULL, 0);
+	return syscall(__NR_io_uring_enter, s->enter_ring_fd, to_submit,
+			min_complete, flags, NULL, 0);
 #endif
 }
 
@@ -795,6 +798,34 @@ static void *submitter_aio_fn(void *data)
 }
 #endif
 
+static void io_uring_unregister_ring(struct submitter *s)
+{
+	struct io_uring_rsrc_update up = {
+		.offset	= s->enter_ring_fd,
+	};
+
+	syscall(__NR_io_uring_register, s->ring_fd, IORING_UNREGISTER_RING_FDS,
+		&up, 1);
+}
+
+static int io_uring_register_ring(struct submitter *s)
+{
+	struct io_uring_rsrc_update up = {
+		.data	= s->ring_fd,
+		.offset	= -1U,
+	};
+	int ret;
+
+	ret = syscall(__NR_io_uring_register, s->ring_fd,
+			IORING_REGISTER_RING_FDS, &up, 1);
+	if (ret == 1) {
+		s->enter_ring_fd = up.offset;
+		return 0;
+	}
+	register_ring = 0;
+	return -1;
+}
+
 static void *submitter_uring_fn(void *data)
 {
 	struct submitter *s = data;
@@ -806,6 +837,9 @@ static void *submitter_uring_fn(void *data)
 	submitter_init(s);
 #endif
 
+	if (register_ring)
+		io_uring_register_ring(s);
+
 	prepped = 0;
 	do {
 		int to_wait, to_submit, this_reap, to_prep;
@@ -898,6 +932,75 @@ submit:
 		}
 	} while (!s->finish);
 
+	if (register_ring)
+		io_uring_unregister_ring(s);
+
+	finish = 1;
+	return NULL;
+}
+
+static void *submitter_sync_fn(void *data)
+{
+	struct submitter *s = data;
+	int ret;
+
+	submitter_init(s);
+
+	do {
+		uint64_t offset;
+		struct file *f;
+		long r;
+
+		if (s->nr_files == 1) {
+			f = &s->files[0];
+		} else {
+			f = &s->files[s->cur_file];
+			if (f->pending_ios >= file_depth(s)) {
+				s->cur_file++;
+				if (s->cur_file == s->nr_files)
+					s->cur_file = 0;
+				f = &s->files[s->cur_file];
+			}
+		}
+		f->pending_ios++;
+
+		if (random_io) {
+			r = __rand64(&s->rand_state);
+			offset = (r % (f->max_blocks - 1)) * bs;
+		} else {
+			offset = f->cur_off;
+			f->cur_off += bs;
+			if (f->cur_off + bs > f->max_size)
+				f->cur_off = 0;
+		}
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+		if (stats)
+			s->clock_batch[s->clock_index] = get_cpu_clock();
+#endif
+
+		s->inflight++;
+		s->calls++;
+
+		if (polled)
+			ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, RWF_HIPRI);
+		else
+			ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, 0);
+
+		if (ret < 0) {
+			perror("preadv2");
+			break;
+		} else if (ret != bs) {
+			break;
+		}
+
+		s->done++;
+		s->inflight--;
+		f->pending_ios--;
+		if (stats)
+			add_stat(s, s->clock_index, 1);
+	} while (!s->finish);
+
 	finish = 1;
 	return NULL;
 }
@@ -1000,7 +1103,7 @@ static int setup_ring(struct submitter *s)
 		perror("io_uring_setup");
 		return 1;
 	}
-	s->ring_fd = fd;
+	s->ring_fd = s->enter_ring_fd = fd;
 
 	io_uring_probe(fd);
 
@@ -1105,10 +1208,13 @@ static void usage(char *argv, int status)
 		" -T <int>  : TSC rate in HZ\n"
 		" -r <int>  : Runtime in seconds, default %s\n"
 		" -R <bool> : Use random IO, default %d\n"
-		" -a <bool> : Use legacy aio, default %d\n",
+		" -a <bool> : Use legacy aio, default %d\n"
+		" -S <bool> : Use sync IO (preadv2), default %d\n"
+		" -X <bool> : Use registered ring %d\n",
 		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
 		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
-		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio);
+		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio,
+		use_sync, register_ring);
 	exit(status);
 }
 
@@ -1169,7 +1275,7 @@ int main(int argc, char *argv[])
 	if (!do_nop && argc < 2)
 		usage(argv[0], 1);
 
-	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:h?")) != -1) {
+	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:h?")) != -1) {
 		switch (opt) {
 		case 'a':
 			aio = !!atoi(optarg);
@@ -1236,6 +1342,12 @@ int main(int argc, char *argv[])
 		case 'R':
 			random_io = !!atoi(optarg);
 			break;
+		case 'X':
+			register_ring = !!atoi(optarg);
+			break;
+		case 'S':
+			use_sync = !!atoi(optarg);
+			break;
 		case 'h':
 		case '?':
 		default:
@@ -1346,7 +1458,9 @@ int main(int argc, char *argv[])
 	for (j = 0; j < nthreads; j++) {
 		s = get_submitter(j);
 
-		if (!aio)
+		if (use_sync)
+			continue;
+		else if (!aio)
 			err = setup_ring(s);
 		else
 			err = setup_aio(s);
@@ -1357,14 +1471,18 @@ int main(int argc, char *argv[])
 	}
 	s = get_submitter(0);
 	printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
-	if (!aio)
+	if (use_sync)
+		printf("Engine=preadv2\n");
+	else if (!aio)
 		printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
 	else
 		printf("Engine=aio\n");
 
 	for (j = 0; j < nthreads; j++) {
 		s = get_submitter(j);
-		if (!aio)
+		if (use_sync)
+			pthread_create(&s->thread, NULL, submitter_sync_fn, s);
+		else if (!aio)
 			pthread_create(&s->thread, NULL, submitter_uring_fn, s);
 #ifdef CONFIG_LIBAIO
 		else

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-27 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-27 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit cf2511565f40be1b78b3fc1194e823baf305f0a0:

  Merge branch 'master' of https://github.com/bvanassche/fio (2022-02-24 12:40:19 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to c3773c171dffb79f771d213d94249cefc4b9b6de:

  windowsaio: open file for write if we have syncs (2022-02-26 10:43:20 -0700)

----------------------------------------------------------------
Jens Axboe (2):
      Add TD_F_SYNCS thread flag
      windowsaio: open file for write if we have syncs

 blktrace.c           | 4 ++++
 engines/windowsaio.c | 2 +-
 fio.h                | 6 ++++--
 ioengines.h          | 2 +-
 iolog.c              | 9 +++++++--
 5 files changed, 17 insertions(+), 6 deletions(-)

---

Diff of recent changes:

diff --git a/blktrace.c b/blktrace.c
index e1804765..ead60130 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -297,6 +297,10 @@ static bool handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
 
 	ios[DDIR_SYNC]++;
 	dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
+
+	if (!(td->flags & TD_F_SYNCS))
+		td->flags |= TD_F_SYNCS;
+
 	queue_io_piece(td, ipo);
 	return true;
 }
diff --git a/engines/windowsaio.c b/engines/windowsaio.c
index d82c8053..6681f8bb 100644
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -248,7 +248,7 @@ static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f)
 		log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint);
 	}
 
-	if (!td_write(td) || read_only)
+	if ((!td_write(td) && !(td->flags & TD_F_SYNCS)) || read_only)
 		access = GENERIC_READ;
 	else
 		access = (GENERIC_READ | GENERIC_WRITE);
diff --git a/fio.h b/fio.h
index 88df117d..c314f0a8 100644
--- a/fio.h
+++ b/fio.h
@@ -97,6 +97,7 @@ enum {
 	__TD_F_MMAP_KEEP,
 	__TD_F_DIRS_CREATED,
 	__TD_F_CHECK_RATE,
+	__TD_F_SYNCS,
 	__TD_F_LAST,		/* not a real bit, keep last */
 };
 
@@ -118,6 +119,7 @@ enum {
 	TD_F_MMAP_KEEP		= 1U << __TD_F_MMAP_KEEP,
 	TD_F_DIRS_CREATED	= 1U << __TD_F_DIRS_CREATED,
 	TD_F_CHECK_RATE		= 1U << __TD_F_CHECK_RATE,
+	TD_F_SYNCS		= 1U << __TD_F_SYNCS,
 };
 
 enum {
@@ -678,8 +680,8 @@ enum {
 	TD_NR,
 };
 
-#define TD_ENG_FLAG_SHIFT	17
-#define TD_ENG_FLAG_MASK	((1U << 17) - 1)
+#define TD_ENG_FLAG_SHIFT	18
+#define TD_ENG_FLAG_MASK	((1U << 18) - 1)
 
 static inline void td_set_ioengine_flags(struct thread_data *td)
 {
diff --git a/ioengines.h b/ioengines.h
index b3f755b4..acdb0071 100644
--- a/ioengines.h
+++ b/ioengines.h
@@ -8,7 +8,7 @@
 #include "io_u.h"
 #include "zbd_types.h"
 
-#define FIO_IOOPS_VERSION	30
+#define FIO_IOOPS_VERSION	31
 
 #ifndef CONFIG_DYNAMIC_ENGINES
 #define FIO_STATIC	static
diff --git a/iolog.c b/iolog.c
index a2cf0c1c..724ec1fe 100644
--- a/iolog.c
+++ b/iolog.c
@@ -402,6 +402,7 @@ static bool read_iolog2(struct thread_data *td)
 	enum fio_ddir rw;
 	bool realloc = false;
 	int64_t items_to_fetch = 0;
+	int syncs;
 
 	if (td->o.read_iolog_chunked) {
 		items_to_fetch = iolog_items_to_fetch(td);
@@ -417,7 +418,7 @@ static bool read_iolog2(struct thread_data *td)
 	rfname = fname = malloc(256+16);
 	act = malloc(256+16);
 
-	reads = writes = waits = 0;
+	syncs = reads = writes = waits = 0;
 	while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) {
 		struct io_piece *ipo;
 		int r;
@@ -492,7 +493,9 @@ static bool read_iolog2(struct thread_data *td)
 				continue;
 			waits++;
 		} else if (rw == DDIR_INVAL) {
-		} else if (!ddir_sync(rw)) {
+		} else if (ddir_sync(rw)) {
+			syncs++;
+		} else {
 			log_err("bad ddir: %d\n", rw);
 			continue;
 		}
@@ -547,6 +550,8 @@ static bool read_iolog2(struct thread_data *td)
 			" read-only\n", td->o.name, writes);
 		writes = 0;
 	}
+	if (syncs)
+		td->flags |= TD_F_SYNCS;
 
 	if (td->o.read_iolog_chunked) {
 		if (td->io_log_current == 0) {

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-25 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-25 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit c377f4f85943e5b155b3daaab1ce5213077531d8:

  io_uring: use syscall helpers for the hot path (2022-02-21 09:43:48 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to cf2511565f40be1b78b3fc1194e823baf305f0a0:

  Merge branch 'master' of https://github.com/bvanassche/fio (2022-02-24 12:40:19 -0700)

----------------------------------------------------------------
Bart Van Assche (1):
      Fix three compiler warnings

Jens Axboe (1):
      Merge branch 'master' of https://github.com/bvanassche/fio

 engines/cmdprio.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

---

Diff of recent changes:

diff --git a/engines/cmdprio.c b/engines/cmdprio.c
index dd358754..979a81b6 100644
--- a/engines/cmdprio.c
+++ b/engines/cmdprio.c
@@ -319,7 +319,7 @@ static int fio_cmdprio_gen_perc(struct thread_data *td, struct cmdprio *cmdprio)
 {
 	struct cmdprio_options *options = cmdprio->options;
 	struct cmdprio_prio *prio;
-	struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {0};
+	struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {};
 	struct thread_stat *ts = &td->ts;
 	enum fio_ddir ddir;
 	int ret;
@@ -368,8 +368,8 @@ static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td,
 					     struct cmdprio *cmdprio)
 {
 	struct cmdprio_options *options = cmdprio->options;
-	struct cmdprio_parse_result parse_res[CMDPRIO_RWDIR_CNT] = {0};
-	struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {0};
+	struct cmdprio_parse_result parse_res[CMDPRIO_RWDIR_CNT] = {};
+	struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {};
 	struct thread_stat *ts = &td->ts;
 	int ret, implicit_cmdprio;
 	enum fio_ddir ddir;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-22 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-22 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 3f43022d4021850905886e391ec68c02c99aec5a:

  Merge branch 'genfio-tempfile' of https://github.com/scop/fio (2022-02-20 12:39:11 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to c377f4f85943e5b155b3daaab1ce5213077531d8:

  io_uring: use syscall helpers for the hot path (2022-02-21 09:43:48 -0700)

----------------------------------------------------------------
Jens Axboe (3):
      aarch64: add system call definitions
      x86-64: add system call definitions
      io_uring: use syscall helpers for the hot path

 arch/arch-aarch64.h |  77 +++++++++++++++++++++++++++++++++++
 arch/arch-x86_64.h  | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 engines/io_uring.c  |   5 +++
 t/io_uring.c        |   5 +++
 4 files changed, 200 insertions(+)

---

Diff of recent changes:

diff --git a/arch/arch-aarch64.h b/arch/arch-aarch64.h
index 94571709..951d1718 100644
--- a/arch/arch-aarch64.h
+++ b/arch/arch-aarch64.h
@@ -44,4 +44,81 @@ static inline int arch_init(char *envp[])
 	return 0;
 }
 
+#define __do_syscallN(...) ({						\
+	__asm__ volatile (						\
+		"svc 0"							\
+		: "=r"(x0)						\
+		: __VA_ARGS__						\
+		: "memory", "cc");					\
+	(long) x0;							\
+})
+
+#define __do_syscall0(__n) ({						\
+	register long x8 __asm__("x8") = __n;				\
+	register long x0 __asm__("x0");					\
+									\
+	__do_syscallN("r" (x8));					\
+})
+
+#define __do_syscall1(__n, __a) ({					\
+	register long x8 __asm__("x8") = __n;				\
+	register __typeof__(__a) x0 __asm__("x0") = __a;		\
+									\
+	__do_syscallN("r" (x8), "0" (x0));				\
+})
+
+#define __do_syscall2(__n, __a, __b) ({					\
+	register long x8 __asm__("x8") = __n;				\
+	register __typeof__(__a) x0 __asm__("x0") = __a;		\
+	register __typeof__(__b) x1 __asm__("x1") = __b;		\
+									\
+	__do_syscallN("r" (x8), "0" (x0), "r" (x1));			\
+})
+
+#define __do_syscall3(__n, __a, __b, __c) ({				\
+	register long x8 __asm__("x8") = __n;				\
+	register __typeof__(__a) x0 __asm__("x0") = __a;		\
+	register __typeof__(__b) x1 __asm__("x1") = __b;		\
+	register __typeof__(__c) x2 __asm__("x2") = __c;		\
+									\
+	__do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2));		\
+})
+
+#define __do_syscall4(__n, __a, __b, __c, __d) ({			\
+	register long x8 __asm__("x8") = __n;				\
+	register __typeof__(__a) x0 __asm__("x0") = __a;		\
+	register __typeof__(__b) x1 __asm__("x1") = __b;		\
+	register __typeof__(__c) x2 __asm__("x2") = __c;		\
+	register __typeof__(__d) x3 __asm__("x3") = __d;		\
+									\
+	__do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3));\
+})
+
+#define __do_syscall5(__n, __a, __b, __c, __d, __e) ({			\
+	register long x8 __asm__("x8") = __n;				\
+	register __typeof__(__a) x0 __asm__("x0") = __a;		\
+	register __typeof__(__b) x1 __asm__("x1") = __b;		\
+	register __typeof__(__c) x2 __asm__("x2") = __c;		\
+	register __typeof__(__d) x3 __asm__("x3") = __d;		\
+	register __typeof__(__e) x4 __asm__("x4") = __e;		\
+									\
+	__do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3),	\
+			"r"(x4));					\
+})
+
+#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({		\
+	register long x8 __asm__("x8") = __n;				\
+	register __typeof__(__a) x0 __asm__("x0") = __a;		\
+	register __typeof__(__b) x1 __asm__("x1") = __b;		\
+	register __typeof__(__c) x2 __asm__("x2") = __c;		\
+	register __typeof__(__d) x3 __asm__("x3") = __d;		\
+	register __typeof__(__e) x4 __asm__("x4") = __e;		\
+	register __typeof__(__f) x5 __asm__("x5") = __f;		\
+									\
+	__do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3),	\
+			"r" (x4), "r"(x5));				\
+})
+
+#define FIO_ARCH_HAS_SYSCALL
+
 #endif
diff --git a/arch/arch-x86_64.h b/arch/arch-x86_64.h
index 25850f90..86ce1b7e 100644
--- a/arch/arch-x86_64.h
+++ b/arch/arch-x86_64.h
@@ -68,4 +68,117 @@ static inline int arch_rand_seed(unsigned long *seed)
 	return 0;
 }
 
+#define __do_syscall0(NUM) ({			\
+	intptr_t rax;				\
+						\
+	__asm__ volatile(			\
+		"syscall"			\
+		: "=a"(rax)	/* %rax */	\
+		: "a"(NUM)	/* %rax */	\
+		: "rcx", "r11", "memory"	\
+	);					\
+	rax;					\
+})
+
+#define __do_syscall1(NUM, ARG1) ({		\
+	intptr_t rax;				\
+						\
+	__asm__ volatile(			\
+		"syscall"			\
+		: "=a"(rax)	/* %rax */	\
+		: "a"((NUM)),	/* %rax */	\
+		  "D"((ARG1))	/* %rdi */	\
+		: "rcx", "r11", "memory"	\
+	);					\
+	rax;					\
+})
+
+#define __do_syscall2(NUM, ARG1, ARG2) ({	\
+	intptr_t rax;				\
+						\
+	__asm__ volatile(			\
+		"syscall"			\
+		: "=a"(rax)	/* %rax */	\
+		: "a"((NUM)),	/* %rax */	\
+		  "D"((ARG1)),	/* %rdi */	\
+		  "S"((ARG2))	/* %rsi */	\
+		: "rcx", "r11", "memory"	\
+	);					\
+	rax;					\
+})
+
+#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({	\
+	intptr_t rax;				\
+						\
+	__asm__ volatile(			\
+		"syscall"			\
+		: "=a"(rax)	/* %rax */	\
+		: "a"((NUM)),	/* %rax */	\
+		  "D"((ARG1)),	/* %rdi */	\
+		  "S"((ARG2)),	/* %rsi */	\
+		  "d"((ARG3))	/* %rdx */	\
+		: "rcx", "r11", "memory"	\
+	);					\
+	rax;					\
+})
+
+#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({			\
+	intptr_t rax;							\
+	register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4);	\
+									\
+	__asm__ volatile(						\
+		"syscall"						\
+		: "=a"(rax)	/* %rax */				\
+		: "a"((NUM)),	/* %rax */				\
+		  "D"((ARG1)),	/* %rdi */				\
+		  "S"((ARG2)),	/* %rsi */				\
+		  "d"((ARG3)),	/* %rdx */				\
+		  "r"(__r10)	/* %r10 */				\
+		: "rcx", "r11", "memory"				\
+	);								\
+	rax;								\
+})
+
+#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({		\
+	intptr_t rax;							\
+	register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4);	\
+	register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5);		\
+									\
+	__asm__ volatile(						\
+		"syscall"						\
+		: "=a"(rax)	/* %rax */				\
+		: "a"((NUM)),	/* %rax */				\
+		  "D"((ARG1)),	/* %rdi */				\
+		  "S"((ARG2)),	/* %rsi */				\
+		  "d"((ARG3)),	/* %rdx */				\
+		  "r"(__r10),	/* %r10 */				\
+		  "r"(__r8)	/* %r8 */				\
+		: "rcx", "r11", "memory"				\
+	);								\
+	rax;								\
+})
+
+#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({	\
+	intptr_t rax;							\
+	register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4);	\
+	register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5);		\
+	register __typeof__(ARG6) __r9 __asm__("r9") = (ARG6);		\
+									\
+	__asm__ volatile(						\
+		"syscall"						\
+		: "=a"(rax)	/* %rax */				\
+		: "a"((NUM)),	/* %rax */				\
+		  "D"((ARG1)),	/* %rdi */				\
+		  "S"((ARG2)),	/* %rsi */				\
+		  "d"((ARG3)),	/* %rdx */				\
+		  "r"(__r10),	/* %r10 */				\
+		  "r"(__r8),	/* %r8 */				\
+		  "r"(__r9)	/* %r9 */				\
+		: "rcx", "r11", "memory"				\
+	);								\
+	rax;								\
+})
+
+#define FIO_ARCH_HAS_SYSCALL
+
 #endif
diff --git a/engines/io_uring.c b/engines/io_uring.c
index a2533c88..1e15647e 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -278,8 +278,13 @@ static struct fio_option options[] = {
 static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit,
 			 unsigned int min_complete, unsigned int flags)
 {
+#ifdef FIO_ARCH_HAS_SYSCALL
+	return __do_syscall6(__NR_io_uring_enter, ld->ring_fd, to_submit,
+				min_complete, flags, NULL, 0);
+#else
 	return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit,
 			min_complete, flags, NULL, 0);
+#endif
 }
 
 static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
diff --git a/t/io_uring.c b/t/io_uring.c
index f513d7dc..b8fcffe8 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -422,8 +422,13 @@ out:
 static int io_uring_enter(struct submitter *s, unsigned int to_submit,
 			  unsigned int min_complete, unsigned int flags)
 {
+#ifdef FIO_ARCH_HAS_SYSCALL
+	return __do_syscall6(__NR_io_uring_enter, s->ring_fd, to_submit,
+				min_complete, flags, NULL, 0);
+#else
 	return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete,
 			flags, NULL, 0);
+#endif
 }
 
 #ifndef CONFIG_HAVE_GETTID

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-21 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-21 13:00 UTC (permalink / raw)
  To: fio

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain, Size: 29560 bytes --]

The following changes since commit 933651ec130ce4d27a5c249d649d20afeb2bdf38:

  Merge branch 'rpma-update-RPMA-engines-with-new-librpma-completions-API' of https://github.com/ldorau/fio (2022-02-18 09:02:03 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 3f43022d4021850905886e391ec68c02c99aec5a:

  Merge branch 'genfio-tempfile' of https://github.com/scop/fio (2022-02-20 12:39:11 -0700)

----------------------------------------------------------------
Jens Axboe (3):
      Merge branch 'which-command-v-type-P' of https://github.com/scop/fio
      Merge branch 'spelling' of https://github.com/scop/fio
      Merge branch 'genfio-tempfile' of https://github.com/scop/fio

Ville Skyttä (3):
      genfio: fix temporary file handling
      ci, t, tools: use `command` and `type` instead of `which`
      Spelling and grammar fixes

 HOWTO.rst                           | 4 ++--
 ci/travis-install-pmdk.sh           | 2 +-
 crc/xxhash.c                        | 4 ++--
 engines/exec.c                      | 4 ++--
 engines/http.c                      | 4 ++--
 engines/ime.c                       | 2 +-
 engines/libhdfs.c                   | 2 +-
 engines/librpma_fio.c               | 2 +-
 engines/librpma_gpspm.c             | 2 +-
 engines/nbd.c                       | 2 +-
 engines/rados.c                     | 2 +-
 engines/rbd.c                       | 4 ++--
 engines/rdma.c                      | 2 +-
 examples/enospc-pressure.fio        | 4 ++--
 examples/falloc.fio                 | 2 +-
 examples/librpma_apm-server.fio     | 2 +-
 examples/librpma_gpspm-server.fio   | 2 +-
 examples/rand-zones.fio             | 2 +-
 filesetup.c                         | 2 +-
 fio.1                               | 4 ++--
 graph.c                             | 2 +-
 lib/pattern.c                       | 6 +++---
 options.c                           | 4 ++--
 os/os-android.h                     | 2 +-
 os/os-netbsd.h                      | 2 +-
 os/windows/posix.c                  | 2 +-
 oslib/libmtd.h                      | 6 +++---
 stat.c                              | 2 +-
 stat.h                              | 2 +-
 t/latency_percentiles.py            | 2 +-
 t/one-core-peak.sh                  | 6 +++---
 t/readonly.py                       | 2 +-
 t/sgunmap-test.py                   | 2 +-
 t/steadystate_tests.py              | 2 +-
 t/time-test.c                       | 2 +-
 tools/fio_generate_plots            | 2 +-
 tools/fio_jsonplus_clat2csv         | 4 ++--
 tools/fiograph/fiograph.py          | 2 +-
 tools/genfio                        | 5 +++--
 tools/hist/fio-histo-log-pctiles.py | 2 +-
 tools/plot/fio2gnuplot              | 4 ++--
 tools/plot/fio2gnuplot.1            | 2 +-
 tools/plot/fio2gnuplot.manpage      | 2 +-
 43 files changed, 61 insertions(+), 60 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO.rst b/HOWTO.rst
index ac1f3478..0978879c 100644
--- a/HOWTO.rst
+++ b/HOWTO.rst
@@ -1443,7 +1443,7 @@ I/O type
 	range of possible random values.
 	Defaults are: random for **pareto** and **zipf**, and 0.5 for **normal**.
 	If you wanted to use **zipf** with a `theta` of 1.2 centered on 1/4 of allowed value range,
-	you would use ``random_distibution=zipf:1.2:0.25``.
+	you would use ``random_distribution=zipf:1.2:0.25``.
 
 	For a **zoned** distribution, fio supports specifying percentages of I/O
 	access that should fall within what range of the file or device. For
@@ -3370,7 +3370,7 @@ Verification
 	To avoid false verification errors, do not use the norandommap option when
 	verifying data with async I/O engines and I/O depths > 1.  Or use the
 	norandommap and the lfsr random generator together to avoid writing to the
-	same offset with muliple outstanding I/Os.
+	same offset with multiple outstanding I/Os.
 
 .. option:: verify_offset=int
 
diff --git a/ci/travis-install-pmdk.sh b/ci/travis-install-pmdk.sh
index 803438f8..3b0b5bbc 100755
--- a/ci/travis-install-pmdk.sh
+++ b/ci/travis-install-pmdk.sh
@@ -12,7 +12,7 @@ WORKDIR=$(pwd)
 #    /bin/sh: 1: clang: not found
 # if CC is not set to the full path of clang.
 #
-export CC=$(which $CC)
+export CC=$(type -P $CC)
 
 # Install PMDK libraries, because PMDK's libpmem
 # is a dependency of the librpma fio engine.
diff --git a/crc/xxhash.c b/crc/xxhash.c
index 4736c528..0119564b 100644
--- a/crc/xxhash.c
+++ b/crc/xxhash.c
@@ -50,10 +50,10 @@ You can contact the author at :
 //#define XXH_ACCEPT_NULL_INPUT_POINTER 1
 
 // XXH_FORCE_NATIVE_FORMAT :
-// By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+// By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
 // Results are therefore identical for little-endian and big-endian CPU.
 // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance for your application, you may set the #define below to 1.
+// Should endian-independence be of no importance for your application, you may set the #define below to 1.
 // It will improve speed for Big-endian CPU.
 // This option has no impact on Little_Endian CPU.
 #define XXH_FORCE_NATIVE_FORMAT 0
diff --git a/engines/exec.c b/engines/exec.c
index ab3639c5..20e50e00 100644
--- a/engines/exec.c
+++ b/engines/exec.c
@@ -67,8 +67,8 @@ char *str_replace(char *orig, const char *rep, const char *with)
 	/*
 	 * Replace a substring by another.
 	 *
-	 * Returns the new string if occurences were found
-	 * Returns orig if no occurence is found
+	 * Returns the new string if occurrences were found
+	 * Returns orig if no occurrence is found
 	 */
 	char *result, *insert, *tmp;
 	int len_rep, len_with, len_front, count;
diff --git a/engines/http.c b/engines/http.c
index 35c44871..57d4967d 100644
--- a/engines/http.c
+++ b/engines/http.c
@@ -388,7 +388,7 @@ static void _add_aws_auth_header(CURL *curl, struct curl_slist *slist, struct ht
 
 	signature = _conv_hex(md, SHA256_DIGEST_LENGTH);
 
-	/* Surpress automatic Accept: header */
+	/* Suppress automatic Accept: header */
 	slist = curl_slist_append(slist, "Accept:");
 
 	snprintf(s, sizeof(s), "x-amz-content-sha256: %s", dsha);
@@ -419,7 +419,7 @@ static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_
 	if (op == DDIR_WRITE) {
 		dsha = _gen_hex_md5(buf, len);
 	}
-	/* Surpress automatic Accept: header */
+	/* Suppress automatic Accept: header */
 	slist = curl_slist_append(slist, "Accept:");
 
 	snprintf(s, sizeof(s), "etag: %s", dsha);
diff --git a/engines/ime.c b/engines/ime.c
index 440cc29e..f6690cc1 100644
--- a/engines/ime.c
+++ b/engines/ime.c
@@ -83,7 +83,7 @@ struct ime_data {
 	};
 	struct iovec 	*iovecs;		/* array of queued iovecs */
 	struct io_u 	**io_us;		/* array of queued io_u pointers */
-	struct io_u 	**event_io_us;	/* array of the events retieved afer get_events*/
+	struct io_u 	**event_io_us;	/* array of the events retrieved after get_events*/
 	unsigned int 	queued;			/* iovecs/io_us in the queue */
 	unsigned int 	events;			/* number of committed iovecs/io_us */
 
diff --git a/engines/libhdfs.c b/engines/libhdfs.c
index eb55c3c5..f20e45ca 100644
--- a/engines/libhdfs.c
+++ b/engines/libhdfs.c
@@ -27,7 +27,7 @@ struct hdfsio_data {
 };
 
 struct hdfsio_options {
-	void *pad;			/* needed because offset can't be 0 for a option defined used offsetof */
+	void *pad;			/* needed because offset can't be 0 for an option defined used offsetof */
 	char *host;
 	char *directory;
 	unsigned int port;
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c
index dfd82180..34818904 100644
--- a/engines/librpma_fio.c
+++ b/engines/librpma_fio.c
@@ -426,7 +426,7 @@ int librpma_fio_client_post_init(struct thread_data *td)
 
 	/*
 	 * td->orig_buffer is not aligned. The engine requires aligned io_us
-	 * so FIO alignes up the address using the formula below.
+	 * so FIO aligns up the address using the formula below.
 	 */
 	ccd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
 			td->o.mem_align;
diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c
index 14626e7f..5cf97472 100644
--- a/engines/librpma_gpspm.c
+++ b/engines/librpma_gpspm.c
@@ -431,7 +431,7 @@ static int server_post_init(struct thread_data *td)
 
 	/*
 	 * td->orig_buffer is not aligned. The engine requires aligned io_us
-	 * so FIO alignes up the address using the formula below.
+	 * so FIO aligns up the address using the formula below.
 	 */
 	sd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
 			td->o.mem_align;
diff --git a/engines/nbd.c b/engines/nbd.c
index b0ba75e6..7c2d5f4b 100644
--- a/engines/nbd.c
+++ b/engines/nbd.c
@@ -52,7 +52,7 @@ static struct fio_option options[] = {
 	},
 };
 
-/* Alocates nbd_data. */
+/* Allocates nbd_data. */
 static int nbd_setup(struct thread_data *td)
 {
 	struct nbd_data *nbd_data;
diff --git a/engines/rados.c b/engines/rados.c
index 23e62c4c..976f9229 100644
--- a/engines/rados.c
+++ b/engines/rados.c
@@ -151,7 +151,7 @@ static int _fio_rados_connect(struct thread_data *td)
 		char *client_name = NULL;
 
 		/*
-		* If we specify cluser name, the rados_create2
+		* If we specify cluster name, the rados_create2
 		* will not assume 'client.'. name is considered
 		* as a full type.id namestr
 		*/
diff --git a/engines/rbd.c b/engines/rbd.c
index c6203d4c..2f25889a 100644
--- a/engines/rbd.c
+++ b/engines/rbd.c
@@ -173,7 +173,7 @@ static int _fio_rbd_connect(struct thread_data *td)
 		char *client_name = NULL; 
 
 		/*
-		 * If we specify cluser name, the rados_create2
+		 * If we specify cluster name, the rados_create2
 		 * will not assume 'client.'. name is considered
 		 * as a full type.id namestr
 		 */
@@ -633,7 +633,7 @@ static int fio_rbd_setup(struct thread_data *td)
 
 	/* taken from "net" engine. Pretend we deal with files,
 	 * even if we do not have any ideas about files.
-	 * The size of the RBD is set instead of a artificial file.
+	 * The size of the RBD is set instead of an artificial file.
 	 */
 	if (!td->files_index) {
 		add_file(td, td->o.filename ? : "rbd", 0, 0);
diff --git a/engines/rdma.c b/engines/rdma.c
index f4471869..4eb86652 100644
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -1194,7 +1194,7 @@ static int check_set_rlimits(struct thread_data *td)
 
 static int compat_options(struct thread_data *td)
 {
-	// The original RDMA engine had an ugly / seperator
+	// The original RDMA engine had an ugly / separator
 	// on the filename for it's options. This function
 	// retains backwards compatibility with it. Note we do not
 	// support setting the bindname option is this legacy mode.
diff --git a/examples/enospc-pressure.fio b/examples/enospc-pressure.fio
index ca9d8f7a..fa404fd5 100644
--- a/examples/enospc-pressure.fio
+++ b/examples/enospc-pressure.fio
@@ -35,8 +35,8 @@ bs=4k
 rw=randtrim
 filename=raicer
 
-# Verifier thread continiously write to newly allcated blocks
-# and veryfy written content
+# Verifier thread continuously writes to newly allcated blocks
+# and verifies written content
 [aio-dio-verifier]
 create_on_open=1
 verify=crc32c-intel
diff --git a/examples/falloc.fio b/examples/falloc.fio
index fadf1321..5a3e88b8 100644
--- a/examples/falloc.fio
+++ b/examples/falloc.fio
@@ -29,7 +29,7 @@ rw=randtrim
 numjobs=2
 filename=fragmented_file
 
-## Mesure IO performance on fragmented file
+## Measure IO performance on fragmented file
 [sequential aio-dio write]
 stonewall
 ioengine=libaio
diff --git a/examples/librpma_apm-server.fio b/examples/librpma_apm-server.fio
index 062b5215..dc1ddba2 100644
--- a/examples/librpma_apm-server.fio
+++ b/examples/librpma_apm-server.fio
@@ -20,7 +20,7 @@ thread
 # (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
 direct_write_to_pmem=0
 
-numjobs=1 # number of expected incomming connections
+numjobs=1 # number of expected incoming connections
 size=100MiB # size of workspace for a single connection
 filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
 # filename=/dev/dax1.0
diff --git a/examples/librpma_gpspm-server.fio b/examples/librpma_gpspm-server.fio
index 67e92a28..4555314f 100644
--- a/examples/librpma_gpspm-server.fio
+++ b/examples/librpma_gpspm-server.fio
@@ -22,7 +22,7 @@ thread
 direct_write_to_pmem=0
 # set to 0 (false) to wait for completion instead of busy-wait polling completion.
 busy_wait_polling=1
-numjobs=1 # number of expected incomming connections
+numjobs=1 # number of expected incoming connections
 iodepth=2 # number of parallel GPSPM requests
 size=100MiB # size of workspace for a single connection
 filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
diff --git a/examples/rand-zones.fio b/examples/rand-zones.fio
index 169137d4..10e71727 100644
--- a/examples/rand-zones.fio
+++ b/examples/rand-zones.fio
@@ -21,6 +21,6 @@ random_distribution=zoned:50/5:30/15:20/
 # The above applies to all of reads/writes/trims. If we wanted to do
 # something differently for writes, let's say 50% for the first 10%
 # and 50% for the remaining 90%, we could do it by adding a new section
-# after a a comma.
+# after a comma.
 
 # random_distribution=zoned:50/5:30/15:20/,50/10:50/90
diff --git a/filesetup.c b/filesetup.c
index fb556d84..7c32d0af 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -1486,7 +1486,7 @@ static bool init_rand_distribution(struct thread_data *td)
 
 /*
  * Check if the number of blocks exceeds the randomness capability of
- * the selected generator. Tausworthe is 32-bit, the others are fullly
+ * the selected generator. Tausworthe is 32-bit, the others are fully
  * 64-bit capable.
  */
 static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f,
diff --git a/fio.1 b/fio.1
index e23d4092..98410655 100644
--- a/fio.1
+++ b/fio.1
@@ -1221,7 +1221,7 @@ more control over most probable outcome. This value is in range [0-1] which maps
 range of possible random values.
 Defaults are: random for \fBpareto\fR and \fBzipf\fR, and 0.5 for \fBnormal\fR.
 If you wanted to use \fBzipf\fR with a `theta` of 1.2 centered on 1/4 of allowed value range,
-you would use `random_distibution=zipf:1.2:0.25`.
+you would use `random_distribution=zipf:1.2:0.25`.
 .P
 For a \fBzoned\fR distribution, fio supports specifying percentages of I/O
 access that should fall within what range of the file or device. For
@@ -3082,7 +3082,7 @@ the verify will be of the newly written data.
 To avoid false verification errors, do not use the norandommap option when
 verifying data with async I/O engines and I/O depths > 1.  Or use the
 norandommap and the lfsr random generator together to avoid writing to the
-same offset with muliple outstanding I/Os.
+same offset with multiple outstanding I/Os.
 .RE
 .TP
 .BI verify_offset \fR=\fPint
diff --git a/graph.c b/graph.c
index 7a174170..c49cdae1 100644
--- a/graph.c
+++ b/graph.c
@@ -999,7 +999,7 @@ const char *graph_find_tooltip(struct graph *g, int ix, int iy)
 				ydiff = fabs(yval - y);
 
 				/*
-				 * zero delta, or within or match critera, break
+				 * zero delta, or within or match criteria, break
 				 */
 				if (ydiff < best_delta) {
 					best_delta = ydiff;
diff --git a/lib/pattern.c b/lib/pattern.c
index 680a12be..d8203630 100644
--- a/lib/pattern.c
+++ b/lib/pattern.c
@@ -211,7 +211,7 @@ static const char *parse_number(const char *beg, char *out,
  * This function tries to find formats, e.g.:
  *   %o - offset of the block
  *
- * In case of successfull parsing it fills the format param
+ * In case of successful parsing it fills the format param
  * with proper offset and the size of the expected value, which
  * should be pasted into buffer using the format 'func' callback.
  *
@@ -267,7 +267,7 @@ static const char *parse_format(const char *in, char *out, unsigned int parsed,
  * @fmt_desc - array of pattern format descriptors [input]
  * @fmt - array of pattern formats [output]
  * @fmt_sz - pointer where the size of pattern formats array stored [input],
- *           after successfull parsing this pointer will contain the number
+ *           after successful parsing this pointer will contain the number
  *           of parsed formats if any [output].
  *
  * strings:
@@ -275,7 +275,7 @@ static const char *parse_format(const char *in, char *out, unsigned int parsed,
  *   NOTE: there is no way to escape quote, so "123\"abc" does not work.
  *
  * numbers:
- *   hexidecimal - sequence of hex bytes starting from 0x or 0X prefix,
+ *   hexadecimal - sequence of hex bytes starting from 0x or 0X prefix,
  *                 e.g. 0xff12ceff1100ff
  *   decimal     - decimal number in range [INT_MIN, INT_MAX]
  *
diff --git a/options.c b/options.c
index 6cdbd268..e06d9b66 100644
--- a/options.c
+++ b/options.c
@@ -1366,7 +1366,7 @@ int get_max_str_idx(char *input)
 }
 
 /*
- * Returns the directory at the index, indexes > entires will be
+ * Returns the directory at the index, indexes > entries will be
  * assigned via modulo division of the index
  */
 int set_name_idx(char *target, size_t tlen, char *input, int index,
@@ -1560,7 +1560,7 @@ static int str_gtod_reduce_cb(void *data, int *il)
 	int val = *il;
 
 	/*
-	 * Only modfiy options if gtod_reduce==1
+	 * Only modify options if gtod_reduce==1
 	 * Otherwise leave settings alone.
 	 */
 	if (val) {
diff --git a/os/os-android.h b/os/os-android.h
index 10c51b83..2f73d249 100644
--- a/os/os-android.h
+++ b/os/os-android.h
@@ -66,7 +66,7 @@
 
 #ifndef CONFIG_NO_SHM
 /*
- * Bionic doesn't support SysV shared memeory, so implement it using ashmem
+ * Bionic doesn't support SysV shared memory, so implement it using ashmem
  */
 #include <stdio.h>
 #include <linux/ashmem.h>
diff --git a/os/os-netbsd.h b/os/os-netbsd.h
index 624c7fa5..b553a430 100644
--- a/os/os-netbsd.h
+++ b/os/os-netbsd.h
@@ -13,7 +13,7 @@
 #include <sys/endian.h>
 #include <sys/sysctl.h>
 
-/* XXX hack to avoid confilcts between rbtree.h and <sys/rbtree.h> */
+/* XXX hack to avoid conflicts between rbtree.h and <sys/rbtree.h> */
 #undef rb_node
 #undef rb_left
 #undef rb_right
diff --git a/os/windows/posix.c b/os/windows/posix.c
index 0d415e1e..a3a6c89f 100644
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -1165,7 +1165,7 @@ HANDLE windows_handle_connection(HANDLE hjob, int sk)
 		ret = pi.hProcess;
 
 	/* duplicate socket and write the protocol_info to pipe so child can
-	 * duplicate the communciation socket */
+	 * duplicate the communication socket */
 	if (WSADuplicateSocket(sk, GetProcessId(pi.hProcess), &protocol_info)) {
 		log_err("WSADuplicateSocket failed (%lu).\n", GetLastError());
 		ret = INVALID_HANDLE_VALUE;
diff --git a/oslib/libmtd.h b/oslib/libmtd.h
index a0c90dcb..668e7798 100644
--- a/oslib/libmtd.h
+++ b/oslib/libmtd.h
@@ -256,7 +256,7 @@ int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb);
  * @mtd: MTD device description object
  * @fd: MTD device node file descriptor
  * @eb: eraseblock to read from
- * @offs: offset withing the eraseblock to read from
+ * @offs: offset within the eraseblock to read from
  * @buf: buffer to read data to
  * @len: how many bytes to read
  *
@@ -273,7 +273,7 @@ int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
  * @mtd: MTD device description object
  * @fd: MTD device node file descriptor
  * @eb: eraseblock to write to
- * @offs: offset withing the eraseblock to write to
+ * @offs: offset within the eraseblock to write to
  * @data: data buffer to write
  * @len: how many data bytes to write
  * @oob: OOB buffer to write
@@ -329,7 +329,7 @@ int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
  * @mtd: MTD device description object
  * @fd: MTD device node file descriptor
  * @eb: eraseblock to write to
- * @offs: offset withing the eraseblock to write to
+ * @offs: offset within the eraseblock to write to
  * @img_name: the file to write
  *
  * This function writes an image @img_name the MTD device defined by @mtd. @eb
diff --git a/stat.c b/stat.c
index 1764eebc..7947edb4 100644
--- a/stat.c
+++ b/stat.c
@@ -377,7 +377,7 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
 		free(maxalt);
 	}
 
-	/* Need to aggregate statisitics to show mixed values */
+	/* Need to aggregate statistics to show mixed values */
 	if (rs->unified_rw_rep == UNIFIED_BOTH)
 		show_mixed_group_stats(rs, out);
 }
diff --git a/stat.h b/stat.h
index dce0bb0d..eb7845af 100644
--- a/stat.h
+++ b/stat.h
@@ -68,7 +68,7 @@ struct group_run_stats {
  * than one. This method has low accuracy when the value is small. For
  * example, let the buckets be {[0,99],[100,199],...,[900,999]}, and
  * the represented value of each bucket be the mean of the range. Then
- * a value 0 has an round-off error of 49.5. To improve on this, we
+ * a value 0 has a round-off error of 49.5. To improve on this, we
  * use buckets with non-uniform ranges, while bounding the error of
  * each bucket within a ratio of the sample value. A simple example
  * would be when error_bound = 0.005, buckets are {
diff --git a/t/latency_percentiles.py b/t/latency_percentiles.py
index 9e37d9fe..81704700 100755
--- a/t/latency_percentiles.py
+++ b/t/latency_percentiles.py
@@ -270,7 +270,7 @@ class FioLatTest():
             #
             # Check only for the presence/absence of json+
             # latency bins. Future work can check the
-            # accurracy of the bin values and counts.
+            # accuracy of the bin values and counts.
             #
             # Because the latency percentiles are based on
             # the bins, we can be confident that the bin
diff --git a/t/one-core-peak.sh b/t/one-core-peak.sh
index 9da8304e..3ac119f6 100755
--- a/t/one-core-peak.sh
+++ b/t/one-core-peak.sh
@@ -33,8 +33,8 @@ check_binary() {
   # Ensure the binaries are present and executable
   for bin in "$@"; do
     if [ ! -x ${bin} ]; then
-      which ${bin} >/dev/null
-      [ $? -eq 0 ] || fatal "${bin} doesn't exists or is not executable"
+      command -v ${bin} >/dev/null
+      [ $? -eq 0 ] || fatal "${bin} doesn't exist or is not executable"
     fi
   done
 }
@@ -197,7 +197,7 @@ show_nvme() {
   fw=$(cat ${device_dir}/firmware_rev | xargs) #xargs for trimming spaces
   serial=$(cat ${device_dir}/serial | xargs) #xargs for trimming spaces
   info ${device_name} "MODEL=${model} FW=${fw} serial=${serial} PCI=${pci_addr}@${link_speed} IRQ=${irq} NUMA=${numa} CPUS=${cpus} "
-  which nvme &> /dev/null
+  command -v nvme > /dev/null
   if [ $? -eq 0 ]; then
     status=""
     NCQA=$(nvme get-feature -H -f 0x7 ${device} 2>&1 |grep NCQA |cut -d ':' -f 2 | xargs)
diff --git a/t/readonly.py b/t/readonly.py
index 464847c6..80fac639 100755
--- a/t/readonly.py
+++ b/t/readonly.py
@@ -6,7 +6,7 @@
 #
 # readonly.py
 #
-# Do some basic tests of the --readonly paramter
+# Do some basic tests of the --readonly parameter
 #
 # USAGE
 # python readonly.py [-f fio-executable]
diff --git a/t/sgunmap-test.py b/t/sgunmap-test.py
index 4960a040..6687494f 100755
--- a/t/sgunmap-test.py
+++ b/t/sgunmap-test.py
@@ -3,7 +3,7 @@
 #
 # sgunmap-test.py
 #
-# Limited functonality test for trim workloads using fio's sg ioengine
+# Limited functionality test for trim workloads using fio's sg ioengine
 # This checks only the three sets of reported iodepths
 #
 # !!!WARNING!!!
diff --git a/t/steadystate_tests.py b/t/steadystate_tests.py
index e8bd768c..d6ffd177 100755
--- a/t/steadystate_tests.py
+++ b/t/steadystate_tests.py
@@ -2,7 +2,7 @@
 #
 # steadystate_tests.py
 #
-# Test option parsing and functonality for fio's steady state detection feature.
+# Test option parsing and functionality for fio's steady state detection feature.
 #
 # steadystate_tests.py --read file-for-read-testing --write file-for-write-testing ./fio
 #
diff --git a/t/time-test.c b/t/time-test.c
index a74d9206..3c87d4d4 100644
--- a/t/time-test.c
+++ b/t/time-test.c
@@ -67,7 +67,7 @@
  *	accuracy because the (ticks * clock_mult) product used for final
  *	fractional chunk
  *
- *  iv) 64-bit arithmetic with the clock ticks to nsec conversion occuring in
+ *  iv) 64-bit arithmetic with the clock ticks to nsec conversion occurring in
  *	two stages. This is carried out using locks to update the number of
  *	large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed.
  *
diff --git a/tools/fio_generate_plots b/tools/fio_generate_plots
index e4558788..468cf27a 100755
--- a/tools/fio_generate_plots
+++ b/tools/fio_generate_plots
@@ -21,7 +21,7 @@ if [ -z "$1" ]; then
 	exit 1
 fi
 
-GNUPLOT=$(which gnuplot)
+GNUPLOT=$(command -v gnuplot)
 if [ ! -x "$GNUPLOT" ]
 then
 	echo You need gnuplot installed to generate graphs
diff --git a/tools/fio_jsonplus_clat2csv b/tools/fio_jsonplus_clat2csv
index 7f310fcc..8fdd014d 100755
--- a/tools/fio_jsonplus_clat2csv
+++ b/tools/fio_jsonplus_clat2csv
@@ -135,7 +135,7 @@ def more_bins(indices, bins):
 
     Returns:
         True if the indices do not yet point to the end of each bin in bins.
-        False if the indices point beyond their repsective bins.
+        False if the indices point beyond their respective bins.
     """
 
     for key, value in six.iteritems(indices):
@@ -160,7 +160,7 @@ def debug_print(debug, *args):
 def get_csvfile(dest, jobnum):
     """Generate CSV filename from command-line arguments and job numbers.
 
-    Paramaters:
+    Parameters:
         dest        file specification for CSV filename.
         jobnum      job number.
 
diff --git a/tools/fiograph/fiograph.py b/tools/fiograph/fiograph.py
index b5669a2d..384decda 100755
--- a/tools/fiograph/fiograph.py
+++ b/tools/fiograph/fiograph.py
@@ -218,7 +218,7 @@ def fio_to_graphviz(filename, format):
     # The first job will be a new execution group
     new_execution_group = True
 
-    # Let's interate on all sections to create links between them
+    # Let's iterate on all sections to create links between them
     for section_name in fio_file.sections():
         # The current section
         section = fio_file[section_name]
diff --git a/tools/genfio b/tools/genfio
index 8518bbcc..c9bc2f76 100755
--- a/tools/genfio
+++ b/tools/genfio
@@ -22,7 +22,8 @@
 BLK_SIZE=
 BLOCK_SIZE=4k
 SEQ=-1
-TEMPLATE=/tmp/template.fio
+TEMPLATE=$(mktemp "${TMPDIR:-${TEMP:-/tmp}}/template.fio.XXXXXX") || exit $?
+trap 'rm -f "$TEMPLATE"' EXIT
 OUTFILE=
 DISKS=
 PRINTABLE_DISKS=
@@ -48,7 +49,7 @@ show_help() {
 					one test after another then one disk after another
 					Disabled by default
 -p				: Run parallel test
-					one test after anoter but all disks at the same time
+					one test after another but all disks at the same time
 					Enabled by default
 -D iodepth			: Run with the specified iodepth
 					Default is $IODEPTH
diff --git a/tools/hist/fio-histo-log-pctiles.py b/tools/hist/fio-histo-log-pctiles.py
index 08e7722d..b5d167de 100755
--- a/tools/hist/fio-histo-log-pctiles.py
+++ b/tools/hist/fio-histo-log-pctiles.py
@@ -748,7 +748,7 @@ if unittest2_imported:
     def test_e2_get_pctiles_highest_pct(self):
         fio_v3_bucket_count = 29 * 64
         with open(self.fn, 'w') as f:
-            # make a empty fio v3 histogram
+            # make an empty fio v3 histogram
             buckets = [ 0 for j in range(0, fio_v3_bucket_count) ]
             # add one I/O request to last bucket
             buckets[-1] = 1
diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot
index d2dc81df..ce3ca2cc 100755
--- a/tools/plot/fio2gnuplot
+++ b/tools/plot/fio2gnuplot
@@ -492,8 +492,8 @@ def main(argv):
     #We need to adjust the output filename regarding the pattern required by the user
     if (pattern_set_by_user == True):
         gnuplot_output_filename=pattern
-        # As we do have some glob in the pattern, let's make this simpliest
-        # We do remove the simpliest parts of the expression to get a clear file name
+        # As we do have some glob in the pattern, let's make this simplest
+        # We do remove the simplest parts of the expression to get a clear file name
         gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-')
         gnuplot_output_filename=gnuplot_output_filename.replace('*','-')
         gnuplot_output_filename=gnuplot_output_filename.replace('--','-')
diff --git a/tools/plot/fio2gnuplot.1 b/tools/plot/fio2gnuplot.1
index 6fb1283f..bfa10d26 100644
--- a/tools/plot/fio2gnuplot.1
+++ b/tools/plot/fio2gnuplot.1
@@ -35,7 +35,7 @@ The resulting graph helps at understanding trends.
 .TP
 .B
 Grouped 2D graph
-All files are plotted in a single image to ease the comparaison. The same rendering options as per the individual 2D graph are used :
+All files are plotted in a single image to ease the comparison. The same rendering options as per the individual 2D graph are used :
 .RS
 .IP \(bu 3
 raw
diff --git a/tools/plot/fio2gnuplot.manpage b/tools/plot/fio2gnuplot.manpage
index 6a12cf81..be3f13c2 100644
--- a/tools/plot/fio2gnuplot.manpage
+++ b/tools/plot/fio2gnuplot.manpage
@@ -20,7 +20,7 @@ DESCRIPTION
                     	The resulting graph helps at understanding trends.
 
  Grouped 2D graph   
-	All files are plotted in a single image to ease the comparaison. The same rendering options as per the individual 2D graph are used :
+	All files are plotted in a single image to ease the comparison. The same rendering options as per the individual 2D graph are used :
          - raw
          - smooth
          - trend

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-19 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-19 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit c99c81adb3510a8dc34d47fd40b19ef657e32192:

  Correct F_FULLSYNC -> F_FULLFSYNC (2022-02-17 12:53:59 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 933651ec130ce4d27a5c249d649d20afeb2bdf38:

  Merge branch 'rpma-update-RPMA-engines-with-new-librpma-completions-API' of https://github.com/ldorau/fio (2022-02-18 09:02:03 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'rpma-update-RPMA-engines-with-new-librpma-completions-API' of https://github.com/ldorau/fio

Lukasz Dorau (1):
      rpma: RPMA engines require librpma>=v0.11.0 with rpma_cq_get_wc()

Oksana Salyk (1):
      rpma: update RPMA engines with new librpma completions API

 configure               |  4 ++--
 engines/librpma_apm.c   |  8 +++-----
 engines/librpma_fio.c   | 46 +++++++++++++++++++++++++++++-----------------
 engines/librpma_fio.h   | 16 +++++++++-------
 engines/librpma_gpspm.c | 39 ++++++++++++++++++---------------------
 5 files changed, 61 insertions(+), 52 deletions(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index 6160d84d..be4605f9 100755
--- a/configure
+++ b/configure
@@ -974,7 +974,7 @@ print_config "rdmacm" "$rdmacm"
 
 ##########################################
 # librpma probe
-# The librpma engine requires librpma>=v0.10.0 with rpma_mr_advise().
+# The librpma engines require librpma>=v0.11.0 with rpma_cq_get_wc().
 if test "$librpma" != "yes" ; then
   librpma="no"
 fi
@@ -982,7 +982,7 @@ cat > $TMPC << EOF
 #include <librpma.h>
 int main(void)
 {
-  void *ptr = rpma_mr_advise;
+  void *ptr = rpma_cq_get_wc;
   (void) ptr; /* unused */
   return 0;
 }
diff --git a/engines/librpma_apm.c b/engines/librpma_apm.c
index ffa3769d..d1166ad8 100644
--- a/engines/librpma_apm.c
+++ b/engines/librpma_apm.c
@@ -22,8 +22,7 @@ static inline int client_io_flush(struct thread_data *td,
 		struct io_u *first_io_u, struct io_u *last_io_u,
 		unsigned long long int len);
 
-static int client_get_io_u_index(struct rpma_completion *cmpl,
-		unsigned int *io_u_index);
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index);
 
 static int client_init(struct thread_data *td)
 {
@@ -188,10 +187,9 @@ static inline int client_io_flush(struct thread_data *td,
 	return 0;
 }
 
-static int client_get_io_u_index(struct rpma_completion *cmpl,
-		unsigned int *io_u_index)
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index)
 {
-	memcpy(io_u_index, &cmpl->op_context, sizeof(*io_u_index));
+	memcpy(io_u_index, &wc->wr_id, sizeof(*io_u_index));
 
 	return 1;
 }
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c
index 9d6ebf38..dfd82180 100644
--- a/engines/librpma_fio.c
+++ b/engines/librpma_fio.c
@@ -302,6 +302,12 @@ int librpma_fio_client_init(struct thread_data *td,
 	if (ccd->conn == NULL)
 		goto err_peer_delete;
 
+	/* get the connection's main CQ */
+	if ((ret = rpma_conn_get_cq(ccd->conn, &ccd->cq))) {
+		librpma_td_verror(td, ret, "rpma_conn_get_cq");
+		goto err_conn_delete;
+	}
+
 	/* get the connection's private data sent from the server */
 	if ((ret = rpma_conn_get_private_data(ccd->conn, &pdata))) {
 		librpma_td_verror(td, ret, "rpma_conn_get_private_data");
@@ -455,7 +461,7 @@ static enum fio_q_status client_queue_sync(struct thread_data *td,
 		struct io_u *io_u)
 {
 	struct librpma_fio_client_data *ccd = td->io_ops_data;
-	struct rpma_completion cmpl;
+	struct ibv_wc wc;
 	unsigned io_u_index;
 	int ret;
 
@@ -478,31 +484,31 @@ static enum fio_q_status client_queue_sync(struct thread_data *td,
 
 	do {
 		/* get a completion */
-		ret = rpma_conn_completion_get(ccd->conn, &cmpl);
+		ret = rpma_cq_get_wc(ccd->cq, 1, &wc, NULL);
 		if (ret == RPMA_E_NO_COMPLETION) {
 			/* lack of completion is not an error */
 			continue;
 		} else if (ret != 0) {
 			/* an error occurred */
-			librpma_td_verror(td, ret, "rpma_conn_completion_get");
+			librpma_td_verror(td, ret, "rpma_cq_get_wc");
 			goto err;
 		}
 
 		/* if io_us has completed with an error */
-		if (cmpl.op_status != IBV_WC_SUCCESS)
+		if (wc.status != IBV_WC_SUCCESS)
 			goto err;
 
-		if (cmpl.op == RPMA_OP_SEND)
+		if (wc.opcode == IBV_WC_SEND)
 			++ccd->op_send_completed;
 		else {
-			if (cmpl.op == RPMA_OP_RECV)
+			if (wc.opcode == IBV_WC_RECV)
 				++ccd->op_recv_completed;
 
 			break;
 		}
 	} while (1);
 
-	if (ccd->get_io_u_index(&cmpl, &io_u_index) != 1)
+	if (ccd->get_io_u_index(&wc, &io_u_index) != 1)
 		goto err;
 
 	if (io_u->index != io_u_index) {
@@ -654,8 +660,8 @@ int librpma_fio_client_commit(struct thread_data *td)
 static int client_getevent_process(struct thread_data *td)
 {
 	struct librpma_fio_client_data *ccd = td->io_ops_data;
-	struct rpma_completion cmpl;
-	/* io_u->index of completed io_u (cmpl.op_context) */
+	struct ibv_wc wc;
+	/* io_u->index of completed io_u (wc.wr_id) */
 	unsigned int io_u_index;
 	/* # of completed io_us */
 	int cmpl_num = 0;
@@ -665,7 +671,7 @@ static int client_getevent_process(struct thread_data *td)
 	int ret;
 
 	/* get a completion */
-	if ((ret = rpma_conn_completion_get(ccd->conn, &cmpl))) {
+	if ((ret = rpma_cq_get_wc(ccd->cq, 1, &wc, NULL))) {
 		/* lack of completion is not an error */
 		if (ret == RPMA_E_NO_COMPLETION) {
 			/* lack of completion is not an error */
@@ -673,22 +679,22 @@ static int client_getevent_process(struct thread_data *td)
 		}
 
 		/* an error occurred */
-		librpma_td_verror(td, ret, "rpma_conn_completion_get");
+		librpma_td_verror(td, ret, "rpma_cq_get_wc");
 		return -1;
 	}
 
 	/* if io_us has completed with an error */
-	if (cmpl.op_status != IBV_WC_SUCCESS) {
-		td->error = cmpl.op_status;
+	if (wc.status != IBV_WC_SUCCESS) {
+		td->error = wc.status;
 		return -1;
 	}
 
-	if (cmpl.op == RPMA_OP_SEND)
+	if (wc.opcode == IBV_WC_SEND)
 		++ccd->op_send_completed;
-	else if (cmpl.op == RPMA_OP_RECV)
+	else if (wc.opcode == IBV_WC_RECV)
 		++ccd->op_recv_completed;
 
-	if ((ret = ccd->get_io_u_index(&cmpl, &io_u_index)) != 1)
+	if ((ret = ccd->get_io_u_index(&wc, &io_u_index)) != 1)
 		return ret;
 
 	/* look for an io_u being completed */
@@ -750,7 +756,7 @@ int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
 
 			/*
 			 * To reduce CPU consumption one can use
-			 * the rpma_conn_completion_wait() function.
+			 * the rpma_cq_wait() function.
 			 * Note this greatly increase the latency
 			 * and make the results less stable.
 			 * The bandwidth stays more or less the same.
@@ -1029,6 +1035,12 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
 	csd->ws_ptr = ws_ptr;
 	csd->conn = conn;
 
+	/* get the connection's main CQ */
+	if ((ret = rpma_conn_get_cq(csd->conn, &csd->cq))) {
+		librpma_td_verror(td, ret, "rpma_conn_get_cq");
+		goto err_conn_delete;
+	}
+
 	return 0;
 
 err_conn_delete:
diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h
index 2c507e9c..91290235 100644
--- a/engines/librpma_fio.h
+++ b/engines/librpma_fio.h
@@ -94,12 +94,13 @@ typedef int (*librpma_fio_flush_t)(struct thread_data *td,
  * - ( 0) - skip
  * - (-1) - on error
  */
-typedef int (*librpma_fio_get_io_u_index_t)(struct rpma_completion *cmpl,
+typedef int (*librpma_fio_get_io_u_index_t)(struct ibv_wc *wc,
 		unsigned int *io_u_index);
 
 struct librpma_fio_client_data {
 	struct rpma_peer *peer;
 	struct rpma_conn *conn;
+	struct rpma_cq *cq;
 
 	/* aligned td->orig_buffer */
 	char *orig_buffer_aligned;
@@ -199,29 +200,29 @@ static inline int librpma_fio_client_io_complete_all_sends(
 		struct thread_data *td)
 {
 	struct librpma_fio_client_data *ccd = td->io_ops_data;
-	struct rpma_completion cmpl;
+	struct ibv_wc wc;
 	int ret;
 
 	while (ccd->op_send_posted != ccd->op_send_completed) {
 		/* get a completion */
-		ret = rpma_conn_completion_get(ccd->conn, &cmpl);
+		ret = rpma_cq_get_wc(ccd->cq, 1, &wc, NULL);
 		if (ret == RPMA_E_NO_COMPLETION) {
 			/* lack of completion is not an error */
 			continue;
 		} else if (ret != 0) {
 			/* an error occurred */
-			librpma_td_verror(td, ret, "rpma_conn_completion_get");
+			librpma_td_verror(td, ret, "rpma_cq_get_wc");
 			break;
 		}
 
-		if (cmpl.op_status != IBV_WC_SUCCESS)
+		if (wc.status != IBV_WC_SUCCESS)
 			return -1;
 
-		if (cmpl.op == RPMA_OP_SEND)
+		if (wc.opcode == IBV_WC_SEND)
 			++ccd->op_send_completed;
 		else {
 			log_err(
-				"A completion other than RPMA_OP_SEND got during cleaning up the CQ from SENDs\n");
+				"A completion other than IBV_WC_SEND got during cleaning up the CQ from SENDs\n");
 			return -1;
 		}
 	}
@@ -251,6 +252,7 @@ struct librpma_fio_server_data {
 
 	/* resources of an incoming connection */
 	struct rpma_conn *conn;
+	struct rpma_cq *cq;
 
 	char *ws_ptr;
 	struct rpma_mr_local *ws_mr;
diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c
index 74147709..14626e7f 100644
--- a/engines/librpma_gpspm.c
+++ b/engines/librpma_gpspm.c
@@ -60,8 +60,7 @@ static inline int client_io_flush(struct thread_data *td,
 		struct io_u *first_io_u, struct io_u *last_io_u,
 		unsigned long long int len);
 
-static int client_get_io_u_index(struct rpma_completion *cmpl,
-		unsigned int *io_u_index);
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index);
 
 static int client_init(struct thread_data *td)
 {
@@ -317,17 +316,16 @@ static inline int client_io_flush(struct thread_data *td,
 	return 0;
 }
 
-static int client_get_io_u_index(struct rpma_completion *cmpl,
-		unsigned int *io_u_index)
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index)
 {
 	GPSPMFlushResponse *flush_resp;
 
-	if (cmpl->op != RPMA_OP_RECV)
+	if (wc->opcode != IBV_WC_RECV)
 		return 0;
 
 	/* unpack a response from the received buffer */
 	flush_resp = gpspm_flush_response__unpack(NULL,
-			cmpl->byte_len, cmpl->op_context);
+			wc->byte_len, (void *)wc->wr_id);
 	if (flush_resp == NULL) {
 		log_err("Cannot unpack the flush response buffer\n");
 		return -1;
@@ -373,7 +371,7 @@ struct server_data {
 	uint32_t msg_sqe_available; /* # of free SQ slots */
 
 	/* in-memory queues */
-	struct rpma_completion *msgs_queued;
+	struct ibv_wc *msgs_queued;
 	uint32_t msg_queued_nr;
 };
 
@@ -562,8 +560,7 @@ err_cfg_delete:
 	return ret;
 }
 
-static int server_qe_process(struct thread_data *td,
-		struct rpma_completion *cmpl)
+static int server_qe_process(struct thread_data *td, struct ibv_wc *wc)
 {
 	struct librpma_fio_server_data *csd = td->io_ops_data;
 	struct server_data *sd = csd->server_data;
@@ -580,7 +577,7 @@ static int server_qe_process(struct thread_data *td,
 	int ret;
 
 	/* calculate SEND/RECV pair parameters */
-	msg_index = (int)(uintptr_t)cmpl->op_context;
+	msg_index = (int)(uintptr_t)wc->wr_id;
 	io_u_buff_offset = IO_U_BUFF_OFF_SERVER(msg_index);
 	send_buff_offset = io_u_buff_offset + SEND_OFFSET;
 	recv_buff_offset = io_u_buff_offset + RECV_OFFSET;
@@ -588,7 +585,7 @@ static int server_qe_process(struct thread_data *td,
 	recv_buff_ptr = sd->orig_buffer_aligned + recv_buff_offset;
 
 	/* unpack a flush request from the received buffer */
-	flush_req = gpspm_flush_request__unpack(NULL, cmpl->byte_len,
+	flush_req = gpspm_flush_request__unpack(NULL, wc->byte_len,
 			recv_buff_ptr);
 	if (flush_req == NULL) {
 		log_err("cannot unpack the flush request buffer\n");
@@ -682,28 +679,28 @@ static int server_cmpl_process(struct thread_data *td)
 {
 	struct librpma_fio_server_data *csd = td->io_ops_data;
 	struct server_data *sd = csd->server_data;
-	struct rpma_completion *cmpl = &sd->msgs_queued[sd->msg_queued_nr];
+	struct ibv_wc *wc = &sd->msgs_queued[sd->msg_queued_nr];
 	struct librpma_fio_options_values *o = td->eo;
 	int ret;
 
-	ret = rpma_conn_completion_get(csd->conn, cmpl);
+	ret = rpma_cq_get_wc(csd->cq, 1, wc, NULL);
 	if (ret == RPMA_E_NO_COMPLETION) {
 		if (o->busy_wait_polling == 0) {
-			ret = rpma_conn_completion_wait(csd->conn);
+			ret = rpma_cq_wait(csd->cq);
 			if (ret == RPMA_E_NO_COMPLETION) {
 				/* lack of completion is not an error */
 				return 0;
 			} else if (ret != 0) {
-				librpma_td_verror(td, ret, "rpma_conn_completion_wait");
+				librpma_td_verror(td, ret, "rpma_cq_wait");
 				goto err_terminate;
 			}
 
-			ret = rpma_conn_completion_get(csd->conn, cmpl);
+			ret = rpma_cq_get_wc(csd->cq, 1, wc, NULL);
 			if (ret == RPMA_E_NO_COMPLETION) {
 				/* lack of completion is not an error */
 				return 0;
 			} else if (ret != 0) {
-				librpma_td_verror(td, ret, "rpma_conn_completion_get");
+				librpma_td_verror(td, ret, "rpma_cq_get_wc");
 				goto err_terminate;
 			}
 		} else {
@@ -711,17 +708,17 @@ static int server_cmpl_process(struct thread_data *td)
 			return 0;
 		}
 	} else if (ret != 0) {
-		librpma_td_verror(td, ret, "rpma_conn_completion_get");
+		librpma_td_verror(td, ret, "rpma_cq_get_wc");
 		goto err_terminate;
 	}
 
 	/* validate the completion */
-	if (cmpl->op_status != IBV_WC_SUCCESS)
+	if (wc->status != IBV_WC_SUCCESS)
 		goto err_terminate;
 
-	if (cmpl->op == RPMA_OP_RECV)
+	if (wc->opcode == IBV_WC_RECV)
 		++sd->msg_queued_nr;
-	else if (cmpl->op == RPMA_OP_SEND)
+	else if (wc->opcode == IBV_WC_SEND)
 		++sd->msg_sqe_available;
 
 	return 0;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-18 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-18 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6a16e9e9531a5f746c4e2fe43873de1db434b4fc:

  diskutil: include limits.h for PATH_MAX (2022-02-15 17:17:30 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to c99c81adb3510a8dc34d47fd40b19ef657e32192:

  Correct F_FULLSYNC -> F_FULLFSYNC (2022-02-17 12:53:59 -0700)

----------------------------------------------------------------
Jens Axboe (4):
      t/io_uring: allow non-power-of-2 queue depths
      t/io_uring: align buffers correctly on non-4k page sizes
      Use fcntl(..., F_FULLSYNC) if available
      Correct F_FULLSYNC -> F_FULLFSYNC

 configure    | 22 ++++++++++++++++++++++
 io_u.c       |  4 ++++
 t/io_uring.c | 15 ++++++++++-----
 3 files changed, 36 insertions(+), 5 deletions(-)

---

Diff of recent changes:

diff --git a/configure b/configure
index 0efde7d6..6160d84d 100755
--- a/configure
+++ b/configure
@@ -645,6 +645,25 @@ if compile_prog "" "-lz" "zlib" ; then
 fi
 print_config "zlib" "$zlib"
 
+##########################################
+# fcntl(F_FULLFSYNC) support
+if test "$fcntl_sync" != "yes" ; then
+  fcntl_sync="no"
+fi
+cat > $TMPC << EOF
+#include <unistd.h>
+#include <fcntl.h>
+
+int main(int argc, char **argv)
+{
+  return fcntl(0, F_FULLFSYNC);
+}
+EOF
+if compile_prog "" "" "fcntl(F_FULLFSYNC)" ; then
+    fcntl_sync="yes"
+fi
+print_config "fcntl(F_FULLFSYNC)" "$fcntl_sync"
+
 ##########################################
 # linux-aio probe
 if test "$libaio" != "yes" ; then
@@ -3174,6 +3193,9 @@ fi
 if test "$pdb" = yes; then
   output_sym "CONFIG_PDB"
 fi
+if test "$fcntl_sync" = "yes" ; then
+  output_sym "CONFIG_FCNTL_SYNC"
+fi
 
 print_config "Lib-based ioengines dynamic" "$dynamic_engines"
 cat > $TMPC << EOF
diff --git a/io_u.c b/io_u.c
index 059637e5..806ceb77 100644
--- a/io_u.c
+++ b/io_u.c
@@ -2297,7 +2297,11 @@ int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
 	int ret;
 
 	if (io_u->ddir == DDIR_SYNC) {
+#ifdef CONFIG_FCNTL_SYNC
+		ret = fcntl(io_u->file->fd, F_FULLFSYNC);
+#else
 		ret = fsync(io_u->file->fd);
+#endif
 	} else if (io_u->ddir == DDIR_DATASYNC) {
 #ifdef CONFIG_FDATASYNC
 		ret = fdatasync(io_u->file->fd);
diff --git a/t/io_uring.c b/t/io_uring.c
index 4520de43..f513d7dc 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -364,7 +364,7 @@ static int io_uring_register_buffers(struct submitter *s)
 		return 0;
 
 	return syscall(__NR_io_uring_register, s->ring_fd,
-			IORING_REGISTER_BUFFERS, s->iovecs, depth);
+			IORING_REGISTER_BUFFERS, s->iovecs, roundup_pow2(depth));
 }
 
 static int io_uring_register_files(struct submitter *s)
@@ -962,7 +962,7 @@ static int setup_aio(struct submitter *s)
 		fixedbufs = register_files = 0;
 	}
 
-	return io_queue_init(depth, &s->aio_ctx);
+	return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
 #else
 	fprintf(stderr, "Legacy AIO not available on this system/build\n");
 	errno = EINVAL;
@@ -1156,6 +1156,7 @@ int main(int argc, char *argv[])
 	struct submitter *s;
 	unsigned long done, calls, reap;
 	int err, i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
+	long page_size;
 	struct file f;
 	char *fdepths;
 	void *ret;
@@ -1249,7 +1250,7 @@ int main(int argc, char *argv[])
 		dma_map = 0;
 
 	submitter = calloc(nthreads, sizeof(*submitter) +
-				depth * sizeof(struct iovec));
+				roundup_pow2(depth) * sizeof(struct iovec));
 	for (j = 0; j < nthreads; j++) {
 		s = get_submitter(j);
 		s->index = j;
@@ -1319,12 +1320,16 @@ int main(int argc, char *argv[])
 
 	arm_sig_int();
 
+	page_size = sysconf(_SC_PAGESIZE);
+	if (page_size < 0)
+		page_size = 4096;
+
 	for (j = 0; j < nthreads; j++) {
 		s = get_submitter(j);
-		for (i = 0; i < depth; i++) {
+		for (i = 0; i < roundup_pow2(depth); i++) {
 			void *buf;
 
-			if (posix_memalign(&buf, bs, bs)) {
+			if (posix_memalign(&buf, page_size, bs)) {
 				printf("failed alloc\n");
 				return 1;
 			}

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-16 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-16 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit a1db4528a59a99c5e2aa66091c505fb60e3a70ca:

  Merge branch 'fio-docs-ci' of https://github.com/vincentkfu/fio (2022-02-11 16:29:44 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6a16e9e9531a5f746c4e2fe43873de1db434b4fc:

  diskutil: include limits.h for PATH_MAX (2022-02-15 17:17:30 -0700)

----------------------------------------------------------------
Jens Axboe (4):
      Merge branch 'fix_bytesrate_eta' of https://github.com/PCPartPicker/fio
      Merge branch 'rand_nr_bugfix' of https://github.com/PCPartPicker/fio
      Merge branch 'check_min_rate_cleanup' of https://github.com/PCPartPicker/fio
      diskutil: include limits.h for PATH_MAX

Vincent Fu (1):
      ci: detect Windows installer build failures

aggieNick02 (3):
      Cleanup __check_min_rate
      Fix ETA display when rate and/or rate_min are specified
      Fix :<nr> suffix with random read/write causing 0 initial offset

 .appveyor.yml |  1 +
 backend.c     | 81 ++++++++++++++++++++---------------------------------------
 diskutil.h    |  2 ++
 eta.c         |  5 ++--
 fio.h         |  6 ++---
 init.c        |  9 ++++++-
 libfio.c      |  4 +--
 7 files changed, 46 insertions(+), 62 deletions(-)

---

Diff of recent changes:

diff --git a/.appveyor.yml b/.appveyor.yml
index 42b79958..b94eefe3 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -44,6 +44,7 @@ after_build:
   - file.exe fio.exe
   - make.exe test
   - 'cd os\windows && dobuild.cmd %ARCHITECTURE% && cd ..'
+  - ls.exe ./os/windows/*.msi
   - ps: Get-ChildItem .\os\windows\*.msi | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name -DeploymentName fio.msi }
 
 test_script:
diff --git a/backend.c b/backend.c
index c035baed..a21dfef6 100644
--- a/backend.c
+++ b/backend.c
@@ -136,13 +136,10 @@ static void set_sig_handlers(void)
 static bool __check_min_rate(struct thread_data *td, struct timespec *now,
 			     enum fio_ddir ddir)
 {
-	unsigned long long bytes = 0;
-	unsigned long iops = 0;
-	unsigned long spent;
-	unsigned long long rate;
-	unsigned long long ratemin = 0;
-	unsigned int rate_iops = 0;
-	unsigned int rate_iops_min = 0;
+	unsigned long long current_rate_check_bytes = td->this_io_bytes[ddir];
+	unsigned long current_rate_check_blocks = td->this_io_blocks[ddir];
+	unsigned long long option_rate_bytes_min = td->o.ratemin[ddir];
+	unsigned int option_rate_iops_min = td->o.rate_iops_min[ddir];
 
 	assert(ddir_rw(ddir));
 
@@ -155,68 +152,44 @@ static bool __check_min_rate(struct thread_data *td, struct timespec *now,
 	if (mtime_since(&td->start, now) < 2000)
 		return false;
 
-	iops += td->this_io_blocks[ddir];
-	bytes += td->this_io_bytes[ddir];
-	ratemin += td->o.ratemin[ddir];
-	rate_iops += td->o.rate_iops[ddir];
-	rate_iops_min += td->o.rate_iops_min[ddir];
-
 	/*
-	 * if rate blocks is set, sample is running
+	 * if last_rate_check_blocks or last_rate_check_bytes is set,
+	 * we can compute a rate per ratecycle
 	 */
-	if (td->rate_bytes[ddir] || td->rate_blocks[ddir]) {
-		spent = mtime_since(&td->lastrate[ddir], now);
-		if (spent < td->o.ratecycle)
+	if (td->last_rate_check_bytes[ddir] || td->last_rate_check_blocks[ddir]) {
+		unsigned long spent = mtime_since(&td->last_rate_check_time[ddir], now);
+		if (spent < td->o.ratecycle || spent==0)
 			return false;
 
-		if (td->o.rate[ddir] || td->o.ratemin[ddir]) {
+		if (td->o.ratemin[ddir]) {
 			/*
 			 * check bandwidth specified rate
 			 */
-			if (bytes < td->rate_bytes[ddir]) {
-				log_err("%s: rate_min=%lluB/s not met, only transferred %lluB\n",
-					td->o.name, ratemin, bytes);
+			unsigned long long current_rate_bytes =
+				((current_rate_check_bytes - td->last_rate_check_bytes[ddir]) * 1000) / spent;
+			if (current_rate_bytes < option_rate_bytes_min) {
+				log_err("%s: rate_min=%lluB/s not met, got %lluB/s\n",
+					td->o.name, option_rate_bytes_min, current_rate_bytes);
 				return true;
-			} else {
-				if (spent)
-					rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent;
-				else
-					rate = 0;
-
-				if (rate < ratemin ||
-				    bytes < td->rate_bytes[ddir]) {
-					log_err("%s: rate_min=%lluB/s not met, got %lluB/s\n",
-						td->o.name, ratemin, rate);
-					return true;
-				}
 			}
 		} else {
 			/*
 			 * checks iops specified rate
 			 */
-			if (iops < rate_iops) {
-				log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n",
-						td->o.name, rate_iops, iops);
+			unsigned long long current_rate_iops =
+				((current_rate_check_blocks - td->last_rate_check_blocks[ddir]) * 1000) / spent;
+
+			if (current_rate_iops < option_rate_iops_min) {
+				log_err("%s: rate_iops_min=%u not met, got %llu IOPS\n",
+					td->o.name, option_rate_iops_min, current_rate_iops);
 				return true;
-			} else {
-				if (spent)
-					rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent;
-				else
-					rate = 0;
-
-				if (rate < rate_iops_min ||
-				    iops < td->rate_blocks[ddir]) {
-					log_err("%s: rate_iops_min=%u not met, got %llu IOPS\n",
-						td->o.name, rate_iops_min, rate);
-					return true;
-				}
 			}
 		}
 	}
 
-	td->rate_bytes[ddir] = bytes;
-	td->rate_blocks[ddir] = iops;
-	memcpy(&td->lastrate[ddir], now, sizeof(*now));
+	td->last_rate_check_bytes[ddir] = current_rate_check_bytes;
+	td->last_rate_check_blocks[ddir] = current_rate_check_blocks;
+	memcpy(&td->last_rate_check_time[ddir], now, sizeof(*now));
 	return false;
 }
 
@@ -1845,11 +1818,11 @@ static void *thread_main(void *data)
 
 	if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] ||
 			o->ratemin[DDIR_TRIM]) {
-	        memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
+	        memcpy(&td->last_rate_check_time[DDIR_READ], &td->bw_sample_time,
 					sizeof(td->bw_sample_time));
-	        memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time,
+	        memcpy(&td->last_rate_check_time[DDIR_WRITE], &td->bw_sample_time,
 					sizeof(td->bw_sample_time));
-	        memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time,
+	        memcpy(&td->last_rate_check_time[DDIR_TRIM], &td->bw_sample_time,
 					sizeof(td->bw_sample_time));
 	}
 
diff --git a/diskutil.h b/diskutil.h
index 83bcbf89..7d7ef802 100644
--- a/diskutil.h
+++ b/diskutil.h
@@ -2,6 +2,8 @@
 #define FIO_DISKUTIL_H
 #define FIO_DU_NAME_SZ		64
 
+#include <limits.h>
+
 #include "helper_thread.h"
 #include "fio_sem.h"
 
diff --git a/eta.c b/eta.c
index ea1781f3..17970c78 100644
--- a/eta.c
+++ b/eta.c
@@ -420,6 +420,7 @@ bool calc_thread_status(struct jobs_eta *je, int force)
 		if (is_power_of_2(td->o.kb_base))
 			je->is_pow2 = 1;
 		je->unit_base = td->o.unit_base;
+		je->sig_figs = td->o.sig_figs;
 		if (td->o.bw_avg_time < bw_avg_time)
 			bw_avg_time = td->o.bw_avg_time;
 		if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING
@@ -600,9 +601,9 @@ void display_thread_status(struct jobs_eta *je)
 		char *tr, *mr;
 
 		mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2],
-				je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC);
+				je->sig_figs, 1, je->is_pow2, N2S_BYTEPERSEC);
 		tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2],
-				je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC);
+				je->sig_figs, 1, je->is_pow2, N2S_BYTEPERSEC);
 
 		p += sprintf(p, ", %s-%s", mr, tr);
 		free(tr);
diff --git a/fio.h b/fio.h
index 7b0ca843..88df117d 100644
--- a/fio.h
+++ b/fio.h
@@ -335,10 +335,10 @@ struct thread_data {
 	 */
 	uint64_t rate_bps[DDIR_RWDIR_CNT];
 	uint64_t rate_next_io_time[DDIR_RWDIR_CNT];
-	unsigned long long rate_bytes[DDIR_RWDIR_CNT];
-	unsigned long rate_blocks[DDIR_RWDIR_CNT];
+	unsigned long long last_rate_check_bytes[DDIR_RWDIR_CNT];
+	unsigned long last_rate_check_blocks[DDIR_RWDIR_CNT];
 	unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT];
-	struct timespec lastrate[DDIR_RWDIR_CNT];
+	struct timespec last_rate_check_time[DDIR_RWDIR_CNT];
 	int64_t last_usec[DDIR_RWDIR_CNT];
 	struct frand_state poisson_state[DDIR_RWDIR_CNT];
 
diff --git a/init.c b/init.c
index 13935152..81c30f8c 100644
--- a/init.c
+++ b/init.c
@@ -1576,7 +1576,14 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
 	td->ts.sig_figs = o->sig_figs;
 
 	init_thread_stat_min_vals(&td->ts);
-	td->ddir_seq_nr = o->ddir_seq_nr;
+
+	/*
+	 * td->ddir_seq_nr needs to be initialized to 1, NOT o->ddir_seq_nr,
+	 * so that get_next_offset gets a new random offset the first time it
+	 * is called, instead of keeping an initial offset of 0 for the first
+	 * nr-1 calls
+	 */
+	td->ddir_seq_nr = 1;
 
 	if ((o->stonewall || o->new_group) && prev_group_jobs) {
 		prev_group_jobs = 0;
diff --git a/libfio.c b/libfio.c
index 01fa7452..1a891776 100644
--- a/libfio.c
+++ b/libfio.c
@@ -87,8 +87,8 @@ static void reset_io_counters(struct thread_data *td, int all)
 			td->this_io_bytes[ddir] = 0;
 			td->stat_io_blocks[ddir] = 0;
 			td->this_io_blocks[ddir] = 0;
-			td->rate_bytes[ddir] = 0;
-			td->rate_blocks[ddir] = 0;
+			td->last_rate_check_bytes[ddir] = 0;
+			td->last_rate_check_blocks[ddir] = 0;
 			td->bytes_done[ddir] = 0;
 			td->rate_io_issue_bytes[ddir] = 0;
 			td->rate_next_io_time[ddir] = 0;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-12 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-12 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit df597be63e26ef59c1538b3ce2026c83684ff7fb:

  fio: really use LDFLAGS when linking dynamic engines (2022-02-08 09:28:30 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a1db4528a59a99c5e2aa66091c505fb60e3a70ca:

  Merge branch 'fio-docs-ci' of https://github.com/vincentkfu/fio (2022-02-11 16:29:44 -0700)

----------------------------------------------------------------
Jens Axboe (4):
      t/io_uring: avoid unused `nr_batch` warning
      Add aarch64 cpu clock support
      Merge branch 'fio_offload_fixes' of https://github.com/PCPartPicker/fio
      Merge branch 'fio-docs-ci' of https://github.com/vincentkfu/fio

Vincent Fu (8):
      docs: document cpumode option for the cpuio ioengine
      docs: update Makefile in order to detect build failures
      docs: rename HOWTO to HOWTO.rst
      HOWTO: combine multiple pool option listings
      HOWTO: combine separate hipri listings into a single one
      HOWTO: combine two chunk_size listings into a single one
      ci: install sphinx packages and add doc building to GitHub Actions
      windows: update the installer build for renamed files

aggieNick02 (1):
      Fix issues (assert or uninit var, hang) with check_min_rate and offloading

 HOWTO => HOWTO.rst      | 126 ++++++++++++++++++++++++++++--------------------
 arch/arch-aarch64.h     |  17 +++++++
 backend.c               |   9 +++-
 ci/actions-full-test.sh |   1 +
 ci/actions-install.sh   |   3 +-
 doc/Makefile            |   2 +-
 doc/fio_doc.rst         |   2 +-
 doc/fio_man.rst         |   2 +-
 fio.1                   |  13 +++++
 os/windows/install.wxs  |   4 +-
 t/io_uring.c            |   9 ++--
 11 files changed, 124 insertions(+), 64 deletions(-)
 rename HOWTO => HOWTO.rst (99%)

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO.rst
similarity index 99%
rename from HOWTO
rename to HOWTO.rst
index 74ba7216..ac1f3478 100644
--- a/HOWTO
+++ b/HOWTO.rst
@@ -2137,8 +2137,10 @@ I/O engine
 			Asynchronous read and write using DDN's Infinite Memory Engine (IME).
 			This engine will try to stack as much IOs as possible by creating
 			requests for IME. FIO will then decide when to commit these requests.
+
 		**libiscsi**
 			Read and write iscsi lun with libiscsi.
+
 		**nbd**
 			Read and write a Network Block Device (NBD).
 
@@ -2149,6 +2151,7 @@ I/O engine
 			unless :option:`verify` is set or :option:`cuda_io` is `posix`.
 			:option:`iomem` must not be `cudamalloc`. This ioengine defines
 			engine specific options.
+
 		**dfs**
 			I/O engine supporting asynchronous read and write operations to the
 			DAOS File System (DFS) via libdfs.
@@ -2175,8 +2178,8 @@ with the caveat that when used on the command line, they must come after the
     Set the percentage of I/O that will be issued with the highest priority.
     Default: 0. A single value applies to reads and writes. Comma-separated
     values may be specified for reads and writes. For this option to be
-    effective, NCQ priority must be supported and enabled, and `direct=1'
-    option must be used. fio must also be run as the root user. Unlike
+    effective, NCQ priority must be supported and enabled, and the :option:`direct`
+    option must be set. fio must also be run as the root user. Unlike
     slat/clat/lat stats, which can be tracked and reported independently, per
     priority stats only track and report a single type of latency. By default,
     completion latency (clat) will be reported, if :option:`lat_percentiles` is
@@ -2207,6 +2210,7 @@ with the caveat that when used on the command line, they must come after the
 	meaning of priority may differ. See also the :option:`prio` option.
 
 .. option:: cmdprio_bssplit=str[,str] : [io_uring] [libaio]
+
 	To get a finer control over I/O priority, this option allows
 	specifying the percentage of IOs that must have a priority set
 	depending on the block size of the IO. This option is useful only
@@ -2243,14 +2247,6 @@ with the caveat that when used on the command line, they must come after the
     map and release for each IO. This is more efficient, and reduces the
     IO latency as well.
 
-.. option:: hipri : [io_uring]
-
-    If this option is set, fio will attempt to use polled IO completions.
-    Normal IO completions generate interrupts to signal the completion of
-    IO, polled completions do not. Hence they are require active reaping
-    by the application. The benefits are more efficient IO for high IOPS
-    scenarios, and lower latencies for low queue depth IO.
-
 .. option:: registerfiles : [io_uring]
 
 	With this option, fio registers the set of files being used with the
@@ -2271,6 +2267,33 @@ with the caveat that when used on the command line, they must come after the
 	When :option:`sqthread_poll` is set, this option provides a way to
 	define which CPU should be used for the polling thread.
 
+.. option:: hipri
+
+   [io_uring]
+
+        If this option is set, fio will attempt to use polled IO completions.
+        Normal IO completions generate interrupts to signal the completion of
+        IO, polled completions do not. Hence they require active reaping
+        by the application. The benefits are more efficient IO for high IOPS
+        scenarios, and lower latencies for low queue depth IO.
+
+   [pvsync2]
+
+	Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+	than normal.
+
+   [sg]
+
+	If this option is set, fio will attempt to use polled IO completions.
+	This will have a similar effect as (io_uring)hipri. Only SCSI READ and
+	WRITE commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor
+	VERIFY). Older versions of the Linux sg driver that do not support
+	hipri will simply ignore this flag and do normal IO. The Linux SCSI
+	Low Level Driver (LLD) that "owns" the device also needs to support
+	hipri (also known as iopoll and mq_poll). The MegaRAID driver is an
+	example of a SCSI LLD. Default: clear (0) which does normal
+	(interrupted based) IO.
+
 .. option:: userspace_reap : [libaio]
 
 	Normally, with the libaio engine in use, fio will use the
@@ -2279,11 +2302,6 @@ with the caveat that when used on the command line, they must come after the
 	reap events. The reaping mode is only enabled when polling for a minimum of
 	0 events (e.g. when :option:`iodepth_batch_complete` `=0`).
 
-.. option:: hipri : [pvsync2]
-
-	Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
-	than normal.
-
 .. option:: hipri_percentage : [pvsync2]
 
 	When hipri is set this determines the probability of a pvsync2 I/O being high
@@ -2318,6 +2336,16 @@ with the caveat that when used on the command line, they must come after the
 
 	Split the load into cycles of the given time. In microseconds.
 
+.. option:: cpumode=str : [cpuio]
+
+	Specify how to stress the CPU. It can take these two values:
+
+	**noop**
+		This is the default where the CPU executes noop instructions.
+	**qsort**
+		Replace the default noop instructions loop with a qsort algorithm to
+		consume more energy.
+
 .. option:: exit_on_io_done=bool : [cpuio]
 
 	Detect when I/O threads are done, then exit.
@@ -2444,10 +2472,6 @@ with the caveat that when used on the command line, they must come after the
 
 	Specifies the name of the RBD.
 
-.. option:: pool=str : [rbd,rados]
-
-	Specifies the name of the Ceph pool containing RBD or RADOS data.
-
 .. option:: clientname=str : [rbd,rados]
 
 	Specifies the username (without the 'client.' prefix) used to access the
@@ -2466,6 +2490,36 @@ with the caveat that when used on the command line, they must come after the
         Touching all objects affects ceph caches and likely impacts test results.
         Enabled by default.
 
+.. option:: pool=str :
+
+   [rbd,rados]
+
+	Specifies the name of the Ceph pool containing RBD or RADOS data.
+
+   [dfs]
+
+	Specify the label or UUID of the DAOS pool to connect to.
+
+.. option:: cont=str : [dfs]
+
+	Specify the label or UUID of the DAOS container to open.
+
+.. option:: chunk_size=int
+
+   [dfs]
+
+	Specify a different chunk size (in bytes) for the dfs file.
+	Use DAOS container's chunk size by default.
+
+   [libhdfs]
+
+	The size of the chunk to use for each file.
+
+.. option:: object_class=str : [dfs]
+
+	Specify a different object class for the dfs file.
+	Use DAOS container's object class by default.
+
 .. option:: skip_bad=bool : [mtd]
 
 	Skip operations against known bad blocks.
@@ -2474,10 +2528,6 @@ with the caveat that when used on the command line, they must come after the
 
 	libhdfs will create chunk in this HDFS directory.
 
-.. option:: chunk_size : [libhdfs]
-
-	The size of the chunk to use for each file.
-
 .. option:: verb=str : [rdma]
 
 	The RDMA verb to use on this side of the RDMA ioengine connection. Valid
@@ -2563,18 +2613,6 @@ with the caveat that when used on the command line, they must come after the
 	a valid stream identifier) fio will open a stream and then close it when done. Default
 	is 0.
 
-.. option:: hipri : [sg]
-
-	If this option is set, fio will attempt to use polled IO completions.
-	This will have a similar effect as (io_uring)hipri. Only SCSI READ and
-	WRITE commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor
-	VERIFY). Older versions of the Linux sg driver that do not support
-	hipri will simply ignore this flag and do normal IO. The Linux SCSI
-	Low Level Driver (LLD) that "owns" the device also needs to support
-	hipri (also known as iopoll and mq_poll). The MegaRAID driver is an
-	example of a SCSI LLD. Default: clear (0) which does normal
-	(interrupted based) IO.
-
 .. option:: http_host=str : [http]
 
 	Hostname to connect to. For S3, this could be the bucket hostname.
@@ -2654,24 +2692,6 @@ with the caveat that when used on the command line, they must come after the
 		GPU to RAM before a write and copied from RAM to GPU after a
 		read. :option:`verify` does not affect use of cudaMemcpy.
 
-.. option:: pool=str : [dfs]
-
-	Specify the label or UUID of the DAOS pool to connect to.
-
-.. option:: cont=str : [dfs]
-
-	Specify the label or UUID of the DAOS container to open.
-
-.. option:: chunk_size=int : [dfs]
-
-	Specificy a different chunk size (in bytes) for the dfs file.
-	Use DAOS container's chunk size by default.
-
-.. option:: object_class=str : [dfs]
-
-	Specificy a different object class for the dfs file.
-	Use DAOS container's object class by default.
-
 .. option:: nfs_url=str : [nfs]
 
 	URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
diff --git a/arch/arch-aarch64.h b/arch/arch-aarch64.h
index 2a86cc5a..94571709 100644
--- a/arch/arch-aarch64.h
+++ b/arch/arch-aarch64.h
@@ -27,4 +27,21 @@ static inline int arch_ffz(unsigned long bitmask)
 
 #define ARCH_HAVE_FFZ
 
+static inline unsigned long long get_cpu_clock(void)
+{
+	unsigned long val;
+
+	asm volatile("mrs %0, cntvct_el0" : "=r" (val));
+	return val;
+}
+#define ARCH_HAVE_CPU_CLOCK
+
+#define ARCH_HAVE_INIT
+extern bool tsc_reliable;
+static inline int arch_init(char *envp[])
+{
+	tsc_reliable = true;
+	return 0;
+}
+
 #endif
diff --git a/backend.c b/backend.c
index 061e3b32..c035baed 100644
--- a/backend.c
+++ b/backend.c
@@ -1091,8 +1091,10 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
 				td->rate_io_issue_bytes[__ddir] += blen;
 			}
 
-			if (should_check_rate(td))
+			if (should_check_rate(td)) {
 				td->rate_next_io_time[__ddir] = usec_for_io(td, __ddir);
+				fio_gettime(&comp_time, NULL);
+			}
 
 		} else {
 			ret = io_u_submit(td, io_u);
@@ -1172,8 +1174,11 @@ reap:
 								f->file_name);
 			}
 		}
-	} else
+	} else {
+		if (td->o.io_submit_mode == IO_MODE_OFFLOAD)
+			workqueue_flush(&td->io_wq);
 		cleanup_pending_aio(td);
+	}
 
 	/*
 	 * stop job if we failed doing any IO
diff --git a/ci/actions-full-test.sh b/ci/actions-full-test.sh
index 4ae1dba1..91790664 100755
--- a/ci/actions-full-test.sh
+++ b/ci/actions-full-test.sh
@@ -10,6 +10,7 @@ main() {
     else
         sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
     fi
+    make -C doc html
 }
 
 main
diff --git a/ci/actions-install.sh b/ci/actions-install.sh
index b3486a47..0e472717 100755
--- a/ci/actions-install.sh
+++ b/ci/actions-install.sh
@@ -60,6 +60,7 @@ DPKGCFG
     # care about the architecture.
     pkgs+=(
         python3-scipy
+	python3-sphinx
     )
 
     echo "Updating APT..."
@@ -78,7 +79,7 @@ install_macos() {
     #brew update >/dev/null 2>&1
     echo "Installing packages..."
     HOMEBREW_NO_AUTO_UPDATE=1 brew install cunit
-    pip3 install scipy six
+    pip3 install scipy six sphinx
 }
 
 main() {
diff --git a/doc/Makefile b/doc/Makefile
index 3b979f9a..a444d83a 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -2,7 +2,7 @@
 #
 
 # You can set these variables from the command line.
-SPHINXOPTS    =
+SPHINXOPTS    = -W --keep-going
 SPHINXBUILD   = sphinx-build
 PAPER         =
 BUILDDIR      = output
diff --git a/doc/fio_doc.rst b/doc/fio_doc.rst
index 8e1216f0..34e7fde9 100644
--- a/doc/fio_doc.rst
+++ b/doc/fio_doc.rst
@@ -5,7 +5,7 @@ fio - Flexible I/O tester rev. |version|
 .. include:: ../README.rst
 
 
-.. include:: ../HOWTO
+.. include:: ../HOWTO.rst
 
 
 
diff --git a/doc/fio_man.rst b/doc/fio_man.rst
index 44312f16..dc1d1c0d 100644
--- a/doc/fio_man.rst
+++ b/doc/fio_man.rst
@@ -9,4 +9,4 @@ Fio Manpage
 .. include:: ../README.rst
 
 
-.. include:: ../HOWTO
+.. include:: ../HOWTO.rst
diff --git a/fio.1 b/fio.1
index f32d7915..e23d4092 100644
--- a/fio.1
+++ b/fio.1
@@ -2091,6 +2091,19 @@ option when using cpuio I/O engine.
 .BI (cpuio)cpuchunks \fR=\fPint
 Split the load into cycles of the given time. In microseconds.
 .TP
+.BI (cpuio)cpumode \fR=\fPstr
+Specify how to stress the CPU. It can take these two values:
+.RS
+.RS
+.TP
+.B noop
+This is the default and directs the CPU to execute noop instructions.
+.TP
+.B qsort
+Replace the default noop instructions with a qsort algorithm to consume more energy.
+.RE
+.RE
+.TP
 .BI (cpuio)exit_on_io_done \fR=\fPbool
 Detect when I/O threads are done, then exit.
 .TP
diff --git a/os/windows/install.wxs b/os/windows/install.wxs
index 7773bb3b..f2753289 100755
--- a/os/windows/install.wxs
+++ b/os/windows/install.wxs
@@ -33,13 +33,13 @@
 						</Component>
 						<?endif?>
 						<Component>
-							<File Id="README" Name="README.txt" Source="..\..\README"/>
+							<File Id="README" Name="README.txt" Source="..\..\README.rst"/>
 						</Component>
 						<Component>
 							<File Id="REPORTING_BUGS" Name="REPORTING-BUGS.txt" Source="..\..\REPORTING-BUGS"/>
 						</Component>
 						<Component>
-							<File Id="HOWTO" Name="HOWTO.txt" Source="..\..\HOWTO"/>
+							<File Id="HOWTO" Name="HOWTO.txt" Source="..\..\HOWTO.rst"/>
 						</Component>
 						<Component>
 							<File Id="COPYING" Name="COPYING.txt" Source="..\..\COPYING"/>
diff --git a/t/io_uring.c b/t/io_uring.c
index faf5978c..4520de43 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -714,12 +714,15 @@ static int reap_events_aio(struct submitter *s, struct io_event *events, int evs
 static void *submitter_aio_fn(void *data)
 {
 	struct submitter *s = data;
-	int i, ret, prepped, nr_batch;
+	int i, ret, prepped;
 	struct iocb **iocbsptr;
 	struct iocb *iocbs;
 	struct io_event *events;
-
-	nr_batch = submitter_init(s);
+#ifdef ARCH_HAVE_CPU_CLOCK
+	int nr_batch = submitter_init(s);
+#else
+	submitter_init(s);
+#endif
 
 	iocbsptr = calloc(depth, sizeof(struct iocb *));
 	iocbs = calloc(depth, sizeof(struct iocb));

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-09 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-09 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit b65c1fc07d4794920224312c56c785de2f3f1692:

  t/io_uring: fix warnings for !ARCH_HAVE_CPU_CLOCK (2022-02-04 09:02:49 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to df597be63e26ef59c1538b3ce2026c83684ff7fb:

  fio: really use LDFLAGS when linking dynamic engines (2022-02-08 09:28:30 -0700)

----------------------------------------------------------------
Eric Sandeen (1):
      fio: really use LDFLAGS when linking dynamic engines

 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index 2432f519..0ab4f82c 100644
--- a/Makefile
+++ b/Makefile
@@ -295,7 +295,7 @@ define engine_template =
 $(1)_OBJS := $$($(1)_SRCS:.c=.o)
 $$($(1)_OBJS): CFLAGS := -fPIC $$($(1)_CFLAGS) $(CFLAGS)
 engines/fio-$(1).so: $$($(1)_OBJS)
-	$$(QUIET_LINK)$(CC) $(DYNAMIC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
+	$$(QUIET_LINK)$(CC) $(LDFLAGS) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
 ENGS_OBJS += engines/fio-$(1).so
 endef
 else # !CONFIG_DYNAMIC_ENGINES

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-05 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-05 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 62e9ece4d540ff2af865e4b43811f3150b8b846b:

  fio: use correct function declaration for set_epoch_time() (2022-02-03 16:06:59 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to b65c1fc07d4794920224312c56c785de2f3f1692:

  t/io_uring: fix warnings for !ARCH_HAVE_CPU_CLOCK (2022-02-04 09:02:49 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      t/io_uring: fix warnings for !ARCH_HAVE_CPU_CLOCK

Niklas Cassel (1):
      stat: make free_clat_prio_stats() safe against NULL

 stat.c       |  3 +++
 t/io_uring.c | 11 ++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

---

Diff of recent changes:

diff --git a/stat.c b/stat.c
index 0876222a..1764eebc 100644
--- a/stat.c
+++ b/stat.c
@@ -2041,6 +2041,9 @@ void free_clat_prio_stats(struct thread_stat *ts)
 {
 	enum fio_ddir ddir;
 
+	if (!ts)
+		return;
+
 	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 		sfree(ts->clat_prio[ddir]);
 		ts->clat_prio[ddir] = NULL;
diff --git a/t/io_uring.c b/t/io_uring.c
index e8365a79..faf5978c 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -287,6 +287,7 @@ out:
 	free(ovals);
 }
 
+#ifdef ARCH_HAVE_CPU_CLOCK
 static unsigned int plat_val_to_idx(unsigned long val)
 {
 	unsigned int msb, error_bits, base, offset, idx;
@@ -322,6 +323,7 @@ static unsigned int plat_val_to_idx(unsigned long val)
 
 	return idx;
 }
+#endif
 
 static void add_stat(struct submitter *s, int clock_index, int nr)
 {
@@ -789,9 +791,12 @@ static void *submitter_uring_fn(void *data)
 {
 	struct submitter *s = data;
 	struct io_sq_ring *ring = &s->sq_ring;
-	int ret, prepped, nr_batch;
-
-	nr_batch = submitter_init(s);
+	int ret, prepped;
+#ifdef ARCH_HAVE_CPU_CLOCK
+	int nr_batch = submitter_init(s);
+#else
+	submitter_init(s);
+#endif
 
 	prepped = 0;
 	do {

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-02-04 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-02-04 13:00 UTC (permalink / raw)
  To: fio

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain, Size: 132470 bytes --]

The following changes since commit 52a0b9ed71c3e929461e64b39059281948107071:

  Merge branch 'patch-1' of https://github.com/Nikratio/fio (2022-01-28 14:50:51 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 62e9ece4d540ff2af865e4b43811f3150b8b846b:

  fio: use correct function declaration for set_epoch_time() (2022-02-03 16:06:59 -0700)

----------------------------------------------------------------
David Korczynski (1):
      ci/Github actions: add CIFuzz integration

Jens Axboe (6):
      Merge branch 'master' of https://github.com/blah325/fio
      server: fix formatting issue
      Merge branch 'freebsd-comment-update' of https://github.com/macdice/fio
      Merge branch 'cifuzz-integration' of https://github.com/DavidKorczynski/fio
      Merge branch 'fio_pr_alternate_epoch' of https://github.com/PCPartPicker/fio
      fio: use correct function declaration for set_epoch_time()

Niklas Cassel (18):
      init: verify option lat_percentiles consistency for all jobs in group
      backend: do ioprio_set() before calling the ioengine init callback
      stat: save the default ioprio in struct thread_stat
      client/server: convert ss_data to use an offset instead of fixed position
      stat: add a new function to allocate a clat_prio_stat array
      os: define min/max prio class and level for systems without ioprio
      options: add a parsing function for an additional cmdprio_bssplit format
      cmdprio: add support for a new cmdprio_bssplit entry format
      examples: add new cmdprio_bssplit format examples
      stat: use enum fio_ddir consistently
      stat: increment members counter after call to sum_thread_stats()
      stat: add helper for resetting the latency buckets
      stat: disable per prio stats where not needed
      stat: report clat stats on a per priority granularity
      stat: convert json output to a new per priority granularity format
      gfio: drop support for high/low priority latency results
      stat: remove unused high/low prio struct members
      t/latency_percentiles.py: add tests for the new cmdprio_bssplit format

Thomas Munro (1):
      Update comments about availability of fdatasync().

aggieNick02 (1):
      Support for alternate epochs in fio log files

james rizzo (3):
      Avoid client calls to recv() without prior poll()
      Add Windows support for --server.
      Added a new windows only IO engine option “no_completion_thread”.

 .github/workflows/cifuzz.yml |  24 ++
 HOWTO                        |  41 +++-
 backend.c                    |  27 ++-
 cconv.c                      |   4 +
 client.c                     |  48 ++--
 engines/cmdprio.c            | 440 +++++++++++++++++++++++++++++------
 engines/cmdprio.h            |  22 +-
 engines/filecreate.c         |   2 +-
 engines/filedelete.c         |   2 +-
 engines/filestat.c           |   2 +-
 engines/windowsaio.c         | 134 +++++++++--
 examples/cmdprio-bssplit.fio |  39 +++-
 fio.1                        |  45 +++-
 fio.h                        |   2 +-
 fio_time.h                   |   2 +-
 gclient.c                    |  55 +----
 init.c                       |  37 +++
 io_u.c                       |   7 +-
 io_u.h                       |   3 +-
 libfio.c                     |   2 +-
 optgroup.h                   |   2 +
 options.c                    | 140 ++++++++++++
 os/os-windows.h              |   2 +
 os/os.h                      |   4 +
 os/windows/posix.c           | 182 ++++++++++++++-
 rate-submit.c                |  11 +-
 server.c                     | 369 +++++++++++++++++++++++++++---
 server.h                     |   7 +-
 stat.c                       | 531 ++++++++++++++++++++++++++++++++++---------
 stat.h                       |  40 +++-
 t/latency_percentiles.py     | 211 ++++++++++-------
 thread_options.h             |  14 ++
 time.c                       |  12 +-
 33 files changed, 2019 insertions(+), 444 deletions(-)
 create mode 100644 .github/workflows/cifuzz.yml

---

Diff of recent changes:

diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
new file mode 100644
index 00000000..acc8d482
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,24 @@
+name: CIFuzz
+on: [pull_request]
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'fio'
+        dry-run: false
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'fio'
+        fuzz-seconds: 600
+        dry-run: false
+    - name: Upload Crash
+      uses: actions/upload-artifact@v1
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
diff --git a/HOWTO b/HOWTO
index c72ec8cd..74ba7216 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1344,7 +1344,7 @@ I/O type
 .. option:: fdatasync=int
 
 	Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
-	not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no
+	not metadata blocks. In Windows, DragonFlyBSD or OSX there is no
 	:manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
 	Defaults to 0, which means fio does not periodically issue and wait for a
 	data-only sync to complete.
@@ -2212,10 +2212,28 @@ with the caveat that when used on the command line, they must come after the
 	depending on the block size of the IO. This option is useful only
 	when used together with the :option:`bssplit` option, that is,
 	multiple different block sizes are used for reads and writes.
-	The format for this option is the same as the format of the
-	:option:`bssplit` option, with the exception that values for
-	trim IOs are ignored. This option is mutually exclusive with the
-	:option:`cmdprio_percentage` option.
+
+	The first accepted format for this option is the same as the format of
+	the :option:`bssplit` option:
+
+		cmdprio_bssplit=blocksize/percentage:blocksize/percentage
+
+	In this case, each entry will use the priority class and priority
+	level defined by the options :option:`cmdprio_class` and
+	:option:`cmdprio` respectively.
+
+	The second accepted format for this option is:
+
+		cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level
+
+	In this case, the priority class and priority level is defined inside
+	each entry. In comparison with the first accepted format, the second
+	accepted format does not restrict all entries to have the same priority
+	class and priority level.
+
+	For both formats, only the read and write data directions are supported,
+	values for trim IOs are ignored. This option is mutually exclusive with
+	the :option:`cmdprio_percentage` option.
 
 .. option:: fixedbufs : [io_uring]
 
@@ -3663,6 +3681,19 @@ Measurements and reporting
 	write_type_log for each log type, instead of the default zero-based
 	timestamps.
 
+.. option:: log_alternate_epoch=bool
+
+	If set, fio will log timestamps based on the epoch used by the clock specified
+	in the log_alternate_epoch_clock_id option, to the log files produced by
+	enabling write_type_log for each log type, instead of the default zero-based
+	timestamps.
+
+.. option:: log_alternate_epoch_clock_id=int
+
+	Specifies the clock_id to be used by clock_gettime to obtain the alternate epoch
+	if either log_unix_epoch or log_alternate_epoch are true. Otherwise has no
+	effect. Default value is 0, or CLOCK_REALTIME.
+
 .. option:: block_error_percentiles=bool
 
 	If set, record errors in trim block-sized units from writes and trims and
diff --git a/backend.c b/backend.c
index c167f908..061e3b32 100644
--- a/backend.c
+++ b/backend.c
@@ -1777,6 +1777,18 @@ static void *thread_main(void *data)
 	if (!init_iolog(td))
 		goto err;
 
+	/* ioprio_set() has to be done before td_io_init() */
+	if (fio_option_is_set(o, ioprio) ||
+	    fio_option_is_set(o, ioprio_class)) {
+		ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio);
+		if (ret == -1) {
+			td_verror(td, errno, "ioprio_set");
+			goto err;
+		}
+		td->ioprio = ioprio_value(o->ioprio_class, o->ioprio);
+		td->ts.ioprio = td->ioprio;
+	}
+
 	if (td_io_init(td))
 		goto err;
 
@@ -1789,16 +1801,6 @@ static void *thread_main(void *data)
 	if (o->verify_async && verify_async_init(td))
 		goto err;
 
-	if (fio_option_is_set(o, ioprio) ||
-	    fio_option_is_set(o, ioprio_class)) {
-		ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio);
-		if (ret == -1) {
-			td_verror(td, errno, "ioprio_set");
-			goto err;
-		}
-		td->ioprio = ioprio_value(o->ioprio_class, o->ioprio);
-	}
-
 	if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt))
 		goto err;
 
@@ -1828,7 +1830,7 @@ static void *thread_main(void *data)
 	if (rate_submit_init(td, sk_out))
 		goto err;
 
-	set_epoch_time(td, o->log_unix_epoch);
+	set_epoch_time(td, o->log_unix_epoch | o->log_alternate_epoch, o->log_alternate_epoch_clock_id);
 	fio_getrusage(&td->ru_start);
 	memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
 	memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
@@ -2611,6 +2613,9 @@ int fio_backend(struct sk_out *sk_out)
 	}
 
 	for_each_td(td, i) {
+		struct thread_stat *ts = &td->ts;
+
+		free_clat_prio_stats(ts);
 		steadystate_free(td);
 		fio_options_free(td);
 		fio_dump_options_free(td);
diff --git a/cconv.c b/cconv.c
index 4f8d27eb..62d02e36 100644
--- a/cconv.c
+++ b/cconv.c
@@ -197,6 +197,8 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->log_gz = le32_to_cpu(top->log_gz);
 	o->log_gz_store = le32_to_cpu(top->log_gz_store);
 	o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch);
+	o->log_alternate_epoch = le32_to_cpu(top->log_alternate_epoch);
+	o->log_alternate_epoch_clock_id = le32_to_cpu(top->log_alternate_epoch_clock_id);
 	o->norandommap = le32_to_cpu(top->norandommap);
 	o->softrandommap = le32_to_cpu(top->softrandommap);
 	o->bs_unaligned = le32_to_cpu(top->bs_unaligned);
@@ -425,6 +427,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->log_gz = cpu_to_le32(o->log_gz);
 	top->log_gz_store = cpu_to_le32(o->log_gz_store);
 	top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch);
+	top->log_alternate_epoch = cpu_to_le32(o->log_alternate_epoch);
+	top->log_alternate_epoch_clock_id = cpu_to_le32(o->log_alternate_epoch_clock_id);
 	top->norandommap = cpu_to_le32(o->norandommap);
 	top->softrandommap = cpu_to_le32(o->softrandommap);
 	top->bs_unaligned = cpu_to_le32(o->bs_unaligned);
diff --git a/client.c b/client.c
index be8411d8..605a3ce5 100644
--- a/client.c
+++ b/client.c
@@ -284,9 +284,10 @@ static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn)
 static void fio_drain_client_text(struct fio_client *client)
 {
 	do {
-		struct fio_net_cmd *cmd;
+		struct fio_net_cmd *cmd = NULL;
 
-		cmd = fio_net_recv_cmd(client->fd, false);
+		if (fio_server_poll_fd(client->fd, POLLIN, 0))
+			cmd = fio_net_recv_cmd(client->fd, false);
 		if (!cmd)
 			break;
 
@@ -953,6 +954,8 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
 	dst->pid		= le32_to_cpu(src->pid);
 	dst->members		= le32_to_cpu(src->members);
 	dst->unified_rw_rep	= le32_to_cpu(src->unified_rw_rep);
+	dst->ioprio		= le32_to_cpu(src->ioprio);
+	dst->disable_prio_stat	= le32_to_cpu(src->disable_prio_stat);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		convert_io_stat(&dst->clat_stat[i], &src->clat_stat[i]);
@@ -1035,14 +1038,6 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
 	dst->nr_block_infos	= le64_to_cpu(src->nr_block_infos);
 	for (i = 0; i < dst->nr_block_infos; i++)
 		dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
-			dst->io_u_plat_high_prio[i][j] = le64_to_cpu(src->io_u_plat_high_prio[i][j]);
-			dst->io_u_plat_low_prio[i][j] = le64_to_cpu(src->io_u_plat_low_prio[i][j]);
-		}
-		convert_io_stat(&dst->clat_high_prio_stat[i], &src->clat_high_prio_stat[i]);
-		convert_io_stat(&dst->clat_low_prio_stat[i], &src->clat_low_prio_stat[i]);
-	}
 
 	dst->ss_dur		= le64_to_cpu(src->ss_dur);
 	dst->ss_state		= le32_to_cpu(src->ss_state);
@@ -1052,6 +1047,19 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
 	dst->ss_deviation.u.f 	= fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i));
 	dst->ss_criterion.u.f 	= fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i));
 
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		dst->nr_clat_prio[i] = le32_to_cpu(src->nr_clat_prio[i]);
+		for (j = 0; j < dst->nr_clat_prio[i]; j++) {
+			for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
+				dst->clat_prio[i][j].io_u_plat[k] =
+					le64_to_cpu(src->clat_prio[i][j].io_u_plat[k]);
+			convert_io_stat(&dst->clat_prio[i][j].clat_stat,
+					&src->clat_prio[i][j].clat_stat);
+			dst->clat_prio[i][j].ioprio =
+				le32_to_cpu(dst->clat_prio[i][j].ioprio);
+		}
+	}
+
 	if (dst->ss_state & FIO_SS_DATA) {
 		for (i = 0; i < dst->ss_dur; i++ ) {
 			dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]);
@@ -1760,7 +1768,6 @@ int fio_handle_client(struct fio_client *client)
 {
 	struct client_ops *ops = client->ops;
 	struct fio_net_cmd *cmd;
-	int size;
 
 	dprint(FD_NET, "client: handle %s\n", client->hostname);
 
@@ -1794,14 +1801,26 @@ int fio_handle_client(struct fio_client *client)
 		}
 	case FIO_NET_CMD_TS: {
 		struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
+		uint64_t offset;
+		int i;
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			if (le32_to_cpu(p->ts.nr_clat_prio[i])) {
+				offset = le64_to_cpu(p->ts.clat_prio_offset[i]);
+				p->ts.clat_prio[i] =
+					(struct clat_prio_stat *)((char *)p + offset);
+			}
+		}
 
 		dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state));
 		if (le32_to_cpu(p->ts.ss_state) & FIO_SS_DATA) {
 			dprint(FD_NET, "client: received steadystate ring buffers\n");
 
-			size = le64_to_cpu(p->ts.ss_dur);
-			p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1);
-			p->ts.ss_bw_data = p->ts.ss_iops_data + size;
+			offset = le64_to_cpu(p->ts.ss_iops_data_offset);
+			p->ts.ss_iops_data = (uint64_t *)((char *)p + offset);
+
+			offset = le64_to_cpu(p->ts.ss_bw_data_offset);
+			p->ts.ss_bw_data = (uint64_t *)((char *)p + offset);
 		}
 
 		convert_ts(&p->ts, &p->ts);
@@ -2152,6 +2171,7 @@ int fio_handle_clients(struct client_ops *ops)
 
 	fio_client_json_fini();
 
+	free_clat_prio_stats(&client_ts);
 	free(pfds);
 	return retval || error_clients;
 }
diff --git a/engines/cmdprio.c b/engines/cmdprio.c
index 92b752ae..dd358754 100644
--- a/engines/cmdprio.c
+++ b/engines/cmdprio.c
@@ -5,45 +5,201 @@
 
 #include "cmdprio.h"
 
-static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
-				    enum fio_ddir ddir, char *str, bool data)
+/*
+ * Temporary array used during parsing. Will be freed after the corresponding
+ * struct bsprio_desc has been generated and saved in cmdprio->bsprio_desc.
+ */
+struct cmdprio_parse_result {
+	struct split_prio *entries;
+	int nr_entries;
+};
+
+/*
+ * Temporary array used during init. Will be freed after the corresponding
+ * struct clat_prio_stat array has been saved in td->ts.clat_prio and the
+ * matching clat_prio_indexes have been saved in each struct cmdprio_prio.
+ */
+struct cmdprio_values {
+	unsigned int *prios;
+	int nr_prios;
+};
+
+static int find_clat_prio_index(unsigned int *all_prios, int nr_prios,
+				int32_t prio)
 {
-	struct cmdprio *cmdprio = cb_arg;
-	struct split split;
-	unsigned int i;
+	int i;
 
-	if (ddir == DDIR_TRIM)
-		return 0;
+	for (i = 0; i < nr_prios; i++) {
+		if (all_prios[i] == prio)
+			return i;
+	}
 
-	memset(&split, 0, sizeof(split));
+	return -1;
+}
 
-	if (split_parse_ddir(to, &split, str, data, BSSPLIT_MAX))
+/**
+ * assign_clat_prio_index - In order to spare stat.c the need to loop through
+ * all possible priorities each time add_clat_sample() / add_lat_sample() is
+ * called, save which index to use in each cmdprio_prio. This will later be
+ * propagated to the io_u, if the specific io_u was determined to use a cmdprio
+ * priority value.
+ */
+static void assign_clat_prio_index(struct cmdprio_prio *prio,
+				   struct cmdprio_values *values)
+{
+	int clat_prio_index = find_clat_prio_index(values->prios,
+						   values->nr_prios,
+						   prio->prio);
+	if (clat_prio_index == -1) {
+		clat_prio_index = values->nr_prios;
+		values->prios[clat_prio_index] = prio->prio;
+		values->nr_prios++;
+	}
+	prio->clat_prio_index = clat_prio_index;
+}
+
+/**
+ * init_cmdprio_values - Allocate a temporary array that can hold all unique
+ * priorities (per ddir), so that we can assign_clat_prio_index() for each
+ * cmdprio_prio during setup. This temporary array is freed after setup.
+ */
+static int init_cmdprio_values(struct cmdprio_values *values,
+			       int max_unique_prios, struct thread_stat *ts)
+{
+	values->prios = calloc(max_unique_prios + 1,
+			       sizeof(*values->prios));
+	if (!values->prios)
 		return 1;
-	if (!split.nr)
-		return 0;
 
-	cmdprio->bssplit_nr[ddir] = split.nr;
-	cmdprio->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit));
-	if (!cmdprio->bssplit[ddir])
+	/* td->ioprio/ts->ioprio is always stored at index 0. */
+	values->prios[0] = ts->ioprio;
+	values->nr_prios++;
+
+	return 0;
+}
+
+/**
+ * init_ts_clat_prio - Allocates and fills a clat_prio_stat array which holds
+ * all unique priorities (per ddir).
+ */
+static int init_ts_clat_prio(struct thread_stat *ts, enum fio_ddir ddir,
+			     struct cmdprio_values *values)
+{
+	int i;
+
+	if (alloc_clat_prio_stat_ddir(ts, ddir, values->nr_prios))
 		return 1;
 
-	for (i = 0; i < split.nr; i++) {
-		cmdprio->bssplit[ddir][i].bs = split.val1[i];
-		if (split.val2[i] == -1U) {
-			cmdprio->bssplit[ddir][i].perc = 0;
-		} else {
-			if (split.val2[i] > 100)
-				cmdprio->bssplit[ddir][i].perc = 100;
-			else
-				cmdprio->bssplit[ddir][i].perc = split.val2[i];
+	for (i = 0; i < values->nr_prios; i++)
+		ts->clat_prio[ddir][i].ioprio = values->prios[i];
+
+	return 0;
+}
+
+static int fio_cmdprio_fill_bsprio(struct cmdprio_bsprio *bsprio,
+				   struct split_prio *entries,
+				   struct cmdprio_values *values,
+				   int implicit_cmdprio, int start, int end)
+{
+	struct cmdprio_prio *prio;
+	int i = end - start + 1;
+
+	bsprio->prios = calloc(i, sizeof(*bsprio->prios));
+	if (!bsprio->prios)
+		return 1;
+
+	bsprio->bs = entries[start].bs;
+	bsprio->nr_prios = 0;
+	for (i = start; i <= end; i++) {
+		prio = &bsprio->prios[bsprio->nr_prios];
+		prio->perc = entries[i].perc;
+		if (entries[i].prio == -1)
+			prio->prio = implicit_cmdprio;
+		else
+			prio->prio = entries[i].prio;
+		assign_clat_prio_index(prio, values);
+		bsprio->tot_perc += entries[i].perc;
+		if (bsprio->tot_perc > 100) {
+			log_err("fio: cmdprio_bssplit total percentage "
+				"for bs: %"PRIu64" exceeds 100\n",
+				bsprio->bs);
+			free(bsprio->prios);
+			return 1;
 		}
+		bsprio->nr_prios++;
+	}
+
+	return 0;
+}
+
+static int
+fio_cmdprio_generate_bsprio_desc(struct cmdprio_bsprio_desc *bsprio_desc,
+				 struct cmdprio_parse_result *parse_res,
+				 struct cmdprio_values *values,
+				 int implicit_cmdprio)
+{
+	struct split_prio *entries = parse_res->entries;
+	int nr_entries = parse_res->nr_entries;
+	struct cmdprio_bsprio *bsprio;
+	int i, start, count = 0;
+
+	/*
+	 * The parsed result is sorted by blocksize, so count only the number
+	 * of different blocksizes, to know how many cmdprio_bsprio we need.
+	 */
+	for (i = 0; i < nr_entries; i++) {
+		while (i + 1 < nr_entries && entries[i].bs == entries[i + 1].bs)
+			i++;
+		count++;
+	}
+
+	/*
+	 * This allocation is not freed on error. Instead, the calling function
+	 * is responsible for calling fio_cmdprio_cleanup() on error.
+	 */
+	bsprio_desc->bsprios = calloc(count, sizeof(*bsprio_desc->bsprios));
+	if (!bsprio_desc->bsprios)
+		return 1;
+
+	start = 0;
+	bsprio_desc->nr_bsprios = 0;
+	for (i = 0; i < nr_entries; i++) {
+		while (i + 1 < nr_entries && entries[i].bs == entries[i + 1].bs)
+			i++;
+		bsprio = &bsprio_desc->bsprios[bsprio_desc->nr_bsprios];
+		/*
+		 * All parsed entries with the same blocksize get saved in the
+		 * same cmdprio_bsprio, to expedite the search in the hot path.
+		 */
+		if (fio_cmdprio_fill_bsprio(bsprio, entries, values,
+					    implicit_cmdprio, start, i))
+			return 1;
+
+		start = i + 1;
+		bsprio_desc->nr_bsprios++;
 	}
 
 	return 0;
 }
 
-int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
-			      struct cmdprio *cmdprio)
+static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
+				    enum fio_ddir ddir, char *str, bool data)
+{
+	struct cmdprio_parse_result *parse_res_arr = cb_arg;
+	struct cmdprio_parse_result *parse_res = &parse_res_arr[ddir];
+
+	if (ddir == DDIR_TRIM)
+		return 0;
+
+	if (split_parse_prio_ddir(to, &parse_res->entries,
+				  &parse_res->nr_entries, str))
+		return 1;
+
+	return 0;
+}
+
+static int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
+				     struct cmdprio_parse_result *parse_res)
 {
 	char *str, *p;
 	int ret = 0;
@@ -53,26 +209,39 @@ int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
 	strip_blank_front(&str);
 	strip_blank_end(str);
 
-	ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, cmdprio,
+	ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, parse_res,
 			      false);
 
 	free(p);
 	return ret;
 }
 
-static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
+/**
+ * fio_cmdprio_percentage - Returns the percentage of I/Os that should
+ * use a cmdprio priority value (rather than the default context priority).
+ *
+ * For CMDPRIO_MODE_BSSPLIT, if the percentage is non-zero, we will also
+ * return the matching bsprio, to avoid the same linear search elsewhere.
+ * For CMDPRIO_MODE_PERC, we will never return a bsprio.
+ */
+static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u,
+				  struct cmdprio_bsprio **bsprio)
 {
+	struct cmdprio_bsprio *bsprio_entry;
 	enum fio_ddir ddir = io_u->ddir;
-	struct cmdprio_options *options = cmdprio->options;
 	int i;
 
 	switch (cmdprio->mode) {
 	case CMDPRIO_MODE_PERC:
-		return options->percentage[ddir];
+		*bsprio = NULL;
+		return cmdprio->perc_entry[ddir].perc;
 	case CMDPRIO_MODE_BSSPLIT:
-		for (i = 0; i < cmdprio->bssplit_nr[ddir]; i++) {
-			if (cmdprio->bssplit[ddir][i].bs == io_u->buflen)
-				return cmdprio->bssplit[ddir][i].perc;
+		for (i = 0; i < cmdprio->bsprio_desc[ddir].nr_bsprios; i++) {
+			bsprio_entry = &cmdprio->bsprio_desc[ddir].bsprios[i];
+			if (bsprio_entry->bs == io_u->buflen) {
+				*bsprio = bsprio_entry;
+				return bsprio_entry->tot_perc;
+			}
 		}
 		break;
 	default:
@@ -83,6 +252,11 @@ static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
 		assert(0);
 	}
 
+	/*
+	 * This is totally fine, the given blocksize simply does not
+	 * have any (non-zero) cmdprio_bssplit entries defined.
+	 */
+	*bsprio = NULL;
 	return 0;
 }
 
@@ -100,52 +274,162 @@ static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
 bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
 			    struct io_u *io_u)
 {
-	enum fio_ddir ddir = io_u->ddir;
-	struct cmdprio_options *options = cmdprio->options;
-	unsigned int p;
-	unsigned int cmdprio_value =
-		ioprio_value(options->class[ddir], options->level[ddir]);
-
-	p = fio_cmdprio_percentage(cmdprio, io_u);
-	if (p && rand_between(&td->prio_state, 0, 99) < p) {
-		io_u->ioprio = cmdprio_value;
-		if (!td->ioprio || cmdprio_value < td->ioprio) {
-			/*
-			 * The async IO priority is higher (has a lower value)
-			 * than the default priority (which is either 0 or the
-			 * value set by "prio" and "prioclass" options).
-			 */
-			io_u->flags |= IO_U_F_HIGH_PRIO;
-		}
+	struct cmdprio_bsprio *bsprio;
+	unsigned int p, rand;
+	uint32_t perc = 0;
+	int i;
+
+	p = fio_cmdprio_percentage(cmdprio, io_u, &bsprio);
+	if (!p)
+		return false;
+
+	rand = rand_between(&td->prio_state, 0, 99);
+	if (rand >= p)
+		return false;
+
+	switch (cmdprio->mode) {
+	case CMDPRIO_MODE_PERC:
+		io_u->ioprio = cmdprio->perc_entry[io_u->ddir].prio;
+		io_u->clat_prio_index =
+			cmdprio->perc_entry[io_u->ddir].clat_prio_index;
 		return true;
+	case CMDPRIO_MODE_BSSPLIT:
+		assert(bsprio);
+		for (i = 0; i < bsprio->nr_prios; i++) {
+			struct cmdprio_prio *prio = &bsprio->prios[i];
+
+			perc += prio->perc;
+			if (rand < perc) {
+				io_u->ioprio = prio->prio;
+				io_u->clat_prio_index = prio->clat_prio_index;
+				return true;
+			}
+		}
+		break;
+	default:
+		assert(0);
 	}
 
-	if (td->ioprio && td->ioprio < cmdprio_value) {
+	/* When rand < p (total perc), we should always find a cmdprio_prio. */
+	assert(0);
+	return false;
+}
+
+static int fio_cmdprio_gen_perc(struct thread_data *td, struct cmdprio *cmdprio)
+{
+	struct cmdprio_options *options = cmdprio->options;
+	struct cmdprio_prio *prio;
+	struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {0};
+	struct thread_stat *ts = &td->ts;
+	enum fio_ddir ddir;
+	int ret;
+
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
 		/*
-		 * The IO will be executed with the default priority (which is
-		 * either 0 or the value set by "prio" and "prioclass options),
-		 * and this priority is higher (has a lower value) than the
-		 * async IO priority.
+		 * Do not allocate a clat_prio array nor set the cmdprio struct
+		 * if zero percent of the I/Os (for the ddir) should use a
+		 * cmdprio priority value, or when the ddir is not enabled.
 		 */
-		io_u->flags |= IO_U_F_HIGH_PRIO;
+		if (!options->percentage[ddir] ||
+		    (ddir == DDIR_READ && !td_read(td)) ||
+		    (ddir == DDIR_WRITE && !td_write(td)))
+			continue;
+
+		ret = init_cmdprio_values(&values[ddir], 1, ts);
+		if (ret)
+			goto err;
+
+		prio = &cmdprio->perc_entry[ddir];
+		prio->perc = options->percentage[ddir];
+		prio->prio = ioprio_value(options->class[ddir],
+					  options->level[ddir]);
+		assign_clat_prio_index(prio, &values[ddir]);
+
+		ret = init_ts_clat_prio(ts, ddir, &values[ddir]);
+		if (ret)
+			goto err;
+
+		free(values[ddir].prios);
+		values[ddir].prios = NULL;
+		values[ddir].nr_prios = 0;
 	}
 
-	return false;
+	return 0;
+
+err:
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++)
+		free(values[ddir].prios);
+	free_clat_prio_stats(ts);
+
+	return ret;
 }
 
 static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td,
 					     struct cmdprio *cmdprio)
 {
 	struct cmdprio_options *options = cmdprio->options;
-	int ret;
-
-	ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str, cmdprio);
+	struct cmdprio_parse_result parse_res[CMDPRIO_RWDIR_CNT] = {0};
+	struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {0};
+	struct thread_stat *ts = &td->ts;
+	int ret, implicit_cmdprio;
+	enum fio_ddir ddir;
+
+	ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str,
+					&parse_res[0]);
 	if (ret)
 		goto err;
 
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+		/*
+		 * Do not allocate a clat_prio array nor set the cmdprio structs
+		 * if there are no non-zero entries (for the ddir), or when the
+		 * ddir is not enabled.
+		 */
+		if (!parse_res[ddir].nr_entries ||
+		    (ddir == DDIR_READ && !td_read(td)) ||
+		    (ddir == DDIR_WRITE && !td_write(td))) {
+			free(parse_res[ddir].entries);
+			parse_res[ddir].entries = NULL;
+			parse_res[ddir].nr_entries = 0;
+			continue;
+		}
+
+		ret = init_cmdprio_values(&values[ddir],
+					  parse_res[ddir].nr_entries, ts);
+		if (ret)
+			goto err;
+
+		implicit_cmdprio = ioprio_value(options->class[ddir],
+						options->level[ddir]);
+
+		ret = fio_cmdprio_generate_bsprio_desc(&cmdprio->bsprio_desc[ddir],
+						       &parse_res[ddir],
+						       &values[ddir],
+						       implicit_cmdprio);
+		if (ret)
+			goto err;
+
+		free(parse_res[ddir].entries);
+		parse_res[ddir].entries = NULL;
+		parse_res[ddir].nr_entries = 0;
+
+		ret = init_ts_clat_prio(ts, ddir, &values[ddir]);
+		if (ret)
+			goto err;
+
+		free(values[ddir].prios);
+		values[ddir].prios = NULL;
+		values[ddir].nr_prios = 0;
+	}
+
 	return 0;
 
 err:
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+		free(parse_res[ddir].entries);
+		free(values[ddir].prios);
+	}
+	free_clat_prio_stats(ts);
 	fio_cmdprio_cleanup(cmdprio);
 
 	return ret;
@@ -157,40 +441,46 @@ static int fio_cmdprio_parse_and_gen(struct thread_data *td,
 	struct cmdprio_options *options = cmdprio->options;
 	int i, ret;
 
+	/*
+	 * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
+	 * is not set, default to RT priority class.
+	 */
+	for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) {
+		/*
+		 * A cmdprio value is only used when fio_cmdprio_percentage()
+		 * returns non-zero, so it is safe to set a class even for a
+		 * DDIR that will never use it.
+		 */
+		if (!options->class[i])
+			options->class[i] = IOPRIO_CLASS_RT;
+	}
+
 	switch (cmdprio->mode) {
 	case CMDPRIO_MODE_BSSPLIT:
 		ret = fio_cmdprio_parse_and_gen_bssplit(td, cmdprio);
 		break;
 	case CMDPRIO_MODE_PERC:
-		ret = 0;
+		ret = fio_cmdprio_gen_perc(td, cmdprio);
 		break;
 	default:
 		assert(0);
 		return 1;
 	}
 
-	/*
-	 * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
-	 * is not set, default to RT priority class.
-	 */
-	for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) {
-		if (options->percentage[i] || cmdprio->bssplit_nr[i]) {
-			if (!options->class[i])
-				options->class[i] = IOPRIO_CLASS_RT;
-		}
-	}
-
 	return ret;
 }
 
 void fio_cmdprio_cleanup(struct cmdprio *cmdprio)
 {
-	int ddir;
+	enum fio_ddir ddir;
+	int i;
 
 	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
-		free(cmdprio->bssplit[ddir]);
-		cmdprio->bssplit[ddir] = NULL;
-		cmdprio->bssplit_nr[ddir] = 0;
+		for (i = 0; i < cmdprio->bsprio_desc[ddir].nr_bsprios; i++)
+			free(cmdprio->bsprio_desc[ddir].bsprios[i].prios);
+		free(cmdprio->bsprio_desc[ddir].bsprios);
+		cmdprio->bsprio_desc[ddir].bsprios = NULL;
+		cmdprio->bsprio_desc[ddir].nr_bsprios = 0;
 	}
 
 	/*
diff --git a/engines/cmdprio.h b/engines/cmdprio.h
index 0c7bd6cf..755da8d0 100644
--- a/engines/cmdprio.h
+++ b/engines/cmdprio.h
@@ -17,6 +17,24 @@ enum {
 	CMDPRIO_MODE_BSSPLIT,
 };
 
+struct cmdprio_prio {
+	int32_t prio;
+	uint32_t perc;
+	uint16_t clat_prio_index;
+};
+
+struct cmdprio_bsprio {
+	uint64_t bs;
+	uint32_t tot_perc;
+	unsigned int nr_prios;
+	struct cmdprio_prio *prios;
+};
+
+struct cmdprio_bsprio_desc {
+	struct cmdprio_bsprio *bsprios;
+	unsigned int nr_bsprios;
+};
+
 struct cmdprio_options {
 	unsigned int percentage[CMDPRIO_RWDIR_CNT];
 	unsigned int class[CMDPRIO_RWDIR_CNT];
@@ -26,8 +44,8 @@ struct cmdprio_options {
 
 struct cmdprio {
 	struct cmdprio_options *options;
-	unsigned int bssplit_nr[CMDPRIO_RWDIR_CNT];
-	struct bssplit *bssplit[CMDPRIO_RWDIR_CNT];
+	struct cmdprio_prio perc_entry[CMDPRIO_RWDIR_CNT];
+	struct cmdprio_bsprio_desc bsprio_desc[CMDPRIO_RWDIR_CNT];
 	unsigned int mode;
 };
 
diff --git a/engines/filecreate.c b/engines/filecreate.c
index 4bb13c34..7884752d 100644
--- a/engines/filecreate.c
+++ b/engines/filecreate.c
@@ -49,7 +49,7 @@ static int open_file(struct thread_data *td, struct fio_file *f)
 		uint64_t nsec;
 
 		nsec = ntime_since_now(&start);
-		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
 	}
 
 	return 0;
diff --git a/engines/filedelete.c b/engines/filedelete.c
index e882ccf0..df388ac9 100644
--- a/engines/filedelete.c
+++ b/engines/filedelete.c
@@ -51,7 +51,7 @@ static int delete_file(struct thread_data *td, struct fio_file *f)
 		uint64_t nsec;
 
 		nsec = ntime_since_now(&start);
-		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
 	}
 
 	return 0;
diff --git a/engines/filestat.c b/engines/filestat.c
index 00311247..e587eb54 100644
--- a/engines/filestat.c
+++ b/engines/filestat.c
@@ -125,7 +125,7 @@ static int stat_file(struct thread_data *td, struct fio_file *f)
 		uint64_t nsec;
 
 		nsec = ntime_since_now(&start);
-		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
 	}
 
 	return 0;
diff --git a/engines/windowsaio.c b/engines/windowsaio.c
index 9868e816..d82c8053 100644
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -11,6 +11,7 @@
 #include <errno.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 
 typedef BOOL (WINAPI *CANCELIOEX)(HANDLE hFile, LPOVERLAPPED lpOverlapped);
 
@@ -35,6 +36,26 @@ struct thread_ctx {
 	struct windowsaio_data *wd;
 };
 
+struct windowsaio_options {
+	struct thread_data *td;
+	unsigned int no_completion_thread;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "no_completion_thread",
+		.lname	= "No completion polling thread",
+		.type	= FIO_OPT_STR_SET,
+		.off1	= offsetof(struct windowsaio_options, no_completion_thread),
+		.help	= "Use to avoid separate completion polling thread",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_WINDOWSAIO,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
 static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter);
 
 static int fio_windowsaio_init(struct thread_data *td)
@@ -80,6 +101,7 @@ static int fio_windowsaio_init(struct thread_data *td)
 		struct thread_ctx *ctx;
 		struct windowsaio_data *wd;
 		HANDLE hFile;
+		struct windowsaio_options *o = td->eo;
 
 		hFile = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
 		if (hFile == INVALID_HANDLE_VALUE) {
@@ -91,29 +113,30 @@ static int fio_windowsaio_init(struct thread_data *td)
 		wd->iothread_running = TRUE;
 		wd->iocp = hFile;
 
-		if (!rc)
-			ctx = malloc(sizeof(struct thread_ctx));
+		if (o->no_completion_thread == 0) {
+			if (!rc)
+				ctx = malloc(sizeof(struct thread_ctx));
 
-		if (!rc && ctx == NULL) {
-			log_err("windowsaio: failed to allocate memory for thread context structure\n");
-			CloseHandle(hFile);
-			rc = 1;
-		}
+			if (!rc && ctx == NULL) {
+				log_err("windowsaio: failed to allocate memory for thread context structure\n");
+				CloseHandle(hFile);
+				rc = 1;
+			}
 
-		if (!rc) {
-			DWORD threadid;
+			if (!rc) {
+				DWORD threadid;
 
-			ctx->iocp = hFile;
-			ctx->wd = wd;
-			wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
-			if (!wd->iothread)
-				log_err("windowsaio: failed to create io completion thread\n");
-			else if (fio_option_is_set(&td->o, cpumask))
-				fio_setaffinity(threadid, td->o.cpumask);
+				ctx->iocp = hFile;
+				ctx->wd = wd;
+				wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
+				if (!wd->iothread)
+					log_err("windowsaio: failed to create io completion thread\n");
+				else if (fio_option_is_set(&td->o, cpumask))
+					fio_setaffinity(threadid, td->o.cpumask);
+			}
+			if (rc || wd->iothread == NULL)
+				rc = 1;
 		}
-
-		if (rc || wd->iothread == NULL)
-			rc = 1;
 	}
 
 	return rc;
@@ -302,9 +325,63 @@ static struct io_u* fio_windowsaio_event(struct thread_data *td, int event)
 	return wd->aio_events[event];
 }
 
-static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
-				    unsigned int max,
-				    const struct timespec *t)
+/* dequeue completion entries directly (no separate completion thread) */
+static int fio_windowsaio_getevents_nothread(struct thread_data *td, unsigned int min,
+				    unsigned int max, const struct timespec *t)
+{
+	struct windowsaio_data *wd = td->io_ops_data;
+	unsigned int dequeued = 0;
+	struct io_u *io_u;
+	DWORD start_count = 0;
+	DWORD end_count = 0;
+	DWORD mswait = 250;
+	struct fio_overlapped *fov;
+
+	if (t != NULL) {
+		mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000);
+		start_count = GetTickCount();
+		end_count = start_count + (t->tv_sec * 1000) + (t->tv_nsec / 1000000);
+	}
+
+	do {
+		BOOL ret;
+		OVERLAPPED *ovl;
+
+		ULONG entries = min(16, max-dequeued);
+		OVERLAPPED_ENTRY oe[16];
+		ret = GetQueuedCompletionStatusEx(wd->iocp, oe, 16, &entries, mswait, 0);
+		if (ret && entries) {
+			int entry_num;
+
+			for (entry_num=0; entry_num<entries; entry_num++) {
+				ovl = oe[entry_num].lpOverlapped;
+				fov = CONTAINING_RECORD(ovl, struct fio_overlapped, o);
+				io_u = fov->io_u;
+
+				if (ovl->Internal == ERROR_SUCCESS) {
+					io_u->resid = io_u->xfer_buflen - ovl->InternalHigh;
+					io_u->error = 0;
+				} else {
+					io_u->resid = io_u->xfer_buflen;
+					io_u->error = win_to_posix_error(GetLastError());
+				}
+
+				fov->io_complete = FALSE;
+				wd->aio_events[dequeued] = io_u;
+				dequeued++;
+			}
+		}
+
+		if (dequeued >= min ||
+			(t != NULL && timeout_expired(start_count, end_count)))
+			break;
+	} while (1);
+	return dequeued;
+}
+
+/* dequeue completion entries created by the separate IoCompletionRoutine thread */
+static int fio_windowaio_getevents_thread(struct thread_data *td, unsigned int min,
+				    unsigned int max, const struct timespec *t)
 {
 	struct windowsaio_data *wd = td->io_ops_data;
 	unsigned int dequeued = 0;
@@ -334,7 +411,6 @@ static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
 				wd->aio_events[dequeued] = io_u;
 				dequeued++;
 			}
-
 		}
 		if (dequeued >= min)
 			break;
@@ -353,6 +429,16 @@ static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
 	return dequeued;
 }
 
+static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
+				    unsigned int max, const struct timespec *t)
+{
+	struct windowsaio_options *o = td->eo;
+
+	if (o->no_completion_thread)
+		return fio_windowsaio_getevents_nothread(td, min, max, t);
+	return fio_windowaio_getevents_thread(td, min, max, t);
+}
+
 static enum fio_q_status fio_windowsaio_queue(struct thread_data *td,
 					      struct io_u *io_u)
 {
@@ -484,6 +570,8 @@ static struct ioengine_ops ioengine = {
 	.get_file_size	= generic_get_file_size,
 	.io_u_init	= fio_windowsaio_io_u_init,
 	.io_u_free	= fio_windowsaio_io_u_free,
+	.options	= options,
+	.option_struct_size	= sizeof(struct windowsaio_options),
 };
 
 static void fio_init fio_windowsaio_register(void)
diff --git a/examples/cmdprio-bssplit.fio b/examples/cmdprio-bssplit.fio
index 47e9a790..f3b2fac0 100644
--- a/examples/cmdprio-bssplit.fio
+++ b/examples/cmdprio-bssplit.fio
@@ -1,17 +1,44 @@
 ; Randomly read/write a block device file at queue depth 16.
-; 40 % of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
-; 100% of the 64kB reads are executed at the highest priority and
-; all other IOs executed without a priority set.
 [global]
 filename=/dev/sda
 direct=1
 write_lat_log=prio-run.log
 log_prio=1
-
-[randrw]
 rw=randrw
-bssplit=64k/40:1024k/60,1024k/100
 ioengine=libaio
 iodepth=16
+
+; Simple cmdprio_bssplit format. All non-zero percentage entries will
+; use the same prio class and prio level defined by the cmdprio_class
+; and cmdprio options.
+[cmdprio]
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 100% of the 64kB reads are executed with prio class 1 and prio level 0.
+; All other IOs are executed without a priority set.
+bssplit=64k/40:1024k/60,1024k/100
 cmdprio_bssplit=64k/100:1024k/0,1024k/0
 cmdprio_class=1
+cmdprio=0
+
+; Advanced cmdprio_bssplit format. Each non-zero percentage entry can
+; use a different prio class and prio level (appended to each entry).
+[cmdprio-adv]
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 25% of the 64kB reads are executed with prio class 1 and prio level 1,
+; 75% of the 64kB reads are executed with prio class 3 and prio level 2.
+; All other IOs are executed without a priority set.
+stonewall
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0
+
+; Identical to the previous example, but with a default priority defined.
+[cmdprio-adv-def]
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 25% of the 64kB reads are executed with prio class 1 and prio level 1,
+; 75% of the 64kB reads are executed with prio class 3 and prio level 2.
+; All other IOs are executed with prio class 2 and prio level 7.
+stonewall
+prioclass=2
+prio=7
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0
diff --git a/fio.1 b/fio.1
index b87d2309..f32d7915 100644
--- a/fio.1
+++ b/fio.1
@@ -1122,7 +1122,7 @@ see \fBend_fsync\fR and \fBfsync_on_close\fR.
 .TP
 .BI fdatasync \fR=\fPint
 Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and
-not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no
+not metadata blocks. In Windows, DragonFlyBSD or OSX there is no
 \fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2).
 Defaults to 0, which means fio does not periodically issue and wait for a
 data-only sync to complete.
@@ -1995,10 +1995,34 @@ To get a finer control over I/O priority, this option allows specifying
 the percentage of IOs that must have a priority set depending on the block
 size of the IO. This option is useful only when used together with the option
 \fBbssplit\fR, that is, multiple different block sizes are used for reads and
-writes. The format for this option is the same as the format of the
-\fBbssplit\fR option, with the exception that values for trim IOs are
-ignored. This option is mutually exclusive with the \fBcmdprio_percentage\fR
-option.
+writes.
+.RS
+.P
+The first accepted format for this option is the same as the format of the
+\fBbssplit\fR option:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage:blocksize/percentage
+.RE
+.P
+In this case, each entry will use the priority class and priority level defined
+by the options \fBcmdprio_class\fR and \fBcmdprio\fR respectively.
+.P
+The second accepted format for this option is:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level
+.RE
+.P
+In this case, the priority class and priority level is defined inside each
+entry. In comparison with the first accepted format, the second accepted format
+does not restrict all entries to have the same priority class and priority
+level.
+.P
+For both formats, only the read and write data directions are supported, values
+for trim IOs are ignored. This option is mutually exclusive with the
+\fBcmdprio_percentage\fR option.
+.RE
 .TP
 .BI (io_uring)fixedbufs
 If fio is asked to do direct IO, then Linux will map pages for each IO call, and
@@ -3360,6 +3384,17 @@ If set, fio will log Unix timestamps to the log files produced by enabling
 write_type_log for each log type, instead of the default zero-based
 timestamps.
 .TP
+.BI log_alternate_epoch \fR=\fPbool
+If set, fio will log timestamps based on the epoch used by the clock specified
+in the \fBlog_alternate_epoch_clock_id\fR option, to the log files produced by
+enabling write_type_log for each log type, instead of the default zero-based
+timestamps.
+.TP
+.BI log_alternate_epoch_clock_id \fR=\fPint
+Specifies the clock_id to be used by clock_gettime to obtain the alternate epoch
+if either \fBlog_unix_epoch\fR or \fBlog_alternate_epoch\fR is true. Otherwise has no
+effect. Default value is 0, or CLOCK_REALTIME.
+.TP
 .BI block_error_percentiles \fR=\fPbool
 If set, record errors in trim block-sized units from writes and trims and
 output a histogram of how many trims it took to get to errors, and what kind
diff --git a/fio.h b/fio.h
index 1ea3d064..7b0ca843 100644
--- a/fio.h
+++ b/fio.h
@@ -380,7 +380,7 @@ struct thread_data {
 
 	struct timespec start;	/* start of this loop */
 	struct timespec epoch;	/* time job was started */
-	unsigned long long unix_epoch; /* Time job was started, unix epoch based. */
+	unsigned long long alternate_epoch; /* Time job was started, clock_gettime's clock_id epoch based. */
 	struct timespec last_issue;
 	long time_offset;
 	struct timespec ts_cache;
diff --git a/fio_time.h b/fio_time.h
index b3bbd4c0..62d92120 100644
--- a/fio_time.h
+++ b/fio_time.h
@@ -30,6 +30,6 @@ extern bool ramp_time_over(struct thread_data *);
 extern bool in_ramp_time(struct thread_data *);
 extern void fio_time_init(void);
 extern void timespec_add_msec(struct timespec *, unsigned int);
-extern void set_epoch_time(struct thread_data *, int);
+extern void set_epoch_time(struct thread_data *, int, clockid_t);
 
 #endif
diff --git a/gclient.c b/gclient.c
index ac063536..c59bcfe2 100644
--- a/gclient.c
+++ b/gclient.c
@@ -1155,21 +1155,18 @@ out:
 #define GFIO_CLAT	1
 #define GFIO_SLAT	2
 #define GFIO_LAT	4
-#define GFIO_HILAT	8
-#define GFIO_LOLAT	16
 
 static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
 				  struct group_run_stats *rs,
 				  struct thread_stat *ts, int ddir)
 {
 	const char *ddir_label[3] = { "Read", "Write", "Trim" };
-	const char *hilat, *lolat;
 	GtkWidget *frame, *label, *box, *vbox, *main_vbox;
-	unsigned long long min[5], max[5];
+	unsigned long long min[3], max[3];
 	unsigned long runt;
 	unsigned long long bw, iops;
 	unsigned int flags = 0;
-	double mean[5], dev[5];
+	double mean[3], dev[3];
 	char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p;
 	char tmp[128];
 	int i2p;
@@ -1268,14 +1265,6 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
 		flags |= GFIO_CLAT;
 	if (calc_lat(&ts->lat_stat[ddir], &min[2], &max[2], &mean[2], &dev[2]))
 		flags |= GFIO_LAT;
-	if (calc_lat(&ts->clat_high_prio_stat[ddir], &min[3], &max[3], &mean[3], &dev[3])) {
-		flags |= GFIO_HILAT;
-		if (calc_lat(&ts->clat_low_prio_stat[ddir], &min[4], &max[4], &mean[4], &dev[4]))
-			flags |= GFIO_LOLAT;
-		/* we only want to print low priority statistics if other IOs were
-		 * submitted with the priority bit set
-		 */
-	}
 
 	if (flags) {
 		frame = gtk_frame_new("Latency");
@@ -1284,24 +1273,12 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
 		vbox = gtk_vbox_new(FALSE, 3);
 		gtk_container_add(GTK_CONTAINER(frame), vbox);
 
-		if (ts->lat_percentiles) {
-			hilat = "High priority total latency";
-			lolat = "Low priority total latency";
-		} else {
-			hilat = "High priority completion latency";
-			lolat = "Low priority completion latency";
-		}
-
 		if (flags & GFIO_SLAT)
 			gfio_show_lat(vbox, "Submission latency", min[0], max[0], mean[0], dev[0]);
 		if (flags & GFIO_CLAT)
 			gfio_show_lat(vbox, "Completion latency", min[1], max[1], mean[1], dev[1]);
 		if (flags & GFIO_LAT)
 			gfio_show_lat(vbox, "Total latency", min[2], max[2], mean[2], dev[2]);
-		if (flags & GFIO_HILAT)
-			gfio_show_lat(vbox, hilat, min[3], max[3], mean[3], dev[3]);
-		if (flags & GFIO_LOLAT)
-			gfio_show_lat(vbox, lolat, min[4], max[4], mean[4], dev[4]);
 	}
 
 	if (ts->slat_percentiles && flags & GFIO_SLAT)
@@ -1309,40 +1286,16 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
 				ts->io_u_plat[FIO_SLAT][ddir],
 				ts->slat_stat[ddir].samples,
 				"Submission");
-	if (ts->clat_percentiles && flags & GFIO_CLAT) {
+	if (ts->clat_percentiles && flags & GFIO_CLAT)
 		gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
 				ts->io_u_plat[FIO_CLAT][ddir],
 				ts->clat_stat[ddir].samples,
 				"Completion");
-		if (!ts->lat_percentiles) {
-			if (flags & GFIO_HILAT)
-				gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-						ts->io_u_plat_high_prio[ddir],
-						ts->clat_high_prio_stat[ddir].samples,
-						"High priority completion");
-			if (flags & GFIO_LOLAT)
-				gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-						ts->io_u_plat_low_prio[ddir],
-						ts->clat_low_prio_stat[ddir].samples,
-						"Low priority completion");
-		}
-	}
-	if (ts->lat_percentiles && flags & GFIO_LAT) {
+	if (ts->lat_percentiles && flags & GFIO_LAT)
 		gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
 				ts->io_u_plat[FIO_LAT][ddir],
 				ts->lat_stat[ddir].samples,
 				"Total");
-		if (flags & GFIO_HILAT)
-			gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-					ts->io_u_plat_high_prio[ddir],
-					ts->clat_high_prio_stat[ddir].samples,
-					"High priority total");
-		if (flags & GFIO_LOLAT)
-			gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-					ts->io_u_plat_low_prio[ddir],
-					ts->clat_low_prio_stat[ddir].samples,
-					"Low priority total");
-	}
 
 	free(io_p);
 	free(bw_p);
diff --git a/init.c b/init.c
index 07daaa84..13935152 100644
--- a/init.c
+++ b/init.c
@@ -224,6 +224,13 @@ static struct option l_opts[FIO_NR_OPTIONS] = {
 		.has_arg	= optional_argument,
 		.val		= 'S',
 	},
+#ifdef WIN32
+	{
+		.name		= (char *) "server-internal",
+		.has_arg	= required_argument,
+		.val		= 'N',
+	},
+#endif
 	{	.name		= (char *) "daemonize",
 		.has_arg	= required_argument,
 		.val		= 'D',
@@ -1445,6 +1452,26 @@ static bool wait_for_ok(const char *jobname, struct thread_options *o)
 	return true;
 }
 
+/*
+ * Check that td agrees with every other job in its reporting group on the
+ * options that must be identical group-wide (currently lat_percentiles, and
+ * only when stats are enabled for td).
+ *
+ * Returns 0 when consistent, 1 (after logging an error naming jobname) when
+ * a mismatch is found.
+ */
+static int verify_per_group_options(struct thread_data *td, const char *jobname)
+{
+	struct thread_data *other;
+	int idx;
+
+	for_each_td(other, idx) {
+		bool same_group = td->groupid == other->groupid;
+		bool lat_differs =
+			td->o.lat_percentiles != other->o.lat_percentiles;
+
+		if (!same_group || !td->o.stats || !lat_differs)
+			continue;
+
+		log_err("fio: lat_percentiles in job: %s differs from group\n",
+			jobname);
+		return 1;
+	}
+
+	return 0;
+}
+
 /*
  * Treat an empty log file name the same as a one not given
  */
@@ -1563,6 +1590,10 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
 	td->groupid = groupid;
 	prev_group_jobs++;
 
+	if (td->o.group_reporting && prev_group_jobs > 1 &&
+	    verify_per_group_options(td, jobname))
+		goto err;
+
 	if (setup_rate(td))
 		goto err;
 
@@ -2795,6 +2826,12 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
 			exit_val = 1;
 #endif
 			break;
+#ifdef WIN32
+		case 'N':
+			did_arg = true;
+			fio_server_internal_set(optarg);
+			break;
+#endif
 		case 'D':
 			if (pid_file)
 				free(pid_file);
diff --git a/io_u.c b/io_u.c
index 3c72d63d..059637e5 100644
--- a/io_u.c
+++ b/io_u.c
@@ -1595,7 +1595,7 @@ again:
 		assert(io_u->flags & IO_U_F_FREE);
 		io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
 				 IO_U_F_TRIMMED | IO_U_F_BARRIER |
-				 IO_U_F_VER_LIST | IO_U_F_HIGH_PRIO);
+				 IO_U_F_VER_LIST);
 
 		io_u->error = 0;
 		io_u->acct_ddir = -1;
@@ -1803,6 +1803,7 @@ struct io_u *get_io_u(struct thread_data *td)
 	 * Remember the issuing context priority. The IO engine may change this.
 	 */
 	io_u->ioprio = td->ioprio;
+	io_u->clat_prio_index = 0;
 out:
 	assert(io_u->file);
 	if (!td_io_prep(td, io_u)) {
@@ -1889,7 +1890,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 
 		tnsec = ntime_since(&io_u->start_time, &icd->time);
 		add_lat_sample(td, idx, tnsec, bytes, io_u->offset,
-			       io_u->ioprio, io_u_is_high_prio(io_u));
+			       io_u->ioprio, io_u->clat_prio_index);
 
 		if (td->flags & TD_F_PROFILE_OPS) {
 			struct prof_io_ops *ops = &td->prof_io_ops;
@@ -1911,7 +1912,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 	if (ddir_rw(idx)) {
 		if (!td->o.disable_clat) {
 			add_clat_sample(td, idx, llnsec, bytes, io_u->offset,
-					io_u->ioprio, io_u_is_high_prio(io_u));
+					io_u->ioprio, io_u->clat_prio_index);
 			io_u_mark_latency(td, llnsec);
 		}
 
diff --git a/io_u.h b/io_u.h
index bdbac525..206e24fe 100644
--- a/io_u.h
+++ b/io_u.h
@@ -21,7 +21,6 @@ enum {
 	IO_U_F_TRIMMED		= 1 << 5,
 	IO_U_F_BARRIER		= 1 << 6,
 	IO_U_F_VER_LIST		= 1 << 7,
-	IO_U_F_HIGH_PRIO	= 1 << 8,
 };
 
 /*
@@ -50,6 +49,7 @@ struct io_u {
 	 * IO priority.
 	 */
 	unsigned short ioprio;
+	unsigned short clat_prio_index;
 
 	/*
 	 * Allocated/set buffer and length
@@ -193,6 +193,5 @@ static inline enum fio_ddir acct_ddir(struct io_u *io_u)
 	td_flags_clear((td), &(io_u->flags), (val))
 #define io_u_set(td, io_u, val)		\
 	td_flags_set((td), &(io_u)->flags, (val))
-#define io_u_is_high_prio(io_u)	(io_u->flags & IO_U_F_HIGH_PRIO)
 
 #endif
diff --git a/libfio.c b/libfio.c
index 198eaf2e..01fa7452 100644
--- a/libfio.c
+++ b/libfio.c
@@ -142,7 +142,7 @@ void reset_all_stats(struct thread_data *td)
 		td->ts.runtime[i] = 0;
 	}
 
-	set_epoch_time(td, td->o.log_unix_epoch);
+	set_epoch_time(td, td->o.log_unix_epoch | td->o.log_alternate_epoch, td->o.log_alternate_epoch_clock_id);
 	memcpy(&td->start, &td->epoch, sizeof(td->epoch));
 	memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
 	memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
diff --git a/optgroup.h b/optgroup.h
index 1fb84a29..3ac8f62a 100644
--- a/optgroup.h
+++ b/optgroup.h
@@ -71,6 +71,7 @@ enum opt_category_group {
 	__FIO_OPT_G_LIBCUFILE,
 	__FIO_OPT_G_DFS,
 	__FIO_OPT_G_NFS,
+	__FIO_OPT_G_WINDOWSAIO,
 
 	FIO_OPT_G_RATE		= (1ULL << __FIO_OPT_G_RATE),
 	FIO_OPT_G_ZONE		= (1ULL << __FIO_OPT_G_ZONE),
@@ -116,6 +117,7 @@ enum opt_category_group {
 	FIO_OPT_G_FILESTAT	= (1ULL << __FIO_OPT_G_FILESTAT),
 	FIO_OPT_G_LIBCUFILE	= (1ULL << __FIO_OPT_G_LIBCUFILE),
 	FIO_OPT_G_DFS		= (1ULL << __FIO_OPT_G_DFS),
+	FIO_OPT_G_WINDOWSAIO	= (1ULL << __FIO_OPT_G_WINDOWSAIO),
 };
 
 extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
diff --git a/options.c b/options.c
index 102bcf56..6cdbd268 100644
--- a/options.c
+++ b/options.c
@@ -278,6 +278,128 @@ static int str_bssplit_cb(void *data, const char *input)
 	return ret;
 }
 
+/*
+ * Parse one cmdprio_bssplit entry from str into *entry.
+ *
+ * Accepted shapes:
+ *   bs/                  - percentage 0, prio left at -1
+ *   bs/perc              - prio left at -1
+ *   bs/perc/class/level  - prio built from class and level
+ *
+ * The percentage is capped at 100, class and level are capped at
+ * IOPRIO_MAX_PRIO_CLASS and IOPRIO_MAX_PRIO. Returns 0 on success and 1
+ * (after logging) on a malformed entry.
+ */
+static int parse_cmdprio_bssplit_entry(struct thread_options *o,
+				       struct split_prio *entry, char *str)
+{
+	unsigned int pct = 0, prio_class, prio_level;
+	char *bs_token = NULL;
+	long long block_size;
+	int nr_fields, conv_failed;
+
+	nr_fields = sscanf(str, "%m[^/]/%u/%u/%u", &bs_token, &pct,
+			   &prio_class, &prio_level);
+	if (nr_fields < 1) {
+		log_err("fio: invalid cmdprio_bssplit format\n");
+		return 1;
+	}
+
+	/* the %m conversion allocated bs_token; it is only needed here */
+	conv_failed = str_to_decimal(bs_token, &block_size, 1, o, 0, 0);
+	free(bs_token);
+	if (conv_failed) {
+		log_err("fio: split conversion failed\n");
+		return 1;
+	}
+
+	entry->bs = block_size;
+	entry->perc = min(pct, 100u);
+	entry->prio = -1;
+
+	/* bs/ and bs/perc: nothing more to fill in */
+	if (nr_fields == 1 || nr_fields == 2)
+		return 0;
+
+	/* bs/perc/class/level */
+	if (nr_fields == 4) {
+		prio_class = min(prio_class,
+				 (unsigned int) IOPRIO_MAX_PRIO_CLASS);
+		prio_level = min(prio_level, (unsigned int) IOPRIO_MAX_PRIO);
+		entry->prio = ioprio_value(prio_class, prio_level);
+		return 0;
+	}
+
+	/* a class without a level is not a valid entry */
+	log_err("fio: invalid cmdprio_bssplit format\n");
+	return 1;
+}
+
+/*
+ * qsort() comparator that orders split_prio entries by ascending block size.
+ * Yields -1 when the first argument sorts before the second, 1 when it sorts
+ * after it, and 0 when both block sizes are equal.
+ */
+static int fio_split_prio_cmp(const void *p1, const void *p2)
+{
+	const struct split_prio *lhs = p1;
+	const struct split_prio *rhs = p2;
+
+	return (lhs->bs > rhs->bs) - (lhs->bs < rhs->bs);
+}
+
+/*
+ * Parse the ':'-separated cmdprio_bssplit entries in str (one data
+ * direction) into a newly allocated array sorted by block size.
+ *
+ * On success returns 0, stores the array in *entries and the number of
+ * entries kept in *nr_entries; entries with a zero percentage are dropped.
+ * Returns 1 on failure, in which case the outputs are not written.
+ * str is consumed with strsep() and is therefore modified.
+ */
+int split_parse_prio_ddir(struct thread_options *o, struct split_prio **entries,
+			  int *nr_entries, char *str)
+{
+	struct split_prio *parsed;
+	unsigned int nr_kept;
+	char *scratch, *cursor, *tok;
+
+	/*
+	 * First pass, on a private copy because strsep() is destructive:
+	 * count the entries so the array can be sized up front.
+	 */
+	scratch = cursor = strdup(str);
+	if (!scratch)
+		return 1;
+
+	nr_kept = 0;
+	for (tok = strsep(&cursor, ":"); tok != NULL && strlen(tok);
+	     tok = strsep(&cursor, ":"))
+		nr_kept++;
+	free(scratch);
+
+	if (nr_kept > BSSPLIT_MAX) {
+		log_err("fio: too many cmdprio_bssplit entries\n");
+		return 1;
+	}
+
+	parsed = calloc(nr_kept, sizeof(*parsed));
+	if (!parsed)
+		return 1;
+
+	/* Second pass: parse for real, this time consuming str itself. */
+	nr_kept = 0;
+	for (tok = strsep(&str, ":"); tok != NULL && strlen(tok);
+	     tok = strsep(&str, ":")) {
+		struct split_prio *slot = &parsed[nr_kept];
+
+		if (parse_cmdprio_bssplit_entry(o, slot, tok)) {
+			log_err("fio: failed to parse cmdprio_bssplit entry\n");
+			free(parsed);
+			return 1;
+		}
+
+		/*
+		 * A zero percentage entry carries no information: do not
+		 * advance, so the next entry overwrites this slot.
+		 */
+		if (slot->perc)
+			nr_kept++;
+	}
+
+	qsort(parsed, nr_kept, sizeof(*parsed), fio_split_prio_cmp);
+
+	*entries = parsed;
+	*nr_entries = nr_kept;
+
+	return 0;
+}
+
 static int str2error(char *str)
 {
 	const char *err[] = { "EPERM", "ENOENT", "ESRCH", "EINTR", "EIO",
@@ -4392,6 +4514,24 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_LOG,
 		.group = FIO_OPT_G_INVALID,
 	},
+	{
+		.name = "log_alternate_epoch",
+		.lname = "Log epoch alternate",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct thread_options, log_alternate_epoch),
+		.help = "Use alternate epoch time in log files. Uses the same epoch as that is used by clock_gettime with specified log_alternate_epoch_clock_id.",
+		.category = FIO_OPT_C_LOG,
+		.group = FIO_OPT_G_INVALID,
+	},
+	{
+		.name = "log_alternate_epoch_clock_id",
+		.lname = "Log alternate epoch clock_id",
+		.type = FIO_OPT_INT,
+		.off1 = offsetof(struct thread_options, log_alternate_epoch_clock_id),
+		.help = "If log_alternate_epoch or log_unix_epoch is true, this option specifies the clock_id from clock_gettime whose epoch should be used. If neither of those is true, this option has no effect. Default value is 0, or CLOCK_REALTIME",
+		.category = FIO_OPT_C_LOG,
+		.group = FIO_OPT_G_INVALID,
+	},
 	{
 		.name	= "block_error_percentiles",
 		.lname	= "Block error percentiles",
diff --git a/os/os-windows.h b/os/os-windows.h
index 59da9dba..510b8143 100644
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -110,6 +110,8 @@ int nanosleep(const struct timespec *rqtp, struct timespec *rmtp);
 ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
 ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
 		off_t offset);
+HANDLE windows_handle_connection(HANDLE hjob, int sk);
+HANDLE windows_create_job(void);
 
 static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
 {
diff --git a/os/os.h b/os/os.h
index 5965d7b8..810e6166 100644
--- a/os/os.h
+++ b/os/os.h
@@ -119,10 +119,14 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 
 #ifndef FIO_HAVE_IOPRIO_CLASS
 #define ioprio_value_is_class_rt(prio)	(false)
+#define IOPRIO_MIN_PRIO_CLASS		0
+#define IOPRIO_MAX_PRIO_CLASS		0
 #endif
 #ifndef FIO_HAVE_IOPRIO
 #define ioprio_value(prioclass, prio)	(0)
 #define ioprio_set(which, who, prioclass, prio)	(0)
+#define IOPRIO_MIN_PRIO			0
+#define IOPRIO_MAX_PRIO			0
 #endif
 
 #ifndef FIO_HAVE_ODIRECT
diff --git a/os/windows/posix.c b/os/windows/posix.c
index 09c2e4a7..0d415e1e 100644
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -537,16 +537,21 @@ int fcntl(int fildes, int cmd, ...)
 return 0;
 }
 
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif
+
 /*
  * Get the value of a local clock source.
- * This implementation supports 2 clocks: CLOCK_MONOTONIC provides high-accuracy
- * relative time, while CLOCK_REALTIME provides a low-accuracy wall time.
+ * This implementation supports 3 clocks: CLOCK_MONOTONIC/CLOCK_MONOTONIC_RAW
+ * provide high-accuracy relative time, while CLOCK_REALTIME provides a
+ * low-accuracy wall time.
  */
 int clock_gettime(clockid_t clock_id, struct timespec *tp)
 {
 	int rc = 0;
 
-	if (clock_id == CLOCK_MONOTONIC) {
+	if (clock_id == CLOCK_MONOTONIC || clock_id == CLOCK_MONOTONIC_RAW) {
 		static LARGE_INTEGER freq = {{0,0}};
 		LARGE_INTEGER counts;
 		uint64_t t;
@@ -1026,3 +1031,174 @@ in_addr_t inet_network(const char *cp)
 	hbo = ((nbo & 0xFF) << 24) + ((nbo & 0xFF00) << 8) + ((nbo & 0xFF0000) >> 8) + ((nbo & 0xFF000000) >> 24);
 	return hbo;
 }
+
+/*
+ * Create a single-instance, blocking, byte-mode duplex named pipe called
+ * pipe_name and wait for a client to connect to it.
+ *
+ * wait_connect_time is handed to CreateNamedPipe() as the pipe's default
+ * time-out value.
+ *
+ * Returns the connected pipe handle, or INVALID_HANDLE_VALUE after logging
+ * the failing call.
+ */
+static HANDLE create_named_pipe(char *pipe_name, int wait_connect_time)
+{
+	HANDLE hpipe;
+
+	hpipe = CreateNamedPipe (
+			pipe_name,
+			PIPE_ACCESS_DUPLEX,
+			PIPE_WAIT | PIPE_TYPE_BYTE,
+			1, 0, 0, wait_connect_time, NULL);
+
+	if (hpipe == INVALID_HANDLE_VALUE) {
+		log_err("CreateNamedPipe failed (%lu).\n", GetLastError());
+		return INVALID_HANDLE_VALUE;
+	}
+
+	if (!ConnectNamedPipe(hpipe, NULL)) {
+		DWORD err = GetLastError();
+
+		/*
+		 * The client may open the pipe between CreateNamedPipe() and
+		 * ConnectNamedPipe(); that is reported as a zero return with
+		 * ERROR_PIPE_CONNECTED and is a usable connection.
+		 */
+		if (err != ERROR_PIPE_CONNECTED) {
+			log_err("ConnectNamedPipe failed (%lu).\n", err);
+			CloseHandle(hpipe);
+			return INVALID_HANDLE_VALUE;
+		}
+	}
+
+	return hpipe;
+}
+
+/*
+ * Spawn a copy of the current process with args appended to its command line.
+ *
+ * pi   - receives the new process and thread handles; zeroed before use.
+ * args - text appended verbatim to this process's command line.
+ * hjob - optional job object; when non-NULL and valid, the child is created
+ *        suspended, assigned to the job and then resumed.
+ *
+ * Returns 0 on success and 1 on failure (the caller compares against 0).
+ * On failure no handles are left open in pi.
+ */
+static BOOL windows_create_process(PROCESS_INFORMATION *pi, const char *args, HANDLE *hjob)
+{
+	LPSTR this_cmd_line = GetCommandLine();
+	LPSTR new_process_cmd_line;
+	STARTUPINFO si = {0};
+	DWORD flags = CREATE_NEW_CONSOLE;
+	bool use_job = (hjob != NULL) && (*hjob != INVALID_HANDLE_VALUE);
+
+	memset(pi, 0, sizeof(*pi));
+
+	/* one byte per character of both parts, plus the terminating NUL */
+	new_process_cmd_line = malloc(strlen(this_cmd_line) + strlen(args) + 1);
+	if (!new_process_cmd_line) {
+		log_err("fio: unable to allocate child command line\n");
+		return 1;
+	}
+	strcpy(new_process_cmd_line, this_cmd_line);
+	strcat(new_process_cmd_line, args);
+
+	si.cb = sizeof(si);
+
+	/* start suspended so the child cannot run before it joins the job */
+	if (use_job)
+		flags |= CREATE_SUSPENDED | CREATE_BREAKAWAY_FROM_JOB;
+
+	if( !CreateProcess( NULL,
+		new_process_cmd_line,
+		NULL,    /* Process handle not inherited */
+		NULL,    /* Thread handle not inherited */
+		TRUE,    /* child inherits the inheritable handles */
+		flags,
+		NULL,    /* Use parent's environment block */
+		NULL,    /* Use parent's starting directory */
+		&si,
+		pi )
+	)
+	{
+		log_err("CreateProcess failed (%lu).\n", GetLastError() );
+		free(new_process_cmd_line);
+		return 1;
+	}
+	free(new_process_cmd_line);
+
+	if (use_job) {
+		if (!AssignProcessToJobObject(*hjob, pi->hProcess)) {
+			log_err("AssignProcessToJobObject failed (%lu).\n", GetLastError() );
+			/*
+			 * The child is still suspended and would otherwise
+			 * linger forever: kill it and release its handles.
+			 */
+			TerminateProcess(pi->hProcess, 1);
+			CloseHandle(pi->hThread);
+			CloseHandle(pi->hProcess);
+			memset(pi, 0, sizeof(*pi));
+			return 1;
+		}
+
+		ResumeThread(pi->hThread);
+	}
+
+	return 0;
+}
+
+/*
+ * Create an anonymous job object configured with
+ * JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE.
+ *
+ * Returns the job handle, or INVALID_HANDLE_VALUE after logging the error.
+ */
+HANDLE windows_create_job(void)
+{
+	JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli = { 0 };
+	HANDLE hjob = CreateJobObject(NULL, NULL);
+
+	/* CreateJobObject() reports failure with NULL, not INVALID_HANDLE_VALUE */
+	if (hjob == NULL) {
+		log_err( "CreateJobObject failed: error %lu\n", GetLastError() );
+		return INVALID_HANDLE_VALUE;
+	}
+
+	jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE;
+	if (!SetInformationJobObject(hjob, JobObjectExtendedLimitInformation, &jeli, sizeof(jeli))) {
+		log_err( "SetInformationJobObject failed: error %lu\n", GetLastError() );
+		CloseHandle(hjob);
+		return INVALID_HANDLE_VALUE;
+	}
+
+	return hjob;
+}
+
+/*
+ * Wait for a child process to either exit or report that it has connected
+ * to a client.
+ *
+ * Every 10ms the child's exit code is checked and the pipe is read; the child
+ * announces itself by writing the text "connected".
+ *
+ * Returns true once "connected" has been read, false if the child exited (or
+ * its state could not be queried) first.
+ */
+static bool monitor_process_till_connect(PROCESS_INFORMATION *pi, HANDLE *hpipe)
+{
+	bool connected = false;
+	char buffer[32];
+
+	do {
+		DWORD exit_code = 0;
+		DWORD bytes_read = 0;
+
+		if (!GetExitCodeProcess(pi->hProcess, &exit_code)) {
+			log_err("GetExitCodeProcess failed (%lu).\n", GetLastError());
+			break;
+		}
+		if (exit_code != STILL_ACTIVE) {
+			dprint(FD_PROCESS, "process %u exited %d\n", GetProcessId(pi->hProcess), exit_code);
+			break;
+		}
+
+		/* keep one byte spare so the buffer is always NUL terminated */
+		memset(buffer, 0, sizeof(buffer));
+		if (ReadFile(*hpipe, buffer, sizeof(buffer) - 1, &bytes_read, NULL) &&
+		    bytes_read && strstr(buffer, "connected")) {
+			dprint(FD_PROCESS, "process %u connected to client\n", GetProcessId(pi->hProcess));
+			connected = true;
+		}
+		usleep(10*1000);
+	} while (!connected);
+	return connected;
+}
+
+/*
+ * Create a process with --server-internal to emulate fork().
+ *
+ * The child is handed the connection socket sk: its WSAPROTOCOL_INFO is
+ * written to a named pipe whose name is passed on the child's command line.
+ * hjob is the job object the child is assigned to, when it is valid.
+ *
+ * Returns the child's process handle once it has reported that it is
+ * connected, or INVALID_HANDLE_VALUE on any failure or if the child exited
+ * first.
+ */
+HANDLE windows_handle_connection(HANDLE hjob, int sk)
+{
+	char pipe_name[64];
+	char args[128];
+	PROCESS_INFORMATION pi;
+	HANDLE hpipe = INVALID_HANDLE_VALUE;
+	WSAPROTOCOL_INFO protocol_info;
+	DWORD bytes_written = 0;
+	HANDLE ret;
+
+	/* the pipe name is derived from this (the parent's) process id */
+	snprintf(pipe_name, sizeof(pipe_name), "\\\\.\\pipe\\fiointernal-%lu",
+		 (unsigned long) GetCurrentProcessId());
+	snprintf(args, sizeof(args), " --server-internal=%s", pipe_name);
+
+	if (windows_create_process(&pi, args, &hjob) != 0)
+		return INVALID_HANDLE_VALUE;
+	else
+		ret = pi.hProcess;
+
+	/* duplicate socket and write the protocol_info to pipe so child can
+	 * duplicate the communication socket */
+	if (WSADuplicateSocket(sk, GetProcessId(pi.hProcess), &protocol_info)) {
+		log_err("WSADuplicateSocket failed (%lu).\n", GetLastError());
+		ret = INVALID_HANDLE_VALUE;
+		goto cleanup;
+	}
+
+	/* make a pipe with a unique name based upon processid */
+	hpipe = create_named_pipe(pipe_name, 1000);
+	if (hpipe == INVALID_HANDLE_VALUE) {
+		ret = INVALID_HANDLE_VALUE;
+		goto cleanup;
+	}
+
+	/*
+	 * For a non-overlapped write the byte count pointer must not be NULL,
+	 * so a real variable is passed even though the count is not needed.
+	 */
+	if (!WriteFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_written, NULL)) {
+		log_err("WriteFile failed (%lu).\n", GetLastError());
+		ret = INVALID_HANDLE_VALUE;
+		goto cleanup;
+	}
+
+	dprint(FD_PROCESS, "process %d created child process %u\n", GetCurrentProcessId(), GetProcessId(pi.hProcess));
+
+	/* monitor the process until it either exits or connects. This level
+	 * doesn't care which of those occurs because the result is that it
+	 * needs to loop around and create another child process to monitor */
+	if (!monitor_process_till_connect(&pi, &hpipe))
+		ret = INVALID_HANDLE_VALUE;
+
+cleanup:
+	/* close the handles and pipes because this thread is done monitoring them */
+	if (ret == INVALID_HANDLE_VALUE)
+		CloseHandle(pi.hProcess);
+	CloseHandle(pi.hThread);
+	/* the pipe may never have been created on the early failure paths */
+	if (hpipe != INVALID_HANDLE_VALUE) {
+		DisconnectNamedPipe(hpipe);
+		CloseHandle(hpipe);
+	}
+	return ret;
+}
\ No newline at end of file
diff --git a/rate-submit.c b/rate-submit.c
index 752c30a5..268356d1 100644
--- a/rate-submit.c
+++ b/rate-submit.c
@@ -173,7 +173,7 @@ static int io_workqueue_init_worker_fn(struct submit_worker *sw)
 	if (td->io_ops->post_init && td->io_ops->post_init(td))
 		goto err_io_init;
 
-	set_epoch_time(td, td->o.log_unix_epoch);
+	set_epoch_time(td, td->o.log_unix_epoch | td->o.log_alternate_epoch, td->o.log_alternate_epoch_clock_id);
 	fio_getrusage(&td->ru_start);
 	clear_io_state(td, 1);
 
@@ -195,6 +195,15 @@ static void io_workqueue_exit_worker_fn(struct submit_worker *sw,
 	struct thread_data *td = sw->priv;
 
 	(*sum_cnt)++;
+
+	/*
+	 * io_workqueue_update_acct_fn() doesn't support per prio stats, and
+	 * even if it did, offload can't be used with all async IO engines.
+	 * If group reporting is set in the parent td, the group result
+	 * generated by __show_run_stats() can still contain multiple prios
+	 * from different offloaded jobs.
+	 */
+	sw->wq->td->ts.disable_prio_stat = 1;
 	sum_thread_stats(&sw->wq->td->ts, &td->ts);
 
 	fio_options_free(td);
diff --git a/server.c b/server.c
index 90c52e01..914a8c74 100644
--- a/server.c
+++ b/server.c
@@ -63,12 +63,28 @@ static char me[128];
 
 static pthread_key_t sk_out_key;
 
+#ifdef WIN32
+static char *fio_server_pipe_name  = NULL;
+static HANDLE hjob = INVALID_HANDLE_VALUE;
+struct ffi_element {	/* what a fork item tracks on Windows: a thread or a process */
+	union {
+		pthread_t thread;	/* in use when is_thread is true */
+		HANDLE hProcess;	/* in use when is_thread is false */
+	};
+	bool is_thread;	/* selects which union member is valid */
+};
+#endif
+
 struct fio_fork_item {
 	struct flist_head list;
 	int exitval;
 	int signal;
 	int exited;
+#ifdef WIN32
+	struct ffi_element element;
+#else
 	pid_t pid;
+#endif
 };
 
 struct cmd_reply {
@@ -250,6 +266,28 @@ static int fio_send_data(int sk, const void *p, unsigned int len)
 	return fio_sendv_data(sk, &iov, 1);
 }
 
+/*
+ * Poll a single descriptor.
+ *
+ * fd      - descriptor to poll
+ * events  - poll(2) event mask to wait for
+ * timeout - poll(2) timeout
+ *
+ * Returns true only when poll() reports at least one of the requested events
+ * on fd. A timeout, an interrupted call and a poll() error all yield false;
+ * errors other than EINTR are logged.
+ */
+bool fio_server_poll_fd(int fd, short events, int timeout)
+{
+	struct pollfd pfd = {
+		.fd	= fd,
+		.events	= events,
+	};
+	int nr_ready = poll(&pfd, 1, timeout);
+
+	if (nr_ready > 0)
+		return (pfd.revents & events) != 0;
+
+	/* zero is a timeout; EINTR is a quiet failure */
+	if (nr_ready < 0 && errno != EINTR)
+		log_err("fio: poll: %s\n", strerror(errno));
+
+	return false;
+}
+
 static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait)
 {
 	int flags;
@@ -651,6 +689,63 @@ static int fio_net_queue_stop(int error, int signal)
 	return fio_net_send_ack(NULL, error, signal);
 }
 
+#ifdef WIN32
+/*
+ * Allocate a fork item describing *element (copied by value), with its exit
+ * state cleared, and append it to list.
+ */
+static void fio_server_add_fork_item(struct ffi_element *element, struct flist_head *list)
+{
+	struct fio_fork_item *item = malloc(sizeof(*item));
+
+	item->element = *element;
+	item->exited = 0;
+	item->signal = 0;
+	item->exitval = 0;
+
+	flist_add_tail(&item->list, list);
+}
+
+/*
+ * Track the connection-handling child process hProcess on conn_list.
+ */
+static void fio_server_add_conn_pid(struct flist_head *conn_list, HANDLE hProcess)
+{
+	struct ffi_element element = {.hProcess = hProcess, .is_thread=FALSE};
+
+	/* this element holds a process handle, so report its process id */
+	dprint(FD_NET, "server: forked off connection job (pid=%u)\n", (int) GetProcessId(hProcess));
+
+	fio_server_add_fork_item(&element, conn_list);
+}
+
+/*
+ * Track the job thread on job_list.
+ */
+static void fio_server_add_job_pid(struct flist_head *job_list, pthread_t thread)
+{
+	struct ffi_element job;
+
+	job.thread = thread;
+	job.is_thread = TRUE;
+
+	dprint(FD_NET, "server: forked off job job (tid=%u)\n", (int) job.thread);
+	fio_server_add_fork_item(&job, job_list);
+}
+
+/*
+ * Check whether the thread or process tracked by ffi has finished and, if so,
+ * record its exit value and mark the item as exited.
+ */
+static void fio_server_check_fork_item(struct fio_fork_item *ffi)
+{
+	if (ffi->element.is_thread) {
+		void *retval = NULL;
+
+		/*
+		 * Signal 0 only probes the thread; a non-zero result is taken
+		 * to mean it is no longer running.
+		 */
+		if (!pthread_kill(ffi->element.thread, 0))
+			return;
+
+		/*
+		 * The thread is dead, join it to get its status. The status
+		 * is a pointer-sized value, so it must be received in a
+		 * void * and narrowed afterwards.
+		 */
+		pthread_join(ffi->element.thread, &retval);
+
+		ffi->exitval = (int) (intptr_t) retval;
+		if (ffi->exitval)
+			log_err("thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval);
+		dprint(FD_PROCESS, "thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval);
+		ffi->exited = 1;
+	} else {
+		/* stays STILL_ACTIVE if the query itself fails */
+		DWORD exit_val = STILL_ACTIVE;
+
+		GetExitCodeProcess(ffi->element.hProcess, &exit_val);
+
+		if (exit_val != STILL_ACTIVE) {
+			dprint(FD_PROCESS, "process %u exited with %d\n", GetProcessId(ffi->element.hProcess), exit_val);
+			ffi->exited = 1;
+			ffi->exitval = exit_val;
+		}
+	}
+}
+#else
 static void fio_server_add_fork_item(pid_t pid, struct flist_head *list)
 {
 	struct fio_fork_item *ffi;
@@ -698,10 +793,21 @@ static void fio_server_check_fork_item(struct fio_fork_item *ffi)
 		}
 	}
 }
+#endif
 
 static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop)
 {
+#ifdef WIN32
+	if (ffi->element.is_thread)
+		dprint(FD_NET, "tid %u exited, sig=%u, exitval=%d\n", (int) ffi->element.thread, ffi->signal, ffi->exitval);
+	else {
+		dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int)  GetProcessId(ffi->element.hProcess), ffi->signal, ffi->exitval);
+		CloseHandle(ffi->element.hProcess);
+		ffi->element.hProcess = INVALID_HANDLE_VALUE;
+	}
+#else
 	dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval);
+#endif
 
 	/*
 	 * Fold STOP and QUIT...
@@ -762,27 +868,62 @@ static int handle_load_file_cmd(struct fio_net_cmd *cmd)
 	return 0;
 }
 
-static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
-			  struct fio_net_cmd *cmd)
+#ifdef WIN32
+static void *fio_backend_thread(void *data)
 {
-	pid_t pid;
 	int ret;
+	struct sk_out *sk_out = (struct sk_out *) data;
 
 	sk_out_assign(sk_out);
 
+	ret = fio_backend(sk_out);
+	sk_out_drop();
+
+	pthread_exit((void*) (intptr_t) ret);
+	return NULL;
+}
+#endif
+
+static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
+			  struct fio_net_cmd *cmd)
+{
+	int ret;
+
 	fio_time_init();
 	set_genesis_time();
 
-	pid = fork();
-	if (pid) {
-		fio_server_add_job_pid(job_list, pid);
-		return 0;
+#ifdef WIN32
+	{
+		pthread_t thread;
+		/* both this thread and backend_thread call sk_out_assign() to double increment
+		 * the ref count.  This ensures struct is valid until both threads are done with it
+		 */
+		sk_out_assign(sk_out);
+		ret = pthread_create(&thread, NULL,	fio_backend_thread, sk_out);
+		if (ret) {
+			log_err("pthread_create: %s\n", strerror(ret));
+			return ret;
+		}
+
+		fio_server_add_job_pid(job_list, thread);
+		return ret;
 	}
+#else
+    {
+		pid_t pid;
+		sk_out_assign(sk_out);
+		pid = fork();
+		if (pid) {
+			fio_server_add_job_pid(job_list, pid);
+			return 0;
+		}
 
-	ret = fio_backend(sk_out);
-	free_threads_shm();
-	sk_out_drop();
-	_exit(ret);
+		ret = fio_backend(sk_out);
+		free_threads_shm();
+		sk_out_drop();
+		_exit(ret);
+	}
+#endif
 }
 
 static int handle_job_cmd(struct fio_net_cmd *cmd)
@@ -1238,7 +1379,8 @@ static int handle_connection(struct sk_out *sk_out)
 		if (ret < 0)
 			break;
 
-		cmd = fio_net_recv_cmd(sk_out->sk, true);
+		if (pfd.revents & POLLIN)
+			cmd = fio_net_recv_cmd(sk_out->sk, true);
 		if (!cmd) {
 			ret = -1;
 			break;
@@ -1300,6 +1442,73 @@ static int get_my_addr_str(int sk)
 	return 0;
 }
 
+#ifdef WIN32
+static int handle_connection_process(void)
+{
+	WSAPROTOCOL_INFO protocol_info;
+	DWORD bytes_read;
+	HANDLE hpipe;
+	int sk;
+	struct sk_out *sk_out;
+	int ret;
+	char *msg = (char *) "connected";
+
+	log_info("server enter accept loop.  ProcessID %d\n", GetCurrentProcessId());
+
+	hpipe = CreateFile(
+					fio_server_pipe_name,
+					GENERIC_READ | GENERIC_WRITE,
+					0, NULL,
+					OPEN_EXISTING,
+					0, NULL);
+
+	if (hpipe == INVALID_HANDLE_VALUE) {
+		log_err("couldnt open pipe %s error %lu\n",
+				fio_server_pipe_name, GetLastError());
+		return -1;
+	}
+
+	if (!ReadFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_read, NULL)) {
+		log_err("couldnt read pi from pipe %s error %lu\n", fio_server_pipe_name,
+				GetLastError());
+	}
+
+	if (use_ipv6) /* use protocol_info to create a duplicate of parent's socket */
+		sk = WSASocket(AF_INET6, SOCK_STREAM, 0, &protocol_info, 0, 0);
+	else
+		sk = WSASocket(AF_INET,  SOCK_STREAM, 0, &protocol_info, 0, 0);
+
+	sk_out = scalloc(1, sizeof(*sk_out));
+	if (!sk_out) {
+		CloseHandle(hpipe);
+		close(sk);
+		return -1;
+	}
+
+	sk_out->sk = sk;
+	sk_out->hProcess = INVALID_HANDLE_VALUE;
+	INIT_FLIST_HEAD(&sk_out->list);
+	__fio_sem_init(&sk_out->lock, FIO_SEM_UNLOCKED);
+	__fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED);
+	__fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED);
+
+	get_my_addr_str(sk);
+
+	if (!WriteFile(hpipe, msg, strlen(msg), NULL, NULL)) {
+		log_err("couldnt write pipe\n");
+		close(sk);
+		return -1;
+	}
+	CloseHandle(hpipe);
+
+	sk_out_assign(sk_out);
+
+	ret = handle_connection(sk_out);
+	__sk_out_drop(sk_out);
+	return ret;
+}
+#endif
+
 static int accept_loop(int listen_sk)
 {
 	struct sockaddr_in addr;
@@ -1317,8 +1526,11 @@ static int accept_loop(int listen_sk)
 		struct sk_out *sk_out;
 		const char *from;
 		char buf[64];
+#ifdef WIN32
+		HANDLE hProcess;
+#else
 		pid_t pid;
-
+#endif
 		pfd.fd = listen_sk;
 		pfd.events = POLLIN;
 		do {
@@ -1376,6 +1588,13 @@ static int accept_loop(int listen_sk)
 		__fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED);
 		__fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED);
 
+#ifdef WIN32
+		hProcess = windows_handle_connection(hjob, sk);
+		if (hProcess == INVALID_HANDLE_VALUE)
+			return -1;
+		sk_out->hProcess = hProcess;
+		fio_server_add_conn_pid(&conn_list, hProcess);
+#else
 		pid = fork();
 		if (pid) {
 			close(sk);
@@ -1392,6 +1611,7 @@ static int accept_loop(int listen_sk)
 		 */
 		sk_out_assign(sk_out);
 		handle_connection(sk_out);
+#endif
 	}
 
 	return exitval;
@@ -1465,8 +1685,11 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
 {
 	struct cmd_ts_pdu p;
 	int i, j, k;
-	void *ss_buf;
-	uint64_t *ss_iops, *ss_bw;
+	size_t clat_prio_stats_extra_size = 0;
+	size_t ss_extra_size = 0;
+	size_t extended_buf_size = 0;
+	void *extended_buf;
+	void *extended_buf_wp;
 
 	dprint(FD_NET, "server sending end stats\n");
 
@@ -1483,6 +1706,8 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
 	p.ts.pid		= cpu_to_le32(ts->pid);
 	p.ts.members		= cpu_to_le32(ts->members);
 	p.ts.unified_rw_rep	= cpu_to_le32(ts->unified_rw_rep);
+	p.ts.ioprio		= cpu_to_le32(ts->ioprio);
+	p.ts.disable_prio_stat	= cpu_to_le32(ts->disable_prio_stat);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		convert_io_stat(&p.ts.clat_stat[i], &ts->clat_stat[i]);
@@ -1577,38 +1802,88 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
 	p.ts.cachehit		= cpu_to_le64(ts->cachehit);
 	p.ts.cachemiss		= cpu_to_le64(ts->cachemiss);
 
+	convert_gs(&p.rs, rs);
+
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
-			p.ts.io_u_plat_high_prio[i][j] = cpu_to_le64(ts->io_u_plat_high_prio[i][j]);
-			p.ts.io_u_plat_low_prio[i][j] = cpu_to_le64(ts->io_u_plat_low_prio[i][j]);
+		if (ts->nr_clat_prio[i])
+			clat_prio_stats_extra_size += ts->nr_clat_prio[i] * sizeof(*ts->clat_prio[i]);
+	}
+	extended_buf_size += clat_prio_stats_extra_size;
+
+	dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
+	if (ts->ss_state & FIO_SS_DATA)
+		ss_extra_size = 2 * ts->ss_dur * sizeof(uint64_t);
+
+	extended_buf_size += ss_extra_size;
+	if (!extended_buf_size) {
+		fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
+		return;
+	}
+
+	extended_buf_size += sizeof(p);
+	extended_buf = calloc(1, extended_buf_size);
+	if (!extended_buf) {
+		log_err("fio: failed to allocate FIO_NET_CMD_TS buffer\n");
+		return;
+	}
+
+	memcpy(extended_buf, &p, sizeof(p));
+	extended_buf_wp = (struct cmd_ts_pdu *)extended_buf + 1;
+
+	if (clat_prio_stats_extra_size) {
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			struct clat_prio_stat *prio = (struct clat_prio_stat *) extended_buf_wp;
+
+			for (j = 0; j < ts->nr_clat_prio[i]; j++) {
+				for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
+					prio->io_u_plat[k] =
+						cpu_to_le64(ts->clat_prio[i][j].io_u_plat[k]);
+				convert_io_stat(&prio->clat_stat,
+						&ts->clat_prio[i][j].clat_stat);
+				prio->ioprio = cpu_to_le32(ts->clat_prio[i][j].ioprio);
+				prio++;
+			}
+
+			if (ts->nr_clat_prio[i]) {
+				uint64_t offset = (char *)extended_buf_wp - (char *)extended_buf;
+				struct cmd_ts_pdu *ptr = extended_buf;
+
+				ptr->ts.clat_prio_offset[i] = cpu_to_le64(offset);
+				ptr->ts.nr_clat_prio[i] = cpu_to_le32(ts->nr_clat_prio[i]);
+			}
+
+			extended_buf_wp = prio;
 		}
-		convert_io_stat(&p.ts.clat_high_prio_stat[i], &ts->clat_high_prio_stat[i]);
-		convert_io_stat(&p.ts.clat_low_prio_stat[i], &ts->clat_low_prio_stat[i]);
 	}
 
-	convert_gs(&p.rs, rs);
+	if (ss_extra_size) {
+		uint64_t *ss_iops, *ss_bw;
+		uint64_t offset;
+		struct cmd_ts_pdu *ptr = extended_buf;
 
-	dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
-	if (ts->ss_state & FIO_SS_DATA) {
 		dprint(FD_NET, "server sending steadystate ring buffers\n");
 
-		ss_buf = malloc(sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t));
+		/* ss iops */
+		ss_iops = (uint64_t *) extended_buf_wp;
+		for (i = 0; i < ts->ss_dur; i++)
+			ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
 
-		memcpy(ss_buf, &p, sizeof(p));
+		offset = (char *)extended_buf_wp - (char *)extended_buf;
+		ptr->ts.ss_iops_data_offset = cpu_to_le64(offset);
+		extended_buf_wp = ss_iops + (int) ts->ss_dur;
 
-		ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1);
-		ss_bw = ss_iops + (int) ts->ss_dur;
-		for (i = 0; i < ts->ss_dur; i++) {
-			ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
+		/* ss bw */
+		ss_bw = extended_buf_wp;
+		for (i = 0; i < ts->ss_dur; i++)
 			ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]);
-		}
-
-		fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY);
 
-		free(ss_buf);
+		offset = (char *)extended_buf_wp - (char *)extended_buf;
+		ptr->ts.ss_bw_data_offset = cpu_to_le64(offset);
+		extended_buf_wp = ss_bw + (int) ts->ss_dur;
 	}
-	else
-		fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
+
+	fio_net_queue_cmd(FIO_NET_CMD_TS, extended_buf, extended_buf_size, NULL, SK_F_COPY);
+	free(extended_buf);
 }
 
 void fio_server_send_gs(struct group_run_stats *rs)
@@ -2489,12 +2764,25 @@ static int fio_server(void)
 	if (fio_handle_server_arg())
 		return -1;
 
+	set_sig_handlers();
+
+#ifdef WIN32
+	/* if this is a child process, go handle the connection */
+	if (fio_server_pipe_name != NULL) {
+		ret = handle_connection_process();
+		return ret;
+	}
+
+	/* job to link child processes so they terminate together */
+	hjob = windows_create_job();
+	if (hjob == INVALID_HANDLE_VALUE)
+		return -1;
+#endif
+
 	sk = fio_init_server_connection();
 	if (sk < 0)
 		return -1;
 
-	set_sig_handlers();
-
 	ret = accept_loop(sk);
 
 	close(sk);
@@ -2635,3 +2923,10 @@ void fio_server_set_arg(const char *arg)
 {
 	fio_server_arg = strdup(arg);
 }
+
+#ifdef WIN32
+void fio_server_internal_set(const char *arg)
+{
+	fio_server_pipe_name = strdup(arg);
+}
+#endif
diff --git a/server.h b/server.h
index 25b6bbdc..0e62b6df 100644
--- a/server.h
+++ b/server.h
@@ -15,6 +15,9 @@ struct sk_out {
 	unsigned int refs;	/* frees sk_out when it drops to zero.
 				 * protected by below ->lock */
 
+#ifdef WIN32
+	HANDLE hProcess;		/* process handle of handle_connection_process*/
+#endif
 	int sk;			/* socket fd to talk to client */
 	struct fio_sem lock;	/* protects ref and below list */
 	struct flist_head list;	/* list of pending transmit work */
@@ -48,7 +51,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-	FIO_SERVER_VER			= 95,
+	FIO_SERVER_VER			= 96,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
@@ -212,6 +215,7 @@ extern int fio_server_text_output(int, const char *, size_t);
 extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *);
 extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *);
 extern void fio_server_set_arg(const char *);
+extern void fio_server_internal_set(const char *);
 extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *);
 extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *);
 extern const char *fio_server_op(unsigned int);
@@ -222,6 +226,7 @@ extern void fio_server_send_gs(struct group_run_stats *);
 extern void fio_server_send_du(void);
 extern void fio_server_send_job_options(struct flist_head *, unsigned int);
 extern int fio_server_get_verify_state(const char *, int, void **);
+extern bool fio_server_poll_fd(int fd, short events, int timeout);
 
 extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait);
 
diff --git a/stat.c b/stat.c
index b08d2f25..0876222a 100644
--- a/stat.c
+++ b/stat.c
@@ -265,6 +265,18 @@ static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr,
 	free(ovals);
 }
 
+static int get_nr_prios_with_samples(struct thread_stat *ts, enum fio_ddir ddir)
+{
+	int i, nr_prios_with_samples = 0;
+
+	for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+		if (ts->clat_prio[ddir][i].clat_stat.samples)
+			nr_prios_with_samples++;
+	}
+
+	return nr_prios_with_samples;
+}
+
 bool calc_lat(struct io_stat *is, unsigned long long *min,
 	      unsigned long long *max, double *mean, double *dev)
 {
@@ -491,7 +503,8 @@ static struct thread_stat *gen_mixed_ddir_stats_from_ts(struct thread_stat *ts)
 	return ts_lcl;
 }
 
-static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, int mean)
+static double convert_agg_kbytes_percent(struct group_run_stats *rs,
+					 enum fio_ddir ddir, int mean)
 {
 	double p_of_agg = 100.0;
 	if (rs && rs->agg[ddir] > 1024) {
@@ -504,13 +517,14 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i
 }
 
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
-			     int ddir, struct buf_output *out)
+			     enum fio_ddir ddir, struct buf_output *out)
 {
 	unsigned long runt;
 	unsigned long long min, max, bw, iops;
 	double mean, dev;
 	char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
-	int i2p;
+	int i2p, i;
+	const char *clat_type = ts->lat_percentiles ? "lat" : "clat";
 
 	if (ddir_sync(ddir)) {
 		if (calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) {
@@ -571,12 +585,22 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 		display_lat("clat", min, max, mean, dev, out);
 	if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
 		display_lat(" lat", min, max, mean, dev, out);
-	if (calc_lat(&ts->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
-		display_lat(ts->lat_percentiles ? "high prio_lat" : "high prio_clat",
-				min, max, mean, dev, out);
-		if (calc_lat(&ts->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
-			display_lat(ts->lat_percentiles ? "low prio_lat" : "low prio_clat",
-					min, max, mean, dev, out);
+
+	/* Only print per prio stats if there are >= 2 prios with samples */
+	if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+		for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+			if (calc_lat(&ts->clat_prio[ddir][i].clat_stat, &min,
+				     &max, &mean, &dev)) {
+				char buf[64];
+
+				snprintf(buf, sizeof(buf),
+					 "%s prio %u/%u",
+					 clat_type,
+					 ts->clat_prio[ddir][i].ioprio >> 13,
+					 ts->clat_prio[ddir][i].ioprio & 7);
+				display_lat(buf, min, max, mean, dev, out);
+			}
+		}
 	}
 
 	if (ts->slat_percentiles && ts->slat_stat[ddir].samples > 0)
@@ -596,8 +620,7 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 					ts->percentile_precision, "lat", out);
 
 	if (ts->clat_percentiles || ts->lat_percentiles) {
-		const char *name = ts->lat_percentiles ? "lat" : "clat";
-		char prio_name[32];
+		char prio_name[64];
 		uint64_t samples;
 
 		if (ts->lat_percentiles)
@@ -605,25 +628,24 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 		else
 			samples = ts->clat_stat[ddir].samples;
 
-		/* Only print this if some high and low priority stats were collected */
-		if (ts->clat_high_prio_stat[ddir].samples > 0 &&
-			ts->clat_low_prio_stat[ddir].samples > 0)
-		{
-			sprintf(prio_name, "high prio (%.2f%%) %s",
-					100. * (double) ts->clat_high_prio_stat[ddir].samples / (double) samples,
-					name);
-			show_clat_percentiles(ts->io_u_plat_high_prio[ddir],
-						ts->clat_high_prio_stat[ddir].samples,
-						ts->percentile_list,
-						ts->percentile_precision, prio_name, out);
-
-			sprintf(prio_name, "low prio (%.2f%%) %s",
-					100. * (double) ts->clat_low_prio_stat[ddir].samples / (double) samples,
-					name);
-			show_clat_percentiles(ts->io_u_plat_low_prio[ddir],
-						ts->clat_low_prio_stat[ddir].samples,
-						ts->percentile_list,
-						ts->percentile_precision, prio_name, out);
+		/* Only print per prio stats if there are >= 2 prios with samples */
+		if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+			for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+				uint64_t prio_samples = ts->clat_prio[ddir][i].clat_stat.samples;
+
+				if (prio_samples > 0) {
+					snprintf(prio_name, sizeof(prio_name),
+						 "%s prio %u/%u (%.2f%% of IOs)",
+						 clat_type,
+						 ts->clat_prio[ddir][i].ioprio >> 13,
+						 ts->clat_prio[ddir][i].ioprio & 7,
+						 100. * (double) prio_samples / (double) samples);
+					show_clat_percentiles(ts->clat_prio[ddir][i].io_u_plat,
+							      prio_samples, ts->percentile_list,
+							      ts->percentile_precision,
+							      prio_name, out);
+				}
+			}
 		}
 	}
 
@@ -678,6 +700,7 @@ static void show_mixed_ddir_status(struct group_run_stats *rs,
 	if (ts_lcl)
 		show_ddir_status(rs, ts_lcl, DDIR_READ, out);
 
+	free_clat_prio_stats(ts_lcl);
 	free(ts_lcl);
 }
 
@@ -1251,8 +1274,9 @@ static void show_thread_status_normal(struct thread_stat *ts,
 }
 
 static void show_ddir_status_terse(struct thread_stat *ts,
-				   struct group_run_stats *rs, int ddir,
-				   int ver, struct buf_output *out)
+				   struct group_run_stats *rs,
+				   enum fio_ddir ddir, int ver,
+				   struct buf_output *out)
 {
 	unsigned long long min, max, minv, maxv, bw, iops;
 	unsigned long long *ovals = NULL;
@@ -1351,6 +1375,7 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
 	if (ts_lcl)
 		show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
 
+	free_clat_prio_stats(ts_lcl);
 	free(ts_lcl);
 }
 
@@ -1407,7 +1432,8 @@ static struct json_object *add_ddir_lat_json(struct thread_stat *ts,
 }
 
 static void add_ddir_status_json(struct thread_stat *ts,
-		struct group_run_stats *rs, int ddir, struct json_object *parent)
+				 struct group_run_stats *rs, enum fio_ddir ddir,
+				 struct json_object *parent)
 {
 	unsigned long long min, max;
 	unsigned long long bw_bytes, bw;
@@ -1467,25 +1493,37 @@ static void add_ddir_status_json(struct thread_stat *ts,
 	if (!ddir_rw(ddir))
 		return;
 
-	/* Only print PRIO latencies if some high priority samples were gathered */
-	if (ts->clat_high_prio_stat[ddir].samples > 0) {
-		const char *high, *low;
+	/* Only include per prio stats if there are >= 2 prios with samples */
+	if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+		struct json_array *array = json_create_array();
+		const char *obj_name;
+		int i;
 
-		if (ts->lat_percentiles) {
-			high = "lat_high_prio";
-			low = "lat_low_prio";
-		} else {
-			high = "clat_high_prio";
-			low = "clat_low_prio";
+		if (ts->lat_percentiles)
+			obj_name = "lat_ns";
+		else
+			obj_name = "clat_ns";
+
+		json_object_add_value_array(dir_object, "prios", array);
+
+		for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+			if (ts->clat_prio[ddir][i].clat_stat.samples > 0) {
+				struct json_object *obj = json_create_object();
+				unsigned long long class, level;
+
+				class = ts->clat_prio[ddir][i].ioprio >> 13;
+				json_object_add_value_int(obj, "prioclass", class);
+				level = ts->clat_prio[ddir][i].ioprio & 7;
+				json_object_add_value_int(obj, "prio", level);
+
+				tmp_object = add_ddir_lat_json(ts,
+							       ts->clat_percentiles | ts->lat_percentiles,
+							       &ts->clat_prio[ddir][i].clat_stat,
+							       ts->clat_prio[ddir][i].io_u_plat);
+				json_object_add_value_object(obj, obj_name, tmp_object);
+				json_array_add_value_object(array, obj);
+			}
 		}
-
-		tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles,
-				&ts->clat_high_prio_stat[ddir], ts->io_u_plat_high_prio[ddir]);
-		json_object_add_value_object(dir_object, high, tmp_object);
-
-		tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles,
-				&ts->clat_low_prio_stat[ddir], ts->io_u_plat_low_prio[ddir]);
-		json_object_add_value_object(dir_object, low, tmp_object);
 	}
 
 	if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
@@ -1534,6 +1572,7 @@ static void add_mixed_ddir_status_json(struct thread_stat *ts,
 	if (ts_lcl)
 		add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
 
+	free_clat_prio_stats(ts_lcl);
 	free(ts_lcl);
 }
 
@@ -1995,6 +2034,215 @@ void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src)
 		dst->sig_figs = src->sig_figs;
 }
 
+/*
+ * Free the clat_prio_stat arrays allocated by alloc_clat_prio_stat_ddir().
+ */
+void free_clat_prio_stats(struct thread_stat *ts)
+{
+	enum fio_ddir ddir;
+
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+		sfree(ts->clat_prio[ddir]);
+		ts->clat_prio[ddir] = NULL;
+		ts->nr_clat_prio[ddir] = 0;
+	}
+}
+
+/*
+ * Allocate a clat_prio_stat array. The array has to be allocated/freed using
+ * smalloc/sfree, so that it is accessible by the process/thread summing the
+ * thread_stats.
+ */
+int alloc_clat_prio_stat_ddir(struct thread_stat *ts, enum fio_ddir ddir,
+			      int nr_prios)
+{
+	struct clat_prio_stat *clat_prio;
+	int i;
+
+	clat_prio = scalloc(nr_prios, sizeof(*ts->clat_prio[ddir]));
+	if (!clat_prio) {
+		log_err("fio: failed to allocate ts clat data\n");
+		return 1;
+	}
+
+	for (i = 0; i < nr_prios; i++)
+		clat_prio[i].clat_stat.min_val = ULONG_MAX;
+
+	ts->clat_prio[ddir] = clat_prio;
+	ts->nr_clat_prio[ddir] = nr_prios;
+
+	return 0;
+}
+
+static int grow_clat_prio_stat(struct thread_stat *dst, enum fio_ddir ddir)
+{
+	int curr_len = dst->nr_clat_prio[ddir];
+	void *new_arr;
+
+	new_arr = scalloc(curr_len + 1, sizeof(*dst->clat_prio[ddir]));
+	if (!new_arr) {
+		log_err("fio: failed to grow clat prio array\n");
+		return 1;
+	}
+
+	memcpy(new_arr, dst->clat_prio[ddir],
+	       curr_len * sizeof(*dst->clat_prio[ddir]));
+	sfree(dst->clat_prio[ddir]);
+
+	dst->clat_prio[ddir] = new_arr;
+	dst->clat_prio[ddir][curr_len].clat_stat.min_val = ULONG_MAX;
+	dst->nr_clat_prio[ddir]++;
+
+	return 0;
+}
+
+static int find_clat_prio_index(struct thread_stat *dst, enum fio_ddir ddir,
+				uint32_t ioprio)
+{
+	int i, nr_prios = dst->nr_clat_prio[ddir];
+
+	for (i = 0; i < nr_prios; i++) {
+		if (dst->clat_prio[ddir][i].ioprio == ioprio)
+			return i;
+	}
+
+	return -1;
+}
+
+static int alloc_or_get_clat_prio_index(struct thread_stat *dst,
+					enum fio_ddir ddir, uint32_t ioprio,
+					int *idx)
+{
+	int index = find_clat_prio_index(dst, ddir, ioprio);
+
+	if (index == -1) {
+		index = dst->nr_clat_prio[ddir];
+
+		if (grow_clat_prio_stat(dst, ddir))
+			return 1;
+
+		dst->clat_prio[ddir][index].ioprio = ioprio;
+	}
+
+	*idx = index;
+
+	return 0;
+}
+
+static int clat_prio_stats_copy(struct thread_stat *dst, struct thread_stat *src,
+				enum fio_ddir dst_ddir, enum fio_ddir src_ddir)
+{
+	size_t sz = sizeof(*src->clat_prio[src_ddir]) *
+		src->nr_clat_prio[src_ddir];
+
+	dst->clat_prio[dst_ddir] = smalloc(sz);
+	if (!dst->clat_prio[dst_ddir]) {
+		log_err("fio: failed to alloc clat prio array\n");
+		return 1;
+	}
+
+	memcpy(dst->clat_prio[dst_ddir], src->clat_prio[src_ddir], sz);
+	dst->nr_clat_prio[dst_ddir] = src->nr_clat_prio[src_ddir];
+
+	return 0;
+}
+
+static int clat_prio_stat_add_samples(struct thread_stat *dst,
+				      enum fio_ddir dst_ddir, uint32_t ioprio,
+				      struct io_stat *io_stat,
+				      uint64_t *io_u_plat)
+{
+	int i, dst_index;
+
+	if (!io_stat->samples)
+		return 0;
+
+	if (alloc_or_get_clat_prio_index(dst, dst_ddir, ioprio, &dst_index))
+		return 1;
+
+	sum_stat(&dst->clat_prio[dst_ddir][dst_index].clat_stat, io_stat,
+		 false);
+
+	for (i = 0; i < FIO_IO_U_PLAT_NR; i++)
+		dst->clat_prio[dst_ddir][dst_index].io_u_plat[i] += io_u_plat[i];
+
+	return 0;
+}
+
+static int sum_clat_prio_stats_src_single_prio(struct thread_stat *dst,
+					       struct thread_stat *src,
+					       enum fio_ddir dst_ddir,
+					       enum fio_ddir src_ddir)
+{
+	struct io_stat *io_stat;
+	uint64_t *io_u_plat;
+
+	/*
+	 * If src ts has no clat_prio_stat array, then all I/Os were submitted
+	 * using src->ioprio. Thus, the global samples in src->clat_stat (or
+	 * src->lat_stat) can be used as the 'per prio' samples for src->ioprio.
+	 */
+	assert(!src->clat_prio[src_ddir]);
+	assert(src->nr_clat_prio[src_ddir] == 0);
+
+	if (src->lat_percentiles) {
+		io_u_plat = src->io_u_plat[FIO_LAT][src_ddir];
+		io_stat = &src->lat_stat[src_ddir];
+	} else {
+		io_u_plat = src->io_u_plat[FIO_CLAT][src_ddir];
+		io_stat = &src->clat_stat[src_ddir];
+	}
+
+	return clat_prio_stat_add_samples(dst, dst_ddir, src->ioprio, io_stat,
+					  io_u_plat);
+}
+
+static int sum_clat_prio_stats_src_multi_prio(struct thread_stat *dst,
+					      struct thread_stat *src,
+					      enum fio_ddir dst_ddir,
+					      enum fio_ddir src_ddir)
+{
+	int i;
+
+	/*
+	 * If src ts has a clat_prio_stat array, then there are multiple prios
+	 * in use (i.e. src ts had cmdprio_percentage or cmdprio_bssplit set).
+	 * The samples for the default prio will exist in the src->clat_prio
+	 * array, just like the samples for any other prio.
+	 */
+	assert(src->clat_prio[src_ddir]);
+	assert(src->nr_clat_prio[src_ddir]);
+
+	/* If the dst ts doesn't yet have a clat_prio array, simply memcpy. */
+	if (!dst->clat_prio[dst_ddir])
+		return clat_prio_stats_copy(dst, src, dst_ddir, src_ddir);
+
+	/* The dst ts already has a clat_prio array; add src stats into it. */
+	for (i = 0; i < src->nr_clat_prio[src_ddir]; i++) {
+		struct io_stat *io_stat = &src->clat_prio[src_ddir][i].clat_stat;
+		uint64_t *io_u_plat = src->clat_prio[src_ddir][i].io_u_plat;
+		uint32_t ioprio = src->clat_prio[src_ddir][i].ioprio;
+
+		if (clat_prio_stat_add_samples(dst, dst_ddir, ioprio, io_stat, io_u_plat))
+			return 1;
+	}
+
+	return 0;
+}
+
+static int sum_clat_prio_stats(struct thread_stat *dst, struct thread_stat *src,
+			       enum fio_ddir dst_ddir, enum fio_ddir src_ddir)
+{
+	if (dst->disable_prio_stat)
+		return 0;
+
+	if (!src->clat_prio[src_ddir])
+		return sum_clat_prio_stats_src_single_prio(dst, src, dst_ddir,
+							   src_ddir);
+
+	return sum_clat_prio_stats_src_multi_prio(dst, src, dst_ddir, src_ddir);
+}
+
 void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
 {
 	int k, l, m;
@@ -2002,12 +2250,11 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
 	for (l = 0; l < DDIR_RWDIR_CNT; l++) {
 		if (dst->unified_rw_rep != UNIFIED_MIXED) {
 			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], false);
-			sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], false);
-			sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], false);
 			sum_stat(&dst->slat_stat[l], &src->slat_stat[l], false);
 			sum_stat(&dst->lat_stat[l], &src->lat_stat[l], false);
 			sum_stat(&dst->bw_stat[l], &src->bw_stat[l], true);
 			sum_stat(&dst->iops_stat[l], &src->iops_stat[l], true);
+			sum_clat_prio_stats(dst, src, l, l);
 
 			dst->io_bytes[l] += src->io_bytes[l];
 
@@ -2015,12 +2262,11 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
 				dst->runtime[l] = src->runtime[l];
 		} else {
 			sum_stat(&dst->clat_stat[0], &src->clat_stat[l], false);
-			sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], false);
-			sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], false);
 			sum_stat(&dst->slat_stat[0], &src->slat_stat[l], false);
 			sum_stat(&dst->lat_stat[0], &src->lat_stat[l], false);
 			sum_stat(&dst->bw_stat[0], &src->bw_stat[l], true);
 			sum_stat(&dst->iops_stat[0], &src->iops_stat[l], true);
+			sum_clat_prio_stats(dst, src, 0, l);
 
 			dst->io_bytes[0] += src->io_bytes[l];
 
@@ -2074,19 +2320,6 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
 	for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
 		dst->io_u_sync_plat[k] += src->io_u_sync_plat[k];
 
-	for (k = 0; k < DDIR_RWDIR_CNT; k++) {
-		for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
-			if (dst->unified_rw_rep != UNIFIED_MIXED) {
-				dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m];
-				dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m];
-			} else {
-				dst->io_u_plat_high_prio[0][m] += src->io_u_plat_high_prio[k][m];
-				dst->io_u_plat_low_prio[0][m] += src->io_u_plat_low_prio[k][m];
-			}
-
-		}
-	}
-
 	dst->total_run_time += src->total_run_time;
 	dst->total_submit += src->total_submit;
 	dst->total_complete += src->total_complete;
@@ -2114,8 +2347,6 @@ void init_thread_stat_min_vals(struct thread_stat *ts)
 		ts->lat_stat[i].min_val = ULONG_MAX;
 		ts->bw_stat[i].min_val = ULONG_MAX;
 		ts->iops_stat[i].min_val = ULONG_MAX;
-		ts->clat_high_prio_stat[i].min_val = ULONG_MAX;
-		ts->clat_low_prio_stat[i].min_val = ULONG_MAX;
 	}
 	ts->sync_stat.min_val = ULONG_MAX;
 }
@@ -2128,6 +2359,58 @@ void init_thread_stat(struct thread_stat *ts)
 	ts->groupid = -1;
 }
 
+static void init_per_prio_stats(struct thread_stat *threadstats, int nr_ts)
+{
+	struct thread_data *td;
+	struct thread_stat *ts;
+	int i, j, last_ts, idx;
+	enum fio_ddir ddir;
+
+	j = 0;
+	last_ts = -1;
+	idx = 0;
+
+	/*
+	 * Loop through all tds, if a td requires per prio stats, temporarily
+	 * store a 1 in ts->disable_prio_stat, and then do an additional
+	 * loop at the end where we invert the ts->disable_prio_stat values.
+	 */
+	for_each_td(td, i) {
+		if (!td->o.stats)
+			continue;
+		if (idx &&
+		    (!td->o.group_reporting ||
+		     (td->o.group_reporting && last_ts != td->groupid))) {
+			idx = 0;
+			j++;
+		}
+
+		last_ts = td->groupid;
+		ts = &threadstats[j];
+
+		/* idx == 0 means first td in group, or td is not in a group. */
+		if (idx == 0)
+			ts->ioprio = td->ioprio;
+		else if (td->ioprio != ts->ioprio)
+			ts->disable_prio_stat = 1;
+
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			if (td->ts.clat_prio[ddir]) {
+				ts->disable_prio_stat = 1;
+				break;
+			}
+		}
+
+		idx++;
+	}
+
+	/* Loop through all dst threadstats and fixup the values. */
+	for (i = 0; i < nr_ts; i++) {
+		ts = &threadstats[i];
+		ts->disable_prio_stat = !ts->disable_prio_stat;
+	}
+}
+
 void __show_run_stats(void)
 {
 	struct group_run_stats *runstats, *rs;
@@ -2174,6 +2457,8 @@ void __show_run_stats(void)
 		opt_lists[i] = NULL;
 	}
 
+	init_per_prio_stats(threadstats, nr_ts);
+
 	j = 0;
 	last_ts = -1;
 	idx = 0;
@@ -2198,7 +2483,6 @@ void __show_run_stats(void)
 		opt_lists[j] = &td->opt_list;
 
 		idx++;
-		ts->members++;
 
 		if (ts->groupid == -1) {
 			/*
@@ -2265,6 +2549,8 @@ void __show_run_stats(void)
 
 		sum_thread_stats(ts, &td->ts);
 
+		ts->members++;
+
 		if (td->o.ss_dur) {
 			ts->ss_state = td->ss.state;
 			ts->ss_dur = td->ss.dur;
@@ -2313,7 +2599,7 @@ void __show_run_stats(void)
 	}
 
 	for (i = 0; i < groupid + 1; i++) {
-		int ddir;
+		enum fio_ddir ddir;
 
 		rs = &runstats[i];
 
@@ -2419,6 +2705,12 @@ void __show_run_stats(void)
 
 	log_info_flush();
 	free(runstats);
+
+	/* free arrays allocated by sum_thread_stats(), if any */
+	for (i = 0; i < nr_ts; i++) {
+		ts = &threadstats[i];
+		free_clat_prio_stats(ts);
+	}
 	free(threadstats);
 	free(opt_lists);
 }
@@ -2545,6 +2837,14 @@ static inline void add_stat_sample(struct io_stat *is, unsigned long long data)
 	is->samples++;
 }
 
+static inline void add_stat_prio_sample(struct clat_prio_stat *clat_prio,
+					unsigned short clat_prio_index,
+					unsigned long long nsec)
+{
+	if (clat_prio)
+		add_stat_sample(&clat_prio[clat_prio_index].clat_stat, nsec);
+}
+
 /*
  * Return a struct io_logs, which is added to the tail of the log
  * list for 'iolog'.
@@ -2717,7 +3017,7 @@ static void __add_log_sample(struct io_log *iolog, union io_sample_data data,
 		s = get_sample(iolog, cur_log, cur_log->nr_samples);
 
 		s->data = data;
-		s->time = t + (iolog->td ? iolog->td->unix_epoch : 0);
+		s->time = t + (iolog->td ? iolog->td->alternate_epoch : 0);
 		io_sample_set_ddir(iolog, s, ddir);
 		s->bs = bs;
 		s->priority = priority;
@@ -2742,14 +3042,36 @@ static inline void reset_io_stat(struct io_stat *ios)
 	ios->mean.u.f = ios->S.u.f = 0;
 }
 
+static inline void reset_io_u_plat(uint64_t *io_u_plat)
+{
+	int i;
+
+	for (i = 0; i < FIO_IO_U_PLAT_NR; i++)
+		io_u_plat[i] = 0;
+}
+
+static inline void reset_clat_prio_stats(struct thread_stat *ts)
+{
+	enum fio_ddir ddir;
+	int i;
+
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+		if (!ts->clat_prio[ddir])
+			continue;
+
+		for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+			reset_io_stat(&ts->clat_prio[ddir][i].clat_stat);
+			reset_io_u_plat(ts->clat_prio[ddir][i].io_u_plat);
+		}
+	}
+}
+
 void reset_io_stats(struct thread_data *td)
 {
 	struct thread_stat *ts = &td->ts;
-	int i, j, k;
+	int i, j;
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		reset_io_stat(&ts->clat_high_prio_stat[i]);
-		reset_io_stat(&ts->clat_low_prio_stat[i]);
 		reset_io_stat(&ts->clat_stat[i]);
 		reset_io_stat(&ts->slat_stat[i]);
 		reset_io_stat(&ts->lat_stat[i]);
@@ -2761,21 +3083,16 @@ void reset_io_stats(struct thread_data *td)
 		ts->total_io_u[i] = 0;
 		ts->short_io_u[i] = 0;
 		ts->drop_io_u[i] = 0;
-
-		for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
-			ts->io_u_plat_high_prio[i][j] = 0;
-			ts->io_u_plat_low_prio[i][j] = 0;
-			if (!i)
-				ts->io_u_sync_plat[j] = 0;
-		}
 	}
 
 	for (i = 0; i < FIO_LAT_CNT; i++)
 		for (j = 0; j < DDIR_RWDIR_CNT; j++)
-			for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
-				ts->io_u_plat[i][j][k] = 0;
+			reset_io_u_plat(ts->io_u_plat[i][j]);
+
+	reset_clat_prio_stats(ts);
 
 	ts->total_io_u[DDIR_SYNC] = 0;
+	reset_io_u_plat(ts->io_u_sync_plat);
 
 	for (i = 0; i < FIO_IO_U_MAP_NR; i++) {
 		ts->io_u_map[i] = 0;
@@ -2821,7 +3138,7 @@ static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
 static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed,
 			     bool log_max)
 {
-	int ddir;
+	enum fio_ddir ddir;
 
 	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
 		__add_stat_to_log(iolog, ddir, elapsed, log_max);
@@ -2926,22 +3243,21 @@ static inline void add_lat_percentile_sample(struct thread_stat *ts,
 	ts->io_u_plat[lat][ddir][idx]++;
 }
 
-static inline void add_lat_percentile_prio_sample(struct thread_stat *ts,
-						  unsigned long long nsec,
-						  enum fio_ddir ddir,
-						  bool high_prio)
+static inline void
+add_lat_percentile_prio_sample(struct thread_stat *ts, unsigned long long nsec,
+			       enum fio_ddir ddir,
+			       unsigned short clat_prio_index)
 {
 	unsigned int idx = plat_val_to_idx(nsec);
 
-	if (!high_prio)
-		ts->io_u_plat_low_prio[ddir][idx]++;
-	else
-		ts->io_u_plat_high_prio[ddir][idx]++;
+	if (ts->clat_prio[ddir])
+		ts->clat_prio[ddir][clat_prio_index].io_u_plat[idx]++;
 }
 
 void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 		     unsigned long long nsec, unsigned long long bs,
-		     uint64_t offset, unsigned int ioprio, bool high_prio)
+		     uint64_t offset, unsigned int ioprio,
+		     unsigned short clat_prio_index)
 {
 	const bool needs_lock = td_async_processing(td);
 	unsigned long elapsed, this_window;
@@ -2954,7 +3270,7 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 	add_stat_sample(&ts->clat_stat[ddir], nsec);
 
 	/*
-	 * When lat_percentiles=1 (default 0), the reported high/low priority
+	 * When lat_percentiles=1 (default 0), the reported per priority
 	 * percentiles and stats are used for describing total latency values,
 	 * even though the variable names themselves start with clat_.
 	 *
@@ -2962,12 +3278,9 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 	 * lat_percentiles=0. add_lat_sample() will add the prio stat sample
 	 * when lat_percentiles=1.
 	 */
-	if (!ts->lat_percentiles) {
-		if (high_prio)
-			add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
-		else
-			add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
-	}
+	if (!ts->lat_percentiles)
+		add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index,
+				     nsec);
 
 	if (td->clat_log)
 		add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs,
@@ -2982,7 +3295,7 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 		add_lat_percentile_sample(ts, nsec, ddir, FIO_CLAT);
 		if (!ts->lat_percentiles)
 			add_lat_percentile_prio_sample(ts, nsec, ddir,
-						       high_prio);
+						       clat_prio_index);
 	}
 
 	if (iolog && iolog->hist_msec) {
@@ -3055,7 +3368,8 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
 
 void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
 		    unsigned long long nsec, unsigned long long bs,
-		    uint64_t offset, unsigned int ioprio, bool high_prio)
+		    uint64_t offset, unsigned int ioprio,
+		    unsigned short clat_prio_index)
 {
 	const bool needs_lock = td_async_processing(td);
 	struct thread_stat *ts = &td->ts;
@@ -3073,7 +3387,7 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
 			       offset, ioprio);
 
 	/*
-	 * When lat_percentiles=1 (default 0), the reported high/low priority
+	 * When lat_percentiles=1 (default 0), the reported per priority
 	 * percentiles and stats are used for describing total latency values,
 	 * even though the variable names themselves start with clat_.
 	 *
@@ -3084,12 +3398,9 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
 	 */
 	if (ts->lat_percentiles) {
 		add_lat_percentile_sample(ts, nsec, ddir, FIO_LAT);
-		add_lat_percentile_prio_sample(ts, nsec, ddir, high_prio);
-		if (high_prio)
-			add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
-		else
-			add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
-
+		add_lat_percentile_prio_sample(ts, nsec, ddir, clat_prio_index);
+		add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index,
+				     nsec);
 	}
 	if (needs_lock)
 		__td_io_u_unlock(td);
diff --git a/stat.h b/stat.h
index 15ca4eff..dce0bb0d 100644
--- a/stat.h
+++ b/stat.h
@@ -158,6 +158,12 @@ enum fio_lat {
 	FIO_LAT_CNT = 3,
 };
 
+struct clat_prio_stat {
+	uint64_t io_u_plat[FIO_IO_U_PLAT_NR];
+	struct io_stat clat_stat;
+	uint32_t ioprio;
+};
+
 struct thread_stat {
 	char name[FIO_JOBNAME_SIZE];
 	char verror[FIO_VERROR_SIZE];
@@ -168,6 +174,7 @@ struct thread_stat {
 	char description[FIO_JOBDESC_SIZE];
 	uint32_t members;
 	uint32_t unified_rw_rep;
+	uint32_t disable_prio_stat;
 
 	/*
 	 * bandwidth and latency stats
@@ -252,21 +259,40 @@ struct thread_stat {
 	fio_fp64_t ss_deviation;
 	fio_fp64_t ss_criterion;
 
-	uint64_t io_u_plat_high_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR] __attribute__((aligned(8)));;
-	uint64_t io_u_plat_low_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
-	struct io_stat clat_high_prio_stat[DDIR_RWDIR_CNT] __attribute__((aligned(8)));
-	struct io_stat clat_low_prio_stat[DDIR_RWDIR_CNT];
+	/* A mirror of td->ioprio. */
+	uint32_t ioprio;
 
 	union {
 		uint64_t *ss_iops_data;
+		/*
+		 * For FIO_NET_CMD_TS, the pointed to data will temporarily
+		 * be stored at this offset from the start of the payload.
+		 */
+		uint64_t ss_iops_data_offset;
 		uint64_t pad4;
 	};
 
 	union {
 		uint64_t *ss_bw_data;
+		/*
+		 * For FIO_NET_CMD_TS, the pointed to data will temporarily
+		 * be stored at this offset from the start of the payload.
+		 */
+		uint64_t ss_bw_data_offset;
 		uint64_t pad5;
 	};
 
+	union {
+		struct clat_prio_stat *clat_prio[DDIR_RWDIR_CNT];
+		/*
+		 * For FIO_NET_CMD_TS, the pointed to data will temporarily
+		 * be stored at this offset from the start of the payload.
+		 */
+		uint64_t clat_prio_offset[DDIR_RWDIR_CNT];
+		uint64_t pad6;
+	};
+	uint32_t nr_clat_prio[DDIR_RWDIR_CNT];
+
 	uint64_t cachehit;
 	uint64_t cachemiss;
 } __attribute__((packed));
@@ -342,9 +368,9 @@ extern void update_rusage_stat(struct thread_data *);
 extern void clear_rusage_stat(struct thread_data *);
 
 extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-			   unsigned long long, uint64_t, unsigned int, bool);
+			   unsigned long long, uint64_t, unsigned int, unsigned short);
 extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-			    unsigned long long, uint64_t, unsigned int, bool);
+			    unsigned long long, uint64_t, unsigned int, unsigned short);
 extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
 				unsigned long long, uint64_t, unsigned int);
 extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long);
@@ -355,6 +381,8 @@ extern void add_bw_sample(struct thread_data *, struct io_u *,
 extern void add_sync_clat_sample(struct thread_stat *ts,
 				unsigned long long nsec);
 extern int calc_log_samples(void);
+extern void free_clat_prio_stats(struct thread_stat *);
+extern int alloc_clat_prio_stat_ddir(struct thread_stat *, enum fio_ddir, int);
 
 extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *);
 extern void json_array_add_disk_util(struct disk_util_stat *dus,
diff --git a/t/latency_percentiles.py b/t/latency_percentiles.py
index cc437426..9e37d9fe 100755
--- a/t/latency_percentiles.py
+++ b/t/latency_percentiles.py
@@ -80,6 +80,7 @@ import time
 import argparse
 import platform
 import subprocess
+from collections import Counter
 from pathlib import Path
 
 
@@ -125,7 +126,8 @@ class FioLatTest():
             "--output-format={output-format}".format(**self.test_options),
         ]
         for opt in ['slat_percentiles', 'clat_percentiles', 'lat_percentiles',
-                    'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs', 'cmdprio_percentage']:
+                    'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs',
+                    'cmdprio_percentage', 'bssplit', 'cmdprio_bssplit']:
             if opt in self.test_options:
                 option = '--{0}={{{0}}}'.format(opt)
                 fio_args.append(option.format(**self.test_options))
@@ -363,20 +365,19 @@ class FioLatTest():
 
     def check_nocmdprio_lat(self, job):
         """
-        Make sure no high/low priority latencies appear.
+        Make sure no per priority latencies appear.
 
         job         JSON object to check
         """
 
         for ddir in ['read', 'write', 'trim']:
             if ddir in job:
-                if 'lat_high_prio' in job[ddir] or 'lat_low_prio' in job[ddir] or \
-                    'clat_high_prio' in job[ddir] or 'clat_low_prio' in job[ddir]:
-                    print("Unexpected high/low priority latencies found in %s output" % ddir)
+                if 'prios' in job[ddir]:
+                    print("Unexpected per priority latencies found in %s output" % ddir)
                     return False
 
         if self.debug:
-            print("No high/low priority latencies found")
+            print("No per priority latencies found")
 
         return True
 
@@ -497,7 +498,7 @@ class FioLatTest():
         return retval
 
     def check_prio_latencies(self, jsondata, clat=True, plus=False):
-        """Check consistency of high/low priority latencies.
+        """Check consistency of per priority latencies.
 
         clat                True if we should check clat data; other check lat data
         plus                True if we have json+ format data where additional checks can
@@ -506,78 +507,78 @@ class FioLatTest():
         """
 
         if clat:
-            high = 'clat_high_prio'
-            low = 'clat_low_prio'
-            combined = 'clat_ns'
+            obj = combined = 'clat_ns'
         else:
-            high = 'lat_high_prio'
-            low = 'lat_low_prio'
-            combined = 'lat_ns'
+            obj = combined = 'lat_ns'
 
-        if not high in jsondata or not low in jsondata or not combined in jsondata:
-            print("Error identifying high/low priority latencies")
+        if not 'prios' in jsondata or not combined in jsondata:
+            print("Error identifying per priority latencies")
             return False
 
-        if jsondata[high]['N'] + jsondata[low]['N'] != jsondata[combined]['N']:
-            print("High %d + low %d != combined sample size %d" % \
-                    (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N']))
+        sum_sample_size = sum([x[obj]['N'] for x in jsondata['prios']])
+        if sum_sample_size != jsondata[combined]['N']:
+            print("Per prio sample size sum %d != combined sample size %d" %
+                  (sum_sample_size, jsondata[combined]['N']))
             return False
         elif self.debug:
-            print("High %d + low %d == combined sample size %d" % \
-                    (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N']))
+            print("Per prio sample size sum %d == combined sample size %d" %
+                  (sum_sample_size, jsondata[combined]['N']))
 
-        if min(jsondata[high]['min'], jsondata[low]['min']) != jsondata[combined]['min']:
-            print("Min of high %d, low %d min latencies does not match min %d from combined data" % \
-                    (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min']))
+        min_val = min([x[obj]['min'] for x in jsondata['prios']])
+        if min_val != jsondata[combined]['min']:
+            print("Min per prio min latency %d does not match min %d from combined data" %
+                  (min_val, jsondata[combined]['min']))
             return False
         elif self.debug:
-            print("Min of high %d, low %d min latencies matches min %d from combined data" % \
-                    (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min']))
+            print("Min per prio min latency %d matches min %d from combined data" %
+                  (min_val, jsondata[combined]['min']))
 
-        if max(jsondata[high]['max'], jsondata[low]['max']) != jsondata[combined]['max']:
-            print("Max of high %d, low %d max latencies does not match max %d from combined data" % \
-                    (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max']))
+        max_val = max([x[obj]['max'] for x in jsondata['prios']])
+        if max_val != jsondata[combined]['max']:
+            print("Max per prio max latency %d does not match max %d from combined data" %
+                  (max_val, jsondata[combined]['max']))
             return False
         elif self.debug:
-            print("Max of high %d, low %d max latencies matches max %d from combined data" % \
-                    (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max']))
+            print("Max per prio max latency %d matches max %d from combined data" %
+                  (max_val, jsondata[combined]['max']))
 
-        weighted_avg = (jsondata[high]['mean'] * jsondata[high]['N'] + \
-                        jsondata[low]['mean'] * jsondata[low]['N']) / jsondata[combined]['N']
+        weighted_vals = [x[obj]['mean'] * x[obj]['N'] for x in jsondata['prios']]
+        weighted_avg = sum(weighted_vals) / jsondata[combined]['N']
         delta = abs(weighted_avg - jsondata[combined]['mean'])
         if (delta / jsondata[combined]['mean']) > 0.0001:
-            print("Difference between weighted average %f of high, low means "
+            print("Difference between merged per prio weighted average %f mean "
                   "and actual mean %f exceeds 0.01%%" % (weighted_avg, jsondata[combined]['mean']))
             return False
         elif self.debug:
-            print("Weighted average %f of high, low means matches actual mean %f" % \
-                    (weighted_avg, jsondata[combined]['mean']))
+            print("Merged per prio weighted average %f mean matches actual mean %f" %
+                  (weighted_avg, jsondata[combined]['mean']))
 
         if plus:
-            if not self.check_jsonplus(jsondata[high]):
-                return False
-            if not self.check_jsonplus(jsondata[low]):
-                return False
+            for prio in jsondata['prios']:
+                if not self.check_jsonplus(prio[obj]):
+                    return False
 
-            bins = {**jsondata[high]['bins'], **jsondata[low]['bins']}
-            for duration in bins.keys():
-                if duration in jsondata[high]['bins'] and duration in jsondata[low]['bins']:
-                    bins[duration] = jsondata[high]['bins'][duration] + \
-                            jsondata[low]['bins'][duration]
+            counter = Counter()
+            for prio in jsondata['prios']:
+                counter.update(prio[obj]['bins'])
+
+            bins = dict(counter)
 
             if len(bins) != len(jsondata[combined]['bins']):
-                print("Number of combined high/low bins does not match number of overall bins")
+                print("Number of merged bins %d does not match number of overall bins %d" %
+                      (len(bins), len(jsondata[combined]['bins'])))
                 return False
             elif self.debug:
-                print("Number of bins from merged high/low data matches number of overall bins")
+                print("Number of merged bins %d matches number of overall bins %d" %
+                      (len(bins), len(jsondata[combined]['bins'])))
 
             for duration in bins.keys():
                 if bins[duration] != jsondata[combined]['bins'][duration]:
-                    print("Merged high/low count does not match overall count for duration %d" \
-                            % duration)
+                    print("Merged per prio count does not match overall count for duration %d" %
+                          duration)
                     return False
 
-        print("Merged high/low priority latency data match combined latency data")
+        print("Merged per priority latency data match combined latency data")
         return True
 
     def check(self):
@@ -602,7 +603,7 @@ class Test001(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['read'], 0, slat=False)
@@ -626,7 +627,7 @@ class Test002(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['write'], 1, slat=False, clat=False)
@@ -650,7 +651,7 @@ class Test003(FioLatTest):
             print("Unexpected write data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['trim'], 2, slat=False, tlat=False)
@@ -674,7 +675,7 @@ class Test004(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['read'], 0, plus=True)
@@ -698,7 +699,7 @@ class Test005(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['write'], 1, slat=False, plus=True)
@@ -722,7 +723,7 @@ class Test006(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True)
@@ -743,7 +744,7 @@ class Test007(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['read'], 0, clat=False, tlat=False, plus=True)
@@ -761,11 +762,11 @@ class Test008(FioLatTest):
         job = self.json_data['jobs'][0]
 
         retval = True
-        if 'read' in job or 'write'in job or 'trim' in job:
+        if 'read' in job or 'write' in job or 'trim' in job:
             print("Unexpected data direction found in fio output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['mixed'], 0, plus=True, unified=True)
@@ -792,7 +793,7 @@ class Test009(FioLatTest):
             print("Error checking fsync latency data")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['write'], 1, slat=False, plus=True)
@@ -813,7 +814,7 @@ class Test010(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['read'], 0, plus=True)
@@ -839,7 +840,7 @@ class Test011(FioLatTest):
             print("Unexpected trim data found in output")
             retval = False
         if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
             retval = False
 
         retval &= self.check_latencies(job['read'], 0, slat=False, clat=False, plus=True)
@@ -953,7 +954,7 @@ class Test019(FioLatTest):
         job = self.json_data['jobs'][0]
 
         retval = True
-        if 'read' in job or 'write'in job or 'trim' in job:
+        if 'read' in job or 'write' in job or 'trim' in job:
             print("Unexpected data direction found in fio output")
             retval = False
 
@@ -963,6 +964,27 @@ class Test019(FioLatTest):
         return retval
 
 
+class Test021(FioLatTest):
+    """Test object for Test 21."""
+
+    def check(self):
+        """Check Test 21 output."""
+
+        job = self.json_data['jobs'][0]
+
+        retval = True
+        if not self.check_empty(job['trim']):
+            print("Unexpected trim data found in output")
+            retval = False
+
+        retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True)
+        retval &= self.check_latencies(job['write'], 1, slat=False, tlat=False, plus=True)
+        retval &= self.check_prio_latencies(job['read'], clat=True, plus=True)
+        retval &= self.check_prio_latencies(job['write'], clat=True, plus=True)
+
+        return retval
+
+
 def parse_args():
     """Parse command-line arguments."""
 
@@ -1007,7 +1029,7 @@ def main():
             # randread, null
             # enable slat, clat, lat
             # only clat and lat will appear because
-            # because the null ioengine is syncrhonous
+            # because the null ioengine is synchronous
             "test_id": 1,
             "runtime": 2,
             "output-format": "json",
@@ -1047,7 +1069,7 @@ def main():
         {
             # randread, aio
             # enable slat, clat, lat
-            # all will appear because liaio is asynchronous
+            # all will appear because libaio is asynchronous
             "test_id": 4,
             "runtime": 5,
             "output-format": "json+",
@@ -1153,9 +1175,9 @@ def main():
             # randread, null
             # enable slat, clat, lat
             # only clat and lat will appear because
-            # because the null ioengine is syncrhonous
-            # same as Test 1 except
-            # numjobs = 4 to test sum_thread_stats() changes
+            # because the null ioengine is synchronous
+            # same as Test 1 except add numjobs = 4 to test
+            # sum_thread_stats() changes
             "test_id": 12,
             "runtime": 2,
             "output-format": "json",
@@ -1170,9 +1192,9 @@ def main():
         {
             # randread, aio
             # enable slat, clat, lat
-            # all will appear because liaio is asynchronous
-            # same as Test 4 except
-            # numjobs = 4 to test sum_thread_stats() changes
+            # all will appear because libaio is asynchronous
+            # same as Test 4 except add numjobs = 4 to test
+            # sum_thread_stats() changes
             "test_id": 13,
             "runtime": 5,
             "output-format": "json+",
@@ -1187,8 +1209,8 @@ def main():
         {
             # 50/50 r/w, aio, unified_rw_reporting
             # enable slat, clat, lata
-            # same as Test 8 except
-            # numjobs = 4 to test sum_thread_stats() changes
+            # same as Test 8 except add numjobs = 4 to test
+            # sum_thread_stats() changes
             "test_id": 14,
             "runtime": 5,
             "output-format": "json+",
@@ -1204,7 +1226,7 @@ def main():
         {
             # randread, aio
             # enable slat, clat, lat
-            # all will appear because liaio is asynchronous
+            # all will appear because libaio is asynchronous
             # same as Test 4 except add cmdprio_percentage
             "test_id": 15,
             "runtime": 5,
@@ -1278,8 +1300,8 @@ def main():
         {
             # 50/50 r/w, aio, unified_rw_reporting
             # enable slat, clat, lat
-            # same as Test 19 except
-            # add numjobs = 4 to test sum_thread_stats() changes
+            # same as Test 19 except add numjobs = 4 to test
+            # sum_thread_stats() changes
             "test_id": 20,
             "runtime": 5,
             "output-format": "json+",
@@ -1293,6 +1315,40 @@ def main():
             'numjobs': 4,
             "test_obj": Test019,
         },
+        {
+            # r/w, aio
+            # enable only clat
+            # test bssplit and cmdprio_bssplit
+            "test_id": 21,
+            "runtime": 5,
+            "output-format": "json+",
+            "slat_percentiles": 0,
+            "clat_percentiles": 1,
+            "lat_percentiles": 0,
+            "ioengine": aio,
+            'rw': 'randrw',
+            'bssplit': '64k/40:1024k/60',
+            'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0',
+            "test_obj": Test021,
+        },
+        {
+            # r/w, aio
+            # enable only clat
+            # same as Test 21 except add numjobs = 4 to test
+            # sum_thread_stats() changes
+            "test_id": 22,
+            "runtime": 5,
+            "output-format": "json+",
+            "slat_percentiles": 0,
+            "clat_percentiles": 1,
+            "lat_percentiles": 0,
+            "ioengine": aio,
+            'rw': 'randrw',
+            'bssplit': '64k/40:1024k/60',
+            'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0',
+            'numjobs': 4,
+            "test_obj": Test021,
+        },
     ]
 
     passed = 0
@@ -1304,9 +1360,10 @@ def main():
            (args.run_only and test['test_id'] not in args.run_only):
             skipped = skipped + 1
             outcome = 'SKIPPED (User request)'
-        elif (platform.system() != 'Linux' or os.geteuid() != 0) and 'cmdprio_percentage' in test:
+        elif (platform.system() != 'Linux' or os.geteuid() != 0) and \
+             ('cmdprio_percentage' in test or 'cmdprio_bssplit' in test):
             skipped = skipped + 1
-            outcome = 'SKIPPED (Linux root required for cmdprio_percentage tests)'
+            outcome = 'SKIPPED (Linux root required for cmdprio tests)'
         else:
             test_obj = test['test_obj'](artifact_root, test, args.debug)
             status = test_obj.run_fio(fio)
diff --git a/thread_options.h b/thread_options.h
index 8f4c8a59..4162c42f 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -50,6 +50,12 @@ struct split {
 	unsigned long long val2[ZONESPLIT_MAX];
 };
 
+struct split_prio {
+	uint64_t bs;
+	int32_t prio;
+	uint32_t perc;
+};
+
 struct bssplit {
 	uint64_t bs;
 	uint32_t perc;
@@ -166,6 +172,8 @@ struct thread_options {
 	unsigned int log_gz;
 	unsigned int log_gz_store;
 	unsigned int log_unix_epoch;
+	unsigned int log_alternate_epoch;
+	unsigned int log_alternate_epoch_clock_id;
 	unsigned int norandommap;
 	unsigned int softrandommap;
 	unsigned int bs_unaligned;
@@ -482,6 +490,8 @@ struct thread_options_pack {
 	uint32_t log_gz;
 	uint32_t log_gz_store;
 	uint32_t log_unix_epoch;
+	uint32_t log_alternate_epoch;
+	uint32_t log_alternate_epoch_clock_id;
 	uint32_t norandommap;
 	uint32_t softrandommap;
 	uint32_t bs_unaligned;
@@ -702,4 +712,8 @@ extern int str_split_parse(struct thread_data *td, char *str,
 extern int split_parse_ddir(struct thread_options *o, struct split *split,
 			    char *str, bool absolute, unsigned int max_splits);
 
+extern int split_parse_prio_ddir(struct thread_options *o,
+				 struct split_prio **entries, int *nr_entries,
+				 char *str);
+
 #endif
diff --git a/time.c b/time.c
index cd0e2a89..5c4d6de0 100644
--- a/time.c
+++ b/time.c
@@ -172,14 +172,14 @@ void set_genesis_time(void)
 	fio_gettime(&genesis, NULL);
 }
 
-void set_epoch_time(struct thread_data *td, int log_unix_epoch)
+void set_epoch_time(struct thread_data *td, int log_alternate_epoch, clockid_t clock_id)
 {
 	fio_gettime(&td->epoch, NULL);
-	if (log_unix_epoch) {
-		struct timeval tv;
-		gettimeofday(&tv, NULL);
-		td->unix_epoch = (unsigned long long)(tv.tv_sec) * 1000 +
-		                 (unsigned long long)(tv.tv_usec) / 1000;
+	if (log_alternate_epoch) {
+		struct timespec ts;
+		clock_gettime(clock_id, &ts);
+		td->alternate_epoch = (unsigned long long)(ts.tv_sec) * 1000 +
+		                 (unsigned long long)(ts.tv_nsec) / 1000000;
 	}
 }
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-29 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-29 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 2b3d4a6a924e0aa82654d3b96fb134085af7a98a:

  fio: use LDFLAGS when linking dynamic engines (2022-01-26 13:12:14 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 52a0b9ed71c3e929461e64b39059281948107071:

  Merge branch 'patch-1' of https://github.com/Nikratio/fio (2022-01-28 14:50:51 -0700)

----------------------------------------------------------------
Jens Axboe (2):
      Merge branch 'docs' of https://github.com/vincentkfu/fio
      Merge branch 'patch-1' of https://github.com/Nikratio/fio

Nikolaus Rath (1):
      I/O size: fix description of filesize

Vincent Fu (4):
      Revert "Update README to markdown format"
      docs: rename README to README.rst
      docs: update fio docs to pull from README.rst
      Makefile: build t/fio-dedupe only if zlib support is found

 HOWTO                   | 11 +++----
 Makefile                |  4 +++
 README.md => README.rst | 78 +++++++++++++++++++++++++------------------------
 doc/fio_doc.rst         |  2 +-
 doc/fio_man.rst         |  2 +-
 fio.1                   |  8 ++---
 6 files changed, 56 insertions(+), 49 deletions(-)
 rename README.md => README.rst (94%)

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO
index f9e7c857..c72ec8cd 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1886,11 +1886,12 @@ I/O size
 
 .. option:: filesize=irange(int)
 
-	Individual file sizes. May be a range, in which case fio will select sizes
-	for files at random within the given range and limited to :option:`size` in
-	total (if that is given). If not given, each created file is the same size.
-	This option overrides :option:`size` in terms of file size, which means
-	this value is used as a fixed size or possible range of each file.
+	Individual file sizes. May be a range, in which case fio will select sizes for
+	files at random within the given range. If not given, each created file is the
+	same size. This option overrides :option:`size` in terms of file size, i.e. if
+	:option:`filesize` is specified then :option:`size` becomes merely the default
+	for :option:`io_size` and has no effect at all if :option:`io_size` is set
+	explicitly.
 
 .. option:: file_append=bool
 
diff --git a/Makefile b/Makefile
index 00e79539..2432f519 100644
--- a/Makefile
+++ b/Makefile
@@ -430,7 +430,9 @@ T_TEST_PROGS += $(T_AXMAP_PROGS)
 T_TEST_PROGS += $(T_LFSR_TEST_PROGS)
 T_TEST_PROGS += $(T_GEN_RAND_PROGS)
 T_PROGS += $(T_BTRACE_FIO_PROGS)
+ifdef CONFIG_ZLIB
 T_PROGS += $(T_DEDUPE_PROGS)
+endif
 T_PROGS += $(T_VS_PROGS)
 T_TEST_PROGS += $(T_MEMLOCK_PROGS)
 ifdef CONFIG_PREAD
@@ -618,8 +620,10 @@ t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
 endif
 
+ifdef CONFIG_ZLIB
 t/fio-dedupe: $(T_DEDUPE_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
+endif
 
 t/fio-verify-state: $(T_VS_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
diff --git a/README.md b/README.rst
similarity index 94%
rename from README.md
rename to README.rst
index b10b1688..d566fae3 100644
--- a/README.md
+++ b/README.rst
@@ -1,5 +1,5 @@
-# Fio README
-## Overview and history
+Overview and history
+--------------------
 
 Fio was originally written to save me the hassle of writing special test case
 programs when I wanted to test a specific workload, either for performance
@@ -22,13 +22,14 @@ that setting is given.  The typical use of fio is to write a job file matching
 the I/O load one wants to simulate.
 
 
-## Source
+Source
+------
 
 Fio resides in a git repo, the canonical place is:
 
 	git://git.kernel.dk/fio.git
 
-When inside a corporate firewall, `git://` URL sometimes does not work.
+When inside a corporate firewall, git:// URL sometimes does not work.
 If git:// does not work, use the http protocol instead:
 
 	http://git.kernel.dk/fio.git
@@ -54,8 +55,8 @@ or
 	https://github.com/axboe/fio.git
 
 
-## Mailing list
-
+Mailing list
+------------
 
 The fio project mailing list is meant for anything related to fio including
 general discussion, bug reporting, questions, and development. For bug reporting,
@@ -80,8 +81,8 @@ and archives for the old list can be found here:
 	http://maillist.kernel.dk/fio-devel/
 
 
-## Author
-
+Author
+------
 
 Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
 the Linux I/O subsystem and schedulers. He got tired of writing specific test
@@ -91,55 +92,56 @@ benchmark/test tools out there weren't flexible enough to do what he wanted.
 Jens Axboe <axboe@kernel.dk> 20060905
 
 
-## Binary packages
+Binary packages
+---------------
 
-**Debian:**
+Debian:
 	Starting with Debian "Squeeze", fio packages are part of the official
 	Debian repository. http://packages.debian.org/search?keywords=fio .
 
-**Ubuntu:**
+Ubuntu:
 	Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
 	of the Ubuntu "universe" repository.
 	http://packages.ubuntu.com/search?keywords=fio .
 
-**Red Hat, Fedora, CentOS & Co:**
+Red Hat, Fedora, CentOS & Co:
 	Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
 	packages are part of the Fedora/EPEL repositories.
 	https://apps.fedoraproject.org/packages/fio .
 
-**Mandriva:**
+Mandriva:
 	Mandriva has integrated fio into their package repository, so installing
 	on that distro should be as easy as typing ``urpmi fio``.
 
-**Arch Linux:**
+Arch Linux:
         An Arch Linux package is provided under the Community sub-repository:
         https://www.archlinux.org/packages/?sort=&q=fio
 
-**Solaris:**
+Solaris:
 	Packages for Solaris are available from OpenCSW. Install their pkgutil
 	tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
 	``pkgutil -i fio``.
 
-**Windows:**
+Windows:
 	Rebecca Cran <rebecca@bsdio.com> has fio packages for Windows at
 	https://bsdio.com/fio/ . The latest builds for Windows can also
 	be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking
 	the latest x86 or x64 build, then selecting the ARTIFACTS tab.
 
-**BSDs:**
+BSDs:
 	Packages for BSDs may be available from their binary package repositories.
 	Look for a package "fio" using their binary package managers.
 
 
-## Building
-
+Building
+--------
 
 Just type::
-```
-./configure
-make
-make install
-```
+
+ $ ./configure
+ $ make
+ $ make install
+
 Note that GNU make is required. On BSDs it's available from devel/gmake within
 ports directory; on Solaris it's in the SUNWgmake package.  On platforms where
 GNU make isn't the default, type ``gmake`` instead of ``make``.
@@ -153,18 +155,18 @@ to be installed.  gfio isn't built automatically and can be enabled with a
 ``--enable-gfio`` option to configure.
 
 To build fio with a cross-compiler::
-```
-make clean
-make CROSS_COMPILE=/path/to/toolchain/prefix
-```
+
+ $ make clean
+ $ make CROSS_COMPILE=/path/to/toolchain/prefix
+
 Configure will attempt to determine the target platform automatically.
 
 It's possible to build fio for ESX as well, use the ``--esx`` switch to
 configure.
 
 
-## Windows
-
+Windows
+~~~~~~~
 
 The minimum versions of Windows for building/running fio are Windows 7/Windows
 Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in
@@ -172,7 +174,7 @@ order to build fio. To create an MSI installer package install WiX from
 https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
 directory.
 
-### How to compile fio on 64-bit Windows:
+How to compile fio on 64-bit Windows:
 
  1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
     packages starting with **mingw64-x86_64**. Ensure
@@ -194,21 +196,21 @@ https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-pr
 for details).
 
 
-## Documentation
-
+Documentation
+~~~~~~~~~~~~~
 
 Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
 To build HTML formatted documentation run ``make -C doc html`` and direct your
 browser to :file:`./doc/output/html/index.html`.  To build manual page run
 ``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
 output formats are supported run ``make -C doc help``.
-```
+
 .. _reStructuredText: http://www.sphinx-doc.org/rest.html
 .. _Sphinx: http://www.sphinx-doc.org
-```
 
-## Platforms
 
+Platforms
+---------
 
 Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
 Windows, FreeBSD, and DragonFly. Some features and/or options may only be
@@ -250,8 +252,8 @@ POSIX aio should work now. To make the change permanent::
         posix_aio0 changed
 
 
-## Running fio
-
+Running fio
+-----------
 
 Running fio is normally the easiest part - you just give it the job file
 (or job files) as parameters::
diff --git a/doc/fio_doc.rst b/doc/fio_doc.rst
index b5987b52..8e1216f0 100644
--- a/doc/fio_doc.rst
+++ b/doc/fio_doc.rst
@@ -2,7 +2,7 @@ fio - Flexible I/O tester rev. |version|
 ========================================
 
 
-.. include:: ../README
+.. include:: ../README.rst
 
 
 .. include:: ../HOWTO
diff --git a/doc/fio_man.rst b/doc/fio_man.rst
index c6a6438f..44312f16 100644
--- a/doc/fio_man.rst
+++ b/doc/fio_man.rst
@@ -6,7 +6,7 @@ Fio Manpage
 (rev. |release|)
 
 
-.. include:: ../README
+.. include:: ../README.rst
 
 
 .. include:: ../HOWTO
diff --git a/fio.1 b/fio.1
index 34aa874d..b87d2309 100644
--- a/fio.1
+++ b/fio.1
@@ -1686,10 +1686,10 @@ also be set as number of zones using 'z'.
 .TP
 .BI filesize \fR=\fPirange(int)
 Individual file sizes. May be a range, in which case fio will select sizes
-for files at random within the given range and limited to \fBsize\fR in
-total (if that is given). If not given, each created file is the same size.
-This option overrides \fBsize\fR in terms of file size, which means
-this value is used as a fixed size or possible range of each file.
+for files at random within the given range. If not given, each created file
+is the same size. This option overrides \fBsize\fR in terms of file size, 
+i.e. \fBsize\fR becomes merely the default for \fBio_size\fR (and
+has no effect at all if \fBio_size\fR is set explicitly).
 .TP
 .BI file_append \fR=\fPbool
 Perform I/O after the end of the file. Normally fio will operate within the

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-27 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-27 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 2f0b54419a6ab039c677e41008391b8c53ae2e6b:

  Merge branch 'master' of https://github.com/ben-ihelputech/fio (2022-01-21 10:46:26 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 2b3d4a6a924e0aa82654d3b96fb134085af7a98a:

  fio: use LDFLAGS when linking dynamic engines (2022-01-26 13:12:14 -0700)

----------------------------------------------------------------
Eric Sandeen (2):
      t/io_uring: link with libaio when necessary
      fio: use LDFLAGS when linking dynamic engines

Jens Axboe (1):
      Merge branch 'rpma-add-support-for-File-System-DAX' of https://github.com/ldorau/fio

Lukasz Dorau (1):
      rpma: RPMA engine requires librpma>=v0.10.0 with rpma_mr_advise()

Wang, Long (1):
      rpma: add support for File System DAX

 Makefile              |  3 ++-
 configure             |  9 ++++-----
 engines/librpma_fio.c | 44 +++++++++++++++++++++++++++++++++-----------
 engines/librpma_fio.h |  2 +-
 4 files changed, 40 insertions(+), 18 deletions(-)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index 5d17bcab..00e79539 100644
--- a/Makefile
+++ b/Makefile
@@ -99,6 +99,7 @@ endif
 ifdef CONFIG_LIBAIO
   libaio_SRCS = engines/libaio.c
   cmdprio_SRCS = engines/cmdprio.c
+  LIBS += -laio
   libaio_LIBS = -laio
   ENGINES += libaio
 endif
@@ -294,7 +295,7 @@ define engine_template =
 $(1)_OBJS := $$($(1)_SRCS:.c=.o)
 $$($(1)_OBJS): CFLAGS := -fPIC $$($(1)_CFLAGS) $(CFLAGS)
 engines/fio-$(1).so: $$($(1)_OBJS)
-	$$(QUIET_LINK)$(CC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
+	$$(QUIET_LINK)$(CC) $(DYNAMIC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
 ENGS_OBJS += engines/fio-$(1).so
 endef
 else # !CONFIG_DYNAMIC_ENGINES
diff --git a/configure b/configure
index 84ccce04..0efde7d6 100755
--- a/configure
+++ b/configure
@@ -955,17 +955,16 @@ print_config "rdmacm" "$rdmacm"
 
 ##########################################
 # librpma probe
+# The librpma engine requires librpma>=v0.10.0 with rpma_mr_advise().
 if test "$librpma" != "yes" ; then
   librpma="no"
 fi
 cat > $TMPC << EOF
-#include <stdio.h>
 #include <librpma.h>
-int main(int argc, char **argv)
+int main(void)
 {
-  enum rpma_conn_event event = RPMA_CONN_REJECTED;
-  (void) event; /* unused */
-  rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+  void *ptr = rpma_mr_advise;
+  (void) ptr; /* unused */
   return 0;
 }
 EOF
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c
index 3d605ed6..9d6ebf38 100644
--- a/engines/librpma_fio.c
+++ b/engines/librpma_fio.c
@@ -108,7 +108,7 @@ char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
 	return mem_ptr;
 }
 
-char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
 		size_t size, struct librpma_fio_mem *mem)
 {
 	size_t size_mmap = 0;
@@ -122,18 +122,24 @@ char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
 		return NULL;
 	}
 
-	ws_offset = (td->thread_number - 1) * size;
+	if (f->filetype == FIO_TYPE_CHAR) {
+		/* Each thread uses a separate offset within DeviceDAX. */
+		ws_offset = (td->thread_number - 1) * size;
+	} else {
+		/* Each thread uses a separate FileSystemDAX file. No offset is needed. */
+		ws_offset = 0;
+	}
 
-	if (!filename) {
+	if (!f->file_name) {
 		log_err("fio: filename is not set\n");
 		return NULL;
 	}
 
 	/* map the file */
-	mem_ptr = pmem_map_file(filename, 0 /* len */, 0 /* flags */,
+	mem_ptr = pmem_map_file(f->file_name, 0 /* len */, 0 /* flags */,
 			0 /* mode */, &size_mmap, &is_pmem);
 	if (mem_ptr == NULL) {
-		log_err("fio: pmem_map_file(%s) failed\n", filename);
+		log_err("fio: pmem_map_file(%s) failed\n", f->file_name);
 		/* pmem_map_file() sets errno on failure */
 		td_verror(td, errno, "pmem_map_file");
 		return NULL;
@@ -142,7 +148,7 @@ char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
 	/* pmem is expected */
 	if (!is_pmem) {
 		log_err("fio: %s is not located in persistent memory\n",
-			filename);
+			f->file_name);
 		goto err_unmap;
 	}
 
@@ -150,12 +156,12 @@ char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
 	if (size_mmap < ws_offset + size) {
 		log_err(
 			"fio: %s is too small to handle so many threads (%zu < %zu)\n",
-			filename, size_mmap, ws_offset + size);
+			f->file_name, size_mmap, ws_offset + size);
 		goto err_unmap;
 	}
 
 	log_info("fio: size of memory mapped from the file %s: %zu\n",
-		filename, size_mmap);
+		f->file_name, size_mmap);
 
 	mem->mem_ptr = mem_ptr;
 	mem->size_mmap = size_mmap;
@@ -893,6 +899,7 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
 	size_t mem_size = td->o.size;
 	size_t mr_desc_size;
 	void *ws_ptr;
+	bool is_dram;
 	int usage_mem_type;
 	int ret;
 
@@ -910,14 +917,14 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
 		return -1;
 	}
 
-	if (strcmp(f->file_name, "malloc") == 0) {
+	is_dram = !strcmp(f->file_name, "malloc");
+	if (is_dram) {
 		/* allocation from DRAM using posix_memalign() */
 		ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem);
 		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;
 	} else {
 		/* allocation from PMEM using pmem_map_file() */
-		ws_ptr = librpma_fio_allocate_pmem(td, f->file_name,
-				mem_size, &csd->mem);
+		ws_ptr = librpma_fio_allocate_pmem(td, f, mem_size, &csd->mem);
 		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
 	}
 
@@ -934,6 +941,21 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
 		goto err_free;
 	}
 
+	if (!is_dram && f->filetype == FIO_TYPE_FILE) {
+		ret = rpma_mr_advise(mr, 0, mem_size,
+				IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+				IBV_ADVISE_MR_FLAG_FLUSH);
+		if (ret) {
+			librpma_td_verror(td, ret, "rpma_mr_advise");
+			/* an invalid argument is an error */
+			if (ret == RPMA_E_INVAL)
+				goto err_mr_dereg;
+
+			/* log_err used instead of log_info to avoid corruption of the JSON output */
+			log_err("Note: having rpma_mr_advise(3) failed because of RPMA_E_NOSUPP or RPMA_E_PROVIDER may come with a performance penalty, but it is not a blocker for running the benchmark.\n");
+		}
+	}
+
 	/* get size of the memory region's descriptor */
 	if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) {
 		librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size");
diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h
index fb89d99d..2c507e9c 100644
--- a/engines/librpma_fio.h
+++ b/engines/librpma_fio.h
@@ -77,7 +77,7 @@ struct librpma_fio_mem {
 char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
 		struct librpma_fio_mem *mem);
 
-char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
 		size_t size, struct librpma_fio_mem *mem);
 
 void librpma_fio_free(struct librpma_fio_mem *mem);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-22 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-22 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 3a3e5c6e7606e727df1788a73d04db56d77ba00d:

  iolog.c: Fix memory leak for blkparse case (2022-01-20 11:40:42 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 2f0b54419a6ab039c677e41008391b8c53ae2e6b:

  Merge branch 'master' of https://github.com/ben-ihelputech/fio (2022-01-21 10:46:26 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'master' of https://github.com/ben-ihelputech/fio

ben-ihelputech (1):
      Update README to markdown format

 README => README.md | 78 ++++++++++++++++++++++++++---------------------------
 1 file changed, 38 insertions(+), 40 deletions(-)
 rename README => README.md (94%)

---

Diff of recent changes:

diff --git a/README b/README.md
similarity index 94%
rename from README
rename to README.md
index d566fae3..b10b1688 100644
--- a/README
+++ b/README.md
@@ -1,5 +1,5 @@
-Overview and history
---------------------
+# Fio README
+## Overview and history
 
 Fio was originally written to save me the hassle of writing special test case
 programs when I wanted to test a specific workload, either for performance
@@ -22,14 +22,13 @@ that setting is given.  The typical use of fio is to write a job file matching
 the I/O load one wants to simulate.
 
 
-Source
-------
+## Source
 
 Fio resides in a git repo, the canonical place is:
 
 	git://git.kernel.dk/fio.git
 
-When inside a corporate firewall, git:// URL sometimes does not work.
+When inside a corporate firewall, `git://` URL sometimes does not work.
 If git:// does not work, use the http protocol instead:
 
 	http://git.kernel.dk/fio.git
@@ -55,8 +54,8 @@ or
 	https://github.com/axboe/fio.git
 
 
-Mailing list
-------------
+## Mailing list
+
 
 The fio project mailing list is meant for anything related to fio including
 general discussion, bug reporting, questions, and development. For bug reporting,
@@ -81,8 +80,8 @@ and archives for the old list can be found here:
 	http://maillist.kernel.dk/fio-devel/
 
 
-Author
-------
+## Author
+
 
 Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
 the Linux I/O subsystem and schedulers. He got tired of writing specific test
@@ -92,56 +91,55 @@ benchmark/test tools out there weren't flexible enough to do what he wanted.
 Jens Axboe <axboe@kernel.dk> 20060905
 
 
-Binary packages
----------------
+## Binary packages
 
-Debian:
+**Debian:**
 	Starting with Debian "Squeeze", fio packages are part of the official
 	Debian repository. http://packages.debian.org/search?keywords=fio .
 
-Ubuntu:
+**Ubuntu:**
 	Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
 	of the Ubuntu "universe" repository.
 	http://packages.ubuntu.com/search?keywords=fio .
 
-Red Hat, Fedora, CentOS & Co:
+**Red Hat, Fedora, CentOS & Co:**
 	Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
 	packages are part of the Fedora/EPEL repositories.
 	https://apps.fedoraproject.org/packages/fio .
 
-Mandriva:
+**Mandriva:**
 	Mandriva has integrated fio into their package repository, so installing
 	on that distro should be as easy as typing ``urpmi fio``.
 
-Arch Linux:
+**Arch Linux:**
         An Arch Linux package is provided under the Community sub-repository:
         https://www.archlinux.org/packages/?sort=&q=fio
 
-Solaris:
+**Solaris:**
 	Packages for Solaris are available from OpenCSW. Install their pkgutil
 	tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
 	``pkgutil -i fio``.
 
-Windows:
+**Windows:**
 	Rebecca Cran <rebecca@bsdio.com> has fio packages for Windows at
 	https://bsdio.com/fio/ . The latest builds for Windows can also
 	be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking
 	the latest x86 or x64 build, then selecting the ARTIFACTS tab.
 
-BSDs:
+**BSDs:**
 	Packages for BSDs may be available from their binary package repositories.
 	Look for a package "fio" using their binary package managers.
 
 
-Building
---------
-
-Just type::
+## Building
 
- $ ./configure
- $ make
- $ make install
 
+Just type::
+```
+./configure
+make
+make install
+```
 Note that GNU make is required. On BSDs it's available from devel/gmake within
 ports directory; on Solaris it's in the SUNWgmake package.  On platforms where
 GNU make isn't the default, type ``gmake`` instead of ``make``.
@@ -155,18 +153,18 @@ to be installed.  gfio isn't built automatically and can be enabled with a
 ``--enable-gfio`` option to configure.
 
 To build fio with a cross-compiler::
-
- $ make clean
- $ make CROSS_COMPILE=/path/to/toolchain/prefix
-
+```
+make clean
+make CROSS_COMPILE=/path/to/toolchain/prefix
+```
 Configure will attempt to determine the target platform automatically.
 
 It's possible to build fio for ESX as well, use the ``--esx`` switch to
 configure.
 
 
-Windows
-~~~~~~~
+## Windows
+
 
 The minimum versions of Windows for building/running fio are Windows 7/Windows
 Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in
@@ -174,7 +172,7 @@ order to build fio. To create an MSI installer package install WiX from
 https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
 directory.
 
-How to compile fio on 64-bit Windows:
+### How to compile fio on 64-bit Windows:
 
  1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
     packages starting with **mingw64-x86_64**. Ensure
@@ -196,21 +194,21 @@ https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-pr
 for details).
 
 
-Documentation
-~~~~~~~~~~~~~
+## Documentation
+
 
 Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
 To build HTML formatted documentation run ``make -C doc html`` and direct your
 browser to :file:`./doc/output/html/index.html`.  To build manual page run
 ``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
 output formats are supported run ``make -C doc help``.
-
+```
 .. _reStructuredText: http://www.sphinx-doc.org/rest.html
 .. _Sphinx: http://www.sphinx-doc.org
+```
 
+## Platforms
 
-Platforms
----------
 
 Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
 Windows, FreeBSD, and DragonFly. Some features and/or options may only be
@@ -252,8 +250,8 @@ POSIX aio should work now. To make the change permanent::
         posix_aio0 changed
 
 
-Running fio
------------
+## Running fio
+
 
 Running fio is normally the easiest part - you just give it the job file
 (or job files) as parameters::

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-21 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-21 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 71efbed61dfb157dfa7fe550f500b53f9731e1cb:

  docs: documentation for sg WRITE STREAM(16) (2022-01-18 06:37:39 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 3a3e5c6e7606e727df1788a73d04db56d77ba00d:

  iolog.c: Fix memory leak for blkparse case (2022-01-20 11:40:42 -0700)

----------------------------------------------------------------
Lukas Straub (8):
      blktrace.c: Use file stream interface instead of fifo
      iolog.c: Make iolog_items_to_fetch public
      blktrace.c: Add support for read_iolog_chunked
      linux-dev-lookup.c: Put the check for replay_redirect in the beginning
      blktrace.c: Don't hardcode direct-io
      blktrace.c: Don't sleep indefinitely if there is a wrong timestamp
      blktrace.c: Make thread-safe by removing local static variables
      iolog.c: Fix memory leak for blkparse case

 blktrace.c               | 325 ++++++++++++++++++++++++-----------------------
 blktrace.h               |  14 +-
 fio.h                    |   2 +
 iolog.c                  |  18 ++-
 iolog.h                  |   1 +
 oslib/linux-dev-lookup.c |  21 ++-
 6 files changed, 203 insertions(+), 178 deletions(-)

---

Diff of recent changes:

diff --git a/blktrace.c b/blktrace.c
index 64a610a9..e1804765 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -4,71 +4,34 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <errno.h>
 
 #include "flist.h"
 #include "fio.h"
+#include "iolog.h"
 #include "blktrace.h"
 #include "blktrace_api.h"
 #include "oslib/linux-dev-lookup.h"
 
-#define TRACE_FIFO_SIZE	8192
-
-/*
- * fifo refill frontend, to avoid reading data in trace sized bites
- */
-static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
-{
-	char buf[TRACE_FIFO_SIZE];
-	unsigned int total;
-	int ret;
-
-	total = sizeof(buf);
-	if (total > fifo_room(fifo))
-		total = fifo_room(fifo);
-
-	ret = read(fd, buf, total);
-	if (ret < 0) {
-		int read_err = errno;
-
-		assert(read_err > 0);
-		td_verror(td, read_err, "read blktrace file");
-		return -read_err;
-	}
-
-	if (ret > 0)
-		ret = fifo_put(fifo, buf, ret);
-
-	dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
-	return ret;
-}
-
-/*
- * Retrieve 'len' bytes from the fifo, refilling if necessary.
- */
-static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
-			  void *buf, unsigned int len)
-{
-	if (fifo_len(fifo) < len) {
-		int ret = refill_fifo(td, fifo, fd);
-
-		if (ret < 0)
-			return ret;
-	}
-
-	return fifo_get(fifo, buf, len);
-}
+struct file_cache {
+	unsigned int maj;
+	unsigned int min;
+	unsigned int fileno;
+};
 
 /*
  * Just discard the pdu by seeking past it.
  */
-static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
-		       struct blk_io_trace *t)
+static int discard_pdu(FILE* f, struct blk_io_trace *t)
 {
 	if (t->pdu_len == 0)
 		return 0;
 
 	dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
-	return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
+	if (fseek(f, t->pdu_len, SEEK_CUR) < 0)
+		return -errno;
+
+	return t->pdu_len;
 }
 
 /*
@@ -130,28 +93,28 @@ static void trace_add_open_close_event(struct thread_data *td, int fileno, enum
 	flist_add_tail(&ipo->list, &td->io_log_list);
 }
 
-static int trace_add_file(struct thread_data *td, __u32 device)
+static int trace_add_file(struct thread_data *td, __u32 device,
+			  struct file_cache *cache)
 {
-	static unsigned int last_maj, last_min, last_fileno;
 	unsigned int maj = FMAJOR(device);
 	unsigned int min = FMINOR(device);
 	struct fio_file *f;
 	char dev[256];
 	unsigned int i;
 
-	if (last_maj == maj && last_min == min)
-		return last_fileno;
+	if (cache->maj == maj && cache->min == min)
+		return cache->fileno;
 
-	last_maj = maj;
-	last_min = min;
+	cache->maj = maj;
+	cache->min = min;
 
 	/*
 	 * check for this file in our list
 	 */
 	for_each_file(td, f, i)
 		if (f->major == maj && f->minor == min) {
-			last_fileno = f->fileno;
-			return last_fileno;
+			cache->fileno = f->fileno;
+			return cache->fileno;
 		}
 
 	strcpy(dev, "/dev");
@@ -171,10 +134,10 @@ static int trace_add_file(struct thread_data *td, __u32 device)
 		td->files[fileno]->major = maj;
 		td->files[fileno]->minor = min;
 		trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
-		last_fileno = fileno;
+		cache->fileno = fileno;
 	}
 
-	return last_fileno;
+	return cache->fileno;
 }
 
 static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
@@ -215,7 +178,7 @@ static void store_ipo(struct thread_data *td, unsigned long long offset,
 	queue_io_piece(td, ipo);
 }
 
-static void handle_trace_notify(struct blk_io_trace *t)
+static bool handle_trace_notify(struct blk_io_trace *t)
 {
 	switch (t->action) {
 	case BLK_TN_PROCESS:
@@ -232,22 +195,24 @@ static void handle_trace_notify(struct blk_io_trace *t)
 		dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
 		break;
 	}
+	return false;
 }
 
-static void handle_trace_discard(struct thread_data *td,
+static bool handle_trace_discard(struct thread_data *td,
 				 struct blk_io_trace *t,
 				 unsigned long long ttime,
-				 unsigned long *ios, unsigned int *bs)
+				 unsigned long *ios, unsigned long long *bs,
+				 struct file_cache *cache)
 {
 	struct io_piece *ipo;
 	int fileno;
 
 	if (td->o.replay_skip & (1u << DDIR_TRIM))
-		return;
+		return false;
 
 	ipo = calloc(1, sizeof(*ipo));
 	init_ipo(ipo);
-	fileno = trace_add_file(td, t->device);
+	fileno = trace_add_file(td, t->device, cache);
 
 	ios[DDIR_TRIM]++;
 	if (t->bytes > bs[DDIR_TRIM])
@@ -270,6 +235,7 @@ static void handle_trace_discard(struct thread_data *td,
 							ipo->offset, ipo->len,
 							ipo->delay);
 	queue_io_piece(td, ipo);
+	return true;
 }
 
 static void dump_trace(struct blk_io_trace *t)
@@ -277,29 +243,29 @@ static void dump_trace(struct blk_io_trace *t)
 	log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action);
 }
 
-static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
+static bool handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
 			    unsigned long long ttime, unsigned long *ios,
-			    unsigned int *bs)
+			    unsigned long long *bs, struct file_cache *cache)
 {
 	int rw;
 	int fileno;
 
-	fileno = trace_add_file(td, t->device);
+	fileno = trace_add_file(td, t->device, cache);
 
 	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
 
 	if (rw) {
 		if (td->o.replay_skip & (1u << DDIR_WRITE))
-			return;
+			return false;
 	} else {
 		if (td->o.replay_skip & (1u << DDIR_READ))
-			return;
+			return false;
 	}
 
 	if (!t->bytes) {
 		if (!fio_did_warn(FIO_WARN_BTRACE_ZERO))
 			dump_trace(t);
-		return;
+		return false;
 	}
 
 	if (t->bytes > bs[rw])
@@ -308,20 +274,22 @@ static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
 	ios[rw]++;
 	td->o.size += t->bytes;
 	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
+	return true;
 }
 
-static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
-			       unsigned long long ttime, unsigned long *ios)
+static bool handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
+			       unsigned long long ttime, unsigned long *ios,
+			       struct file_cache *cache)
 {
 	struct io_piece *ipo;
 	int fileno;
 
 	if (td->o.replay_skip & (1u << DDIR_SYNC))
-		return;
+		return false;
 
 	ipo = calloc(1, sizeof(*ipo));
 	init_ipo(ipo);
-	fileno = trace_add_file(td, t->device);
+	fileno = trace_add_file(td, t->device, cache);
 
 	ipo->delay = ttime / 1000;
 	ipo->ddir = DDIR_SYNC;
@@ -330,47 +298,49 @@ static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
 	ios[DDIR_SYNC]++;
 	dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
 	queue_io_piece(td, ipo);
+	return true;
 }
 
 /*
  * We only care for queue traces, most of the others are side effects
  * due to internal workings of the block layer.
  */
-static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
-			 unsigned long *ios, unsigned int *bs)
+static bool queue_trace(struct thread_data *td, struct blk_io_trace *t,
+			 unsigned long *ios, unsigned long long *bs,
+			 struct file_cache *cache)
 {
-	static unsigned long long last_ttime;
+	unsigned long long *last_ttime = &td->io_log_blktrace_last_ttime;
 	unsigned long long delay = 0;
 
 	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
-		return;
+		return false;
 
 	if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
-		if (!last_ttime || td->o.no_stall)
+		if (!*last_ttime || td->o.no_stall || t->time < *last_ttime)
 			delay = 0;
 		else if (td->o.replay_time_scale == 100)
-			delay = t->time - last_ttime;
+			delay = t->time - *last_ttime;
 		else {
-			double tmp = t->time - last_ttime;
+			double tmp = t->time - *last_ttime;
 			double scale;
 
 			scale = (double) 100.0 / (double) td->o.replay_time_scale;
 			tmp *= scale;
 			delay = tmp;
 		}
-		last_ttime = t->time;
+		*last_ttime = t->time;
 	}
 
 	t_bytes_align(&td->o, t);
 
 	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
-		handle_trace_notify(t);
+		return handle_trace_notify(t);
 	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
-		handle_trace_discard(td, t, delay, ios, bs);
+		return handle_trace_discard(td, t, delay, ios, bs, cache);
 	else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH))
-		handle_trace_flush(td, t, delay, ios);
+		return handle_trace_flush(td, t, delay, ios, cache);
 	else
-		handle_trace_fs(td, t, delay, ios, bs);
+		return handle_trace_fs(td, t, delay, ios, bs, cache);
 }
 
 static void byteswap_trace(struct blk_io_trace *t)
@@ -438,43 +408,79 @@ static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth)
  * Load a blktrace file by reading all the blk_io_trace entries, and storing
  * them as io_pieces like the fio text version would do.
  */
-bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
+bool init_blktrace_read(struct thread_data *td, const char *filename, int need_swap)
+{
+	int old_state;
+
+	td->io_log_rfile = fopen(filename, "rb");
+	if (!td->io_log_rfile) {
+		td_verror(td, errno, "open blktrace file");
+		goto err;
+	}
+	td->io_log_blktrace_swap = need_swap;
+	td->io_log_blktrace_last_ttime = 0;
+	td->o.size = 0;
+
+	free_release_files(td);
+
+	old_state = td_bump_runstate(td, TD_SETTING_UP);
+
+	if (!read_blktrace(td)) {
+		goto err;
+	}
+
+	td_restore_runstate(td, old_state);
+
+	if (!td->files_index) {
+		log_err("fio: did not find replay device(s)\n");
+		return false;
+	}
+
+	return true;
+
+err:
+	if (td->io_log_rfile) {
+		fclose(td->io_log_rfile);
+		td->io_log_rfile = NULL;
+	}
+	return false;
+}
+
+bool read_blktrace(struct thread_data* td)
 {
 	struct blk_io_trace t;
+	struct file_cache cache = { };
 	unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
-	unsigned int rw_bs[DDIR_RWDIR_CNT] = { };
+	unsigned long long rw_bs[DDIR_RWDIR_CNT] = { };
 	unsigned long skipped_writes;
-	struct fifo *fifo;
-	int fd, i, old_state, max_depth;
-	struct fio_file *f;
+	FILE *f = td->io_log_rfile;
+	int i, max_depth;
+	struct fio_file *fiof;
 	int this_depth[DDIR_RWDIR_CNT] = { };
 	int depth[DDIR_RWDIR_CNT] = { };
+	int64_t items_to_fetch = 0;
 
-	fd = open(filename, O_RDONLY);
-	if (fd < 0) {
-		td_verror(td, errno, "open blktrace file");
-		return false;
+	if (td->o.read_iolog_chunked) {
+		items_to_fetch = iolog_items_to_fetch(td);
+		if (!items_to_fetch)
+			return true;
 	}
 
-	fifo = fifo_alloc(TRACE_FIFO_SIZE);
-
-	old_state = td_bump_runstate(td, TD_SETTING_UP);
-
-	td->o.size = 0;
 	skipped_writes = 0;
 	do {
-		int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));
+		int ret = fread(&t, 1, sizeof(t), f);
 
-		if (ret < 0)
+		if (ferror(f)) {
+			td_verror(td, errno, "read blktrace file");
 			goto err;
-		else if (!ret)
+		} else if (feof(f)) {
 			break;
-		else if (ret < (int) sizeof(t)) {
-			log_err("fio: short fifo get\n");
+		} else if (ret < (int) sizeof(t)) {
+			log_err("fio: iolog short read\n");
 			break;
 		}
 
-		if (need_swap)
+		if (td->io_log_blktrace_swap)
 			byteswap_trace(&t);
 
 		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
@@ -487,13 +493,10 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
 								t.magic & 0xff);
 			goto err;
 		}
-		ret = discard_pdu(td, fifo, fd, &t);
+		ret = discard_pdu(f, &t);
 		if (ret < 0) {
 			td_verror(td, -ret, "blktrace lseek");
 			goto err;
-		} else if (t.pdu_len != ret) {
-			log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
-			goto err;
 		}
 		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
 			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
@@ -510,22 +513,53 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
 			}
 		}
 
-		handle_trace(td, &t, ios, rw_bs);
-	} while (1);
+		if (!queue_trace(td, &t, ios, rw_bs, &cache))
+			continue;
 
-	for_each_file(td, f, i)
-		trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
+		if (td->o.read_iolog_chunked) {
+			td->io_log_current++;
+			items_to_fetch--;
+			if (items_to_fetch == 0)
+				break;
+		}
+	} while (1);
 
-	fifo_free(fifo);
-	close(fd);
+	if (td->o.read_iolog_chunked) {
+		td->io_log_highmark = td->io_log_current;
+		td->io_log_checkmark = (td->io_log_highmark + 1) / 2;
+		fio_gettime(&td->io_log_highmark_time, NULL);
+	}
 
-	td_restore_runstate(td, old_state);
+	if (skipped_writes)
+		log_err("fio: %s skips replay of %lu writes due to read-only\n",
+						td->o.name, skipped_writes);
 
-	if (!td->files_index) {
-		log_err("fio: did not find replay device(s)\n");
-		return false;
+	if (td->o.read_iolog_chunked) {
+		if (td->io_log_current == 0) {
+			return false;
+		}
+		td->o.td_ddir = TD_DDIR_RW;
+		if ((rw_bs[DDIR_READ] > td->o.max_bs[DDIR_READ] ||
+		     rw_bs[DDIR_WRITE] > td->o.max_bs[DDIR_WRITE] ||
+		     rw_bs[DDIR_TRIM] > td->o.max_bs[DDIR_TRIM]) &&
+		    td->orig_buffer)
+		{
+			td->o.max_bs[DDIR_READ] = max(td->o.max_bs[DDIR_READ], rw_bs[DDIR_READ]);
+			td->o.max_bs[DDIR_WRITE] = max(td->o.max_bs[DDIR_WRITE], rw_bs[DDIR_WRITE]);
+			td->o.max_bs[DDIR_TRIM] = max(td->o.max_bs[DDIR_TRIM], rw_bs[DDIR_TRIM]);
+			io_u_quiesce(td);
+			free_io_mem(td);
+			init_io_u_buffers(td);
+		}
+		return true;
 	}
 
+	for_each_file(td, fiof, i)
+		trace_add_open_close_event(td, fiof->fileno, FIO_LOG_CLOSE_FILE);
+
+	fclose(td->io_log_rfile);
+	td->io_log_rfile = NULL;
+
 	/*
 	 * For stacked devices, we don't always get a COMPLETE event so
 	 * the depth grows to insane values. Limit it to something sane(r).
@@ -539,10 +573,6 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
 		max_depth = max(depth[i], max_depth);
 	}
 
-	if (skipped_writes)
-		log_err("fio: %s skips replay of %lu writes due to read-only\n",
-						td->o.name, skipped_writes);
-
 	if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] &&
 	    !ios[DDIR_SYNC]) {
 		log_err("fio: found no ios in blktrace data\n");
@@ -563,14 +593,6 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
 		td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
 	}
 
-	/*
-	 * We need to do direct/raw ios to the device, to avoid getting
-	 * read-ahead in our way. But only do so if the minimum block size
-	 * is a multiple of 4k, otherwise we don't know if it's safe to do so.
-	 */
-	if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
-		td->o.odirect = 1;
-
 	/*
 	 * If depth wasn't manually set, use probed depth
 	 */
@@ -579,8 +601,7 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
 
 	return true;
 err:
-	close(fd);
-	fifo_free(fifo);
+	fclose(f);
 	return false;
 }
 
@@ -625,15 +646,14 @@ static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs)
 {
 	bcs[i].iter++;
 	if (bcs[i].iter < bcs[i].nr_iter) {
-		lseek(bcs[i].fd, 0, SEEK_SET);
+		fseek(bcs[i].f, 0, SEEK_SET);
 		return;
 	}
 
 	*nr_logs -= 1;
 
 	/* close file */
-	fifo_free(bcs[i].fifo);
-	close(bcs[i].fd);
+	fclose(bcs[i].f);
 
 	/* keep active files contiguous */
 	memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
@@ -646,15 +666,16 @@ static int read_trace(struct thread_data *td, struct blktrace_cursor *bc)
 
 read_skip:
 	/* read an io trace */
-	ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t));
-	if (ret < 0) {
+	ret = fread(&t, 1, sizeof(t), bc->f);
+	if (ferror(bc->f)) {
+		td_verror(td, errno, "read blktrace file");
 		return ret;
-	} else if (!ret) {
+	} else if (feof(bc->f)) {
 		if (!bc->length)
 			bc->length = bc->t.time;
 		return ret;
 	} else if (ret < (int) sizeof(*t)) {
-		log_err("fio: short fifo get\n");
+		log_err("fio: iolog short read\n");
 		return -1;
 	}
 
@@ -664,14 +685,10 @@ read_skip:
 	/* skip over actions that fio does not care about */
 	if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
 	    t_get_ddir(t) == DDIR_INVAL) {
-		ret = discard_pdu(td, bc->fifo, bc->fd, t);
+		ret = discard_pdu(bc->f, t);
 		if (ret < 0) {
 			td_verror(td, -ret, "blktrace lseek");
 			return ret;
-		} else if (t->pdu_len != ret) {
-			log_err("fio: discarded %d of %d\n", ret,
-				t->pdu_len);
-			return -1;
 		}
 		goto read_skip;
 	}
@@ -729,14 +746,13 @@ int merge_blktrace_iologs(struct thread_data *td)
 	str = ptr = strdup(td->o.read_iolog_file);
 	nr_logs = 0;
 	for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
-		bcs[i].fd = open(name, O_RDONLY);
-		if (bcs[i].fd < 0) {
+		bcs[i].f = fopen(name, "rb");
+		if (!bcs[i].f) {
 			log_err("fio: could not open file: %s\n", name);
-			ret = bcs[i].fd;
+			ret = -errno;
 			free(str);
 			goto err_file;
 		}
-		bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE);
 		nr_logs++;
 
 		if (!is_blktrace(name, &bcs[i].swap)) {
@@ -761,14 +777,10 @@ int merge_blktrace_iologs(struct thread_data *td)
 		i = find_earliest_io(bcs, nr_logs);
 		bc = &bcs[i];
 		/* skip over the pdu */
-		ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t);
+		ret = discard_pdu(bc->f, &bc->t);
 		if (ret < 0) {
 			td_verror(td, -ret, "blktrace lseek");
 			goto err_file;
-		} else if (bc->t.pdu_len != ret) {
-			log_err("fio: discarded %d of %d\n", ret,
-				bc->t.pdu_len);
-			goto err_file;
 		}
 
 		ret = write_trace(merge_fp, &bc->t);
@@ -786,8 +798,7 @@ int merge_blktrace_iologs(struct thread_data *td)
 err_file:
 	/* cleanup */
 	for (i = 0; i < nr_logs; i++) {
-		fifo_free(bcs[i].fifo);
-		close(bcs[i].fd);
+		fclose(bcs[i].f);
 	}
 err_merge_buf:
 	free(merge_buf);
diff --git a/blktrace.h b/blktrace.h
index a0e82faa..c53b717b 100644
--- a/blktrace.h
+++ b/blktrace.h
@@ -10,7 +10,7 @@
 
 struct blktrace_cursor {
 	struct fifo		*fifo;	// fifo queue for reading
-	int			fd;	// blktrace file
+	FILE			*f;	// blktrace file
 	__u64			length; // length of trace
 	struct blk_io_trace	t;	// current io trace
 	int			swap;	// bitwise reverse required
@@ -20,7 +20,9 @@ struct blktrace_cursor {
 };
 
 bool is_blktrace(const char *, int *);
-bool load_blktrace(struct thread_data *, const char *, int);
+bool init_blktrace_read(struct thread_data *, const char *, int);
+bool read_blktrace(struct thread_data* td);
+
 int merge_blktrace_iologs(struct thread_data *td);
 
 #else
@@ -30,12 +32,18 @@ static inline bool is_blktrace(const char *fname, int *need_swap)
 	return false;
 }
 
-static inline bool load_blktrace(struct thread_data *td, const char *fname,
+static inline bool init_blktrace_read(struct thread_data *td, const char *fname,
 				 int need_swap)
 {
 	return false;
 }
 
+static inline bool read_blktrace(struct thread_data* td)
+{
+	return false;
+}
+
+
 static inline int merge_blktrace_iologs(struct thread_data *td)
 {
 	return false;
diff --git a/fio.h b/fio.h
index 6bb21ebb..1ea3d064 100644
--- a/fio.h
+++ b/fio.h
@@ -428,6 +428,8 @@ struct thread_data {
 	struct flist_head io_log_list;
 	FILE *io_log_rfile;
 	unsigned int io_log_blktrace;
+	unsigned int io_log_blktrace_swap;
+	unsigned long long io_log_blktrace_last_ttime;
 	unsigned int io_log_current;
 	unsigned int io_log_checkmark;
 	unsigned int io_log_highmark;
diff --git a/iolog.c b/iolog.c
index 1aeb7a76..a2cf0c1c 100644
--- a/iolog.c
+++ b/iolog.c
@@ -152,10 +152,15 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
 	while (!flist_empty(&td->io_log_list)) {
 		int ret;
 
-		if (!td->io_log_blktrace && td->o.read_iolog_chunked) {
+		if (td->o.read_iolog_chunked) {
 			if (td->io_log_checkmark == td->io_log_current) {
-				if (!read_iolog2(td))
-					return 1;
+				if (td->io_log_blktrace) {
+					if (!read_blktrace(td))
+						return 1;
+				} else {
+					if (!read_iolog2(td))
+						return 1;
+				}
 			}
 			td->io_log_current--;
 		}
@@ -355,7 +360,7 @@ void write_iolog_close(struct thread_data *td)
 	td->iolog_buf = NULL;
 }
 
-static int64_t iolog_items_to_fetch(struct thread_data *td)
+int64_t iolog_items_to_fetch(struct thread_data *td)
 {
 	struct timespec now;
 	uint64_t elapsed;
@@ -626,8 +631,6 @@ static bool init_iolog_read(struct thread_data *td, char *fname)
 	} else
 		f = fopen(fname, "r");
 
-	free(fname);
-
 	if (!f) {
 		perror("fopen read iolog");
 		return false;
@@ -709,11 +712,12 @@ bool init_iolog(struct thread_data *td)
 		 */
 		if (is_blktrace(fname, &need_swap)) {
 			td->io_log_blktrace = 1;
-			ret = load_blktrace(td, fname, need_swap);
+			ret = init_blktrace_read(td, fname, need_swap);
 		} else {
 			td->io_log_blktrace = 0;
 			ret = init_iolog_read(td, fname);
 		}
+		free(fname);
 	} else if (td->o.write_iolog_file)
 		ret = init_iolog_write(td);
 	else
diff --git a/iolog.h b/iolog.h
index 7d66b7c4..a3986309 100644
--- a/iolog.h
+++ b/iolog.h
@@ -254,6 +254,7 @@ extern void trim_io_piece(const struct io_u *);
 extern void queue_io_piece(struct thread_data *, struct io_piece *);
 extern void prune_io_piece_log(struct thread_data *);
 extern void write_iolog_close(struct thread_data *);
+int64_t iolog_items_to_fetch(struct thread_data *td);
 extern int iolog_compress_init(struct thread_data *, struct sk_out *);
 extern void iolog_compress_exit(struct thread_data *);
 extern size_t log_chunk_sizes(struct io_log *);
diff --git a/oslib/linux-dev-lookup.c b/oslib/linux-dev-lookup.c
index 1dda93f2..4335faf9 100644
--- a/oslib/linux-dev-lookup.c
+++ b/oslib/linux-dev-lookup.c
@@ -16,6 +16,16 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
 	int found = 0;
 	DIR *D;
 
+	/*
+	 * If replay_redirect is set then always return this device
+	 * upon lookup which overrides the device lookup based on
+	 * major minor in the actual blktrace
+	 */
+	if (redirect) {
+		strcpy(path, redirect);
+		return 1;
+	}
+
 	D = opendir(path);
 	if (!D)
 		return 0;
@@ -44,17 +54,6 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
 		if (!S_ISBLK(st.st_mode))
 			continue;
 
-		/*
-		 * If replay_redirect is set then always return this device
-		 * upon lookup which overrides the device lookup based on
-		 * major minor in the actual blktrace
-		 */
-		if (redirect) {
-			strcpy(path, redirect);
-			found = 1;
-			break;
-		}
-
 		if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
 			strcpy(path, full_path);
 			found = 1;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-19 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-19 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit ef37053efdfb8c3b8b6deef43c0969753e6adb44:

  init: do not create lat logs when not needed (2022-01-17 07:21:58 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 71efbed61dfb157dfa7fe550f500b53f9731e1cb:

  docs: documentation for sg WRITE STREAM(16) (2022-01-18 06:37:39 -0700)

----------------------------------------------------------------
Niklas Cassel (2):
      stat: remove duplicated code in show_mixed_ddir_status()
      stat: move unified=both mixed allocation and calculation to new helper

Vincent Fu (6):
      sg: add support for VERIFY command using write modes
      sg: add support for WRITE SAME(16) commands with NDOB flag set
      sg: improve sg_write_mode option names
      sg: add support for WRITE STREAM(16) commands
      sg: allow fio to open and close streams for WRITE STREAM(16) commands
      docs: documentation for sg WRITE STREAM(16)

 HOWTO                           |  36 +++++-
 engines/sg.c                    | 181 +++++++++++++++++++++++++++++--
 examples/sg_verify-fail.fio     |  48 ++++++++
 examples/sg_verify.fio          |  57 ++++++++++
 examples/sg_write_same_ndob.fio |  44 ++++++++
 fio.1                           |  47 +++++++-
 stat.c                          | 235 ++++++++--------------------------------
 7 files changed, 441 insertions(+), 207 deletions(-)
 create mode 100644 examples/sg_verify-fail.fio
 create mode 100644 examples/sg_verify.fio
 create mode 100644 examples/sg_write_same_ndob.fio

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO
index 2956e50d..f9e7c857 100644
--- a/HOWTO
+++ b/HOWTO
@@ -2496,11 +2496,13 @@ with the caveat that when used on the command line, they must come after the
 
 	**write**
 		This is the default where write opcodes are issued as usual.
-	**verify**
+	**write_and_verify**
 		Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
 		directs the device to carry out a medium verification with no data
 		comparison. The writefua option is ignored with this selection.
-	**same**
+	**verify**
+		This option is deprecated. Use write_and_verify instead.
+	**write_same**
 		Issue WRITE SAME commands. This transfers a single block to the device
 		and writes this same block of data to a contiguous sequence of LBAs
 		beginning at the specified offset. fio's block size parameter specifies
@@ -2511,6 +2513,36 @@ with the caveat that when used on the command line, they must come after the
 		for each command but only the first 512 bytes will be used and
 		transferred to the device. The writefua option is ignored with this
 		selection.
+	**same**
+		This option is deprecated. Use write_same instead.
+	**write_same_ndob**
+		Issue WRITE SAME(16) commands as above but with the No Data Output
+		Buffer (NDOB) bit set. No data will be transferred to the device with
+		this bit set. Data written will be a pre-determined pattern such as
+		all zeroes.
+	**write_stream**
+		Issue WRITE STREAM(16) commands. Use the **stream_id** option to specify
+		the stream identifier.
+	**verify_bytchk_00**
+		Issue VERIFY commands with BYTCHK set to 00. This directs the
+		device to carry out a medium verification with no data comparison.
+	**verify_bytchk_01**
+		Issue VERIFY commands with BYTCHK set to 01. This directs the device to
+		compare the data on the device with the data transferred to the device.
+	**verify_bytchk_11**
+		Issue VERIFY commands with BYTCHK set to 11. This transfers a
+		single block to the device and compares the contents of this block with the
+		data on the device beginning at the specified offset. fio's block size
+		parameter specifies the total amount of data compared with this command.
+		However, only one block (sector) worth of data is transferred to the device.
+		This is similar to the WRITE SAME command except that data is compared instead
+		of written.
+
+.. option:: stream_id=int : [sg]
+
+	Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not
+	a valid stream identifier) fio will open a stream and then close it when done. Default
+	is 0.
 
 .. option:: hipri : [sg]
 
diff --git a/engines/sg.c b/engines/sg.c
index 1c019384..72ee07ba 100644
--- a/engines/sg.c
+++ b/engines/sg.c
@@ -66,8 +66,13 @@
 
 enum {
 	FIO_SG_WRITE		= 1,
-	FIO_SG_WRITE_VERIFY	= 2,
-	FIO_SG_WRITE_SAME	= 3
+	FIO_SG_WRITE_VERIFY,
+	FIO_SG_WRITE_SAME,
+	FIO_SG_WRITE_SAME_NDOB,
+	FIO_SG_WRITE_STREAM,
+	FIO_SG_VERIFY_BYTCHK_00,
+	FIO_SG_VERIFY_BYTCHK_01,
+	FIO_SG_VERIFY_BYTCHK_11,
 };
 
 struct sg_options {
@@ -76,6 +81,7 @@ struct sg_options {
 	unsigned int readfua;
 	unsigned int writefua;
 	unsigned int write_mode;
+	uint16_t stream_id;
 };
 
 static struct fio_option options[] = {
@@ -120,18 +126,58 @@ static struct fio_option options[] = {
 			    .oval = FIO_SG_WRITE,
 			    .help = "Issue standard SCSI WRITE commands",
 			  },
-			  { .ival = "verify",
+			  { .ival = "write_and_verify",
 			    .oval = FIO_SG_WRITE_VERIFY,
 			    .help = "Issue SCSI WRITE AND VERIFY commands",
 			  },
-			  { .ival = "same",
+			  { .ival = "verify",
+			    .oval = FIO_SG_WRITE_VERIFY,
+			    .help = "Issue SCSI WRITE AND VERIFY commands. This "
+				    "option is deprecated. Use write_and_verify instead.",
+			  },
+			  { .ival = "write_same",
 			    .oval = FIO_SG_WRITE_SAME,
 			    .help = "Issue SCSI WRITE SAME commands",
 			  },
+			  { .ival = "same",
+			    .oval = FIO_SG_WRITE_SAME,
+			    .help = "Issue SCSI WRITE SAME commands. This "
+				    "option is deprecated. Use write_same instead.",
+			  },
+			  { .ival = "write_same_ndob",
+			    .oval = FIO_SG_WRITE_SAME_NDOB,
+			    .help = "Issue SCSI WRITE SAME(16) commands with NDOB flag set",
+			  },
+			  { .ival = "verify_bytchk_00",
+			    .oval = FIO_SG_VERIFY_BYTCHK_00,
+			    .help = "Issue SCSI VERIFY commands with BYTCHK set to 00",
+			  },
+			  { .ival = "verify_bytchk_01",
+			    .oval = FIO_SG_VERIFY_BYTCHK_01,
+			    .help = "Issue SCSI VERIFY commands with BYTCHK set to 01",
+			  },
+			  { .ival = "verify_bytchk_11",
+			    .oval = FIO_SG_VERIFY_BYTCHK_11,
+			    .help = "Issue SCSI VERIFY commands with BYTCHK set to 11",
+			  },
+			  { .ival = "write_stream",
+			    .oval = FIO_SG_WRITE_STREAM,
+			    .help = "Issue SCSI WRITE STREAM(16) commands",
+			  },
 		},
 		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_SG,
 	},
+	{
+		.name	= "stream_id",
+		.lname	= "stream id for WRITE STREAM(16) commands",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct sg_options, stream_id),
+		.help	= "Stream ID for WRITE STREAM(16) commands",
+		.def	= "0",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_SG,
+	},
 	{
 		.name	= NULL,
 	},
@@ -171,6 +217,11 @@ struct sgio_data {
 #endif
 };
 
+static inline uint16_t sgio_get_be16(uint8_t *buf)
+{
+	return be16_to_cpu(*((uint16_t *) buf));
+}
+
 static inline uint32_t sgio_get_be32(uint8_t *buf)
 {
 	return be32_to_cpu(*((uint32_t *) buf));
@@ -502,9 +553,9 @@ static enum fio_q_status fio_sgio_doio(struct thread_data *td,
 }
 
 static void fio_sgio_rw_lba(struct sg_io_hdr *hdr, unsigned long long lba,
-			    unsigned long long nr_blocks)
+			    unsigned long long nr_blocks, bool override16)
 {
-	if (lba < MAX_10B_LBA) {
+	if (lba < MAX_10B_LBA && !override16) {
 		sgio_set_be32((uint32_t) lba, &hdr->cmdp[2]);
 		sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[7]);
 	} else {
@@ -545,7 +596,7 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
 		if (o->readfua)
 			hdr->cmdp[1] |= 0x08;
 
-		fio_sgio_rw_lba(hdr, lba, nr_blocks);
+		fio_sgio_rw_lba(hdr, lba, nr_blocks, false);
 
 	} else if (io_u->ddir == DDIR_WRITE) {
 		sgio_hdr_init(sd, hdr, io_u, 1);
@@ -576,9 +627,46 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
 			else
 				hdr->cmdp[0] = 0x93; // write same(16)
 			break;
+		case FIO_SG_WRITE_SAME_NDOB:
+			hdr->cmdp[0] = 0x93; // write same(16)
+			hdr->cmdp[1] |= 0x1; // no data output buffer
+			hdr->dxfer_len = 0;
+			break;
+		case FIO_SG_WRITE_STREAM:
+			hdr->cmdp[0] = 0x9a; // write stream (16)
+			if (o->writefua)
+				hdr->cmdp[1] |= 0x08;
+			sgio_set_be64(lba, &hdr->cmdp[2]);
+			sgio_set_be16((uint16_t) io_u->file->engine_pos, &hdr->cmdp[10]);
+			sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[12]);
+			break;
+		case FIO_SG_VERIFY_BYTCHK_00:
+			if (lba < MAX_10B_LBA)
+				hdr->cmdp[0] = 0x2f; // VERIFY(10)
+			else
+				hdr->cmdp[0] = 0x8f; // VERIFY(16)
+			hdr->dxfer_len = 0;
+			break;
+		case FIO_SG_VERIFY_BYTCHK_01:
+			if (lba < MAX_10B_LBA)
+				hdr->cmdp[0] = 0x2f; // VERIFY(10)
+			else
+				hdr->cmdp[0] = 0x8f; // VERIFY(16)
+			hdr->cmdp[1] |= 0x02;		// BYTCHK = 01b
+			break;
+		case FIO_SG_VERIFY_BYTCHK_11:
+			if (lba < MAX_10B_LBA)
+				hdr->cmdp[0] = 0x2f; // VERIFY(10)
+			else
+				hdr->cmdp[0] = 0x8f; // VERIFY(16)
+			hdr->cmdp[1] |= 0x06;		// BYTCHK = 11b
+			hdr->dxfer_len = sd->bs;
+			break;
 		};
 
-		fio_sgio_rw_lba(hdr, lba, nr_blocks);
+		if (o->write_mode != FIO_SG_WRITE_STREAM)
+			fio_sgio_rw_lba(hdr, lba, nr_blocks,
+				o->write_mode == FIO_SG_WRITE_SAME_NDOB);
 
 	} else if (io_u->ddir == DDIR_TRIM) {
 		struct sgio_trim *st;
@@ -970,9 +1058,60 @@ static int fio_sgio_type_check(struct thread_data *td, struct fio_file *f)
 	return 0;
 }
 
+static int fio_sgio_stream_control(struct fio_file *f, bool open_stream, uint16_t *stream_id)
+{
+	struct sg_io_hdr hdr;
+	unsigned char cmd[16];
+	unsigned char sb[64];
+	unsigned char buf[8];
+	int ret;
+
+	memset(&hdr, 0, sizeof(hdr));
+	memset(cmd, 0, sizeof(cmd));
+	memset(sb, 0, sizeof(sb));
+	memset(buf, 0, sizeof(buf));
+
+	hdr.interface_id = 'S';
+	hdr.cmdp = cmd;
+	hdr.cmd_len = 16;
+	hdr.sbp = sb;
+	hdr.mx_sb_len = sizeof(sb);
+	hdr.timeout = SCSI_TIMEOUT_MS;
+	hdr.cmdp[0] = 0x9e;
+	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	hdr.dxferp = buf;
+	hdr.dxfer_len = sizeof(buf);
+	sgio_set_be32(sizeof(buf), &hdr.cmdp[10]);
+
+	if (open_stream)
+		hdr.cmdp[1] = 0x34;
+	else {
+		hdr.cmdp[1] = 0x54;
+		sgio_set_be16(*stream_id, &hdr.cmdp[4]);
+	}
+
+	ret = ioctl(f->fd, SG_IO, &hdr);
+
+	if (ret < 0)
+		return ret;
+
+	if (hdr.info & SG_INFO_CHECK)
+		return 1;
+
+	if (open_stream) {
+		*stream_id = sgio_get_be16(&buf[4]);
+		dprint(FD_FILE, "sgio_stream_control: opened stream %u\n", (unsigned int) *stream_id);
+		assert(*stream_id != 0);
+	} else
+		dprint(FD_FILE, "sgio_stream_control: closed stream %u\n", (unsigned int) *stream_id);
+
+	return 0;
+}
+
 static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
 {
 	struct sgio_data *sd = td->io_ops_data;
+	struct sg_options *o = td->eo;
 	int ret;
 
 	ret = generic_open_file(td, f);
@@ -984,9 +1123,33 @@ static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
 		return ret;
 	}
 
+	if (o->write_mode == FIO_SG_WRITE_STREAM) {
+		if (o->stream_id)
+			f->engine_pos = o->stream_id;
+		else {
+			ret = fio_sgio_stream_control(f, true, (uint16_t *) &f->engine_pos);
+			if (ret)
+				return ret;
+		}
+	}
+
 	return 0;
 }
 
+int fio_sgio_close(struct thread_data *td, struct fio_file *f)
+{
+	struct sg_options *o = td->eo;
+	int ret;
+
+	if (!o->stream_id && o->write_mode == FIO_SG_WRITE_STREAM) {
+		ret = fio_sgio_stream_control(f, false, (uint16_t *) &f->engine_pos);
+		if (ret)
+			return ret;
+	}
+
+	return generic_close_file(td, f);
+}
+
 /*
  * Build an error string with details about the driver, host or scsi
  * error contained in the sg header Caller will use as necessary.
@@ -1261,7 +1424,7 @@ static struct ioengine_ops ioengine = {
 	.event		= fio_sgio_event,
 	.cleanup	= fio_sgio_cleanup,
 	.open_file	= fio_sgio_open,
-	.close_file	= generic_close_file,
+	.close_file	= fio_sgio_close,
 	.get_file_size	= fio_sgio_get_file_size,
 	.flags		= FIO_SYNCIO | FIO_RAWIO,
 	.options	= options,
diff --git a/examples/sg_verify-fail.fio b/examples/sg_verify-fail.fio
new file mode 100644
index 00000000..64feece3
--- /dev/null
+++ b/examples/sg_verify-fail.fio
@@ -0,0 +1,48 @@
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL  *
+# * DESTROY DATA ON /dev/sdb       *
+# **********************************
+#
+# Test SCSI VERIFY commands issued via the sg ioengine
+# The jobs with fail in the name should produce errors
+#
+# job			description
+# precon		precondition the device by writing with a known
+#			pattern
+# verify01		verify each block one at a time by comparing to known
+#			pattern
+# verify01-fail		verifying one too many blocks should produce a failure
+# verify11-one_ios	verify all 20 blocks by sending only 512 bytes
+# verify11-fail		verifying beyond the preconditioned region should
+#			produce a failure
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+number_ios=20
+stonewall
+
+[precon]
+
+[verify01]
+sg_write_mode=verify_bytchk_01
+number_ios=20
+
+[verify01-fail]
+sg_write_mode=verify_bytchk_01
+number_ios=21
+
+[verify11-one_ios]
+sg_write_mode=verify_bytchk_11
+number_ios=1
+bs=10240
+
+[verify11-fail]
+sg_write_mode=verify_bytchk_11
+number_ios=1
+bs=10752
diff --git a/examples/sg_verify.fio b/examples/sg_verify.fio
new file mode 100644
index 00000000..6db0dd0a
--- /dev/null
+++ b/examples/sg_verify.fio
@@ -0,0 +1,57 @@
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL  *
+# * DESTROY DATA ON /dev/sdb       *
+# **********************************
+#
+# Test SCSI VERIFY commands issued via the sg ioengine
+# All of the jobs below should complete without error
+#
+# job			description
+# precon		precondition the device by writing with a known
+#			pattern
+# verify00		verify written data on medium only
+# verify01		verify each block one at a time by comparing to known
+#			pattern
+# verify01-two_ios	verify same data but with only two VERIFY operations
+# verify11		verify each block one at a time
+# verify11-five_ios	verify data with five IOs, four blocks at a time,
+#			sending 512 bytes for each IO
+# verify11-one_ios	verify all 20 blocks by sending only 512 bytes
+#
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+number_ios=20
+stonewall
+
+[precon]
+
+[verify00]
+sg_write_mode=verify_bytchk_00
+
+[verify01]
+sg_write_mode=verify_bytchk_01
+
+[verify01-two_ios]
+sg_write_mode=verify_bytchk_01
+bs=5120
+number_ios=2
+
+[verify11]
+sg_write_mode=verify_bytchk_11
+
+[verify11-five_ios]
+sg_write_mode=verify_bytchk_11
+bs=2048
+number_ios=5
+
+[verify11-one_ios]
+sg_write_mode=verify_bytchk_11
+bs=10240
+number_ios=1
diff --git a/examples/sg_write_same_ndob.fio b/examples/sg_write_same_ndob.fio
new file mode 100644
index 00000000..fb047319
--- /dev/null
+++ b/examples/sg_write_same_ndob.fio
@@ -0,0 +1,44 @@
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL  *
+# * DESTROY DATA ON /dev/sdb       *
+# **********************************
+#
+# Test WRITE SAME commands with the NDOB flag set
+# issued via the sg ioengine
+# All of the jobs below should complete without error
+# except the last one
+#
+# job			description
+# precon		Precondition the device by writing 20 blocks with a
+# 			known pattern
+# write_same_ndob	Write 19 sectors of all zeroes with the NDOB flag set
+# verify-pass		Verify 19 blocks of all zeroes
+# verify-fail		Verify 20 blocks of all zeroes. This should fail.
+#
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+stonewall
+
+[precon]
+number_ios=20
+
+[write_same_ndob]
+sg_write_mode=write_same_ndob
+number_ios=19
+
+[verify-pass]
+sg_write_mode=verify_bytchk_01
+buffer_pattern=0x00
+number_ios=19
+
+[verify-fail]
+sg_write_mode=verify_bytchk_01
+buffer_pattern=0x00
+number_ios=20
diff --git a/fio.1 b/fio.1
index e0458c22..34aa874d 100644
--- a/fio.1
+++ b/fio.1
@@ -2284,7 +2284,7 @@ With writefua option set to 1, write operations include the force
 unit access (fua) flag. Default: 0.
 .TP
 .BI (sg)sg_write_mode \fR=\fPstr
-Specify the type of write commands to issue. This option can take three
+Specify the type of write commands to issue. This option can take multiple
 values:
 .RS
 .RS
@@ -2292,12 +2292,15 @@ values:
 .B write (default)
 Write opcodes are issued as usual
 .TP
+.B write_and_verify
+Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 00b. This directs the
+device to carry out a medium verification with no data comparison for the data
+that was written. The writefua option is ignored with this selection.
+.TP
 .B verify
-Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
-directs the device to carry out a medium verification with no data
-comparison. The writefua option is ignored with this selection.
+This option is deprecated. Use write_and_verify instead.
 .TP
-.B same
+.B write_same
 Issue WRITE SAME commands. This transfers a single block to the device
 and writes this same block of data to a contiguous sequence of LBAs
 beginning at the specified offset. fio's block size parameter
@@ -2308,9 +2311,43 @@ blocksize=8k will write 16 sectors with each command. fio will still
 generate 8k of data for each command butonly the first 512 bytes will
 be used and transferred to the device. The writefua option is ignored
 with this selection.
+.TP
+.B same
+This option is deprecated. Use write_same instead.
+.TP
+.B write_same_ndob
+Issue WRITE SAME(16) commands as above but with the No Data Output
+Buffer (NDOB) bit set. No data will be transferred to the device with
+this bit set. Data written will be a pre-determined pattern such as
+all zeroes.
+.TP
+.B write_stream
+Issue WRITE STREAM(16) commands. Use the stream_id option to specify
+the stream identifier.
+.TP
+.B verify_bytchk_00
+Issue VERIFY commands with BYTCHK set to 00. This directs the device to carry
+out a medium verification with no data comparison.
+.TP
+.B verify_bytchk_01
+Issue VERIFY commands with BYTCHK set to 01. This directs the device to
+compare the data on the device with the data transferred to the device.
+.TP
+.B verify_bytchk_11
+Issue VERIFY commands with BYTCHK set to 11. This transfers a single block to
+the device and compares the contents of this block with the data on the device
+beginning at the specified offset. fio's block size parameter specifies the
+total amount of data compared with this command. However, only one block
+(sector) worth of data is transferred to the device. This is similar to the
+WRITE SAME command except that data is compared instead of written.
 .RE
 .RE
 .TP
+.BI (sg)stream_id \fR=\fPint
+Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not
+a valid stream identifier) fio will open a stream and then close it when done. Default
+is 0.
+.TP
 .BI (nbd)uri \fR=\fPstr
 Specify the NBD URI of the server to test.
 The string is a standard NBD URI (see
diff --git a/stat.c b/stat.c
index 36742a25..b08d2f25 100644
--- a/stat.c
+++ b/stat.c
@@ -462,173 +462,45 @@ static void display_lat(const char *name, unsigned long long min,
 	free(maxp);
 }
 
-static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, int mean)
+static struct thread_stat *gen_mixed_ddir_stats_from_ts(struct thread_stat *ts)
 {
-	double p_of_agg = 100.0;
-	if (rs && rs->agg[ddir] > 1024) {
-		p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0);
-
-		if (p_of_agg > 100.0)
-			p_of_agg = 100.0;
-	}
-	return p_of_agg;
-}
-
-static void show_mixed_ddir_status(struct group_run_stats *rs,
-				   struct thread_stat *ts,
-				   struct buf_output *out)
-{
-	unsigned long runt;
-	unsigned long long min, max, bw, iops;
-	double mean, dev;
-	char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
 	struct thread_stat *ts_lcl;
-	int i2p;
-	int ddir = 0;
 
 	/*
 	 * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
-	 * Trims (ddir = 2) */
+	 * Trims (ddir = 2)
+	 */
 	ts_lcl = malloc(sizeof(struct thread_stat));
-	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-	/* calculate mixed stats  */
-	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
-	init_thread_stat_min_vals(ts_lcl);
-
-	sum_thread_stats(ts_lcl, ts);
-
-	assert(ddir_rw(ddir));
-
-	if (!ts_lcl->runtime[ddir]) {
-		free(ts_lcl);
-		return;
-	}
-
-	i2p = is_power_of_2(rs->kb_base);
-	runt = ts_lcl->runtime[ddir];
-
-	bw = (1000 * ts_lcl->io_bytes[ddir]) / runt;
-	io_p = num2str(ts_lcl->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE);
-	bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base);
-	bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base);
-
-	iops = (1000 * ts_lcl->total_io_u[ddir]) / runt;
-	iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE);
-
-	log_buf(out, "  mixed: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
-			iops_p, bw_p, bw_p_alt, io_p,
-			(unsigned long long) ts_lcl->runtime[ddir],
-			post_st ? : "");
-
-	free(post_st);
-	free(io_p);
-	free(bw_p);
-	free(bw_p_alt);
-	free(iops_p);
-
-	if (calc_lat(&ts_lcl->slat_stat[ddir], &min, &max, &mean, &dev))
-		display_lat("slat", min, max, mean, dev, out);
-	if (calc_lat(&ts_lcl->clat_stat[ddir], &min, &max, &mean, &dev))
-		display_lat("clat", min, max, mean, dev, out);
-	if (calc_lat(&ts_lcl->lat_stat[ddir], &min, &max, &mean, &dev))
-		display_lat(" lat", min, max, mean, dev, out);
-	if (calc_lat(&ts_lcl->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
-		display_lat(ts_lcl->lat_percentiles ? "high prio_lat" : "high prio_clat",
-				min, max, mean, dev, out);
-		if (calc_lat(&ts_lcl->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
-			display_lat(ts_lcl->lat_percentiles ? "low prio_lat" : "low prio_clat",
-					min, max, mean, dev, out);
-	}
-
-	if (ts->slat_percentiles && ts_lcl->slat_stat[ddir].samples > 0)
-		show_clat_percentiles(ts_lcl->io_u_plat[FIO_SLAT][ddir],
-				ts_lcl->slat_stat[ddir].samples,
-				ts->percentile_list,
-				ts->percentile_precision, "slat", out);
-	if (ts->clat_percentiles && ts_lcl->clat_stat[ddir].samples > 0)
-		show_clat_percentiles(ts_lcl->io_u_plat[FIO_CLAT][ddir],
-				ts_lcl->clat_stat[ddir].samples,
-				ts->percentile_list,
-				ts->percentile_precision, "clat", out);
-	if (ts->lat_percentiles && ts_lcl->lat_stat[ddir].samples > 0)
-		show_clat_percentiles(ts_lcl->io_u_plat[FIO_LAT][ddir],
-				ts_lcl->lat_stat[ddir].samples,
-				ts->percentile_list,
-				ts->percentile_precision, "lat", out);
-
-	if (ts->clat_percentiles || ts->lat_percentiles) {
-		const char *name = ts->lat_percentiles ? "lat" : "clat";
-		char prio_name[32];
-		uint64_t samples;
-
-		if (ts->lat_percentiles)
-			samples = ts_lcl->lat_stat[ddir].samples;
-		else
-			samples = ts_lcl->clat_stat[ddir].samples;
-
-		/* Only print if high and low priority stats were collected */
-		if (ts_lcl->clat_high_prio_stat[ddir].samples > 0 &&
-				ts_lcl->clat_low_prio_stat[ddir].samples > 0) {
-			sprintf(prio_name, "high prio (%.2f%%) %s",
-					100. * (double) ts_lcl->clat_high_prio_stat[ddir].samples / (double) samples,
-					name);
-			show_clat_percentiles(ts_lcl->io_u_plat_high_prio[ddir],
-					ts_lcl->clat_high_prio_stat[ddir].samples,
-					ts->percentile_list,
-					ts->percentile_precision, prio_name, out);
-
-			sprintf(prio_name, "low prio (%.2f%%) %s",
-					100. * (double) ts_lcl->clat_low_prio_stat[ddir].samples / (double) samples,
-					name);
-			show_clat_percentiles(ts_lcl->io_u_plat_low_prio[ddir],
-					ts_lcl->clat_low_prio_stat[ddir].samples,
-					ts->percentile_list,
-					ts->percentile_precision, prio_name, out);
-		}
+	if (!ts_lcl) {
+		log_err("fio: failed to allocate local thread stat\n");
+		return NULL;
 	}
 
-	if (calc_lat(&ts_lcl->bw_stat[ddir], &min, &max, &mean, &dev)) {
-		double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
-		const char *bw_str;
+	init_thread_stat(ts_lcl);
 
-		if ((rs->unit_base == 1) && i2p)
-			bw_str = "Kibit";
-		else if (rs->unit_base == 1)
-			bw_str = "kbit";
-		else if (i2p)
-			bw_str = "KiB";
-		else
-			bw_str = "kB";
+	/* calculate mixed stats  */
+	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
+	ts_lcl->lat_percentiles = ts->lat_percentiles;
+	ts_lcl->clat_percentiles = ts->clat_percentiles;
+	ts_lcl->slat_percentiles = ts->slat_percentiles;
+	ts_lcl->percentile_precision = ts->percentile_precision;
+	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
 
-		p_of_agg = convert_agg_kbytes_percent(rs, ddir, mean);
+	sum_thread_stats(ts_lcl, ts);
 
-		if (rs->unit_base == 1) {
-			min *= 8.0;
-			max *= 8.0;
-			mean *= 8.0;
-			dev *= 8.0;
-		}
+	return ts_lcl;
+}
 
-		if (mean > fkb_base * fkb_base) {
-			min /= fkb_base;
-			max /= fkb_base;
-			mean /= fkb_base;
-			dev /= fkb_base;
-			bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
-		}
+static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, int mean)
+{
+	double p_of_agg = 100.0;
+	if (rs && rs->agg[ddir] > 1024) {
+		p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0);
 
-		log_buf(out, "   bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, "
-			"avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
-			bw_str, min, max, p_of_agg, mean, dev,
-			(&ts_lcl->bw_stat[ddir])->samples);
-	}
-	if (calc_lat(&ts_lcl->iops_stat[ddir], &min, &max, &mean, &dev)) {
-		log_buf(out, "   iops        : min=%5llu, max=%5llu, "
-			"avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
-			min, max, mean, dev, (&ts_lcl->iops_stat[ddir])->samples);
+		if (p_of_agg > 100.0)
+			p_of_agg = 100.0;
 	}
-
-	free(ts_lcl);
+	return p_of_agg;
 }
 
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
@@ -797,6 +669,18 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 	}
 }
 
+static void show_mixed_ddir_status(struct group_run_stats *rs,
+				   struct thread_stat *ts,
+				   struct buf_output *out)
+{
+	struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts);
+
+	if (ts_lcl)
+		show_ddir_status(rs, ts_lcl, DDIR_READ, out);
+
+	free(ts_lcl);
+}
+
 static bool show_lat(double *io_u_lat, int nr, const char **ranges,
 		     const char *msg, struct buf_output *out)
 {
@@ -1462,27 +1346,11 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
 				   struct group_run_stats *rs,
 				   int ver, struct buf_output *out)
 {
-	struct thread_stat *ts_lcl;
+	struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts);
 
-	/*
-	 * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
-	 * Trims (ddir = 2)
-	 */
-	ts_lcl = malloc(sizeof(struct thread_stat));
-	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-	/* calculate mixed stats  */
-	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
-	init_thread_stat_min_vals(ts_lcl);
-	ts_lcl->lat_percentiles = ts->lat_percentiles;
-	ts_lcl->clat_percentiles = ts->clat_percentiles;
-	ts_lcl->slat_percentiles = ts->slat_percentiles;
-	ts_lcl->percentile_precision = ts->percentile_precision;
-	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
-	
-	sum_thread_stats(ts_lcl, ts);
+	if (ts_lcl)
+		show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
 
-	/* add the aggregated stats to json parent */
-	show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
 	free(ts_lcl);
 }
 
@@ -1660,27 +1528,12 @@ static void add_ddir_status_json(struct thread_stat *ts,
 static void add_mixed_ddir_status_json(struct thread_stat *ts,
 		struct group_run_stats *rs, struct json_object *parent)
 {
-	struct thread_stat *ts_lcl;
-
-	/*
-	 * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
-	 * Trims (ddir = 2)
-	 */
-	ts_lcl = malloc(sizeof(struct thread_stat));
-	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-	/* calculate mixed stats  */
-	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
-	init_thread_stat_min_vals(ts_lcl);
-	ts_lcl->lat_percentiles = ts->lat_percentiles;
-	ts_lcl->clat_percentiles = ts->clat_percentiles;
-	ts_lcl->slat_percentiles = ts->slat_percentiles;
-	ts_lcl->percentile_precision = ts->percentile_precision;
-	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
-
-	sum_thread_stats(ts_lcl, ts);
+	struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts);
 
 	/* add the aggregated stats to json parent */
-	add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
+	if (ts_lcl)
+		add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
+
 	free(ts_lcl);
 }
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-18 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-18 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 016869bebe9bef7cae5a7f9dc0762162b0612226:

  stat: remove unnecessary bool parameter to sum_thread_stats() (2022-01-10 09:22:14 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to ef37053efdfb8c3b8b6deef43c0969753e6adb44:

  init: do not create lat logs when not needed (2022-01-17 07:21:58 -0700)

----------------------------------------------------------------
Damien Le Moal (1):
      init: do not create lat logs when not needed

 init.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

---

Diff of recent changes:

diff --git a/init.c b/init.c
index 5f069d9a..07daaa84 100644
--- a/init.c
+++ b/init.c
@@ -1586,17 +1586,23 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
 		else
 			suf = "log";
 
-		gen_log_name(logname, sizeof(logname), "lat", pre,
-				td->thread_number, suf, o->per_job_logs);
-		setup_log(&td->lat_log, &p, logname);
+		if (!o->disable_lat) {
+			gen_log_name(logname, sizeof(logname), "lat", pre,
+				     td->thread_number, suf, o->per_job_logs);
+			setup_log(&td->lat_log, &p, logname);
+		}
 
-		gen_log_name(logname, sizeof(logname), "slat", pre,
-				td->thread_number, suf, o->per_job_logs);
-		setup_log(&td->slat_log, &p, logname);
+		if (!o->disable_slat) {
+			gen_log_name(logname, sizeof(logname), "slat", pre,
+				     td->thread_number, suf, o->per_job_logs);
+			setup_log(&td->slat_log, &p, logname);
+		}
 
-		gen_log_name(logname, sizeof(logname), "clat", pre,
-				td->thread_number, suf, o->per_job_logs);
-		setup_log(&td->clat_log, &p, logname);
+		if (!o->disable_clat) {
+			gen_log_name(logname, sizeof(logname), "clat", pre,
+				     td->thread_number, suf, o->per_job_logs);
+			setup_log(&td->clat_log, &p, logname);
+		}
 
 	}
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-11 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-11 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit b5e99df6ec605b4dc6a3488203f32d5c5bfce8df:

  engines/io_uring: don't set CQSIZE clamp unconditionally (2022-01-09 19:34:27 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 016869bebe9bef7cae5a7f9dc0762162b0612226:

  stat: remove unnecessary bool parameter to sum_thread_stats() (2022-01-10 09:22:14 -0700)

----------------------------------------------------------------
Niklas Cassel (1):
      stat: remove unnecessary bool parameter to sum_thread_stats()

 client.c      |  2 +-
 gclient.c     |  2 +-
 rate-submit.c |  2 +-
 stat.c        | 53 +++++++++++++++++++++++------------------------------
 stat.h        |  2 +-
 5 files changed, 27 insertions(+), 34 deletions(-)

---

Diff of recent changes:

diff --git a/client.c b/client.c
index 8b230617..be8411d8 100644
--- a/client.c
+++ b/client.c
@@ -1111,7 +1111,7 @@ static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd)
 	if (sum_stat_clients <= 1)
 		return;
 
-	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
+	sum_thread_stats(&client_ts, &p->ts);
 	sum_group_stats(&client_gs, &p->rs);
 
 	client_ts.members++;
diff --git a/gclient.c b/gclient.c
index e0e0e7bf..ac063536 100644
--- a/gclient.c
+++ b/gclient.c
@@ -292,7 +292,7 @@ static void gfio_thread_status_op(struct fio_client *client,
 	if (sum_stat_clients == 1)
 		return;
 
-	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
+	sum_thread_stats(&client_ts, &p->ts);
 	sum_group_stats(&client_gs, &p->rs);
 
 	client_ts.members++;
diff --git a/rate-submit.c b/rate-submit.c
index 13dbe7a2..752c30a5 100644
--- a/rate-submit.c
+++ b/rate-submit.c
@@ -195,7 +195,7 @@ static void io_workqueue_exit_worker_fn(struct submit_worker *sw,
 	struct thread_data *td = sw->priv;
 
 	(*sum_cnt)++;
-	sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1);
+	sum_thread_stats(&sw->wq->td->ts, &td->ts);
 
 	fio_options_free(td);
 	close_and_free_files(td);
diff --git a/stat.c b/stat.c
index 99de1294..36742a25 100644
--- a/stat.c
+++ b/stat.c
@@ -495,7 +495,7 @@ static void show_mixed_ddir_status(struct group_run_stats *rs,
 	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
 	init_thread_stat_min_vals(ts_lcl);
 
-	sum_thread_stats(ts_lcl, ts, 1);
+	sum_thread_stats(ts_lcl, ts);
 
 	assert(ddir_rw(ddir));
 
@@ -1479,7 +1479,7 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
 	ts_lcl->percentile_precision = ts->percentile_precision;
 	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
 	
-	sum_thread_stats(ts_lcl, ts, 1);
+	sum_thread_stats(ts_lcl, ts);
 
 	/* add the aggregated stats to json parent */
 	show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
@@ -1677,7 +1677,7 @@ static void add_mixed_ddir_status_json(struct thread_stat *ts,
 	ts_lcl->percentile_precision = ts->percentile_precision;
 	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
 
-	sum_thread_stats(ts_lcl, ts, 1);
+	sum_thread_stats(ts_lcl, ts);
 
 	/* add the aggregated stats to json parent */
 	add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
@@ -2089,9 +2089,10 @@ static void __sum_stat(struct io_stat *dst, struct io_stat *src, bool first)
  * numbers. For group_reporting, we should just add those up, not make
  * them the mean of everything.
  */
-static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first,
-		     bool pure_sum)
+static void sum_stat(struct io_stat *dst, struct io_stat *src, bool pure_sum)
 {
+	bool first = dst->samples == 0;
+
 	if (src->samples == 0)
 		return;
 
@@ -2141,49 +2142,41 @@ void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src)
 		dst->sig_figs = src->sig_figs;
 }
 
-void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
-		      bool first)
+void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
 {
 	int k, l, m;
 
-	sum_stat(&dst->sync_stat, &src->sync_stat, first, false);
-
 	for (l = 0; l < DDIR_RWDIR_CNT; l++) {
 		if (dst->unified_rw_rep != UNIFIED_MIXED) {
-			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
-			sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false);
-			sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false);
-			sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first, false);
-			sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first, false);
-			sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first, true);
-			sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first, true);
+			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], false);
+			sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], false);
+			sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], false);
+			sum_stat(&dst->slat_stat[l], &src->slat_stat[l], false);
+			sum_stat(&dst->lat_stat[l], &src->lat_stat[l], false);
+			sum_stat(&dst->bw_stat[l], &src->bw_stat[l], true);
+			sum_stat(&dst->iops_stat[l], &src->iops_stat[l], true);
 
 			dst->io_bytes[l] += src->io_bytes[l];
 
 			if (dst->runtime[l] < src->runtime[l])
 				dst->runtime[l] = src->runtime[l];
 		} else {
-			sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first, false);
-			sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], first, false);
-			sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], first, false);
-			sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first, false);
-			sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first, false);
-			sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first, true);
-			sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first, true);
+			sum_stat(&dst->clat_stat[0], &src->clat_stat[l], false);
+			sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], false);
+			sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], false);
+			sum_stat(&dst->slat_stat[0], &src->slat_stat[l], false);
+			sum_stat(&dst->lat_stat[0], &src->lat_stat[l], false);
+			sum_stat(&dst->bw_stat[0], &src->bw_stat[l], true);
+			sum_stat(&dst->iops_stat[0], &src->iops_stat[l], true);
 
 			dst->io_bytes[0] += src->io_bytes[l];
 
 			if (dst->runtime[0] < src->runtime[l])
 				dst->runtime[0] = src->runtime[l];
-
-			/*
-			 * We're summing to the same destination, so override
-			 * 'first' after the first iteration of the loop
-			 */
-			first = false;
 		}
 	}
 
+	sum_stat(&dst->sync_stat, &src->sync_stat, false);
 	dst->usr_time += src->usr_time;
 	dst->sys_time += src->sys_time;
 	dst->ctx += src->ctx;
@@ -2417,7 +2410,7 @@ void __show_run_stats(void)
 		for (k = 0; k < ts->nr_block_infos; k++)
 			ts->block_infos[k] = td->ts.block_infos[k];
 
-		sum_thread_stats(ts, &td->ts, idx == 1);
+		sum_thread_stats(ts, &td->ts);
 
 		if (td->o.ss_dur) {
 			ts->ss_state = td->ss.state;
diff --git a/stat.h b/stat.h
index 9ef8caa4..15ca4eff 100644
--- a/stat.h
+++ b/stat.h
@@ -325,7 +325,7 @@ extern void __show_run_stats(void);
 extern int __show_running_run_stats(void);
 extern void show_running_run_stats(void);
 extern void check_for_running_stats(void);
-extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
+extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src);
 extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
 extern void init_thread_stat_min_vals(struct thread_stat *ts);
 extern void init_thread_stat(struct thread_stat *ts);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2022-01-10 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2022-01-10 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit a3e33e2fc06582e4170f90ae6e62d6225d52dc7c:

  Merge branch 'github-actions-i686' of https://github.com/vincentkfu/fio (2021-12-23 16:27:33 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to b5e99df6ec605b4dc6a3488203f32d5c5bfce8df:

  engines/io_uring: don't set CQSIZE clamp unconditionally (2022-01-09 19:34:27 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      engines/io_uring: don't set CQSIZE clamp unconditionally

 engines/io_uring.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/engines/io_uring.c b/engines/io_uring.c
index 00ae3482..a2533c88 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -699,9 +699,15 @@ static int fio_ioring_queue_init(struct thread_data *td)
 	p.flags |= IORING_SETUP_CQSIZE;
 	p.cq_entries = depth;
 
+retry:
 	ret = syscall(__NR_io_uring_setup, depth, &p);
-	if (ret < 0)
+	if (ret < 0) {
+		if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
+			p.flags &= ~IORING_SETUP_CQSIZE;
+			goto retry;
+		}
 		return ret;
+	}
 
 	ld->ring_fd = ret;
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-24 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-24 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 9b46661c289d01dbfe5182189a7abea9ce2f9e04:

  Fio 3.29 (2021-12-18 07:09:32 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a3e33e2fc06582e4170f90ae6e62d6225d52dc7c:

  Merge branch 'github-actions-i686' of https://github.com/vincentkfu/fio (2021-12-23 16:27:33 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'github-actions-i686' of https://github.com/vincentkfu/fio

Vincent Fu (4):
      ci: workaround for problem with i686 builds
      Revert "ci: temporarily remove linux-i686-gcc build"
      t/io_uring: fix 32-bit build warnings
      t/io_uring: fix help defaults for aio and random_io

 .github/workflows/ci.yml | 4 ++++
 ci/actions-install.sh    | 5 ++++-
 t/io_uring.c             | 9 +++++----
 3 files changed, 13 insertions(+), 5 deletions(-)

---

Diff of recent changes:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8167e3d1..cd8ce142 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,7 @@ jobs:
         - linux-gcc
         - linux-clang
         - macos
+        - linux-i686-gcc
         include:
         - build: linux-gcc
           os: ubuntu-20.04
@@ -23,6 +24,9 @@ jobs:
           cc: clang
         - build: macos
           os: macos-11
+        - build: linux-i686-gcc
+          os: ubuntu-20.04
+          arch: i686
 
     env:
       CI_TARGET_ARCH: ${{ matrix.arch }}
diff --git a/ci/actions-install.sh b/ci/actions-install.sh
index 7408ccb4..b3486a47 100755
--- a/ci/actions-install.sh
+++ b/ci/actions-install.sh
@@ -31,14 +31,17 @@ DPKGCFG
     case "${CI_TARGET_ARCH}" in
         "i686")
             sudo dpkg --add-architecture i386
+            opts="--allow-downgrades"
             pkgs=("${pkgs[@]/%/:i386}")
             pkgs+=(
                 gcc-multilib
                 pkg-config:i386
                 zlib1g-dev:i386
+		libpcre2-8-0=10.34-7
             )
             ;;
         "x86_64")
+            opts=""
             pkgs+=(
                 libglusterfs-dev
                 libgoogle-perftools-dev
@@ -62,7 +65,7 @@ DPKGCFG
     echo "Updating APT..."
     sudo apt-get -qq update
     echo "Installing packages..."
-    sudo apt-get install -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}"
+    sudo apt-get install "$opts" -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}"
 }
 
 install_linux() {
diff --git a/t/io_uring.c b/t/io_uring.c
index a98f78fd..e8365a79 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -634,7 +634,8 @@ static int submitter_init(struct submitter *s)
 #ifdef CONFIG_LIBAIO
 static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
 {
-	unsigned long offset, data;
+	uint64_t data;
+	long long offset;
 	struct file *f;
 	unsigned index;
 	long r;
@@ -663,7 +664,7 @@ static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocb
 
 		data = f->fileno;
 		if (stats && stats_running)
-			data |= ((unsigned long) s->clock_index << 32);
+			data |= (((uint64_t) s->clock_index) << 32);
 		iocb->data = (void *) (uintptr_t) data;
 		index++;
 	}
@@ -676,7 +677,7 @@ static int reap_events_aio(struct submitter *s, struct io_event *events, int evs
 	int reaped = 0;
 
 	while (evs) {
-		unsigned long data = (uintptr_t) events[reaped].data;
+		uint64_t data = (uintptr_t) events[reaped].data;
 		struct file *f = &s->files[data & 0xffffffff];
 
 		f->pending_ios--;
@@ -1094,7 +1095,7 @@ static void usage(char *argv, int status)
 		" -a <bool> : Use legacy aio, default %d\n",
 		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
 		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
-		stats, runtime == 0 ? "unlimited" : runtime_str, aio, random_io);
+		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio);
 	exit(status);
 }
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-19 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-19 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit e86afa536b175a90546e20d7d19f2418ee1bca78:

  stat: sum sync_stat before reassigning bool first (2021-12-15 08:45:32 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 9b46661c289d01dbfe5182189a7abea9ce2f9e04:

  Fio 3.29 (2021-12-18 07:09:32 -0700)

----------------------------------------------------------------
Jens Axboe (2):
      stat: code cleanup and leak free
      Fio 3.29

 FIO-VERSION-GEN |  2 +-
 stat.c          | 84 ++++++++++++++++++++++++++++++++++-----------------------
 2 files changed, 51 insertions(+), 35 deletions(-)

---

Diff of recent changes:

diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index e9d563c1..60f7bb21 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.28
+DEF_VER=fio-3.29
 
 LF='
 '
diff --git a/stat.c b/stat.c
index ec44c79e..99de1294 100644
--- a/stat.c
+++ b/stat.c
@@ -289,9 +289,10 @@ void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out)
 {
 	char *io, *agg, *min, *max;
 	char *ioalt, *aggalt, *minalt, *maxalt;
-	uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0, min_run = -1, max_run = 0;
-	int i;
+	uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0;
+	uint64_t min_run = -1, max_run = 0;
 	const int i2p = is_power_of_2(rs->kb_base);
+	int i;
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		if (!rs->max_run[i])
@@ -363,9 +364,9 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
 		free(minalt);
 		free(maxalt);
 	}
-	
+
 	/* Need to aggregate statistics to show mixed values */
-	if (rs->unified_rw_rep == UNIFIED_BOTH) 
+	if (rs->unified_rw_rep == UNIFIED_BOTH)
 		show_mixed_group_stats(rs, out);
 }
 
@@ -473,30 +474,35 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i
 	return p_of_agg;
 }
 
-static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
-			     struct buf_output *out)
+static void show_mixed_ddir_status(struct group_run_stats *rs,
+				   struct thread_stat *ts,
+				   struct buf_output *out)
 {
 	unsigned long runt;
 	unsigned long long min, max, bw, iops;
 	double mean, dev;
 	char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
 	struct thread_stat *ts_lcl;
-
 	int i2p;
 	int ddir = 0;
 
-	/* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+	/*
+	 * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
+	 * Trims (ddir = 2) */
 	ts_lcl = malloc(sizeof(struct thread_stat));
 	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-	ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
+	/* calculate mixed stats  */
+	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
 	init_thread_stat_min_vals(ts_lcl);
 
 	sum_thread_stats(ts_lcl, ts, 1);
 
 	assert(ddir_rw(ddir));
 
-	if (!ts_lcl->runtime[ddir])
+	if (!ts_lcl->runtime[ddir]) {
+		free(ts_lcl);
 		return;
+	}
 
 	i2p = is_power_of_2(rs->kb_base);
 	runt = ts_lcl->runtime[ddir];
@@ -560,10 +566,9 @@ static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_sta
 		else
 			samples = ts_lcl->clat_stat[ddir].samples;
 
-		/* Only print this if some high and low priority stats were collected */
+		/* Only print if high and low priority stats were collected */
 		if (ts_lcl->clat_high_prio_stat[ddir].samples > 0 &&
-				ts_lcl->clat_low_prio_stat[ddir].samples > 0)
-		{
+				ts_lcl->clat_low_prio_stat[ddir].samples > 0) {
 			sprintf(prio_name, "high prio (%.2f%%) %s",
 					100. * (double) ts_lcl->clat_high_prio_stat[ddir].samples / (double) samples,
 					name);
@@ -1222,9 +1227,8 @@ void show_disk_util(int terse, struct json_object *parent,
 	if (!is_running_backend())
 		return;
 
-	if (flist_empty(&disk_list)) {
+	if (flist_empty(&disk_list))
 		return;
-	}
 
 	if ((output_format & FIO_OUTPUT_JSON) && parent)
 		do_json = true;
@@ -1234,9 +1238,9 @@ void show_disk_util(int terse, struct json_object *parent,
 	if (!terse && !do_json)
 		log_buf(out, "\nDisk stats (read/write):\n");
 
-	if (do_json)
+	if (do_json) {
 		json_object_add_disk_utils(parent, &disk_list);
-	else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
+	} else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
 		flist_for_each(entry, &disk_list) {
 			du = flist_entry(entry, struct disk_util, list);
 
@@ -1396,19 +1400,20 @@ static void show_ddir_status_terse(struct thread_stat *ts,
 	else
 		log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
 
-	if (ts->lat_percentiles)
+	if (ts->lat_percentiles) {
 		len = calc_clat_percentiles(ts->io_u_plat[FIO_LAT][ddir],
 					ts->lat_stat[ddir].samples,
 					ts->percentile_list, &ovals, &maxv,
 					&minv);
-	else if (ts->clat_percentiles)
+	} else if (ts->clat_percentiles) {
 		len = calc_clat_percentiles(ts->io_u_plat[FIO_CLAT][ddir],
 					ts->clat_stat[ddir].samples,
 					ts->percentile_list, &ovals, &maxv,
 					&minv);
-	else
+	} else {
 		len = 0;
-	
+	}
+
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
 		if (i >= len) {
 			log_buf(out, ";0%%=0");
@@ -1435,8 +1440,9 @@ static void show_ddir_status_terse(struct thread_stat *ts,
 		}
 
 		log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
-	} else
+	} else {
 		log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0);
+	}
 
 	if (ver == 5) {
 		if (bw_stat)
@@ -1458,15 +1464,19 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
 {
 	struct thread_stat *ts_lcl;
 
-	/* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+	/*
+	 * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
+	 * Trims (ddir = 2)
+	 */
 	ts_lcl = malloc(sizeof(struct thread_stat));
 	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-	ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
+	/* calculate mixed stats  */
+	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
 	init_thread_stat_min_vals(ts_lcl);
 	ts_lcl->lat_percentiles = ts->lat_percentiles;
 	ts_lcl->clat_percentiles = ts->clat_percentiles;
 	ts_lcl->slat_percentiles = ts->slat_percentiles;
-	ts_lcl->percentile_precision = ts->percentile_precision;		
+	ts_lcl->percentile_precision = ts->percentile_precision;
 	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
 	
 	sum_thread_stats(ts_lcl, ts, 1);
@@ -1476,8 +1486,10 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
 	free(ts_lcl);
 }
 
-static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles,
-		struct io_stat *lat_stat, uint64_t *io_u_plat)
+static struct json_object *add_ddir_lat_json(struct thread_stat *ts,
+					     uint32_t percentiles,
+					     struct io_stat *lat_stat,
+					     uint64_t *io_u_plat)
 {
 	char buf[120];
 	double mean, dev;
@@ -1650,15 +1662,19 @@ static void add_mixed_ddir_status_json(struct thread_stat *ts,
 {
 	struct thread_stat *ts_lcl;
 
-	/* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+	/*
+	 * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
+	 * Trims (ddir = 2)
+	 */
 	ts_lcl = malloc(sizeof(struct thread_stat));
 	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-	ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
+	/* calculate mixed stats  */
+	ts_lcl->unified_rw_rep = UNIFIED_MIXED;
 	init_thread_stat_min_vals(ts_lcl);
 	ts_lcl->lat_percentiles = ts->lat_percentiles;
 	ts_lcl->clat_percentiles = ts->clat_percentiles;
 	ts_lcl->slat_percentiles = ts->slat_percentiles;
-	ts_lcl->percentile_precision = ts->percentile_precision;		
+	ts_lcl->percentile_precision = ts->percentile_precision;
 	memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
 
 	sum_thread_stats(ts_lcl, ts, 1);
@@ -2133,7 +2149,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 	sum_stat(&dst->sync_stat, &src->sync_stat, first, false);
 
 	for (l = 0; l < DDIR_RWDIR_CNT; l++) {
-		if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
+		if (dst->unified_rw_rep != UNIFIED_MIXED) {
 			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
 			sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false);
 			sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false);
@@ -2188,7 +2204,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 		dst->io_u_lat_m[k] += src->io_u_lat_m[k];
 
 	for (k = 0; k < DDIR_RWDIR_CNT; k++) {
-		if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
+		if (dst->unified_rw_rep != UNIFIED_MIXED) {
 			dst->total_io_u[k] += src->total_io_u[k];
 			dst->short_io_u[k] += src->short_io_u[k];
 			dst->drop_io_u[k] += src->drop_io_u[k];
@@ -2204,7 +2220,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 	for (k = 0; k < FIO_LAT_CNT; k++)
 		for (l = 0; l < DDIR_RWDIR_CNT; l++)
 			for (m = 0; m < FIO_IO_U_PLAT_NR; m++)
-				if (!(dst->unified_rw_rep == UNIFIED_MIXED))
+				if (dst->unified_rw_rep != UNIFIED_MIXED)
 					dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m];
 				else
 					dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m];
@@ -2214,7 +2230,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 
 	for (k = 0; k < DDIR_RWDIR_CNT; k++) {
 		for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
-			if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
+			if (dst->unified_rw_rep != UNIFIED_MIXED) {
 				dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m];
 				dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m];
 			} else {

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-16 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-16 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 9ffe433d729101a34d9709030d7d4dd2444347ef:

  t/zbd: Avoid inappropriate blkzone command call in zone_cap_bs (2021-12-14 06:48:14 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to e86afa536b175a90546e20d7d19f2418ee1bca78:

  stat: sum sync_stat before reassigning bool first (2021-12-15 08:45:32 -0700)

----------------------------------------------------------------
Niklas Cassel (1):
      stat: sum sync_stat before reassigning bool first

 stat.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/stat.c b/stat.c
index 7e84058d..ec44c79e 100644
--- a/stat.c
+++ b/stat.c
@@ -2130,6 +2130,8 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 {
 	int k, l, m;
 
+	sum_stat(&dst->sync_stat, &src->sync_stat, first, false);
+
 	for (l = 0; l < DDIR_RWDIR_CNT; l++) {
 		if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
 			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
@@ -2166,7 +2168,6 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 		}
 	}
 
-	sum_stat(&dst->sync_stat, &src->sync_stat, first, false);
 	dst->usr_time += src->usr_time;
 	dst->sys_time += src->sys_time;
 	dst->ctx += src->ctx;

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-15 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-15 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 2ea393df3256e44398558c264f035f8db7656b08:

  Merge branch 'github-actions' of https://github.com/sitsofe/fio (2021-12-10 11:08:26 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 9ffe433d729101a34d9709030d7d4dd2444347ef:

  t/zbd: Avoid inappropriate blkzone command call in zone_cap_bs (2021-12-14 06:48:14 -0700)

----------------------------------------------------------------
Damien Le Moal (11):
      fio: Improve documentation of ignore_zone_limits option
      zbd: define local functions as static
      zbd: move and cleanup code
      zbd: remove is_zone_open() helper
      zbd: introduce zbd_zone_align_file_sizes() helper
      zbd: fix code style issues
      zbd: simplify zbd_close_zone()
      zbd: simplify zbd_open_zone()
      zbd: rename zbd_zone_idx() and zbd_zone_nr()
      zbd: rename get_zone()
      zbd: introduce zbd_offset_to_zone() helper

Niklas Cassel (2):
      ci: temporarily remove linux-i686-gcc build
      ci: use macos 11 in virtual environment

Shin'ichiro Kawasaki (1):
      t/zbd: Avoid inappropriate blkzone command call in zone_cap_bs

 .github/workflows/ci.yml |   6 +-
 HOWTO                    |   6 +
 fio.1                    |   6 +-
 t/zbd/functions          |   6 +-
 zbd.c                    | 963 +++++++++++++++++++++++++----------------------
 5 files changed, 532 insertions(+), 455 deletions(-)

---

Diff of recent changes:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a766cfa8..8167e3d1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,6 @@ jobs:
         - linux-gcc
         - linux-clang
         - macos
-        - linux-i686-gcc
         include:
         - build: linux-gcc
           os: ubuntu-20.04
@@ -23,10 +22,7 @@ jobs:
           os: ubuntu-20.04
           cc: clang
         - build: macos
-          os: macos-10.15
-        - build: linux-i686-gcc
-          os: ubuntu-20.04
-          arch: i686
+          os: macos-11
 
     env:
       CI_TARGET_ARCH: ${{ matrix.arch }}
diff --git a/HOWTO b/HOWTO
index 8c9e4135..2956e50d 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1063,6 +1063,12 @@ Target file/device
 	Limit on the number of simultaneously opened zones per single
 	thread/process.
 
+.. option:: ignore_zone_limits=bool
+	If this option is used, fio will ignore the maximum number of open
+	zones limit of the zoned block device in use, thus allowing the
+	option :option:`max_open_zones` value to be larger than the device
+	reported limit. Default: false.
+
 .. option:: zone_reset_threshold=float
 
 	A number between zero and one that indicates the ratio of logical
diff --git a/fio.1 b/fio.1
index a3ebb67d..e0458c22 100644
--- a/fio.1
+++ b/fio.1
@@ -838,9 +838,9 @@ threads/processes.
 Limit on the number of simultaneously opened zones per single thread/process.
 .TP
 .BI ignore_zone_limits \fR=\fPbool
-If this isn't set, fio will query the max open zones limit from the zoned block
-device, and exit if the specified \fBmax_open_zones\fR value is larger than the
-limit reported by the device. Default: false.
+If this option is used, fio will ignore the maximum number of open zones limit
+of the zoned block device in use, thus allowing the option \fBmax_open_zones\fR
+value to be larger than the device reported limit. Default: false.
 .TP
 .BI zone_reset_threshold \fR=\fPfloat
 A number between zero and one that indicates the ratio of logical blocks with
diff --git a/t/zbd/functions b/t/zbd/functions
index e4e248b9..7cff18fd 100644
--- a/t/zbd/functions
+++ b/t/zbd/functions
@@ -72,9 +72,11 @@ zone_cap_bs() {
 	local sed_str='s/.*len \([0-9A-Za-z]*\), cap \([0-9A-Za-z]*\).*/\1 \2/p'
 	local cap bs="$zone_size"
 
-	# When blkzone is not available or blkzone does not report capacity,
+	# When blkzone command is neither available nor relevant to the
+	# test device, or when blkzone command does not report capacity,
 	# assume that zone capacity is same as zone size for all zones.
-	if [ -z "${blkzone}" ] || ! blkzone_reports_capacity "${dev}"; then
+	if [ -z "${blkzone}" ] || [ -z "$is_zbd" ] || [ -c "$dev" ] ||
+		   ! blkzone_reports_capacity "${dev}"; then
 		echo "$zone_size"
 		return
 	fi
diff --git a/zbd.c b/zbd.c
index c18998c4..b1fd6b4b 100644
--- a/zbd.c
+++ b/zbd.c
@@ -22,13 +22,126 @@
 #include "pshared.h"
 #include "zbd.h"
 
+static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
+{
+	return (uint64_t)(offset - f->file_offset) < f->io_size;
+}
+
+static inline unsigned int zbd_zone_idx(const struct fio_file *f,
+					struct fio_zone_info *zone)
+{
+	return zone - f->zbd_info->zone_info;
+}
+
+/**
+ * zbd_offset_to_zone_idx - convert an offset into a zone number
+ * @f: file pointer.
+ * @offset: offset in bytes. If this offset is in the first zone_size bytes
+ *	    past the disk size then the index of the sentinel is returned.
+ */
+static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f,
+					   uint64_t offset)
+{
+	uint32_t zone_idx;
+
+	if (f->zbd_info->zone_size_log2 > 0)
+		zone_idx = offset >> f->zbd_info->zone_size_log2;
+	else
+		zone_idx = offset / f->zbd_info->zone_size;
+
+	return min(zone_idx, f->zbd_info->nr_zones);
+}
+
+/**
+ * zbd_zone_end - Return zone end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
+{
+	return (z+1)->start;
+}
+
+/**
+ * zbd_zone_capacity_end - Return zone capacity limit end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+{
+	return z->start + z->capacity;
+}
+
+/**
+ * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
+ * @f: file pointer.
+ * @z: zone info pointer.
+ * @required: minimum number of bytes that must remain in a zone.
+ *
+ * The caller must hold z->mutex.
+ */
+static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
+			  uint64_t required)
+{
+	assert((required & 511) == 0);
+
+	return z->has_wp &&
+		z->wp + required > zbd_zone_capacity_end(z);
+}
+
+static void zone_lock(struct thread_data *td, const struct fio_file *f,
+		      struct fio_zone_info *z)
+{
+	struct zoned_block_device_info *zbd = f->zbd_info;
+	uint32_t nz = z - zbd->zone_info;
+
+	/* A thread should never lock zones outside its working area. */
+	assert(f->min_zone <= nz && nz < f->max_zone);
+
+	assert(z->has_wp);
+
+	/*
+	 * Lock the io_u target zone. The zone will be unlocked if io_u offset
+	 * is changed or when io_u completes and zbd_put_io() executed.
+	 * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
+	 * other waiting for zone locks when building an io_u batch, first
+	 * only trylock the zone. If the zone is already locked by another job,
+	 * process the currently queued I/Os so that I/O progress is made and
+	 * zones unlocked.
+	 */
+	if (pthread_mutex_trylock(&z->mutex) != 0) {
+		if (!td_ioengine_flagged(td, FIO_SYNCIO))
+			io_u_quiesce(td);
+		pthread_mutex_lock(&z->mutex);
+	}
+}
+
+static inline void zone_unlock(struct fio_zone_info *z)
+{
+	int ret;
+
+	assert(z->has_wp);
+	ret = pthread_mutex_unlock(&z->mutex);
+	assert(!ret);
+}
+
+static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f,
+						 unsigned int zone_idx)
+{
+	return &f->zbd_info->zone_info[zone_idx];
+}
+
+static inline struct fio_zone_info *
+zbd_offset_to_zone(const struct fio_file *f,  uint64_t offset)
+{
+	return zbd_get_zone(f, zbd_offset_to_zone_idx(f, offset));
+}
+
 /**
  * zbd_get_zoned_model - Get a device zoned model
  * @td: FIO thread data
  * @f: FIO file for which to get model information
  */
-int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
-			enum zbd_zoned_model *model)
+static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
+			       enum zbd_zoned_model *model)
 {
 	int ret;
 
@@ -71,9 +184,9 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
  * upon failure. If the zone report is empty, always assume an error (device
  * problem) and return -EIO.
  */
-int zbd_report_zones(struct thread_data *td, struct fio_file *f,
-		     uint64_t offset, struct zbd_zone *zones,
-		     unsigned int nr_zones)
+static int zbd_report_zones(struct thread_data *td, struct fio_file *f,
+			    uint64_t offset, struct zbd_zone *zones,
+			    unsigned int nr_zones)
 {
 	int ret;
 
@@ -105,8 +218,8 @@ int zbd_report_zones(struct thread_data *td, struct fio_file *f,
  * Reset the write pointer of all zones in the range @offset...@offset+@length.
  * Returns 0 upon success and a negative error code upon failure.
  */
-int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
-		 uint64_t offset, uint64_t length)
+static int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
+			uint64_t offset, uint64_t length)
 {
 	int ret;
 
@@ -124,131 +237,233 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
 }
 
 /**
- * zbd_get_max_open_zones - Get the maximum number of open zones
- * @td: FIO thread data
- * @f: FIO file for which to get max open zones
- * @max_open_zones: Upon success, result will be stored here.
- *
- * A @max_open_zones value set to zero means no limit.
+ * zbd_reset_zone - reset the write pointer of a single zone
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to reset.
  *
  * Returns 0 upon success and a negative error code upon failure.
+ *
+ * The caller must hold z->mutex.
  */
-int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
-			   unsigned int *max_open_zones)
+static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
+			  struct fio_zone_info *z)
 {
-	int ret;
+	uint64_t offset = z->start;
+	uint64_t length = (z+1)->start - offset;
+	uint64_t data_in_zone = z->wp - z->start;
+	int ret = 0;
 
-	if (td->io_ops && td->io_ops->get_max_open_zones)
-		ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
-	else
-		ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
-	if (ret < 0) {
-		td_verror(td, errno, "get max open zones failed");
-		log_err("%s: get max open zones failed (%d).\n",
-			f->file_name, errno);
+	if (!data_in_zone)
+		return 0;
+
+	assert(is_valid_offset(f, offset + length - 1));
+
+	dprint(FD_ZBD, "%s: resetting wp of zone %u.\n",
+	       f->file_name, zbd_zone_idx(f, z));
+
+	switch (f->zbd_info->model) {
+	case ZBD_HOST_AWARE:
+	case ZBD_HOST_MANAGED:
+		ret = zbd_reset_wp(td, f, offset, length);
+		if (ret < 0)
+			return ret;
+		break;
+	default:
+		break;
 	}
 
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	f->zbd_info->sectors_with_data -= data_in_zone;
+	f->zbd_info->wp_sectors_with_data -= data_in_zone;
+	pthread_mutex_unlock(&f->zbd_info->mutex);
+
+	z->wp = z->start;
+	z->verify_block = 0;
+
+	td->ts.nr_zone_resets++;
+
 	return ret;
 }
 
 /**
- * zbd_zone_idx - convert an offset into a zone number
- * @f: file pointer.
- * @offset: offset in bytes. If this offset is in the first zone_size bytes
- *	    past the disk size then the index of the sentinel is returned.
+ * zbd_close_zone - Remove a zone from the open zones array.
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to remove.
+ *
+ * The caller must hold f->zbd_info->mutex.
  */
-static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset)
+static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
+			   struct fio_zone_info *z)
 {
-	uint32_t zone_idx;
+	uint32_t ozi;
 
-	if (f->zbd_info->zone_size_log2 > 0)
-		zone_idx = offset >> f->zbd_info->zone_size_log2;
-	else
-		zone_idx = offset / f->zbd_info->zone_size;
+	if (!z->open)
+		return;
 
-	return min(zone_idx, f->zbd_info->nr_zones);
-}
+	for (ozi = 0; ozi < f->zbd_info->num_open_zones; ozi++) {
+		if (zbd_get_zone(f, f->zbd_info->open_zones[ozi]) == z)
+			break;
+	}
+	if (ozi == f->zbd_info->num_open_zones)
+		return;
 
-/**
- * zbd_zone_end - Return zone end location
- * @z: zone info pointer.
- */
-static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
-{
-	return (z+1)->start;
+	dprint(FD_ZBD, "%s: closing zone %u\n",
+	       f->file_name, zbd_zone_idx(f, z));
+
+	memmove(f->zbd_info->open_zones + ozi,
+		f->zbd_info->open_zones + ozi + 1,
+		(ZBD_MAX_OPEN_ZONES - (ozi + 1)) *
+		sizeof(f->zbd_info->open_zones[0]));
+
+	f->zbd_info->num_open_zones--;
+	td->num_open_zones--;
+	z->open = 0;
 }
 
 /**
- * zbd_zone_capacity_end - Return zone capacity limit end location
- * @z: zone info pointer.
+ * zbd_reset_zones - Reset a range of zones.
+ * @td: fio thread data.
+ * @f: fio file for which to reset zones
+ * @zb: first zone to reset.
+ * @ze: first zone not to reset.
+ *
+ * Returns 0 upon success and 1 upon failure.
  */
-static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
+			   struct fio_zone_info *const zb,
+			   struct fio_zone_info *const ze)
 {
-	return z->start + z->capacity;
+	struct fio_zone_info *z;
+	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+	int res = 0;
+
+	assert(min_bs);
+
+	dprint(FD_ZBD, "%s: examining zones %u .. %u\n",
+	       f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze));
+
+	for (z = zb; z < ze; z++) {
+		if (!z->has_wp)
+			continue;
+
+		zone_lock(td, f, z);
+		pthread_mutex_lock(&f->zbd_info->mutex);
+		zbd_close_zone(td, f, z);
+		pthread_mutex_unlock(&f->zbd_info->mutex);
+
+		if (z->wp != z->start) {
+			dprint(FD_ZBD, "%s: resetting zone %u\n",
+			       f->file_name, zbd_zone_idx(f, z));
+			if (zbd_reset_zone(td, f, z) < 0)
+				res = 1;
+		}
+
+		zone_unlock(z);
+	}
+
+	return res;
 }
 
 /**
- * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
- * @f: file pointer.
- * @z: zone info pointer.
- * @required: minimum number of bytes that must remain in a zone.
+ * zbd_get_max_open_zones - Get the maximum number of open zones
+ * @td: FIO thread data
+ * @f: FIO file for which to get max open zones
+ * @max_open_zones: Upon success, result will be stored here.
  *
- * The caller must hold z->mutex.
+ * A @max_open_zones value set to zero means no limit.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
  */
-static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
-			  uint64_t required)
+static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+				  unsigned int *max_open_zones)
 {
-	assert((required & 511) == 0);
+	int ret;
 
-	return z->has_wp &&
-		z->wp + required > zbd_zone_capacity_end(z);
+	if (td->io_ops && td->io_ops->get_max_open_zones)
+		ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
+	else
+		ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
+	if (ret < 0) {
+		td_verror(td, errno, "get max open zones failed");
+		log_err("%s: get max open zones failed (%d).\n",
+			f->file_name, errno);
+	}
+
+	return ret;
 }
 
-static void zone_lock(struct thread_data *td, const struct fio_file *f,
-		      struct fio_zone_info *z)
+/**
+ * zbd_open_zone - Add a zone to the array of open zones.
+ * @td: fio thread data.
+ * @f: fio file that has the open zones to add.
+ * @z: Zone to add.
+ *
+ * Open a ZBD zone if it is not already open. Returns true if either the zone
+ * was already open or if the zone was successfully added to the array of open
+ * zones without exceeding the maximum number of open zones. Returns false if
+ * the zone was not already open and opening the zone would cause the zone limit
+ * to be exceeded.
+ */
+static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
+			  struct fio_zone_info *z)
 {
-	struct zoned_block_device_info *zbd = f->zbd_info;
-	uint32_t nz = z - zbd->zone_info;
+	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+	struct zoned_block_device_info *zbdi = f->zbd_info;
+	uint32_t zone_idx = zbd_zone_idx(f, z);
+	bool res = true;
 
-	/* A thread should never lock zones outside its working area. */
-	assert(f->min_zone <= nz && nz < f->max_zone);
+	if (z->cond == ZBD_ZONE_COND_OFFLINE)
+		return false;
 
-	assert(z->has_wp);
+	/*
+	 * Skip full zones with data verification enabled because resetting a
+	 * zone causes data loss and hence causes verification to fail.
+	 */
+	if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
+		return false;
 
 	/*
-	 * Lock the io_u target zone. The zone will be unlocked if io_u offset
-	 * is changed or when io_u completes and zbd_put_io() executed.
-	 * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
-	 * other waiting for zone locks when building an io_u batch, first
-	 * only trylock the zone. If the zone is already locked by another job,
-	 * process the currently queued I/Os so that I/O progress is made and
-	 * zones unlocked.
+	 * zbdi->max_open_zones == 0 means that there is no limit on the maximum
+	 * number of open zones. In this case, do not track open zones in
+	 * zbdi->open_zones array.
 	 */
-	if (pthread_mutex_trylock(&z->mutex) != 0) {
-		if (!td_ioengine_flagged(td, FIO_SYNCIO))
-			io_u_quiesce(td);
-		pthread_mutex_lock(&z->mutex);
+	if (!zbdi->max_open_zones)
+		return true;
+
+	pthread_mutex_lock(&zbdi->mutex);
+
+	if (z->open) {
+		/*
+		 * If the zone is going to be completely filled by writes
+		 * already in-flight, handle it as a full zone instead of an
+		 * open zone.
+		 */
+		if (z->wp >= zbd_zone_capacity_end(z))
+			res = false;
+		goto out;
 	}
-}
 
-static inline void zone_unlock(struct fio_zone_info *z)
-{
-	int ret;
+	res = false;
+	/* Zero means no limit */
+	if (td->o.job_max_open_zones > 0 &&
+	    td->num_open_zones >= td->o.job_max_open_zones)
+		goto out;
+	if (zbdi->num_open_zones >= zbdi->max_open_zones)
+		goto out;
 
-	assert(z->has_wp);
-	ret = pthread_mutex_unlock(&z->mutex);
-	assert(!ret);
-}
+	dprint(FD_ZBD, "%s: opening zone %u\n",
+	       f->file_name, zone_idx);
 
-static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
-{
-	return (uint64_t)(offset - f->file_offset) < f->io_size;
-}
+	zbdi->open_zones[zbdi->num_open_zones++] = zone_idx;
+	td->num_open_zones++;
+	z->open = 1;
+	res = true;
 
-static inline struct fio_zone_info *get_zone(const struct fio_file *f,
-					     unsigned int zone_nr)
-{
-	return &f->zbd_info->zone_info[zone_nr];
+out:
+	pthread_mutex_unlock(&zbdi->mutex);
+	return res;
 }
 
 /* Verify whether direct I/O is used for all host-managed zoned drives. */
@@ -277,15 +492,91 @@ static bool zbd_is_seq_job(struct fio_file *f)
 	uint32_t zone_idx, zone_idx_b, zone_idx_e;
 
 	assert(f->zbd_info);
+
 	if (f->io_size == 0)
 		return false;
-	zone_idx_b = zbd_zone_idx(f, f->file_offset);
-	zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1);
+
+	zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset);
+	zone_idx_e =
+		zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1);
 	for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
-		if (get_zone(f, zone_idx)->has_wp)
+		if (zbd_get_zone(f, zone_idx)->has_wp)
 			return true;
 
-	return false;
+	return false;
+}
+
+/*
+ * Verify whether the file offset and size parameters are aligned with zone
+ * boundaries. If the file offset is not aligned, align it down to the start of
+ * the zone containing the start offset and align up the file io_size parameter.
+ */
+static bool zbd_zone_align_file_sizes(struct thread_data *td,
+				      struct fio_file *f)
+{
+	const struct fio_zone_info *z;
+	uint64_t new_offset, new_end;
+
+	if (!f->zbd_info)
+		return true;
+	if (f->file_offset >= f->real_file_size)
+		return true;
+	if (!zbd_is_seq_job(f))
+		return true;
+
+	if (!td->o.zone_size) {
+		td->o.zone_size = f->zbd_info->zone_size;
+		if (!td->o.zone_size) {
+			log_err("%s: invalid 0 zone size\n",
+				f->file_name);
+			return false;
+		}
+	} else if (td->o.zone_size != f->zbd_info->zone_size) {
+		log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n",
+			f->file_name, td->o.zone_size,
+			f->zbd_info->zone_size);
+		return false;
+	}
+
+	if (td->o.zone_skip % td->o.zone_size) {
+		log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
+			f->file_name, td->o.zone_skip,
+			td->o.zone_size);
+		return false;
+	}
+
+	z = zbd_offset_to_zone(f, f->file_offset);
+	if ((f->file_offset != z->start) &&
+	    (td->o.td_ddir != TD_DDIR_READ)) {
+		new_offset = zbd_zone_end(z);
+		if (new_offset >= f->file_offset + f->io_size) {
+			log_info("%s: io_size must be at least one zone\n",
+				 f->file_name);
+			return false;
+		}
+		log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
+			 f->file_name, f->file_offset,
+			 new_offset);
+		f->io_size -= (new_offset - f->file_offset);
+		f->file_offset = new_offset;
+	}
+
+	z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
+	new_end = z->start;
+	if ((td->o.td_ddir != TD_DDIR_READ) &&
+	    (f->file_offset + f->io_size != new_end)) {
+		if (new_end <= f->file_offset) {
+			log_info("%s: io_size must be at least one zone\n",
+				 f->file_name);
+			return false;
+		}
+		log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
+			 f->file_name, f->io_size,
+			 new_end - f->file_offset);
+		f->io_size = new_end - f->file_offset;
+	}
+
+	return true;
 }
 
 /*
@@ -293,74 +584,14 @@ static bool zbd_is_seq_job(struct fio_file *f)
  */
 static bool zbd_verify_sizes(void)
 {
-	const struct fio_zone_info *z;
 	struct thread_data *td;
 	struct fio_file *f;
-	uint64_t new_offset, new_end;
-	uint32_t zone_idx;
 	int i, j;
 
 	for_each_td(td, i) {
 		for_each_file(td, f, j) {
-			if (!f->zbd_info)
-				continue;
-			if (f->file_offset >= f->real_file_size)
-				continue;
-			if (!zbd_is_seq_job(f))
-				continue;
-
-			if (!td->o.zone_size) {
-				td->o.zone_size = f->zbd_info->zone_size;
-				if (!td->o.zone_size) {
-					log_err("%s: invalid 0 zone size\n",
-						f->file_name);
-					return false;
-				}
-			} else if (td->o.zone_size != f->zbd_info->zone_size) {
-				log_err("%s: job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
-					f->file_name, td->o.zone_size,
-					f->zbd_info->zone_size);
-				return false;
-			}
-
-			if (td->o.zone_skip % td->o.zone_size) {
-				log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
-					f->file_name, td->o.zone_skip,
-					td->o.zone_size);
+			if (!zbd_zone_align_file_sizes(td, f))
 				return false;
-			}
-
-			zone_idx = zbd_zone_idx(f, f->file_offset);
-			z = get_zone(f, zone_idx);
-			if ((f->file_offset != z->start) &&
-			    (td->o.td_ddir != TD_DDIR_READ)) {
-				new_offset = zbd_zone_end(z);
-				if (new_offset >= f->file_offset + f->io_size) {
-					log_info("%s: io_size must be at least one zone\n",
-						 f->file_name);
-					return false;
-				}
-				log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
-					 f->file_name, f->file_offset,
-					 new_offset);
-				f->io_size -= (new_offset - f->file_offset);
-				f->file_offset = new_offset;
-			}
-			zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size);
-			z = get_zone(f, zone_idx);
-			new_end = z->start;
-			if ((td->o.td_ddir != TD_DDIR_READ) &&
-			    (f->file_offset + f->io_size != new_end)) {
-				if (new_end <= f->file_offset) {
-					log_info("%s: io_size must be at least one zone\n",
-						 f->file_name);
-					return false;
-				}
-				log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
-					 f->file_name, f->io_size,
-					 new_end - f->file_offset);
-				f->io_size = new_end - f->file_offset;
-			}
 		}
 	}
 
@@ -385,6 +616,7 @@ static bool zbd_verify_bs(void)
 
 			if (!f->zbd_info)
 				continue;
+
 			zone_size = f->zbd_info->zone_size;
 			if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
 				log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
@@ -529,8 +761,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 		goto out;
 	}
 
-	dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n", f->file_name,
-	       nr_zones, zone_size / 1024);
+	dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n",
+	       f->file_name, nr_zones, zone_size / 1024);
 
 	zbd_info = scalloc(1, sizeof(*zbd_info) +
 			   (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
@@ -546,6 +778,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 						     PTHREAD_MUTEX_RECURSIVE);
 			p->start = z->start;
 			p->capacity = z->capacity;
+
 			switch (z->cond) {
 			case ZBD_ZONE_COND_NOT_WP:
 			case ZBD_ZONE_COND_FULL:
@@ -579,6 +812,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 		offset = z->start + z->len;
 		if (j >= nr_zones)
 			break;
+
 		nrz = zbd_report_zones(td, f, offset, zones,
 				       min((uint32_t)(nr_zones - j),
 					   ZBD_REPORT_MAX_ZONES));
@@ -646,7 +880,8 @@ out:
 	/* Ensure that the limit is not larger than FIO's internal limit */
 	if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
 		td_verror(td, EINVAL, "'max_open_zones' value is too large");
-		log_err("'max_open_zones' value is larger than %u\n", ZBD_MAX_OPEN_ZONES);
+		log_err("'max_open_zones' value is larger than %u\n",
+			ZBD_MAX_OPEN_ZONES);
 		return -EINVAL;
 	}
 
@@ -748,14 +983,10 @@ static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
 	ret = zbd_create_zone_info(td, file);
 	if (ret < 0)
 		td_verror(td, -ret, "zbd_create_zone_info() failed");
+
 	return ret;
 }
 
-static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
-			  uint32_t zone_idx);
-static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
-			  struct fio_zone_info *z);
-
 int zbd_init_files(struct thread_data *td)
 {
 	struct fio_file *f;
@@ -765,6 +996,7 @@ int zbd_init_files(struct thread_data *td)
 		if (zbd_init_zone_info(td, f))
 			return 1;
 	}
+
 	return 0;
 }
 
@@ -775,27 +1007,24 @@ void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
 
 	for_each_file(td, f, i) {
 		struct zoned_block_device_info *zbd = f->zbd_info;
-		// zonemode=strided doesn't get per-file zone size.
-		uint64_t zone_size = zbd ? zbd->zone_size : td->o.zone_size;
+		uint64_t zone_size;
 
+		/* zonemode=strided doesn't get per-file zone size. */
+		zone_size = zbd ? zbd->zone_size : td->o.zone_size;
 		if (zone_size == 0)
 			continue;
 
-		if (td->o.size_nz > 0) {
+		if (td->o.size_nz > 0)
 			td->o.size = td->o.size_nz * zone_size;
-		}
-		if (td->o.io_size_nz > 0) {
+		if (td->o.io_size_nz > 0)
 			td->o.io_size = td->o.io_size_nz * zone_size;
-		}
-		if (td->o.start_offset_nz > 0) {
+		if (td->o.start_offset_nz > 0)
 			td->o.start_offset = td->o.start_offset_nz * zone_size;
-		}
-		if (td->o.offset_increment_nz > 0) {
-			td->o.offset_increment = td->o.offset_increment_nz * zone_size;
-		}
-		if (td->o.zone_skip_nz > 0) {
+		if (td->o.offset_increment_nz > 0)
+			td->o.offset_increment =
+				td->o.offset_increment_nz * zone_size;
+		if (td->o.zone_skip_nz > 0)
 			td->o.zone_skip = td->o.zone_skip_nz * zone_size;
-		}
 	}
 }
 
@@ -822,8 +1051,9 @@ int zbd_setup_files(struct thread_data *td)
 
 		assert(zbd);
 
-		f->min_zone = zbd_zone_idx(f, f->file_offset);
-		f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
+		f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset);
+		f->max_zone =
+			zbd_offset_to_zone_idx(f, f->file_offset + f->io_size);
 
 		/*
 		 * When all zones in the I/O range are conventional, io_size
@@ -863,7 +1093,7 @@ int zbd_setup_files(struct thread_data *td)
 			if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
 			    z->cond != ZBD_ZONE_COND_EXP_OPEN)
 				continue;
-			if (zbd_open_zone(td, f, zi))
+			if (zbd_open_zone(td, f, z))
 				continue;
 			/*
 			 * If the number of open zones exceeds specified limits,
@@ -879,123 +1109,6 @@ int zbd_setup_files(struct thread_data *td)
 	return 0;
 }
 
-static inline unsigned int zbd_zone_nr(const struct fio_file *f,
-				       struct fio_zone_info *zone)
-{
-	return zone - f->zbd_info->zone_info;
-}
-
-/**
- * zbd_reset_zone - reset the write pointer of a single zone
- * @td: FIO thread data.
- * @f: FIO file associated with the disk for which to reset a write pointer.
- * @z: Zone to reset.
- *
- * Returns 0 upon success and a negative error code upon failure.
- *
- * The caller must hold z->mutex.
- */
-static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
-			  struct fio_zone_info *z)
-{
-	uint64_t offset = z->start;
-	uint64_t length = (z+1)->start - offset;
-	uint64_t data_in_zone = z->wp - z->start;
-	int ret = 0;
-
-	if (!data_in_zone)
-		return 0;
-
-	assert(is_valid_offset(f, offset + length - 1));
-
-	dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name,
-		zbd_zone_nr(f, z));
-	switch (f->zbd_info->model) {
-	case ZBD_HOST_AWARE:
-	case ZBD_HOST_MANAGED:
-		ret = zbd_reset_wp(td, f, offset, length);
-		if (ret < 0)
-			return ret;
-		break;
-	default:
-		break;
-	}
-
-	pthread_mutex_lock(&f->zbd_info->mutex);
-	f->zbd_info->sectors_with_data -= data_in_zone;
-	f->zbd_info->wp_sectors_with_data -= data_in_zone;
-	pthread_mutex_unlock(&f->zbd_info->mutex);
-	z->wp = z->start;
-	z->verify_block = 0;
-
-	td->ts.nr_zone_resets++;
-
-	return ret;
-}
-
-/* The caller must hold f->zbd_info->mutex */
-static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
-			   unsigned int zone_idx)
-{
-	uint32_t open_zone_idx = 0;
-
-	for (; open_zone_idx < f->zbd_info->num_open_zones; open_zone_idx++) {
-		if (f->zbd_info->open_zones[open_zone_idx] == zone_idx)
-			break;
-	}
-	if (open_zone_idx == f->zbd_info->num_open_zones)
-		return;
-
-	dprint(FD_ZBD, "%s: closing zone %d\n", f->file_name, zone_idx);
-	memmove(f->zbd_info->open_zones + open_zone_idx,
-		f->zbd_info->open_zones + open_zone_idx + 1,
-		(ZBD_MAX_OPEN_ZONES - (open_zone_idx + 1)) *
-		sizeof(f->zbd_info->open_zones[0]));
-	f->zbd_info->num_open_zones--;
-	td->num_open_zones--;
-	get_zone(f, zone_idx)->open = 0;
-}
-
-/*
- * Reset a range of zones. Returns 0 upon success and 1 upon failure.
- * @td: fio thread data.
- * @f: fio file for which to reset zones
- * @zb: first zone to reset.
- * @ze: first zone not to reset.
- */
-static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
-			   struct fio_zone_info *const zb,
-			   struct fio_zone_info *const ze)
-{
-	struct fio_zone_info *z;
-	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
-	int res = 0;
-
-	assert(min_bs);
-
-	dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name,
-		zbd_zone_nr(f, zb), zbd_zone_nr(f, ze));
-	for (z = zb; z < ze; z++) {
-		uint32_t nz = zbd_zone_nr(f, z);
-
-		if (!z->has_wp)
-			continue;
-		zone_lock(td, f, z);
-		pthread_mutex_lock(&f->zbd_info->mutex);
-		zbd_close_zone(td, f, nz);
-		pthread_mutex_unlock(&f->zbd_info->mutex);
-		if (z->wp != z->start) {
-			dprint(FD_ZBD, "%s: resetting zone %u\n",
-			       f->file_name, zbd_zone_nr(f, z));
-			if (zbd_reset_zone(td, f, z) < 0)
-				res = 1;
-		}
-		zone_unlock(z);
-	}
-
-	return res;
-}
-
 /*
  * Reset zbd_info.write_cnt, the counter that counts down towards the next
  * zone reset.
@@ -1046,8 +1159,8 @@ static uint64_t zbd_process_swd(struct thread_data *td,
 	uint64_t swd = 0;
 	uint64_t wp_swd = 0;
 
-	zb = get_zone(f, f->min_zone);
-	ze = get_zone(f, f->max_zone);
+	zb = zbd_get_zone(f, f->min_zone);
+	ze = zbd_get_zone(f, f->max_zone);
 	for (z = zb; z < ze; z++) {
 		if (z->has_wp) {
 			zone_lock(td, f, z);
@@ -1055,6 +1168,7 @@ static uint64_t zbd_process_swd(struct thread_data *td,
 		}
 		swd += z->wp - z->start;
 	}
+
 	pthread_mutex_lock(&f->zbd_info->mutex);
 	switch (a) {
 	case CHECK_SWD:
@@ -1067,6 +1181,7 @@ static uint64_t zbd_process_swd(struct thread_data *td,
 		break;
 	}
 	pthread_mutex_unlock(&f->zbd_info->mutex);
+
 	for (z = zb; z < ze; z++)
 		if (z->has_wp)
 			zone_unlock(z);
@@ -1097,11 +1212,13 @@ void zbd_file_reset(struct thread_data *td, struct fio_file *f)
 	if (!f->zbd_info || !td_write(td))
 		return;
 
-	zb = get_zone(f, f->min_zone);
-	ze = get_zone(f, f->max_zone);
+	zb = zbd_get_zone(f, f->min_zone);
+	ze = zbd_get_zone(f, f->max_zone);
 	swd = zbd_process_swd(td, f, SET_SWD);
-	dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name,
-	       swd);
+
+	dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n",
+	       __func__, f->file_name, swd);
+
 	/*
 	 * If data verification is enabled reset the affected zones before
 	 * writing any data to avoid that a zone reset has to be issued while
@@ -1112,92 +1229,12 @@ void zbd_file_reset(struct thread_data *td, struct fio_file *f)
 	zbd_reset_write_cnt(td, f);
 }
 
-/* The caller must hold f->zbd_info->mutex. */
-static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
-			 unsigned int zone_idx)
-{
-	struct zoned_block_device_info *zbdi = f->zbd_info;
-	int i;
-
-	/* This function should never be called when zbdi->max_open_zones == 0 */
-	assert(zbdi->max_open_zones);
-	assert(td->o.job_max_open_zones == 0 || td->num_open_zones <= td->o.job_max_open_zones);
-	assert(td->o.job_max_open_zones <= zbdi->max_open_zones);
-	assert(zbdi->num_open_zones <= zbdi->max_open_zones);
-
-	for (i = 0; i < zbdi->num_open_zones; i++)
-		if (zbdi->open_zones[i] == zone_idx)
-			return true;
-
-	return false;
-}
-
-/*
- * Open a ZBD zone if it was not yet open. Returns true if either the zone was
- * already open or if opening a new zone is allowed. Returns false if the zone
- * was not yet open and opening a new zone would cause the zone limit to be
- * exceeded.
- */
-static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
-			  uint32_t zone_idx)
-{
-	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
-	struct zoned_block_device_info *zbdi = f->zbd_info;
-	struct fio_zone_info *z = get_zone(f, zone_idx);
-	bool res = true;
-
-	if (z->cond == ZBD_ZONE_COND_OFFLINE)
-		return false;
-
-	/*
-	 * Skip full zones with data verification enabled because resetting a
-	 * zone causes data loss and hence causes verification to fail.
-	 */
-	if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
-		return false;
-
-	/*
-	 * zbdi->max_open_zones == 0 means that there is no limit on the maximum
-	 * number of open zones. In this case, do no track open zones in
-	 * zbdi->open_zones array.
-	 */
-	if (!zbdi->max_open_zones)
-		return true;
-
-	pthread_mutex_lock(&zbdi->mutex);
-	if (is_zone_open(td, f, zone_idx)) {
-		/*
-		 * If the zone is already open and going to be full by writes
-		 * in-flight, handle it as a full zone instead of an open zone.
-		 */
-		if (z->wp >= zbd_zone_capacity_end(z))
-			res = false;
-		goto out;
-	}
-	res = false;
-	/* Zero means no limit */
-	if (td->o.job_max_open_zones > 0 &&
-	    td->num_open_zones >= td->o.job_max_open_zones)
-		goto out;
-	if (zbdi->num_open_zones >= zbdi->max_open_zones)
-		goto out;
-	dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
-	zbdi->open_zones[zbdi->num_open_zones++] = zone_idx;
-	td->num_open_zones++;
-	z->open = 1;
-	res = true;
-
-out:
-	pthread_mutex_unlock(&zbdi->mutex);
-	return res;
-}
-
 /* Return random zone index for one of the open zones. */
 static uint32_t pick_random_zone_idx(const struct fio_file *f,
 				     const struct io_u *io_u)
 {
-	return (io_u->offset - f->file_offset) * f->zbd_info->num_open_zones /
-		f->io_size;
+	return (io_u->offset - f->file_offset) *
+		f->zbd_info->num_open_zones / f->io_size;
 }
 
 static bool any_io_in_flight(void)
@@ -1244,13 +1281,15 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 		 */
 		zone_idx = zbdi->open_zones[pick_random_zone_idx(f, io_u)];
 	} else {
-		zone_idx = zbd_zone_idx(f, io_u->offset);
+		zone_idx = zbd_offset_to_zone_idx(f, io_u->offset);
 	}
 	if (zone_idx < f->min_zone)
 		zone_idx = f->min_zone;
 	else if (zone_idx >= f->max_zone)
 		zone_idx = f->max_zone - 1;
-	dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
+
+	dprint(FD_ZBD,
+	       "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
 	       __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
 
 	/*
@@ -1262,13 +1301,16 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 	for (;;) {
 		uint32_t tmp_idx;
 
-		z = get_zone(f, zone_idx);
+		z = zbd_get_zone(f, zone_idx);
 		if (z->has_wp)
 			zone_lock(td, f, z);
+
 		pthread_mutex_lock(&zbdi->mutex);
+
 		if (z->has_wp) {
 			if (z->cond != ZBD_ZONE_COND_OFFLINE &&
-			    zbdi->max_open_zones == 0 && td->o.job_max_open_zones == 0)
+			    zbdi->max_open_zones == 0 &&
+			    td->o.job_max_open_zones == 0)
 				goto examine_zone;
 			if (zbdi->num_open_zones == 0) {
 				dprint(FD_ZBD, "%s(%s): no zones are open\n",
@@ -1278,14 +1320,15 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 		}
 
 		/*
-		 * List of opened zones is per-device, shared across all threads.
-		 * Start with quasi-random candidate zone.
-		 * Ignore zones which don't belong to thread's offset/size area.
+		 * List of opened zones is per-device, shared across all
+		 * threads. Start with quasi-random candidate zone. Ignore
+		 * zones which don't belong to thread's offset/size area.
 		 */
 		open_zone_idx = pick_random_zone_idx(f, io_u);
 		assert(!open_zone_idx ||
 		       open_zone_idx < zbdi->num_open_zones);
 		tmp_idx = open_zone_idx;
+
 		for (i = 0; i < zbdi->num_open_zones; i++) {
 			uint32_t tmpz;
 
@@ -1302,9 +1345,12 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 
 		dprint(FD_ZBD, "%s(%s): no candidate zone\n",
 			__func__, f->file_name);
+
 		pthread_mutex_unlock(&zbdi->mutex);
+
 		if (z->has_wp)
 			zone_unlock(z);
+
 		return NULL;
 
 found_candidate_zone:
@@ -1312,7 +1358,9 @@ found_candidate_zone:
 		if (new_zone_idx == zone_idx)
 			break;
 		zone_idx = new_zone_idx;
+
 		pthread_mutex_unlock(&zbdi->mutex);
+
 		if (z->has_wp)
 			zone_unlock(z);
 	}
@@ -1343,7 +1391,8 @@ open_other_zone:
 	 * zone close before opening a new zone.
 	 */
 	if (wait_zone_close) {
-		dprint(FD_ZBD, "%s(%s): quiesce to allow open zones to close\n",
+		dprint(FD_ZBD,
+		       "%s(%s): quiesce to allow open zones to close\n",
 		       __func__, f->file_name);
 		io_u_quiesce(td);
 	}
@@ -1358,7 +1407,7 @@ retry:
 		if (!is_valid_offset(f, z->start)) {
 			/* Wrap-around. */
 			zone_idx = f->min_zone;
-			z = get_zone(f, zone_idx);
+			z = zbd_get_zone(f, zone_idx);
 		}
 		assert(is_valid_offset(f, z->start));
 		if (!z->has_wp)
@@ -1366,7 +1415,7 @@ retry:
 		zone_lock(td, f, z);
 		if (z->open)
 			continue;
-		if (zbd_open_zone(td, f, zone_idx))
+		if (zbd_open_zone(td, f, z))
 			goto out;
 	}
 
@@ -1381,7 +1430,7 @@ retry:
 		pthread_mutex_unlock(&zbdi->mutex);
 		zone_unlock(z);
 
-		z = get_zone(f, zone_idx);
+		z = zbd_get_zone(f, zone_idx);
 
 		zone_lock(td, f, z);
 		if (z->wp + min_bs <= zbd_zone_capacity_end(z))
@@ -1396,7 +1445,8 @@ retry:
 	 */
 	in_flight = any_io_in_flight();
 	if (in_flight || should_retry) {
-		dprint(FD_ZBD, "%s(%s): wait zone close and retry open zones\n",
+		dprint(FD_ZBD,
+		       "%s(%s): wait zone close and retry open zones\n",
 		       __func__, f->file_name);
 		pthread_mutex_unlock(&zbdi->mutex);
 		zone_unlock(z);
@@ -1407,17 +1457,22 @@ retry:
 	}
 
 	pthread_mutex_unlock(&zbdi->mutex);
+
 	zone_unlock(z);
-	dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__,
-	       f->file_name);
+
+	dprint(FD_ZBD, "%s(%s): did not open another zone\n",
+	       __func__, f->file_name);
+
 	return NULL;
 
 out:
-	dprint(FD_ZBD, "%s(%s): returning zone %d\n", __func__, f->file_name,
-	       zone_idx);
+	dprint(FD_ZBD, "%s(%s): returning zone %d\n",
+	       __func__, f->file_name, zone_idx);
+
 	io_u->offset = z->start;
 	assert(z->has_wp);
 	assert(z->cond != ZBD_ZONE_COND_OFFLINE);
+
 	return z;
 }
 
@@ -1429,25 +1484,27 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
 	const struct fio_file *f = io_u->file;
 	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 
-	if (!zbd_open_zone(td, f, zbd_zone_nr(f, z))) {
+	if (!zbd_open_zone(td, f, z)) {
 		zone_unlock(z);
 		z = zbd_convert_to_open_zone(td, io_u);
 		assert(z);
 	}
 
 	if (z->verify_block * min_bs >= z->capacity) {
-		log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n", f->file_name, z->verify_block,
-			min_bs, z->capacity);
+		log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n",
+			f->file_name, z->verify_block, min_bs, z->capacity);
 		/*
 		 * If the assertion below fails during a test run, adding
 		 * "--experimental_verify=1" to the command line may help.
 		 */
 		assert(false);
 	}
+
 	io_u->offset = z->start + z->verify_block * min_bs;
 	if (io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
-		log_err("%s: %llu + %llu >= %"PRIu64"\n", f->file_name, io_u->offset,
-			io_u->buflen, zbd_zone_capacity_end(z));
+		log_err("%s: %llu + %llu >= %"PRIu64"\n",
+			f->file_name, io_u->offset, io_u->buflen,
+			zbd_zone_capacity_end(z));
 		assert(false);
 	}
 	z->verify_block += io_u->buflen / min_bs;
@@ -1468,7 +1525,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
 {
 	struct fio_file *f = io_u->file;
 	struct fio_zone_info *z1, *z2;
-	const struct fio_zone_info *const zf = get_zone(f, f->min_zone);
+	const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone);
 
 	/*
 	 * Skip to the next non-empty zone in case of sequential I/O and to
@@ -1485,6 +1542,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
 		} else if (!td_random(td)) {
 			break;
 		}
+
 		if (td_random(td) && z2 >= zf &&
 		    z2->cond != ZBD_ZONE_COND_OFFLINE) {
 			if (z2->has_wp)
@@ -1495,8 +1553,11 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
 				zone_unlock(z2);
 		}
 	}
-	dprint(FD_ZBD, "%s: no zone has %"PRIu64" bytes of readable data\n",
+
+	dprint(FD_ZBD,
+	       "%s: no zone has %"PRIu64" bytes of readable data\n",
 	       f->file_name, min_bytes);
+
 	return NULL;
 }
 
@@ -1517,7 +1578,7 @@ static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u,
 	if (io_u->ddir == DDIR_WRITE &&
 	    io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
 		pthread_mutex_lock(&f->zbd_info->mutex);
-		zbd_close_zone(td, f, zbd_zone_nr(f, z));
+		zbd_close_zone(td, f, z);
 		pthread_mutex_unlock(&f->zbd_info->mutex);
 	}
 }
@@ -1537,15 +1598,11 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
 	const struct fio_file *f = io_u->file;
 	struct zoned_block_device_info *zbd_info = f->zbd_info;
 	struct fio_zone_info *z;
-	uint32_t zone_idx;
 	uint64_t zone_end;
 
 	assert(zbd_info);
 
-	zone_idx = zbd_zone_idx(f, io_u->offset);
-	assert(zone_idx < zbd_info->nr_zones);
-	z = get_zone(f, zone_idx);
-
+	z = zbd_offset_to_zone(f, io_u->offset);
 	assert(z->has_wp);
 
 	if (!success)
@@ -1553,17 +1610,18 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
 
 	dprint(FD_ZBD,
 	       "%s: queued I/O (%lld, %llu) for zone %u\n",
-	       f->file_name, io_u->offset, io_u->buflen, zone_idx);
+	       f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
 
 	switch (io_u->ddir) {
 	case DDIR_WRITE:
 		zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
 			       zbd_zone_capacity_end(z));
-		pthread_mutex_lock(&zbd_info->mutex);
+
 		/*
 		 * z->wp > zone_end means that one or more I/O errors
 		 * have occurred.
 		 */
+		pthread_mutex_lock(&zbd_info->mutex);
 		if (z->wp <= zone_end) {
 			zbd_info->sectors_with_data += zone_end - z->wp;
 			zbd_info->wp_sectors_with_data += zone_end - z->wp;
@@ -1595,19 +1653,15 @@ static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
 	const struct fio_file *f = io_u->file;
 	struct zoned_block_device_info *zbd_info = f->zbd_info;
 	struct fio_zone_info *z;
-	uint32_t zone_idx;
 
 	assert(zbd_info);
 
-	zone_idx = zbd_zone_idx(f, io_u->offset);
-	assert(zone_idx < zbd_info->nr_zones);
-	z = get_zone(f, zone_idx);
-
+	z = zbd_offset_to_zone(f, io_u->offset);
 	assert(z->has_wp);
 
 	dprint(FD_ZBD,
 	       "%s: terminate I/O (%lld, %llu) for zone %u\n",
-	       f->file_name, io_u->offset, io_u->buflen, zone_idx);
+	       f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
 
 	zbd_end_zone_io(td, io_u, z);
 
@@ -1649,28 +1703,26 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
 	struct fio_file *f = io_u->file;
 	enum fio_ddir ddir = io_u->ddir;
 	struct fio_zone_info *z;
-	uint32_t zone_idx;
 
 	assert(td->o.zone_mode == ZONE_MODE_ZBD);
 	assert(td->o.zone_size);
 	assert(f->zbd_info);
 
-	zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
-	z = get_zone(f, zone_idx);
+	z = zbd_offset_to_zone(f, f->last_pos[ddir]);
 
 	/*
 	 * When the zone capacity is smaller than the zone size and the I/O is
 	 * sequential write, skip to zone end if the latest position is at the
 	 * zone capacity limit.
 	 */
-	if (z->capacity < f->zbd_info->zone_size && !td_random(td) &&
-	    ddir == DDIR_WRITE &&
+	if (z->capacity < f->zbd_info->zone_size &&
+	    !td_random(td) && ddir == DDIR_WRITE &&
 	    f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
 		dprint(FD_ZBD,
 		       "%s: Jump from zone capacity limit to zone end:"
 		       " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
 		       f->file_name, f->last_pos[ddir],
-		       zbd_zone_end(z), zone_idx, z->capacity);
+		       zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity);
 		td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
 		f->last_pos[ddir] = zbd_zone_end(z);
 	}
@@ -1751,7 +1803,6 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
 	struct zoned_block_device_info *zbdi = f->zbd_info;
-	uint32_t zone_idx_b;
 	struct fio_zone_info *zb, *zl, *orig_zb;
 	uint32_t orig_len = io_u->buflen;
 	uint64_t min_bs = td->o.min_bs[io_u->ddir];
@@ -1762,14 +1813,15 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 	assert(min_bs);
 	assert(is_valid_offset(f, io_u->offset));
 	assert(io_u->buflen);
-	zone_idx_b = zbd_zone_idx(f, io_u->offset);
-	zb = get_zone(f, zone_idx_b);
+
+	zb = zbd_offset_to_zone(f, io_u->offset);
 	orig_zb = zb;
 
 	if (!zb->has_wp) {
 		/* Accept non-write I/Os for conventional zones. */
 		if (io_u->ddir != DDIR_WRITE)
 			return io_u_accept;
+
 		/*
 		 * Make sure that writes to conventional zones
 		 * don't cross over to any sequential zones.
@@ -1783,12 +1835,16 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			       "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
 			       f->file_name, io_u->offset,
 			       min_bs, (zb + 1)->start);
-			io_u->offset = zb->start + (zb + 1)->start - io_u->offset;
-			new_len = min(io_u->buflen, (zb + 1)->start - io_u->offset);
+			io_u->offset =
+				zb->start + (zb + 1)->start - io_u->offset;
+			new_len = min(io_u->buflen,
+				      (zb + 1)->start - io_u->offset);
 		} else {
 			new_len = (zb + 1)->start - io_u->offset;
 		}
+
 		io_u->buflen = new_len / min_bs * min_bs;
+
 		return io_u_accept;
 	}
 
@@ -1810,6 +1866,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			zb = zbd_replay_write_order(td, io_u, zb);
 			goto accept;
 		}
+
 		/*
 		 * Check that there is enough written data in the zone to do an
 		 * I/O of at least min_bs B. If there isn't, find a new zone for
@@ -1820,7 +1877,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 		if (range < min_bs ||
 		    ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
 			zone_unlock(zb);
-			zl = get_zone(f, f->max_zone);
+			zl = zbd_get_zone(f, f->max_zone);
 			zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
 			if (!zb) {
 				dprint(FD_ZBD,
@@ -1839,6 +1896,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			if (!td_random(td))
 				io_u->offset = zb->start;
 		}
+
 		/*
 		 * Make sure the I/O is within the zone valid data range while
 		 * maximizing the I/O size and preserving randomness.
@@ -1849,12 +1907,14 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			io_u->offset = zb->start +
 				((io_u->offset - orig_zb->start) %
 				 (range - io_u->buflen)) / min_bs * min_bs;
+
 		/*
 		 * When zbd_find_zone() returns a conventional zone,
 		 * we can simply accept the new i/o offset here.
 		 */
 		if (!zb->has_wp)
 			return io_u_accept;
+
 		/*
 		 * Make sure the I/O does not cross over the zone wp position.
 		 */
@@ -1866,9 +1926,12 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			dprint(FD_IO, "Changed length from %u into %llu\n",
 			       orig_len, io_u->buflen);
 		}
+
 		assert(zb->start <= io_u->offset);
 		assert(io_u->offset + io_u->buflen <= zb->wp);
+
 		goto accept;
+
 	case DDIR_WRITE:
 		if (io_u->buflen > zbdi->zone_size) {
 			td_verror(td, EINVAL, "I/O buflen exceeds zone size");
@@ -1877,7 +1940,8 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			       f->file_name, io_u->buflen, zbdi->zone_size);
 			goto eof;
 		}
-		if (!zbd_open_zone(td, f, zone_idx_b)) {
+
+		if (!zbd_open_zone(td, f, zb)) {
 			zone_unlock(zb);
 			zb = zbd_convert_to_open_zone(td, io_u);
 			if (!zb) {
@@ -1886,14 +1950,14 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 				goto eof;
 			}
 		}
+
 		/* Check whether the zone reset threshold has been exceeded */
 		if (td->o.zrf.u.f) {
-			if (zbdi->wp_sectors_with_data >=
-			    f->io_size * td->o.zrt.u.f &&
-			    zbd_dec_and_reset_write_cnt(td, f)) {
+			if (zbdi->wp_sectors_with_data >= f->io_size * td->o.zrt.u.f &&
+			    zbd_dec_and_reset_write_cnt(td, f))
 				zb->reset_zone = 1;
-			}
 		}
+
 		/* Reset the zone pointer if necessary */
 		if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
 			assert(td->o.verify == VERIFY_NONE);
@@ -1916,6 +1980,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 				goto eof;
 			}
 		}
+
 		/* Make writes occur at the write pointer */
 		assert(!zbd_zone_full(f, zb, min_bs));
 		io_u->offset = zb->wp;
@@ -1925,6 +1990,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			       f->file_name, io_u->offset);
 			goto eof;
 		}
+
 		/*
 		 * Make sure that the buflen is a multiple of the minimal
 		 * block size. Give up if shrinking would make the request too
@@ -1941,10 +2007,13 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			       orig_len, io_u->buflen);
 			goto accept;
 		}
+
 		td_verror(td, EIO, "zone remainder too small");
 		log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
 			(zbd_zone_capacity_end(zb) - io_u->offset), min_bs);
+
 		goto eof;
+
 	case DDIR_TRIM:
 		/* Check random trim targets a non-empty zone */
 		if (!td_random(td) || zb->wp > zb->start)
@@ -1952,7 +2021,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 
 		/* Find out a non-empty zone to trim */
 		zone_unlock(zb);
-		zl = get_zone(f, f->max_zone);
+		zl = zbd_get_zone(f, f->max_zone);
 		zb = zbd_find_zone(td, io_u, 1, zb, zl);
 		if (zb) {
 			io_u->offset = zb->start;
@@ -1960,7 +2029,9 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 			       f->file_name, io_u->offset);
 			goto accept;
 		}
+
 		goto eof;
+
 	case DDIR_SYNC:
 		/* fall-through */
 	case DDIR_DATASYNC:
@@ -1978,19 +2049,23 @@ accept:
 	assert(zb->cond != ZBD_ZONE_COND_OFFLINE);
 	assert(!io_u->zbd_queue_io);
 	assert(!io_u->zbd_put_io);
+
 	io_u->zbd_queue_io = zbd_queue_io;
 	io_u->zbd_put_io = zbd_put_io;
+
 	/*
 	 * Since we return with the zone lock still held,
 	 * add an annotation to let Coverity know that it
 	 * is intentional.
 	 */
 	/* coverity[missing_unlock] */
+
 	return io_u_accept;
 
 eof:
 	if (zb && zb->has_wp)
 		zone_unlock(zb);
+
 	return io_u_eof;
 }
 
@@ -2018,17 +2093,15 @@ int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
 	struct fio_zone_info *z;
-	uint32_t zone_idx;
 	int ret;
 
-	zone_idx = zbd_zone_idx(f, io_u->offset);
-	z = get_zone(f, zone_idx);
-
+	z = zbd_offset_to_zone(f, io_u->offset);
 	if (!z->has_wp)
 		return 0;
 
 	if (io_u->offset != z->start) {
-		log_err("Trim offset not at zone start (%lld)\n", io_u->offset);
+		log_err("Trim offset not at zone start (%lld)\n",
+			io_u->offset);
 		return -EINVAL;
 	}
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-11 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-11 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 79eb6c9a17de959d72ee51c601b2764225101282:

  ioengines: libzbc: disable libzbc block backend driver (2021-12-09 21:34:21 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 2ea393df3256e44398558c264f035f8db7656b08:

  Merge branch 'github-actions' of https://github.com/sitsofe/fio (2021-12-10 11:08:26 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'github-actions' of https://github.com/sitsofe/fio

Sitsofe Wheeler (2):
      ci: add CI via GitHub Actions
      ci: retire travis configuration

 .github/workflows/ci.yml | 45 ++++++++++++++++++++++++
 .travis.yml              | 37 --------------------
 ci/actions-build.sh      | 37 ++++++++++++++++++++
 ci/actions-full-test.sh  | 15 ++++++++
 ci/actions-install.sh    | 91 ++++++++++++++++++++++++++++++++++++++++++++++++
 ci/actions-smoke-test.sh | 10 ++++++
 ci/common.sh             | 34 ++++++++++++++++++
 ci/travis-build.sh       | 32 -----------------
 ci/travis-install.sh     | 65 ----------------------------------
 9 files changed, 232 insertions(+), 134 deletions(-)
 create mode 100644 .github/workflows/ci.yml
 delete mode 100644 .travis.yml
 create mode 100755 ci/actions-build.sh
 create mode 100755 ci/actions-full-test.sh
 create mode 100755 ci/actions-install.sh
 create mode 100755 ci/actions-smoke-test.sh
 create mode 100644 ci/common.sh
 delete mode 100755 ci/travis-build.sh
 delete mode 100755 ci/travis-install.sh

---

Diff of recent changes:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..a766cfa8
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,45 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        build:
+        - linux-gcc
+        - linux-clang
+        - macos
+        - linux-i686-gcc
+        include:
+        - build: linux-gcc
+          os: ubuntu-20.04
+          cc: gcc
+        - build: linux-clang
+          os: ubuntu-20.04
+          cc: clang
+        - build: macos
+          os: macos-10.15
+        - build: linux-i686-gcc
+          os: ubuntu-20.04
+          arch: i686
+
+    env:
+      CI_TARGET_ARCH: ${{ matrix.arch }}
+      CC: ${{ matrix.cc }}
+
+    steps:
+    - name: Checkout repo
+      uses: actions/checkout@v2
+    - name: Install dependencies
+      run: ./ci/actions-install.sh
+    - name: Build
+      run: ./ci/actions-build.sh
+    - name: Smoke test
+      run: ./ci/actions-smoke-test.sh
+    - name: Full test
+      run: ./ci/actions-full-test.sh
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index e35aff39..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,37 +0,0 @@
-language: c
-dist: bionic
-os:
-  - linux
-compiler:
-  - clang
-  - gcc
-arch:
-  - amd64
-  - arm64
-env:
-  global:
-    - MAKEFLAGS="-j 2"
-matrix:
-  include:
-    - os: linux
-      compiler: gcc
-      arch: amd64
-      env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter
-    # Default xcode image
-    - os: osx
-      compiler: clang # Workaround travis setting CC=["clang", "gcc"]
-      arch: amd64
-    # Latest xcode image (needs periodic updating)
-    - os: osx
-      compiler: clang
-      osx_image: xcode11.2
-      arch: amd64
-  exclude:
-    - os: osx
-      compiler: gcc
-
-install:
-  - ci/travis-install.sh
-
-script:
-  - ci/travis-build.sh
diff --git a/ci/actions-build.sh b/ci/actions-build.sh
new file mode 100755
index 00000000..74a6fdcb
--- /dev/null
+++ b/ci/actions-build.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+SCRIPT_DIR=$(dirname "$0")
+# shellcheck disable=SC1091
+. "${SCRIPT_DIR}/common.sh"
+
+main() {
+    local extra_cflags="-Werror"
+    local configure_flags=()
+
+    set_ci_target_os
+    case "${CI_TARGET_OS}" in
+        "linux")
+            case "${CI_TARGET_ARCH}" in
+                "i686")
+                    extra_cflags="${extra_cflags} -m32"
+                    export LDFLAGS="-m32"
+                    ;;
+                "x86_64")
+                    configure_flags+=(
+                        "--enable-cuda"
+                        "--enable-libiscsi"
+                        "--enable-libnbd"
+                    )
+                    ;;
+            esac
+        ;;
+    esac
+    configure_flags+=(--extra-cflags="${extra_cflags}")
+
+    ./configure "${configure_flags[@]}"
+    make -j 2
+}
+
+main
diff --git a/ci/actions-full-test.sh b/ci/actions-full-test.sh
new file mode 100755
index 00000000..4ae1dba1
--- /dev/null
+++ b/ci/actions-full-test.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+main() {
+    echo "Running long running tests..."
+    export PYTHONUNBUFFERED="TRUE"
+    if [[ "${CI_TARGET_ARCH}" == "arm64" ]]; then
+        sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
+    else
+        sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
+    fi
+}
+
+main
diff --git a/ci/actions-install.sh b/ci/actions-install.sh
new file mode 100755
index 00000000..7408ccb4
--- /dev/null
+++ b/ci/actions-install.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+SCRIPT_DIR=$(dirname "$0")
+# shellcheck disable=SC1091
+. "${SCRIPT_DIR}/common.sh"
+
+install_ubuntu() {
+    local pkgs
+
+    cat <<DPKGCFG | sudo tee /etc/dpkg/dpkg.cfg.d/dpkg-speedup > /dev/null
+# Skip fsync
+force-unsafe-io
+# Don't install documentation
+path-exclude=/usr/share/man/*
+path-exclude=/usr/share/locale/*/LC_MESSAGES/*.mo
+path-exclude=/usr/share/doc/*
+DPKGCFG
+    # Packages available on i686 and x86_64
+    pkgs=(
+        libaio-dev
+        libcunit1-dev
+        libcurl4-openssl-dev
+        libfl-dev
+        libibverbs-dev
+        libnuma-dev
+        librdmacm-dev
+        valgrind
+    )
+    case "${CI_TARGET_ARCH}" in
+        "i686")
+            sudo dpkg --add-architecture i386
+            pkgs=("${pkgs[@]/%/:i386}")
+            pkgs+=(
+                gcc-multilib
+                pkg-config:i386
+                zlib1g-dev:i386
+            )
+            ;;
+        "x86_64")
+            pkgs+=(
+                libglusterfs-dev
+                libgoogle-perftools-dev
+                libiscsi-dev
+                libnbd-dev
+                libpmem-dev
+                libpmemblk-dev
+                librbd-dev
+                libtcmalloc-minimal4
+                nvidia-cuda-dev
+            )
+            ;;
+    esac
+
+    # Architecture-independent packages and packages for which we don't
+    # care about the architecture.
+    pkgs+=(
+        python3-scipy
+    )
+
+    echo "Updating APT..."
+    sudo apt-get -qq update
+    echo "Installing packages..."
+    sudo apt-get install -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}"
+}
+
+install_linux() {
+    install_ubuntu
+}
+
+install_macos() {
+    # Assumes homebrew and python3 are already installed
+    #echo "Updating homebrew..."
+    #brew update >/dev/null 2>&1
+    echo "Installing packages..."
+    HOMEBREW_NO_AUTO_UPDATE=1 brew install cunit
+    pip3 install scipy six
+}
+
+main() {
+    set_ci_target_os
+
+    install_function="install_${CI_TARGET_OS}"
+    ${install_function}
+
+    echo "Python3 path: $(type -p python3 2>&1)"
+    echo "Python3 version: $(python3 -V 2>&1)"
+}
+
+main
diff --git a/ci/actions-smoke-test.sh b/ci/actions-smoke-test.sh
new file mode 100755
index 00000000..c129c89f
--- /dev/null
+++ b/ci/actions-smoke-test.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+main() {
+    echo "Running smoke tests..."
+    make test
+}
+
+main
diff --git a/ci/common.sh b/ci/common.sh
new file mode 100644
index 00000000..8861f843
--- /dev/null
+++ b/ci/common.sh
@@ -0,0 +1,34 @@
+# shellcheck shell=bash
+
+function set_ci_target_os {
+    # Function that exports CI_TARGET_OS to the current OS if it is not already
+    # set.
+
+    # Don't override CI_TARGET_OS if already set
+    CI_TARGET_OS=${CI_TARGET_OS:-}
+    if [[ -z ${CI_TARGET_OS} ]]; then
+        # Detect operating system
+        case "${OSTYPE}" in
+            linux*)
+                CI_TARGET_OS="linux"
+                ;;
+            darwin*)
+                CI_TARGET_OS="macos"
+                ;;
+            msys*)
+                CI_TARGET_OS="windows"
+                ;;
+            bsd*)
+                CI_TARGET_OS="bsd"
+                ;;
+            *)
+                CI_TARGET_OS=""
+        esac
+    fi
+
+    # Don't override CI_TARGET_ARCH if already set
+    CI_TARGET_ARCH=${CI_TARGET_ARCH:-}
+    if [[ -z ${CI_TARGET_ARCH} ]]; then
+        CI_TARGET_ARCH="$(uname -m)"
+    fi
+}
diff --git a/ci/travis-build.sh b/ci/travis-build.sh
deleted file mode 100755
index 923d882d..00000000
--- a/ci/travis-build.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-set -eu
-
-CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}"
-EXTRA_CFLAGS="-Werror"
-export PYTHONUNBUFFERED=TRUE
-CONFIGURE_FLAGS=()
-
-case "$TRAVIS_OS_NAME" in
-    "linux")
-        CONFIGURE_FLAGS+=(--enable-libiscsi)
-        case "$CI_TARGET_ARCH" in
-            "x86")
-                EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32"
-                export LDFLAGS="-m32"
-                ;;
-            "amd64")
-                CONFIGURE_FLAGS+=(--enable-cuda)
-                ;;
-        esac
-    ;;
-esac
-CONFIGURE_FLAGS+=(--extra-cflags="${EXTRA_CFLAGS}")
-
-./configure "${CONFIGURE_FLAGS[@]}" &&
-    make &&
-    make test &&
-    if [[ "$CI_TARGET_ARCH" == "arm64" ]]; then
-        sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
-    else
-        sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
-    fi
diff --git a/ci/travis-install.sh b/ci/travis-install.sh
deleted file mode 100755
index 4c4c04c5..00000000
--- a/ci/travis-install.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-set -eu
-
-CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}"
-case "$TRAVIS_OS_NAME" in
-    "linux")
-	# Architecture-dependent packages.
-	pkgs=(
-	    libaio-dev
-	    libcunit1-dev
-	    libfl-dev
-	    libgoogle-perftools-dev
-	    libibverbs-dev
-	    libiscsi-dev
-	    libnuma-dev
-	    librbd-dev
-	    librdmacm-dev
-	    libz-dev
-	)
-	case "$CI_TARGET_ARCH" in
-	    "x86")
-		pkgs=("${pkgs[@]/%/:i386}")
-		pkgs+=(
-		    gcc-multilib
-		    pkg-config:i386
-	        )
-		;;
-	    "amd64")
-		pkgs+=(nvidia-cuda-dev)
-		;;
-	esac
-	if [[ $CI_TARGET_ARCH != "x86" ]]; then
-		pkgs+=(glusterfs-common)
-	fi
-	# Architecture-independent packages and packages for which we don't
-	# care about the architecture.
-	pkgs+=(
-	    bison
-	    flex
-	    python3
-	    python3-scipy
-	    python3-six
-	)
-	sudo apt-get -qq update
-	sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}"
-	# librpma is supported on the amd64 (x86_64) architecture for now
-	if [[ $CI_TARGET_ARCH == "amd64" ]]; then
-		# install libprotobuf-c-dev required by librpma_gpspm
-		sudo apt-get install --no-install-recommends -qq -y libprotobuf-c-dev
-		# PMDK libraries have to be installed, because
-		# libpmem is a dependency of the librpma fio engine
-		ci/travis-install-pmdk.sh
-		# install librpma from sources from GitHub
-		ci/travis-install-librpma.sh
-	fi
-	;;
-    "osx")
-	brew update >/dev/null 2>&1
-	brew install cunit
-	pip3 install scipy six
-	;;
-esac
-
-echo "Python3 path: $(type -p python3 2>&1)"
-echo "Python3 version: $(python3 -V 2>&1)"

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-10 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-10 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit fab60fa78a1832c17f8bb200292ded4a8b3eb2a5:

  Merge branch 'arm-detect-pmull' of https://github.com/sitsofe/fio (2021-12-06 13:26:52 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 79eb6c9a17de959d72ee51c601b2764225101282:

  ioengines: libzbc: disable libzbc block backend driver (2021-12-09 21:34:21 -0700)

----------------------------------------------------------------
Damien Le Moal (1):
      ioengines: libzbc: disable libzbc block backend driver

 engines/libzbc.c              |  2 +-
 t/zbd/run-tests-against-nullb | 14 --------------
 2 files changed, 1 insertion(+), 15 deletions(-)

---

Diff of recent changes:

diff --git a/engines/libzbc.c b/engines/libzbc.c
index abee2043..2bc2c7e0 100644
--- a/engines/libzbc.c
+++ b/engines/libzbc.c
@@ -85,7 +85,7 @@ static int libzbc_open_dev(struct thread_data *td, struct fio_file *f,
 		return -ENOMEM;
 
 	ret = zbc_open(f->file_name,
-		       flags | ZBC_O_DRV_BLOCK | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA,
+		       flags | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA,
 		       &ld->zdev);
 	if (ret) {
 		log_err("%s: zbc_open() failed, err=%d\n",
diff --git a/t/zbd/run-tests-against-nullb b/t/zbd/run-tests-against-nullb
index db901179..7d2c7fa8 100755
--- a/t/zbd/run-tests-against-nullb
+++ b/t/zbd/run-tests-against-nullb
@@ -19,7 +19,6 @@ usage()
 	echo -e "\t-L List the device layouts for every section without running"
 	echo -e "\t   tests."
 	echo -e "\t-s <#section> Only run the section with the given number."
-	echo -e "\t-l Use libzbc ioengine to run the tests."
 	echo -e "\t-t <#test> Only run the test with the given number in every section."
 	echo -e "\t-o <max_open_zones> Specify MaxOpen value, (${set_max_open} by default)."
 	echo -e "\t-n <#number of runs> Set the number of times to run the entire suite "
@@ -239,7 +238,6 @@ dev_size=1024
 dev_blocksize=4096
 set_max_open=8
 zbd_test_opts=()
-libzbc=0
 num_of_runs=1
 test_case=0
 quit_on_err=0
@@ -250,7 +248,6 @@ while (($#)); do
 		-o) set_max_open="${2}"; shift; shift;;
 		-L) list_only=1; shift;;
 		-r) cleanup_nullb; exit 0;;
-		-l) libzbc=1; shift;;
 		-n) num_of_runs="${2}"; shift; shift;;
 		-t) test_case="${2}"; shift; shift;;
 		-q) quit_on_err=1; shift;;
@@ -311,17 +308,6 @@ while ((run_nr <= $num_of_runs)); do
 			exit 1
 		fi
 		show_nullb_config
-		if ((libzbc)); then
-			if ((zone_capacity < zone_size)); then
-				echo "libzbc doesn't support zone capacity, skipping section $(printf "%02d" $section_number)"
-				continue
-			fi
-			if ((conv_pcnt == 100)); then
-				echo "libzbc only supports zoned devices, skipping section $(printf "%02d" $section_number)"
-				continue
-			fi
-			zbd_test_opts+=("-l")
-		fi
 		cd "${scriptdir}"
 		((intr)) && exit 1
 		((list_only)) && continue

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-07 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-07 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit fd1d8e0ab3dc852193037a3acebcf8b8bdbcd9c5:

  filesetup: create zbd_info before jumping to done label (2021-12-02 17:54:15 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to fab60fa78a1832c17f8bb200292ded4a8b3eb2a5:

  Merge branch 'arm-detect-pmull' of https://github.com/sitsofe/fio (2021-12-06 13:26:52 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'arm-detect-pmull' of https://github.com/sitsofe/fio

Sitsofe Wheeler (1):
      os: detect PMULL support before enabling accelerated crc32c on ARM

Vincent Fu (2):
      io_ddir: return appropriate string for DDIR_INVAL
      libfio: drop unneeded reset of rwmix_issues

 io_ddir.h     | 2 +-
 libfio.c      | 1 -
 os/os-linux.h | 6 +++++-
 3 files changed, 6 insertions(+), 3 deletions(-)

---

Diff of recent changes:

diff --git a/io_ddir.h b/io_ddir.h
index a42da97a..296a9d04 100644
--- a/io_ddir.h
+++ b/io_ddir.h
@@ -24,7 +24,7 @@ static inline const char *io_ddir_name(enum fio_ddir ddir)
 					"datasync", "sync_file_range",
 					"wait", };
 
-	if (ddir < DDIR_LAST)
+	if (ddir >= 0 && ddir < DDIR_LAST)
 		return name[ddir];
 
 	return "invalid";
diff --git a/libfio.c b/libfio.c
index ed5906d4..198eaf2e 100644
--- a/libfio.c
+++ b/libfio.c
@@ -140,7 +140,6 @@ void reset_all_stats(struct thread_data *td)
 		td->io_issues[i] = 0;
 		td->ts.total_io_u[i] = 0;
 		td->ts.runtime[i] = 0;
-		td->rwmix_issues = 0;
 	}
 
 	set_epoch_time(td, td->o.log_unix_epoch);
diff --git a/os/os-linux.h b/os/os-linux.h
index 808f1d02..3001140c 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -20,6 +20,9 @@
 
 #ifdef ARCH_HAVE_CRC_CRYPTO
 #include <sys/auxv.h>
+#ifndef HWCAP_PMULL
+#define HWCAP_PMULL             (1 << 4)
+#endif /* HWCAP_PMULL */
 #ifndef HWCAP_CRC32
 #define HWCAP_CRC32             (1 << 7)
 #endif /* HWCAP_CRC32 */
@@ -405,7 +408,8 @@ static inline bool os_cpu_has(cpu_features feature)
 #ifdef ARCH_HAVE_CRC_CRYPTO
 	case CPU_ARM64_CRC32C:
 		hwcap = getauxval(AT_HWCAP);
-		have_feature = (hwcap & HWCAP_CRC32) != 0;
+		have_feature = (hwcap & (HWCAP_PMULL | HWCAP_CRC32)) ==
+			       (HWCAP_PMULL | HWCAP_CRC32);
 		break;
 #endif
 	default:

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-12-03 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-12-03 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit ed7f3a07363d62c6d6147b0c568f87f079d241a8:

  stat: make add lat percentile functions inline (2021-11-25 09:03:10 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to fd1d8e0ab3dc852193037a3acebcf8b8bdbcd9c5:

  filesetup: create zbd_info before jumping to done label (2021-12-02 17:54:15 -0700)

----------------------------------------------------------------
Niklas Cassel (1):
      filesetup: create zbd_info before jumping to done label

 filesetup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

---

Diff of recent changes:

diff --git a/filesetup.c b/filesetup.c
index 228e4fff..fb556d84 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -1119,9 +1119,6 @@ int setup_files(struct thread_data *td)
 	if (err)
 		goto err_out;
 
-	if (o->read_iolog_file)
-		goto done;
-
 	if (td->o.zone_mode == ZONE_MODE_ZBD) {
 		err = zbd_init_files(td);
 		if (err)
@@ -1129,6 +1126,9 @@ int setup_files(struct thread_data *td)
 	}
 	zbd_recalc_options_with_zone_granularity(td);
 
+	if (o->read_iolog_file)
+		goto done;
+
 	/*
 	 * check sizes. if the files/devices do not exist and the size
 	 * isn't passed to fio, abort.

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-26 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-26 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 2b00ac1c82d54795911343c9b3b3f4ef64c92d92:

  Merge branch 'fix-parse-sync-file-range' of https://github.com/oleglatin/fio (2021-11-24 10:27:20 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to ed7f3a07363d62c6d6147b0c568f87f079d241a8:

  stat: make add lat percentile functions inline (2021-11-25 09:03:10 -0700)

----------------------------------------------------------------
Niklas Cassel (6):
      docs: document quirky implementation of per priority stats reporting
      stat: add comments describing the quirky behavior of clat prio samples
      stat: rename add_lat_percentile_sample()
      stat: rename add_lat_percentile_sample_noprio()
      stat: simplify add_lat_percentile_prio_sample()
      stat: make add lat percentile functions inline

 HOWTO  |  6 +++++-
 fio.1  |  5 ++++-
 stat.c | 52 +++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 48 insertions(+), 15 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO
index a3b3acfe..8c9e4135 100644
--- a/HOWTO
+++ b/HOWTO
@@ -2169,7 +2169,11 @@ with the caveat that when used on the command line, they must come after the
     Default: 0. A single value applies to reads and writes. Comma-separated
     values may be specified for reads and writes. For this option to be
     effective, NCQ priority must be supported and enabled, and `direct=1'
-    option must be used. fio must also be run as the root user.
+    option must be used. fio must also be run as the root user. Unlike
+    slat/clat/lat stats, which can be tracked and reported independently, per
+    priority stats only track and report a single type of latency. By default,
+    completion latency (clat) will be reported; if :option:`lat_percentiles` is
+    set, total latency (lat) will be reported.
 
 .. option:: cmdprio_class=int[,int] : [io_uring] [libaio]
 
diff --git a/fio.1 b/fio.1
index a6469541..a3ebb67d 100644
--- a/fio.1
+++ b/fio.1
@@ -1967,7 +1967,10 @@ Set the percentage of I/O that will be issued with the highest priority.
 Default: 0. A single value applies to reads and writes. Comma-separated
 values may be specified for reads and writes. For this option to be effective,
 NCQ priority must be supported and enabled, and `direct=1' option must be
-used. fio must also be run as the root user.
+used. fio must also be run as the root user. Unlike slat/clat/lat stats, which
+can be tracked and reported independently, per priority stats only track and
+report a single type of latency. By default, completion latency (clat) will be
+reported; if \fBlat_percentiles\fR is set, total latency (lat) will be reported.
 .TP
 .BI (io_uring,libaio)cmdprio_class \fR=\fPint[,int]
 Set the I/O priority class to use for I/Os that must be issued with a
diff --git a/stat.c b/stat.c
index e0dc99b6..7e84058d 100644
--- a/stat.c
+++ b/stat.c
@@ -3052,8 +3052,10 @@ void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec)
 	add_stat_sample(&ts->sync_stat, nsec);
 }
 
-static void add_lat_percentile_sample_noprio(struct thread_stat *ts,
-				unsigned long long nsec, enum fio_ddir ddir, enum fio_lat lat)
+static inline void add_lat_percentile_sample(struct thread_stat *ts,
+					     unsigned long long nsec,
+					     enum fio_ddir ddir,
+					     enum fio_lat lat)
 {
 	unsigned int idx = plat_val_to_idx(nsec);
 	assert(idx < FIO_IO_U_PLAT_NR);
@@ -3061,14 +3063,13 @@ static void add_lat_percentile_sample_noprio(struct thread_stat *ts,
 	ts->io_u_plat[lat][ddir][idx]++;
 }
 
-static void add_lat_percentile_sample(struct thread_stat *ts,
-				unsigned long long nsec, enum fio_ddir ddir,
-				bool high_prio, enum fio_lat lat)
+static inline void add_lat_percentile_prio_sample(struct thread_stat *ts,
+						  unsigned long long nsec,
+						  enum fio_ddir ddir,
+						  bool high_prio)
 {
 	unsigned int idx = plat_val_to_idx(nsec);
 
-	add_lat_percentile_sample_noprio(ts, nsec, ddir, lat);
-
 	if (!high_prio)
 		ts->io_u_plat_low_prio[ddir][idx]++;
 	else
@@ -3089,6 +3090,15 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 
 	add_stat_sample(&ts->clat_stat[ddir], nsec);
 
+	/*
+	 * When lat_percentiles=1 (default 0), the reported high/low priority
+	 * percentiles and stats are used for describing total latency values,
+	 * even though the variable names themselves start with clat_.
+	 *
+	 * Because of the above definition, add a prio stat sample only when
+	 * lat_percentiles=0. add_lat_sample() will add the prio stat sample
+	 * when lat_percentiles=1.
+	 */
 	if (!ts->lat_percentiles) {
 		if (high_prio)
 			add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
@@ -3101,10 +3111,15 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 			       offset, ioprio);
 
 	if (ts->clat_percentiles) {
-		if (ts->lat_percentiles)
-			add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_CLAT);
-		else
-			add_lat_percentile_sample(ts, nsec, ddir, high_prio, FIO_CLAT);
+		/*
+		 * Because of the above definition, add a prio lat percentile
+		 * sample only when lat_percentiles=0. add_lat_sample() will add
+		 * the prio lat percentile sample when lat_percentiles=1.
+		 */
+		add_lat_percentile_sample(ts, nsec, ddir, FIO_CLAT);
+		if (!ts->lat_percentiles)
+			add_lat_percentile_prio_sample(ts, nsec, ddir,
+						       high_prio);
 	}
 
 	if (iolog && iolog->hist_msec) {
@@ -3169,7 +3184,7 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
 			       offset, ioprio);
 
 	if (ts->slat_percentiles)
-		add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_SLAT);
+		add_lat_percentile_sample(ts, nsec, ddir, FIO_SLAT);
 
 	if (needs_lock)
 		__td_io_u_unlock(td);
@@ -3194,8 +3209,19 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
 		add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs,
 			       offset, ioprio);
 
+	/*
+	 * When lat_percentiles=1 (default 0), the reported high/low priority
+	 * percentiles and stats are used for describing total latency values,
+	 * even though the variable names themselves start with clat_.
+	 *
+	 * Because of the above definition, add a prio stat and prio lat
+	 * percentile sample only when lat_percentiles=1. add_clat_sample() will
+	 * add the prio stat and prio lat percentile sample when
+	 * lat_percentiles=0.
+	 */
 	if (ts->lat_percentiles) {
-		add_lat_percentile_sample(ts, nsec, ddir, high_prio, FIO_LAT);
+		add_lat_percentile_sample(ts, nsec, ddir, FIO_LAT);
+		add_lat_percentile_prio_sample(ts, nsec, ddir, high_prio);
 		if (high_prio)
 			add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
 		else

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-25 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-25 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 1d08bfb018e600cc47f122fb78c02bf74b84dee8:

  t/dedupe: style fixups (2021-11-21 06:51:11 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 2b00ac1c82d54795911343c9b3b3f4ef64c92d92:

  Merge branch 'fix-parse-sync-file-range' of https://github.com/oleglatin/fio (2021-11-24 10:27:20 -0700)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'fix-parse-sync-file-range' of https://github.com/oleglatin/fio

Oleg Latin (1):
      parse: handle comma-separated options

 parse.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

---

Diff of recent changes:

diff --git a/parse.c b/parse.c
index 45f4f2d3..d086ee48 100644
--- a/parse.c
+++ b/parse.c
@@ -477,13 +477,17 @@ static int check_int(const char *p, int *val)
 
 static size_t opt_len(const char *str)
 {
+	char delimiter[] = {',', ':'};
 	char *postfix;
+	unsigned int i;
 
-	postfix = strchr(str, ':');
-	if (!postfix)
-		return strlen(str);
+	for (i = 0; i < FIO_ARRAY_SIZE(delimiter); i++) {
+		postfix = strchr(str, delimiter[i]);
+		if (postfix)
+			return (int)(postfix - str);
+	}
 
-	return (int)(postfix - str);
+	return strlen(str);
 }
 
 static int str_match_len(const struct value_pair *vp, const char *str)

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-22 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-22 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 9f51d89c683d70cd8ab23ba09ec6e628a548af5a:

  Sync io_uring header with the kernel (2021-11-20 07:31:20 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 1d08bfb018e600cc47f122fb78c02bf74b84dee8:

  t/dedupe: style fixups (2021-11-21 06:51:11 -0700)

----------------------------------------------------------------
Bar David (2):
      Mixed dedup and compression
      fio-dedup: adjusted the binary to support compression

Jens Axboe (3):
      Merge branch 'dedupe_and_compression' of https://github.com/bardavid/fio
      t/io_uring: fix 32-bit compile warnings
      t/dedupe: style fixups

 DEDUPE-TODO  |   3 --
 dedupe.c     |  12 ++++-
 io_u.c       |  29 ++++++-----
 t/dedupe.c   | 167 +++++++++++++++++++++++++++++++++++++++++++++++------------
 t/io_uring.c |   4 +-
 5 files changed, 161 insertions(+), 54 deletions(-)

---

Diff of recent changes:

diff --git a/DEDUPE-TODO b/DEDUPE-TODO
index 1f3ee9da..4b0bfd1d 100644
--- a/DEDUPE-TODO
+++ b/DEDUPE-TODO
@@ -1,6 +1,3 @@
-- Mixed buffers of dedupe-able and compressible data.
-  Major usecase in performance benchmarking of storage subsystems.
-
 - Shifted dedup-able data.
   Allow for dedup buffer generation to shift contents by random number
   of sectors (fill the gaps with uncompressible data). Some storage
diff --git a/dedupe.c b/dedupe.c
index 043a376c..fd116dfb 100644
--- a/dedupe.c
+++ b/dedupe.c
@@ -2,12 +2,14 @@
 
 int init_dedupe_working_set_seeds(struct thread_data *td)
 {
-	unsigned long long i;
+	unsigned long long i, j, num_seed_advancements;
 	struct frand_state dedupe_working_set_state = {0};
 
 	if (!td->o.dedupe_percentage || !(td->o.dedupe_mode == DEDUPE_MODE_WORKING_SET))
 		return 0;
 
+	num_seed_advancements = td->o.min_bs[DDIR_WRITE] /
+		min_not_zero(td->o.min_bs[DDIR_WRITE], (unsigned long long) td->o.compress_chunk);
 	/*
 	 * The dedupe working set keeps seeds of unique data (generated by buf_state).
 	 * Dedupe-ed pages will be generated using those seeds.
@@ -21,7 +23,13 @@ int init_dedupe_working_set_seeds(struct thread_data *td)
 	frand_copy(&dedupe_working_set_state, &td->buf_state);
 	for (i = 0; i < td->num_unique_pages; i++) {
 		frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state);
-		__get_next_seed(&dedupe_working_set_state);
+		/*
+		 * When compression is used the seed is advanced multiple times to
+		 * generate the buffer. We want to regenerate the same buffer when
+		 * deduping against this page
+		 */
+		for (j = 0; j < num_seed_advancements; j++)
+			__get_next_seed(&dedupe_working_set_state);
 	}
 
 	return 0;
diff --git a/io_u.c b/io_u.c
index 586a4bef..3c72d63d 100644
--- a/io_u.c
+++ b/io_u.c
@@ -2230,27 +2230,30 @@ void fill_io_buffer(struct thread_data *td, void *buf, unsigned long long min_wr
 
 	if (o->compress_percentage || o->dedupe_percentage) {
 		unsigned int perc = td->o.compress_percentage;
-		struct frand_state *rs;
+		struct frand_state *rs = NULL;
 		unsigned long long left = max_bs;
 		unsigned long long this_write;
 
 		do {
-			rs = get_buf_state(td);
+			/*
+			 * Buffers are either entirely dedupe-able or not.
+			 * If we choose to dedup, the buffer should undergo
+			 * the same manipulation as the original write. Which
+			 * means we should retrace the steps we took for compression
+			 * as well.
+			 */
+			if (!rs)
+				rs = get_buf_state(td);
 
 			min_write = min(min_write, left);
 
-			if (perc) {
-				this_write = min_not_zero(min_write,
-							(unsigned long long) td->o.compress_chunk);
+			this_write = min_not_zero(min_write,
+						(unsigned long long) td->o.compress_chunk);
 
-				fill_random_buf_percentage(rs, buf, perc,
-					this_write, this_write,
-					o->buffer_pattern,
-					o->buffer_pattern_bytes);
-			} else {
-				fill_random_buf(rs, buf, min_write);
-				this_write = min_write;
-			}
+			fill_random_buf_percentage(rs, buf, perc,
+				this_write, this_write,
+				o->buffer_pattern,
+				o->buffer_pattern_bytes);
 
 			buf += this_write;
 			left -= this_write;
diff --git a/t/dedupe.c b/t/dedupe.c
index 8b659c76..109ea1af 100644
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -24,19 +24,25 @@
 
 #include "../lib/bloom.h"
 #include "debug.h"
+#include "zlib.h"
+
+struct zlib_ctrl {
+	z_stream stream;
+	unsigned char *buf_in;
+	unsigned char *buf_out;
+};
 
 struct worker_thread {
+	struct zlib_ctrl zc;
 	pthread_t thread;
-
-	volatile int done;
-
-	int fd;
 	uint64_t cur_offset;
 	uint64_t size;
-
+	unsigned long long unique_capacity;
 	unsigned long items;
 	unsigned long dupes;
 	int err;
+	int fd;
+	volatile int done;
 };
 
 struct extent {
@@ -68,6 +74,7 @@ static unsigned int odirect;
 static unsigned int collision_check;
 static unsigned int print_progress = 1;
 static unsigned int use_bloom = 1;
+static unsigned int compression = 0;
 
 static uint64_t total_size;
 static uint64_t cur_offset;
@@ -87,8 +94,9 @@ static uint64_t get_size(struct fio_file *f, struct stat *sb)
 			return 0;
 		}
 		ret = bytes;
-	} else
+	} else {
 		ret = sb->st_size;
+	}
 
 	return (ret & ~((uint64_t)blocksize - 1));
 }
@@ -120,9 +128,9 @@ static int __read_block(int fd, void *buf, off_t offset, size_t count)
 	if (ret < 0) {
 		perror("pread");
 		return 1;
-	} else if (!ret)
+	} else if (!ret) {
 		return 1;
-	else if (ret != count) {
+	} else if (ret != count) {
 		log_err("dedupe: short read on block\n");
 		return 1;
 	}
@@ -135,6 +143,34 @@ static int read_block(int fd, void *buf, off_t offset)
 	return __read_block(fd, buf, offset, blocksize);
 }
 
+static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
+				    struct zlib_ctrl *zc)
+{
+	z_stream *stream = &zc->stream;
+	unsigned int compressed_len;
+	int ret;
+
+	if (read_block(file.fd, zc->buf_in, offset))
+		return;
+
+	stream->next_in = zc->buf_in;
+	stream->avail_in = blocksize;
+	stream->avail_out = deflateBound(stream, blocksize);
+	stream->next_out = zc->buf_out;
+
+	ret = deflate(stream, Z_FINISH);
+	assert(ret != Z_STREAM_ERROR);
+	compressed_len = blocksize - stream->avail_out;
+
+	if (dump_output)
+		printf("offset 0x%lx compressed to %d blocksize %d ratio %.2f \n",
+				(unsigned long) offset, compressed_len, blocksize,
+				(float)compressed_len / (float)blocksize);
+
+	*unique_capacity += compressed_len;
+	deflateReset(stream);
+}
+
 static void add_item(struct chunk *c, struct item *i)
 {
 	/*	
@@ -182,13 +218,15 @@ static struct chunk *alloc_chunk(void)
 	if (collision_check || dump_output) {
 		c = malloc(sizeof(struct chunk) + sizeof(struct flist_head));
 		INIT_FLIST_HEAD(&c->extent_list[0]);
-	} else
+	} else {
 		c = malloc(sizeof(struct chunk));
+	}
 
 	return c;
 }
 
-static void insert_chunk(struct item *i)
+static void insert_chunk(struct item *i, uint64_t *unique_capacity,
+			 struct zlib_ctrl *zc)
 {
 	struct fio_rb_node **p, *parent;
 	struct chunk *c;
@@ -201,11 +239,11 @@ static void insert_chunk(struct item *i)
 
 		c = rb_entry(parent, struct chunk, rb_node);
 		diff = memcmp(i->hash, c->hash, sizeof(i->hash));
-		if (diff < 0)
+		if (diff < 0) {
 			p = &(*p)->rb_left;
-		else if (diff > 0)
+		} else if (diff > 0) {
 			p = &(*p)->rb_right;
-		else {
+		} else {
 			int ret;
 
 			if (!collision_check)
@@ -228,12 +266,15 @@ static void insert_chunk(struct item *i)
 	memcpy(c->hash, i->hash, sizeof(i->hash));
 	rb_link_node(&c->rb_node, parent, p);
 	rb_insert_color(&c->rb_node, &rb_root);
+	if (compression)
+		account_unique_capacity(i->offset, unique_capacity, zc);
 add:
 	add_item(c, i);
 }
 
 static void insert_chunks(struct item *items, unsigned int nitems,
-			  uint64_t *ndupes)
+			  uint64_t *ndupes, uint64_t *unique_capacity,
+			  struct zlib_ctrl *zc)
 {
 	int i;
 
@@ -248,7 +289,7 @@ static void insert_chunks(struct item *items, unsigned int nitems,
 			r = bloom_set(bloom, items[i].hash, s);
 			*ndupes += r;
 		} else
-			insert_chunk(&items[i]);
+			insert_chunk(&items[i], unique_capacity, zc);
 	}
 
 	fio_sem_up(rb_lock);
@@ -277,11 +318,13 @@ static int do_work(struct worker_thread *thread, void *buf)
 	off_t offset;
 	int nitems = 0;
 	uint64_t ndupes = 0;
+	uint64_t unique_capacity = 0;
 	struct item *items;
 
 	offset = thread->cur_offset;
 
-	nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size));
+	nblocks = read_blocks(thread->fd, buf, offset,
+				min(thread->size, (uint64_t) chunk_size));
 	if (!nblocks)
 		return 1;
 
@@ -296,20 +339,39 @@ static int do_work(struct worker_thread *thread, void *buf)
 		nitems++;
 	}
 
-	insert_chunks(items, nitems, &ndupes);
+	insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
 
 	free(items);
 	thread->items += nitems;
 	thread->dupes += ndupes;
+	thread->unique_capacity += unique_capacity;
 	return 0;
 }
 
+static void thread_init_zlib_control(struct worker_thread *thread)
+{
+	size_t sz;
+
+	z_stream *stream = &thread->zc.stream;
+	stream->zalloc = Z_NULL;
+	stream->zfree = Z_NULL;
+	stream->opaque = Z_NULL;
+
+	if (deflateInit(stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+		return;
+
+	thread->zc.buf_in = fio_memalign(blocksize, blocksize, false);
+	sz = deflateBound(stream, blocksize);
+	thread->zc.buf_out = fio_memalign(blocksize, sz, false);
+}
+
 static void *thread_fn(void *data)
 {
 	struct worker_thread *thread = data;
 	void *buf;
 
 	buf = fio_memalign(blocksize, chunk_size, false);
+	thread_init_zlib_control(thread);
 
 	do {
 		if (get_work(&thread->cur_offset, &thread->size)) {
@@ -362,15 +424,17 @@ static void show_progress(struct worker_thread *threads, unsigned long total)
 			printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items);
 			last_nitems = nitems;
 			fio_gettime(&last_tv, NULL);
-		} else
+		} else {
 			printf("%3.2f%% done\r", perc);
+		}
 		fflush(stdout);
 		usleep(250000);
 	};
 }
 
 static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
-			      uint64_t *nextents, uint64_t *nchunks)
+			      uint64_t *nextents, uint64_t *nchunks,
+			      uint64_t *unique_capacity)
 {
 	struct worker_thread *threads;
 	unsigned long nitems, total_items;
@@ -398,11 +462,13 @@ static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
 	nitems = 0;
 	*nextents = 0;
 	*nchunks = 1;
+	*unique_capacity = 0;
 	for (i = 0; i < num_threads; i++) {
 		void *ret;
 		pthread_join(threads[i].thread, &ret);
 		nitems += threads[i].items;
 		*nchunks += threads[i].dupes;
+		*unique_capacity += threads[i].unique_capacity;
 	}
 
 	printf("Threads(%u): %lu items processed\n", num_threads, nitems);
@@ -416,7 +482,7 @@ static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
 }
 
 static int dedupe_check(const char *filename, uint64_t *nextents,
-			uint64_t *nchunks)
+			uint64_t *nchunks, uint64_t *unique_capacity)
 {
 	uint64_t dev_size;
 	struct stat sb;
@@ -451,9 +517,11 @@ static int dedupe_check(const char *filename, uint64_t *nextents,
 		bloom = bloom_new(bloom_entries);
 	}
 
-	printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads);
+	printf("Will check <%s>, size <%llu>, using %u threads\n", filename,
+				(unsigned long long) dev_size, num_threads);
 
-	return run_dedupe_threads(&file, dev_size, nextents, nchunks);
+	return run_dedupe_threads(&file, dev_size, nextents, nchunks,
+					unique_capacity);
 err:
 	if (file.fd != -1)
 		close(file.fd);
@@ -466,18 +534,38 @@ static void show_chunk(struct chunk *c)
 	struct flist_head *n;
 	struct extent *e;
 
-	printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count);
+	printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1],
+			c->hash[2], c->hash[3], (unsigned long) c->count);
 	flist_for_each(n, &c->extent_list[0]) {
 		e = flist_entry(n, struct extent, list);
 		printf("\toffset %llu\n", (unsigned long long) e->offset);
 	}
 }
 
-static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents)
+static const char *capacity_unit[] = {"b","KB", "MB", "GB", "TB", "PB", "EB"};
+
+static uint64_t bytes_to_human_readable_unit(uint64_t n, const char **unit_out)
+{
+	uint8_t i = 0;
+
+	while (n >= 1024) {
+		i++;
+		n /= 1024;
+	}
+
+	*unit_out = capacity_unit[i];
+	return n;
+}
+
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents,
+		      uint64_t unique_capacity)
 {
 	double perc, ratio;
+	const char *unit;
+	uint64_t uc_human;
 
-	printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents, (unsigned long) nchunks);
+	printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents,
+						(unsigned long) nchunks);
 	if (!bloom)
 		printf(" Duplicated extents=%lu", (unsigned long) ndupextents);
 	printf("\n");
@@ -485,22 +573,29 @@ static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents)
 	if (nchunks) {
 		ratio = (double) nextents / (double) nchunks;
 		printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);
-	} else
+	} else {
 		printf("De-dupe ratio: 1:infinite\n");
+	}
 
-	if (ndupextents)
-		printf("De-dupe working set at least: %3.2f%%\n", 100.0 * (double) ndupextents / (double) nextents);
+	if (ndupextents) {
+		printf("De-dupe working set at least: %3.2f%%\n",
+			100.0 * (double) ndupextents / (double) nextents);
+	}
 
 	perc = 1.00 - ((double) nchunks / (double) nextents);
 	perc *= 100.0;
 	printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
 
+
+	if (compression) {
+		uc_human = bytes_to_human_readable_unit(unique_capacity, &unit);
+		printf("Unique capacity %lu%s\n", (unsigned long) uc_human, unit);
+	}
 }
 
 static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
 {
 	struct fio_rb_node *n;
-
 	*nchunks = *nextents = *ndupextents = 0;
 
 	n = rb_first(&rb_root);
@@ -532,18 +627,19 @@ static int usage(char *argv[])
 	log_err("\t-c\tFull collision check\n");
 	log_err("\t-B\tUse probabilistic bloom filter\n");
 	log_err("\t-p\tPrint progress indicator\n");
+	log_err("\t-C\tCalculate compressible size\n");
 	return 1;
 }
 
 int main(int argc, char *argv[])
 {
-	uint64_t nextents = 0, nchunks = 0, ndupextents = 0;
+	uint64_t nextents = 0, nchunks = 0, ndupextents = 0, unique_capacity;
 	int c, ret;
 
 	arch_init(argv);
 	debug_init();
 
-	while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+	while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:C:")) != -1) {
 		switch (c) {
 		case 'b':
 			blocksize = atoi(optarg);
@@ -566,13 +662,16 @@ int main(int argc, char *argv[])
 		case 'B':
 			use_bloom = atoi(optarg);
 			break;
+		case 'C':
+			compression = atoi(optarg);
+			break;
 		case '?':
 		default:
 			return usage(argv);
 		}
 	}
 
-	if (collision_check || dump_output)
+	if (collision_check || dump_output || compression)
 		use_bloom = 0;
 
 	if (!num_threads)
@@ -586,13 +685,13 @@ int main(int argc, char *argv[])
 	rb_root = RB_ROOT;
 	rb_lock = fio_sem_init(FIO_SEM_UNLOCKED);
 
-	ret = dedupe_check(argv[optind], &nextents, &nchunks);
+	ret = dedupe_check(argv[optind], &nextents, &nchunks, &unique_capacity);
 
 	if (!ret) {
 		if (!bloom)
 			iter_rb_tree(&nextents, &nchunks, &ndupextents);
 
-		show_stat(nextents, nchunks, ndupextents);
+		show_stat(nextents, nchunks, ndupextents, unique_capacity);
 	}
 
 	fio_sem_remove(rb_lock);
diff --git a/t/io_uring.c b/t/io_uring.c
index 7bf215c7..a98f78fd 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -192,7 +192,7 @@ unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
 	unsigned long *ovals = NULL;
 	bool is_last;
 
-	*minv = -1ULL;
+	*minv = -1UL;
 	*maxv = 0;
 
 	ovals = malloc(len * sizeof(*ovals));
@@ -498,7 +498,7 @@ static void init_io(struct submitter *s, unsigned index)
 	sqe->off = offset;
 	sqe->user_data = (unsigned long) f->fileno;
 	if (stats && stats_running)
-		sqe->user_data |= ((unsigned long)s->clock_index << 32);
+		sqe->user_data |= ((uint64_t)s->clock_index << 32);
 }
 
 static int prep_more_ios_uring(struct submitter *s, int max_ios)

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-21 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-21 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit beda9d8d9e9148ff34eaa0eeb0cde19a36f47494:

  t/io_uring: add -R option for random/sequential IO (2021-11-19 10:44:15 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 9f51d89c683d70cd8ab23ba09ec6e628a548af5a:

  Sync io_uring header with the kernel (2021-11-20 07:31:20 -0700)

----------------------------------------------------------------
Jens Axboe (2):
      io_uring: clamp CQ size to SQ size
      Sync io_uring header with the kernel

 engines/io_uring.c  |   7 ++
 os/linux/io_uring.h | 186 ++++++++++++++++++++++++++++++++++++++++++----------
 t/io_uring.c        |   7 ++
 3 files changed, 167 insertions(+), 33 deletions(-)

---

Diff of recent changes:

diff --git a/engines/io_uring.c b/engines/io_uring.c
index 8b8f35f1..00ae3482 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -692,6 +692,13 @@ static int fio_ioring_queue_init(struct thread_data *td)
 		}
 	}
 
+	/*
+	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
+	 * than that.
+	 */
+	p.flags |= IORING_SETUP_CQSIZE;
+	p.cq_entries = depth;
+
 	ret = syscall(__NR_io_uring_setup, depth, &p);
 	if (ret < 0)
 		return ret;
diff --git a/os/linux/io_uring.h b/os/linux/io_uring.h
index d39b45fd..c45b5e9a 100644
--- a/os/linux/io_uring.h
+++ b/os/linux/io_uring.h
@@ -11,10 +11,6 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 /*
  * IO submission data structure (Submission Queue Entry)
  */
@@ -46,23 +42,25 @@ struct io_uring_sqe {
 		__u32		statx_flags;
 		__u32		fadvise_advice;
 		__u32		splice_flags;
+		__u32		rename_flags;
+		__u32		unlink_flags;
+		__u32		hardlink_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
+	/* pack this to avoid bogus arm OABI complaints */
 	union {
-		struct {
-			/* pack this to avoid bogus arm OABI complaints */
-			union {
-				/* index into fixed buffers, if used */
-				__u16	buf_index;
-				/* for grouped buffer selection */
-				__u16	buf_group;
-			} __attribute__((packed));
-			/* personality to use, if used */
-			__u16	personality;
-			__s32	splice_fd_in;
-		};
-		__u64	__pad2[3];
+		/* index into fixed buffers, if used */
+		__u16	buf_index;
+		/* for grouped buffer selection */
+		__u16	buf_group;
+	} __attribute__((packed));
+	/* personality to use, if used */
+	__u16	personality;
+	union {
+		__s32	splice_fd_in;
+		__u32	file_index;
 	};
+	__u64	__pad2[2];
 };
 
 enum {
@@ -99,6 +97,7 @@ enum {
 #define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
 #define IORING_SETUP_CLAMP	(1U << 4)	/* clamp SQ/CQ ring sizes */
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
+#define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 
 enum {
 	IORING_OP_NOP,
@@ -135,6 +134,12 @@ enum {
 	IORING_OP_PROVIDE_BUFFERS,
 	IORING_OP_REMOVE_BUFFERS,
 	IORING_OP_TEE,
+	IORING_OP_SHUTDOWN,
+	IORING_OP_RENAMEAT,
+	IORING_OP_UNLINKAT,
+	IORING_OP_MKDIRAT,
+	IORING_OP_SYMLINKAT,
+	IORING_OP_LINKAT,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -148,14 +153,35 @@ enum {
 /*
  * sqe->timeout_flags
  */
-#define IORING_TIMEOUT_ABS	(1U << 0)
-
+#define IORING_TIMEOUT_ABS		(1U << 0)
+#define IORING_TIMEOUT_UPDATE		(1U << 1)
+#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
+#define IORING_TIMEOUT_REALTIME		(1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
+#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
+#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
  * sqe->splice_flags
  * extends splice(2) flags
  */
 #define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
 
+/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI	Multishot poll. Sets IORING_CQE_F_MORE if
+ *				the poll handler will continue to report
+ *				CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE		Update existing poll request, matching
+ *				sqe->addr as the old user_data field.
+ */
+#define IORING_POLL_ADD_MULTI	(1U << 0)
+#define IORING_POLL_UPDATE_EVENTS	(1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
@@ -169,8 +195,10 @@ struct io_uring_cqe {
  * cqe->flags
  *
  * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
+#define IORING_CQE_F_MORE		(1U << 1)
 
 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
@@ -228,6 +256,8 @@ struct io_cqring_offsets {
  */
 #define IORING_ENTER_GETEVENTS	(1U << 0)
 #define IORING_ENTER_SQ_WAKEUP	(1U << 1)
+#define IORING_ENTER_SQ_WAIT	(1U << 2)
+#define IORING_ENTER_EXT_ARG	(1U << 3)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -255,28 +285,85 @@ struct io_uring_params {
 #define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
 #define IORING_FEAT_FAST_POLL		(1U << 5)
 #define IORING_FEAT_POLL_32BITS 	(1U << 6)
+#define IORING_FEAT_SQPOLL_NONFIXED	(1U << 7)
+#define IORING_FEAT_EXT_ARG		(1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS	(1U << 9)
+#define IORING_FEAT_RSRC_TAGS		(1U << 10)
 
 /*
  * io_uring_register(2) opcodes and arguments
  */
-#define IORING_REGISTER_BUFFERS		0
-#define IORING_UNREGISTER_BUFFERS	1
-#define IORING_REGISTER_FILES		2
-#define IORING_UNREGISTER_FILES		3
-#define IORING_REGISTER_EVENTFD		4
-#define IORING_UNREGISTER_EVENTFD	5
-#define IORING_REGISTER_FILES_UPDATE	6
-#define IORING_REGISTER_EVENTFD_ASYNC	7
-#define IORING_REGISTER_PROBE		8
-#define IORING_REGISTER_PERSONALITY	9
-#define IORING_UNREGISTER_PERSONALITY	10
+enum {
+	IORING_REGISTER_BUFFERS			= 0,
+	IORING_UNREGISTER_BUFFERS		= 1,
+	IORING_REGISTER_FILES			= 2,
+	IORING_UNREGISTER_FILES			= 3,
+	IORING_REGISTER_EVENTFD			= 4,
+	IORING_UNREGISTER_EVENTFD		= 5,
+	IORING_REGISTER_FILES_UPDATE		= 6,
+	IORING_REGISTER_EVENTFD_ASYNC		= 7,
+	IORING_REGISTER_PROBE			= 8,
+	IORING_REGISTER_PERSONALITY		= 9,
+	IORING_UNREGISTER_PERSONALITY		= 10,
+	IORING_REGISTER_RESTRICTIONS		= 11,
+	IORING_REGISTER_ENABLE_RINGS		= 12,
+
+	/* extended with tagging */
+	IORING_REGISTER_FILES2			= 13,
+	IORING_REGISTER_FILES_UPDATE2		= 14,
+	IORING_REGISTER_BUFFERS2		= 15,
+	IORING_REGISTER_BUFFERS_UPDATE		= 16,
+
+	/* set/clear io-wq thread affinities */
+	IORING_REGISTER_IOWQ_AFF		= 17,
+	IORING_UNREGISTER_IOWQ_AFF		= 18,
+
+	/* set/get max number of io-wq workers */
+	IORING_REGISTER_IOWQ_MAX_WORKERS	= 19,
 
+	/* this goes last */
+	IORING_REGISTER_LAST
+};
+
+/* io-wq worker categories */
+enum {
+	IO_WQ_BOUND,
+	IO_WQ_UNBOUND,
+};
+
+/* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
 	__u32 offset;
 	__u32 resv;
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+struct io_uring_rsrc_register {
+	__u32 nr;
+	__u32 resv;
+	__u64 resv2;
+	__aligned_u64 data;
+	__aligned_u64 tags;
+};
+
+struct io_uring_rsrc_update {
+	__u32 offset;
+	__u32 resv;
+	__aligned_u64 data;
+};
+
+struct io_uring_rsrc_update2 {
+	__u32 offset;
+	__u32 resv;
+	__aligned_u64 data;
+	__aligned_u64 tags;
+	__u32 nr;
+	__u32 resv2;
+};
+
+/* Skip updating fd indexes set to this value in the fd table */
+#define IORING_REGISTER_FILES_SKIP	(-2)
+
 #define IO_URING_OP_SUPPORTED	(1U << 0)
 
 struct io_uring_probe_op {
@@ -294,8 +381,41 @@ struct io_uring_probe {
 	struct io_uring_probe_op ops[0];
 };
 
-#ifdef __cplusplus
-}
-#endif
+struct io_uring_restriction {
+	__u16 opcode;
+	union {
+		__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+		__u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
+		__u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
+	};
+	__u8 resv;
+	__u32 resv2[3];
+};
+
+/*
+ * io_uring_restriction->opcode values
+ */
+enum {
+	/* Allow an io_uring_register(2) opcode */
+	IORING_RESTRICTION_REGISTER_OP		= 0,
+
+	/* Allow an sqe opcode */
+	IORING_RESTRICTION_SQE_OP		= 1,
+
+	/* Allow sqe flags */
+	IORING_RESTRICTION_SQE_FLAGS_ALLOWED	= 2,
+
+	/* Require sqe flags (these flags must be set on each submission) */
+	IORING_RESTRICTION_SQE_FLAGS_REQUIRED	= 3,
+
+	IORING_RESTRICTION_LAST
+};
+
+struct io_uring_getevents_arg {
+	__u64	sigmask;
+	__u32	sigmask_sz;
+	__u32	pad;
+	__u64	ts;
+};
 
 #endif
diff --git a/t/io_uring.c b/t/io_uring.c
index b79822d7..7bf215c7 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -384,6 +384,13 @@ static int io_uring_register_files(struct submitter *s)
 
 static int io_uring_setup(unsigned entries, struct io_uring_params *p)
 {
+	/*
+	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
+	 * than that.
+	 */
+	p->flags |= IORING_SETUP_CQSIZE;
+	p->cq_entries = entries;
+
 	return syscall(__NR_io_uring_setup, entries, p);
 }
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-20 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-20 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 5711325cbb37d10c21a6975d1f1ebea11799c05e:

  Makefile: Fix android compilation (2021-11-17 16:14:27 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to beda9d8d9e9148ff34eaa0eeb0cde19a36f47494:

  t/io_uring: add -R option for random/sequential IO (2021-11-19 10:44:15 -0700)

----------------------------------------------------------------
Damien Le Moal (1):
      fio: Introduce the log_entries option

Jens Axboe (2):
      t/io_uring: use internal random generator
      t/io_uring: add -R option for random/sequential IO

 HOWTO            | 12 ++++++++++++
 Makefile         |  3 +--
 cconv.c          |  2 ++
 fio.1            | 11 +++++++++++
 lib/rand.c       |  2 +-
 lib/rand.h       |  1 +
 options.c        | 12 ++++++++++++
 server.h         |  2 +-
 stat.c           | 12 +++++-------
 t/io_uring.c     | 34 +++++++++++++++++++++++++++-------
 thread_options.h |  2 ++
 11 files changed, 75 insertions(+), 18 deletions(-)

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO
index 196bca6c..a3b3acfe 100644
--- a/HOWTO
+++ b/HOWTO
@@ -3537,6 +3537,18 @@ Measurements and reporting
 	:option:`write_bw_log` for details about the filename format and `Log
 	File Formats`_ for how data is structured within the file.
 
+.. option:: log_entries=int
+
+	By default, fio will log an entry in the iops, latency, or bw log for
+	every I/O that completes. The initial number of I/O log entries is 1024.
+	When the log entries are all used, new log entries are dynamically
+	allocated.  This dynamic log entry allocation may negatively impact
+	time-related statistics such as I/O tail latencies (e.g. 99.9th percentile
+	completion latency). This option allows specifying a larger initial
+	number of log entries to avoid run-time allocations of new log entries,
+	resulting in more precise time-related I/O statistics.
+	Also see :option:`log_avg_msec`. Defaults to 1024.
+
 .. option:: log_avg_msec=int
 
 	By default, fio will log an entry in the iops, latency, or bw log for every
diff --git a/Makefile b/Makefile
index 04c1e0a7..5d17bcab 100644
--- a/Makefile
+++ b/Makefile
@@ -375,8 +375,7 @@ T_VS_PROGS = t/fio-verify-state
 T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
 T_PIPE_ASYNC_PROGS = t/read-to-pipe-async
 
-T_IOU_RING_OBJS = t/io_uring.o
-T_IOU_RING_OBJS += t/arch.o
+T_IOU_RING_OBJS = t/io_uring.o lib/rand.o lib/pattern.o lib/strntol.o
 T_IOU_RING_PROGS = t/io_uring
 
 T_MEMLOCK_OBJS = t/memlock.o
diff --git a/cconv.c b/cconv.c
index 2104308c..4f8d27eb 100644
--- a/cconv.c
+++ b/cconv.c
@@ -187,6 +187,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->rand_repeatable = le32_to_cpu(top->rand_repeatable);
 	o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable);
 	o->rand_seed = le64_to_cpu(top->rand_seed);
+	o->log_entries = le32_to_cpu(top->log_entries);
 	o->log_avg_msec = le32_to_cpu(top->log_avg_msec);
 	o->log_hist_msec = le32_to_cpu(top->log_hist_msec);
 	o->log_hist_coarseness = le32_to_cpu(top->log_hist_coarseness);
@@ -416,6 +417,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->rand_repeatable = cpu_to_le32(o->rand_repeatable);
 	top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable);
 	top->rand_seed = __cpu_to_le64(o->rand_seed);
+	top->log_entries = cpu_to_le32(o->log_entries);
 	top->log_avg_msec = cpu_to_le32(o->log_avg_msec);
 	top->log_max = cpu_to_le32(o->log_max);
 	top->log_offset = cpu_to_le32(o->log_offset);
diff --git a/fio.1 b/fio.1
index e3c3feae..a6469541 100644
--- a/fio.1
+++ b/fio.1
@@ -3243,6 +3243,17 @@ logging (see \fBlog_avg_msec\fR) has been enabled. See
 \fBwrite_bw_log\fR for details about the filename format and \fBLOG
 FILE FORMATS\fR for how data is structured within the file.
 .TP
+.BI log_entries \fR=\fPint
+By default, fio will log an entry in the iops, latency, or bw log for
+every I/O that completes. The initial number of I/O log entries is 1024.
+When the log entries are all used, new log entries are dynamically
+allocated.  This dynamic log entry allocation may negatively impact
+time-related statistics such as I/O tail latencies (e.g. 99.9th percentile
+completion latency). This option allows specifying a larger initial
+number of log entries to avoid run-time allocation of new log entries,
+resulting in more precise time-related I/O statistics.
+Also see \fBlog_avg_msec\fR as well. Defaults to 1024.
+.TP
 .BI log_avg_msec \fR=\fPint
 By default, fio will log an entry in the iops, latency, or bw log for every
 I/O that completes. When writing to the disk log, that can quickly grow to a
diff --git a/lib/rand.c b/lib/rand.c
index e74da609..6e893e80 100644
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -59,7 +59,7 @@ static void __init_rand32(struct taus88_state *state, unsigned int seed)
 		__rand32(state);
 }
 
-static void __init_rand64(struct taus258_state *state, uint64_t seed)
+void __init_rand64(struct taus258_state *state, uint64_t seed)
 {
 	int cranks = 6;
 
diff --git a/lib/rand.h b/lib/rand.h
index a8060045..2b4be788 100644
--- a/lib/rand.h
+++ b/lib/rand.h
@@ -162,6 +162,7 @@ static inline uint64_t __get_next_seed(struct frand_state *fs)
 
 extern void init_rand(struct frand_state *, bool);
 extern void init_rand_seed(struct frand_state *, uint64_t seed, bool);
+void __init_rand64(struct taus258_state *state, uint64_t seed);
 extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed);
 extern uint64_t fill_random_buf(struct frand_state *, void *buf, unsigned int len);
 extern void __fill_random_buf_percentage(uint64_t, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
diff --git a/options.c b/options.c
index 460cf4ff..102bcf56 100644
--- a/options.c
+++ b/options.c
@@ -4244,6 +4244,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
 	},
+	{
+		.name	= "log_entries",
+		.lname	= "Log entries",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, log_entries),
+		.help	= "Initial number of entries in a job IO log",
+		.def	= __fio_stringify(DEF_LOG_ENTRIES),
+		.minval	= DEF_LOG_ENTRIES,
+		.maxval	= MAX_LOG_ENTRIES,
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
 	{
 		.name	= "log_avg_msec",
 		.lname	= "Log averaging (msec)",
diff --git a/server.h b/server.h
index 44b8da12..25b6bbdc 100644
--- a/server.h
+++ b/server.h
@@ -48,7 +48,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-	FIO_SERVER_VER			= 94,
+	FIO_SERVER_VER			= 95,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
diff --git a/stat.c b/stat.c
index cd35b114..e0dc99b6 100644
--- a/stat.c
+++ b/stat.c
@@ -2688,27 +2688,25 @@ static inline void add_stat_sample(struct io_stat *is, unsigned long long data)
  */
 static struct io_logs *get_new_log(struct io_log *iolog)
 {
-	size_t new_size, new_samples;
+	size_t new_samples;
 	struct io_logs *cur_log;
 
 	/*
 	 * Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling
 	 * forever
 	 */
-	if (!iolog->cur_log_max)
-		new_samples = DEF_LOG_ENTRIES;
-	else {
+	if (!iolog->cur_log_max) {
+		new_samples = iolog->td->o.log_entries;
+	} else {
 		new_samples = iolog->cur_log_max * 2;
 		if (new_samples > MAX_LOG_ENTRIES)
 			new_samples = MAX_LOG_ENTRIES;
 	}
 
-	new_size = new_samples * log_entry_sz(iolog);
-
 	cur_log = smalloc(sizeof(*cur_log));
 	if (cur_log) {
 		INIT_FLIST_HEAD(&cur_log->list);
-		cur_log->log = malloc(new_size);
+		cur_log->log = calloc(new_samples, log_entry_sz(iolog));
 		if (cur_log->log) {
 			cur_log->nr_samples = 0;
 			cur_log->max_samples = new_samples;
diff --git a/t/io_uring.c b/t/io_uring.c
index f758a6d9..b79822d7 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -28,6 +28,7 @@
 #include "../arch/arch.h"
 #include "../lib/types.h"
 #include "../lib/roundup.h"
+#include "../lib/rand.h"
 #include "../minmax.h"
 #include "../os/linux/io_uring.h"
 
@@ -59,6 +60,8 @@ static unsigned sq_ring_mask, cq_ring_mask;
 
 struct file {
 	unsigned long max_blocks;
+	unsigned long max_size;
+	unsigned long cur_off;
 	unsigned pending_ios;
 	int real_fd;
 	int fixed_fd;
@@ -86,6 +89,8 @@ struct submitter {
 
 	__s32 *fds;
 
+	struct taus258_state rand_state;
+
 	unsigned long *clock_batch;
 	int clock_index;
 	unsigned long *plat;
@@ -120,7 +125,8 @@ static int do_nop = 0;		/* no-op SQ ring commands */
 static int nthreads = 1;
 static int stats = 0;		/* generate IO stats */
 static int aio = 0;		/* use libaio */
-static int runtime = 0;	/* runtime */
+static int runtime = 0;		/* runtime */
+static int random_io = 1;	/* random or sequential IO */
 
 static unsigned long tsc_rate;
 
@@ -448,8 +454,15 @@ static void init_io(struct submitter *s, unsigned index)
 	}
 	f->pending_ios++;
 
-	r = lrand48();
-	offset = (r % (f->max_blocks - 1)) * bs;
+	if (random_io) {
+		r = __rand64(&s->rand_state);
+		offset = (r % (f->max_blocks - 1)) * bs;
+	} else {
+		offset = f->cur_off;
+		f->cur_off += bs;
+		if (f->cur_off + bs > f->max_size)
+			f->cur_off = 0;
+	}
 
 	if (register_files) {
 		sqe->flags = IOSQE_FIXED_FILE;
@@ -517,9 +530,11 @@ static int get_file_size(struct file *f)
 			return -1;
 
 		f->max_blocks = bytes / bs;
+		f->max_size = bytes;
 		return 0;
 	} else if (S_ISREG(st.st_mode)) {
 		f->max_blocks = st.st_size / bs;
+		f->max_size = st.st_size;
 		return 0;
 	}
 
@@ -586,6 +601,7 @@ static int submitter_init(struct submitter *s)
 	s->tid = gettid();
 	printf("submitter=%d, tid=%d\n", s->index, s->tid);
 
+	__init_rand64(&s->rand_state, pthread_self());
 	srand48(pthread_self());
 
 	for (i = 0; i < MAX_FDS; i++)
@@ -1066,11 +1082,12 @@ static void usage(char *argv, int status)
 		" -N <bool> : Perform just no-op requests, default %d\n"
 		" -t <bool> : Track IO latencies, default %d\n"
 		" -T <int>  : TSC rate in HZ\n"
-		" -a <bool> : Use legacy aio, default %d\n"
-		" -r <int>  : Runtime in seconds, default %s\n",
+		" -r <int>  : Runtime in seconds, default %s\n"
+		" -R <bool> : Use random IO, default %d\n"
+		" -a <bool> : Use legacy aio, default %d\n",
 		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
 		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
-		stats, aio, runtime == 0 ? "unlimited" : runtime_str);
+		stats, runtime == 0 ? "unlimited" : runtime_str, aio, random_io);
 	exit(status);
 }
 
@@ -1130,7 +1147,7 @@ int main(int argc, char *argv[])
 	if (!do_nop && argc < 2)
 		usage(argv[0], 1);
 
-	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:h?")) != -1) {
+	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:h?")) != -1) {
 		switch (opt) {
 		case 'a':
 			aio = !!atoi(optarg);
@@ -1194,6 +1211,9 @@ int main(int argc, char *argv[])
 		case 'D':
 			dma_map = !!atoi(optarg);
 			break;
+		case 'R':
+			random_io = !!atoi(optarg);
+			break;
 		case 'h':
 		case '?':
 		default:
diff --git a/thread_options.h b/thread_options.h
index 6e1a2cdd..8f4c8a59 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -377,6 +377,7 @@ struct thread_options {
 	fio_fp64_t zrt;
 	fio_fp64_t zrf;
 
+	unsigned int log_entries;
 	unsigned int log_prio;
 };
 
@@ -683,6 +684,7 @@ struct thread_options_pack {
 	int32_t max_open_zones;
 	uint32_t ignore_zone_limits;
 
+	uint32_t log_entries;
 	uint32_t log_prio;
 } __attribute__((packed));
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-18 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-18 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit f7c3f31db877d30056d19761e48499f5b0bfa0b6:

  Merge branch 'jf_readme_typo' of https://github.com/jfpanisset/fio (2021-11-12 09:22:21 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 5711325cbb37d10c21a6975d1f1ebea11799c05e:

  Makefile: Fix android compilation (2021-11-17 16:14:27 -0700)

----------------------------------------------------------------
Gwendal Grignou (1):
      Makefile: Fix android compilation

 Makefile | 1 +
 1 file changed, 1 insertion(+)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index e9028dce..04c1e0a7 100644
--- a/Makefile
+++ b/Makefile
@@ -236,6 +236,7 @@ endif
 ifeq ($(CONFIG_TARGET_OS), Android)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
 		oslib/linux-dev-lookup.c engines/io_uring.c
+  cmdprio_SRCS = engines/cmdprio.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
 endif

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-13 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-13 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit 6619fc32c413c4ff3a24c819037fb9227af3f876:

  stat: create a init_thread_stat_min_vals() helper (2021-11-08 06:24:48 -0700)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to f7c3f31db877d30056d19761e48499f5b0bfa0b6:

  Merge branch 'jf_readme_typo' of https://github.com/jfpanisset/fio (2021-11-12 09:22:21 -0700)

----------------------------------------------------------------
Jean-Francois Panisset (1):
      Small typo fix

Jens Axboe (1):
      Merge branch 'jf_readme_typo' of https://github.com/jfpanisset/fio

Niklas Cassel (8):
      docs: update cmdprio_percentage documentation
      cmdprio: move cmdprio function definitions to a new cmdprio.c file
      cmdprio: do not allocate memory for unused data direction
      io_uring: set async IO priority to td->ioprio in fio_ioring_prep()
      libaio,io_uring: rename prio_prep() to include cmdprio in the name
      libaio,io_uring: move common cmdprio_prep() code to cmdprio
      cmdprio: add mode to make the logic easier to reason about
      libaio,io_uring: make it possible to cleanup cmdprio malloced data

 HOWTO              |   5 +-
 Makefile           |   6 ++
 README             |   2 +-
 engines/cmdprio.c  | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 engines/cmdprio.h  | 150 ++++++---------------------------
 engines/io_uring.c | 100 ++++++++--------------
 engines/libaio.c   |  72 +++++-----------
 fio.1              |   3 +-
 8 files changed, 333 insertions(+), 248 deletions(-)
 create mode 100644 engines/cmdprio.c

---

Diff of recent changes:

diff --git a/HOWTO b/HOWTO
index 297a0485..196bca6c 100644
--- a/HOWTO
+++ b/HOWTO
@@ -2167,9 +2167,8 @@ with the caveat that when used on the command line, they must come after the
 
     Set the percentage of I/O that will be issued with the highest priority.
     Default: 0. A single value applies to reads and writes. Comma-separated
-    values may be specified for reads and writes. This option cannot be used
-    with the :option:`prio` or :option:`prioclass` options. For this option
-    to be effective, NCQ priority must be supported and enabled, and `direct=1'
+    values may be specified for reads and writes. For this option to be
+    effective, NCQ priority must be supported and enabled, and `direct=1'
     option must be used. fio must also be run as the root user.
 
 .. option:: cmdprio_class=int[,int] : [io_uring] [libaio]
diff --git a/Makefile b/Makefile
index 4ae5a371..e9028dce 100644
--- a/Makefile
+++ b/Makefile
@@ -98,6 +98,7 @@ else ifdef CONFIG_32BIT
 endif
 ifdef CONFIG_LIBAIO
   libaio_SRCS = engines/libaio.c
+  cmdprio_SRCS = engines/cmdprio.c
   libaio_LIBS = -laio
   ENGINES += libaio
 endif
@@ -225,6 +226,7 @@ endif
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
 		oslib/linux-dev-lookup.c engines/io_uring.c
+  cmdprio_SRCS = engines/cmdprio.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
 endif
@@ -281,6 +283,10 @@ ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
   FIO_CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format
 endif
 
+ifdef cmdprio_SRCS
+  SOURCE += $(cmdprio_SRCS)
+endif
+
 ifdef CONFIG_DYNAMIC_ENGINES
  DYNAMIC_ENGS := $(ENGINES)
 define engine_template =
diff --git a/README b/README
index 52eca5c3..d566fae3 100644
--- a/README
+++ b/README
@@ -10,7 +10,7 @@ tailored test case again and again.
 
 A test work load is difficult to define, though. There can be any number of
 processes or threads involved, and they can each be using their own way of
-generating I/O. You could have someone dirtying large amounts of memory in an
+generating I/O. You could have someone dirtying large amounts of memory in a
 memory mapped file, or maybe several threads issuing reads using asynchronous
 I/O. fio needed to be flexible enough to simulate both of these cases, and many
 more.
diff --git a/engines/cmdprio.c b/engines/cmdprio.c
new file mode 100644
index 00000000..92b752ae
--- /dev/null
+++ b/engines/cmdprio.c
@@ -0,0 +1,243 @@
+/*
+ * IO priority handling helper functions common to the libaio and io_uring
+ * engines.
+ */
+
+#include "cmdprio.h"
+
+static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
+				    enum fio_ddir ddir, char *str, bool data)
+{
+	struct cmdprio *cmdprio = cb_arg;
+	struct split split;
+	unsigned int i;
+
+	if (ddir == DDIR_TRIM)
+		return 0;
+
+	memset(&split, 0, sizeof(split));
+
+	if (split_parse_ddir(to, &split, str, data, BSSPLIT_MAX))
+		return 1;
+	if (!split.nr)
+		return 0;
+
+	cmdprio->bssplit_nr[ddir] = split.nr;
+	cmdprio->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit));
+	if (!cmdprio->bssplit[ddir])
+		return 1;
+
+	for (i = 0; i < split.nr; i++) {
+		cmdprio->bssplit[ddir][i].bs = split.val1[i];
+		if (split.val2[i] == -1U) {
+			cmdprio->bssplit[ddir][i].perc = 0;
+		} else {
+			if (split.val2[i] > 100)
+				cmdprio->bssplit[ddir][i].perc = 100;
+			else
+				cmdprio->bssplit[ddir][i].perc = split.val2[i];
+		}
+	}
+
+	return 0;
+}
+
+int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
+			      struct cmdprio *cmdprio)
+{
+	char *str, *p;
+	int ret = 0;
+
+	p = str = strdup(input);
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, cmdprio,
+			      false);
+
+	free(p);
+	return ret;
+}
+
+static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
+{
+	enum fio_ddir ddir = io_u->ddir;
+	struct cmdprio_options *options = cmdprio->options;
+	int i;
+
+	switch (cmdprio->mode) {
+	case CMDPRIO_MODE_PERC:
+		return options->percentage[ddir];
+	case CMDPRIO_MODE_BSSPLIT:
+		for (i = 0; i < cmdprio->bssplit_nr[ddir]; i++) {
+			if (cmdprio->bssplit[ddir][i].bs == io_u->buflen)
+				return cmdprio->bssplit[ddir][i].perc;
+		}
+		break;
+	default:
+		/*
+		 * An I/O engine should never call this function if cmdprio
+		 * is not in use.
+		 */
+		assert(0);
+	}
+
+	return 0;
+}
+
+/**
+ * fio_cmdprio_set_ioprio - Set an io_u ioprio according to cmdprio options
+ *
+ * Generates a random percentage value to determine if an io_u ioprio needs
+ * to be set. If the random percentage value is within the user specified
+ * percentage of I/Os that should use a cmdprio priority value (rather than
+ * the default priority), then this function updates the io_u with an ioprio
+ * value as defined by the cmdprio/cmdprio_class or cmdprio_bssplit options.
+ *
+ * Return true if the io_u ioprio was changed and false otherwise.
+ */
+bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
+			    struct io_u *io_u)
+{
+	enum fio_ddir ddir = io_u->ddir;
+	struct cmdprio_options *options = cmdprio->options;
+	unsigned int p;
+	unsigned int cmdprio_value =
+		ioprio_value(options->class[ddir], options->level[ddir]);
+
+	p = fio_cmdprio_percentage(cmdprio, io_u);
+	if (p && rand_between(&td->prio_state, 0, 99) < p) {
+		io_u->ioprio = cmdprio_value;
+		if (!td->ioprio || cmdprio_value < td->ioprio) {
+			/*
+			 * The async IO priority is higher (has a lower value)
+			 * than the default priority (which is either 0 or the
+			 * value set by "prio" and "prioclass" options).
+			 */
+			io_u->flags |= IO_U_F_HIGH_PRIO;
+		}
+		return true;
+	}
+
+	if (td->ioprio && td->ioprio < cmdprio_value) {
+		/*
+		 * The IO will be executed with the default priority (which is
+		 * either 0 or the value set by "prio" and "prioclass" options),
+		 * and this priority is higher (has a lower value) than the
+		 * async IO priority.
+		 */
+		io_u->flags |= IO_U_F_HIGH_PRIO;
+	}
+
+	return false;
+}
+
+static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td,
+					     struct cmdprio *cmdprio)
+{
+	struct cmdprio_options *options = cmdprio->options;
+	int ret;
+
+	ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str, cmdprio);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	fio_cmdprio_cleanup(cmdprio);
+
+	return ret;
+}
+
+static int fio_cmdprio_parse_and_gen(struct thread_data *td,
+				     struct cmdprio *cmdprio)
+{
+	struct cmdprio_options *options = cmdprio->options;
+	int i, ret;
+
+	switch (cmdprio->mode) {
+	case CMDPRIO_MODE_BSSPLIT:
+		ret = fio_cmdprio_parse_and_gen_bssplit(td, cmdprio);
+		break;
+	case CMDPRIO_MODE_PERC:
+		ret = 0;
+		break;
+	default:
+		assert(0);
+		return 1;
+	}
+
+	/*
+	 * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
+	 * is not set, default to RT priority class.
+	 */
+	for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) {
+		if (options->percentage[i] || cmdprio->bssplit_nr[i]) {
+			if (!options->class[i])
+				options->class[i] = IOPRIO_CLASS_RT;
+		}
+	}
+
+	return ret;
+}
+
+void fio_cmdprio_cleanup(struct cmdprio *cmdprio)
+{
+	int ddir;
+
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+		free(cmdprio->bssplit[ddir]);
+		cmdprio->bssplit[ddir] = NULL;
+		cmdprio->bssplit_nr[ddir] = 0;
+	}
+
+	/*
+	 * options points to a cmdprio_options struct that is part of td->eo.
+	 * td->eo itself will be freed by free_ioengine().
+	 */
+	cmdprio->options = NULL;
+}
+
+int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
+		     struct cmdprio_options *options)
+{
+	struct thread_options *to = &td->o;
+	bool has_cmdprio_percentage = false;
+	bool has_cmdprio_bssplit = false;
+	int i;
+
+	cmdprio->options = options;
+
+	if (options->bssplit_str && strlen(options->bssplit_str))
+		has_cmdprio_bssplit = true;
+
+	for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) {
+		if (options->percentage[i])
+			has_cmdprio_percentage = true;
+	}
+
+	/*
+	 * Check for option conflicts
+	 */
+	if (has_cmdprio_percentage && has_cmdprio_bssplit) {
+		log_err("%s: cmdprio_percentage and cmdprio_bssplit options "
+			"are mutually exclusive\n",
+			to->name);
+		return 1;
+	}
+
+	if (has_cmdprio_bssplit)
+		cmdprio->mode = CMDPRIO_MODE_BSSPLIT;
+	else if (has_cmdprio_percentage)
+		cmdprio->mode = CMDPRIO_MODE_PERC;
+	else
+		cmdprio->mode = CMDPRIO_MODE_NONE;
+
+	/* Nothing left to do if cmdprio is not used */
+	if (cmdprio->mode == CMDPRIO_MODE_NONE)
+		return 0;
+
+	return fio_cmdprio_parse_and_gen(td, cmdprio);
+}
diff --git a/engines/cmdprio.h b/engines/cmdprio.h
index 0edc4365..0c7bd6cf 100644
--- a/engines/cmdprio.h
+++ b/engines/cmdprio.h
@@ -8,137 +8,35 @@
 
 #include "../fio.h"
 
-struct cmdprio {
-	unsigned int percentage[DDIR_RWDIR_CNT];
-	unsigned int class[DDIR_RWDIR_CNT];
-	unsigned int level[DDIR_RWDIR_CNT];
-	unsigned int bssplit_nr[DDIR_RWDIR_CNT];
-	struct bssplit *bssplit[DDIR_RWDIR_CNT];
-};
-
-static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
-				    enum fio_ddir ddir, char *str, bool data)
-{
-	struct cmdprio *cmdprio = cb_arg;
-	struct split split;
-	unsigned int i;
-
-	if (ddir == DDIR_TRIM)
-		return 0;
-
-	memset(&split, 0, sizeof(split));
-
-	if (split_parse_ddir(to, &split, str, data, BSSPLIT_MAX))
-		return 1;
-	if (!split.nr)
-		return 0;
-
-	cmdprio->bssplit_nr[ddir] = split.nr;
-	cmdprio->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit));
-	if (!cmdprio->bssplit[ddir])
-		return 1;
-
-	for (i = 0; i < split.nr; i++) {
-		cmdprio->bssplit[ddir][i].bs = split.val1[i];
-		if (split.val2[i] == -1U) {
-			cmdprio->bssplit[ddir][i].perc = 0;
-		} else {
-			if (split.val2[i] > 100)
-				cmdprio->bssplit[ddir][i].perc = 100;
-			else
-				cmdprio->bssplit[ddir][i].perc = split.val2[i];
-		}
-	}
-
-	return 0;
-}
-
-static int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
-				     struct cmdprio *cmdprio)
-{
-	char *str, *p;
-	int i, ret = 0;
-
-	p = str = strdup(input);
+/* reads and writes only, no trim */
+#define CMDPRIO_RWDIR_CNT 2
 
-	strip_blank_front(&str);
-	strip_blank_end(str);
-
-	ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, cmdprio, false);
-
-	if (parse_dryrun()) {
-		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-			free(cmdprio->bssplit[i]);
-			cmdprio->bssplit[i] = NULL;
-			cmdprio->bssplit_nr[i] = 0;
-		}
-	}
-
-	free(p);
-	return ret;
-}
-
-static inline int fio_cmdprio_percentage(struct cmdprio *cmdprio,
-					 struct io_u *io_u)
-{
-	enum fio_ddir ddir = io_u->ddir;
-	unsigned int p = cmdprio->percentage[ddir];
-	int i;
-
-	/*
-	 * If cmdprio_percentage option was specified, then use that
-	 * percentage. Otherwise, use cmdprio_bssplit percentages depending
-	 * on the IO size.
-	 */
-	if (p)
-		return p;
-
-	for (i = 0; i < cmdprio->bssplit_nr[ddir]; i++) {
-		if (cmdprio->bssplit[ddir][i].bs == io_u->buflen)
-			return cmdprio->bssplit[ddir][i].perc;
-	}
-
-	return 0;
-}
+enum {
+	CMDPRIO_MODE_NONE,
+	CMDPRIO_MODE_PERC,
+	CMDPRIO_MODE_BSSPLIT,
+};
 
-static int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
-			    bool *has_cmdprio)
-{
-	struct thread_options *to = &td->o;
-	bool has_cmdprio_percentage = false;
-	bool has_cmdprio_bssplit = false;
-	int i;
+struct cmdprio_options {
+	unsigned int percentage[CMDPRIO_RWDIR_CNT];
+	unsigned int class[CMDPRIO_RWDIR_CNT];
+	unsigned int level[CMDPRIO_RWDIR_CNT];
+	char *bssplit_str;
+};
 
-	/*
-	 * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
-	 * is not set, default to RT priority class.
-	 */
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		if (cmdprio->percentage[i]) {
-			if (!cmdprio->class[i])
-				cmdprio->class[i] = IOPRIO_CLASS_RT;
-			has_cmdprio_percentage = true;
-		}
-		if (cmdprio->bssplit_nr[i]) {
-			if (!cmdprio->class[i])
-				cmdprio->class[i] = IOPRIO_CLASS_RT;
-			has_cmdprio_bssplit = true;
-		}
-	}
+struct cmdprio {
+	struct cmdprio_options *options;
+	unsigned int bssplit_nr[CMDPRIO_RWDIR_CNT];
+	struct bssplit *bssplit[CMDPRIO_RWDIR_CNT];
+	unsigned int mode;
+};
 
-	/*
-	 * Check for option conflicts
-	 */
-	if (has_cmdprio_percentage && has_cmdprio_bssplit) {
-		log_err("%s: cmdprio_percentage and cmdprio_bssplit options "
-			"are mutually exclusive\n",
-			to->name);
-		return 1;
-	}
+bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
+			    struct io_u *io_u);
 
-	*has_cmdprio = has_cmdprio_percentage || has_cmdprio_bssplit;
+void fio_cmdprio_cleanup(struct cmdprio *cmdprio);
 
-	return 0;
-}
+int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
+		     struct cmdprio_options *options);
 
 #endif
diff --git a/engines/io_uring.c b/engines/io_uring.c
index 27a4a678..8b8f35f1 100644
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -69,13 +69,13 @@ struct ioring_data {
 
 	struct ioring_mmap mmap[3];
 
-	bool use_cmdprio;
+	struct cmdprio cmdprio;
 };
 
 struct ioring_options {
 	struct thread_data *td;
 	unsigned int hipri;
-	struct cmdprio cmdprio;
+	struct cmdprio_options cmdprio_options;
 	unsigned int fixedbufs;
 	unsigned int registerfiles;
 	unsigned int sqpoll_thread;
@@ -106,15 +106,6 @@ static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val)
 	return 0;
 }
 
-static int str_cmdprio_bssplit_cb(void *data, const char *input)
-{
-	struct ioring_options *o = data;
-	struct thread_data *td = o->td;
-	struct cmdprio *cmdprio = &o->cmdprio;
-
-	return fio_cmdprio_bssplit_parse(td, input, cmdprio);
-}
-
 static struct fio_option options[] = {
 	{
 		.name	= "hipri",
@@ -131,9 +122,9 @@ static struct fio_option options[] = {
 		.lname	= "high priority percentage",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct ioring_options,
-				   cmdprio.percentage[DDIR_READ]),
+				   cmdprio_options.percentage[DDIR_READ]),
 		.off2	= offsetof(struct ioring_options,
-				   cmdprio.percentage[DDIR_WRITE]),
+				   cmdprio_options.percentage[DDIR_WRITE]),
 		.minval	= 0,
 		.maxval	= 100,
 		.help	= "Send high priority I/O this percentage of the time",
@@ -145,9 +136,9 @@ static struct fio_option options[] = {
 		.lname	= "Asynchronous I/O priority class",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct ioring_options,
-				   cmdprio.class[DDIR_READ]),
+				   cmdprio_options.class[DDIR_READ]),
 		.off2	= offsetof(struct ioring_options,
-				   cmdprio.class[DDIR_WRITE]),
+				   cmdprio_options.class[DDIR_WRITE]),
 		.help	= "Set asynchronous IO priority class",
 		.minval	= IOPRIO_MIN_PRIO_CLASS + 1,
 		.maxval	= IOPRIO_MAX_PRIO_CLASS,
@@ -160,9 +151,9 @@ static struct fio_option options[] = {
 		.lname	= "Asynchronous I/O priority level",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct ioring_options,
-				   cmdprio.level[DDIR_READ]),
+				   cmdprio_options.level[DDIR_READ]),
 		.off2	= offsetof(struct ioring_options,
-				   cmdprio.level[DDIR_WRITE]),
+				   cmdprio_options.level[DDIR_WRITE]),
 		.help	= "Set asynchronous IO priority level",
 		.minval	= IOPRIO_MIN_PRIO,
 		.maxval	= IOPRIO_MAX_PRIO,
@@ -173,9 +164,9 @@ static struct fio_option options[] = {
 	{
 		.name   = "cmdprio_bssplit",
 		.lname  = "Priority percentage block size split",
-		.type   = FIO_OPT_STR_ULL,
-		.cb     = str_cmdprio_bssplit_cb,
-		.off1   = offsetof(struct ioring_options, cmdprio.bssplit),
+		.type   = FIO_OPT_STR_STORE,
+		.off1   = offsetof(struct ioring_options,
+				   cmdprio_options.bssplit_str),
 		.help   = "Set priority percentages for different block sizes",
 		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_IOURING,
@@ -338,6 +329,18 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
 			sqe->rw_flags |= RWF_UNCACHED;
 		if (o->nowait)
 			sqe->rw_flags |= RWF_NOWAIT;
+
+		/*
+		 * Since io_uring can have a submission context (sqthread_poll)
+		 * that is different from the process context, we cannot rely on
+		 * the IO priority set by ioprio_set() (option prio/prioclass)
+		 * to be inherited.
+		 * td->ioprio will have the value of the "default prio", so set
+		 * this unconditionally. This value might get overridden by
+		 * fio_ioring_cmdprio_prep() if the option cmdprio_percentage or
+		 * cmdprio_bssplit is used.
+		 */
+		sqe->ioprio = td->ioprio;
 		sqe->off = io_u->offset;
 	} else if (ddir_sync(io_u->ddir)) {
 		sqe->ioprio = 0;
@@ -444,41 +447,14 @@ static int fio_ioring_getevents(struct thread_data *td, unsigned int min,
 	return r < 0 ? r : events;
 }
 
-static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u)
+static inline void fio_ioring_cmdprio_prep(struct thread_data *td,
+					   struct io_u *io_u)
 {
-	struct ioring_options *o = td->eo;
 	struct ioring_data *ld = td->io_ops_data;
-	struct io_uring_sqe *sqe = &ld->sqes[io_u->index];
-	struct cmdprio *cmdprio = &o->cmdprio;
-	enum fio_ddir ddir = io_u->ddir;
-	unsigned int p = fio_cmdprio_percentage(cmdprio, io_u);
-	unsigned int cmdprio_value =
-		ioprio_value(cmdprio->class[ddir], cmdprio->level[ddir]);
-
-	if (p && rand_between(&td->prio_state, 0, 99) < p) {
-		sqe->ioprio = cmdprio_value;
-		if (!td->ioprio || cmdprio_value < td->ioprio) {
-			/*
-			 * The async IO priority is higher (has a lower value)
-			 * than the priority set by "prio" and "prioclass"
-			 * options.
-			 */
-			io_u->flags |= IO_U_F_HIGH_PRIO;
-		}
-	} else {
-		sqe->ioprio = td->ioprio;
-		if (cmdprio_value && td->ioprio && td->ioprio < cmdprio_value) {
-			/*
-			 * The IO will be executed with the priority set by
-			 * "prio" and "prioclass" options, and this priority
-			 * is higher (has a lower value) than the async IO
-			 * priority.
-			 */
-			io_u->flags |= IO_U_F_HIGH_PRIO;
-		}
-	}
+	struct cmdprio *cmdprio = &ld->cmdprio;
 
-	io_u->ioprio = sqe->ioprio;
+	if (fio_cmdprio_set_ioprio(td, cmdprio, io_u))
+		ld->sqes[io_u->index].ioprio = io_u->ioprio;
 }
 
 static enum fio_q_status fio_ioring_queue(struct thread_data *td,
@@ -508,8 +484,9 @@ static enum fio_q_status fio_ioring_queue(struct thread_data *td,
 	if (next_tail == atomic_load_acquire(ring->head))
 		return FIO_Q_BUSY;
 
-	if (ld->use_cmdprio)
-		fio_ioring_prio_prep(td, io_u);
+	if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
+		fio_ioring_cmdprio_prep(td, io_u);
+
 	ring->array[tail & ld->sq_ring_mask] = io_u->index;
 	atomic_store_release(ring->tail, next_tail);
 
@@ -613,6 +590,7 @@ static void fio_ioring_cleanup(struct thread_data *td)
 		if (!(td->flags & TD_F_CHILD))
 			fio_ioring_unmap(ld);
 
+		fio_cmdprio_cleanup(&ld->cmdprio);
 		free(ld->io_u_index);
 		free(ld->iovecs);
 		free(ld->fds);
@@ -819,8 +797,6 @@ static int fio_ioring_init(struct thread_data *td)
 {
 	struct ioring_options *o = td->eo;
 	struct ioring_data *ld;
-	struct cmdprio *cmdprio = &o->cmdprio;
-	bool has_cmdprio = false;
 	int ret;
 
 	/* sqthread submission requires registered files */
@@ -845,22 +821,12 @@ static int fio_ioring_init(struct thread_data *td)
 
 	td->io_ops_data = ld;
 
-	ret = fio_cmdprio_init(td, cmdprio, &has_cmdprio);
+	ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
 	if (ret) {
 		td_verror(td, EINVAL, "fio_ioring_init");
 		return 1;
 	}
 
-	/*
-	 * Since io_uring can have a submission context (sqthread_poll) that is
-	 * different from the process context, we cannot rely on the the IO
-	 * priority set by ioprio_set() (option prio/prioclass) to be inherited.
-	 * Therefore, we set the sqe->ioprio field when prio/prioclass is used.
-	 */
-	ld->use_cmdprio = has_cmdprio ||
-		fio_option_is_set(&td->o, ioprio_class) ||
-		fio_option_is_set(&td->o, ioprio);
-
 	return 0;
 }
 
diff --git a/engines/libaio.c b/engines/libaio.c
index dd655355..9c278d06 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -52,25 +52,16 @@ struct libaio_data {
 	unsigned int head;
 	unsigned int tail;
 
-	bool use_cmdprio;
+	struct cmdprio cmdprio;
 };
 
 struct libaio_options {
 	struct thread_data *td;
 	unsigned int userspace_reap;
-	struct cmdprio cmdprio;
+	struct cmdprio_options cmdprio_options;
 	unsigned int nowait;
 };
 
-static int str_cmdprio_bssplit_cb(void *data, const char *input)
-{
-	struct libaio_options *o = data;
-	struct thread_data *td = o->td;
-	struct cmdprio *cmdprio = &o->cmdprio;
-
-	return fio_cmdprio_bssplit_parse(td, input, cmdprio);
-}
-
 static struct fio_option options[] = {
 	{
 		.name	= "userspace_reap",
@@ -87,9 +78,9 @@ static struct fio_option options[] = {
 		.lname	= "high priority percentage",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct libaio_options,
-				   cmdprio.percentage[DDIR_READ]),
+				   cmdprio_options.percentage[DDIR_READ]),
 		.off2	= offsetof(struct libaio_options,
-				   cmdprio.percentage[DDIR_WRITE]),
+				   cmdprio_options.percentage[DDIR_WRITE]),
 		.minval	= 0,
 		.maxval	= 100,
 		.help	= "Send high priority I/O this percentage of the time",
@@ -101,9 +92,9 @@ static struct fio_option options[] = {
 		.lname	= "Asynchronous I/O priority class",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct libaio_options,
-				   cmdprio.class[DDIR_READ]),
+				   cmdprio_options.class[DDIR_READ]),
 		.off2	= offsetof(struct libaio_options,
-				   cmdprio.class[DDIR_WRITE]),
+				   cmdprio_options.class[DDIR_WRITE]),
 		.help	= "Set asynchronous IO priority class",
 		.minval	= IOPRIO_MIN_PRIO_CLASS + 1,
 		.maxval	= IOPRIO_MAX_PRIO_CLASS,
@@ -116,9 +107,9 @@ static struct fio_option options[] = {
 		.lname	= "Asynchronous I/O priority level",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct libaio_options,
-				   cmdprio.level[DDIR_READ]),
+				   cmdprio_options.level[DDIR_READ]),
 		.off2	= offsetof(struct libaio_options,
-				   cmdprio.level[DDIR_WRITE]),
+				   cmdprio_options.level[DDIR_WRITE]),
 		.help	= "Set asynchronous IO priority level",
 		.minval	= IOPRIO_MIN_PRIO,
 		.maxval	= IOPRIO_MAX_PRIO,
@@ -129,9 +120,9 @@ static struct fio_option options[] = {
 	{
 		.name   = "cmdprio_bssplit",
 		.lname  = "Priority percentage block size split",
-		.type   = FIO_OPT_STR_ULL,
-		.cb     = str_cmdprio_bssplit_cb,
-		.off1   = offsetof(struct libaio_options, cmdprio.bssplit),
+		.type   = FIO_OPT_STR_STORE,
+		.off1   = offsetof(struct libaio_options,
+				   cmdprio_options.bssplit_str),
 		.help   = "Set priority percentages for different block sizes",
 		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_LIBAIO,
@@ -205,33 +196,15 @@ static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u)
 	return 0;
 }
 
-static void fio_libaio_prio_prep(struct thread_data *td, struct io_u *io_u)
+static inline void fio_libaio_cmdprio_prep(struct thread_data *td,
+					   struct io_u *io_u)
 {
-	struct libaio_options *o = td->eo;
-	struct cmdprio *cmdprio = &o->cmdprio;
-	enum fio_ddir ddir = io_u->ddir;
-	unsigned int p = fio_cmdprio_percentage(cmdprio, io_u);
-	unsigned int cmdprio_value =
-		ioprio_value(cmdprio->class[ddir], cmdprio->level[ddir]);
-
-	if (p && rand_between(&td->prio_state, 0, 99) < p) {
-		io_u->ioprio = cmdprio_value;
-		io_u->iocb.aio_reqprio = cmdprio_value;
+	struct libaio_data *ld = td->io_ops_data;
+	struct cmdprio *cmdprio = &ld->cmdprio;
+
+	if (fio_cmdprio_set_ioprio(td, cmdprio, io_u)) {
+		io_u->iocb.aio_reqprio = io_u->ioprio;
 		io_u->iocb.u.c.flags |= IOCB_FLAG_IOPRIO;
-		if (!td->ioprio || cmdprio_value < td->ioprio) {
-			/*
-			 * The async IO priority is higher (has a lower value)
-			 * than the default context priority.
-			 */
-			io_u->flags |= IO_U_F_HIGH_PRIO;
-		}
-	} else if (td->ioprio && td->ioprio < cmdprio_value) {
-		/*
-		 * The IO will be executed with the default context priority,
-		 * and this priority is higher (has a lower value) than the
-		 * async IO priority.
-		 */
-		io_u->flags |= IO_U_F_HIGH_PRIO;
 	}
 }
 
@@ -368,8 +341,8 @@ static enum fio_q_status fio_libaio_queue(struct thread_data *td,
 		return FIO_Q_COMPLETED;
 	}
 
-	if (ld->use_cmdprio)
-		fio_libaio_prio_prep(td, io_u);
+	if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
+		fio_libaio_cmdprio_prep(td, io_u);
 
 	ld->iocbs[ld->head] = &io_u->iocb;
 	ld->io_us[ld->head] = io_u;
@@ -487,6 +460,8 @@ static void fio_libaio_cleanup(struct thread_data *td)
 		 */
 		if (!(td->flags & TD_F_CHILD))
 			io_destroy(ld->aio_ctx);
+
+		fio_cmdprio_cleanup(&ld->cmdprio);
 		free(ld->aio_events);
 		free(ld->iocbs);
 		free(ld->io_us);
@@ -512,7 +487,6 @@ static int fio_libaio_init(struct thread_data *td)
 {
 	struct libaio_data *ld;
 	struct libaio_options *o = td->eo;
-	struct cmdprio *cmdprio = &o->cmdprio;
 	int ret;
 
 	ld = calloc(1, sizeof(*ld));
@@ -525,7 +499,7 @@ static int fio_libaio_init(struct thread_data *td)
 
 	td->io_ops_data = ld;
 
-	ret = fio_cmdprio_init(td, cmdprio, &ld->use_cmdprio);
+	ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
 	if (ret) {
 		td_verror(td, EINVAL, "fio_libaio_init");
 		return 1;
diff --git a/fio.1 b/fio.1
index 78988c9e..e3c3feae 100644
--- a/fio.1
+++ b/fio.1
@@ -1965,8 +1965,7 @@ with the caveat that when used on the command line, they must come after the
 .BI (io_uring,libaio)cmdprio_percentage \fR=\fPint[,int]
 Set the percentage of I/O that will be issued with the highest priority.
 Default: 0. A single value applies to reads and writes. Comma-separated
-values may be specified for reads and writes. This option cannot be used
-with the `prio` or `prioclass` options. For this option to be effective,
+values may be specified for reads and writes. For this option to be effective,
 NCQ priority must be supported and enabled, and `direct=1' option must be
 used. fio must also be run as the root user.
 .TP

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-11-11 13:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-11-11 13:00 UTC (permalink / raw)
  To: fio

The following changes since commit a87ea1a869595ca57052e7645431a397d3c7d5ac:

  Merge branch 'evelu-peak' of https://github.com/ErwanAliasr1/fio (2021-10-25 12:38:35 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 6619fc32c413c4ff3a24c819037fb9227af3f876:

  stat: create a init_thread_stat_min_vals() helper (2021-11-08 06:24:48 -0700)

----------------------------------------------------------------
Niklas Cassel (1):
      stat: create a init_thread_stat_min_vals() helper

 init.c | 11 +----------
 stat.c | 66 +++++++++++++++++++++---------------------------------------------
 stat.h |  1 +
 3 files changed, 23 insertions(+), 55 deletions(-)

---

Diff of recent changes:

diff --git a/init.c b/init.c
index ec1a2cac..5f069d9a 100644
--- a/init.c
+++ b/init.c
@@ -1548,16 +1548,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
 	memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list));
 	td->ts.sig_figs = o->sig_figs;
 
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		td->ts.clat_stat[i].min_val = ULONG_MAX;
-		td->ts.slat_stat[i].min_val = ULONG_MAX;
-		td->ts.lat_stat[i].min_val = ULONG_MAX;
-		td->ts.bw_stat[i].min_val = ULONG_MAX;
-		td->ts.iops_stat[i].min_val = ULONG_MAX;
-		td->ts.clat_high_prio_stat[i].min_val = ULONG_MAX;
-		td->ts.clat_low_prio_stat[i].min_val = ULONG_MAX;
-	}
-	td->ts.sync_stat.min_val = ULONG_MAX;
+	init_thread_stat_min_vals(&td->ts);
 	td->ddir_seq_nr = o->ddir_seq_nr;
 
 	if ((o->stonewall || o->new_group) && prev_group_jobs) {
diff --git a/stat.c b/stat.c
index 30f9b5c1..cd35b114 100644
--- a/stat.c
+++ b/stat.c
@@ -483,22 +483,13 @@ static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_sta
 	struct thread_stat *ts_lcl;
 
 	int i2p;
-	int ddir = 0, i;
+	int ddir = 0;
 
 	/* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
 	ts_lcl = malloc(sizeof(struct thread_stat));
 	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
 	ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		ts_lcl->clat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->slat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->lat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->bw_stat[i].min_val = ULONG_MAX;
-		ts_lcl->iops_stat[i].min_val = ULONG_MAX;
-		ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
-		ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
-	}
-	ts_lcl->sync_stat.min_val = ULONG_MAX;
+	init_thread_stat_min_vals(ts_lcl);
 
 	sum_thread_stats(ts_lcl, ts, 1);
 
@@ -1466,22 +1457,12 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
 				   int ver, struct buf_output *out)
 {
 	struct thread_stat *ts_lcl;
-	int i;
 
 	/* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
 	ts_lcl = malloc(sizeof(struct thread_stat));
 	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
 	ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		ts_lcl->clat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->slat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->lat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->bw_stat[i].min_val = ULONG_MAX;
-		ts_lcl->iops_stat[i].min_val = ULONG_MAX;
-		ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
-		ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
-	}
-	ts_lcl->sync_stat.min_val = ULONG_MAX;
+	init_thread_stat_min_vals(ts_lcl);
 	ts_lcl->lat_percentiles = ts->lat_percentiles;
 	ts_lcl->clat_percentiles = ts->clat_percentiles;
 	ts_lcl->slat_percentiles = ts->slat_percentiles;
@@ -1668,22 +1649,12 @@ static void add_mixed_ddir_status_json(struct thread_stat *ts,
 		struct group_run_stats *rs, struct json_object *parent)
 {
 	struct thread_stat *ts_lcl;
-	int i;
 
 	/* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
 	ts_lcl = malloc(sizeof(struct thread_stat));
 	memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
 	ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		ts_lcl->clat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->slat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->lat_stat[i].min_val = ULONG_MAX;
-		ts_lcl->bw_stat[i].min_val = ULONG_MAX;
-		ts_lcl->iops_stat[i].min_val = ULONG_MAX;
-		ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
-		ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
-	}
-	ts_lcl->sync_stat.min_val = ULONG_MAX;
+	init_thread_stat_min_vals(ts_lcl);
 	ts_lcl->lat_percentiles = ts->lat_percentiles;
 	ts_lcl->clat_percentiles = ts->clat_percentiles;
 	ts_lcl->slat_percentiles = ts->slat_percentiles;
@@ -2270,22 +2241,27 @@ void init_group_run_stat(struct group_run_stats *gs)
 		gs->min_bw[i] = gs->min_run[i] = ~0UL;
 }
 
-void init_thread_stat(struct thread_stat *ts)
+void init_thread_stat_min_vals(struct thread_stat *ts)
 {
-	int j;
+	int i;
 
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		ts->clat_stat[i].min_val = ULONG_MAX;
+		ts->slat_stat[i].min_val = ULONG_MAX;
+		ts->lat_stat[i].min_val = ULONG_MAX;
+		ts->bw_stat[i].min_val = ULONG_MAX;
+		ts->iops_stat[i].min_val = ULONG_MAX;
+		ts->clat_high_prio_stat[i].min_val = ULONG_MAX;
+		ts->clat_low_prio_stat[i].min_val = ULONG_MAX;
+	}
+	ts->sync_stat.min_val = ULONG_MAX;
+}
+
+void init_thread_stat(struct thread_stat *ts)
+{
 	memset(ts, 0, sizeof(*ts));
 
-	for (j = 0; j < DDIR_RWDIR_CNT; j++) {
-		ts->lat_stat[j].min_val = -1UL;
-		ts->clat_stat[j].min_val = -1UL;
-		ts->slat_stat[j].min_val = -1UL;
-		ts->bw_stat[j].min_val = -1UL;
-		ts->iops_stat[j].min_val = -1UL;
-		ts->clat_high_prio_stat[j].min_val = -1UL;
-		ts->clat_low_prio_stat[j].min_val = -1UL;
-	}
-	ts->sync_stat.min_val = -1UL;
+	init_thread_stat_min_vals(ts);
 	ts->groupid = -1;
 }
 
diff --git a/stat.h b/stat.h
index a06237e7..9ef8caa4 100644
--- a/stat.h
+++ b/stat.h
@@ -327,6 +327,7 @@ extern void show_running_run_stats(void);
 extern void check_for_running_stats(void);
 extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
 extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
+extern void init_thread_stat_min_vals(struct thread_stat *ts);
 extern void init_thread_stat(struct thread_stat *ts);
 extern void init_group_run_stat(struct group_run_stats *gs);
 extern void eta_to_str(char *str, unsigned long eta_sec);

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-10-26 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-10-26 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 515418094c61cf135513a34651af6134a8794b5d:

  Merge branch 'master' of https://github.com/bvanassche/fio (2021-10-22 10:19:04 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a87ea1a869595ca57052e7645431a397d3c7d5ac:

  Merge branch 'evelu-peak' of https://github.com/ErwanAliasr1/fio (2021-10-25 12:38:35 -0600)

----------------------------------------------------------------
Erwan Velu (3):
      t/one-core-peak: Reporting SElinux status
      t/io_uring: Fixing typo in help message
      t/one-core-peak: Don't report errors if missing NVME features

Jens Axboe (1):
      Merge branch 'evelu-peak' of https://github.com/ErwanAliasr1/fio

 t/io_uring.c       |  2 +-
 t/one-core-peak.sh | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

---

Diff of recent changes:

diff --git a/t/io_uring.c b/t/io_uring.c
index a87042f8..f758a6d9 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -1059,7 +1059,7 @@ static void usage(char *argv, int status)
 		" -b <int>  : Block size, default %d\n"
 		" -p <bool> : Polled IO, default %d\n"
 		" -B <bool> : Fixed buffers, default %d\n"
-		" -R <bool> : DMA map fixed buffers, default %d\n"
+		" -D <bool> : DMA map fixed buffers, default %d\n"
 		" -F <bool> : Register files, default %d\n"
 		" -n <int>  : Number of threads, default %d\n"
 		" -O <bool> : Use O_DIRECT, default %d\n"
diff --git a/t/one-core-peak.sh b/t/one-core-peak.sh
index fba4ec95..9da8304e 100755
--- a/t/one-core-peak.sh
+++ b/t/one-core-peak.sh
@@ -199,12 +199,18 @@ show_nvme() {
   info ${device_name} "MODEL=${model} FW=${fw} serial=${serial} PCI=${pci_addr}@${link_speed} IRQ=${irq} NUMA=${numa} CPUS=${cpus} "
   which nvme &> /dev/null
   if [ $? -eq 0 ]; then
-    NCQA=$(nvme get-feature -H -f 0x7 ${device} |grep NCQA |cut -d ':' -f 2 | xargs)
-    NSQA=$(nvme get-feature -H -f 0x7 ${device} |grep NSQA |cut -d ':' -f 2 | xargs)
-    power_state=$(nvme get-feature -H -f 0x2 ${device} | grep PS |cut -d ":" -f 2 | xargs)
-    apste=$(nvme get-feature -H -f 0xc ${device} | grep APSTE |cut -d ":" -f 2 | xargs)
-    temp=$(nvme smart-log ${device} |grep 'temperature' |cut -d ':' -f 2 |xargs)
-    info ${device_name} "Temp:${temp}, Autonomous Power State Transition:${apste}, PowerState:${power_state}, Completion Queues:${NCQA}, Submission Queues:${NSQA}"
+    status=""
+    NCQA=$(nvme get-feature -H -f 0x7 ${device} 2>&1 |grep NCQA |cut -d ':' -f 2 | xargs)
+    [ -n "${NCQA}" ] && status="${status}Completion Queues:${NCQA}, "
+    NSQA=$(nvme get-feature -H -f 0x7 ${device} 2>&1 |grep NSQA |cut -d ':' -f 2 | xargs)
+    [ -n "${NSQA}" ] && status="${status}Submission Queues:${NSQA}, "
+    power_state=$(nvme get-feature -H -f 0x2 ${device} 2>&1 | grep PS |cut -d ":" -f 2 | xargs)
+    [ -n "${power_state}" ] && status="${status}PowerState:${power_state}, "
+    apste=$(nvme get-feature -H -f 0xc ${device} 2>&1 | grep APSTE |cut -d ":" -f 2 | xargs)
+    [ -n "${apste}" ] && status="${status} Autonomous Power State Transition:${apste}, "
+    temp=$(nvme smart-log ${device} 2>&1 |grep 'temperature' |cut -d ':' -f 2 |xargs)
+    [ -n "${temp}" ] && status="${status}Temp:${temp}"
+    info ${device_name} "${status}"
   fi
 }
 
@@ -241,6 +247,7 @@ show_system() {
     info "system" "KERNEL: $(show_kernel_config_item ${config_item})"
   done
   info "system" "KERNEL: $(cat /proc/cmdline)"
+  info "system" "SElinux: $(getenforce)"
   tsc=$(journalctl -k | grep 'tsc: Refined TSC clocksource calibration:' | awk '{print $11}')
   if [ -n "${tsc}" ]; then
     info "system" "TSC: ${tsc} Mhz"

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Re: Recent changes (master)
  2021-10-25 15:42     ` Rebecca Cran
@ 2021-10-25 15:43       ` Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-10-25 15:43 UTC (permalink / raw)
  To: Rebecca Cran, fio

On 10/25/21 9:42 AM, Rebecca Cran wrote:
> On 10/25/21 9:41 AM, Jens Axboe wrote:
> 
>> On 10/25/21 9:37 AM, Rebecca Cran wrote:
>>> On 10/23/21 6:00 AM, Jens Axboe wrote:
>>>> The following changes since commit 09d0a62931df0bb7ed4ae92b83a245e35d04100a:
>>>>
>>>>     Merge branch 'patch-1' of https://github.com/sweettea/fio (2021-10-19 16:09:21 -0600)
>>>>
>>>> are available in the Git repository at:
>>>>
>>>>     git://git.kernel.dk/fio.git master
>>> I just noticed this. Is it possible to change this to specify the https
>>> URL instead, since the git protocol is insecure?
>> Both will work with git.kernel.dk
> 
> I understand, I was thinking we might want to default to a secure protocol.

If you visit the page, it lists both. This is just my script that sends
out the changes; I don't think it's one that most people would find and
then use to clone :)

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Re: Recent changes (master)
  2021-10-25 15:41   ` Jens Axboe
@ 2021-10-25 15:42     ` Rebecca Cran
  2021-10-25 15:43       ` Jens Axboe
  0 siblings, 1 reply; 1137+ messages in thread
From: Rebecca Cran @ 2021-10-25 15:42 UTC (permalink / raw)
  To: Jens Axboe, fio

On 10/25/21 9:41 AM, Jens Axboe wrote:

> On 10/25/21 9:37 AM, Rebecca Cran wrote:
>> On 10/23/21 6:00 AM, Jens Axboe wrote:
>>> The following changes since commit 09d0a62931df0bb7ed4ae92b83a245e35d04100a:
>>>
>>>     Merge branch 'patch-1' of https://github.com/sweettea/fio (2021-10-19 16:09:21 -0600)
>>>
>>> are available in the Git repository at:
>>>
>>>     git://git.kernel.dk/fio.git master
>> I just noticed this. Is it possible to change this to specify the https
>> URL instead, since the git protocol is insecure?
> Both will work with git.kernel.dk

I understand, I was thinking we might want to default to a secure protocol.


-- 
Rebecca Cran



^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Re: Recent changes (master)
  2021-10-25 15:37 ` Rebecca Cran
@ 2021-10-25 15:41   ` Jens Axboe
  2021-10-25 15:42     ` Rebecca Cran
  0 siblings, 1 reply; 1137+ messages in thread
From: Jens Axboe @ 2021-10-25 15:41 UTC (permalink / raw)
  To: Rebecca Cran, fio

On 10/25/21 9:37 AM, Rebecca Cran wrote:
> On 10/23/21 6:00 AM, Jens Axboe wrote:
>> The following changes since commit 09d0a62931df0bb7ed4ae92b83a245e35d04100a:
>>
>>    Merge branch 'patch-1' of https://github.com/sweettea/fio (2021-10-19 16:09:21 -0600)
>>
>> are available in the Git repository at:
>>
>>    git://git.kernel.dk/fio.git master
> 
> I just noticed this. Is it possible to change this to specify the https 
> URL instead, since the git protocol is insecure?

Both will work with git.kernel.dk

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Re: Recent changes (master)
  2021-10-23 12:00 Jens Axboe
@ 2021-10-25 15:37 ` Rebecca Cran
  2021-10-25 15:41   ` Jens Axboe
  0 siblings, 1 reply; 1137+ messages in thread
From: Rebecca Cran @ 2021-10-25 15:37 UTC (permalink / raw)
  To: Jens Axboe, fio

On 10/23/21 6:00 AM, Jens Axboe wrote:
> The following changes since commit 09d0a62931df0bb7ed4ae92b83a245e35d04100a:
>
>    Merge branch 'patch-1' of https://github.com/sweettea/fio (2021-10-19 16:09:21 -0600)
>
> are available in the Git repository at:
>
>    git://git.kernel.dk/fio.git master

I just noticed this. Is it possible to change this to specify the https 
URL instead, since the git protocol is insecure?


-- 
Rebecca Cran



^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-10-23 12:00 Jens Axboe
  2021-10-25 15:37 ` Rebecca Cran
  0 siblings, 1 reply; 1137+ messages in thread
From: Jens Axboe @ 2021-10-23 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 09d0a62931df0bb7ed4ae92b83a245e35d04100a:

  Merge branch 'patch-1' of https://github.com/sweettea/fio (2021-10-19 16:09:21 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 515418094c61cf135513a34651af6134a8794b5d:

  Merge branch 'master' of https://github.com/bvanassche/fio (2021-10-22 10:19:04 -0600)

----------------------------------------------------------------
Bart Van Assche (1):
      Android: Add io_uring support

Jens Axboe (1):
      Merge branch 'master' of https://github.com/bvanassche/fio

 Makefile        | 2 +-
 os/os-android.h | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index f28c130a..4ae5a371 100644
--- a/Makefile
+++ b/Makefile
@@ -233,7 +233,7 @@ endif
 endif
 ifeq ($(CONFIG_TARGET_OS), Android)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
-		oslib/linux-dev-lookup.c
+		oslib/linux-dev-lookup.c engines/io_uring.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
 endif
diff --git a/os/os-android.h b/os/os-android.h
index 18eb39ce..10c51b83 100644
--- a/os/os-android.h
+++ b/os/os-android.h
@@ -309,4 +309,8 @@ static inline int fio_set_sched_idle(void)
 }
 #endif
 
+#ifndef RWF_UNCACHED
+#define RWF_UNCACHED	0x00000040
+#endif
+
 #endif

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-10-20 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-10-20 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit a7194b2d3d427e7e5678c55a128639df9caf4a48:

  Merge branch 'fixes_1290' of https://github.com/rthardin/fio (2021-10-18 19:29:46 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to 09d0a62931df0bb7ed4ae92b83a245e35d04100a:

  Merge branch 'patch-1' of https://github.com/sweettea/fio (2021-10-19 16:09:21 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'patch-1' of https://github.com/sweettea/fio

Sweet Tea Dorminy (1):
      t/fuzz: Clean up generated dependency makefiles

 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/Makefile b/Makefile
index c3feb53f..f28c130a 100644
--- a/Makefile
+++ b/Makefile
@@ -626,7 +626,7 @@ unittests/unittest: $(UT_OBJS) $(UT_TARGET_OBJS)
 endif
 
 clean: FORCE
-	@rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
+	@rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] t/*/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
 	@rm -f t/fio-btrace2fio t/io_uring t/read-to-pipe-async
 	@rm -rf  doc/output
 

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-10-19 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-10-19 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit aa9f26276e1961fab2d33e188f5a2432360c9c14:

  run-fio-tests: make test runs more resilient (2021-10-17 07:22:55 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to a7194b2d3d427e7e5678c55a128639df9caf4a48:

  Merge branch 'fixes_1290' of https://github.com/rthardin/fio (2021-10-18 19:29:46 -0600)

----------------------------------------------------------------
Jens Axboe (1):
      Merge branch 'fixes_1290' of https://github.com/rthardin/fio

Ryan Hardin (1):
      Use min_bs in rate_process=poisson

 backend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

---

Diff of recent changes:

diff --git a/backend.c b/backend.c
index 86fa6d41..c167f908 100644
--- a/backend.c
+++ b/backend.c
@@ -837,7 +837,7 @@ static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
 	if (td->o.rate_process == RATE_PROCESS_POISSON) {
 		uint64_t val, iops;
 
-		iops = bps / td->o.bs[ddir];
+		iops = bps / td->o.min_bs[ddir];
 		val = (int64_t) (1000000 / iops) *
 				-logf(__rand_0_1(&td->poisson_state[ddir]));
 		if (val) {

^ permalink raw reply	[flat|nested] 1137+ messages in thread

* Recent changes (master)
@ 2021-10-18 12:00 Jens Axboe
  0 siblings, 0 replies; 1137+ messages in thread
From: Jens Axboe @ 2021-10-18 12:00 UTC (permalink / raw)
  To: fio

The following changes since commit 7d1ce4b752e67868b3c7eb9aa5972ceec51210aa:

  t/io_uring: Fix the parameters calculation for multiple threads scenario (2021-10-15 06:20:47 -0600)

are available in the Git repository at:

  git://git.kernel.dk/fio.git master

for you to fetch changes up to aa9f26276e1961fab2d33e188f5a2432360c9c14:

  run-fio-tests: make test runs more resilient (2021-10-17 07:22:55 -0600)

----------------------------------------------------------------
Rebecca Cran (1):
      engines/http.c: add fallthrough annotation to _curl_trace

Shin'ichiro Kawasaki (5):
      zbd: Remove cast to unsigned long long for printf
      zbd: Fix type of local variable min_bs
      t/zbd: Do not use too large block size in test case #4
      t/zbd: Align block size to zone capacity
      t/zbd: Add -w option to ensure no open zone before write tests

Vincent Fu (1):
      run-fio-tests: make test runs more resilient

 engines/http.c         |   3 +-
 t/run-fio-tests.py     |  14 +++++--
 t/zbd/functions        |  26 ++++++++++++
 t/zbd/test-zbd-support |  40 ++++++++++--------
 zbd.c                  | 107 +++++++++++++++++++++++--------------------------
 5 files changed, 112 insertions(+), 78 deletions(-)

---

Diff of recent changes:

diff --git a/engines/http.c b/engines/http.c
index 7a61b132..35c44871 100644
--- a/engines/http.c
+++ b/engines/http.c
@@ -297,10 +297,9 @@ static int _curl_trace(CURL *handle, curl_infotype type,
 	switch (type) {
 	case CURLINFO_TEXT:
 		fprintf(stderr, "== Info: %s", data);
-		/* fall through */
+		fallthrough;
 	default:
 	case CURLINFO_SSL_DATA_OUT:
-		/* fall through */
 	case CURLINFO_SSL_DATA_IN:
 		return 0;
 
diff --git a/t/run-fio-tests.py b/t/run-fio-tests.py
index a59cdfe0..612e50ca 100755
--- a/t/run-fio-tests.py
+++ b/t/run-fio-tests.py
@@ -49,6 +49,7 @@ import shutil
 import logging
 import argparse
 import platform
+import traceback
 import subprocess
 import multiprocessing
 from pathlib import Path
@@ -1057,9 +1058,16 @@ def main():
                 skipped = skipped + 1
                 continue
 
-        test.setup(artifact_root, config['test_id'])
-        test.run()
-        test.check_result()
+        try:
+            test.setup(artifact_root, config['test_id'])
+            test.run()
+            test.check_result()
+        except KeyboardInterrupt:
+            break
+        except Exception as e:
+            test.passed = False
+            test.failure_reason += str(e)
+            logging.debug("Test %d exception:\n%s\n", config['test_id'], traceback.format_exc())
         if test.passed:
             result = "PASSED"
             passed = passed + 1
diff --git a/t/zbd/functions b/t/zbd/functions
index 08a2c629..e4e248b9 100644
--- a/t/zbd/functions
+++ b/t/zbd/functions
@@ -64,6 +64,32 @@ check_blkzone() {
 	fi
 }
 
+# Check zone capacity of each zone and report block size aligned to the zone
+# capacities. If zone capacity is same as zone size for zones, report zone size.
+zone_cap_bs() {
+	local dev="${1}"
+	local zone_size="${2}"
+	local sed_str='s/.*len \([0-9A-Za-z]*\), cap \([0-9A-Za-z]*\).*/\1 \2/p'
+	local cap bs="$zone_size"
+
+	# When blkzone is not available or blkzone does not report capacity,
+	# assume that zone capacity is same as zone size for all zones.
+	if [ -z "${blkzone}" ] || ! blkzone_reports_capacity "${dev}"; then
+		echo "$zone_size"
+		return
+	fi
+
+	while read -r -a line; do
+		((line[0] == line[1])) && continue
+		cap=$((line[1] * 512))
+		while ((bs > 512 && cap % bs)); do
+			bs=$((bs / 2))
+		done
+	done < <(blkzone report "${dev}" | sed -n "${sed_str}")
+
+	echo "$bs"
+}
+
 # Reports the starting sector and length of the first sequential zone of device
 # $1.
 first_sequential_zone() {
diff --git a/t/zbd/test-zbd-support b/t/zbd/test-zbd-support
index 5103c406..7e2fff00 100755
--- a/t/zbd/test-zbd-support
+++ b/t/zbd/test-zbd-support
@@ -12,6 +12,7 @@ usage() {
 	echo -e "\t-v Run fio with valgrind --read-var-info option"
 	echo -e "\t-l Test with libzbc ioengine"
 	echo -e "\t-r Reset all zones before test start"
+	echo -e "\t-w Reset all zones before executing each write test case"
 	echo -e "\t-o <max_open_zones> Run fio with max_open_zones limit"
 	echo -e "\t-t <test #> Run only a single test case with specified number"
 	echo -e "\t-q Quit the test run after any failed test"
@@ -182,13 +183,14 @@ run_fio_on_seq() {
     run_one_fio_job "${opts[@]}" "$@"
 }
 
-# Prepare for write test by resetting zones. When max_open_zones option is
-# specified, reset all zones of the test target to ensure that zones out of the
-# test target range do not have open zones. This allows the write test to the
-# target range to be able to open zones up to max_open_zones.
+# Prepare for write test by resetting zones. When reset_before_write or
+# max_open_zones option is specified, reset all zones of the test target to
+# ensure that zones out of the test target range do not have open zones. This
+# allows the write test to the target range to be able to open zones up to
+# max_open_zones limit specified as the option or obtained from sysfs.
 prep_write() {
-	[[ -n "${max_open_zones_opt}" && -n "${is_zbd}" ]] &&
-		reset_zone "${dev}" -1
+	[[ -n "${reset_before_write}" || -n "${max_open_zones_opt}" ]] &&
+		[[ -n "${is_zbd}" ]] && reset_zone "${dev}" -1
 }
 
 SKIP_TESTCASE=255
@@ -310,7 +312,8 @@ test4() {
     off=$((first_sequential_zone_sector * 512 + 129 * zone_size))
     size=$((zone_size))
     [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
-    opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=$size")
+    opts+=("--name=$dev" "--filename=$dev" "--offset=$off")
+    opts+=(--bs="$(min $((logical_block_size * 256)) $size)")
     opts+=("--size=$size" "--thread=1" "--read_beyond_wp=1")
     opts+=("$(ioengine "psync")" "--rw=read" "--direct=1" "--disable_lat=1")
     opts+=("--zonemode=zbd" "--zonesize=${zone_size}")
@@ -320,15 +323,15 @@ test4() {
 
 # Sequential write to sequential zones.
 test5() {
-    local size off capacity
+    local size off capacity bs
 
     prep_write
     off=$((first_sequential_zone_sector * 512))
     capacity=$(total_zone_capacity 4 $off $dev)
     size=$((4 * zone_size))
+    bs=$(min "$(max $((zone_size / 64)) "$logical_block_size")" "$zone_cap_bs")
     run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write	\
-		   --bs="$(max $((zone_size / 64)) "$logical_block_size")"\
-		   --do_verify=1 --verify=md5				\
+		   --bs="$bs" --do_verify=1 --verify=md5 \
 		   >>"${logfile}.${test_number}" 2>&1 || return $?
     check_written $capacity || return $?
     check_read $capacity || return $?
@@ -336,18 +339,18 @@ test5() {
 
 # Sequential read from sequential zones.
 test6() {
-    local size off capacity
+    local size off capacity bs
 
     prep_write
     off=$((first_sequential_zone_sector * 512))
     capacity=$(total_zone_capacity 4 $off $dev)
     size=$((4 * zone_size))
+    bs=$(min "$(max $((zone_size / 64)) "$logical_block_size")" "$zone_cap_bs")
     write_and_run_one_fio_job \
 	    $((first_sequential_zone_sector * 512)) "${size}" \
 	    --offset="${off}" \
 	    --size="${size}" --zonemode=zbd --zonesize="${zone_size}" \
-	    "$(ioengine "psync")" --iodepth=1 --rw=read \
-	    --bs="$(max $((zone_size / 64)) "$logical_block_size")" \
+	    "$(ioengine "psync")" --iodepth=1 --rw=read --bs="$bs" \
 	    >>"${logfile}.${test_number}" 2>&1 || return $?
     check_read $capacity || return $?
 }
@@ -485,7 +488,7 @@ test14() {
 
 # Sequential read on a mix of empty and full zones.
 test15() {
-    local i off size
+    local i off size bs
     local w_off w_size w_capacity
 
     for ((i=0;i<4;i++)); do
@@ -499,8 +502,9 @@ test15() {
     w_capacity=$(total_zone_capacity 2 $w_off $dev)
     off=$((first_sequential_zone_sector * 512))
     size=$((4 * zone_size))
+    bs=$(min $((zone_size / 16)) "$zone_cap_bs")
     write_and_run_one_fio_job "${w_off}" "${w_size}" \
-		    "$(ioengine "psync")" --rw=read --bs=$((zone_size / 16)) \
+		    "$(ioengine "psync")" --rw=read --bs="$bs" \
 		    --zonemode=zbd --zonesize="${zone_size}" --offset=$off \
 		    --size=$((size)) >>"${logfile}.${test_number}" 2>&1 ||
 	return $?
@@ -852,7 +856,7 @@ test37() {
 	off=$(((first_sequential_zone_sector - 1) * 512))
     fi
     size=$((zone_size + 2 * 512))
-    bs=$((zone_size / 4))
+    bs=$(min $((zone_size / 4)) "$zone_cap_bs")
     run_one_fio_job --offset=$off --size=$size "$(ioengine "psync")"	\
 		    --iodepth=1 --rw=write --do_verify=1 --verify=md5	\
 		    --bs=$bs --zonemode=zbd --zonesize="${zone_size}"	\
@@ -1245,6 +1249,7 @@ SECONDS=0
 tests=()
 dynamic_analyzer=()
 reset_all_zones=
+reset_before_write=
 use_libzbc=
 zbd_debug=
 max_open_zones_opt=
@@ -1259,6 +1264,7 @@ while [ "${1#-}" != "$1" ]; do
 	shift;;
     -l) use_libzbc=1; shift;;
     -r) reset_all_zones=1; shift;;
+    -w) reset_before_write=1; shift;;
     -t) tests+=("$2"); shift; shift;;
     -o) max_open_zones_opt="${2}"; shift; shift;;
     -v) dynamic_analyzer=(valgrind "--read-var-info=yes");
@@ -1377,6 +1383,8 @@ fi
 echo -n "First sequential zone starts at sector $first_sequential_zone_sector;"
 echo " zone size: $((zone_size >> 20)) MB"
 
+zone_cap_bs=$(zone_cap_bs "$dev" "$zone_size")
+
 if [ "${#tests[@]}" = 0 ]; then
     readarray -t tests < <(declare -F | grep "test[0-9]*" | \
 				   tr -c -d "[:digit:]\n" | sort -n)
diff --git a/zbd.c b/zbd.c
index c0b0b81c..c18998c4 100644
--- a/zbd.c
+++ b/zbd.c
@@ -83,12 +83,12 @@ int zbd_report_zones(struct thread_data *td, struct fio_file *f,
 		ret = blkzoned_report_zones(td, f, offset, zones, nr_zones);
 	if (ret < 0) {
 		td_verror(td, errno, "report zones failed");
-		log_err("%s: report zones from sector %llu failed (%d).\n",
-			f->file_name, (unsigned long long)offset >> 9, errno);
+		log_err("%s: report zones from sector %"PRIu64" failed (%d).\n",
+			f->file_name, offset >> 9, errno);
 	} else if (ret == 0) {
 		td_verror(td, errno, "Empty zone report");
-		log_err("%s: report zones from sector %llu is empty.\n",
-			f->file_name, (unsigned long long)offset >> 9);
+		log_err("%s: report zones from sector %"PRIu64" is empty.\n",
+			f->file_name, offset >> 9);
 		ret = -EIO;
 	}
 
@@ -116,9 +116,8 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
 		ret = blkzoned_reset_wp(td, f, offset, length);
 	if (ret < 0) {
 		td_verror(td, errno, "resetting wp failed");
-		log_err("%s: resetting wp for %llu sectors at sector %llu failed (%d).\n",
-			f->file_name, (unsigned long long)length >> 9,
-			(unsigned long long)offset >> 9, errno);
+		log_err("%s: resetting wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
+			f->file_name, length >> 9, offset >> 9, errno);
 	}
 
 	return ret;
@@ -318,16 +317,16 @@ static bool zbd_verify_sizes(void)
 					return false;
 				}
 			} else if (td->o.zone_size != f->zbd_info->zone_size) {
-				log_err("%s: job parameter zonesize %llu does not match disk zone size %llu.\n",
-					f->file_name, (unsigned long long) td->o.zone_size,
-					(unsigned long long) f->zbd_info->zone_size);
+				log_err("%s: job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
+					f->file_name, td->o.zone_size,
+					f->zbd_info->zone_size);
 				return false;
 			}
 
 			if (td->o.zone_skip % td->o.zone_size) {
 				log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
-					f->file_name, (unsigned long long) td->o.zone_skip,
-					(unsigned long long) td->o.zone_size);
+					f->file_name, td->o.zone_skip,
+					td->o.zone_size);
 				return false;
 			}
 
@@ -341,9 +340,9 @@ static bool zbd_verify_sizes(void)
 						 f->file_name);
 					return false;
 				}
-				log_info("%s: rounded up offset from %llu to %llu\n",
-					 f->file_name, (unsigned long long) f->file_offset,
-					 (unsigned long long) new_offset);
+				log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
+					 f->file_name, f->file_offset,
+					 new_offset);
 				f->io_size -= (new_offset - f->file_offset);
 				f->file_offset = new_offset;
 			}
@@ -357,9 +356,9 @@ static bool zbd_verify_sizes(void)
 						 f->file_name);
 					return false;
 				}
-				log_info("%s: rounded down io_size from %llu to %llu\n",
-					 f->file_name, (unsigned long long) f->io_size,
-					 (unsigned long long) new_end - f->file_offset);
+				log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
+					 f->file_name, f->io_size,
+					 new_end - f->file_offset);
 				f->io_size = new_end - f->file_offset;
 			}
 		}
@@ -388,17 +387,17 @@ static bool zbd_verify_bs(void)
 				continue;
 			zone_size = f->zbd_info->zone_size;
 			if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
-				log_info("%s: trim block size %llu is not the zone size %llu\n",
+				log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
 					 f->file_name, td->o.bs[DDIR_TRIM],
-					 (unsigned long long)zone_size);
+					 zone_size);
 				return false;
 			}
 			for (k = 0; k < FIO_ARRAY_SIZE(td->o.bs); k++) {
 				if (td->o.verify != VERIFY_NONE &&
 				    zone_size % td->o.bs[k] != 0) {
-					log_info("%s: block size %llu is not a divisor of the zone size %llu\n",
+					log_info("%s: block size %llu is not a divisor of the zone size %"PRIu64"\n",
 						 f->file_name, td->o.bs[k],
-						 (unsigned long long)zone_size);
+						 zone_size);
 					return false;
 				}
 			}
@@ -448,8 +447,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
 
 	if (zone_capacity > zone_size) {
 		log_err("%s: job parameter zonecapacity %llu is larger than zone size %llu\n",
-			f->file_name, (unsigned long long) td->o.zone_capacity,
-			(unsigned long long) td->o.zone_size);
+			f->file_name, td->o.zone_capacity, td->o.zone_size);
 		return 1;
 	}
 
@@ -525,15 +523,14 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 	if (td->o.zone_size == 0) {
 		td->o.zone_size = zone_size;
 	} else if (td->o.zone_size != zone_size) {
-		log_err("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n",
-			f->file_name, (unsigned long long) td->o.zone_size,
-			(unsigned long long) zone_size);
+		log_err("fio: %s job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
+			f->file_name, td->o.zone_size, zone_size);
 		ret = -EINVAL;
 		goto out;
 	}
 
-	dprint(FD_ZBD, "Device %s has %d zones of size %llu KB\n", f->file_name,
-	       nr_zones, (unsigned long long) zone_size / 1024);
+	dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n", f->file_name,
+	       nr_zones, zone_size / 1024);
 
 	zbd_info = scalloc(1, sizeof(*zbd_info) +
 			   (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
@@ -587,9 +584,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 					   ZBD_REPORT_MAX_ZONES));
 		if (nrz < 0) {
 			ret = nrz;
-			log_info("fio: report zones (offset %llu) failed for %s (%d).\n",
-			 	 (unsigned long long)offset,
-				 f->file_name, -ret);
+			log_info("fio: report zones (offset %"PRIu64") failed for %s (%d).\n",
+				 offset, f->file_name, -ret);
 			goto out;
 		}
 	}
@@ -972,7 +968,7 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
 			   struct fio_zone_info *const ze)
 {
 	struct fio_zone_info *z;
-	const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 	int res = 0;
 
 	assert(min_bs);
@@ -1145,7 +1141,7 @@ static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
 static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
 			  uint32_t zone_idx)
 {
-	const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 	struct zoned_block_device_info *zbdi = f->zbd_info;
 	struct fio_zone_info *z = get_zone(f, zone_idx);
 	bool res = true;
@@ -1228,7 +1224,7 @@ static bool any_io_in_flight(void)
 static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 						      struct io_u *io_u)
 {
-	const uint32_t min_bs = td->o.min_bs[io_u->ddir];
+	const uint64_t min_bs = td->o.min_bs[io_u->ddir];
 	struct fio_file *f = io_u->file;
 	struct zoned_block_device_info *zbdi = f->zbd_info;
 	struct fio_zone_info *z;
@@ -1431,7 +1427,7 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
 						    struct fio_zone_info *z)
 {
 	const struct fio_file *f = io_u->file;
-	const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+	const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 
 	if (!zbd_open_zone(td, f, zbd_zone_nr(f, z))) {
 		zone_unlock(z);
@@ -1440,8 +1436,8 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
 	}
 
 	if (z->verify_block * min_bs >= z->capacity) {
-		log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block,
-			min_bs, (unsigned long long)z->capacity);
+		log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n", f->file_name, z->verify_block,
+			min_bs, z->capacity);
 		/*
 		 * If the assertion below fails during a test run, adding
 		 * "--experimental_verify=1" to the command line may help.
@@ -1450,8 +1446,8 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
 	}
 	io_u->offset = z->start + z->verify_block * min_bs;
 	if (io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
-		log_err("%s: %llu + %llu >= %llu\n", f->file_name, io_u->offset,
-			io_u->buflen, (unsigned long long) zbd_zone_capacity_end(z));
+		log_err("%s: %llu + %llu >= %"PRIu64"\n", f->file_name, io_u->offset,
+			io_u->buflen, zbd_zone_capacity_end(z));
 		assert(false);
 	}
 	z->verify_block += io_u->buflen / min_bs;
@@ -1467,7 +1463,7 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
  * pointer, hold the mutex for the zone.
  */
 static struct fio_zone_info *
-zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint32_t min_bytes,
+zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
 	      struct fio_zone_info *zb, struct fio_zone_info *zl)
 {
 	struct fio_file *f = io_u->file;
@@ -1499,7 +1495,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint32_t min_bytes,
 				zone_unlock(z2);
 		}
 	}
-	dprint(FD_ZBD, "%s: no zone has %d bytes of readable data\n",
+	dprint(FD_ZBD, "%s: no zone has %"