[PATCH v17 25/45] sg: replace rq array with xarray

From: Douglas Gilbert <dgilbert@interlog.com>
To: linux-scsi@vger.kernel.org
Cc: martin.petersen@oracle.com, jejb@linux.vnet.ibm.com,
	hare@suse.de, kbuild test robot <lkp@intel.com>
Subject: [PATCH v17 25/45] sg: replace rq array with xarray
Date: Wed,  7 Apr 2021 21:45:11 -0400	[thread overview]
Message-ID: <20210408014531.248890-26-dgilbert@interlog.com> (raw)
In-Reply-To: <20210408014531.248890-1-dgilbert@interlog.com>

Remove the fixed size array of 16 request elements per file
descriptor and replace with the xarray added in the previous
patch. All sg_request objects are now kept, available for
re-use, until their owning file descriptor is closed. The
sg_request deletions are in sg_remove_sfp_usercontext().
Each active sg_request object has an associated block
request and a scsi_request object but they have different
lifetimes. The block request and the scsi_request object
are released much earlier; their lifetime is the same as it
was in the v3 sg driver. The lifetime of the bio is also the
same (but is stretched in a later patch).

Collect various flags into bit maps: one for requests
(SG_FRQ_*) and the other for file descriptors (SG_FFD_*).
They join a per sg_device bit map (SG_FDEV_*) added in an
earlier patch.

Prior to a new sg_request object being (re-)built, information
that will be placed in it uses a new struct sg_comm_wr_t
object.

Since the above changes touch almost every function and low
level structures, this patch is big.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Douglas Gilbert <dgilbert@interlog.com>
---
 drivers/scsi/sg.c | 1544 ++++++++++++++++++++++++++++-----------------
 1 file changed, 981 insertions(+), 563 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 273861374de7..99cfa265a611 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -142,36 +142,51 @@ static struct class_interface sg_interface = {
 	.remove_dev     = sg_remove_device,
 };
 
+/* Subset of sg_io_hdr found in <scsi/sg.h>, has only [i] and [i->o] fields */
+struct sg_slice_hdr3 {
+	int interface_id;
+	int dxfer_direction;
+	u8 cmd_len;
+	u8 mx_sb_len;
+	u16 iovec_count;
+	unsigned int dxfer_len;
+	void __user *dxferp;
+	u8 __user *cmdp;
+	void __user *sbp;
+	unsigned int timeout;
+	unsigned int flags;
+	int pack_id;
+	void __user *usr_ptr;
+};
+
 struct sg_scatter_hold {     /* holding area for scsi scatter gather info */
 	struct page **pages;	/* num_sgat element array of struct page* */
 	int buflen;		/* capacity in bytes (dlen<=buflen) */
 	int dlen;		/* current valid data length of this req */
 	u16 page_order;		/* byte_len = (page_size*(2**page_order)) */
 	u16 num_sgat;		/* actual number of scatter-gather segments */
-	unsigned int sglist_len; /* size of malloc'd scatter-gather list ++ */
-	char dio_in_use;	/* 0->indirect IO (or mmap), 1->dio */
-	u8 cmd_opcode;		/* first byte of command */
 };
 
 struct sg_device;		/* forward declarations */
 struct sg_fd;
 
-struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
-	struct sg_scatter_hold data;	/* hold buffer, perhaps scatter list */
-	struct sg_io_hdr header;  /* scsi command+info, see <scsi/sg.h> */
+struct sg_request {	/* active SCSI command or inactive request */
+	struct sg_scatter_hold sgat_h;	/* hold buffer, perhaps scatter list */
+	struct sg_slice_hdr3 s_hdr3;  /* subset of sg_io_hdr */
 	u8 sense_b[SCSI_SENSE_BUFFERSIZE];
 	u32 duration;		/* cmd duration in milliseconds */
+	u32 rq_flags;		/* hold user supplied flags */
 	u32 rq_idx;		/* my index within parent's srp_arr */
-	char res_used;		/* 1 -> using reserve buffer, 0 -> not ... */
-	char orphan;		/* 1 -> drop on sight, 0 -> normal */
+	u32 rq_info;		/* info supplied by v3 and v4 interfaces */
 	u32 rq_result;		/* packed scsi request result from LLD */
-	char sg_io_owned;	/* 1 -> packet belongs to SG_IO */
-	/* done protected by rq_list_lock */
-	char done;		/* 0->before bh, 1->before read, 2->read */
+	int in_resid;		/* requested-actual byte count on data-in */
+	int pack_id;		/* user provided packet identifier field */
+	int sense_len;		/* actual sense buffer length (data-in) */
 	atomic_t rq_st;		/* request state, holds a enum sg_rq_state */
+	u8 cmd_opcode;		/* first byte of SCSI cdb */
 	u64 start_ns;		/* starting point of command duration calc */
-	unsigned long frq_bm[1];        /* see SG_FRQ_* defines above */
-	struct sg_fd *parentfp; /* pointer to owning fd, even when on fl */
+	unsigned long frq_bm[1];	/* see SG_FRQ_* defines above */
+	struct sg_fd *parentfp;	/* pointer to owning fd, even when on fl */
 	struct request *rq;	/* released in sg_rq_end_io(), bio kept */
 	struct bio *bio;	/* kept until this req -->SG_RS_INACTIVE */
 	struct execute_work ew_orph;	/* harvest orphan request */
@@ -180,7 +195,7 @@ struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
 struct sg_fd {		/* holds the state of a file descriptor */
 	struct sg_device *parentdp;	/* owning device */
 	wait_queue_head_t read_wait;	/* queue read until command done */
-	struct mutex f_mutex;	/* protect against changes in this fd */
+	struct mutex f_mutex;	/* serialize ioctls on this fd */
 	int timeout;		/* defaults to SG_DEFAULT_TIMEOUT      */
 	int timeout_user;	/* defaults to SG_DEFAULT_TIMEOUT_USER */
 	u32 idx;		/* my index within parent's sfp_arr */
@@ -188,15 +203,12 @@ struct sg_fd {		/* holds the state of a file descriptor */
 	atomic_t waiting;	/* number of requests awaiting receive */
 	atomic_t req_cnt;	/* number of requests */
 	int sgat_elem_sz;	/* initialized to scatter_elem_sz */
-	struct sg_scatter_hold reserve;	/* buffer for this file descriptor */
-	char force_packid;	/* 1 -> pack_id input to read(), 0 -> ignored */
-	char cmd_q;		/* 1 -> allow command queuing, 0 -> don't */
+	unsigned long ffd_bm[1];	/* see SG_FFD_* defines above */
+	pid_t tid;		/* thread id when opened */
 	u8 next_cmd_len;	/* 0: automatic, >0: use on next write() */
-	char keep_orphan;	/* 0 -> drop orphan (def), 1 -> keep for read() */
-	char mmap_called;	/* 0 -> mmap() never called on this fd */
-	char res_in_use;	/* 1 -> 'reserve' array in use */
-	struct fasync_struct *async_qp;	/* used by asynchronous notification */
-	struct xarray srp_arr;
+	struct sg_request *rsv_srp;/* one reserve request per fd */
+	struct fasync_struct *async_qp; /* used by asynchronous notification */
+	struct xarray srp_arr;	/* xarray of sg_request object pointers */
 	struct kref f_ref;
 	struct execute_work ew_fd;  /* harvest all fd resources and lists */
 };
@@ -219,8 +231,8 @@ struct sg_device { /* holds the state of each scsi generic device */
 
 struct sg_comm_wr_t {  /* arguments to sg_common_write() */
 	int timeout;
-	int blocking;
-	struct sg_request *srp;
+	unsigned long frq_bm[1];	/* see SG_FRQ_* defines above */
+	struct sg_io_hdr *h3p;
 	u8 *cmnd;
 };
 
@@ -228,31 +240,32 @@ struct sg_comm_wr_t {  /* arguments to sg_common_write() */
 static void sg_rq_end_io(struct request *rq, blk_status_t status);
 /* Declarations of other static functions used before they are defined */
 static int sg_proc_init(void);
-static int sg_start_req(struct sg_request *srp, u8 *cmd);
+static int sg_start_req(struct sg_request *srp, u8 *cmd, int cmd_len,
+			int dxfer_dir);
 static void sg_finish_scsi_blk_rq(struct sg_request *srp);
-static int sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp,
-		      int minlen);
-static ssize_t sg_submit(struct sg_fd *sfp, struct file *filp,
-			 const char __user *buf, size_t count, bool blocking,
-			 bool read_only, bool sg_io_owned,
-			 struct sg_request **o_srp);
-static int sg_common_write(struct sg_fd *sfp, struct sg_comm_wr_t *cwrp);
+static int sg_mk_sgat(struct sg_request *srp, struct sg_fd *sfp, int minlen);
+static int sg_submit(struct file *filp, struct sg_fd *sfp,
+		     struct sg_io_hdr *hp, bool sync,
+		     struct sg_request **o_srp);
+static struct sg_request *sg_common_write(struct sg_fd *sfp,
+					  struct sg_comm_wr_t *cwrp);
 static int sg_read_append(struct sg_request *srp, void __user *outp,
 			  int num_xfer);
-static void sg_remove_sgat(struct sg_fd *sfp, struct sg_scatter_hold *schp);
-static void sg_build_reserve(struct sg_fd *sfp, int req_size);
-static void sg_link_reserve(struct sg_fd *sfp, struct sg_request *srp,
-			    int size);
-static void sg_unlink_reserve(struct sg_fd *sfp, struct sg_request *srp);
+static void sg_remove_sgat(struct sg_request *srp);
 static struct sg_fd *sg_add_sfp(struct sg_device *sdp);
 static void sg_remove_sfp(struct kref *);
 static struct sg_request *sg_find_srp_by_id(struct sg_fd *sfp, int pack_id);
-static struct sg_request *sg_setup_req(struct sg_fd *sfp);
+static struct sg_request *sg_setup_req(struct sg_fd *sfp, int dxfr_len,
+				       struct sg_comm_wr_t *cwrp);
 static void sg_deact_request(struct sg_fd *sfp, struct sg_request *srp);
 static struct sg_device *sg_get_dev(int dev);
 static void sg_device_destroy(struct kref *kref);
+static struct sg_request *sg_mk_srp_sgat(struct sg_fd *sfp, bool first,
+					 int db_len);
 static const char *sg_rq_st_str(enum sg_rq_state rq_st, bool long_str);
 
+#define SG_WRITE_COUNT_LIMIT (32 * 1024 * 1024)
+
 #define SZ_SG_HEADER ((int)sizeof(struct sg_header))	/* v1 and v2 header */
 #define SZ_SG_IO_HDR ((int)sizeof(struct sg_io_hdr))	/* v3 header */
 #define SZ_SG_REQ_INFO ((int)sizeof(struct sg_req_info))
@@ -518,6 +531,7 @@ sg_release(struct inode *inode, struct file *filp)
 static ssize_t
 sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 {
+	bool get_v3_hdr;
 	int mxsize, cmd_size, input_size, res;
 	u8 opcode;
 	struct sg_device *sdp;
@@ -540,36 +554,61 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 	res = sg_allow_if_err_recovery(sdp, !!(filp->f_flags & O_NONBLOCK));
 	if (res)
 		return res;
-
-	if (count < SZ_SG_HEADER)
+	if (count < SZ_SG_HEADER || count > SG_WRITE_COUNT_LIMIT)
 		return -EIO;
-	if (copy_from_user(ohp, p, SZ_SG_HEADER))
-		return -EFAULT;
-	if (ohp->reply_len < 0) {	/* assume this is v3 */
-		struct sg_io_hdr *reinter_2p = (struct sg_io_hdr *)ohp;
+#ifdef CONFIG_COMPAT
+	if (in_compat_syscall())
+		get_v3_hdr = (count == sizeof(struct compat_sg_io_hdr));
+	else
+		get_v3_hdr = (count == sizeof(struct sg_io_hdr));
+#else
+	get_v3_hdr = (count == sizeof(struct sg_io_hdr));
+#endif
+	if (get_v3_hdr) {
+		if (get_sg_io_hdr(h3p, p))
+			return -EFAULT;
+	} else {
+		if (copy_from_user(ohp, p, SZ_SG_HEADER))
+			return -EFAULT;
+		if (ohp->reply_len < 0) {	/* not v2, may be v3 */
+			bool lt = false;
 
-		if (count < SZ_SG_IO_HDR)
-			return -EIO;
-		if (reinter_2p->interface_id != 'S') {
+#ifdef CONFIG_COMPAT
+			if (in_compat_syscall())
+				lt = (count < sizeof(struct compat_sg_io_hdr));
+			else
+				lt = (count < sizeof(struct sg_io_hdr));
+#else
+			lt = (count < sizeof(struct sg_io_hdr));
+#endif
+			if (lt)
+				return -EIO;
+			get_v3_hdr = true;
+			if (get_sg_io_hdr(h3p, p))
+				return -EFAULT;
+		}
+	}
+	if (get_v3_hdr) {
+		/* v3 dxfer_direction_s are all negative values by design */
+		if (h3p->dxfer_direction >= 0) {	/* so it is not v3 */
+			memcpy(ohp, h3p, count);
+			goto to_v2;
+		}
+		if (h3p->interface_id != 'S') {
 			pr_info_once("sg: %s: v3 interface only here\n",
 				     __func__);
 			return -EPERM;
 		}
-		return sg_submit(sfp, filp, p, count,
-				 !(filp->f_flags & O_NONBLOCK), false, false,
-				 NULL);
+		res = sg_submit(filp, sfp, h3p, false, NULL);
+		return res < 0 ? res : (int)count;
 	}
+to_v2:
+	/* v1 and v2 interfaces processed below this point */
 	if (count < (SZ_SG_HEADER + 6))
-		return -EIO;	/* The minimum scsi command length is 6 bytes. */
-
+		return -EIO;    /* minimum scsi command length is 6 bytes */
 	p += SZ_SG_HEADER;
 	if (get_user(opcode, p))
 		return -EFAULT;
-
-	if (!(srp = sg_setup_req(sfp))) {
-		SG_LOG(1, sfp, "%s: queue full\n", __func__);
-		return -EDOM;
-	}
 	mutex_lock(&sfp->f_mutex);
 	if (sfp->next_cmd_len > 0) {
 		cmd_size = sfp->next_cmd_len;
@@ -586,12 +625,10 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 	mxsize = max_t(int, input_size, ohp->reply_len);
 	mxsize -= SZ_SG_HEADER;
 	input_size -= SZ_SG_HEADER;
-	if (input_size < 0) {
-		sg_deact_request(sfp, srp);
-		return -EIO;	/* User did not pass enough bytes for this command. */
-	}
-	h3p = &srp->header;
-	h3p->interface_id = '\0';  /* indicator of old interface tunnelled */
+	if (input_size < 0)
+		return -EIO; /* Insufficient bytes passed for this command. */
+	memset(h3p, 0, sizeof(*h3p));
+	h3p->interface_id = '\0';/* indicate v1 or v2 interface (tunnelled) */
 	h3p->cmd_len = (u8)cmd_size;
 	h3p->iovec_count = 0;
 	h3p->mx_sb_len = 0;
@@ -612,10 +649,9 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 	h3p->flags = input_size;	/* structure abuse ... */
 	h3p->pack_id = ohp->pack_id;
 	h3p->usr_ptr = NULL;
-	if (copy_from_user(cmnd, p, cmd_size)) {
-		sg_deact_request(sfp, srp);
+	cmnd[0] = opcode;
+	if (copy_from_user(cmnd + 1, p + 1, cmd_size - 1))
 		return -EFAULT;
-	}
 	/*
 	 * SG_DXFER_TO_FROM_DEV is functionally equivalent to SG_DXFER_FROM_DEV,
 	 * but it is possible that the app intended SG_DXFER_TO_DEV, because
@@ -629,23 +665,23 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 			 __func__, ohp->reply_len - (int)SZ_SG_HEADER,
 			 input_size, (unsigned int)cmnd[0], current->comm);
 	}
+	cwr.frq_bm[0] = 0;	/* initial state clear for all req flags */
+	cwr.h3p = h3p;
 	cwr.timeout = sfp->timeout;
-	cwr.blocking = !(filp->f_flags & O_NONBLOCK);
-	cwr.srp = srp;
 	cwr.cmnd = cmnd;
-	res = sg_common_write(sfp, &cwr);
-	return (res < 0) ? res : count;
+	srp = sg_common_write(sfp, &cwr);
+	return (IS_ERR(srp)) ? PTR_ERR(srp) : (int)count;
 }
 
 static inline int
 sg_chk_mmap(struct sg_fd *sfp, int rq_flags, int len)
 {
-	if (len > sfp->reserve.buflen)
-		return -ENOMEM;	/* MMAP_IO size must fit in reserve buffer */
+	if (!xa_empty(&sfp->srp_arr))
+		return -EBUSY;  /* already active requests on fd */
+	if (len > sfp->rsv_srp->sgat_h.buflen)
+		return -ENOMEM; /* MMAP_IO size must fit in reserve */
 	if (rq_flags & SG_FLAG_DIRECT_IO)
-		return -EINVAL;	/* either MMAP_IO or DIRECT_IO (not both) */
-	if (sfp->res_in_use)
-		return -EBUSY;	/* reserve buffer already being used */
+		return -EINVAL; /* not both MMAP_IO and DIRECT_IO */
 	return 0;
 }
 
@@ -670,61 +706,40 @@ sg_fetch_cmnd(struct file *filp, struct sg_fd *sfp, const u8 __user *u_cdbp,
 	return 0;
 }
 
-static ssize_t
-sg_submit(struct sg_fd *sfp, struct file *filp, const char __user *buf,
-	  size_t count, bool blocking, bool read_only, bool sg_io_owned,
-	  struct sg_request **o_srp)
+static int
+sg_submit(struct file *filp, struct sg_fd *sfp, struct sg_io_hdr *hp,
+	  bool sync, struct sg_request **o_srp)
 {
-	int k, res, timeout;
+	int res, timeout;
+	unsigned long ul_timeout;
 	struct sg_request *srp;
-	struct sg_io_hdr *hp;
 	struct sg_comm_wr_t cwr;
 	u8 cmnd[SG_MAX_CDB_SIZE];
-	unsigned long ul_timeout;
 
-	if (count < SZ_SG_IO_HDR)
-		return -EINVAL;
-
-	sfp->cmd_q = 1;	/* when sg_io_hdr seen, set command queuing on */
-	if (!(srp = sg_setup_req(sfp))) {
-		SG_LOG(1, sfp, "%s: queue full\n", __func__);
-		return -EDOM;
-	}
-	srp->sg_io_owned = sg_io_owned;
-	hp = &srp->header;
-	/* get_sg_io_hdr() is defined in block/scsi_ioctl.c */
-	if (get_sg_io_hdr(hp, buf)) {
-		sg_deact_request(sfp, srp);
-		return -EFAULT;
-	}
-	if (hp->interface_id != 'S') {
-		sg_deact_request(sfp, srp);
-		return -ENOSYS;
-	}
+	/* now doing v3 blocking (sync) or non-blocking submission */
 	if (hp->flags & SG_FLAG_MMAP_IO) {
 		res = sg_chk_mmap(sfp, hp->flags, hp->dxfer_len);
-		if (res) {
-			sg_deact_request(sfp, srp);
+		if (res)
 			return res;
-		}
 	}
-	ul_timeout = msecs_to_jiffies(srp->header.timeout);
-	timeout = (ul_timeout < INT_MAX) ? ul_timeout : INT_MAX;
+	/* when v3 seen, allow cmd_q on this fd (def: no cmd_q) */
+	set_bit(SG_FFD_CMD_Q, sfp->ffd_bm);
+	ul_timeout = msecs_to_jiffies(hp->timeout);
+	timeout = min_t(unsigned long, ul_timeout, INT_MAX);
 	res = sg_fetch_cmnd(filp, sfp, hp->cmdp, hp->cmd_len, cmnd);
-	if (res) {
-		sg_deact_request(sfp, srp);
+	if (res)
 		return res;
-	}
+	cwr.frq_bm[0] = 0;
+	__assign_bit(SG_FRQ_SYNC_INVOC, cwr.frq_bm, (int)sync);
+	cwr.h3p = hp;
 	cwr.timeout = timeout;
-	cwr.blocking = blocking;
-	cwr.srp = srp;
 	cwr.cmnd = cmnd;
-	k = sg_common_write(sfp, &cwr);
-	if (k < 0)
-		return k;
+	srp = sg_common_write(sfp, &cwr);
+	if (IS_ERR(srp))
+		return PTR_ERR(srp);
 	if (o_srp)
-		*o_srp = cwr.srp;
-	return count;
+		*o_srp = srp;
+	return 0;
 }
 
 #if IS_ENABLED(SG_LOG_ACTIVE)
@@ -842,70 +857,69 @@ sg_rq_state_chg(struct sg_request *srp, enum sg_rq_state old_st,
  * sg_request object holding the request just issued or a negated errno
  * value twisted by ERR_PTR.
  */
-static int
+static struct sg_request *
 sg_common_write(struct sg_fd *sfp, struct sg_comm_wr_t *cwrp)
 {
 	bool at_head;
-	int k;
+	int res = 0;
+	int dxfr_len, dir, cmd_len;
+	int pack_id = SG_PACK_ID_WILDCARD;
+	u32 rq_flags;
 	struct sg_device *sdp = sfp->parentdp;
-	struct sg_request *srp = cwrp->srp;
-	struct sg_io_hdr *hp = &srp->header;
-
-	srp->data.cmd_opcode = cwrp->cmnd[0];	/* hold opcode of command */
-	hp->status = 0;
-	hp->masked_status = 0;
-	hp->msg_status = 0;
-	hp->info = 0;
-	hp->host_status = 0;
-	hp->driver_status = 0;
-	hp->resid = 0;
-	SG_LOG(4, sfp, "%s:  opcode=0x%02x, cmd_sz=%d\n", __func__,
-	       (int)cwrp->cmnd[0], hp->cmd_len);
-
-	if (hp->dxfer_len >= SZ_256M) {
-		sg_deact_request(sfp, srp);
-		return -EINVAL;
-	}
-
-	k = sg_start_req(srp, cwrp->cmnd);
-	if (k) {
-		SG_LOG(1, sfp, "%s: start_req err=%d\n", __func__, k);
-		sg_finish_scsi_blk_rq(srp);
-		sg_deact_request(sfp, srp);
-		return k;	/* probably out of space --> ENOMEM */
-	}
-	if (SG_IS_DETACHING(sdp))
+	struct sg_request *srp;
+	struct sg_io_hdr *hi_p;
+
+	hi_p = cwrp->h3p;
+	dir = hi_p->dxfer_direction;
+	dxfr_len = hi_p->dxfer_len;
+	rq_flags = hi_p->flags;
+	pack_id = hi_p->pack_id;
+	if (dxfr_len >= SZ_256M)
+		return ERR_PTR(-EINVAL);
+
+	srp = sg_setup_req(sfp, dxfr_len, cwrp);
+	if (IS_ERR(srp))
+		return srp;
+	srp->rq_flags = rq_flags;
+	srp->pack_id = pack_id;
+
+	cmd_len = hi_p->cmd_len;
+	memcpy(&srp->s_hdr3, hi_p, sizeof(srp->s_hdr3));
+	srp->cmd_opcode = cwrp->cmnd[0];/* hold opcode of command for debug */
+	SG_LOG(4, sfp, "%s: opcode=0x%02x, cdb_sz=%d, pack_id=%d\n", __func__,
+	       (int)cwrp->cmnd[0], cmd_len, pack_id);
+
+	res = sg_start_req(srp, cwrp->cmnd, cmd_len, dir);
+	if (res < 0)		/* probably out of space --> -ENOMEM */
 		goto err_out;
-
-	hp->duration = jiffies_to_msecs(jiffies);
-	if (hp->interface_id != '\0' &&	/* v3 (or later) interface */
-	    (SG_FLAG_Q_AT_TAIL & hp->flags))
-		at_head = false;
-	else
-		at_head = true;
-
-	if (!srp->sg_io_owned)
-		atomic_inc(&sfp->submitted);
+	if (unlikely(SG_IS_DETACHING(sdp))) {
+		res = -ENODEV;
+		goto err_out;
+	}
 	srp->rq->timeout = cwrp->timeout;
 	kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */
+	res = sg_rq_state_chg(srp, SG_RS_BUSY, SG_RS_INFLIGHT, false,
+			      __func__);
+	if (res)
+		goto err_out;
+	srp->start_ns = ktime_get_boottime_ns();
+	srp->duration = 0;
+
+	if (srp->s_hdr3.interface_id == '\0')
+		at_head = true; /* backward compatibility: v1+v2 interfaces */
+	else if (test_bit(SG_FFD_Q_AT_TAIL, sfp->ffd_bm))
+	/* cmd flags can override sfd setting */
+		at_head = !!(srp->rq_flags & SG_FLAG_Q_AT_HEAD);
+	else            /* this sfd is defaulting to head */
+		at_head = !(srp->rq_flags & SG_FLAG_Q_AT_TAIL);
+	if (!test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm))
+		atomic_inc(&sfp->submitted);
 	blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io);
-	return 0;
+	return srp;
 err_out:
-	if (srp->bio) {
-		scsi_req_free_cmd(scsi_req(srp->rq));
-		blk_put_request(srp->rq);
-		srp->rq = NULL;
-	}
 	sg_finish_scsi_blk_rq(srp);
 	sg_deact_request(sfp, srp);
-	return -ENODEV;
-}
-
-static inline int
-sg_rstate_chg(struct sg_request *srp, enum sg_rq_state old_st,
-	      enum sg_rq_state new_st)
-{
-	return sg_rq_state_chg(srp, old_st, new_st, false, __func__);
+	return ERR_PTR(res);
 }
 
 /*
@@ -936,21 +950,26 @@ static int
 sg_copy_sense(struct sg_request *srp)
 {
 	int sb_len_ret = 0;
-	struct sg_io_hdr *hp = &srp->header;
+	int scsi_stat;
 
 	/* If need be, copy the sense buffer to the user space */
-	if ((CHECK_CONDITION & hp->masked_status) ||
-	    (DRIVER_SENSE & hp->driver_status)) {
-		int sb_len = SCSI_SENSE_BUFFERSIZE;
-		void __user *up = hp->sbp;
-
-		sb_len = min_t(int, hp->mx_sb_len, sb_len);
-		/* Additional sense length field */
-		sb_len_ret = 8 + (int)srp->sense_b[7];
-		sb_len_ret = min_t(int, sb_len_ret, sb_len);
-		if (copy_to_user(up, srp->sense_b, sb_len_ret))
-			return -EFAULT;
-		hp->sb_len_wr = sb_len_ret;
+	scsi_stat = srp->rq_result & 0xff;
+	if ((scsi_stat & SAM_STAT_CHECK_CONDITION) ||
+	    (driver_byte(srp->rq_result) & DRIVER_SENSE)) {
+		int sb_len = min_t(int, SCSI_SENSE_BUFFERSIZE, srp->sense_len);
+		int mx_sb_len = srp->s_hdr3.mx_sb_len;
+		void __user *up = srp->s_hdr3.sbp;
+
+		if (up && mx_sb_len > 0) {
+			sb_len = min_t(int, mx_sb_len, sb_len);
+			/* Additional sense length field */
+			sb_len_ret = 8 + (int)srp->sense_b[7];
+			sb_len_ret = min_t(int, sb_len_ret, sb_len);
+			if (copy_to_user(up, srp->sense_b, sb_len_ret))
+				sb_len_ret = -EFAULT;
+		} else {
+			sb_len_ret = 0;
+		}
 	}
 	return sb_len_ret;
 }
@@ -959,12 +978,15 @@ static int
 sg_rec_state_v3(struct sg_fd *sfp, struct sg_request *srp)
 {
 	int sb_len_wr;
+	u32 rq_res = srp->rq_result;
 
 	sb_len_wr = sg_copy_sense(srp);
 	if (sb_len_wr < 0)
 		return sb_len_wr;
+	if (rq_res & SG_ML_RESULT_MSK)
+		srp->rq_info |= SG_INFO_CHECK;
 	if (unlikely(SG_IS_DETACHING(sfp->parentdp)))
-		return -ENODEV;
+		srp->rq_info |= SG_INFO_DEVICE_DETACHING;
 	return 0;
 }
 
@@ -972,8 +994,10 @@ static ssize_t
 sg_receive_v3(struct sg_fd *sfp, struct sg_request *srp, size_t count,
 	      void __user *p)
 {
-	int err = 0;
-	struct sg_io_hdr *hp = &srp->header;
+	int err, err2;
+	int rq_result = srp->rq_result;
+	struct sg_io_hdr hdr3;
+	struct sg_io_hdr *hp = &hdr3;
 
 	if (in_compat_syscall()) {
 		if (count < sizeof(struct compat_sg_io_hdr)) {
@@ -986,9 +1010,23 @@ sg_receive_v3(struct sg_fd *sfp, struct sg_request *srp, size_t count,
 	}
 	SG_LOG(3, sfp, "%s: srp=0x%pK\n", __func__, srp);
 	err = sg_rec_state_v3(sfp, srp);
-	if (hp->masked_status || hp->host_status || hp->driver_status)
-		hp->info |= SG_INFO_CHECK;
-	err = put_sg_io_hdr(hp, p);
+	memset(hp, 0, sizeof(*hp));
+	memcpy(hp, &srp->s_hdr3, sizeof(srp->s_hdr3));
+	hp->sb_len_wr = srp->sense_len;
+	hp->info = srp->rq_info;
+	hp->resid = srp->in_resid;
+	hp->duration = srp->duration;
+	hp->status = rq_result & 0xff;
+	hp->masked_status = status_byte(rq_result);
+	hp->msg_status = msg_byte(rq_result);
+	hp->host_status = host_byte(rq_result);
+	hp->driver_status = driver_byte(rq_result);
+	err2 = put_sg_io_hdr(hp, p);
+	err = err ? err : err2;
+	err2 = sg_rq_state_chg(srp, atomic_read(&srp->rq_st), SG_RS_RCV_DONE,
+			       false, __func__);
+	if (err2)
+		err = err ? err : err2;
 err_out:
 	sg_finish_scsi_blk_rq(srp);
 	sg_deact_request(sfp, srp);
@@ -1005,26 +1043,27 @@ sg_read_v1v2(void __user *buf, int count, struct sg_fd *sfp,
 	     struct sg_request *srp)
 {
 	int res = 0;
-	struct sg_io_hdr *sh3p = &srp->header;
+	u32 rq_result = srp->rq_result;
 	struct sg_header *h2p;
+	struct sg_slice_hdr3 *sh3p;
 	struct sg_header a_v2hdr;
 
 	h2p = &a_v2hdr;
 	memset(h2p, 0, SZ_SG_HEADER);
+	sh3p = &srp->s_hdr3;
 	h2p->reply_len = (int)sh3p->timeout;
 	h2p->pack_len = h2p->reply_len; /* old, strange behaviour */
 	h2p->pack_id = sh3p->pack_id;
-	h2p->twelve_byte = (srp->data.cmd_opcode >= 0xc0 &&
-			    sh3p->cmd_len == 12);
-	h2p->target_status = sh3p->masked_status;
-	h2p->host_status = sh3p->host_status;
-	h2p->driver_status = sh3p->driver_status;
-	if ((CHECK_CONDITION & h2p->target_status) ||
-	    (DRIVER_SENSE & sh3p->driver_status)) {
+	h2p->twelve_byte = (srp->cmd_opcode >= 0xc0 && sh3p->cmd_len == 12);
+	h2p->target_status = status_byte(rq_result);
+	h2p->host_status = host_byte(rq_result);
+	h2p->driver_status = driver_byte(rq_result);
+	if ((CHECK_CONDITION & status_byte(rq_result)) ||
+	    (DRIVER_SENSE & driver_byte(rq_result))) {
 		memcpy(h2p->sense_buffer, srp->sense_b,
 		       sizeof(h2p->sense_buffer));
 	}
-	switch (h2p->host_status) {
+	switch (host_byte(rq_result)) {
 	/*
 	 * This following setting of 'result' is for backward compatibility
 	 * and is best ignored by the user who should use target, host and
@@ -1048,7 +1087,7 @@ sg_read_v1v2(void __user *buf, int count, struct sg_fd *sfp,
 		h2p->result = EIO;
 		break;
 	case DID_ERROR:
-		h2p->result = (h2p->target_status == GOOD) ? 0 : EIO;
+		h2p->result = (status_byte(rq_result) == GOOD) ? 0 : EIO;
 		break;
 	default:
 		h2p->result = EIO;
@@ -1069,6 +1108,7 @@ sg_read_v1v2(void __user *buf, int count, struct sg_fd *sfp,
 	} else {
 		res = (h2p->result == 0) ? 0 : -EIO;
 	}
+	atomic_set(&srp->rq_st, SG_RS_RCV_DONE);
 	sg_finish_scsi_blk_rq(srp);
 	sg_deact_request(sfp, srp);
 	return res;
@@ -1112,13 +1152,13 @@ sg_read(struct file *filp, char __user *p, size_t count, loff_t *ppos)
 	hlen = could_be_v3 ? SZ_SG_IO_HDR : SZ_SG_HEADER;
 	h2p = (struct sg_header *)&a_sg_io_hdr;
 
-	if (sfp->force_packid && count >= hlen) {
+	if (test_bit(SG_FFD_FORCE_PACKID, sfp->ffd_bm) && (int)count >= hlen) {
 		/*
 		 * Even though this is a user space read() system call, this
 		 * code is cheating to fetch the pack_id.
 		 * Only need first three 32 bit ints to determine interface.
 		 */
-		if (unlikely(copy_from_user(h2p, p, 3 * sizeof(int))))
+		if (copy_from_user(h2p, p, 3 * sizeof(int)))
 			return -EFAULT;
 		if (h2p->reply_len < 0 && could_be_v3) {
 			struct sg_io_hdr *v3_hdr = (struct sg_io_hdr *)h2p;
@@ -1139,20 +1179,20 @@ sg_read(struct file *filp, char __user *p, size_t count, loff_t *ppos)
 	}
 	srp = sg_find_srp_by_id(sfp, want_id);
 	if (!srp) {	/* nothing available so wait on packet to arrive or */
-		if (SG_IS_DETACHING(sdp))
+		if (unlikely(SG_IS_DETACHING(sdp)))
 			return -ENODEV;
 		if (non_block) /* O_NONBLOCK or v3::flags & SGV4_FLAG_IMMED */
 			return -EAGAIN;
 		ret = wait_event_interruptible(sfp->read_wait,
 					       sg_get_ready_srp(sfp, &srp,
 								want_id));
-		if (SG_IS_DETACHING(sdp))
+		if (unlikely(SG_IS_DETACHING(sdp)))
 			return -ENODEV;
 		if (ret)	/* -ERESTARTSYS as signal hit process */
 			return ret;
 		/* otherwise srp should be valid */
 	}
-	if (srp->header.interface_id == '\0')
+	if (srp->s_hdr3.interface_id == '\0')
 		ret = sg_read_v1v2(p, (int)count, sfp, srp);
 	else
 		ret = sg_receive_v3(sfp, srp, count, p);
@@ -1223,7 +1263,7 @@ sg_calc_rq_dur(const struct sg_request *srp)
 	return (diff > (s64)U32_MAX) ? 3999999999U : (u32)diff;
 }
 
-/* Return of U32_MAX means srp is inactive */
+/* Return of U32_MAX means srp is inactive state */
 static u32
 sg_get_dur(struct sg_request *srp, const enum sg_rq_state *sr_stp,
 	   bool *is_durp)
@@ -1254,34 +1294,63 @@ static void
 sg_fill_request_element(struct sg_fd *sfp, struct sg_request *srp,
 			struct sg_req_info *rip)
 {
-	unsigned int ms;
+	unsigned long iflags;
 
-	rip->req_state = srp->done + 1;
-	rip->problem = srp->header.masked_status &
-		       srp->header.host_status &
-		       srp->header.driver_status;
-	rip->duration = sg_get_dur(srp, NULL, NULL); /* dummy */
-	if (srp->done) {
-		rip->duration = srp->header.duration;
-	} else {
-		ms = jiffies_to_msecs(jiffies);
-		rip->duration = (ms > srp->header.duration) ?
-				(ms - srp->header.duration) : 0;
-	}
-	rip->orphan = srp->orphan;
-	rip->sg_io_owned = srp->sg_io_owned;
-	rip->pack_id = srp->header.pack_id;
-	rip->usr_ptr = srp->header.usr_ptr;
+	xa_lock_irqsave(&sfp->srp_arr, iflags);
+	rip->duration = sg_get_dur(srp, NULL, NULL);
+	if (rip->duration == U32_MAX)
+		rip->duration = 0;
+	rip->orphan = test_bit(SG_FRQ_IS_ORPHAN, srp->frq_bm);
+	rip->sg_io_owned = test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm);
+	rip->problem = !!(srp->rq_result & SG_ML_RESULT_MSK);
+	rip->pack_id = srp->pack_id;
+	rip->usr_ptr = srp->s_hdr3.usr_ptr;
+	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
+}
 
+static inline bool
+sg_rq_landed(struct sg_device *sdp, struct sg_request *srp)
+{
+	return atomic_read(&srp->rq_st) != SG_RS_INFLIGHT ||
+	       unlikely(SG_IS_DETACHING(sdp));
 }
 
+/*
+ * This is a blocking wait for a specific srp. When h4p is non-NULL, it is
+ * the blocking multiple request case
+ */
 static int
-srp_done(struct sg_fd *sfp, struct sg_request *srp)
+sg_wait_event_srp(struct file *filp, struct sg_fd *sfp, void __user *p,
+		  struct sg_request *srp)
 {
-	int ret;
+	int res;
+	enum sg_rq_state sr_st;
+	struct sg_device *sdp = sfp->parentdp;
 
-	ret = srp->done;
-	return ret;
+	SG_LOG(3, sfp, "%s: about to wait_event...()\n", __func__);
+	/* usually will be woken up by sg_rq_end_io() callback */
+	res = wait_event_interruptible(sfp->read_wait,
+				       sg_rq_landed(sdp, srp));
+	if (unlikely(res)) { /* -ERESTARTSYS because signal hit thread */
+		set_bit(SG_FRQ_IS_ORPHAN, srp->frq_bm);
+		/* orphans harvested when sfp->keep_orphan is false */
+		atomic_set(&srp->rq_st, SG_RS_INFLIGHT);
+		SG_LOG(1, sfp, "%s:  wait_event_interruptible gave %d\n",
+		       __func__, res);
+		return res;
+	}
+	if (unlikely(SG_IS_DETACHING(sdp))) {
+		atomic_set(&srp->rq_st, SG_RS_INACTIVE);
+		return -ENODEV;
+	}
+	sr_st = atomic_read(&srp->rq_st);
+	if (unlikely(sr_st != SG_RS_AWAIT_RCV))
+		return -EPROTO;         /* Logic error */
+	res = sg_rq_state_chg(srp, sr_st, SG_RS_BUSY, false, __func__);
+	if (unlikely(res))
+		return res;
+	res = sg_receive_v3(sfp, srp, SZ_SG_IO_HDR, p);
+	return (res < 0) ? res : 0;
 }
 
 /*
@@ -1292,42 +1361,119 @@ static int
 sg_ctl_sg_io(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 	     void __user *p)
 {
-	bool read_only = O_RDWR != (filp->f_flags & O_ACCMODE);
 	int res;
-	struct sg_request *srp;
+	struct sg_request *srp = NULL;
+	u8 hu8arr[SZ_SG_IO_HDR];
+	struct sg_io_hdr *h3p = (struct sg_io_hdr *)hu8arr;
 
+	SG_LOG(3, sfp, "%s:  SG_IO%s\n", __func__,
+	       ((filp->f_flags & O_NONBLOCK) ? " O_NONBLOCK ignored" : ""));
 	res = sg_allow_if_err_recovery(sdp, false);
 	if (res)
 		return res;
-	res = sg_submit(sfp, filp, p, SZ_SG_IO_HDR, true, read_only,
-			true, &srp);
-	if (res < 0)
+	if (get_sg_io_hdr(h3p, p))
+		return -EFAULT;
+	if (h3p->interface_id == 'S')
+		res = sg_submit(filp, sfp, h3p, true, &srp);
+	else
+		return -EPERM;
+	if (unlikely(res < 0))
+		return res;
+	if (!srp)	/* mrq case: already processed all responses */
 		return res;
-	res = wait_event_interruptible
-		(sfp->read_wait, (srp_done(sfp, srp) || SG_IS_DETACHING(sdp)));
-	if (SG_IS_DETACHING(sdp))
-		return -ENODEV;
-	if (srp->done) {
-		srp->done = 2;
-		res = sg_receive_v3(sfp, srp, SZ_SG_IO_HDR, p);
-		return (res < 0) ? res : 0;
-	}
-	srp->orphan = 1;
+	res = sg_wait_event_srp(filp, sfp, p, srp);
+	if (res)
+		SG_LOG(1, sfp, "%s: %s=0x%pK  state: %s\n", __func__,
+		       "unexpected srp", srp,
+		       sg_rq_st_str(atomic_read(&srp->rq_st), false));
 	return res;
 }
 
+/*
+ * First normalize want_rsv_sz to be >= sfp->sgat_elem_sz and
+ * <= max_segment_size. Exit if that is the same as old size; otherwise
+ * create a new candidate request of the new size. Then decide whether to
+ * re-use an existing free list request (least buflen >= required size) or
+ * use the new candidate. If new one used, leave old one but it is no longer
+ * the reserved request. Returns 0 on success, else a negated errno value.
+ */
 static int
 sg_set_reserved_sz(struct sg_fd *sfp, int want_rsv_sz)
 {
-	if (want_rsv_sz != sfp->reserve.buflen) {
-		if (sfp->mmap_called ||
-		    sfp->res_in_use) {
-			return -EBUSY;
+	bool use_new_srp = false;
+	int res = 0;
+	int new_sz, blen;
+	unsigned long idx, iflags;
+	struct sg_request *o_srp;       /* prior reserve sg_request */
+	struct sg_request *n_srp;       /* new sg_request, may be used */
+	struct sg_request *t_srp;       /* other fl entries */
+	struct sg_device *sdp = sfp->parentdp;
+	struct xarray *xafp = &sfp->srp_arr;
+
+	o_srp = sfp->rsv_srp;
+	if (!o_srp)
+		return -EPROTO;
+	new_sz = min_t(int, want_rsv_sz, sdp->max_sgat_sz);
+	new_sz = max_t(int, new_sz, sfp->sgat_elem_sz);
+	blen = o_srp->sgat_h.buflen;
+	SG_LOG(3, sfp, "%s: was=%d, ask=%d, new=%d (sgat_elem_sz=%d)\n",
+	       __func__, blen, want_rsv_sz, new_sz, sfp->sgat_elem_sz);
+	if (blen == new_sz)
+		return 0;
+	n_srp = sg_mk_srp_sgat(sfp, true /* can take time */, new_sz);
+	if (IS_ERR(n_srp))
+		return PTR_ERR(n_srp);
+	sg_rq_state_force(n_srp, SG_RS_INACTIVE);
+	/* new sg_request object, sized correctly is now available */
+try_again:
+	o_srp = sfp->rsv_srp;
+	if (!o_srp) {
+		res = -EPROTO;
+		goto fini;
+	}
+	if (SG_RS_ACTIVE(o_srp) ||
+	    test_bit(SG_FFD_MMAP_CALLED, sfp->ffd_bm)) {
+		res = -EBUSY;
+		goto fini;
+	}
+	use_new_srp = true;
+	xa_for_each(xafp, idx, t_srp) {
+		if (t_srp != o_srp && new_sz <= t_srp->sgat_h.buflen &&
+		    !SG_RS_ACTIVE(t_srp)) {
+			/* good candidate on free list, use */
+			use_new_srp = false;
+			sfp->rsv_srp = t_srp;
+			break;
 		}
-		sg_remove_sgat(sfp, &sfp->reserve);
-		sg_build_reserve(sfp, want_rsv_sz);
 	}
-	return 0;
+	if (use_new_srp) {
+		struct sg_request *cxc_srp;
+
+		xa_lock_irqsave(xafp, iflags);
+		n_srp->rq_idx = o_srp->rq_idx;
+		idx = o_srp->rq_idx;
+		cxc_srp = __xa_cmpxchg(xafp, idx, o_srp, n_srp, GFP_ATOMIC);
+		if (o_srp == cxc_srp) {
+			sfp->rsv_srp = n_srp;
+			sg_rq_state_force(n_srp, SG_RS_INACTIVE);
+			xa_unlock_irqrestore(xafp, iflags);
+			SG_LOG(6, sfp, "%s: new rsv srp=0x%pK ++\n", __func__,
+			       n_srp);
+			sg_remove_sgat(o_srp);
+			kfree(o_srp);
+		} else {
+			xa_unlock_irqrestore(xafp, iflags);
+			SG_LOG(1, sfp, "%s: xa_cmpxchg() failed, again\n",
+			       __func__);
+			goto try_again;
+		}
+	}
+fini:
+	if (!use_new_srp) {
+		sg_remove_sgat(n_srp);
+		kfree(n_srp);   /* no-one else has seen n_srp, so safe */
+	}
+	return res;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1357,26 +1503,43 @@ static int put_compat_request_table(struct compat_sg_req_info __user *o,
 }
 #endif
 
+/*
+ * For backward compatibility, output SG_MAX_QUEUE sg_req_info objects. First
+ * fetch from the active list then, if there is still room, from the free
+ * list. Some of the trailing elements may be empty which is indicated by all
+ * fields being zero. Any requests beyond SG_MAX_QUEUE are ignored.
+ */
 static int
 sg_ctl_req_tbl(struct sg_fd *sfp, void __user *p)
 {
-	int result, val;
+	int k, result, val;
 	unsigned long idx;
 	struct sg_request *srp;
-	sg_req_info_t *rinfop;
+	struct sg_req_info *rinfop;
 
-	rinfop = kcalloc(SG_MAX_QUEUE, SZ_SG_REQ_INFO,
-			 GFP_KERNEL);
+	SG_LOG(3, sfp, "%s:    SG_GET_REQUEST_TABLE\n", __func__);
+	k = SG_MAX_QUEUE;
+	rinfop = kcalloc(k, SZ_SG_REQ_INFO, GFP_KERNEL);
 	if (!rinfop)
 		return -ENOMEM;
 	val = 0;
 	xa_for_each(&sfp->srp_arr, idx, srp) {
 		if (!srp)
 			continue;
-		if (xa_get_mark(&sfp->srp_arr, idx, SG_XA_RQ_AWAIT))
+		if (val >= SG_MAX_QUEUE)
+			break;
+		if (xa_get_mark(&sfp->srp_arr, idx, SG_XA_RQ_INACTIVE))
+			continue;
+		sg_fill_request_element(sfp, srp, rinfop + val);
+		val++;
+	}
+	xa_for_each(&sfp->srp_arr, idx, srp) {
+		if (!srp)
 			continue;
 		if (val >= SG_MAX_QUEUE)
 			break;
+		if (!xa_get_mark(&sfp->srp_arr, idx, SG_XA_RQ_INACTIVE))
+			continue;
 		sg_fill_request_element(sfp, srp, rinfop + val);
 		val++;
 	}
@@ -1443,18 +1606,15 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		sfp->force_packid = val ? 1 : 0;
+		assign_bit(SG_FFD_FORCE_PACKID, sfp->ffd_bm, !!val);
 		return 0;
 	case SG_GET_PACK_ID:    /* or tag of oldest "read"-able, -1 if none */
 		val = -1;
-		srp = NULL;
 		xa_for_each_marked(&sfp->srp_arr, idx, srp, SG_XA_RQ_AWAIT) {
 			if (!srp)
 				continue;
-			if ((1 == srp->done) && (!srp->sg_io_owned)) {
-				val = srp->header.pack_id;
-				break;
-			}
+			val = srp->pack_id;
+			break;
 		}
 		SG_LOG(3, sfp, "%s:    SG_GET_PACK_ID=%d\n", __func__, val);
 		return put_user(val, ip);
@@ -1465,44 +1625,48 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		       sdp->max_sgat_sz);
 		return put_user(sdp->max_sgat_sz, ip);
 	case SG_SET_RESERVED_SIZE:
-		mutex_lock(&sfp->f_mutex);
 		result = get_user(val, ip);
 		if (!result) {
 			if (val >= 0 && val <= (1024 * 1024 * 1024)) {
+				mutex_lock(&sfp->f_mutex);
 				result = sg_set_reserved_sz(sfp, val);
+				mutex_unlock(&sfp->f_mutex);
 			} else {
 				SG_LOG(3, sfp, "%s: invalid size\n", __func__);
 				result = -EINVAL;
 			}
 		}
-		mutex_unlock(&sfp->f_mutex);
 		return result;
 	case SG_GET_RESERVED_SIZE:
-		val = min_t(int, sfp->reserve.buflen,
-			    max_sectors_bytes(sdev->request_queue));
+		mutex_lock(&sfp->f_mutex);
+		val = min_t(int, sfp->rsv_srp->sgat_h.buflen,
+			    sdp->max_sgat_sz);
+		mutex_unlock(&sfp->f_mutex);
 		SG_LOG(3, sfp, "%s:    SG_GET_RESERVED_SIZE=%d\n",
 		       __func__, val);
-		return put_user(val, ip);
+		result = put_user(val, ip);
+		return result;
 	case SG_SET_COMMAND_Q:
 		SG_LOG(3, sfp, "%s:    SG_SET_COMMAND_Q\n", __func__);
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		sfp->cmd_q = val ? 1 : 0;
+		assign_bit(SG_FFD_CMD_Q, sfp->ffd_bm, !!val);
 		return 0;
 	case SG_GET_COMMAND_Q:
 		SG_LOG(3, sfp, "%s:    SG_GET_COMMAND_Q\n", __func__);
-		return put_user((int) sfp->cmd_q, ip);
+		return put_user(test_bit(SG_FFD_CMD_Q, sfp->ffd_bm), ip);
 	case SG_SET_KEEP_ORPHAN:
 		SG_LOG(3, sfp, "%s:    SG_SET_KEEP_ORPHAN\n", __func__);
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		sfp->keep_orphan = val;
+		assign_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm, !!val);
 		return 0;
 	case SG_GET_KEEP_ORPHAN:
 		SG_LOG(3, sfp, "%s:    SG_GET_KEEP_ORPHAN\n", __func__);
-		return put_user((int) sfp->keep_orphan, ip);
+		return put_user(test_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm),
+				ip);
 	case SG_GET_VERSION_NUM:
 		SG_LOG(3, sfp, "%s:    SG_GET_VERSION_NUM\n", __func__);
 		return put_user(sg_version_num, ip);
@@ -1557,6 +1721,8 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		return put_user(val, ip);
 	case SG_EMULATED_HOST:
 		SG_LOG(3, sfp, "%s:    SG_EMULATED_HOST\n", __func__);
+		if (unlikely(SG_IS_DETACHING(sdp)))
+			return -ENODEV;
 		return put_user(sdev->host->hostt->emulated, ip);
 	case SCSI_IOCTL_SEND_COMMAND:
 		SG_LOG(3, sfp, "%s:    SCSI_IOCTL_SEND_COMMAND\n", __func__);
@@ -1567,7 +1733,7 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		assign_bit(SG_FDEV_LOG_SENSE, sdp->fdev_bm, val);
+		assign_bit(SG_FDEV_LOG_SENSE, sdp->fdev_bm, !!val);
 		if (val == 0)	/* user can force recalculation */
 			sg_calc_sgat_param(sdp);
 		return 0;
@@ -1612,7 +1778,7 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		break;
 	}
 	result = sg_allow_if_err_recovery(sdp, filp->f_flags & O_NDELAY);
-	if (result)
+	if (unlikely(result))
 		return result;
 	return -ENOIOCTLCMD;
 }
@@ -1675,7 +1841,7 @@ sg_poll(struct file *filp, poll_table * wait)
 
 	if (unlikely(SG_IS_DETACHING(sfp->parentdp)))
 		p_res |= EPOLLHUP;
-	else if (likely(sfp->cmd_q))
+	else if (likely(test_bit(SG_FFD_CMD_Q, sfp->ffd_bm)))
 		p_res |= EPOLLOUT | EPOLLWRNORM;
 	else if (atomic_read(&sfp->submitted) == 0)
 		p_res |= EPOLLOUT | EPOLLWRNORM;
@@ -1697,9 +1863,10 @@ static vm_fault_t
 sg_vma_fault(struct vm_fault *vmf)
 {
 	int k, length;
-	unsigned long offset, len, sa;
+	unsigned long offset, len, sa, iflags;
 	struct vm_area_struct *vma = vmf->vma;
 	struct sg_scatter_hold *rsv_schp;
+	struct sg_request *srp;
 	struct sg_device *sdp;
 	struct sg_fd *sfp;
 	const char *nbp = "==NULL, bad";
@@ -1718,12 +1885,18 @@ sg_vma_fault(struct vm_fault *vmf)
 		SG_LOG(1, sfp, "%s: device detaching\n", __func__);
 		goto out_err;
 	}
-	rsv_schp = &sfp->reserve;
+	srp = sfp->rsv_srp;
+	if (!srp) {
+		SG_LOG(1, sfp, "%s: srp%s\n", __func__, nbp);
+		goto out_err;
+	}
+	xa_lock_irqsave(&sfp->srp_arr, iflags);
+	rsv_schp = &srp->sgat_h;
 	offset = vmf->pgoff << PAGE_SHIFT;
 	if (offset >= (unsigned int)rsv_schp->buflen) {
 		SG_LOG(1, sfp, "%s: offset[%lu] >= rsv.buflen\n", __func__,
 		       offset);
-		goto out_err;
+		goto out_err_unlock;
 	}
 	sa = vma->vm_start;
 	SG_LOG(3, sfp, "%s: vm_start=0x%lx, off=%lu\n", __func__, sa, offset);
@@ -1736,6 +1909,7 @@ sg_vma_fault(struct vm_fault *vmf)
 			struct page *pp;
 
 			pp = rsv_schp->pages[k];
+			xa_unlock_irqrestore(&sfp->srp_arr, iflags);
 			page = nth_page(pp, offset >> PAGE_SHIFT);
 			get_page(page); /* increment page count */
 			vmf->page = page;
@@ -1744,6 +1918,8 @@ sg_vma_fault(struct vm_fault *vmf)
 		sa += len;
 		offset -= len;
 	}
+out_err_unlock:
+	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
 out_err:
 	return VM_FAULT_SIGBUS;
 }
@@ -1758,9 +1934,10 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	int k, length;
 	int ret = 0;
-	unsigned long req_sz, len, sa;
+	unsigned long req_sz, len, sa, iflags;
 	struct sg_scatter_hold *rsv_schp;
 	struct sg_fd *sfp;
+	struct sg_request *srp;
 
 	if (!filp || !vma)
 		return -ENXIO;
@@ -1776,11 +1953,13 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 		return -EINVAL; /* only an offset of 0 accepted */
 	/* Check reserve request is inactive and has large enough buffer */
 	mutex_lock(&sfp->f_mutex);
-	if (sfp->res_in_use) {
+	srp = sfp->rsv_srp;
+	xa_lock_irqsave(&sfp->srp_arr, iflags);
+	if (SG_RS_ACTIVE(srp)) {
 		ret = -EBUSY;
 		goto out;
 	}
-	rsv_schp = &sfp->reserve;
+	rsv_schp = &srp->sgat_h;
 	if (req_sz > (unsigned long)rsv_schp->buflen) {
 		ret = -ENOMEM;
 		goto out;
@@ -1793,11 +1972,12 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 		sa += len;
 	}
 
-	sfp->mmap_called = 1;
+	set_bit(SG_FFD_MMAP_CALLED, sfp->ffd_bm);
 	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_private_data = sfp;
 	vma->vm_ops = &sg_mmap_vm_ops;
 out:
+	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
 	mutex_unlock(&sfp->f_mutex);
 	return ret;
 }
@@ -1819,8 +1999,10 @@ sg_rq_end_io_usercontext(struct work_struct *work)
 		return;
 	}
 	SG_LOG(3, sfp, "%s: srp=0x%pK\n", __func__, srp);
-	sg_finish_scsi_blk_rq(srp);
-	sg_deact_request(sfp, srp);
+	if (test_bit(SG_FRQ_DEACT_ORPHAN, srp->frq_bm)) {
+		sg_finish_scsi_blk_rq(srp);	/* clean up orphan case */
+		sg_deact_request(sfp, srp);
+	}
 	kref_put(&sfp->f_ref, sg_remove_sfp);
 }
 
@@ -1858,85 +2040,95 @@ sg_check_sense(struct sg_device *sdp, struct sg_request *srp, int sense_len)
 }
 
 /*
- * This function is a "bottom half" handler that is called by the mid
- * level when a command is completed (or has failed).
+ * This "bottom half" (soft interrupt) handler is called by the mid-level
+ * when a request has completed or failed. This callback is registered in a
+ * blk_execute_rq_nowait() call in the sg_common_write(). For ioctl(SG_IO)
+ * (sync) usage, sg_ctl_sg_io() waits to be woken up by this callback.
  */
 static void
 sg_rq_end_io(struct request *rq, blk_status_t status)
 {
+	enum sg_rq_state rqq_state = SG_RS_AWAIT_RCV;
+	int a_resid, slen;
 	struct sg_request *srp = rq->end_io_data;
 	struct scsi_request *scsi_rp = scsi_req(rq);
 	struct sg_device *sdp;
 	struct sg_fd *sfp;
-	unsigned int ms;
-	int resid, slen;
-	int done = 1;
-	unsigned long iflags;
 
-	if (WARN_ON(srp->done != 0))
+	if (!scsi_rp) {
+		WARN_ONCE("%s: scsi_req(rq) unexpectedly NULL\n", __func__);
 		return;
-
-	sfp = srp->parentfp;
-	if (WARN_ON(sfp == NULL))
+	}
+	if (!srp) {
+		WARN_ONCE("%s: srp unexpectedly NULL\n", __func__);
 		return;
-
+	}
+	if (WARN_ON(atomic_read(&srp->rq_st) != SG_RS_INFLIGHT)) {
+		pr_warn("%s: bad rq_st=%d\n", __func__,
+			atomic_read(&srp->rq_st));
+		goto early_err;
+	}
+	sfp = srp->parentfp;
+	if (unlikely(!sfp)) {
+		WARN_ONCE(1, "%s: sfp unexpectedly NULL", __func__);
+		goto early_err;
+	}
 	sdp = sfp->parentdp;
 	if (unlikely(SG_IS_DETACHING(sdp)))
 		pr_info("%s: device detaching\n", __func__);
 
 	srp->rq_result = scsi_rp->result;
-	resid = scsi_rp->resid_len;
-
-	srp->header.resid = resid;
-
 	slen = min_t(int, scsi_rp->sense_len, SCSI_SENSE_BUFFERSIZE);
+	a_resid = scsi_rp->resid_len;
 
-	SG_LOG(6, sfp, "%s: pack_id=%d, res=0x%x\n", __func__,
-	       srp->header.pack_id, srp->rq_result);
-	ms = jiffies_to_msecs(jiffies);
-	srp->header.duration = (ms > srp->header.duration) ?
-				(ms - srp->header.duration) : 0;
-	if (srp->rq_result != 0 && slen > 0)
+	if (a_resid)
+		srp->in_resid = a_resid;
+
+	SG_LOG(6, sfp, "%s: pack_id=%d, res=0x%x\n", __func__, srp->pack_id,
+	       srp->rq_result);
+	srp->duration = sg_calc_rq_dur(srp);
+	if (unlikely((srp->rq_result & SG_ML_RESULT_MSK) && slen > 0))
 		sg_check_sense(sdp, srp, slen);
 	if (slen > 0)
 		memcpy(srp->sense_b, scsi_rp->sense, slen);
-
-	/* Rely on write phase to clean out srp status values, so no "else" */
-
-	if (!srp->sg_io_owned)
+	srp->sense_len = slen;
+	if (unlikely(test_bit(SG_FRQ_IS_ORPHAN, srp->frq_bm))) {
+		if (test_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm)) {
+			clear_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm);
+		} else {
+			rqq_state = SG_RS_BUSY;
+			set_bit(SG_FRQ_DEACT_ORPHAN, srp->frq_bm);
+		}
+	}
+	if (!test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm))
 		atomic_inc(&sfp->waiting);
+	if (unlikely(sg_rq_state_chg(srp, SG_RS_INFLIGHT, rqq_state,
+				     false, __func__)))
+		pr_warn("%s: can't set rq_st\n", __func__);
 	/*
-	 * Free the request as soon as it is complete so that its resources
-	 * can be reused without waiting for userspace to read() the
-	 * result.  But keep the associated bio (if any) around until
-	 * blk_rq_unmap_user() can be called from user context.
+	 * Free the mid-level resources apart from the bio (if any). The bio's
+	 * blk_rq_unmap_user() can be called later from user context.
 	 */
 	srp->rq = NULL;
-	scsi_req_free_cmd(scsi_req(rq));
+	scsi_req_free_cmd(scsi_rp);
 	blk_put_request(rq);
 
-	if (unlikely(srp->orphan)) {
-		if (sfp->keep_orphan)
-			srp->sg_io_owned = 0;
-		else
-			done = 0;
-	}
-	srp->done = done;
-
-	if (likely(done)) {
-		/* Now wake up any sg_read() that is waiting for this
-		 * packet.
-		 */
-		xa_lock_irqsave(&sfp->srp_arr, iflags);
-		__xa_set_mark(&sfp->srp_arr, srp->rq_idx, SG_XA_RQ_AWAIT);
-		xa_unlock_irqrestore(&sfp->srp_arr, iflags);
+	if (likely(rqq_state == SG_RS_AWAIT_RCV)) {
+		/* Wake any sg_read()/ioctl(SG_IORECEIVE) awaiting this req */
 		wake_up_interruptible(&sfp->read_wait);
 		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN);
 		kref_put(&sfp->f_ref, sg_remove_sfp);
-	} else {
+	} else {        /* clean up orphaned request that aren't being kept */
 		INIT_WORK(&srp->ew_orph.work, sg_rq_end_io_usercontext);
 		schedule_work(&srp->ew_orph.work);
 	}
+	return;
+
+early_err:
+	srp->rq = NULL;
+	if (scsi_rp)
+		scsi_req_free_cmd(scsi_rp);
+	blk_put_request(rq);
 }
 
 static const struct file_operations sg_fops = {
@@ -1964,9 +2156,9 @@ static struct sg_device *
 sg_add_device_helper(struct gendisk *disk, struct scsi_device *scsidp)
 {
 	struct sg_device *sdp;
-	unsigned long iflags;
 	int error;
 	u32 k;
+	unsigned long iflags;
 
 	sdp = kzalloc(sizeof(*sdp), GFP_KERNEL);
 	if (!sdp)
@@ -1984,7 +2176,7 @@ sg_add_device_helper(struct gendisk *disk, struct scsi_device *scsidp)
 			error = -ENODEV;
 		} else {
 			sdev_printk(KERN_WARNING, scsidp,
-				    "%s: idr alloc sg_device failure: %d\n",
+				"%s: idr allocation sg_device failure: %d\n",
 				    __func__, error);
 		}
 		goto out_unlock;
@@ -2186,6 +2378,7 @@ init_sg(void)
 				    SG_MAX_DEVS, "sg");
 	if (rc)
 		return rc;
+
 	pr_info("Registered %s[char major=0x%x], version: %s, date: %s\n",
 		"sg device ", SCSI_GENERIC_MAJOR, SG_VERSION_STR,
 		sg_version_date);
@@ -2201,6 +2394,7 @@ init_sg(void)
 		return 0;
 	}
 	class_destroy(sg_sysfs_class);
+
 err_out_unreg:
 	unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS);
 	return rc;
@@ -2227,6 +2421,19 @@ exit_sg(void)
 	idr_destroy(&sg_index_idr);
 }
 
+static inline bool
+sg_chk_dio_allowed(struct sg_device *sdp, struct sg_request *srp,
+		   int iov_count, int dir)
+{
+	if (sg_allow_dio && (srp->rq_flags & SG_FLAG_DIRECT_IO)) {
+		if (dir != SG_DXFER_UNKNOWN && !iov_count) {
+			if (!sdp->device->host->unchecked_isa_dma)
+				return true;
+		}
+	}
+	return false;
+}
+
 static void
 sg_set_map_data(const struct sg_scatter_hold *schp, bool up_valid,
 		struct rq_map_data *mdp)
@@ -2240,31 +2447,40 @@ sg_set_map_data(const struct sg_scatter_hold *schp, bool up_valid,
 }
 
 static int
-sg_start_req(struct sg_request *srp, u8 *cmd)
+sg_start_req(struct sg_request *srp, u8 *cmd, int cmd_len, int dxfer_dir)
 {
-	int res;
+	bool reserved, us_xfer;
+	int res = 0;
+	int dxfer_len = 0;
+	int r0w = READ;
+	unsigned int iov_count = 0;
+	void __user *up;
 	struct request *rq;
-	struct scsi_request *req;
+	struct scsi_request *scsi_rp;
 	struct sg_fd *sfp = srp->parentfp;
-	struct sg_io_hdr *hp = &srp->header;
-	int dxfer_len = (int) hp->dxfer_len;
-	int dxfer_dir = hp->dxfer_direction;
-	unsigned int iov_count = hp->iovec_count;
-	struct sg_scatter_hold *req_schp = &srp->data;
-	struct sg_scatter_hold *rsv_schp = &sfp->reserve;
-	struct request_queue *q = sfp->parentdp->device->request_queue;
-	struct rq_map_data *md, map_data;
-	int r0w = hp->dxfer_direction == SG_DXFER_TO_DEV ? WRITE : READ;
+	struct sg_device *sdp;
+	struct sg_scatter_hold *req_schp;
+	struct request_queue *q;
+	struct rq_map_data *md = (void *)srp; /* want any non-NULL value */
 	u8 *long_cmdp = NULL;
+	__maybe_unused const char *cp = "";
+	struct sg_slice_hdr3 *sh3p = &srp->s_hdr3;
+	struct rq_map_data map_data;
 
-	if (hp->cmd_len > BLK_MAX_CDB) {
-		long_cmdp = kzalloc(hp->cmd_len, GFP_KERNEL);
+	sdp = sfp->parentdp;
+	if (cmd_len > BLK_MAX_CDB) {	/* for longer SCSI cdb_s */
+		long_cmdp = kzalloc(cmd_len, GFP_KERNEL);
 		if (!long_cmdp)
 			return -ENOMEM;
 		SG_LOG(5, sfp, "%s: long_cmdp=0x%pK ++\n", __func__, long_cmdp);
 	}
+	up = sh3p->dxferp;
+	dxfer_len = (int)sh3p->dxfer_len;
+	iov_count = sh3p->iovec_count;
+	r0w = dxfer_dir == SG_DXFER_TO_DEV ? WRITE : READ;
 	SG_LOG(4, sfp, "%s: dxfer_len=%d, data-%s\n", __func__, dxfer_len,
 	       (r0w ? "OUT" : "IN"));
+	q = sdp->device->request_queue;
 
 	/*
 	 * NOTE
@@ -2277,125 +2493,142 @@ sg_start_req(struct sg_request *srp, u8 *cmd)
 	 * do not want to use BLK_MQ_REQ_NOWAIT here because userspace might
 	 * not expect an EWOULDBLOCK from this condition.
 	 */
-	rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+	rq = blk_get_request(q, (r0w ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN), 0);
 	if (IS_ERR(rq)) {
 		kfree(long_cmdp);
 		return PTR_ERR(rq);
 	}
-	req = scsi_req(rq);
-
-	if (hp->cmd_len > BLK_MAX_CDB)
-		req->cmd = long_cmdp;
-	memcpy(req->cmd, cmd, hp->cmd_len);
-	req->cmd_len = hp->cmd_len;
-
+	/* current sg_request protected by SG_RS_BUSY state */
+	scsi_rp = scsi_req(rq);
 	srp->rq = rq;
-	rq->end_io_data = srp;
-	req->retries = SG_DEFAULT_RETRIES;
-
-	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
-		return 0;
 
-	if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO &&
-	    dxfer_dir != SG_DXFER_UNKNOWN && !iov_count &&
-	    !sfp->parentdp->device->host->unchecked_isa_dma &&
-	    blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len))
+	if (cmd_len > BLK_MAX_CDB)
+		scsi_rp->cmd = long_cmdp;
+	memcpy(scsi_rp->cmd, cmd, cmd_len);
+	scsi_rp->cmd_len = cmd_len;
+	us_xfer = !(srp->rq_flags & SG_FLAG_NO_DXFER);
+	assign_bit(SG_FRQ_NO_US_XFER, srp->frq_bm, !us_xfer);
+	reserved = (sfp->rsv_srp == srp);
+	rq->end_io_data = srp;
+	scsi_rp->retries = SG_DEFAULT_RETRIES;
+	req_schp = &srp->sgat_h;
+
+	if (dxfer_len <= 0 || dxfer_dir == SG_DXFER_NONE) {
+		SG_LOG(4, sfp, "%s: no data xfer [0x%pK]\n", __func__, srp);
+		set_bit(SG_FRQ_NO_US_XFER, srp->frq_bm);
+		goto fini;	/* path of reqs with no din nor dout */
+	} else if (sg_chk_dio_allowed(sdp, srp, iov_count, dxfer_dir) &&
+		   blk_rq_aligned(q, (unsigned long)up, dxfer_len)) {
+		srp->rq_info |= SG_INFO_DIRECT_IO;
 		md = NULL;
-	else
+		if (IS_ENABLED(CONFIG_SCSI_PROC_FS))
+			cp = "direct_io, ";
+	} else {	/* normal IO and failed conditions for dio path */
 		md = &map_data;
+	}
 
-	if (md) {
-		mutex_lock(&sfp->f_mutex);
-		if (dxfer_len <= rsv_schp->buflen &&
-		    !sfp->res_in_use) {
-			sfp->res_in_use = 1;
-			sg_link_reserve(sfp, srp, dxfer_len);
-		} else if (hp->flags & SG_FLAG_MMAP_IO) {
-			res = -EBUSY; /* sfp->res_in_use == 1 */
-			if (dxfer_len > rsv_schp->buflen)
-				res = -ENOMEM;
-			mutex_unlock(&sfp->f_mutex);
-			return res;
-		} else {
-			res = sg_mk_sgat(req_schp, sfp, dxfer_len);
-			if (res) {
-				mutex_unlock(&sfp->f_mutex);
-				return res;
-			}
+	if (likely(md)) {	/* normal, "indirect" IO */
+		if (unlikely((srp->rq_flags & SG_FLAG_MMAP_IO))) {
+			/* mmap IO must use and fit in reserve request */
+			if (!reserved || dxfer_len > req_schp->buflen)
+				res = reserved ? -ENOMEM : -EBUSY;
+		} else if (req_schp->buflen == 0) {
+			int up_sz = max_t(int, dxfer_len, sfp->sgat_elem_sz);
+
+			res = sg_mk_sgat(srp, sfp, up_sz);
 		}
-		mutex_unlock(&sfp->f_mutex);
+		if (res)
+			goto fini;
 
-		sg_set_map_data(req_schp, !!hp->dxferp, md);
+		sg_set_map_data(req_schp, !!up, md);
 		md->from_user = (dxfer_dir == SG_DXFER_TO_FROM_DEV);
 	}
 
-	if (iov_count) {
+	if (unlikely(iov_count)) {
 		struct iovec *iov = NULL;
 		struct iov_iter i;
 
-		res = import_iovec(r0w, hp->dxferp, iov_count, 0, &iov, &i);
+		res = import_iovec(r0w, up, iov_count, 0, &iov, &i);
 		if (res < 0)
-			return res;
+			goto fini;
 
-		iov_iter_truncate(&i, hp->dxfer_len);
+		iov_iter_truncate(&i, dxfer_len);
 		if (!iov_iter_count(&i)) {
 			kfree(iov);
-			return -EINVAL;
+			res = -EINVAL;
+			goto fini;
 		}
 
-		res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC);
+		if (us_xfer)
+			res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC);
 		kfree(iov);
-	} else
-		res = blk_rq_map_user(q, rq, md, hp->dxferp,
-				      hp->dxfer_len, GFP_ATOMIC);
-
-	if (!res) {
+		if (IS_ENABLED(CONFIG_SCSI_PROC_FS))
+			cp = "iov_count > 0";
+	} else if (us_xfer) { /* setup for transfer data to/from user space */
+		res = blk_rq_map_user(q, rq, md, up, dxfer_len, GFP_ATOMIC);
+		if (IS_ENABLED(CONFIG_SCSI_PROC_FS) && res)
+			SG_LOG(1, sfp, "%s: blk_rq_map_user() res=%d\n",
+			       __func__, res);
+	}
+fini:
+	if (unlikely(res)) {		/* failure, free up resources */
+		scsi_req_free_cmd(scsi_rp);
+		srp->rq = NULL;
+		blk_put_request(rq);
+	} else {
 		srp->bio = rq->bio;
-
-		if (!md) {
-			req_schp->dio_in_use = 1;
-			hp->info |= SG_INFO_DIRECT_IO;
-		}
 	}
+	SG_LOG((res ? 1 : 4), sfp, "%s: %s res=%d [0x%pK]\n", __func__, cp,
+	       res, srp);
 	return res;
 }
 
+/*
+ * Clean up mid-level and block layer resources of finished request. Sometimes
+ * blk_rq_unmap_user() returns -4 (-EINTR) and this is why: "If we're in a
+ * workqueue, the request is orphaned, so don't copy into a random user
+ * address space, just free and return -EINTR so user space doesn't expect
+ * any data." [block/bio.c]
+ */
 static void
 sg_finish_scsi_blk_rq(struct sg_request *srp)
 {
 	int ret;
-
 	struct sg_fd *sfp = srp->parentfp;
-	struct sg_scatter_hold *req_schp = &srp->data;
+	struct request *rq = srp->rq;
 
 	SG_LOG(4, sfp, "%s: srp=0x%pK%s\n", __func__, srp,
-	       (srp->res_used) ? " rsv" : "");
-	if (!srp->sg_io_owned) {
+	       (srp->parentfp->rsv_srp == srp) ? " rsv" : "");
+	if (!test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm)) {
 		atomic_dec(&sfp->submitted);
 		atomic_dec(&sfp->waiting);
 	}
 	if (srp->bio) {
-		ret = blk_rq_unmap_user(srp->bio);
-		if (ret)	/* -EINTR (-4) can be ignored */
-			SG_LOG(6, sfp, "%s: blk_rq_unmap_user() --> %d\n",
-			       __func__, ret);
+		bool us_xfer = !test_bit(SG_FRQ_NO_US_XFER, srp->frq_bm);
+
+		if (us_xfer) {
+			ret = blk_rq_unmap_user(srp->bio);
+			if (ret) {	/* -EINTR (-4) can be ignored */
+				SG_LOG(6, sfp,
+				       "%s: blk_rq_unmap_user() --> %d\n",
+				       __func__, ret);
+			}
+		}
 		srp->bio = NULL;
 	}
+	/* In worst case READ data returned to user space by this point */
 
-	if (srp->rq) {
-		scsi_req_free_cmd(scsi_req(srp->rq));
-		blk_put_request(srp->rq);
+	/* Expect blk_put_request(rq) already called in sg_rq_end_io() */
+	if (rq) {       /* blk_get_request() may have failed */
+		if (scsi_req(rq))
+			scsi_req_free_cmd(scsi_req(rq));
+		srp->rq = NULL;
+		blk_put_request(rq);
 	}
-
-	if (srp->res_used)
-		sg_unlink_reserve(sfp, srp);
-	else
-		sg_remove_sgat(sfp, req_schp);
 }
 
 static int
-sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp, int minlen)
+sg_mk_sgat(struct sg_request *srp, struct sg_fd *sfp, int minlen)
 {
 	int j, k, rem_sz, align_sz, order, o_order;
 	int mx_sgat_elems = sfp->parentdp->max_sgat_elems;
@@ -2404,6 +2637,7 @@ sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp, int minlen)
 	gfp_t mask_ap = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN | __GFP_ZERO;
 	gfp_t mask_kz = GFP_ATOMIC | __GFP_NOWARN;
 	struct sg_device *sdp = sfp->parentdp;
+	struct sg_scatter_hold *schp = &srp->sgat_h;
 
 	if (unlikely(minlen <= 0)) {
 		if (minlen < 0)
@@ -2414,9 +2648,8 @@ sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp, int minlen)
 	align_sz = ALIGN(minlen, SG_DEF_SECTOR_SZ);
 
 	schp->pages = kcalloc(mx_sgat_elems, ptr_sz, mask_kz);
-	SG_LOG(4, sfp, "%s: minlen=%d, align_sz=%d [sz=%zu, 0x%pK ++]\n",
-	       __func__, minlen, align_sz, mx_sgat_elems * ptr_sz,
-	       schp->pages);
+	SG_LOG(4, sfp, "%s: minlen=%d [sz=%zu, 0x%pK ++]\n", __func__, minlen,
+	       mx_sgat_elems * ptr_sz, schp->pages);
 	if (unlikely(!schp->pages))
 		return -ENOMEM;
 
@@ -2480,13 +2713,15 @@ sg_remove_sgat_helper(struct sg_fd *sfp, struct sg_scatter_hold *schp)
 
 /* Remove the data (possibly a sgat list) held by srp, not srp itself */
 static void
-sg_remove_sgat(struct sg_fd *sfp, struct sg_scatter_hold *schp)
+sg_remove_sgat(struct sg_request *srp)
 {
+	struct sg_scatter_hold *schp = &srp->sgat_h; /* care: remove own data */
+	struct sg_fd *sfp = srp->parentfp;
+
 	SG_LOG(4, sfp, "%s: num_sgat=%d%s\n", __func__, schp->num_sgat,
-	       ((sfp ? (&sfp->reserve == schp) : false) ?
+	       ((srp->parentfp ? (sfp->rsv_srp == srp) : false) ?
 		" [rsv]" : ""));
-	if (!schp->dio_in_use)
-		sg_remove_sgat_helper(sfp, schp);
+	sg_remove_sgat_helper(sfp, schp);
 
 	memset(schp, 0, sizeof(*schp));         /* zeros buflen and dlen */
 }
@@ -2501,7 +2736,7 @@ sg_read_append(struct sg_request *srp, void __user *outp, int num_xfer)
 {
 	int k, num, res;
 	struct page *pgp;
-	struct sg_scatter_hold *schp = &srp->data;
+	struct sg_scatter_hold *schp = &srp->sgat_h;
 
 	SG_LOG(4, srp->parentfp, "%s: num_xfer=%d\n", __func__, num_xfer);
 	if (unlikely(!outp || num_xfer <= 0))
@@ -2515,11 +2750,11 @@ sg_read_append(struct sg_request *srp, void __user *outp, int num_xfer)
 			break;
 		}
 		if (num > num_xfer) {
-			if (__copy_to_user(outp, page_address(pgp), num_xfer))
+			if (copy_to_user(outp, page_address(pgp), num_xfer))
 				res = -EFAULT;
 			break;
 		} else {
-			if (__copy_to_user(outp, page_address(pgp), num)) {
+			if (copy_to_user(outp, page_address(pgp), num)) {
 				res = -EFAULT;
 				break;
 			}
@@ -2542,135 +2777,273 @@ sg_read_append(struct sg_request *srp, void __user *outp, int num_xfer)
 static struct sg_request *
 sg_find_srp_by_id(struct sg_fd *sfp, int pack_id)
 {
+	__maybe_unused bool is_bad_st = false;
+	__maybe_unused enum sg_rq_state bad_sr_st = SG_RS_INACTIVE;
+	bool search_for_1 = (pack_id != SG_PACK_ID_WILDCARD);
+	int res;
+	int num_waiting = atomic_read(&sfp->waiting);
 	unsigned long idx;
-	struct sg_request *resp;
+	struct sg_request *srp = NULL;
+	struct xarray *xafp = &sfp->srp_arr;
 
-	xa_for_each_marked(&sfp->srp_arr, idx, resp, SG_XA_RQ_AWAIT) {
-		if (!resp)
-			continue;
-		/* look for requests that are ready + not SG_IO owned */
-		if (resp->done == 1 && !resp->sg_io_owned &&
-		    (-1 == pack_id || resp->header.pack_id == pack_id)) {
-			resp->done = 2;	/* guard against other readers */
-			return resp;
+	if (num_waiting < 1)
+		return NULL;
+	if (unlikely(search_for_1)) {
+		xa_for_each_marked(xafp, idx, srp, SG_XA_RQ_AWAIT) {
+			if (!srp)
+				continue;
+			if (srp->pack_id != pack_id)
+				continue;
+			res = sg_rq_state_chg(srp, SG_RS_AWAIT_RCV, SG_RS_BUSY,
+					      false, __func__);
+			if (likely(res == 0))
+				goto good;
+			/* else another caller got it, move on */
+			if (IS_ENABLED(CONFIG_SCSI_PROC_FS)) {
+				is_bad_st = true;
+				bad_sr_st = atomic_read(&srp->rq_st);
+			}
+			break;
+		}
+	} else {        /* search for any request is more likely */
+		xa_for_each_marked(xafp, idx, srp, SG_XA_RQ_AWAIT) {
+			if (!srp)
+				continue;
+			res = sg_rq_state_chg(srp, SG_RS_AWAIT_RCV, SG_RS_BUSY,
+					      false, __func__);
+			if (likely(res == 0))
+				goto good;
+		}
+	}
+	/* here if one of above loops does _not_ find a match */
+	if (IS_ENABLED(CONFIG_SCSI_PROC_FS)) {
+		if (search_for_1) {
+			const char *cptp = "pack_id=";
+
+			if (is_bad_st)
+				SG_LOG(1, sfp, "%s: %s%d wrong state: %s\n",
+				       __func__, cptp, pack_id,
+				       sg_rq_st_str(bad_sr_st, true));
+			else
+				SG_LOG(6, sfp, "%s: %s%d not awaiting read\n",
+				       __func__, cptp, pack_id);
 		}
 	}
 	return NULL;
+good:
+	SG_LOG(6, sfp, "%s: %s%d found [srp=0x%pK]\n", __func__, "pack_id=",
+	       pack_id, srp);
+	return srp;
 }
 
-static void
-sg_link_reserve(struct sg_fd *sfp, struct sg_request *srp, int size)
+/*
+ * Makes a new sg_request object. If 'first' is set then use GFP_KERNEL which
+ * may take time but has improved chance of success, otherwise use GFP_ATOMIC.
+ * Note that basic initialization is done but srp is not added to either sfp
+ * list. On error returns twisted negated errno value (not NULL).
+ */
+static struct sg_request *
+sg_mk_srp(struct sg_fd *sfp, bool first)
 {
-	struct sg_scatter_hold *req_schp = &srp->data;
-	struct sg_scatter_hold *rsv_schp = &sfp->reserve;
-	int k, num, rem;
-
-	srp->res_used = 1;
-	SG_LOG(4, sfp, "%s: size=%d\n", __func__, size);
-	rem = size;
-
-	num = 1 << (PAGE_SHIFT + rsv_schp->page_order);
-	for (k = 0; k < rsv_schp->num_sgat; k++) {
-		if (rem <= num) {
-			req_schp->num_sgat = k + 1;
-			req_schp->sglist_len = rsv_schp->sglist_len;
-			req_schp->pages = rsv_schp->pages;
+	struct sg_request *srp;
+	gfp_t gfp =  __GFP_NOWARN;
 
-			req_schp->buflen = size;
-			req_schp->page_order = rsv_schp->page_order;
-			break;
-		} else
-			rem -= num;
+	if (first)      /* prepared to wait if none already outstanding */
+		srp = kzalloc(sizeof(*srp), gfp | GFP_KERNEL);
+	else
+		srp = kzalloc(sizeof(*srp), gfp | GFP_ATOMIC);
+	if (srp) {
+		atomic_set(&srp->rq_st, SG_RS_BUSY);
+		srp->parentfp = sfp;
+		return srp;
+	} else {
+		return ERR_PTR(-ENOMEM);
 	}
-
-	if (k >= rsv_schp->num_sgat)
-		SG_LOG(1, sfp, "%s: BAD size\n", __func__);
 }
 
-static void
-sg_unlink_reserve(struct sg_fd *sfp, struct sg_request *srp)
+static struct sg_request *
+sg_mk_srp_sgat(struct sg_fd *sfp, bool first, int db_len)
 {
-	struct sg_scatter_hold *req_schp = &srp->data;
+	int res;
+	struct sg_request *n_srp = sg_mk_srp(sfp, first);
 
-	SG_LOG(4, srp->parentfp, "%s: req->num_sgat=%d\n", __func__,
-	       (int)req_schp->num_sgat);
-	req_schp->num_sgat = 0;
-	req_schp->buflen = 0;
-	req_schp->pages = NULL;
-	req_schp->page_order = 0;
-	req_schp->sglist_len = 0;
-	srp->res_used = 0;
-	/* Called without mutex lock to avoid deadlock */
-	sfp->res_in_use = 0;
+	if (IS_ERR(n_srp))
+		return n_srp;
+	if (db_len > 0) {
+		res = sg_mk_sgat(n_srp, sfp, db_len);
+		if (res) {
+			kfree(n_srp);
+			return ERR_PTR(res);
+		}
+	}
+	return n_srp;
 }
 
-static void
-sg_build_reserve(struct sg_fd *sfp, int req_size)
+/*
+ * Irrespective of the given reserve request size, the minimum size requested
+ * will be PAGE_SIZE (often 4096 bytes). Returns a pointer to reserve object or
+ * a negated errno value twisted by ERR_PTR() macro. The actual number of bytes
+ * allocated (maybe less than buflen) is in srp->sgat_h.buflen . Note that this
+ * function is only called in contexts where locking is not required.
+ */
+static struct sg_request *
+sg_build_reserve(struct sg_fd *sfp, int buflen)
 {
-	struct sg_scatter_hold *schp = &sfp->reserve;
+	bool go_out = false;
+	int res;
+	struct sg_request *srp;
 
-	SG_LOG(3, sfp, "%s: buflen=%d\n", __func__, req_size);
+	SG_LOG(3, sfp, "%s: buflen=%d\n", __func__, buflen);
+	srp = sg_mk_srp(sfp, xa_empty(&sfp->srp_arr));
+	if (IS_ERR(srp))
+		return srp;
+	sfp->rsv_srp = srp;
 	do {
-		if (req_size < PAGE_SIZE)
-			req_size = PAGE_SIZE;
-		if (sg_mk_sgat(schp, sfp, req_size) == 0)
-			return;
-		sg_remove_sgat(sfp, schp);
-		req_size >>= 1;	/* divide by 2 */
-	} while (req_size > (PAGE_SIZE / 2));
+		if (buflen < (int)PAGE_SIZE) {
+			buflen = PAGE_SIZE;
+			go_out = true;
+		}
+		res = sg_mk_sgat(srp, sfp, buflen);
+		if (res == 0) {
+			SG_LOG(4, sfp, "%s: final buflen=%d, srp=0x%pK ++\n",
+			       __func__, buflen, srp);
+			return srp;
+		}
+		if (go_out)
+			return ERR_PTR(res);
+		/* failed so remove, halve buflen, try again */
+		sg_remove_sgat(srp);
+		buflen >>= 1;   /* divide by 2 */
+	} while (true);
 }
 
-/* always adds to end of list */
+/*
+ * Setup an active request (soon to carry a SCSI command) to the current file
+ * descriptor by creating a new one or re-using a request from the free
+ * list (fl). If successful returns a valid pointer in SG_RS_BUSY state. On
+ * failure returns a negated errno value twisted by ERR_PTR() macro.
+ */
 static struct sg_request *
-sg_setup_req(struct sg_fd *sfp)
+sg_setup_req(struct sg_fd *sfp, int dxfr_len, struct sg_comm_wr_t *cwrp)
 {
+	bool act_empty = false;
 	bool found = false;
+	bool mk_new_srp = false;
+	bool try_harder = false;
 	int res;
-	unsigned long idx, iflags;
-	struct sg_request *rp;
+	int num_inactive = 0;
+	unsigned long idx, last_idx, iflags;
+	struct sg_request *r_srp = NULL;	/* request to return */
 	struct xarray *xafp = &sfp->srp_arr;
-
-	if (!xa_empty(xafp)) {
-		xa_for_each_marked(xafp, idx, rp, SG_XA_RQ_INACTIVE) {
-			if (!rp)
+	__maybe_unused const char *cp;
+
+start_again:
+	cp = "";
+	if (xa_empty(xafp)) {
+		act_empty = true;
+		mk_new_srp = true;
+	} else if (!try_harder && dxfr_len < SG_DEF_SECTOR_SZ) {
+		last_idx = ~0UL;
+		xa_for_each_marked(xafp, idx, r_srp, SG_XA_RQ_INACTIVE) {
+			if (!r_srp)
 				continue;
-			if (sg_rstate_chg(rp, SG_RS_INACTIVE, SG_RS_BUSY))
+			++num_inactive;
+			if (dxfr_len < SG_DEF_SECTOR_SZ) {
+				last_idx = idx;
 				continue;
-			memset(rp, 0, sizeof(*rp));
-			rp->rq_idx = idx;
-			xa_lock_irqsave(xafp, iflags);
-			__xa_clear_mark(xafp, idx, SG_XA_RQ_INACTIVE);
-			xa_unlock_irqrestore(xafp, iflags);
+			}
+		}
+		/* If dxfr_len is small, use last inactive request */
+		if (last_idx != ~0UL) {
+			idx = last_idx;
+			r_srp = xa_load(xafp, idx);
+			if (!r_srp)
+				goto start_again;
+			if (sg_rq_state_chg(r_srp, SG_RS_INACTIVE, SG_RS_BUSY,
+					    false, __func__))
+				goto start_again; /* gone to another thread */
+			cp = "toward back of srp_arr";
 			found = true;
-			break;
+		}
+	} else {
+		xa_for_each_marked(xafp, idx, r_srp, SG_XA_RQ_INACTIVE) {
+			if (!r_srp)
+				continue;
+			if (r_srp->sgat_h.buflen >= dxfr_len) {
+				if (sg_rq_state_chg
+					(r_srp, SG_RS_INACTIVE, SG_RS_BUSY,
+					 false, __func__))
+					continue;
+				cp = "from front of srp_arr";
+				found = true;
+				break;
+			}
 		}
 	}
-	if (!found) {
-		rp = kzalloc(sizeof(*rp), GFP_KERNEL);
-		if (!rp)
-			return NULL;
+	if (found) {
+		r_srp->in_resid = 0;
+		r_srp->rq_info = 0;
+		r_srp->sense_len = 0;
+		mk_new_srp = false;
+	} else {
+		mk_new_srp = true;
 	}
-	rp->parentfp = sfp;
-	rp->header.duration = jiffies_to_msecs(jiffies);
-	if (!found) {
+	if (mk_new_srp) {
+		bool allow_cmd_q = test_bit(SG_FFD_CMD_Q, sfp->ffd_bm);
 		u32 n_idx;
 		struct xa_limit xal = { .max = 0, .min = 0 };
 
-		atomic_set(&rp->rq_st, SG_RS_BUSY);
+		cp = "new";
+		if (!allow_cmd_q && atomic_read(&sfp->submitted) > 0) {
+			r_srp = ERR_PTR(-EDOM);
+			SG_LOG(6, sfp, "%s: trying 2nd req but cmd_q=false\n",
+			       __func__);
+			goto fini;
+		}
+		r_srp = sg_mk_srp_sgat(sfp, act_empty, dxfr_len);
+		if (IS_ERR(r_srp)) {
+			if (!try_harder && dxfr_len < SG_DEF_SECTOR_SZ &&
+			    num_inactive > 0) {
+				try_harder = true;
+				goto start_again;
+			}
+			goto fini;
+		}
+		atomic_set(&r_srp->rq_st, SG_RS_BUSY);
 		xa_lock_irqsave(xafp, iflags);
 		xal.max = atomic_inc_return(&sfp->req_cnt);
-		res = __xa_alloc(xafp, &n_idx, rp, xal, GFP_KERNEL);
+		res = __xa_alloc(xafp, &n_idx, r_srp, xal, GFP_KERNEL);
 		xa_unlock_irqrestore(xafp, iflags);
 		if (res < 0) {
-			pr_warn("%s: don't expect xa_alloc() to fail, errno=%d\n",
-				__func__,  -res);
-			return NULL;
+			SG_LOG(1, sfp, "%s: xa_alloc() failed, errno=%d\n",
+			       __func__,  -res);
+			sg_remove_sgat(r_srp);
+			kfree(r_srp);
+			r_srp = ERR_PTR(-EPROTOTYPE);
+			goto fini;
 		}
-		rp->rq_idx = n_idx;
-	}
-	return rp;
+		idx = n_idx;
+		r_srp->rq_idx = idx;
+		r_srp->parentfp = sfp;
+		SG_LOG(4, sfp, "%s: mk_new_srp=0x%pK ++\n", __func__, r_srp);
+	}
+	r_srp->frq_bm[0] = cwrp->frq_bm[0];	/* assumes <= 32 req flags */
+	r_srp->sgat_h.dlen = dxfr_len;/* must be <= r_srp->sgat_h.buflen */
+	r_srp->cmd_opcode = 0xff;  /* set invalid opcode (VS), 0x0 is TUR */
+fini:
+	if (IS_ERR(r_srp))
+		SG_LOG(1, sfp, "%s: err=%ld\n", __func__, PTR_ERR(r_srp));
+	if (!IS_ERR(r_srp))
+		SG_LOG(4, sfp, "%s: %s r_srp=0x%pK\n", __func__, cp, r_srp);
+	return r_srp;
 }
 
+/*
+ * Moves a completed sg_request object to the free list and sets it to
+ * SG_RS_INACTIVE which makes it available for re-use. Requests with no data
+ * associated are appended to the tail of the free list while other requests
+ * are prepended to the head of the free list.
+ */
 static void
 sg_deact_request(struct sg_fd *sfp, struct sg_request *srp)
 {
@@ -2678,34 +3051,43 @@ sg_deact_request(struct sg_fd *sfp, struct sg_request *srp)
 
 	if (WARN_ON(!sfp || !srp))
 		return;
+	atomic_set(&srp->rq_st, SG_RS_INACTIVE);
 	xa_lock_irqsave(&sfp->srp_arr, iflags);
 	__xa_set_mark(&sfp->srp_arr, srp->rq_idx, SG_XA_RQ_INACTIVE);
+	__xa_clear_mark(&sfp->srp_arr, srp->rq_idx, SG_XA_RQ_AWAIT);
 	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
-	atomic_set(&srp->rq_st, SG_RS_INACTIVE);
 }
 
+/* Returns pointer to sg_fd object or negated errno twisted by ERR_PTR */
 static struct sg_fd *
 sg_add_sfp(struct sg_device *sdp)
 {
+	bool reduced = false;
 	int rbuf_len, res;
 	u32 idx;
+	long err;
 	unsigned long iflags;
 	struct sg_fd *sfp;
+	struct sg_request *srp = NULL;
+	struct xarray *xadp = &sdp->sfp_arr;
+	struct xarray *xafp;
 	struct xa_limit xal;
 
 	sfp = kzalloc(sizeof(*sfp), GFP_ATOMIC | __GFP_NOWARN);
 	if (!sfp)
 		return ERR_PTR(-ENOMEM);
-
 	init_waitqueue_head(&sfp->read_wait);
 	xa_init_flags(&sfp->srp_arr, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
+	xafp = &sfp->srp_arr;
 	kref_init(&sfp->f_ref);
 	mutex_init(&sfp->f_mutex);
 	sfp->timeout = SG_DEFAULT_TIMEOUT;
 	sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER;
-	sfp->force_packid = SG_DEF_FORCE_PACK_ID;
-	sfp->cmd_q = SG_DEF_COMMAND_Q;
-	sfp->keep_orphan = SG_DEF_KEEP_ORPHAN;
+	/* other bits in sfp->ffd_bm[1] cleared by kzalloc() above */
+	__assign_bit(SG_FFD_FORCE_PACKID, sfp->ffd_bm, SG_DEF_FORCE_PACK_ID);
+	__assign_bit(SG_FFD_CMD_Q, sfp->ffd_bm, SG_DEF_COMMAND_Q);
+	__assign_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm, SG_DEF_KEEP_ORPHAN);
+	__assign_bit(SG_FFD_Q_AT_TAIL, sfp->ffd_bm, SG_DEFAULT_Q_AT);
 	/*
 	 * SG_SCATTER_SZ initializes scatter_elem_sz but different value may
 	 * be given as driver/module parameter (e.g. 'scatter_elem_sz=8192').
@@ -2719,28 +3101,64 @@ sg_add_sfp(struct sg_device *sdp)
 	atomic_set(&sfp->waiting, 0);
 	atomic_set(&sfp->req_cnt, 0);
 
-	if (SG_IS_DETACHING(sdp)) {
+	if (unlikely(SG_IS_DETACHING(sdp))) {
+		SG_LOG(1, sfp, "%s: detaching\n", __func__);
 		kfree(sfp);
 		return ERR_PTR(-ENODEV);
 	}
-	SG_LOG(3, sfp, "%s: sfp=0x%pK\n", __func__, sfp);
 	if (unlikely(sg_big_buff != def_reserved_size))
 		sg_big_buff = def_reserved_size;
 
 	rbuf_len = min_t(int, sg_big_buff, sdp->max_sgat_sz);
-	if (rbuf_len > 0)
-		sg_build_reserve(sfp, rbuf_len);
-
-	xa_lock_irqsave(&sdp->sfp_arr, iflags);
+	if (rbuf_len > 0) {
+		struct xa_limit xalrq = { .max = 0, .min = 0 };
+
+		srp = sg_build_reserve(sfp, rbuf_len);
+		if (IS_ERR(srp)) {
+			err = PTR_ERR(srp);
+			SG_LOG(1, sfp, "%s: build reserve err=%ld\n", __func__,
+			       -err);
+			kfree(sfp);
+			return ERR_PTR(err);
+		}
+		if (srp->sgat_h.buflen < rbuf_len) {
+			reduced = true;
+			SG_LOG(2, sfp,
+			       "%s: reserve reduced from %d to buflen=%d\n",
+			       __func__, rbuf_len, srp->sgat_h.buflen);
+		}
+		xa_lock_irqsave(xafp, iflags);
+		xalrq.max = atomic_inc_return(&sfp->req_cnt);
+		res = __xa_alloc(xafp, &idx, srp, xalrq, GFP_ATOMIC);
+		xa_unlock_irqrestore(xafp, iflags);
+		if (res < 0) {
+			SG_LOG(1, sfp, "%s: xa_alloc(srp) bad, errno=%d\n",
+			       __func__,  -res);
+			sg_remove_sgat(srp);
+			kfree(srp);
+			kfree(sfp);
+			return ERR_PTR(-EPROTOTYPE);
+		}
+		srp->rq_idx = idx;
+		srp->parentfp = sfp;
+		sg_rq_state_chg(srp, 0, SG_RS_INACTIVE, true, __func__);
+	}
+	if (!reduced) {
+		SG_LOG(4, sfp, "%s: built reserve buflen=%d\n", __func__,
+		       rbuf_len);
+	}
+	xa_lock_irqsave(xadp, iflags);
 	xal.min = 0;
 	xal.max = atomic_read(&sdp->open_cnt);
-	res = __xa_alloc(&sdp->sfp_arr, &idx, sfp, xal, GFP_KERNEL);
-	xa_unlock_irqrestore(&sdp->sfp_arr, iflags);
+	res = __xa_alloc(xadp, &idx, sfp, xal, GFP_KERNEL);
+	xa_unlock_irqrestore(xadp, iflags);
 	if (res < 0) {
 		pr_warn("%s: xa_alloc(sdp) bad, o_count=%d, errno=%d\n",
 			__func__, xal.max, -res);
-		if (rbuf_len > 0)
-			sg_remove_sgat(sfp, &sfp->reserve);
+		if (srp) {
+			sg_remove_sgat(srp);
+			kfree(srp);
+		}
 		kfree(sfp);
 		return ERR_PTR(res);
 	}
@@ -2771,12 +3189,14 @@ sg_remove_sfp_usercontext(struct work_struct *work)
 	struct sg_request *srp;
 	struct sg_request *e_srp;
 	struct xarray *xafp = &sfp->srp_arr;
+	struct xarray *xadp;
 
 	if (!sfp) {
 		pr_warn("sg: %s: sfp is NULL\n", __func__);
 		return;
 	}
 	sdp = sfp->parentdp;
+	xadp = &sdp->sfp_arr;
 
 	/* Cleanup any responses which were never read(). */
 	xa_for_each(xafp, idx, srp) {
@@ -2784,24 +3204,20 @@ sg_remove_sfp_usercontext(struct work_struct *work)
 			continue;
 		if (!xa_get_mark(xafp, srp->rq_idx, SG_XA_RQ_INACTIVE))
 			sg_finish_scsi_blk_rq(srp);
+		sg_remove_sgat(srp);
 		xa_lock_irqsave(xafp, iflags);
 		e_srp = __xa_erase(xafp, srp->rq_idx);
 		xa_unlock_irqrestore(xafp, iflags);
 		if (srp != e_srp)
 			SG_LOG(1, sfp, "%s: xa_erase() return unexpected\n",
 			       __func__);
+		SG_LOG(6, sfp, "%s: kfree: srp=%pK --\n", __func__, srp);
 		kfree(srp);
 	}
 	xa_destroy(xafp);
-	if (sfp->reserve.buflen > 0) {
-		SG_LOG(6, sfp, "%s:    buflen=%d, num_sgat=%d\n", __func__,
-		       (int)sfp->reserve.buflen, (int)sfp->reserve.num_sgat);
-		sg_remove_sgat(sfp, &sfp->reserve);
-	}
-
-	xa_lock_irqsave(&sdp->sfp_arr, iflags);
-	e_sfp = __xa_erase(&sdp->sfp_arr, sfp->idx);
-	xa_unlock_irqrestore(&sdp->sfp_arr, iflags);
+	xa_lock_irqsave(xadp, iflags);
+	e_sfp = __xa_erase(xadp, sfp->idx);
+	xa_unlock_irqrestore(xadp, iflags);
 	if (unlikely(sfp != e_sfp))
 		SG_LOG(1, sfp, "%s: xa_erase() return unexpected\n",
 		       __func__);
@@ -3051,6 +3467,7 @@ sg_proc_seq_show_devhdr(struct seq_file *s, void *v)
 struct sg_proc_deviter {
 	loff_t	index;
 	size_t	max;
+	int fd_index;
 };
 
 static void *
@@ -3134,11 +3551,10 @@ sg_proc_seq_show_devstrs(struct seq_file *s, void *v)
 static void
 sg_proc_debug_helper(struct seq_file *s, struct sg_device *sdp)
 {
-	int k, new_interface, blen, usg;
+	int k;
 	unsigned long idx, idx2;
 	struct sg_request *srp;
 	struct sg_fd *fp;
-	const struct sg_io_hdr *hp;
 	const char * cp;
 	unsigned int ms;
 
@@ -3149,51 +3565,53 @@ sg_proc_debug_helper(struct seq_file *s, struct sg_device *sdp)
 		k++;
 		seq_printf(s, "   FD(%d): timeout=%dms buflen=%d (res)sgat=%d low_dma=%d idx=%lu\n",
 			   k, jiffies_to_msecs(fp->timeout),
-			   fp->reserve.buflen, (int)fp->reserve.num_sgat,
+			   fp->rsv_srp->sgat_h.buflen,
+			   (int)fp->rsv_srp->sgat_h.num_sgat,
 			   (int)sdp->device->host->unchecked_isa_dma, idx);
 		seq_printf(s, "   cmd_q=%d f_packid=%d k_orphan=%d closed=0\n",
-			   (int) fp->cmd_q, (int) fp->force_packid,
-			   (int) fp->keep_orphan);
+			   (int)test_bit(SG_FFD_CMD_Q, fp->ffd_bm),
+			   (int)test_bit(SG_FFD_FORCE_PACKID, fp->ffd_bm),
+			   (int)test_bit(SG_FFD_KEEP_ORPHAN, fp->ffd_bm));
 		seq_printf(s, "   submitted=%d waiting=%d\n",
 			   atomic_read(&fp->submitted),
 			   atomic_read(&fp->waiting));
 		xa_for_each(&fp->srp_arr, idx2, srp) {
+			const struct sg_slice_hdr3 *sh3p = &srp->s_hdr3;
+			bool is_v3 = (sh3p->interface_id != '\0');
+			enum sg_rq_state rq_st = atomic_read(&srp->rq_st);
+
 			if (!srp)
 				continue;
-			hp = &srp->header;
-			new_interface = (hp->interface_id == '\0') ? 0 : 1;
-			if (srp->res_used) {
-				if (new_interface &&
-				    (SG_FLAG_MMAP_IO & hp->flags))
+			if (srp->parentfp->rsv_srp == srp) {
+				if (is_v3 && (SG_FLAG_MMAP_IO & sh3p->flags))
 					cp = "     mmap>> ";
 				else
 					cp = "     rb>> ";
 			} else {
-				if (SG_INFO_DIRECT_IO_MASK & hp->info)
+				if (SG_INFO_DIRECT_IO_MASK & srp->rq_info)
 					cp = "     dio>> ";
 				else
 					cp = "     ";
 			}
 			seq_puts(s, cp);
-			blen = srp->data.buflen;
-			usg = srp->data.num_sgat;
-			seq_puts(s, srp->done ?
-				 ((1 == srp->done) ?  "rcv:" : "fin:")
-				  : "act:");
-			seq_printf(s, " id=%d blen=%d",
-				   srp->header.pack_id, blen);
-			if (srp->done)
-				seq_printf(s, " dur=%d", hp->duration);
-			else {
-				ms = jiffies_to_msecs(jiffies);
-				seq_printf(s, " t_o/elap=%d/%d",
-					(new_interface ? hp->timeout :
-						  jiffies_to_msecs(fp->timeout)),
-					(ms > hp->duration ? ms - hp->duration : 0));
+			seq_puts(s, sg_rq_st_str(rq_st, false));
+			seq_printf(s, ": id=%d len/blen=%d/%d",
+				   srp->pack_id, srp->sgat_h.dlen,
+				   srp->sgat_h.buflen);
+			if (rq_st == SG_RS_AWAIT_RCV ||
+			    rq_st == SG_RS_RCV_DONE) {
+				seq_printf(s, " dur=%d", srp->duration);
+				goto fin_line;
 			}
-			seq_printf(s, "ms sgat=%d op=0x%02x dummy: %s\n", usg,
-				   (int)srp->data.cmd_opcode,
-				   sg_rq_st_str(SG_RS_INACTIVE, false));
+			ms = jiffies_to_msecs(jiffies);
+			seq_printf(s, " t_o/elap=%d/%d",
+				   (is_v3 ? sh3p->timeout :
+					    jiffies_to_msecs(fp->timeout)),
+				   (ms > srp->duration ?  ms - srp->duration :
+							  0));
+fin_line:
+			seq_printf(s, "ms sgat=%d op=0x%02x\n",
+				   srp->sgat_h.num_sgat, (int)srp->cmd_opcode);
 		}
 		if (xa_empty(&fp->srp_arr))
 			seq_puts(s, "     No requests active\n");
-- 
2.25.1