All of lore.kernel.org
 help / color / mirror / Atom feed
From: Douglas Gilbert <dgilbert@interlog.com>
To: linux-scsi@vger.kernel.org
Cc: martin.petersen@oracle.com, jejb@linux.vnet.ibm.com,
	hare@suse.de, kbuild test robot <lkp@intel.com>
Subject: [PATCH v17 25/45] sg: replace rq array with xarray
Date: Wed,  7 Apr 2021 21:45:11 -0400	[thread overview]
Message-ID: <20210408014531.248890-26-dgilbert@interlog.com> (raw)
In-Reply-To: <20210408014531.248890-1-dgilbert@interlog.com>

Remove the fixed-size array of 16 request elements per file
descriptor and replace it with the xarray added in the previous
patch. All sg_request objects are now kept, available for
re-use, until their owning file descriptor is closed. The
sg_request deletions are in sg_remove_sfp_usercontext().
Each active sg_request object has an associated block
request and a scsi_request object but they have different
lifetimes. The block request and the scsi_request object
are released much earlier; their lifetime is the same as it
was in the v3 sg driver. The lifetime of the bio is also the
same (but is stretched in a later patch).

Collect various flags into bit maps: one for requests
(SG_FRQ_*) and the other for file descriptors (SG_FFD_*).
They join a per sg_device bit map (SG_FDEV_*) added in an
earlier patch.

Prior to a new sg_request object being (re-)built, the information
that will be placed in it is carried in a new struct sg_comm_wr_t
object.

Since the above changes touch almost every function and low-level
structure, this patch is big.

Reviewed-by: Hannes Reinecke <hare@suse.de>
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Douglas Gilbert <dgilbert@interlog.com>
---
 drivers/scsi/sg.c | 1544 ++++++++++++++++++++++++++++-----------------
 1 file changed, 981 insertions(+), 563 deletions(-)

diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 273861374de7..99cfa265a611 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -142,36 +142,51 @@ static struct class_interface sg_interface = {
 	.remove_dev     = sg_remove_device,
 };
 
+/* Subset of sg_io_hdr found in <scsi/sg.h>, has only [i] and [i->o] fields */
+struct sg_slice_hdr3 {
+	int interface_id;
+	int dxfer_direction;
+	u8 cmd_len;
+	u8 mx_sb_len;
+	u16 iovec_count;
+	unsigned int dxfer_len;
+	void __user *dxferp;
+	u8 __user *cmdp;
+	void __user *sbp;
+	unsigned int timeout;
+	unsigned int flags;
+	int pack_id;
+	void __user *usr_ptr;
+};
+
 struct sg_scatter_hold {     /* holding area for scsi scatter gather info */
 	struct page **pages;	/* num_sgat element array of struct page* */
 	int buflen;		/* capacity in bytes (dlen<=buflen) */
 	int dlen;		/* current valid data length of this req */
 	u16 page_order;		/* byte_len = (page_size*(2**page_order)) */
 	u16 num_sgat;		/* actual number of scatter-gather segments */
-	unsigned int sglist_len; /* size of malloc'd scatter-gather list ++ */
-	char dio_in_use;	/* 0->indirect IO (or mmap), 1->dio */
-	u8 cmd_opcode;		/* first byte of command */
 };
 
 struct sg_device;		/* forward declarations */
 struct sg_fd;
 
-struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
-	struct sg_scatter_hold data;	/* hold buffer, perhaps scatter list */
-	struct sg_io_hdr header;  /* scsi command+info, see <scsi/sg.h> */
+struct sg_request {	/* active SCSI command or inactive request */
+	struct sg_scatter_hold sgat_h;	/* hold buffer, perhaps scatter list */
+	struct sg_slice_hdr3 s_hdr3;  /* subset of sg_io_hdr */
 	u8 sense_b[SCSI_SENSE_BUFFERSIZE];
 	u32 duration;		/* cmd duration in milliseconds */
+	u32 rq_flags;		/* hold user supplied flags */
 	u32 rq_idx;		/* my index within parent's srp_arr */
-	char res_used;		/* 1 -> using reserve buffer, 0 -> not ... */
-	char orphan;		/* 1 -> drop on sight, 0 -> normal */
+	u32 rq_info;		/* info supplied by v3 and v4 interfaces */
 	u32 rq_result;		/* packed scsi request result from LLD */
-	char sg_io_owned;	/* 1 -> packet belongs to SG_IO */
-	/* done protected by rq_list_lock */
-	char done;		/* 0->before bh, 1->before read, 2->read */
+	int in_resid;		/* requested-actual byte count on data-in */
+	int pack_id;		/* user provided packet identifier field */
+	int sense_len;		/* actual sense buffer length (data-in) */
 	atomic_t rq_st;		/* request state, holds a enum sg_rq_state */
+	u8 cmd_opcode;		/* first byte of SCSI cdb */
 	u64 start_ns;		/* starting point of command duration calc */
-	unsigned long frq_bm[1];        /* see SG_FRQ_* defines above */
-	struct sg_fd *parentfp; /* pointer to owning fd, even when on fl */
+	unsigned long frq_bm[1];	/* see SG_FRQ_* defines above */
+	struct sg_fd *parentfp;	/* pointer to owning fd, even when on fl */
 	struct request *rq;	/* released in sg_rq_end_io(), bio kept */
 	struct bio *bio;	/* kept until this req -->SG_RS_INACTIVE */
 	struct execute_work ew_orph;	/* harvest orphan request */
@@ -180,7 +195,7 @@ struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
 struct sg_fd {		/* holds the state of a file descriptor */
 	struct sg_device *parentdp;	/* owning device */
 	wait_queue_head_t read_wait;	/* queue read until command done */
-	struct mutex f_mutex;	/* protect against changes in this fd */
+	struct mutex f_mutex;	/* serialize ioctls on this fd */
 	int timeout;		/* defaults to SG_DEFAULT_TIMEOUT      */
 	int timeout_user;	/* defaults to SG_DEFAULT_TIMEOUT_USER */
 	u32 idx;		/* my index within parent's sfp_arr */
@@ -188,15 +203,12 @@ struct sg_fd {		/* holds the state of a file descriptor */
 	atomic_t waiting;	/* number of requests awaiting receive */
 	atomic_t req_cnt;	/* number of requests */
 	int sgat_elem_sz;	/* initialized to scatter_elem_sz */
-	struct sg_scatter_hold reserve;	/* buffer for this file descriptor */
-	char force_packid;	/* 1 -> pack_id input to read(), 0 -> ignored */
-	char cmd_q;		/* 1 -> allow command queuing, 0 -> don't */
+	unsigned long ffd_bm[1];	/* see SG_FFD_* defines above */
+	pid_t tid;		/* thread id when opened */
 	u8 next_cmd_len;	/* 0: automatic, >0: use on next write() */
-	char keep_orphan;	/* 0 -> drop orphan (def), 1 -> keep for read() */
-	char mmap_called;	/* 0 -> mmap() never called on this fd */
-	char res_in_use;	/* 1 -> 'reserve' array in use */
-	struct fasync_struct *async_qp;	/* used by asynchronous notification */
-	struct xarray srp_arr;
+	struct sg_request *rsv_srp;/* one reserve request per fd */
+	struct fasync_struct *async_qp; /* used by asynchronous notification */
+	struct xarray srp_arr;	/* xarray of sg_request object pointers */
 	struct kref f_ref;
 	struct execute_work ew_fd;  /* harvest all fd resources and lists */
 };
@@ -219,8 +231,8 @@ struct sg_device { /* holds the state of each scsi generic device */
 
 struct sg_comm_wr_t {  /* arguments to sg_common_write() */
 	int timeout;
-	int blocking;
-	struct sg_request *srp;
+	unsigned long frq_bm[1];	/* see SG_FRQ_* defines above */
+	struct sg_io_hdr *h3p;
 	u8 *cmnd;
 };
 
@@ -228,31 +240,32 @@ struct sg_comm_wr_t {  /* arguments to sg_common_write() */
 static void sg_rq_end_io(struct request *rq, blk_status_t status);
 /* Declarations of other static functions used before they are defined */
 static int sg_proc_init(void);
-static int sg_start_req(struct sg_request *srp, u8 *cmd);
+static int sg_start_req(struct sg_request *srp, u8 *cmd, int cmd_len,
+			int dxfer_dir);
 static void sg_finish_scsi_blk_rq(struct sg_request *srp);
-static int sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp,
-		      int minlen);
-static ssize_t sg_submit(struct sg_fd *sfp, struct file *filp,
-			 const char __user *buf, size_t count, bool blocking,
-			 bool read_only, bool sg_io_owned,
-			 struct sg_request **o_srp);
-static int sg_common_write(struct sg_fd *sfp, struct sg_comm_wr_t *cwrp);
+static int sg_mk_sgat(struct sg_request *srp, struct sg_fd *sfp, int minlen);
+static int sg_submit(struct file *filp, struct sg_fd *sfp,
+		     struct sg_io_hdr *hp, bool sync,
+		     struct sg_request **o_srp);
+static struct sg_request *sg_common_write(struct sg_fd *sfp,
+					  struct sg_comm_wr_t *cwrp);
 static int sg_read_append(struct sg_request *srp, void __user *outp,
 			  int num_xfer);
-static void sg_remove_sgat(struct sg_fd *sfp, struct sg_scatter_hold *schp);
-static void sg_build_reserve(struct sg_fd *sfp, int req_size);
-static void sg_link_reserve(struct sg_fd *sfp, struct sg_request *srp,
-			    int size);
-static void sg_unlink_reserve(struct sg_fd *sfp, struct sg_request *srp);
+static void sg_remove_sgat(struct sg_request *srp);
 static struct sg_fd *sg_add_sfp(struct sg_device *sdp);
 static void sg_remove_sfp(struct kref *);
 static struct sg_request *sg_find_srp_by_id(struct sg_fd *sfp, int pack_id);
-static struct sg_request *sg_setup_req(struct sg_fd *sfp);
+static struct sg_request *sg_setup_req(struct sg_fd *sfp, int dxfr_len,
+				       struct sg_comm_wr_t *cwrp);
 static void sg_deact_request(struct sg_fd *sfp, struct sg_request *srp);
 static struct sg_device *sg_get_dev(int dev);
 static void sg_device_destroy(struct kref *kref);
+static struct sg_request *sg_mk_srp_sgat(struct sg_fd *sfp, bool first,
+					 int db_len);
 static const char *sg_rq_st_str(enum sg_rq_state rq_st, bool long_str);
 
+#define SG_WRITE_COUNT_LIMIT (32 * 1024 * 1024)
+
 #define SZ_SG_HEADER ((int)sizeof(struct sg_header))	/* v1 and v2 header */
 #define SZ_SG_IO_HDR ((int)sizeof(struct sg_io_hdr))	/* v3 header */
 #define SZ_SG_REQ_INFO ((int)sizeof(struct sg_req_info))
@@ -518,6 +531,7 @@ sg_release(struct inode *inode, struct file *filp)
 static ssize_t
 sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 {
+	bool get_v3_hdr;
 	int mxsize, cmd_size, input_size, res;
 	u8 opcode;
 	struct sg_device *sdp;
@@ -540,36 +554,61 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 	res = sg_allow_if_err_recovery(sdp, !!(filp->f_flags & O_NONBLOCK));
 	if (res)
 		return res;
-
-	if (count < SZ_SG_HEADER)
+	if (count < SZ_SG_HEADER || count > SG_WRITE_COUNT_LIMIT)
 		return -EIO;
-	if (copy_from_user(ohp, p, SZ_SG_HEADER))
-		return -EFAULT;
-	if (ohp->reply_len < 0) {	/* assume this is v3 */
-		struct sg_io_hdr *reinter_2p = (struct sg_io_hdr *)ohp;
+#ifdef CONFIG_COMPAT
+	if (in_compat_syscall())
+		get_v3_hdr = (count == sizeof(struct compat_sg_io_hdr));
+	else
+		get_v3_hdr = (count == sizeof(struct sg_io_hdr));
+#else
+	get_v3_hdr = (count == sizeof(struct sg_io_hdr));
+#endif
+	if (get_v3_hdr) {
+		if (get_sg_io_hdr(h3p, p))
+			return -EFAULT;
+	} else {
+		if (copy_from_user(ohp, p, SZ_SG_HEADER))
+			return -EFAULT;
+		if (ohp->reply_len < 0) {	/* not v2, may be v3 */
+			bool lt = false;
 
-		if (count < SZ_SG_IO_HDR)
-			return -EIO;
-		if (reinter_2p->interface_id != 'S') {
+#ifdef CONFIG_COMPAT
+			if (in_compat_syscall())
+				lt = (count < sizeof(struct compat_sg_io_hdr));
+			else
+				lt = (count < sizeof(struct sg_io_hdr));
+#else
+			lt = (count < sizeof(struct sg_io_hdr));
+#endif
+			if (lt)
+				return -EIO;
+			get_v3_hdr = true;
+			if (get_sg_io_hdr(h3p, p))
+				return -EFAULT;
+		}
+	}
+	if (get_v3_hdr) {
+		/* v3 dxfer_direction_s are all negative values by design */
+		if (h3p->dxfer_direction >= 0) {	/* so it is not v3 */
+			memcpy(ohp, h3p, count);
+			goto to_v2;
+		}
+		if (h3p->interface_id != 'S') {
 			pr_info_once("sg: %s: v3 interface only here\n",
 				     __func__);
 			return -EPERM;
 		}
-		return sg_submit(sfp, filp, p, count,
-				 !(filp->f_flags & O_NONBLOCK), false, false,
-				 NULL);
+		res = sg_submit(filp, sfp, h3p, false, NULL);
+		return res < 0 ? res : (int)count;
 	}
+to_v2:
+	/* v1 and v2 interfaces processed below this point */
 	if (count < (SZ_SG_HEADER + 6))
-		return -EIO;	/* The minimum scsi command length is 6 bytes. */
-
+		return -EIO;    /* minimum scsi command length is 6 bytes */
 	p += SZ_SG_HEADER;
 	if (get_user(opcode, p))
 		return -EFAULT;
-
-	if (!(srp = sg_setup_req(sfp))) {
-		SG_LOG(1, sfp, "%s: queue full\n", __func__);
-		return -EDOM;
-	}
 	mutex_lock(&sfp->f_mutex);
 	if (sfp->next_cmd_len > 0) {
 		cmd_size = sfp->next_cmd_len;
@@ -586,12 +625,10 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 	mxsize = max_t(int, input_size, ohp->reply_len);
 	mxsize -= SZ_SG_HEADER;
 	input_size -= SZ_SG_HEADER;
-	if (input_size < 0) {
-		sg_deact_request(sfp, srp);
-		return -EIO;	/* User did not pass enough bytes for this command. */
-	}
-	h3p = &srp->header;
-	h3p->interface_id = '\0';  /* indicator of old interface tunnelled */
+	if (input_size < 0)
+		return -EIO; /* Insufficient bytes passed for this command. */
+	memset(h3p, 0, sizeof(*h3p));
+	h3p->interface_id = '\0';/* indicate v1 or v2 interface (tunnelled) */
 	h3p->cmd_len = (u8)cmd_size;
 	h3p->iovec_count = 0;
 	h3p->mx_sb_len = 0;
@@ -612,10 +649,9 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 	h3p->flags = input_size;	/* structure abuse ... */
 	h3p->pack_id = ohp->pack_id;
 	h3p->usr_ptr = NULL;
-	if (copy_from_user(cmnd, p, cmd_size)) {
-		sg_deact_request(sfp, srp);
+	cmnd[0] = opcode;
+	if (copy_from_user(cmnd + 1, p + 1, cmd_size - 1))
 		return -EFAULT;
-	}
 	/*
 	 * SG_DXFER_TO_FROM_DEV is functionally equivalent to SG_DXFER_FROM_DEV,
 	 * but it is possible that the app intended SG_DXFER_TO_DEV, because
@@ -629,23 +665,23 @@ sg_write(struct file *filp, const char __user *p, size_t count, loff_t *ppos)
 			 __func__, ohp->reply_len - (int)SZ_SG_HEADER,
 			 input_size, (unsigned int)cmnd[0], current->comm);
 	}
+	cwr.frq_bm[0] = 0;	/* initial state clear for all req flags */
+	cwr.h3p = h3p;
 	cwr.timeout = sfp->timeout;
-	cwr.blocking = !(filp->f_flags & O_NONBLOCK);
-	cwr.srp = srp;
 	cwr.cmnd = cmnd;
-	res = sg_common_write(sfp, &cwr);
-	return (res < 0) ? res : count;
+	srp = sg_common_write(sfp, &cwr);
+	return (IS_ERR(srp)) ? PTR_ERR(srp) : (int)count;
 }
 
 static inline int
 sg_chk_mmap(struct sg_fd *sfp, int rq_flags, int len)
 {
-	if (len > sfp->reserve.buflen)
-		return -ENOMEM;	/* MMAP_IO size must fit in reserve buffer */
+	if (!xa_empty(&sfp->srp_arr))
+		return -EBUSY;  /* already active requests on fd */
+	if (len > sfp->rsv_srp->sgat_h.buflen)
+		return -ENOMEM; /* MMAP_IO size must fit in reserve */
 	if (rq_flags & SG_FLAG_DIRECT_IO)
-		return -EINVAL;	/* either MMAP_IO or DIRECT_IO (not both) */
-	if (sfp->res_in_use)
-		return -EBUSY;	/* reserve buffer already being used */
+		return -EINVAL; /* not both MMAP_IO and DIRECT_IO */
 	return 0;
 }
 
@@ -670,61 +706,40 @@ sg_fetch_cmnd(struct file *filp, struct sg_fd *sfp, const u8 __user *u_cdbp,
 	return 0;
 }
 
-static ssize_t
-sg_submit(struct sg_fd *sfp, struct file *filp, const char __user *buf,
-	  size_t count, bool blocking, bool read_only, bool sg_io_owned,
-	  struct sg_request **o_srp)
+static int
+sg_submit(struct file *filp, struct sg_fd *sfp, struct sg_io_hdr *hp,
+	  bool sync, struct sg_request **o_srp)
 {
-	int k, res, timeout;
+	int res, timeout;
+	unsigned long ul_timeout;
 	struct sg_request *srp;
-	struct sg_io_hdr *hp;
 	struct sg_comm_wr_t cwr;
 	u8 cmnd[SG_MAX_CDB_SIZE];
-	unsigned long ul_timeout;
 
-	if (count < SZ_SG_IO_HDR)
-		return -EINVAL;
-
-	sfp->cmd_q = 1;	/* when sg_io_hdr seen, set command queuing on */
-	if (!(srp = sg_setup_req(sfp))) {
-		SG_LOG(1, sfp, "%s: queue full\n", __func__);
-		return -EDOM;
-	}
-	srp->sg_io_owned = sg_io_owned;
-	hp = &srp->header;
-	/* get_sg_io_hdr() is defined in block/scsi_ioctl.c */
-	if (get_sg_io_hdr(hp, buf)) {
-		sg_deact_request(sfp, srp);
-		return -EFAULT;
-	}
-	if (hp->interface_id != 'S') {
-		sg_deact_request(sfp, srp);
-		return -ENOSYS;
-	}
+	/* now doing v3 blocking (sync) or non-blocking submission */
 	if (hp->flags & SG_FLAG_MMAP_IO) {
 		res = sg_chk_mmap(sfp, hp->flags, hp->dxfer_len);
-		if (res) {
-			sg_deact_request(sfp, srp);
+		if (res)
 			return res;
-		}
 	}
-	ul_timeout = msecs_to_jiffies(srp->header.timeout);
-	timeout = (ul_timeout < INT_MAX) ? ul_timeout : INT_MAX;
+	/* when v3 seen, allow cmd_q on this fd (def: no cmd_q) */
+	set_bit(SG_FFD_CMD_Q, sfp->ffd_bm);
+	ul_timeout = msecs_to_jiffies(hp->timeout);
+	timeout = min_t(unsigned long, ul_timeout, INT_MAX);
 	res = sg_fetch_cmnd(filp, sfp, hp->cmdp, hp->cmd_len, cmnd);
-	if (res) {
-		sg_deact_request(sfp, srp);
+	if (res)
 		return res;
-	}
+	cwr.frq_bm[0] = 0;
+	__assign_bit(SG_FRQ_SYNC_INVOC, cwr.frq_bm, (int)sync);
+	cwr.h3p = hp;
 	cwr.timeout = timeout;
-	cwr.blocking = blocking;
-	cwr.srp = srp;
 	cwr.cmnd = cmnd;
-	k = sg_common_write(sfp, &cwr);
-	if (k < 0)
-		return k;
+	srp = sg_common_write(sfp, &cwr);
+	if (IS_ERR(srp))
+		return PTR_ERR(srp);
 	if (o_srp)
-		*o_srp = cwr.srp;
-	return count;
+		*o_srp = srp;
+	return 0;
 }
 
 #if IS_ENABLED(SG_LOG_ACTIVE)
@@ -842,70 +857,69 @@ sg_rq_state_chg(struct sg_request *srp, enum sg_rq_state old_st,
  * sg_request object holding the request just issued or a negated errno
  * value twisted by ERR_PTR.
  */
-static int
+static struct sg_request *
 sg_common_write(struct sg_fd *sfp, struct sg_comm_wr_t *cwrp)
 {
 	bool at_head;
-	int k;
+	int res = 0;
+	int dxfr_len, dir, cmd_len;
+	int pack_id = SG_PACK_ID_WILDCARD;
+	u32 rq_flags;
 	struct sg_device *sdp = sfp->parentdp;
-	struct sg_request *srp = cwrp->srp;
-	struct sg_io_hdr *hp = &srp->header;
-
-	srp->data.cmd_opcode = cwrp->cmnd[0];	/* hold opcode of command */
-	hp->status = 0;
-	hp->masked_status = 0;
-	hp->msg_status = 0;
-	hp->info = 0;
-	hp->host_status = 0;
-	hp->driver_status = 0;
-	hp->resid = 0;
-	SG_LOG(4, sfp, "%s:  opcode=0x%02x, cmd_sz=%d\n", __func__,
-	       (int)cwrp->cmnd[0], hp->cmd_len);
-
-	if (hp->dxfer_len >= SZ_256M) {
-		sg_deact_request(sfp, srp);
-		return -EINVAL;
-	}
-
-	k = sg_start_req(srp, cwrp->cmnd);
-	if (k) {
-		SG_LOG(1, sfp, "%s: start_req err=%d\n", __func__, k);
-		sg_finish_scsi_blk_rq(srp);
-		sg_deact_request(sfp, srp);
-		return k;	/* probably out of space --> ENOMEM */
-	}
-	if (SG_IS_DETACHING(sdp))
+	struct sg_request *srp;
+	struct sg_io_hdr *hi_p;
+
+	hi_p = cwrp->h3p;
+	dir = hi_p->dxfer_direction;
+	dxfr_len = hi_p->dxfer_len;
+	rq_flags = hi_p->flags;
+	pack_id = hi_p->pack_id;
+	if (dxfr_len >= SZ_256M)
+		return ERR_PTR(-EINVAL);
+
+	srp = sg_setup_req(sfp, dxfr_len, cwrp);
+	if (IS_ERR(srp))
+		return srp;
+	srp->rq_flags = rq_flags;
+	srp->pack_id = pack_id;
+
+	cmd_len = hi_p->cmd_len;
+	memcpy(&srp->s_hdr3, hi_p, sizeof(srp->s_hdr3));
+	srp->cmd_opcode = cwrp->cmnd[0];/* hold opcode of command for debug */
+	SG_LOG(4, sfp, "%s: opcode=0x%02x, cdb_sz=%d, pack_id=%d\n", __func__,
+	       (int)cwrp->cmnd[0], cmd_len, pack_id);
+
+	res = sg_start_req(srp, cwrp->cmnd, cmd_len, dir);
+	if (res < 0)		/* probably out of space --> -ENOMEM */
 		goto err_out;
-
-	hp->duration = jiffies_to_msecs(jiffies);
-	if (hp->interface_id != '\0' &&	/* v3 (or later) interface */
-	    (SG_FLAG_Q_AT_TAIL & hp->flags))
-		at_head = false;
-	else
-		at_head = true;
-
-	if (!srp->sg_io_owned)
-		atomic_inc(&sfp->submitted);
+	if (unlikely(SG_IS_DETACHING(sdp))) {
+		res = -ENODEV;
+		goto err_out;
+	}
 	srp->rq->timeout = cwrp->timeout;
 	kref_get(&sfp->f_ref); /* sg_rq_end_io() does kref_put(). */
+	res = sg_rq_state_chg(srp, SG_RS_BUSY, SG_RS_INFLIGHT, false,
+			      __func__);
+	if (res)
+		goto err_out;
+	srp->start_ns = ktime_get_boottime_ns();
+	srp->duration = 0;
+
+	if (srp->s_hdr3.interface_id == '\0')
+		at_head = true; /* backward compatibility: v1+v2 interfaces */
+	else if (test_bit(SG_FFD_Q_AT_TAIL, sfp->ffd_bm))
+	/* cmd flags can override sfd setting */
+		at_head = !!(srp->rq_flags & SG_FLAG_Q_AT_HEAD);
+	else            /* this sfd is defaulting to head */
+		at_head = !(srp->rq_flags & SG_FLAG_Q_AT_TAIL);
+	if (!test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm))
+		atomic_inc(&sfp->submitted);
 	blk_execute_rq_nowait(sdp->disk, srp->rq, at_head, sg_rq_end_io);
-	return 0;
+	return srp;
 err_out:
-	if (srp->bio) {
-		scsi_req_free_cmd(scsi_req(srp->rq));
-		blk_put_request(srp->rq);
-		srp->rq = NULL;
-	}
 	sg_finish_scsi_blk_rq(srp);
 	sg_deact_request(sfp, srp);
-	return -ENODEV;
-}
-
-static inline int
-sg_rstate_chg(struct sg_request *srp, enum sg_rq_state old_st,
-	      enum sg_rq_state new_st)
-{
-	return sg_rq_state_chg(srp, old_st, new_st, false, __func__);
+	return ERR_PTR(res);
 }
 
 /*
@@ -936,21 +950,26 @@ static int
 sg_copy_sense(struct sg_request *srp)
 {
 	int sb_len_ret = 0;
-	struct sg_io_hdr *hp = &srp->header;
+	int scsi_stat;
 
 	/* If need be, copy the sense buffer to the user space */
-	if ((CHECK_CONDITION & hp->masked_status) ||
-	    (DRIVER_SENSE & hp->driver_status)) {
-		int sb_len = SCSI_SENSE_BUFFERSIZE;
-		void __user *up = hp->sbp;
-
-		sb_len = min_t(int, hp->mx_sb_len, sb_len);
-		/* Additional sense length field */
-		sb_len_ret = 8 + (int)srp->sense_b[7];
-		sb_len_ret = min_t(int, sb_len_ret, sb_len);
-		if (copy_to_user(up, srp->sense_b, sb_len_ret))
-			return -EFAULT;
-		hp->sb_len_wr = sb_len_ret;
+	scsi_stat = srp->rq_result & 0xff;
+	if ((scsi_stat & SAM_STAT_CHECK_CONDITION) ||
+	    (driver_byte(srp->rq_result) & DRIVER_SENSE)) {
+		int sb_len = min_t(int, SCSI_SENSE_BUFFERSIZE, srp->sense_len);
+		int mx_sb_len = srp->s_hdr3.mx_sb_len;
+		void __user *up = srp->s_hdr3.sbp;
+
+		if (up && mx_sb_len > 0) {
+			sb_len = min_t(int, mx_sb_len, sb_len);
+			/* Additional sense length field */
+			sb_len_ret = 8 + (int)srp->sense_b[7];
+			sb_len_ret = min_t(int, sb_len_ret, sb_len);
+			if (copy_to_user(up, srp->sense_b, sb_len_ret))
+				sb_len_ret = -EFAULT;
+		} else {
+			sb_len_ret = 0;
+		}
 	}
 	return sb_len_ret;
 }
@@ -959,12 +978,15 @@ static int
 sg_rec_state_v3(struct sg_fd *sfp, struct sg_request *srp)
 {
 	int sb_len_wr;
+	u32 rq_res = srp->rq_result;
 
 	sb_len_wr = sg_copy_sense(srp);
 	if (sb_len_wr < 0)
 		return sb_len_wr;
+	if (rq_res & SG_ML_RESULT_MSK)
+		srp->rq_info |= SG_INFO_CHECK;
 	if (unlikely(SG_IS_DETACHING(sfp->parentdp)))
-		return -ENODEV;
+		srp->rq_info |= SG_INFO_DEVICE_DETACHING;
 	return 0;
 }
 
@@ -972,8 +994,10 @@ static ssize_t
 sg_receive_v3(struct sg_fd *sfp, struct sg_request *srp, size_t count,
 	      void __user *p)
 {
-	int err = 0;
-	struct sg_io_hdr *hp = &srp->header;
+	int err, err2;
+	int rq_result = srp->rq_result;
+	struct sg_io_hdr hdr3;
+	struct sg_io_hdr *hp = &hdr3;
 
 	if (in_compat_syscall()) {
 		if (count < sizeof(struct compat_sg_io_hdr)) {
@@ -986,9 +1010,23 @@ sg_receive_v3(struct sg_fd *sfp, struct sg_request *srp, size_t count,
 	}
 	SG_LOG(3, sfp, "%s: srp=0x%pK\n", __func__, srp);
 	err = sg_rec_state_v3(sfp, srp);
-	if (hp->masked_status || hp->host_status || hp->driver_status)
-		hp->info |= SG_INFO_CHECK;
-	err = put_sg_io_hdr(hp, p);
+	memset(hp, 0, sizeof(*hp));
+	memcpy(hp, &srp->s_hdr3, sizeof(srp->s_hdr3));
+	hp->sb_len_wr = srp->sense_len;
+	hp->info = srp->rq_info;
+	hp->resid = srp->in_resid;
+	hp->duration = srp->duration;
+	hp->status = rq_result & 0xff;
+	hp->masked_status = status_byte(rq_result);
+	hp->msg_status = msg_byte(rq_result);
+	hp->host_status = host_byte(rq_result);
+	hp->driver_status = driver_byte(rq_result);
+	err2 = put_sg_io_hdr(hp, p);
+	err = err ? err : err2;
+	err2 = sg_rq_state_chg(srp, atomic_read(&srp->rq_st), SG_RS_RCV_DONE,
+			       false, __func__);
+	if (err2)
+		err = err ? err : err2;
 err_out:
 	sg_finish_scsi_blk_rq(srp);
 	sg_deact_request(sfp, srp);
@@ -1005,26 +1043,27 @@ sg_read_v1v2(void __user *buf, int count, struct sg_fd *sfp,
 	     struct sg_request *srp)
 {
 	int res = 0;
-	struct sg_io_hdr *sh3p = &srp->header;
+	u32 rq_result = srp->rq_result;
 	struct sg_header *h2p;
+	struct sg_slice_hdr3 *sh3p;
 	struct sg_header a_v2hdr;
 
 	h2p = &a_v2hdr;
 	memset(h2p, 0, SZ_SG_HEADER);
+	sh3p = &srp->s_hdr3;
 	h2p->reply_len = (int)sh3p->timeout;
 	h2p->pack_len = h2p->reply_len; /* old, strange behaviour */
 	h2p->pack_id = sh3p->pack_id;
-	h2p->twelve_byte = (srp->data.cmd_opcode >= 0xc0 &&
-			    sh3p->cmd_len == 12);
-	h2p->target_status = sh3p->masked_status;
-	h2p->host_status = sh3p->host_status;
-	h2p->driver_status = sh3p->driver_status;
-	if ((CHECK_CONDITION & h2p->target_status) ||
-	    (DRIVER_SENSE & sh3p->driver_status)) {
+	h2p->twelve_byte = (srp->cmd_opcode >= 0xc0 && sh3p->cmd_len == 12);
+	h2p->target_status = status_byte(rq_result);
+	h2p->host_status = host_byte(rq_result);
+	h2p->driver_status = driver_byte(rq_result);
+	if ((CHECK_CONDITION & status_byte(rq_result)) ||
+	    (DRIVER_SENSE & driver_byte(rq_result))) {
 		memcpy(h2p->sense_buffer, srp->sense_b,
 		       sizeof(h2p->sense_buffer));
 	}
-	switch (h2p->host_status) {
+	switch (host_byte(rq_result)) {
 	/*
 	 * This following setting of 'result' is for backward compatibility
 	 * and is best ignored by the user who should use target, host and
@@ -1048,7 +1087,7 @@ sg_read_v1v2(void __user *buf, int count, struct sg_fd *sfp,
 		h2p->result = EIO;
 		break;
 	case DID_ERROR:
-		h2p->result = (h2p->target_status == GOOD) ? 0 : EIO;
+		h2p->result = (status_byte(rq_result) == GOOD) ? 0 : EIO;
 		break;
 	default:
 		h2p->result = EIO;
@@ -1069,6 +1108,7 @@ sg_read_v1v2(void __user *buf, int count, struct sg_fd *sfp,
 	} else {
 		res = (h2p->result == 0) ? 0 : -EIO;
 	}
+	atomic_set(&srp->rq_st, SG_RS_RCV_DONE);
 	sg_finish_scsi_blk_rq(srp);
 	sg_deact_request(sfp, srp);
 	return res;
@@ -1112,13 +1152,13 @@ sg_read(struct file *filp, char __user *p, size_t count, loff_t *ppos)
 	hlen = could_be_v3 ? SZ_SG_IO_HDR : SZ_SG_HEADER;
 	h2p = (struct sg_header *)&a_sg_io_hdr;
 
-	if (sfp->force_packid && count >= hlen) {
+	if (test_bit(SG_FFD_FORCE_PACKID, sfp->ffd_bm) && (int)count >= hlen) {
 		/*
 		 * Even though this is a user space read() system call, this
 		 * code is cheating to fetch the pack_id.
 		 * Only need first three 32 bit ints to determine interface.
 		 */
-		if (unlikely(copy_from_user(h2p, p, 3 * sizeof(int))))
+		if (copy_from_user(h2p, p, 3 * sizeof(int)))
 			return -EFAULT;
 		if (h2p->reply_len < 0 && could_be_v3) {
 			struct sg_io_hdr *v3_hdr = (struct sg_io_hdr *)h2p;
@@ -1139,20 +1179,20 @@ sg_read(struct file *filp, char __user *p, size_t count, loff_t *ppos)
 	}
 	srp = sg_find_srp_by_id(sfp, want_id);
 	if (!srp) {	/* nothing available so wait on packet to arrive or */
-		if (SG_IS_DETACHING(sdp))
+		if (unlikely(SG_IS_DETACHING(sdp)))
 			return -ENODEV;
 		if (non_block) /* O_NONBLOCK or v3::flags & SGV4_FLAG_IMMED */
 			return -EAGAIN;
 		ret = wait_event_interruptible(sfp->read_wait,
 					       sg_get_ready_srp(sfp, &srp,
 								want_id));
-		if (SG_IS_DETACHING(sdp))
+		if (unlikely(SG_IS_DETACHING(sdp)))
 			return -ENODEV;
 		if (ret)	/* -ERESTARTSYS as signal hit process */
 			return ret;
 		/* otherwise srp should be valid */
 	}
-	if (srp->header.interface_id == '\0')
+	if (srp->s_hdr3.interface_id == '\0')
 		ret = sg_read_v1v2(p, (int)count, sfp, srp);
 	else
 		ret = sg_receive_v3(sfp, srp, count, p);
@@ -1223,7 +1263,7 @@ sg_calc_rq_dur(const struct sg_request *srp)
 	return (diff > (s64)U32_MAX) ? 3999999999U : (u32)diff;
 }
 
-/* Return of U32_MAX means srp is inactive */
+/* Return of U32_MAX means srp is inactive state */
 static u32
 sg_get_dur(struct sg_request *srp, const enum sg_rq_state *sr_stp,
 	   bool *is_durp)
@@ -1254,34 +1294,63 @@ static void
 sg_fill_request_element(struct sg_fd *sfp, struct sg_request *srp,
 			struct sg_req_info *rip)
 {
-	unsigned int ms;
+	unsigned long iflags;
 
-	rip->req_state = srp->done + 1;
-	rip->problem = srp->header.masked_status &
-		       srp->header.host_status &
-		       srp->header.driver_status;
-	rip->duration = sg_get_dur(srp, NULL, NULL); /* dummy */
-	if (srp->done) {
-		rip->duration = srp->header.duration;
-	} else {
-		ms = jiffies_to_msecs(jiffies);
-		rip->duration = (ms > srp->header.duration) ?
-				(ms - srp->header.duration) : 0;
-	}
-	rip->orphan = srp->orphan;
-	rip->sg_io_owned = srp->sg_io_owned;
-	rip->pack_id = srp->header.pack_id;
-	rip->usr_ptr = srp->header.usr_ptr;
+	xa_lock_irqsave(&sfp->srp_arr, iflags);
+	rip->duration = sg_get_dur(srp, NULL, NULL);
+	if (rip->duration == U32_MAX)
+		rip->duration = 0;
+	rip->orphan = test_bit(SG_FRQ_IS_ORPHAN, srp->frq_bm);
+	rip->sg_io_owned = test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm);
+	rip->problem = !!(srp->rq_result & SG_ML_RESULT_MSK);
+	rip->pack_id = srp->pack_id;
+	rip->usr_ptr = srp->s_hdr3.usr_ptr;
+	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
+}
 
+static inline bool
+sg_rq_landed(struct sg_device *sdp, struct sg_request *srp)
+{
+	return atomic_read(&srp->rq_st) != SG_RS_INFLIGHT ||
+	       unlikely(SG_IS_DETACHING(sdp));
 }
 
+/*
+ * This is a blocking wait for a specific srp. When h4p is non-NULL, it is
+ * the blocking multiple request case
+ */
 static int
-srp_done(struct sg_fd *sfp, struct sg_request *srp)
+sg_wait_event_srp(struct file *filp, struct sg_fd *sfp, void __user *p,
+		  struct sg_request *srp)
 {
-	int ret;
+	int res;
+	enum sg_rq_state sr_st;
+	struct sg_device *sdp = sfp->parentdp;
 
-	ret = srp->done;
-	return ret;
+	SG_LOG(3, sfp, "%s: about to wait_event...()\n", __func__);
+	/* usually will be woken up by sg_rq_end_io() callback */
+	res = wait_event_interruptible(sfp->read_wait,
+				       sg_rq_landed(sdp, srp));
+	if (unlikely(res)) { /* -ERESTARTSYS because signal hit thread */
+		set_bit(SG_FRQ_IS_ORPHAN, srp->frq_bm);
+		/* orphans harvested when sfp->keep_orphan is false */
+		atomic_set(&srp->rq_st, SG_RS_INFLIGHT);
+		SG_LOG(1, sfp, "%s:  wait_event_interruptible gave %d\n",
+		       __func__, res);
+		return res;
+	}
+	if (unlikely(SG_IS_DETACHING(sdp))) {
+		atomic_set(&srp->rq_st, SG_RS_INACTIVE);
+		return -ENODEV;
+	}
+	sr_st = atomic_read(&srp->rq_st);
+	if (unlikely(sr_st != SG_RS_AWAIT_RCV))
+		return -EPROTO;         /* Logic error */
+	res = sg_rq_state_chg(srp, sr_st, SG_RS_BUSY, false, __func__);
+	if (unlikely(res))
+		return res;
+	res = sg_receive_v3(sfp, srp, SZ_SG_IO_HDR, p);
+	return (res < 0) ? res : 0;
 }
 
 /*
@@ -1292,42 +1361,119 @@ static int
 sg_ctl_sg_io(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 	     void __user *p)
 {
-	bool read_only = O_RDWR != (filp->f_flags & O_ACCMODE);
 	int res;
-	struct sg_request *srp;
+	struct sg_request *srp = NULL;
+	u8 hu8arr[SZ_SG_IO_HDR];
+	struct sg_io_hdr *h3p = (struct sg_io_hdr *)hu8arr;
 
+	SG_LOG(3, sfp, "%s:  SG_IO%s\n", __func__,
+	       ((filp->f_flags & O_NONBLOCK) ? " O_NONBLOCK ignored" : ""));
 	res = sg_allow_if_err_recovery(sdp, false);
 	if (res)
 		return res;
-	res = sg_submit(sfp, filp, p, SZ_SG_IO_HDR, true, read_only,
-			true, &srp);
-	if (res < 0)
+	if (get_sg_io_hdr(h3p, p))
+		return -EFAULT;
+	if (h3p->interface_id == 'S')
+		res = sg_submit(filp, sfp, h3p, true, &srp);
+	else
+		return -EPERM;
+	if (unlikely(res < 0))
+		return res;
+	if (!srp)	/* mrq case: already processed all responses */
 		return res;
-	res = wait_event_interruptible
-		(sfp->read_wait, (srp_done(sfp, srp) || SG_IS_DETACHING(sdp)));
-	if (SG_IS_DETACHING(sdp))
-		return -ENODEV;
-	if (srp->done) {
-		srp->done = 2;
-		res = sg_receive_v3(sfp, srp, SZ_SG_IO_HDR, p);
-		return (res < 0) ? res : 0;
-	}
-	srp->orphan = 1;
+	res = sg_wait_event_srp(filp, sfp, p, srp);
+	if (res)
+		SG_LOG(1, sfp, "%s: %s=0x%pK  state: %s\n", __func__,
+		       "unexpected srp", srp,
+		       sg_rq_st_str(atomic_read(&srp->rq_st), false));
 	return res;
 }
 
+/*
+ * First normalize want_rsv_sz to be >= sfp->sgat_elem_sz and
+ * <= max_segment_size. Exit if that is the same as old size; otherwise
+ * create a new candidate request of the new size. Then decide whether to
+ * re-use an existing free list request (least buflen >= required size) or
+ * use the new candidate. If new one used, leave old one but it is no longer
+ * the reserved request. Returns 0 on success, else a negated errno value.
+ */
 static int
 sg_set_reserved_sz(struct sg_fd *sfp, int want_rsv_sz)
 {
-	if (want_rsv_sz != sfp->reserve.buflen) {
-		if (sfp->mmap_called ||
-		    sfp->res_in_use) {
-			return -EBUSY;
+	bool use_new_srp = false;
+	int res = 0;
+	int new_sz, blen;
+	unsigned long idx, iflags;
+	struct sg_request *o_srp;       /* prior reserve sg_request */
+	struct sg_request *n_srp;       /* new sg_request, may be used */
+	struct sg_request *t_srp;       /* other fl entries */
+	struct sg_device *sdp = sfp->parentdp;
+	struct xarray *xafp = &sfp->srp_arr;
+
+	o_srp = sfp->rsv_srp;
+	if (!o_srp)
+		return -EPROTO;
+	new_sz = min_t(int, want_rsv_sz, sdp->max_sgat_sz);
+	new_sz = max_t(int, new_sz, sfp->sgat_elem_sz);
+	blen = o_srp->sgat_h.buflen;
+	SG_LOG(3, sfp, "%s: was=%d, ask=%d, new=%d (sgat_elem_sz=%d)\n",
+	       __func__, blen, want_rsv_sz, new_sz, sfp->sgat_elem_sz);
+	if (blen == new_sz)
+		return 0;
+	n_srp = sg_mk_srp_sgat(sfp, true /* can take time */, new_sz);
+	if (IS_ERR(n_srp))
+		return PTR_ERR(n_srp);
+	sg_rq_state_force(n_srp, SG_RS_INACTIVE);
+	/* new sg_request object, sized correctly is now available */
+try_again:
+	o_srp = sfp->rsv_srp;
+	if (!o_srp) {
+		res = -EPROTO;
+		goto fini;
+	}
+	if (SG_RS_ACTIVE(o_srp) ||
+	    test_bit(SG_FFD_MMAP_CALLED, sfp->ffd_bm)) {
+		res = -EBUSY;
+		goto fini;
+	}
+	use_new_srp = true;
+	xa_for_each(xafp, idx, t_srp) {
+		if (t_srp != o_srp && new_sz <= t_srp->sgat_h.buflen &&
+		    !SG_RS_ACTIVE(t_srp)) {
+			/* good candidate on free list, use */
+			use_new_srp = false;
+			sfp->rsv_srp = t_srp;
+			break;
 		}
-		sg_remove_sgat(sfp, &sfp->reserve);
-		sg_build_reserve(sfp, want_rsv_sz);
 	}
-	return 0;
+	if (use_new_srp) {
+		struct sg_request *cxc_srp;
+
+		xa_lock_irqsave(xafp, iflags);
+		n_srp->rq_idx = o_srp->rq_idx;
+		idx = o_srp->rq_idx;
+		cxc_srp = __xa_cmpxchg(xafp, idx, o_srp, n_srp, GFP_ATOMIC);
+		if (o_srp == cxc_srp) {
+			sfp->rsv_srp = n_srp;
+			sg_rq_state_force(n_srp, SG_RS_INACTIVE);
+			xa_unlock_irqrestore(xafp, iflags);
+			SG_LOG(6, sfp, "%s: new rsv srp=0x%pK ++\n", __func__,
+			       n_srp);
+			sg_remove_sgat(o_srp);
+			kfree(o_srp);
+		} else {
+			xa_unlock_irqrestore(xafp, iflags);
+			SG_LOG(1, sfp, "%s: xa_cmpxchg() failed, again\n",
+			       __func__);
+			goto try_again;
+		}
+	}
+fini:
+	if (!use_new_srp || res) {	/* n_srp was never installed */
+		sg_remove_sgat(n_srp);
+		kfree(n_srp);   /* no-one else has seen n_srp, so safe */
+	}
+	return res;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1357,26 +1503,43 @@ static int put_compat_request_table(struct compat_sg_req_info __user *o,
 }
 #endif
 
+/*
+ * For backward compatibility, output SG_MAX_QUEUE sg_req_info objects. First
+ * fetch from the active list then, if there is still room, from the free
+ * list. Some of the trailing elements may be empty which is indicated by all
+ * fields being zero. Any requests beyond SG_MAX_QUEUE are ignored.
+ */
 static int
 sg_ctl_req_tbl(struct sg_fd *sfp, void __user *p)
 {
-	int result, val;
+	int k, result, val;
 	unsigned long idx;
 	struct sg_request *srp;
-	sg_req_info_t *rinfop;
+	struct sg_req_info *rinfop;
 
-	rinfop = kcalloc(SG_MAX_QUEUE, SZ_SG_REQ_INFO,
-			 GFP_KERNEL);
+	SG_LOG(3, sfp, "%s:    SG_GET_REQUEST_TABLE\n", __func__);
+	k = SG_MAX_QUEUE;
+	rinfop = kcalloc(k, SZ_SG_REQ_INFO, GFP_KERNEL);
 	if (!rinfop)
 		return -ENOMEM;
 	val = 0;
 	xa_for_each(&sfp->srp_arr, idx, srp) {
 		if (!srp)
 			continue;
-		if (xa_get_mark(&sfp->srp_arr, idx, SG_XA_RQ_AWAIT))
+		if (val >= SG_MAX_QUEUE)
+			break;
+		if (xa_get_mark(&sfp->srp_arr, idx, SG_XA_RQ_INACTIVE))
+			continue;
+		sg_fill_request_element(sfp, srp, rinfop + val);
+		val++;
+	}
+	xa_for_each(&sfp->srp_arr, idx, srp) {
+		if (!srp)
 			continue;
 		if (val >= SG_MAX_QUEUE)
 			break;
+		if (!xa_get_mark(&sfp->srp_arr, idx, SG_XA_RQ_INACTIVE))
+			continue;
 		sg_fill_request_element(sfp, srp, rinfop + val);
 		val++;
 	}
@@ -1443,18 +1606,15 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		sfp->force_packid = val ? 1 : 0;
+		assign_bit(SG_FFD_FORCE_PACKID, sfp->ffd_bm, !!val);
 		return 0;
 	case SG_GET_PACK_ID:    /* or tag of oldest "read"-able, -1 if none */
 		val = -1;
-		srp = NULL;
 		xa_for_each_marked(&sfp->srp_arr, idx, srp, SG_XA_RQ_AWAIT) {
 			if (!srp)
 				continue;
-			if ((1 == srp->done) && (!srp->sg_io_owned)) {
-				val = srp->header.pack_id;
-				break;
-			}
+			val = srp->pack_id;
+			break;
 		}
 		SG_LOG(3, sfp, "%s:    SG_GET_PACK_ID=%d\n", __func__, val);
 		return put_user(val, ip);
@@ -1465,44 +1625,48 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		       sdp->max_sgat_sz);
 		return put_user(sdp->max_sgat_sz, ip);
 	case SG_SET_RESERVED_SIZE:
-		mutex_lock(&sfp->f_mutex);
 		result = get_user(val, ip);
 		if (!result) {
 			if (val >= 0 && val <= (1024 * 1024 * 1024)) {
+				mutex_lock(&sfp->f_mutex);
 				result = sg_set_reserved_sz(sfp, val);
+				mutex_unlock(&sfp->f_mutex);
 			} else {
 				SG_LOG(3, sfp, "%s: invalid size\n", __func__);
 				result = -EINVAL;
 			}
 		}
-		mutex_unlock(&sfp->f_mutex);
 		return result;
 	case SG_GET_RESERVED_SIZE:
-		val = min_t(int, sfp->reserve.buflen,
-			    max_sectors_bytes(sdev->request_queue));
+		mutex_lock(&sfp->f_mutex);
+		val = min_t(int, sfp->rsv_srp->sgat_h.buflen,
+			    sdp->max_sgat_sz);
+		mutex_unlock(&sfp->f_mutex);
 		SG_LOG(3, sfp, "%s:    SG_GET_RESERVED_SIZE=%d\n",
 		       __func__, val);
-		return put_user(val, ip);
+		result = put_user(val, ip);
+		return result;
 	case SG_SET_COMMAND_Q:
 		SG_LOG(3, sfp, "%s:    SG_SET_COMMAND_Q\n", __func__);
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		sfp->cmd_q = val ? 1 : 0;
+		assign_bit(SG_FFD_CMD_Q, sfp->ffd_bm, !!val);
 		return 0;
 	case SG_GET_COMMAND_Q:
 		SG_LOG(3, sfp, "%s:    SG_GET_COMMAND_Q\n", __func__);
-		return put_user((int) sfp->cmd_q, ip);
+		return put_user(test_bit(SG_FFD_CMD_Q, sfp->ffd_bm), ip);
 	case SG_SET_KEEP_ORPHAN:
 		SG_LOG(3, sfp, "%s:    SG_SET_KEEP_ORPHAN\n", __func__);
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		sfp->keep_orphan = val;
+		assign_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm, !!val);
 		return 0;
 	case SG_GET_KEEP_ORPHAN:
 		SG_LOG(3, sfp, "%s:    SG_GET_KEEP_ORPHAN\n", __func__);
-		return put_user((int) sfp->keep_orphan, ip);
+		return put_user(test_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm),
+				ip);
 	case SG_GET_VERSION_NUM:
 		SG_LOG(3, sfp, "%s:    SG_GET_VERSION_NUM\n", __func__);
 		return put_user(sg_version_num, ip);
@@ -1557,6 +1721,8 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		return put_user(val, ip);
 	case SG_EMULATED_HOST:
 		SG_LOG(3, sfp, "%s:    SG_EMULATED_HOST\n", __func__);
+		if (unlikely(SG_IS_DETACHING(sdp)))
+			return -ENODEV;
 		return put_user(sdev->host->hostt->emulated, ip);
 	case SCSI_IOCTL_SEND_COMMAND:
 		SG_LOG(3, sfp, "%s:    SCSI_IOCTL_SEND_COMMAND\n", __func__);
@@ -1567,7 +1733,7 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		result = get_user(val, ip);
 		if (result)
 			return result;
-		assign_bit(SG_FDEV_LOG_SENSE, sdp->fdev_bm, val);
+		assign_bit(SG_FDEV_LOG_SENSE, sdp->fdev_bm, !!val);
 		if (val == 0)	/* user can force recalculation */
 			sg_calc_sgat_param(sdp);
 		return 0;
@@ -1612,7 +1778,7 @@ sg_ioctl_common(struct file *filp, struct sg_device *sdp, struct sg_fd *sfp,
 		break;
 	}
 	result = sg_allow_if_err_recovery(sdp, filp->f_flags & O_NDELAY);
-	if (result)
+	if (unlikely(result))
 		return result;
 	return -ENOIOCTLCMD;
 }
@@ -1675,7 +1841,7 @@ sg_poll(struct file *filp, poll_table * wait)
 
 	if (unlikely(SG_IS_DETACHING(sfp->parentdp)))
 		p_res |= EPOLLHUP;
-	else if (likely(sfp->cmd_q))
+	else if (likely(test_bit(SG_FFD_CMD_Q, sfp->ffd_bm)))
 		p_res |= EPOLLOUT | EPOLLWRNORM;
 	else if (atomic_read(&sfp->submitted) == 0)
 		p_res |= EPOLLOUT | EPOLLWRNORM;
@@ -1697,9 +1863,10 @@ static vm_fault_t
 sg_vma_fault(struct vm_fault *vmf)
 {
 	int k, length;
-	unsigned long offset, len, sa;
+	unsigned long offset, len, sa, iflags;
 	struct vm_area_struct *vma = vmf->vma;
 	struct sg_scatter_hold *rsv_schp;
+	struct sg_request *srp;
 	struct sg_device *sdp;
 	struct sg_fd *sfp;
 	const char *nbp = "==NULL, bad";
@@ -1718,12 +1885,18 @@ sg_vma_fault(struct vm_fault *vmf)
 		SG_LOG(1, sfp, "%s: device detaching\n", __func__);
 		goto out_err;
 	}
-	rsv_schp = &sfp->reserve;
+	srp = sfp->rsv_srp;
+	if (!srp) {
+		SG_LOG(1, sfp, "%s: srp%s\n", __func__, nbp);
+		goto out_err;
+	}
+	xa_lock_irqsave(&sfp->srp_arr, iflags);
+	rsv_schp = &srp->sgat_h;
 	offset = vmf->pgoff << PAGE_SHIFT;
 	if (offset >= (unsigned int)rsv_schp->buflen) {
 		SG_LOG(1, sfp, "%s: offset[%lu] >= rsv.buflen\n", __func__,
 		       offset);
-		goto out_err;
+		goto out_err_unlock;
 	}
 	sa = vma->vm_start;
 	SG_LOG(3, sfp, "%s: vm_start=0x%lx, off=%lu\n", __func__, sa, offset);
@@ -1736,6 +1909,7 @@ sg_vma_fault(struct vm_fault *vmf)
 			struct page *pp;
 
 			pp = rsv_schp->pages[k];
+			xa_unlock_irqrestore(&sfp->srp_arr, iflags);
 			page = nth_page(pp, offset >> PAGE_SHIFT);
 			get_page(page); /* increment page count */
 			vmf->page = page;
@@ -1744,6 +1918,8 @@ sg_vma_fault(struct vm_fault *vmf)
 		sa += len;
 		offset -= len;
 	}
+out_err_unlock:
+	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
 out_err:
 	return VM_FAULT_SIGBUS;
 }
@@ -1758,9 +1934,10 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	int k, length;
 	int ret = 0;
-	unsigned long req_sz, len, sa;
+	unsigned long req_sz, len, sa, iflags;
 	struct sg_scatter_hold *rsv_schp;
 	struct sg_fd *sfp;
+	struct sg_request *srp;
 
 	if (!filp || !vma)
 		return -ENXIO;
@@ -1776,11 +1953,13 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 		return -EINVAL; /* only an offset of 0 accepted */
 	/* Check reserve request is inactive and has large enough buffer */
 	mutex_lock(&sfp->f_mutex);
-	if (sfp->res_in_use) {
+	srp = sfp->rsv_srp;
+	xa_lock_irqsave(&sfp->srp_arr, iflags);
+	if (SG_RS_ACTIVE(srp)) {
 		ret = -EBUSY;
 		goto out;
 	}
-	rsv_schp = &sfp->reserve;
+	rsv_schp = &srp->sgat_h;
 	if (req_sz > (unsigned long)rsv_schp->buflen) {
 		ret = -ENOMEM;
 		goto out;
@@ -1793,11 +1972,12 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma)
 		sa += len;
 	}
 
-	sfp->mmap_called = 1;
+	set_bit(SG_FFD_MMAP_CALLED, sfp->ffd_bm);
 	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
 	vma->vm_private_data = sfp;
 	vma->vm_ops = &sg_mmap_vm_ops;
 out:
+	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
 	mutex_unlock(&sfp->f_mutex);
 	return ret;
 }
@@ -1819,8 +1999,10 @@ sg_rq_end_io_usercontext(struct work_struct *work)
 		return;
 	}
 	SG_LOG(3, sfp, "%s: srp=0x%pK\n", __func__, srp);
-	sg_finish_scsi_blk_rq(srp);
-	sg_deact_request(sfp, srp);
+	if (test_bit(SG_FRQ_DEACT_ORPHAN, srp->frq_bm)) {
+		sg_finish_scsi_blk_rq(srp);	/* clean up orphan case */
+		sg_deact_request(sfp, srp);
+	}
 	kref_put(&sfp->f_ref, sg_remove_sfp);
 }
 
@@ -1858,85 +2040,95 @@ sg_check_sense(struct sg_device *sdp, struct sg_request *srp, int sense_len)
 }
 
 /*
- * This function is a "bottom half" handler that is called by the mid
- * level when a command is completed (or has failed).
+ * This "bottom half" (soft interrupt) handler is called by the mid-level
+ * when a request has completed or failed. This callback is registered in a
+ * blk_execute_rq_nowait() call in sg_common_write(). For ioctl(SG_IO)
+ * (sync) usage, sg_ctl_sg_io() waits to be woken up by this callback.
  */
 static void
 sg_rq_end_io(struct request *rq, blk_status_t status)
 {
+	enum sg_rq_state rqq_state = SG_RS_AWAIT_RCV;
+	int a_resid, slen;
 	struct sg_request *srp = rq->end_io_data;
 	struct scsi_request *scsi_rp = scsi_req(rq);
 	struct sg_device *sdp;
 	struct sg_fd *sfp;
-	unsigned int ms;
-	int resid, slen;
-	int done = 1;
-	unsigned long iflags;
 
-	if (WARN_ON(srp->done != 0))
+	if (!scsi_rp) {
+		WARN_ONCE(1, "%s: scsi_req(rq) unexpectedly NULL\n", __func__);
 		return;
-
-	sfp = srp->parentfp;
-	if (WARN_ON(sfp == NULL))
+	}
+	if (!srp) {
+		WARN_ONCE(1, "%s: srp unexpectedly NULL\n", __func__);
 		return;
-
+	}
+	if (WARN_ON(atomic_read(&srp->rq_st) != SG_RS_INFLIGHT)) {
+		pr_warn("%s: bad rq_st=%d\n", __func__,
+			atomic_read(&srp->rq_st));
+		goto early_err;
+	}
+	sfp = srp->parentfp;
+	if (unlikely(!sfp)) {
+		WARN_ONCE(1, "%s: sfp unexpectedly NULL", __func__);
+		goto early_err;
+	}
 	sdp = sfp->parentdp;
 	if (unlikely(SG_IS_DETACHING(sdp)))
 		pr_info("%s: device detaching\n", __func__);
 
 	srp->rq_result = scsi_rp->result;
-	resid = scsi_rp->resid_len;
-
-	srp->header.resid = resid;
-
 	slen = min_t(int, scsi_rp->sense_len, SCSI_SENSE_BUFFERSIZE);
+	a_resid = scsi_rp->resid_len;
 
-	SG_LOG(6, sfp, "%s: pack_id=%d, res=0x%x\n", __func__,
-	       srp->header.pack_id, srp->rq_result);
-	ms = jiffies_to_msecs(jiffies);
-	srp->header.duration = (ms > srp->header.duration) ?
-				(ms - srp->header.duration) : 0;
-	if (srp->rq_result != 0 && slen > 0)
+	if (a_resid)
+		srp->in_resid = a_resid;
+
+	SG_LOG(6, sfp, "%s: pack_id=%d, res=0x%x\n", __func__, srp->pack_id,
+	       srp->rq_result);
+	srp->duration = sg_calc_rq_dur(srp);
+	if (unlikely((srp->rq_result & SG_ML_RESULT_MSK) && slen > 0))
 		sg_check_sense(sdp, srp, slen);
 	if (slen > 0)
 		memcpy(srp->sense_b, scsi_rp->sense, slen);
-
-	/* Rely on write phase to clean out srp status values, so no "else" */
-
-	if (!srp->sg_io_owned)
+	srp->sense_len = slen;
+	if (unlikely(test_bit(SG_FRQ_IS_ORPHAN, srp->frq_bm))) {
+		if (test_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm)) {
+			clear_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm);
+		} else {
+			rqq_state = SG_RS_BUSY;
+			set_bit(SG_FRQ_DEACT_ORPHAN, srp->frq_bm);
+		}
+	}
+	if (!test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm))
 		atomic_inc(&sfp->waiting);
+	if (unlikely(sg_rq_state_chg(srp, SG_RS_INFLIGHT, rqq_state,
+				     false, __func__)))
+		pr_warn("%s: can't set rq_st\n", __func__);
 	/*
-	 * Free the request as soon as it is complete so that its resources
-	 * can be reused without waiting for userspace to read() the
-	 * result.  But keep the associated bio (if any) around until
-	 * blk_rq_unmap_user() can be called from user context.
+	 * Free the mid-level resources apart from the bio (if any). The bio's
+	 * blk_rq_unmap_user() can be called later from user context.
 	 */
 	srp->rq = NULL;
-	scsi_req_free_cmd(scsi_req(rq));
+	scsi_req_free_cmd(scsi_rp);
 	blk_put_request(rq);
 
-	if (unlikely(srp->orphan)) {
-		if (sfp->keep_orphan)
-			srp->sg_io_owned = 0;
-		else
-			done = 0;
-	}
-	srp->done = done;
-
-	if (likely(done)) {
-		/* Now wake up any sg_read() that is waiting for this
-		 * packet.
-		 */
-		xa_lock_irqsave(&sfp->srp_arr, iflags);
-		__xa_set_mark(&sfp->srp_arr, srp->rq_idx, SG_XA_RQ_AWAIT);
-		xa_unlock_irqrestore(&sfp->srp_arr, iflags);
+	if (likely(rqq_state == SG_RS_AWAIT_RCV)) {
+		/* Wake any sg_read()/ioctl(SG_IORECEIVE) awaiting this req */
 		wake_up_interruptible(&sfp->read_wait);
 		kill_fasync(&sfp->async_qp, SIGPOLL, POLL_IN);
 		kref_put(&sfp->f_ref, sg_remove_sfp);
-	} else {
+	} else {        /* clean up orphaned request that aren't being kept */
 		INIT_WORK(&srp->ew_orph.work, sg_rq_end_io_usercontext);
 		schedule_work(&srp->ew_orph.work);
 	}
+	return;
+
+early_err:
+	srp->rq = NULL;
+	if (scsi_rp)
+		scsi_req_free_cmd(scsi_rp);
+	blk_put_request(rq);
 }
 
 static const struct file_operations sg_fops = {
@@ -1964,9 +2156,9 @@ static struct sg_device *
 sg_add_device_helper(struct gendisk *disk, struct scsi_device *scsidp)
 {
 	struct sg_device *sdp;
-	unsigned long iflags;
 	int error;
 	u32 k;
+	unsigned long iflags;
 
 	sdp = kzalloc(sizeof(*sdp), GFP_KERNEL);
 	if (!sdp)
@@ -1984,7 +2176,7 @@ sg_add_device_helper(struct gendisk *disk, struct scsi_device *scsidp)
 			error = -ENODEV;
 		} else {
 			sdev_printk(KERN_WARNING, scsidp,
-				    "%s: idr alloc sg_device failure: %d\n",
+				"%s: idr allocation sg_device failure: %d\n",
 				    __func__, error);
 		}
 		goto out_unlock;
@@ -2186,6 +2378,7 @@ init_sg(void)
 				    SG_MAX_DEVS, "sg");
 	if (rc)
 		return rc;
+
 	pr_info("Registered %s[char major=0x%x], version: %s, date: %s\n",
 		"sg device ", SCSI_GENERIC_MAJOR, SG_VERSION_STR,
 		sg_version_date);
@@ -2201,6 +2394,7 @@ init_sg(void)
 		return 0;
 	}
 	class_destroy(sg_sysfs_class);
+
 err_out_unreg:
 	unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS);
 	return rc;
@@ -2227,6 +2421,19 @@ exit_sg(void)
 	idr_destroy(&sg_index_idr);
 }
 
+static inline bool
+sg_chk_dio_allowed(struct sg_device *sdp, struct sg_request *srp,
+		   int iov_count, int dir)
+{
+	if (sg_allow_dio && (srp->rq_flags & SG_FLAG_DIRECT_IO)) {
+		if (dir != SG_DXFER_UNKNOWN && !iov_count) {
+			if (!sdp->device->host->unchecked_isa_dma)
+				return true;
+		}
+	}
+	return false;
+}
+
 static void
 sg_set_map_data(const struct sg_scatter_hold *schp, bool up_valid,
 		struct rq_map_data *mdp)
@@ -2240,31 +2447,40 @@ sg_set_map_data(const struct sg_scatter_hold *schp, bool up_valid,
 }
 
 static int
-sg_start_req(struct sg_request *srp, u8 *cmd)
+sg_start_req(struct sg_request *srp, u8 *cmd, int cmd_len, int dxfer_dir)
 {
-	int res;
+	bool reserved, us_xfer;
+	int res = 0;
+	int dxfer_len = 0;
+	int r0w = READ;
+	unsigned int iov_count = 0;
+	void __user *up;
 	struct request *rq;
-	struct scsi_request *req;
+	struct scsi_request *scsi_rp;
 	struct sg_fd *sfp = srp->parentfp;
-	struct sg_io_hdr *hp = &srp->header;
-	int dxfer_len = (int) hp->dxfer_len;
-	int dxfer_dir = hp->dxfer_direction;
-	unsigned int iov_count = hp->iovec_count;
-	struct sg_scatter_hold *req_schp = &srp->data;
-	struct sg_scatter_hold *rsv_schp = &sfp->reserve;
-	struct request_queue *q = sfp->parentdp->device->request_queue;
-	struct rq_map_data *md, map_data;
-	int r0w = hp->dxfer_direction == SG_DXFER_TO_DEV ? WRITE : READ;
+	struct sg_device *sdp;
+	struct sg_scatter_hold *req_schp;
+	struct request_queue *q;
+	struct rq_map_data *md = (void *)srp; /* want any non-NULL value */
 	u8 *long_cmdp = NULL;
+	__maybe_unused const char *cp = "";
+	struct sg_slice_hdr3 *sh3p = &srp->s_hdr3;
+	struct rq_map_data map_data;
 
-	if (hp->cmd_len > BLK_MAX_CDB) {
-		long_cmdp = kzalloc(hp->cmd_len, GFP_KERNEL);
+	sdp = sfp->parentdp;
+	if (cmd_len > BLK_MAX_CDB) {	/* for longer SCSI cdb_s */
+		long_cmdp = kzalloc(cmd_len, GFP_KERNEL);
 		if (!long_cmdp)
 			return -ENOMEM;
 		SG_LOG(5, sfp, "%s: long_cmdp=0x%pK ++\n", __func__, long_cmdp);
 	}
+	up = sh3p->dxferp;
+	dxfer_len = (int)sh3p->dxfer_len;
+	iov_count = sh3p->iovec_count;
+	r0w = dxfer_dir == SG_DXFER_TO_DEV ? WRITE : READ;
 	SG_LOG(4, sfp, "%s: dxfer_len=%d, data-%s\n", __func__, dxfer_len,
 	       (r0w ? "OUT" : "IN"));
+	q = sdp->device->request_queue;
 
 	/*
 	 * NOTE
@@ -2277,125 +2493,142 @@ sg_start_req(struct sg_request *srp, u8 *cmd)
 	 * do not want to use BLK_MQ_REQ_NOWAIT here because userspace might
 	 * not expect an EWOULDBLOCK from this condition.
 	 */
-	rq = blk_get_request(q, hp->dxfer_direction == SG_DXFER_TO_DEV ?
-			REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
+	rq = blk_get_request(q, (r0w ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN), 0);
 	if (IS_ERR(rq)) {
 		kfree(long_cmdp);
 		return PTR_ERR(rq);
 	}
-	req = scsi_req(rq);
-
-	if (hp->cmd_len > BLK_MAX_CDB)
-		req->cmd = long_cmdp;
-	memcpy(req->cmd, cmd, hp->cmd_len);
-	req->cmd_len = hp->cmd_len;
-
+	/* current sg_request protected by SG_RS_BUSY state */
+	scsi_rp = scsi_req(rq);
 	srp->rq = rq;
-	rq->end_io_data = srp;
-	req->retries = SG_DEFAULT_RETRIES;
-
-	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
-		return 0;
 
-	if (sg_allow_dio && hp->flags & SG_FLAG_DIRECT_IO &&
-	    dxfer_dir != SG_DXFER_UNKNOWN && !iov_count &&
-	    !sfp->parentdp->device->host->unchecked_isa_dma &&
-	    blk_rq_aligned(q, (unsigned long)hp->dxferp, dxfer_len))
+	if (cmd_len > BLK_MAX_CDB)
+		scsi_rp->cmd = long_cmdp;
+	memcpy(scsi_rp->cmd, cmd, cmd_len);
+	scsi_rp->cmd_len = cmd_len;
+	us_xfer = !(srp->rq_flags & SG_FLAG_NO_DXFER);
+	assign_bit(SG_FRQ_NO_US_XFER, srp->frq_bm, !us_xfer);
+	reserved = (sfp->rsv_srp == srp);
+	rq->end_io_data = srp;
+	scsi_rp->retries = SG_DEFAULT_RETRIES;
+	req_schp = &srp->sgat_h;
+
+	if (dxfer_len <= 0 || dxfer_dir == SG_DXFER_NONE) {
+		SG_LOG(4, sfp, "%s: no data xfer [0x%pK]\n", __func__, srp);
+		set_bit(SG_FRQ_NO_US_XFER, srp->frq_bm);
+		goto fini;	/* path of reqs with no din nor dout */
+	} else if (sg_chk_dio_allowed(sdp, srp, iov_count, dxfer_dir) &&
+		   blk_rq_aligned(q, (unsigned long)up, dxfer_len)) {
+		srp->rq_info |= SG_INFO_DIRECT_IO;
 		md = NULL;
-	else
+		if (IS_ENABLED(CONFIG_SCSI_PROC_FS))
+			cp = "direct_io, ";
+	} else {	/* normal IO and failed conditions for dio path */
 		md = &map_data;
+	}
 
-	if (md) {
-		mutex_lock(&sfp->f_mutex);
-		if (dxfer_len <= rsv_schp->buflen &&
-		    !sfp->res_in_use) {
-			sfp->res_in_use = 1;
-			sg_link_reserve(sfp, srp, dxfer_len);
-		} else if (hp->flags & SG_FLAG_MMAP_IO) {
-			res = -EBUSY; /* sfp->res_in_use == 1 */
-			if (dxfer_len > rsv_schp->buflen)
-				res = -ENOMEM;
-			mutex_unlock(&sfp->f_mutex);
-			return res;
-		} else {
-			res = sg_mk_sgat(req_schp, sfp, dxfer_len);
-			if (res) {
-				mutex_unlock(&sfp->f_mutex);
-				return res;
-			}
+	if (likely(md)) {	/* normal, "indirect" IO */
+		if (unlikely((srp->rq_flags & SG_FLAG_MMAP_IO))) {
+			/* mmap IO must use and fit in reserve request */
+			if (!reserved || dxfer_len > req_schp->buflen)
+				res = reserved ? -ENOMEM : -EBUSY;
+		} else if (req_schp->buflen == 0) {
+			int up_sz = max_t(int, dxfer_len, sfp->sgat_elem_sz);
+
+			res = sg_mk_sgat(srp, sfp, up_sz);
 		}
-		mutex_unlock(&sfp->f_mutex);
+		if (res)
+			goto fini;
 
-		sg_set_map_data(req_schp, !!hp->dxferp, md);
+		sg_set_map_data(req_schp, !!up, md);
 		md->from_user = (dxfer_dir == SG_DXFER_TO_FROM_DEV);
 	}
 
-	if (iov_count) {
+	if (unlikely(iov_count)) {
 		struct iovec *iov = NULL;
 		struct iov_iter i;
 
-		res = import_iovec(r0w, hp->dxferp, iov_count, 0, &iov, &i);
+		res = import_iovec(r0w, up, iov_count, 0, &iov, &i);
 		if (res < 0)
-			return res;
+			goto fini;
 
-		iov_iter_truncate(&i, hp->dxfer_len);
+		iov_iter_truncate(&i, dxfer_len);
 		if (!iov_iter_count(&i)) {
 			kfree(iov);
-			return -EINVAL;
+			res = -EINVAL;
+			goto fini;
 		}
 
-		res = blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC);
+		res = us_xfer ?	/* else clear import_iovec() byte count */
+			blk_rq_map_user_iov(q, rq, md, &i, GFP_ATOMIC) : 0;
 		kfree(iov);
-	} else
-		res = blk_rq_map_user(q, rq, md, hp->dxferp,
-				      hp->dxfer_len, GFP_ATOMIC);
-
-	if (!res) {
+		if (IS_ENABLED(CONFIG_SCSI_PROC_FS))
+			cp = "iov_count > 0";
+	} else if (us_xfer) { /* setup for transfer data to/from user space */
+		res = blk_rq_map_user(q, rq, md, up, dxfer_len, GFP_ATOMIC);
+		if (IS_ENABLED(CONFIG_SCSI_PROC_FS) && res)
+			SG_LOG(1, sfp, "%s: blk_rq_map_user() res=%d\n",
+			       __func__, res);
+	}
+fini:
+	if (unlikely(res)) {		/* failure, free up resources */
+		scsi_req_free_cmd(scsi_rp);
+		srp->rq = NULL;
+		blk_put_request(rq);
+	} else {
 		srp->bio = rq->bio;
-
-		if (!md) {
-			req_schp->dio_in_use = 1;
-			hp->info |= SG_INFO_DIRECT_IO;
-		}
 	}
+	SG_LOG((res ? 1 : 4), sfp, "%s: %s res=%d [0x%pK]\n", __func__, cp,
+	       res, srp);
 	return res;
 }
 
+/*
+ * Clean up mid-level and block layer resources of finished request. Sometimes
+ * blk_rq_unmap_user() returns -4 (-EINTR) and this is why: "If we're in a
+ * workqueue, the request is orphaned, so don't copy into a random user
+ * address space, just free and return -EINTR so user space doesn't expect
+ * any data." [block/bio.c]
+ */
 static void
 sg_finish_scsi_blk_rq(struct sg_request *srp)
 {
 	int ret;
-
 	struct sg_fd *sfp = srp->parentfp;
-	struct sg_scatter_hold *req_schp = &srp->data;
+	struct request *rq = srp->rq;
 
 	SG_LOG(4, sfp, "%s: srp=0x%pK%s\n", __func__, srp,
-	       (srp->res_used) ? " rsv" : "");
-	if (!srp->sg_io_owned) {
+	       (srp->parentfp->rsv_srp == srp) ? " rsv" : "");
+	if (!test_bit(SG_FRQ_SYNC_INVOC, srp->frq_bm)) {
 		atomic_dec(&sfp->submitted);
 		atomic_dec(&sfp->waiting);
 	}
 	if (srp->bio) {
-		ret = blk_rq_unmap_user(srp->bio);
-		if (ret)	/* -EINTR (-4) can be ignored */
-			SG_LOG(6, sfp, "%s: blk_rq_unmap_user() --> %d\n",
-			       __func__, ret);
+		bool us_xfer = !test_bit(SG_FRQ_NO_US_XFER, srp->frq_bm);
+
+		if (us_xfer) {
+			ret = blk_rq_unmap_user(srp->bio);
+			if (ret) {	/* -EINTR (-4) can be ignored */
+				SG_LOG(6, sfp,
+				       "%s: blk_rq_unmap_user() --> %d\n",
+				       __func__, ret);
+			}
+		}
 		srp->bio = NULL;
 	}
+	/* In worst case READ data returned to user space by this point */
 
-	if (srp->rq) {
-		scsi_req_free_cmd(scsi_req(srp->rq));
-		blk_put_request(srp->rq);
+	/* Expect blk_put_request(rq) already called in sg_rq_end_io() */
+	if (rq) {       /* blk_get_request() may have failed */
+		if (scsi_req(rq))
+			scsi_req_free_cmd(scsi_req(rq));
+		srp->rq = NULL;
+		blk_put_request(rq);
 	}
-
-	if (srp->res_used)
-		sg_unlink_reserve(sfp, srp);
-	else
-		sg_remove_sgat(sfp, req_schp);
 }
 
 static int
-sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp, int minlen)
+sg_mk_sgat(struct sg_request *srp, struct sg_fd *sfp, int minlen)
 {
 	int j, k, rem_sz, align_sz, order, o_order;
 	int mx_sgat_elems = sfp->parentdp->max_sgat_elems;
@@ -2404,6 +2637,7 @@ sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp, int minlen)
 	gfp_t mask_ap = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN | __GFP_ZERO;
 	gfp_t mask_kz = GFP_ATOMIC | __GFP_NOWARN;
 	struct sg_device *sdp = sfp->parentdp;
+	struct sg_scatter_hold *schp = &srp->sgat_h;
 
 	if (unlikely(minlen <= 0)) {
 		if (minlen < 0)
@@ -2414,9 +2648,8 @@ sg_mk_sgat(struct sg_scatter_hold *schp, struct sg_fd *sfp, int minlen)
 	align_sz = ALIGN(minlen, SG_DEF_SECTOR_SZ);
 
 	schp->pages = kcalloc(mx_sgat_elems, ptr_sz, mask_kz);
-	SG_LOG(4, sfp, "%s: minlen=%d, align_sz=%d [sz=%zu, 0x%pK ++]\n",
-	       __func__, minlen, align_sz, mx_sgat_elems * ptr_sz,
-	       schp->pages);
+	SG_LOG(4, sfp, "%s: minlen=%d [sz=%zu, 0x%pK ++]\n", __func__, minlen,
+	       mx_sgat_elems * ptr_sz, schp->pages);
 	if (unlikely(!schp->pages))
 		return -ENOMEM;
 
@@ -2480,13 +2713,15 @@ sg_remove_sgat_helper(struct sg_fd *sfp, struct sg_scatter_hold *schp)
 
 /* Remove the data (possibly a sgat list) held by srp, not srp itself */
 static void
-sg_remove_sgat(struct sg_fd *sfp, struct sg_scatter_hold *schp)
+sg_remove_sgat(struct sg_request *srp)
 {
+	struct sg_scatter_hold *schp = &srp->sgat_h; /* care: remove own data */
+	struct sg_fd *sfp = srp->parentfp;
+
 	SG_LOG(4, sfp, "%s: num_sgat=%d%s\n", __func__, schp->num_sgat,
-	       ((sfp ? (&sfp->reserve == schp) : false) ?
+	       ((srp->parentfp ? (sfp->rsv_srp == srp) : false) ?
 		" [rsv]" : ""));
-	if (!schp->dio_in_use)
-		sg_remove_sgat_helper(sfp, schp);
+	sg_remove_sgat_helper(sfp, schp);
 
 	memset(schp, 0, sizeof(*schp));         /* zeros buflen and dlen */
 }
@@ -2501,7 +2736,7 @@ sg_read_append(struct sg_request *srp, void __user *outp, int num_xfer)
 {
 	int k, num, res;
 	struct page *pgp;
-	struct sg_scatter_hold *schp = &srp->data;
+	struct sg_scatter_hold *schp = &srp->sgat_h;
 
 	SG_LOG(4, srp->parentfp, "%s: num_xfer=%d\n", __func__, num_xfer);
 	if (unlikely(!outp || num_xfer <= 0))
@@ -2515,11 +2750,11 @@ sg_read_append(struct sg_request *srp, void __user *outp, int num_xfer)
 			break;
 		}
 		if (num > num_xfer) {
-			if (__copy_to_user(outp, page_address(pgp), num_xfer))
+			if (copy_to_user(outp, page_address(pgp), num_xfer))
 				res = -EFAULT;
 			break;
 		} else {
-			if (__copy_to_user(outp, page_address(pgp), num)) {
+			if (copy_to_user(outp, page_address(pgp), num)) {
 				res = -EFAULT;
 				break;
 			}
@@ -2542,135 +2777,273 @@ sg_read_append(struct sg_request *srp, void __user *outp, int num_xfer)
 static struct sg_request *
 sg_find_srp_by_id(struct sg_fd *sfp, int pack_id)
 {
+	__maybe_unused bool is_bad_st = false;	/* for log message only */
+	__maybe_unused enum sg_rq_state bad_sr_st = SG_RS_INACTIVE;
+	bool search_for_1 = (pack_id != SG_PACK_ID_WILDCARD);
+	int res;
+	int num_waiting = atomic_read(&sfp->waiting);
 	unsigned long idx;
-	struct sg_request *resp;
+	struct sg_request *srp = NULL;
+	struct xarray *xafp = &sfp->srp_arr;
 
-	xa_for_each_marked(&sfp->srp_arr, idx, resp, SG_XA_RQ_AWAIT) {
-		if (!resp)
-			continue;
-		/* look for requests that are ready + not SG_IO owned */
-		if (resp->done == 1 && !resp->sg_io_owned &&
-		    (-1 == pack_id || resp->header.pack_id == pack_id)) {
-			resp->done = 2;	/* guard against other readers */
-			return resp;
+	if (num_waiting < 1)	/* sfp->waiting below 1: skip the scan */
+		return NULL;
+	if (unlikely(search_for_1)) {
+		xa_for_each_marked(xafp, idx, srp, SG_XA_RQ_AWAIT) {
+			if (!srp)
+				continue;
+			if (srp->pack_id != pack_id)
+				continue;
+			res = sg_rq_state_chg(srp, SG_RS_AWAIT_RCV, SG_RS_BUSY,
+					      false, __func__);
+			if (likely(res == 0))
+				goto good;
+			/* else another caller got it, give up the search */
+			if (IS_ENABLED(CONFIG_SCSI_PROC_FS)) {
+				is_bad_st = true;
+				bad_sr_st = atomic_read(&srp->rq_st);
+			}
+			break;
+		}
+	} else {        /* search for any request is more likely */
+		xa_for_each_marked(xafp, idx, srp, SG_XA_RQ_AWAIT) {
+			if (!srp)
+				continue;
+			res = sg_rq_state_chg(srp, SG_RS_AWAIT_RCV, SG_RS_BUSY,
+					      false, __func__);
+			if (likely(res == 0))
+				goto good;
+		}
+	}
+	/* here if one of above loops does _not_ find a match */
+	if (IS_ENABLED(CONFIG_SCSI_PROC_FS)) {
+		if (search_for_1) {
+			const char *cptp = "pack_id=";
+
+			if (is_bad_st)
+				SG_LOG(1, sfp, "%s: %s%d wrong state: %s\n",
+				       __func__, cptp, pack_id,
+				       sg_rq_st_str(bad_sr_st, true));
+			else
+				SG_LOG(6, sfp, "%s: %s%d not awaiting read\n",
+				       __func__, cptp, pack_id);
 		}
 	}
 	return NULL;
+good:	/* srp was moved from SG_RS_AWAIT_RCV to SG_RS_BUSY by this caller */
+	SG_LOG(6, sfp, "%s: %s%d found [srp=0x%pK]\n", __func__, "pack_id=",
+	       pack_id, srp);
+	return srp;
 }
 
-static void
-sg_link_reserve(struct sg_fd *sfp, struct sg_request *srp, int size)
+/*
+ * Allocates a zeroed sg_request object. If 'first' is set GFP_KERNEL is used
+ * (may take time but has improved chance of success), otherwise GFP_ATOMIC.
+ * Only the state (SG_RS_BUSY) and parentfp are set; srp is not inserted into
+ * sfp->srp_arr here. On error returns ERR_PTR(-ENOMEM), never NULL.
+ */
+static struct sg_request *
+sg_mk_srp(struct sg_fd *sfp, bool first)
 {
-	struct sg_scatter_hold *req_schp = &srp->data;
-	struct sg_scatter_hold *rsv_schp = &sfp->reserve;
-	int k, num, rem;
-
-	srp->res_used = 1;
-	SG_LOG(4, sfp, "%s: size=%d\n", __func__, size);
-	rem = size;
-
-	num = 1 << (PAGE_SHIFT + rsv_schp->page_order);
-	for (k = 0; k < rsv_schp->num_sgat; k++) {
-		if (rem <= num) {
-			req_schp->num_sgat = k + 1;
-			req_schp->sglist_len = rsv_schp->sglist_len;
-			req_schp->pages = rsv_schp->pages;
+	struct sg_request *srp;
+	gfp_t gfp =  __GFP_NOWARN;	/* OR-ed into both allocations below */
 
-			req_schp->buflen = size;
-			req_schp->page_order = rsv_schp->page_order;
-			break;
-		} else
-			rem -= num;
+	if (first)      /* prepared to wait if none already outstanding */
+		srp = kzalloc(sizeof(*srp), gfp | GFP_KERNEL);
+	else
+		srp = kzalloc(sizeof(*srp), gfp | GFP_ATOMIC);
+	if (srp) {
+		atomic_set(&srp->rq_st, SG_RS_BUSY);
+		srp->parentfp = sfp;
+		return srp;
+	} else {
+		return ERR_PTR(-ENOMEM);
 	}
-
-	if (k >= rsv_schp->num_sgat)
-		SG_LOG(1, sfp, "%s: BAD size\n", __func__);
 }
 
-static void
-sg_unlink_reserve(struct sg_fd *sfp, struct sg_request *srp)
+/* Makes a sg_request and, if db_len > 0, its sgat; ERR_PTR() on failure */
+static struct sg_request *
+sg_mk_srp_sgat(struct sg_fd *sfp, bool first, int db_len)
 {
-	struct sg_scatter_hold *req_schp = &srp->data;
+	int res;
+	struct sg_request *n_srp = sg_mk_srp(sfp, first);
 
-	SG_LOG(4, srp->parentfp, "%s: req->num_sgat=%d\n", __func__,
-	       (int)req_schp->num_sgat);
-	req_schp->num_sgat = 0;
-	req_schp->buflen = 0;
-	req_schp->pages = NULL;
-	req_schp->page_order = 0;
-	req_schp->sglist_len = 0;
-	srp->res_used = 0;
-	/* Called without mutex lock to avoid deadlock */
-	sfp->res_in_use = 0;
+	if (IS_ERR(n_srp) || db_len <= 0)
+		return n_srp;
+	res = sg_mk_sgat(n_srp, sfp, db_len);
+	if (res == 0)
+		return n_srp;
+	/* sg_mk_sgat() may leave a partly built sgat behind: release it */
+	sg_remove_sgat(n_srp);
+	kfree(n_srp);
+	return ERR_PTR(res);
 }
 
-static void
-sg_build_reserve(struct sg_fd *sfp, int req_size)
+/*
+ * Whatever buflen is given, the minimum size requested is PAGE_SIZE (often
+ * 4096 bytes). Returns the reserve request; the number of bytes allocated
+ * (maybe less than buflen) is in srp->sgat_h.buflen . On failure frees the
+ * request, clears sfp->rsv_srp and returns a negated errno twisted by
+ * ERR_PTR(). Only called in contexts where locking is not required.
+ */
+static struct sg_request *
+sg_build_reserve(struct sg_fd *sfp, int buflen)
 {
-	struct sg_scatter_hold *schp = &sfp->reserve;
+	bool go_out = false;
+	int res;
+	struct sg_request *srp;
 
-	SG_LOG(3, sfp, "%s: buflen=%d\n", __func__, req_size);
+	SG_LOG(3, sfp, "%s: buflen=%d\n", __func__, buflen);
+	srp = sg_mk_srp(sfp, xa_empty(&sfp->srp_arr));
+	if (IS_ERR(srp))
+		return srp;
+	sfp->rsv_srp = srp;
 	do {
-		if (req_size < PAGE_SIZE)
-			req_size = PAGE_SIZE;
-		if (sg_mk_sgat(schp, sfp, req_size) == 0)
-			return;
-		sg_remove_sgat(sfp, schp);
-		req_size >>= 1;	/* divide by 2 */
-	} while (req_size > (PAGE_SIZE / 2));
+		if (buflen < (int)PAGE_SIZE) {
+			buflen = PAGE_SIZE;
+			go_out = true;
+		}
+		res = sg_mk_sgat(srp, sfp, buflen);
+		if (res == 0) {
+			SG_LOG(4, sfp, "%s: final buflen=%d, srp=0x%pK ++\n",
+			       __func__, buflen, srp);
+			return srp;
+		}
+		sg_remove_sgat(srp);	/* failed: drop any partial sgat */
+		buflen >>= 1;		/* halve then, unless go_out, retry */
+	} while (!go_out);
+	sfp->rsv_srp = NULL;	/* every attempt failed, so don't leak srp */
+	kfree(srp);
+	return ERR_PTR(res);
 }
 
-/* always adds to end of list */
+/*
+ * Sets up an active request (soon to carry a SCSI command) on the given file
+ * descriptor by re-using a request marked SG_XA_RQ_INACTIVE in sfp->srp_arr
+ * or by making a new one. If successful returns a valid pointer in SG_RS_BUSY
+ * state. On failure returns a negated errno value twisted by ERR_PTR() macro.
+ */
 static struct sg_request *
-sg_setup_req(struct sg_fd *sfp)
+sg_setup_req(struct sg_fd *sfp, int dxfr_len, struct sg_comm_wr_t *cwrp)
 {
+	bool act_empty = false;
 	bool found = false;
+	bool mk_new_srp = false;
+	bool try_harder = false;	/* set after a failed allocation */
 	int res;
-	unsigned long idx, iflags;
-	struct sg_request *rp;
+	int num_inactive = 0;	/* not reset when jumping to start_again */
+	unsigned long idx, last_idx, iflags;
+	struct sg_request *r_srp = NULL;	/* request to return */
 	struct xarray *xafp = &sfp->srp_arr;
-
-	if (!xa_empty(xafp)) {
-		xa_for_each_marked(xafp, idx, rp, SG_XA_RQ_INACTIVE) {
-			if (!rp)
+	__maybe_unused const char *cp;
+
+start_again:
+	cp = "";
+	if (xa_empty(xafp)) {
+		act_empty = true;
+		mk_new_srp = true;
+	} else if (!try_harder && dxfr_len < SG_DEF_SECTOR_SZ) {
+		last_idx = ~0UL;
+		xa_for_each_marked(xafp, idx, r_srp, SG_XA_RQ_INACTIVE) {
+			if (!r_srp)
 				continue;
-			if (sg_rstate_chg(rp, SG_RS_INACTIVE, SG_RS_BUSY))
+			++num_inactive;
+			if (dxfr_len < SG_DEF_SECTOR_SZ) { /* always true */
+				last_idx = idx;
 				continue;
-			memset(rp, 0, sizeof(*rp));
-			rp->rq_idx = idx;
-			xa_lock_irqsave(xafp, iflags);
-			__xa_clear_mark(xafp, idx, SG_XA_RQ_INACTIVE);
-			xa_unlock_irqrestore(xafp, iflags);
+			}
+		}
+		/* If dxfr_len is small, use last inactive request */
+		if (last_idx != ~0UL) {
+			idx = last_idx;
+			r_srp = xa_load(xafp, idx);
+			if (!r_srp)
+				goto start_again;
+			if (sg_rq_state_chg(r_srp, SG_RS_INACTIVE, SG_RS_BUSY,
+					    false, __func__))
+				goto start_again; /* gone to another thread */
+			cp = "toward back of srp_arr";
 			found = true;
-			break;
+		}
+	} else {	/* take first inactive request whose buflen is enough */
+		xa_for_each_marked(xafp, idx, r_srp, SG_XA_RQ_INACTIVE) {
+			if (!r_srp)
+				continue;
+			if (r_srp->sgat_h.buflen >= dxfr_len) {
+				if (sg_rq_state_chg
+					(r_srp, SG_RS_INACTIVE, SG_RS_BUSY,
+					 false, __func__))
+					continue;
+				cp = "from front of srp_arr";
+				found = true;
+				break;
+			}
 		}
 	}
-	if (!found) {
-		rp = kzalloc(sizeof(*rp), GFP_KERNEL);
-		if (!rp)
-			return NULL;
+	if (found) {	/* re-used request: clear fields left by prior use */
+		r_srp->in_resid = 0;
+		r_srp->rq_info = 0;
+		r_srp->sense_len = 0;
+		mk_new_srp = false;
+	} else {
+		mk_new_srp = true;
 	}
-	rp->parentfp = sfp;
-	rp->header.duration = jiffies_to_msecs(jiffies);
-	if (!found) {
+	if (mk_new_srp) {
+		bool allow_cmd_q = test_bit(SG_FFD_CMD_Q, sfp->ffd_bm);
 		u32 n_idx;
 		struct xa_limit xal = { .max = 0, .min = 0 };
 
-		atomic_set(&rp->rq_st, SG_RS_BUSY);
+		cp = "new";
+		if (!allow_cmd_q && atomic_read(&sfp->submitted) > 0) {
+			r_srp = ERR_PTR(-EDOM);
+			SG_LOG(6, sfp, "%s: trying 2nd req but cmd_q=false\n",
+			       __func__);
+			goto fini;
+		}
+		r_srp = sg_mk_srp_sgat(sfp, act_empty, dxfr_len);
+		if (IS_ERR(r_srp)) {
+			if (!try_harder && dxfr_len < SG_DEF_SECTOR_SZ &&
+			    num_inactive > 0) {
+				try_harder = true;
+				goto start_again;
+			}
+			goto fini;
+		}
+		atomic_set(&r_srp->rq_st, SG_RS_BUSY);	/* as sg_mk_srp() did */
 		xa_lock_irqsave(xafp, iflags);
 		xal.max = atomic_inc_return(&sfp->req_cnt);
-		res = __xa_alloc(xafp, &n_idx, rp, xal, GFP_KERNEL);
+		res = __xa_alloc(xafp, &n_idx, r_srp, xal, GFP_KERNEL);
 		xa_unlock_irqrestore(xafp, iflags);
 		if (res < 0) {
-			pr_warn("%s: don't expect xa_alloc() to fail, errno=%d\n",
-				__func__,  -res);
-			return NULL;
+			SG_LOG(1, sfp, "%s: xa_alloc() failed, errno=%d\n",
+			       __func__,  -res);
+			sg_remove_sgat(r_srp);
+			kfree(r_srp);
+			r_srp = ERR_PTR(-EPROTOTYPE);
+			goto fini;
 		}
-		rp->rq_idx = n_idx;
-	}
-	return rp;
+		idx = n_idx;
+		r_srp->rq_idx = idx;
+		r_srp->parentfp = sfp;	/* as sg_mk_srp() did */
+		SG_LOG(4, sfp, "%s: mk_new_srp=0x%pK ++\n", __func__, r_srp);
+	}
+	r_srp->frq_bm[0] = cwrp->frq_bm[0];	/* assumes <= 32 req flags */
+	r_srp->sgat_h.dlen = dxfr_len;/* must be <= r_srp->sgat_h.buflen */
+	r_srp->cmd_opcode = 0xff;  /* set invalid opcode (VS), 0x0 is TUR */
+fini:
+	if (IS_ERR(r_srp))
+		SG_LOG(1, sfp, "%s: err=%ld\n", __func__, PTR_ERR(r_srp));
+	if (!IS_ERR(r_srp))
+		SG_LOG(4, sfp, "%s: %s r_srp=0x%pK\n", __func__, cp, r_srp);
+	return r_srp;
 }
 
+/*
+ * Retires a completed sg_request object: its state is set to SG_RS_INACTIVE
+ * and, in sfp->srp_arr, its SG_XA_RQ_INACTIVE mark is set and its
+ * SG_XA_RQ_AWAIT mark is cleared, which makes it available for re-use. The
+ * object keeps its existing index (srp->rq_idx) in that xarray.
+ */
 static void
 sg_deact_request(struct sg_fd *sfp, struct sg_request *srp)
 {
@@ -2678,34 +3051,43 @@ sg_deact_request(struct sg_fd *sfp, struct sg_request *srp)
 
 	if (WARN_ON(!sfp || !srp))
 		return;
+	atomic_set(&srp->rq_st, SG_RS_INACTIVE);
 	xa_lock_irqsave(&sfp->srp_arr, iflags);
 	__xa_set_mark(&sfp->srp_arr, srp->rq_idx, SG_XA_RQ_INACTIVE);
+	__xa_clear_mark(&sfp->srp_arr, srp->rq_idx, SG_XA_RQ_AWAIT);
 	xa_unlock_irqrestore(&sfp->srp_arr, iflags);
-	atomic_set(&srp->rq_st, SG_RS_INACTIVE);
 }
 
+/* Returns pointer to sg_fd object or negated errno twisted by ERR_PTR */
 static struct sg_fd *
 sg_add_sfp(struct sg_device *sdp)
 {
+	bool reduced = false;
 	int rbuf_len, res;
 	u32 idx;
+	long err;
 	unsigned long iflags;
 	struct sg_fd *sfp;
+	struct sg_request *srp = NULL;
+	struct xarray *xadp = &sdp->sfp_arr;
+	struct xarray *xafp;
 	struct xa_limit xal;
 
 	sfp = kzalloc(sizeof(*sfp), GFP_ATOMIC | __GFP_NOWARN);
 	if (!sfp)
 		return ERR_PTR(-ENOMEM);
-
 	init_waitqueue_head(&sfp->read_wait);
 	xa_init_flags(&sfp->srp_arr, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
+	xafp = &sfp->srp_arr;
 	kref_init(&sfp->f_ref);
 	mutex_init(&sfp->f_mutex);
 	sfp->timeout = SG_DEFAULT_TIMEOUT;
 	sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER;
-	sfp->force_packid = SG_DEF_FORCE_PACK_ID;
-	sfp->cmd_q = SG_DEF_COMMAND_Q;
-	sfp->keep_orphan = SG_DEF_KEEP_ORPHAN;
+	/* other bits in sfp->ffd_bm[1] cleared by kzalloc() above */
+	__assign_bit(SG_FFD_FORCE_PACKID, sfp->ffd_bm, SG_DEF_FORCE_PACK_ID);
+	__assign_bit(SG_FFD_CMD_Q, sfp->ffd_bm, SG_DEF_COMMAND_Q);
+	__assign_bit(SG_FFD_KEEP_ORPHAN, sfp->ffd_bm, SG_DEF_KEEP_ORPHAN);
+	__assign_bit(SG_FFD_Q_AT_TAIL, sfp->ffd_bm, SG_DEFAULT_Q_AT);
 	/*
 	 * SG_SCATTER_SZ initializes scatter_elem_sz but different value may
 	 * be given as driver/module parameter (e.g. 'scatter_elem_sz=8192').
@@ -2719,28 +3101,64 @@ sg_add_sfp(struct sg_device *sdp)
 	atomic_set(&sfp->waiting, 0);
 	atomic_set(&sfp->req_cnt, 0);
 
-	if (SG_IS_DETACHING(sdp)) {
+	if (unlikely(SG_IS_DETACHING(sdp))) {
+		SG_LOG(1, sfp, "%s: detaching\n", __func__);
 		kfree(sfp);
 		return ERR_PTR(-ENODEV);
 	}
-	SG_LOG(3, sfp, "%s: sfp=0x%pK\n", __func__, sfp);
 	if (unlikely(sg_big_buff != def_reserved_size))
 		sg_big_buff = def_reserved_size;
 
 	rbuf_len = min_t(int, sg_big_buff, sdp->max_sgat_sz);
-	if (rbuf_len > 0)
-		sg_build_reserve(sfp, rbuf_len);
-
-	xa_lock_irqsave(&sdp->sfp_arr, iflags);
+	if (rbuf_len > 0) {
+		struct xa_limit xalrq = { .max = 0, .min = 0 };
+
+		srp = sg_build_reserve(sfp, rbuf_len);
+		if (IS_ERR(srp)) {
+			err = PTR_ERR(srp);
+			SG_LOG(1, sfp, "%s: build reserve err=%ld\n", __func__,
+			       -err);
+			kfree(sfp);
+			return ERR_PTR(err);
+		}
+		if (srp->sgat_h.buflen < rbuf_len) {
+			reduced = true;
+			SG_LOG(2, sfp,
+			       "%s: reserve reduced from %d to buflen=%d\n",
+			       __func__, rbuf_len, srp->sgat_h.buflen);
+		}
+		xa_lock_irqsave(xafp, iflags);
+		xalrq.max = atomic_inc_return(&sfp->req_cnt);
+		res = __xa_alloc(xafp, &idx, srp, xalrq, GFP_ATOMIC);
+		xa_unlock_irqrestore(xafp, iflags);
+		if (res < 0) {
+			SG_LOG(1, sfp, "%s: xa_alloc(srp) bad, errno=%d\n",
+			       __func__,  -res);
+			sg_remove_sgat(srp);
+			kfree(srp);
+			kfree(sfp);
+			return ERR_PTR(-EPROTOTYPE);
+		}
+		srp->rq_idx = idx;
+		srp->parentfp = sfp;
+		sg_rq_state_chg(srp, 0, SG_RS_INACTIVE, true, __func__);
+	}
+	if (!reduced) {
+		SG_LOG(4, sfp, "%s: built reserve buflen=%d\n", __func__,
+		       rbuf_len);
+	}
+	xa_lock_irqsave(xadp, iflags);
 	xal.min = 0;
 	xal.max = atomic_read(&sdp->open_cnt);
-	res = __xa_alloc(&sdp->sfp_arr, &idx, sfp, xal, GFP_KERNEL);
-	xa_unlock_irqrestore(&sdp->sfp_arr, iflags);
+	res = __xa_alloc(xadp, &idx, sfp, xal, GFP_KERNEL);
+	xa_unlock_irqrestore(xadp, iflags);
 	if (res < 0) {
 		pr_warn("%s: xa_alloc(sdp) bad, o_count=%d, errno=%d\n",
 			__func__, xal.max, -res);
-		if (rbuf_len > 0)
-			sg_remove_sgat(sfp, &sfp->reserve);
+		if (srp) {
+			sg_remove_sgat(srp);
+			kfree(srp);
+		}
 		kfree(sfp);
 		return ERR_PTR(res);
 	}
@@ -2771,12 +3189,14 @@ sg_remove_sfp_usercontext(struct work_struct *work)
 	struct sg_request *srp;
 	struct sg_request *e_srp;
 	struct xarray *xafp = &sfp->srp_arr;
+	struct xarray *xadp;
 
 	if (!sfp) {
 		pr_warn("sg: %s: sfp is NULL\n", __func__);
 		return;
 	}
 	sdp = sfp->parentdp;
+	xadp = &sdp->sfp_arr;
 
 	/* Cleanup any responses which were never read(). */
 	xa_for_each(xafp, idx, srp) {
@@ -2784,24 +3204,20 @@ sg_remove_sfp_usercontext(struct work_struct *work)
 			continue;
 		if (!xa_get_mark(xafp, srp->rq_idx, SG_XA_RQ_INACTIVE))
 			sg_finish_scsi_blk_rq(srp);
+		sg_remove_sgat(srp);
 		xa_lock_irqsave(xafp, iflags);
 		e_srp = __xa_erase(xafp, srp->rq_idx);
 		xa_unlock_irqrestore(xafp, iflags);
 		if (srp != e_srp)
 			SG_LOG(1, sfp, "%s: xa_erase() return unexpected\n",
 			       __func__);
+		SG_LOG(6, sfp, "%s: kfree: srp=%pK --\n", __func__, srp);
 		kfree(srp);
 	}
 	xa_destroy(xafp);
-	if (sfp->reserve.buflen > 0) {
-		SG_LOG(6, sfp, "%s:    buflen=%d, num_sgat=%d\n", __func__,
-		       (int)sfp->reserve.buflen, (int)sfp->reserve.num_sgat);
-		sg_remove_sgat(sfp, &sfp->reserve);
-	}
-
-	xa_lock_irqsave(&sdp->sfp_arr, iflags);
-	e_sfp = __xa_erase(&sdp->sfp_arr, sfp->idx);
-	xa_unlock_irqrestore(&sdp->sfp_arr, iflags);
+	xa_lock_irqsave(xadp, iflags);
+	e_sfp = __xa_erase(xadp, sfp->idx);
+	xa_unlock_irqrestore(xadp, iflags);
 	if (unlikely(sfp != e_sfp))
 		SG_LOG(1, sfp, "%s: xa_erase() return unexpected\n",
 		       __func__);
@@ -3051,6 +3467,7 @@ sg_proc_seq_show_devhdr(struct seq_file *s, void *v)
 struct sg_proc_deviter {
 	loff_t	index;
 	size_t	max;
+	int fd_index;
 };
 
 static void *
@@ -3134,11 +3551,10 @@ sg_proc_seq_show_devstrs(struct seq_file *s, void *v)
 static void
 sg_proc_debug_helper(struct seq_file *s, struct sg_device *sdp)
 {
-	int k, new_interface, blen, usg;
+	int k;
 	unsigned long idx, idx2;
 	struct sg_request *srp;
 	struct sg_fd *fp;
-	const struct sg_io_hdr *hp;
 	const char * cp;
 	unsigned int ms;
 
@@ -3149,51 +3565,53 @@ sg_proc_debug_helper(struct seq_file *s, struct sg_device *sdp)
 		k++;
 		seq_printf(s, "   FD(%d): timeout=%dms buflen=%d (res)sgat=%d low_dma=%d idx=%lu\n",
 			   k, jiffies_to_msecs(fp->timeout),
-			   fp->reserve.buflen, (int)fp->reserve.num_sgat,
+			   fp->rsv_srp->sgat_h.buflen,
+			   (int)fp->rsv_srp->sgat_h.num_sgat,
 			   (int)sdp->device->host->unchecked_isa_dma, idx);
 		seq_printf(s, "   cmd_q=%d f_packid=%d k_orphan=%d closed=0\n",
-			   (int) fp->cmd_q, (int) fp->force_packid,
-			   (int) fp->keep_orphan);
+			   (int)test_bit(SG_FFD_CMD_Q, fp->ffd_bm),
+			   (int)test_bit(SG_FFD_FORCE_PACKID, fp->ffd_bm),
+			   (int)test_bit(SG_FFD_KEEP_ORPHAN, fp->ffd_bm));
 		seq_printf(s, "   submitted=%d waiting=%d\n",
 			   atomic_read(&fp->submitted),
 			   atomic_read(&fp->waiting));
 		xa_for_each(&fp->srp_arr, idx2, srp) {
+			const struct sg_slice_hdr3 *sh3p = &srp->s_hdr3;
+			bool is_v3 = (sh3p->interface_id != '\0');
+			enum sg_rq_state rq_st = atomic_read(&srp->rq_st);
+
 			if (!srp)
 				continue;
-			hp = &srp->header;
-			new_interface = (hp->interface_id == '\0') ? 0 : 1;
-			if (srp->res_used) {
-				if (new_interface &&
-				    (SG_FLAG_MMAP_IO & hp->flags))
+			if (srp->parentfp->rsv_srp == srp) {
+				if (is_v3 && (SG_FLAG_MMAP_IO & sh3p->flags))
 					cp = "     mmap>> ";
 				else
 					cp = "     rb>> ";
 			} else {
-				if (SG_INFO_DIRECT_IO_MASK & hp->info)
+				if (SG_INFO_DIRECT_IO_MASK & srp->rq_info)
 					cp = "     dio>> ";
 				else
 					cp = "     ";
 			}
 			seq_puts(s, cp);
-			blen = srp->data.buflen;
-			usg = srp->data.num_sgat;
-			seq_puts(s, srp->done ?
-				 ((1 == srp->done) ?  "rcv:" : "fin:")
-				  : "act:");
-			seq_printf(s, " id=%d blen=%d",
-				   srp->header.pack_id, blen);
-			if (srp->done)
-				seq_printf(s, " dur=%d", hp->duration);
-			else {
-				ms = jiffies_to_msecs(jiffies);
-				seq_printf(s, " t_o/elap=%d/%d",
-					(new_interface ? hp->timeout :
-						  jiffies_to_msecs(fp->timeout)),
-					(ms > hp->duration ? ms - hp->duration : 0));
+			seq_puts(s, sg_rq_st_str(rq_st, false));
+			seq_printf(s, ": id=%d len/blen=%d/%d",
+				   srp->pack_id, srp->sgat_h.dlen,
+				   srp->sgat_h.buflen);
+			if (rq_st == SG_RS_AWAIT_RCV ||
+			    rq_st == SG_RS_RCV_DONE) {
+				seq_printf(s, " dur=%d", srp->duration);
+				goto fin_line;
 			}
-			seq_printf(s, "ms sgat=%d op=0x%02x dummy: %s\n", usg,
-				   (int)srp->data.cmd_opcode,
-				   sg_rq_st_str(SG_RS_INACTIVE, false));
+			ms = jiffies_to_msecs(jiffies);
+			seq_printf(s, " t_o/elap=%d/%d",
+				   (is_v3 ? sh3p->timeout :
+					    jiffies_to_msecs(fp->timeout)),
+				   (ms > srp->duration ?  ms - srp->duration :
+							  0));
+fin_line:
+			seq_printf(s, "ms sgat=%d op=0x%02x\n",
+				   srp->sgat_h.num_sgat, (int)srp->cmd_opcode);
 		}
 		if (xa_empty(&fp->srp_arr))
 			seq_puts(s, "     No requests active\n");
-- 
2.25.1


  parent reply	other threads:[~2021-04-08  1:46 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-08  1:44 [PATCH v17 00/45] sg: add v4 interface Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 01/45] sg: move functions around Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 02/45] sg: remove typedefs, type+formatting cleanup Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 03/45] sg: sg_log and is_enabled Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 04/45] sg: rework sg_poll(), minor changes Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 05/45] sg: bitops in sg_device Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 06/45] sg: make open count an atomic Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 07/45] sg: move header to uapi section Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 08/45] sg: speed sg_poll and sg_get_num_waiting Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 09/45] sg: sg_allow_if_err_recovery and renames Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 10/45] sg: improve naming Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 11/45] sg: change rwlock to spinlock Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 12/45] sg: ioctl handling Douglas Gilbert
2021-04-08  1:44 ` [PATCH v17 13/45] sg: split sg_read Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 14/45] sg: sg_common_write add structure for arguments Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 15/45] sg: rework sg_vma_fault Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 16/45] sg: rework sg_mmap Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 17/45] sg: replace sg_allow_access Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 18/45] sg: rework scatter gather handling Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 19/45] sg: introduce request state machine Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 20/45] sg: sg_find_srp_by_id Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 21/45] sg: sg_fill_request_element Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 22/45] sg: printk change %p to %pK Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 23/45] sg: xarray for fds in device Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 24/45] sg: xarray for reqs in fd Douglas Gilbert
2021-04-08  8:05   ` Hannes Reinecke
2021-04-08  1:45 ` Douglas Gilbert [this message]
2021-04-08  1:45 ` [PATCH v17 26/45] sg: sense buffer rework Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 27/45] sg: add sg v4 interface support Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 28/45] sg: rework debug info Douglas Gilbert
2021-04-08  8:06   ` Hannes Reinecke
2021-04-08  1:45 ` [PATCH v17 29/45] sg: add 8 byte SCSI LUN to sg_scsi_id Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 30/45] sg: expand sg_comm_wr_t Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 31/45] sg: add sg_iosubmit_v3 and sg_ioreceive_v3 ioctls Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 32/45] sg: add some __must_hold macros Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 33/45] sg: move procfs objects to avoid forward decls Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 34/45] sg: protect multiple receivers Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 35/45] sg: first debugfs support Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 36/45] sg: rework mmap support Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 37/45] sg: defang allow_dio Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 38/45] sg: warn v3 write system call users Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 39/45] sg: add mmap_sz tracking Douglas Gilbert
2021-04-08  8:07   ` Hannes Reinecke
2021-04-08  1:45 ` [PATCH v17 40/45] sg: remove rcv_done request state Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 41/45] sg: track lowest inactive and await indexes Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 42/45] sg: remove unit attention check for device changed Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 43/45] sg: no_dxfer: move to/from kernel buffers Douglas Gilbert
2021-04-08  8:07   ` Hannes Reinecke
2021-04-08  1:45 ` [PATCH v17 44/45] sg: add blk_poll support Douglas Gilbert
2021-04-08  8:14   ` Hannes Reinecke
2021-04-08 16:28     ` Douglas Gilbert
2021-04-09  6:00       ` Douglas Gilbert
2021-04-08  1:45 ` [PATCH v17 45/45] sg: bump version to 4.0.12 Douglas Gilbert

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210408014531.248890-26-dgilbert@interlog.com \
    --to=dgilbert@interlog.com \
    --cc=hare@suse.de \
    --cc=jejb@linux.vnet.ibm.com \
    --cc=linux-scsi@vger.kernel.org \
    --cc=lkp@intel.com \
    --cc=martin.petersen@oracle.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.