linux-scsi.vger.kernel.org archive mirror
* [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling
@ 2020-09-10 15:50 Bodo Stroesser
  2020-09-10 15:50 ` [PATCH 1/3] scsi: target: tcmu: join tcmu_cmd_get_data_length and tcmu_cmd_get_block_cnt Bodo Stroesser
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Bodo Stroesser @ 2020-09-10 15:50 UTC (permalink / raw)
  To: Martin K. Petersen, Mike Christie, linux-scsi, target-devel
  Cc: Bodo Stroesser

This small series is made on top of Martin's 5.10/scsi-queue branch.

The patches simplify some code, split off new helper functions, and
implement a cleaner code sequence to avoid doing work twice.
The 3rd patch speeds up buffer preparation for SCSI commands with
long read data.
The series is also the base for the cleaned-up v3 of my patch
  scsi: target: tcmu: add compat mode for 32bit userspace on 64bit kernel

Bodo Stroesser (3):
  scsi: target: tcmu: join tcmu_cmd_get_data_length and tcmu_cmd_get_block_cnt
  scsi: target: tcmu: optimize queue_cmd_ring
  scsi: target: tcmu: optimize scatter_data_area

 drivers/target/target_core_user.c | 340 +++++++++++++++++++-------------------
 1 file changed, 170 insertions(+), 170 deletions(-)

-- 
2.12.3


* [PATCH 1/3] scsi: target: tcmu: join tcmu_cmd_get_data_length and tcmu_cmd_get_block_cnt
  2020-09-10 15:50 [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Bodo Stroesser
@ 2020-09-10 15:50 ` Bodo Stroesser
  2020-09-10 15:50 ` [PATCH 2/3] scsi: target: tcmu: optimize queue_cmd_ring Bodo Stroesser
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Bodo Stroesser @ 2020-09-10 15:50 UTC (permalink / raw)
  To: Martin K. Petersen, Mike Christie, linux-scsi, target-devel
  Cc: Bodo Stroesser

Simplify the code by joining tcmu_cmd_get_data_length and
tcmu_cmd_get_block_cnt into the new function tcmu_cmd_set_block_cnts.
The new function sets tcmu_cmd->dbi_cnt and also the new field
tcmu_cmd->dbi_bidi_cnt, which is needed for further enhancements in
the following patches. Also simplify some code by using
tcmu_cmd->dbi(_bidi)_cnt instead of re-calculating the counts from
the data length.

Please note: the calculation of the number of dbis needed for bidi
was wrong. It was based on the length of the first bidi sg only. I
changed it to correctly sum up the entire length of all bidi sgs.
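
As a worked illustration of the old miscalculation (the sg lengths
are made up, and the usual 4 KiB DATA_BLOCK_SIZE is assumed): for a
bidi list of two sgs of 4096 bytes each, the old code reserved

  round_up(t_bidi_data_sg->length, 4096) / 4096 = 4096 / 4096 = 1 block

based on the first sg's length only, while the correct count for the
summed-up length of all bidi sgs is

  DIV_ROUND_UP(4096 + 4096, 4096) = 2 blocks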

Signed-off-by: Bodo Stroesser <bstroesser@ts.fujitsu.com>
---
 drivers/target/target_core_user.c | 53 +++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 9b7592350502..fa0c4a42e435 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -177,6 +177,7 @@ struct tcmu_cmd {
 	/* Can't use se_cmd when cleaning up expired cmds, because if
 	   cmd has been completed then accessing se_cmd is off limits */
 	uint32_t dbi_cnt;
+	uint32_t dbi_bidi_cnt;
 	uint32_t dbi_cur;
 	uint32_t *dbi;
 
@@ -558,25 +559,20 @@ static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd)
 	kmem_cache_free(tcmu_cmd_cache, tcmu_cmd);
 }
 
-static inline size_t tcmu_cmd_get_data_length(struct tcmu_cmd *tcmu_cmd)
+static inline void tcmu_cmd_set_block_cnts(struct tcmu_cmd *cmd)
 {
-	struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
-	size_t data_length = round_up(se_cmd->data_length, DATA_BLOCK_SIZE);
+	int i, len;
+	struct se_cmd *se_cmd = cmd->se_cmd;
+
+	cmd->dbi_cnt = DIV_ROUND_UP(se_cmd->data_length, DATA_BLOCK_SIZE);
 
 	if (se_cmd->se_cmd_flags & SCF_BIDI) {
 		BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents));
-		data_length += round_up(se_cmd->t_bidi_data_sg->length,
-				DATA_BLOCK_SIZE);
+		for (i = 0, len = 0; i < se_cmd->t_bidi_data_nents; i++)
+			len += se_cmd->t_bidi_data_sg[i].length;
+		cmd->dbi_bidi_cnt = DIV_ROUND_UP(len, DATA_BLOCK_SIZE);
+		cmd->dbi_cnt += cmd->dbi_bidi_cnt;
 	}
-
-	return data_length;
-}
-
-static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd)
-{
-	size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
-
-	return data_length / DATA_BLOCK_SIZE;
 }
 
 static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
@@ -593,8 +589,7 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 	tcmu_cmd->se_cmd = se_cmd;
 	tcmu_cmd->tcmu_dev = udev;
 
-	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
-	tcmu_cmd->dbi_cnt = tcmu_cmd_get_block_cnt(tcmu_cmd);
+	tcmu_cmd_set_block_cnts(tcmu_cmd);
 	tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t),
 				GFP_NOIO);
 	if (!tcmu_cmd->dbi) {
@@ -767,13 +762,12 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		data_sg = se_cmd->t_data_sg;
 		data_nents = se_cmd->t_data_nents;
 	} else {
-
 		/*
 		 * For bidi case, the first count blocks are for Data-Out
 		 * buffer blocks, and before gathering the Data-In buffer
-		 * the Data-Out buffer blocks should be discarded.
+		 * the Data-Out buffer blocks should be skipped.
 		 */
-		count = DIV_ROUND_UP(se_cmd->data_length, DATA_BLOCK_SIZE);
+		count = cmd->dbi_cnt - cmd->dbi_bidi_cnt;
 
 		data_sg = se_cmd->t_bidi_data_sg;
 		data_nents = se_cmd->t_bidi_data_nents;
@@ -827,11 +821,9 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
  * Called with ring lock held.
  */
 static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
-		size_t cmd_size, size_t data_needed)
+				size_t cmd_size)
 {
 	struct tcmu_mailbox *mb = udev->mb_addr;
-	uint32_t blocks_needed = (data_needed + DATA_BLOCK_SIZE - 1)
-				/ DATA_BLOCK_SIZE;
 	size_t space, cmd_needed;
 	u32 cmd_head;
 
@@ -855,23 +847,23 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		return false;
 	}
 
-	if (!data_needed)
+	if (!cmd || !cmd->dbi_cnt)
 		return true;
 
 	/* try to check and get the data blocks as needed */
 	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
-	if ((space * DATA_BLOCK_SIZE) < data_needed) {
+	if (space < cmd->dbi_cnt) {
 		unsigned long blocks_left =
 				(udev->max_blocks - udev->dbi_thresh) + space;
 
-		if (blocks_left < blocks_needed) {
+		if (blocks_left < cmd->dbi_cnt) {
 			pr_debug("no data space: only %lu available, but ask for %zu\n",
 					blocks_left * DATA_BLOCK_SIZE,
-					data_needed);
+					cmd->dbi_cnt * DATA_BLOCK_SIZE);
 			return false;
 		}
 
-		udev->dbi_thresh += blocks_needed;
+		udev->dbi_thresh += cmd->dbi_cnt;
 		if (udev->dbi_thresh > udev->max_blocks)
 			udev->dbi_thresh = udev->max_blocks;
 	}
@@ -990,7 +982,8 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	uint32_t cmd_head;
 	uint64_t cdb_off;
 	bool copy_to_data_area;
-	size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd);
+	/* size of data buffer needed */
+	size_t data_length = (size_t)tcmu_cmd->dbi_cnt * DATA_BLOCK_SIZE;
 
 	*scsi_err = TCM_NO_SENSE;
 
@@ -1031,7 +1024,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 		return -1;
 	}
 
-	if (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
+	if (!is_ring_space_avail(udev, tcmu_cmd, command_size)) {
 		/*
 		 * Don't leave commands partially setup because the unmap
 		 * thread might need the blocks to make forward progress.
@@ -1145,7 +1138,7 @@ queue_tmr_ring(struct tcmu_dev *udev, struct tcmu_tmr *tmr)
 	cmd_size = round_up(sizeof(*entry) + id_list_sz, TCMU_OP_ALIGN_SIZE);
 
 	if (!list_empty(&udev->tmr_queue) ||
-	    !is_ring_space_avail(udev, NULL, cmd_size, 0)) {
+	    !is_ring_space_avail(udev, NULL, cmd_size)) {
 		list_add_tail(&tmr->queue_entry, &udev->tmr_queue);
 		pr_debug("adding tmr %p on dev %s to TMR ring space wait queue\n",
 			 tmr, udev->name);
-- 
2.12.3


* [PATCH 2/3] scsi: target: tcmu: optimize queue_cmd_ring
  2020-09-10 15:50 [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Bodo Stroesser
  2020-09-10 15:50 ` [PATCH 1/3] scsi: target: tcmu: join tcmu_cmd_get_data_length and tcmu_cmd_get_block_cnt Bodo Stroesser
@ 2020-09-10 15:50 ` Bodo Stroesser
  2020-09-10 15:50 ` [PATCH 3/3] scsi: target: tcmu: optimize scatter_data_area Bodo Stroesser
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Bodo Stroesser @ 2020-09-10 15:50 UTC (permalink / raw)
  To: Martin K. Petersen, Mike Christie, linux-scsi, target-devel
  Cc: Bodo Stroesser

queue_cmd_ring() needs to check whether there is enough space in the
cmd ring and the data area for the cmd to be queued.

Currently the sequence is:
 1) calculate the size the cmd will occupy on the ring, based on an
    estimate of the number of iovs needed
 2) check whether there is enough space on the ring, based on the
    size from 1)
 3) allocate buffers on the data area
 4) calculate the number of iovs the command really needs while
    copying incoming data (if any) to the data area
 5) re-calculate the real size of the cmd on the ring, based on the
    real number of iovs
 6) set up possible padding and the cmd on the ring

In 1) we must not underestimate the cmd size, so we use the maximum
possible number of iovs for the given IO data size. The resulting
overestimation can be very high, so this sequence is not ideal. The
real number of iovs can be calculated at the earliest after the data
buffers have been allocated. Therefore I reworked the code to
implement the following sequence:
 A) allocate buffers on the data area and calculate the number of
    iovs needed while doing so
 B) calculate the real size of the cmd on the ring, based on the
    number of iovs
 C) check whether there is enough space on the ring
 D) set up possible padding and the cmd on the ring

The new sequence required splitting off the new function
tcmu_alloc_data_space from is_ring_space_avail. Using this function,
queue_cmd_ring was changed according to the new sequence. Further
changes were necessary in the routines called by
tcmu_alloc_data_space to allow calculating and returning the iov
count. The counting of iovs in scatter_data_area is removed.
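
To give a rough idea of the overestimation in step 1) (sizes assumed
for illustration: 4 KiB DATA_BLOCK_SIZE, 16 byte struct iovec): a
1 MiB read needs 256 data blocks, so the old code had to size the
ring entry for 256 iovs, i.e. 4096 bytes of iovecs, even though a
fully contiguous block allocation is described by a single 16 byte
iovec. With the new sequence the ring entry is sized for the iov
count that step A) actually produced.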

Signed-off-by: Bodo Stroesser <bstroesser@ts.fujitsu.com>
---
 drivers/target/target_core_user.c | 184 +++++++++++++++++++++-----------------
 1 file changed, 102 insertions(+), 82 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index fa0c4a42e435..5587bd4d1060 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -493,15 +493,16 @@ static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len)
 		clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap);
 }
 
-static inline bool tcmu_get_empty_block(struct tcmu_dev *udev,
-					struct tcmu_cmd *tcmu_cmd)
+static inline int tcmu_get_empty_block(struct tcmu_dev *udev,
+				       struct tcmu_cmd *tcmu_cmd,
+				       int prev_dbi, int *iov_cnt)
 {
 	struct page *page;
 	int ret, dbi;
 
 	dbi = find_first_zero_bit(udev->data_bitmap, udev->dbi_thresh);
 	if (dbi == udev->dbi_thresh)
-		return false;
+		return -1;
 
 	page = radix_tree_lookup(&udev->data_blocks, dbi);
 	if (!page) {
@@ -525,24 +526,30 @@ static inline bool tcmu_get_empty_block(struct tcmu_dev *udev,
 	set_bit(dbi, udev->data_bitmap);
 	tcmu_cmd_set_dbi(tcmu_cmd, dbi);
 
-	return true;
+	if (dbi != prev_dbi + 1)
+		*iov_cnt += 1;
+
+	return dbi;
 err_insert:
 	__free_page(page);
 err_alloc:
 	atomic_dec(&global_db_count);
-	return false;
+	return -1;
 }
 
-static bool tcmu_get_empty_blocks(struct tcmu_dev *udev,
-				  struct tcmu_cmd *tcmu_cmd)
+static int tcmu_get_empty_blocks(struct tcmu_dev *udev,
+				 struct tcmu_cmd *tcmu_cmd, int dbi_cnt)
 {
-	int i;
+	/* start value of dbi + 1 must not be a valid dbi */
+	int dbi = -2;
+	int i, iov_cnt = 0;
 
-	for (i = tcmu_cmd->dbi_cur; i < tcmu_cmd->dbi_cnt; i++) {
-		if (!tcmu_get_empty_block(udev, tcmu_cmd))
-			return false;
+	for (i = 0; i < dbi_cnt; i++) {
+		dbi = tcmu_get_empty_block(udev, tcmu_cmd, dbi, &iov_cnt);
+		if (dbi < 0)
+			return -1;
 	}
-	return true;
+	return iov_cnt;
 }
 
 static inline struct page *
@@ -639,13 +646,12 @@ static inline size_t head_to_end(size_t head, size_t size)
 	return size - head;
 }
 
-static inline void new_iov(struct iovec **iov, int *iov_cnt)
+static inline void new_iov(struct iovec **iov, bool first)
 {
 	struct iovec *iovec;
 
-	if (*iov_cnt != 0)
+	if (!first)
 		(*iov)++;
-	(*iov_cnt)++;
 
 	iovec = *iov;
 	memset(iovec, 0, sizeof(struct iovec));
@@ -668,8 +674,7 @@ static inline size_t iov_tail(struct iovec *iov)
 
 static void scatter_data_area(struct tcmu_dev *udev,
 	struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg,
-	unsigned int data_nents, struct iovec **iov,
-	int *iov_cnt, bool copy_data)
+	unsigned int data_nents, struct iovec **iov, bool copy_data)
 {
 	int i, dbi;
 	int block_remaining = 0;
@@ -677,6 +682,7 @@ static void scatter_data_area(struct tcmu_dev *udev,
 	size_t copy_bytes, to_offset, offset;
 	struct scatterlist *sg;
 	struct page *page;
+	bool first = true;
 
 	for_each_sg(data_sg, sg, data_nents, i) {
 		int sg_remaining = sg->length;
@@ -707,8 +713,7 @@ static void scatter_data_area(struct tcmu_dev *udev,
 			 */
 			copy_bytes = min_t(size_t, sg_remaining,
 					block_remaining);
-			if (*iov_cnt != 0 &&
-			    to_offset == iov_tail(*iov)) {
+			if (!first && to_offset == iov_tail(*iov)) {
 				/*
 				 * Will append to the current iovec, because
 				 * the current block page is next to the
@@ -721,9 +726,10 @@ static void scatter_data_area(struct tcmu_dev *udev,
 				 * first time here or the current block page
 				 * is not next to the previous one.
 				 */
-				new_iov(iov, iov_cnt);
+				new_iov(iov, first);
 				(*iov)->iov_base = (void __user *)to_offset;
 				(*iov)->iov_len = copy_bytes;
+				first = false;
 			}
 
 			if (copy_data) {
@@ -815,13 +821,11 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
 }
 
 /*
- * We can't queue a command until we have space available on the cmd ring *and*
- * space available on the data area.
+ * We can't queue a command until we have space available on the cmd ring.
  *
  * Called with ring lock held.
  */
-static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
-				size_t cmd_size)
+static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size)
 {
 	struct tcmu_mailbox *mb = udev->mb_addr;
 	size_t space, cmd_needed;
@@ -846,9 +850,22 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		       udev->cmdr_last_cleaned, udev->cmdr_size);
 		return false;
 	}
+	return true;
+}
+
+/*
+ * We have to allocate data buffers before we can queue a command.
+ * Returns -1 on error (not enough space) or number of needed iovs on success
+ *
+ * Called with ring lock held.
+ */
+static int tcmu_alloc_data_space(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+				  int *iov_bidi_cnt)
+{
+	int space, iov_cnt = 0, ret = 0;
 
-	if (!cmd || !cmd->dbi_cnt)
-		return true;
+	if (!cmd->dbi_cnt)
+		goto wr_iov_cnts;
 
 	/* try to check and get the data blocks as needed */
 	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
@@ -857,10 +874,10 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 				(udev->max_blocks - udev->dbi_thresh) + space;
 
 		if (blocks_left < cmd->dbi_cnt) {
-			pr_debug("no data space: only %lu available, but ask for %zu\n",
+			pr_debug("no data space: only %lu available, but ask for %lu\n",
 					blocks_left * DATA_BLOCK_SIZE,
 					cmd->dbi_cnt * DATA_BLOCK_SIZE);
-			return false;
+			return -1;
 		}
 
 		udev->dbi_thresh += cmd->dbi_cnt;
@@ -868,7 +885,19 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 			udev->dbi_thresh = udev->max_blocks;
 	}
 
-	return tcmu_get_empty_blocks(udev, cmd);
+	iov_cnt = tcmu_get_empty_blocks(udev, cmd,
+					cmd->dbi_cnt - cmd->dbi_bidi_cnt);
+	if (iov_cnt < 0)
+		return -1;
+
+	if (cmd->dbi_bidi_cnt) {
+		ret = tcmu_get_empty_blocks(udev, cmd, cmd->dbi_bidi_cnt);
+		if (ret < 0)
+			return -1;
+	}
+wr_iov_cnts:
+	*iov_bidi_cnt = ret;
+	return iov_cnt + ret;
 }
 
 static inline size_t tcmu_cmd_get_base_cmd_size(size_t iov_cnt)
@@ -978,7 +1007,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	struct tcmu_mailbox *mb = udev->mb_addr;
 	struct tcmu_cmd_entry *entry;
 	struct iovec *iov;
-	int iov_cnt, cmd_id;
+	int iov_cnt, iov_bidi_cnt, cmd_id;
 	uint32_t cmd_head;
 	uint64_t cdb_off;
 	bool copy_to_data_area;
@@ -997,42 +1026,54 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 		return -1;
 	}
 
+	if (!list_empty(&udev->qfull_queue))
+		goto queue;
+
+	if (data_length > udev->data_size) {
+		pr_warn("TCMU: Request of size %zu is too big for %zu data area\n",
+			data_length, udev->data_size);
+		*scsi_err = TCM_INVALID_CDB_FIELD;
+		return -1;
+	}
+
+	iov_cnt = tcmu_alloc_data_space(udev, tcmu_cmd, &iov_bidi_cnt);
+	if (iov_cnt < 0)
+		goto free_and_queue;
+
 	/*
 	 * Must be a certain minimum size for response sense info, but
 	 * also may be larger if the iov array is large.
-	 *
-	 * We prepare as many iovs as possbile for potential uses here,
-	 * because it's expensive to tell how many regions are freed in
-	 * the bitmap & global data pool, as the size calculated here
-	 * will only be used to do the checks.
-	 *
-	 * The size will be recalculated later as actually needed to save
-	 * cmd area memories.
 	 */
-	base_command_size = tcmu_cmd_get_base_cmd_size(tcmu_cmd->dbi_cnt);
+	base_command_size = tcmu_cmd_get_base_cmd_size(iov_cnt);
 	command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size);
 
-	if (!list_empty(&udev->qfull_queue))
-		goto queue;
-
-	if ((command_size > (udev->cmdr_size / 2)) ||
-	    data_length > udev->data_size) {
-		pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu "
-			"cmd ring/data area\n", command_size, data_length,
-			udev->cmdr_size, udev->data_size);
+	if (command_size > (udev->cmdr_size / 2)) {
+		pr_warn("TCMU: Request of size %zu is too big for %u cmd ring\n",
+			command_size, udev->cmdr_size);
+		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
 		*scsi_err = TCM_INVALID_CDB_FIELD;
 		return -1;
 	}
 
-	if (!is_ring_space_avail(udev, tcmu_cmd, command_size)) {
+	if (!is_ring_space_avail(udev, command_size))
 		/*
 		 * Don't leave commands partially setup because the unmap
 		 * thread might need the blocks to make forward progress.
 		 */
-		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
-		tcmu_cmd_reset_dbi_cur(tcmu_cmd);
-		goto queue;
+		goto free_and_queue;
+
+	cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 1, USHRT_MAX, GFP_NOWAIT);
+	if (cmd_id < 0) {
+		pr_err("tcmu: Could not allocate cmd id.\n");
+
+		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
+		*scsi_err = TCM_OUT_OF_RESOURCES;
+		return -1;
 	}
+	tcmu_cmd->cmd_id = cmd_id;
+
+	pr_debug("allocated cmd id %u for cmd %p dev %s\n", tcmu_cmd->cmd_id,
+		 tcmu_cmd, udev->name);
 
 	cmd_head = ring_insert_padding(udev, command_size);
 
@@ -1040,52 +1081,27 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	memset(entry, 0, command_size);
 	tcmu_hdr_set_op(&entry->hdr.len_op, TCMU_OP_CMD);
 
-	/* Handle allocating space from the data area */
+	/* prepare iov list and copy data to data area if necessary */
 	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
 	iov = &entry->req.iov[0];
-	iov_cnt = 0;
 	copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
 		|| se_cmd->se_cmd_flags & SCF_BIDI);
 	scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
-			  se_cmd->t_data_nents, &iov, &iov_cnt,
-			  copy_to_data_area);
-	entry->req.iov_cnt = iov_cnt;
+			  se_cmd->t_data_nents, &iov, copy_to_data_area);
+	entry->req.iov_cnt = iov_cnt - iov_bidi_cnt;
 
 	/* Handle BIDI commands */
-	iov_cnt = 0;
 	if (se_cmd->se_cmd_flags & SCF_BIDI) {
 		iov++;
 		scatter_data_area(udev, tcmu_cmd, se_cmd->t_bidi_data_sg,
-				  se_cmd->t_bidi_data_nents, &iov, &iov_cnt,
-				  false);
+				  se_cmd->t_bidi_data_nents, &iov, false);
+		entry->req.iov_bidi_cnt = iov_bidi_cnt;
 	}
-	entry->req.iov_bidi_cnt = iov_cnt;
-
-	cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 1, USHRT_MAX, GFP_NOWAIT);
-	if (cmd_id < 0) {
-		pr_err("tcmu: Could not allocate cmd id.\n");
-
-		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
-		*scsi_err = TCM_OUT_OF_RESOURCES;
-		return -1;
-	}
-	tcmu_cmd->cmd_id = cmd_id;
-
-	pr_debug("allocated cmd id %u for cmd %p dev %s\n", tcmu_cmd->cmd_id,
-		 tcmu_cmd, udev->name);
 
 	tcmu_setup_cmd_timer(tcmu_cmd, udev->cmd_time_out, &udev->cmd_timer);
 
 	entry->hdr.cmd_id = tcmu_cmd->cmd_id;
 
-	/*
-	 * Recalaulate the command's base size and size according
-	 * to the actual needs
-	 */
-	base_command_size = tcmu_cmd_get_base_cmd_size(entry->req.iov_cnt +
-						       entry->req.iov_bidi_cnt);
-	command_size = tcmu_cmd_get_cmd_size(tcmu_cmd, base_command_size);
-
 	tcmu_hdr_set_len(&entry->hdr.len_op, command_size);
 
 	/* All offsets relative to mb_addr, not start of entry! */
@@ -1104,6 +1120,10 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 
 	return 0;
 
+free_and_queue:
+	tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
+	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
+
 queue:
 	if (add_to_qfull_queue(tcmu_cmd)) {
 		*scsi_err = TCM_OUT_OF_RESOURCES;
@@ -1138,7 +1158,7 @@ queue_tmr_ring(struct tcmu_dev *udev, struct tcmu_tmr *tmr)
 	cmd_size = round_up(sizeof(*entry) + id_list_sz, TCMU_OP_ALIGN_SIZE);
 
 	if (!list_empty(&udev->tmr_queue) ||
-	    !is_ring_space_avail(udev, NULL, cmd_size)) {
+	    !is_ring_space_avail(udev, cmd_size)) {
 		list_add_tail(&tmr->queue_entry, &udev->tmr_queue);
 		pr_debug("adding tmr %p on dev %s to TMR ring space wait queue\n",
 			 tmr, udev->name);
-- 
2.12.3


* [PATCH 3/3] scsi: target: tcmu: optimize scatter_data_area
  2020-09-10 15:50 [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Bodo Stroesser
  2020-09-10 15:50 ` [PATCH 1/3] scsi: target: tcmu: join tcmu_cmd_get_data_length and tcmu_cmd_get_block_cnt Bodo Stroesser
  2020-09-10 15:50 ` [PATCH 2/3] scsi: target: tcmu: optimize queue_cmd_ring Bodo Stroesser
@ 2020-09-10 15:50 ` Bodo Stroesser
  2020-09-22 16:20 ` [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Mike Christie
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Bodo Stroesser @ 2020-09-10 15:50 UTC (permalink / raw)
  To: Martin K. Petersen, Mike Christie, linux-scsi, target-devel
  Cc: Bodo Stroesser

scatter_data_area has two purposes:
 1) create the iovs for the data area buffer of a SCSI cmd
 2) if there is data in DMA_TO_DEVICE direction, copy the data from
    the sg_list to the data area buffer
Both are done in a common loop.

In case of a DMA_FROM_DEVICE data transfer, scatter_data_area is
called with the parameter copy_data = false. But this flag only
skips the memcpy() for the data, while radix_tree_lookup is still
called for every dbi of the data area buffer, and kmap and kunmap
are called for every page from the sg_list and the data area, as is
flush_dcache_page() for the data area pages. Since with copy_data =
false the only thing left to do is setting up the iovs, this is a
noticeable overhead.

Therefore I reworked the iov creation in the main loop of
scatter_data_area by providing the new function new_block_to_iov.
Based on this I created the short new function tcmu_setup_iovs that
only writes the iovs with no further overhead. This new function is
now called instead of scatter_data_area for bidi buffers, and for
data buffers in those cases where the memcpy would have been
skipped.
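
The merging rule used by new_block_to_iov is small. The following
stand-alone user-space sketch is an illustration only, not the
kernel code; the block size and the dbi list are assumed. It shows
how consecutive dbis collapse into one iov while each gap starts a
new one:

#include <stdio.h>
#include <stdint.h>
#include <sys/uio.h>

#define BLOCK_SIZE 4096

static int blocks_to_iovs(const int *dbi, int cnt, struct iovec *iov)
{
	/* start value of prev + 1 must not be a valid dbi */
	int i, iov_cnt = 0, prev = -2;

	for (i = 0; i < cnt; i++) {
		if (dbi[i] != prev + 1) {
			/* not adjacent to the previous block: new iov */
			iov[iov_cnt].iov_base =
				(void *)(uintptr_t)(dbi[i] * BLOCK_SIZE);
			iov[iov_cnt].iov_len = 0;
			iov_cnt++;
		}
		/* adjacent blocks just extend the current iov */
		iov[iov_cnt - 1].iov_len += BLOCK_SIZE;
		prev = dbi[i];
	}
	return iov_cnt;
}

int main(void)
{
	int dbi[] = { 5, 6, 7, 12 };	/* made-up allocation */
	struct iovec iov[4];
	int i, n = blocks_to_iovs(dbi, 4, iov);

	/* prints two iovs: blocks 5-7 merged, block 12 on its own */
	for (i = 0; i < n; i++)
		printf("iov %d: offset %lu, len %zu\n", i,
		       (unsigned long)(uintptr_t)iov[i].iov_base,
		       iov[i].iov_len);
	return 0;
}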

Signed-off-by: Bodo Stroesser <bstroesser@ts.fujitsu.com>
---
 drivers/target/target_core_user.c | 139 +++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 76 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 5587bd4d1060..18798c5422c7 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -181,6 +181,8 @@ struct tcmu_cmd {
 	uint32_t dbi_cur;
 	uint32_t *dbi;
 
+	uint32_t data_len_bidi;
+
 	unsigned long deadline;
 
 #define TCMU_CMD_BIT_EXPIRED 0
@@ -579,9 +581,47 @@ static inline void tcmu_cmd_set_block_cnts(struct tcmu_cmd *cmd)
 			len += se_cmd->t_bidi_data_sg[i].length;
 		cmd->dbi_bidi_cnt = DIV_ROUND_UP(len, DATA_BLOCK_SIZE);
 		cmd->dbi_cnt += cmd->dbi_bidi_cnt;
+		cmd->data_len_bidi = len;
 	}
 }
 
+static int new_block_to_iov(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+			    struct iovec **iov, int prev_dbi, int *remain)
+{
+	/* Get the next dbi */
+	int dbi = tcmu_cmd_get_dbi(cmd);
+	/* Do not add more than DATA_BLOCK_SIZE to iov */
+	int len = min_t(int, DATA_BLOCK_SIZE, *remain);
+
+	*remain -= len;
+	/*
+	 * The following code will gather and map the blocks to the same iovec
+	 * when the blocks are all next to each other.
+	 */
+	if (dbi != prev_dbi + 1) {
+		/* dbi is not next to previous dbi, so start new iov */
+		if (prev_dbi >= 0)
+			(*iov)++;
+		/* write offset relative to mb_addr */
+		(*iov)->iov_base = (void __user *)
+				(udev->data_off + dbi * DATA_BLOCK_SIZE);
+	}
+	(*iov)->iov_len += len;
+
+	return dbi;
+}
+
+static void tcmu_setup_iovs(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+			    struct iovec **iov, int data_length)
+{
+	/* start value of dbi + 1 must not be a valid dbi */
+	int dbi = -2;
+
+	/* We prepare the IOVs for DMA_FROM_DEVICE transfer direction */
+	while (data_length > 0)
+		dbi = new_block_to_iov(udev, cmd, iov, dbi, &data_length);
+}
+
 static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd)
 {
 	struct se_device *se_dev = se_cmd->se_dev;
@@ -646,45 +686,22 @@ static inline size_t head_to_end(size_t head, size_t size)
 	return size - head;
 }
 
-static inline void new_iov(struct iovec **iov, bool first)
-{
-	struct iovec *iovec;
-
-	if (!first)
-		(*iov)++;
-
-	iovec = *iov;
-	memset(iovec, 0, sizeof(struct iovec));
-}
-
 #define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size)
 
-/* offset is relative to mb_addr */
-static inline size_t get_block_offset_user(struct tcmu_dev *dev,
-		int dbi, int remaining)
-{
-	return dev->data_off + dbi * DATA_BLOCK_SIZE +
-		DATA_BLOCK_SIZE - remaining;
-}
-
-static inline size_t iov_tail(struct iovec *iov)
-{
-	return (size_t)iov->iov_base + iov->iov_len;
-}
-
-static void scatter_data_area(struct tcmu_dev *udev,
-	struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg,
-	unsigned int data_nents, struct iovec **iov, bool copy_data)
+static void scatter_data_area(struct tcmu_dev *udev, struct tcmu_cmd *tcmu_cmd,
+			      struct iovec **iov)
 {
-	int i, dbi;
+	struct se_cmd *se_cmd = tcmu_cmd->se_cmd;
+	/* start value of dbi + 1 must not be a valid dbi */
+	int i, dbi = -2;
 	int block_remaining = 0;
+	int data_len = se_cmd->data_length;
 	void *from, *to = NULL;
-	size_t copy_bytes, to_offset, offset;
+	size_t copy_bytes, offset;
 	struct scatterlist *sg;
 	struct page *page;
-	bool first = true;
 
-	for_each_sg(data_sg, sg, data_nents, i) {
+	for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) {
 		int sg_remaining = sg->length;
 		from = kmap_atomic(sg_page(sg)) + sg->offset;
 		while (sg_remaining > 0) {
@@ -694,50 +711,19 @@ static void scatter_data_area(struct tcmu_dev *udev,
 					kunmap_atomic(to);
 				}
 
-				block_remaining = DATA_BLOCK_SIZE;
-				dbi = tcmu_cmd_get_dbi(tcmu_cmd);
+				/* get next dbi and add to IOVs */
+				dbi = new_block_to_iov(udev, tcmu_cmd, iov, dbi,
+						       &data_len);
 				page = tcmu_get_block_page(udev, dbi);
 				to = kmap_atomic(page);
+				block_remaining = DATA_BLOCK_SIZE;
 			}
 
-			/*
-			 * Covert to virtual offset of the ring data area.
-			 */
-			to_offset = get_block_offset_user(udev, dbi,
-					block_remaining);
-
-			/*
-			 * The following code will gather and map the blocks
-			 * to the same iovec when the blocks are all next to
-			 * each other.
-			 */
 			copy_bytes = min_t(size_t, sg_remaining,
 					block_remaining);
-			if (!first && to_offset == iov_tail(*iov)) {
-				/*
-				 * Will append to the current iovec, because
-				 * the current block page is next to the
-				 * previous one.
-				 */
-				(*iov)->iov_len += copy_bytes;
-			} else {
-				/*
-				 * Will allocate a new iovec because we are
-				 * first time here or the current block page
-				 * is not next to the previous one.
-				 */
-				new_iov(iov, first);
-				(*iov)->iov_base = (void __user *)to_offset;
-				(*iov)->iov_len = copy_bytes;
-				first = false;
-			}
-
-			if (copy_data) {
-				offset = DATA_BLOCK_SIZE - block_remaining;
-				memcpy(to + offset,
-				       from + sg->length - sg_remaining,
-				       copy_bytes);
-			}
+			offset = DATA_BLOCK_SIZE - block_remaining;
+			memcpy(to + offset, from + sg->length - sg_remaining,
+			       copy_bytes);
 
 			sg_remaining -= copy_bytes;
 			block_remaining -= copy_bytes;
@@ -1010,7 +996,6 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	int iov_cnt, iov_bidi_cnt, cmd_id;
 	uint32_t cmd_head;
 	uint64_t cdb_off;
-	bool copy_to_data_area;
 	/* size of data buffer needed */
 	size_t data_length = (size_t)tcmu_cmd->dbi_cnt * DATA_BLOCK_SIZE;
 
@@ -1084,17 +1069,19 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err)
 	/* prepare iov list and copy data to data area if necessary */
 	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
 	iov = &entry->req.iov[0];
-	copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
-		|| se_cmd->se_cmd_flags & SCF_BIDI);
-	scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
-			  se_cmd->t_data_nents, &iov, copy_to_data_area);
+
+	if (se_cmd->data_direction == DMA_TO_DEVICE ||
+	    se_cmd->se_cmd_flags & SCF_BIDI)
+		scatter_data_area(udev, tcmu_cmd, &iov);
+	else
+		tcmu_setup_iovs(udev, tcmu_cmd, &iov, se_cmd->data_length);
+
 	entry->req.iov_cnt = iov_cnt - iov_bidi_cnt;
 
 	/* Handle BIDI commands */
 	if (se_cmd->se_cmd_flags & SCF_BIDI) {
 		iov++;
-		scatter_data_area(udev, tcmu_cmd, se_cmd->t_bidi_data_sg,
-				  se_cmd->t_bidi_data_nents, &iov, false);
+		tcmu_setup_iovs(udev, tcmu_cmd, &iov, tcmu_cmd->data_len_bidi);
 		entry->req.iov_bidi_cnt = iov_bidi_cnt;
 	}
 
-- 
2.12.3


* Re: [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling
  2020-09-10 15:50 [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Bodo Stroesser
                   ` (2 preceding siblings ...)
  2020-09-10 15:50 ` [PATCH 3/3] scsi: target: tcmu: optimize scatter_data_area Bodo Stroesser
@ 2020-09-22 16:20 ` Mike Christie
  2020-09-22 21:32 ` Martin K. Petersen
  2020-09-30  3:34 ` Martin K. Petersen
  5 siblings, 0 replies; 7+ messages in thread
From: Mike Christie @ 2020-09-22 16:20 UTC (permalink / raw)
  To: Bodo Stroesser, Martin K. Petersen, linux-scsi, target-devel

On 9/10/20 10:50 AM, Bodo Stroesser wrote:
> This small series is made on top of Martin's 5.10/scsi-queue branch.
> 
> The patches simplify some code, split off new helper functions, and
> implement a cleaner code sequence to avoid doing work twice.
> The 3rd patch speeds up buffer preparation for SCSI commands with
> long read data.
> The series is also the base for the cleaned-up v3 of my patch
>   scsi: target: tcmu: add compat mode for 32bit userspace on 64bit kernel
> 
> Bodo Stroesser (3):
>   scsi: target: tcmu: join tcmu_cmd_get_data_length and tcmu_cmd_get_block_cnt
>   scsi: target: tcmu: optimize queue_cmd_ring
>   scsi: target: tcmu: optimize scatter_data_area
> 
>  drivers/target/target_core_user.c | 340 +++++++++++++++++++-------------------
>  1 file changed, 170 insertions(+), 170 deletions(-)
> 

Acked-by: Mike Christie <michael.christie@oracle.com>

* Re: [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling
  2020-09-10 15:50 [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Bodo Stroesser
                   ` (3 preceding siblings ...)
  2020-09-22 16:20 ` [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Mike Christie
@ 2020-09-22 21:32 ` Martin K. Petersen
  2020-09-30  3:34 ` Martin K. Petersen
  5 siblings, 0 replies; 7+ messages in thread
From: Martin K. Petersen @ 2020-09-22 21:32 UTC (permalink / raw)
  To: Bodo Stroesser
  Cc: Martin K. Petersen, Mike Christie, linux-scsi, target-devel


Bodo,

> The patches simplify some code, split off new helper functions, and
> implement a cleaner code sequence to avoid doing work twice.  The 3rd
> patch speeds up buffer preparation for SCSI commands with long read
> data.

Applied to 5.10/scsi-staging, thanks!

-- 
Martin K. Petersen	Oracle Linux Engineering

* Re: [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling
  2020-09-10 15:50 [PATCH 0/3] target: scsi: tcmu: code rework and speed up of read data handling Bodo Stroesser
                   ` (4 preceding siblings ...)
  2020-09-22 21:32 ` Martin K. Petersen
@ 2020-09-30  3:34 ` Martin K. Petersen
  5 siblings, 0 replies; 7+ messages in thread
From: Martin K. Petersen @ 2020-09-30  3:34 UTC (permalink / raw)
  To: linux-scsi, Bodo Stroesser, Mike Christie, target-devel
  Cc: Martin K . Petersen

On Thu, 10 Sep 2020 17:50:38 +0200, Bodo Stroesser wrote:

> This small series is made on top of Martin's 5.10/scsi-queue branch.
> 
> The patches simplify some code, split off new helper functions, and
> implement a cleaner code sequence to avoid doing work twice.
> The 3rd patch speeds up buffer preparation for SCSI commands with
> long read data.
> The series is also the base for the cleaned-up v3 of my patch
>   scsi: target: tcmu: add compat mode for 32bit userspace on 64bit kernel
> 
> [...]

Applied to 5.10/scsi-queue, thanks!

[1/3] scsi: target: tcmu: Join tcmu_cmd_get_data_length() and tcmu_cmd_get_block_cnt()
      https://git.kernel.org/mkp/scsi/c/52ef2743f16c
[2/3] scsi: target: tcmu: Optimize queue_cmd_ring()
      https://git.kernel.org/mkp/scsi/c/7e98905e9d0d
[3/3] scsi: target: tcmu: Optimize scatter_data_area()
      https://git.kernel.org/mkp/scsi/c/3c9a7c58ea3d

-- 
Martin K. Petersen	Oracle Linux Engineering
