linux-cifs.vger.kernel.org archive mirror
* [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
@ 2022-09-06  1:30 Enzo Matsumiya
  2022-09-06  2:36 ` ronnie sahlberg
  2022-09-07  8:37 ` Aurélien Aptel
  0 siblings, 2 replies; 10+ messages in thread
From: Enzo Matsumiya @ 2022-09-06  1:30 UTC (permalink / raw)
  To: linux-cifs; +Cc: smfrench, pc, ronniesahlberg, nspmangalore

Replace hardcoded alignment computations (e.g. (len + 7) & ~0x7) of
(mostly) SMB2 message lengths with the ALIGN()/IS_ALIGNED() macros (the
right tools for the job, and less error prone).

Also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8); the former,
if not optimized by the compiler, carries the overhead of a multiplication
and a division. Do the same for roundup() by replacing it with round_up()
(the division-less version; it requires the multiple to be a power of 2,
which is always the case for us).
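
For reference, the helpers involved reduce to roughly the following
(simplified from include/linux/align.h and include/linux/math.h; the
real kernel definitions add typeof casts):

  #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))           /* division */
  #define roundup(x, y)       ((((x) + (y) - 1) / (y)) * (y))   /* div + mul */
  #define ALIGN(x, a)         (((x) + (a) - 1) & ~((a) - 1))    /* add + and */
  #define round_up(x, y)      ((((x) - 1) | ((y) - 1)) + 1)     /* or + add */
  #define IS_ALIGNED(x, a)    (((x) & ((a) - 1)) == 0)          /* and + cmp */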

Also remove some unnecessary checks where !IS_ALIGNED() would fit;
calling round_up() directly is just fine, as it is a no-op when the
value is already aligned.
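
For example, the SMB2_open_init() hunk below goes from

  copy_size = uni_path_len;
  if (copy_size % 8 != 0)
          copy_size = roundup(copy_size, 8);

to simply

  copy_size = round_up(uni_path_len, 8);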

Worried that this might be a useless micro-optimization, I ran some
tests with a very light workload and already observed a ~50% perf
improvement:

Record all cifs.ko functions:
  # trace-cmd record --module cifs -p function_graph

(test-dir has ~50MB with ~4000 files)
Test commands after share is mounted and with no activity:
  # cp -r test-dir /mnt/test
  # sync
  # umount /mnt/test

Number of traced functions:
  # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | wc -l
- unpatched: 307746
- patched: 313199

Measuring the average latency of all traced functions:
  # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | jq -s add/length
- unpatched: 27105.577791262822 us
- patched: 14548.665733635338 us

So even though the patched version made 5k+ more function calls (for
whatever reason), it still completed them with ~50% lower average
latency.

On a larger scale, given the affected code paths, this patch should
show a relevant improvement.

Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
---
Please let me know if my measurements are sufficient/valid.

 fs/cifs/cifssmb.c   |  7 +++---
 fs/cifs/connect.c   | 11 ++++++--
 fs/cifs/sess.c      | 18 +++++--------
 fs/cifs/smb2inode.c |  4 +--
 fs/cifs/smb2misc.c  |  2 +-
 fs/cifs/smb2pdu.c   | 61 +++++++++++++++++++--------------------------
 6 files changed, 47 insertions(+), 56 deletions(-)

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 7aa91e272027..addf3fc62aef 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
 					remap);
 	}
 	rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
-	count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
+	count = sizeof(struct set_file_rename) + (2 * len_of_str);
 	byte_count += count;
 	pSMB->DataCount = cpu_to_le16(count);
 	pSMB->TotalDataCount = pSMB->DataCount;
@@ -2796,7 +2796,7 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
 		cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
 		goto qreparse_out;
 	}
-	end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
+	end_of_smb = sizeof(__le16) + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
 	reparse_buf = (struct reparse_symlink_data *)
 				((char *)&pSMBr->hdr.Protocol + data_offset);
 	if ((char *)reparse_buf >= end_of_smb) {
@@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 	pSMBr = (struct smb_com_ntransact_rsp *)buf;
 
 	bcc = get_bcc(&pSMBr->hdr);
-	end_of_smb = 2 /* sizeof byte count */ + bcc +
-			(char *)&pSMBr->ByteCount;
+	end_of_smb = sizeof(__le16) + bcc + (char *)&pSMBr->ByteCount;
 
 	data_offset = le32_to_cpu(pSMBr->DataOffset);
 	data_count = le32_to_cpu(pSMBr->DataCount);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a0a06b6f252b..389127e21563 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
 	 * sessinit is sent but no second negprot
 	 */
 	struct rfc1002_session_packet *ses_init_buf;
+	unsigned int req_noscope_len;
 	struct smb_hdr *smb_buf;
+
 	ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
 			       GFP_KERNEL);
+
 	if (ses_init_buf) {
 		ses_init_buf->trailer.session_req.called_len = 32;
 
@@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
 		ses_init_buf->trailer.session_req.scope2 = 0;
 		smb_buf = (struct smb_hdr *)ses_init_buf;
 
-		/* sizeof RFC1002_SESSION_REQUEST with no scope */
-		smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
+		/* sizeof RFC1002_SESSION_REQUEST with no scopes */
+		req_noscope_len = sizeof(struct rfc1002_session_packet) - 2;
+
+		/* == cpu_to_be32(0x81000044) */
+		smb_buf->smb_buf_length =
+			cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len);
 		rc = smb_send(server, smb_buf, 0x44);
 		kfree(ses_init_buf);
 		/*
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3af3b05b6c74..951874928d70 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -601,11 +601,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 	/* BB FIXME add check that strings total less
 	than 335 or will need to send them as arrays */
 
-	/* unicode strings, must be word aligned before the call */
-/*	if ((long) bcc_ptr % 2)	{
-		*bcc_ptr = 0;
-		bcc_ptr++;
-	} */
 	/* copy user */
 	if (ses->user_name == NULL) {
 		/* null user mount */
@@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
 	}
 
 	if (ses->capabilities & CAP_UNICODE) {
-		if (sess_data->iov[0].iov_len % 2) {
+		if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) {
 			*bcc_ptr = 0;
 			bcc_ptr++;
 		}
@@ -1358,7 +1353,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
 		/* no string area to decode, do nothing */
 	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 		/* unicode string area must be word-aligned */
-		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+		if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
 			++bcc_ptr;
 			--bytes_remaining;
 		}
@@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
 
 	if (ses->capabilities & CAP_UNICODE) {
 		/* unicode strings must be word aligned */
-		if ((sess_data->iov[0].iov_len
-			+ sess_data->iov[1].iov_len) % 2) {
+		if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
 			*bcc_ptr = 0;
 			bcc_ptr++;
 		}
@@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
 		/* no string area to decode, do nothing */
 	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 		/* unicode string area must be word-aligned */
-		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+		if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
 			++bcc_ptr;
 			--bytes_remaining;
 		}
@@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
 
 	bcc_ptr = sess_data->iov[2].iov_base;
 	/* unicode strings must be word aligned */
-	if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
+	if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
 		*bcc_ptr = 0;
 		bcc_ptr++;
 	}
@@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
 		/* no string area to decode, do nothing */
 	} else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
 		/* unicode string area must be word-aligned */
-		if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
+		if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
 			++bcc_ptr;
 			--bytes_remaining;
 		}
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index b83f59051b26..4eefbe574b82 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 		rqst[num_rqst].rq_iov = &vars->si_iov[0];
 		rqst[num_rqst].rq_nvec = 1;
 
-		size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
+		size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
 		data[0] = &delete_pending[0];
 
 		rc = SMB2_set_info_init(tcon, server,
@@ -225,7 +225,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 		rqst[num_rqst].rq_iov = &vars->si_iov[0];
 		rqst[num_rqst].rq_nvec = 1;
 
-		size[0] = 8; /* sizeof __le64 */
+		size[0] = sizeof(__le64);
 		data[0] = ptr;
 
 		rc = SMB2_set_info_init(tcon, server,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index d73e5672aac4..258b01306d85 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
 		 * Some windows servers (win2016) will pad also the final
 		 * PDU in a compound to 8 bytes.
 		 */
-		if (((calc_len + 7) & ~7) == len)
+		if (ALIGN(calc_len, 8) == len)
 			return 0;
 
 		/*
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 6352ab32c7e7..5da0b596c8a0 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
 	/*
 	 * Context Data length must be rounded to multiple of 8 for some servers
 	 */
-	pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
-				sizeof(struct smb2_signing_capabilities) -
-				sizeof(struct smb2_neg_context) +
-				(num_algs * 2 /* sizeof u16 */), 8) * 8);
+	pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) -
+					    sizeof(struct smb2_neg_context) +
+					    (num_algs * sizeof(u16)), 8));
 	pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
 	pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC);
 
-	ctxt_len += 2 /* sizeof le16 */ * num_algs;
-	ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
+	ctxt_len += sizeof(__le16) * num_algs;
+	ctxt_len = ALIGN(ctxt_len, 8);
 	return ctxt_len;
 	/* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */
 }
@@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
 	/* copy up to max of first 100 bytes of server name to NetName field */
 	pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
 	/* context size is DataLength + minimal smb2_neg_context */
-	return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
-			sizeof(struct smb2_neg_context), 8) * 8;
+	return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8);
 }
 
 static void
@@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
 	 * round up total_len of fixed part of SMB3 negotiate request to 8
 	 * byte boundary before adding negotiate contexts
 	 */
-	*total_len = roundup(*total_len, 8);
+	*total_len = ALIGN(*total_len, 8);
 
 	pneg_ctxt = (*total_len) + (char *)req;
 	req->NegotiateContextOffset = cpu_to_le32(*total_len);
 
 	build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
-	ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
+	ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8);
 	*total_len += ctxt_len;
 	pneg_ctxt += ctxt_len;
 
 	build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
-	ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
+	ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8);
 	*total_len += ctxt_len;
 	pneg_ctxt += ctxt_len;
 
@@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
 	if (server->compress_algorithm) {
 		build_compression_ctxt((struct smb2_compression_capabilities_context *)
 				pneg_ctxt);
-		ctxt_len = DIV_ROUND_UP(
-			sizeof(struct smb2_compression_capabilities_context),
-				8) * 8;
+		ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
 		*total_len += ctxt_len;
 		pneg_ctxt += ctxt_len;
 		neg_context_count++;
@@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
 		if (rc)
 			break;
 		/* offsets must be 8 byte aligned */
-		clen = (clen + 7) & ~0x7;
+		clen = ALIGN(clen, 8);
 		offset += clen + sizeof(struct smb2_neg_context);
 		len_of_ctxts -= clen;
 	}
@@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
 	unsigned int group_offset = 0;
 	struct smb3_acl acl;
 
-	*len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
+	*len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
 
 	if (set_owner) {
 		/* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
@@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
 	memcpy(aclptr, &acl, sizeof(struct smb3_acl));
 
 	buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
-	*len = roundup(ptr - (__u8 *)buf, 8);
+	*len = round_up((unsigned int)(ptr - (__u8 *)buf), 8);
 
 	return buf;
 }
@@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
 	 * final path needs to be 8-byte aligned as specified in
 	 * MS-SMB2 2.2.13 SMB2 CREATE Request.
 	 */
-	*out_size = roundup(*out_len * sizeof(__le16), 8);
+	*out_size = round_up(*out_len * sizeof(__le16), 8);
 	*out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
 	if (!*out_path)
 		return -ENOMEM;
@@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
 		uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path, PATH_MAX)) + 2;
 		/* MUST set path len (NameLength) to 0 opening root of share */
 		req->NameLength = cpu_to_le16(uni_path_len - 2);
-		if (uni_path_len % 8 != 0) {
-			copy_size = roundup(uni_path_len, 8);
-			copy_path = kzalloc(copy_size, GFP_KERNEL);
-			if (!copy_path) {
-				rc = -ENOMEM;
-				goto err_free_req;
-			}
-			memcpy((char *)copy_path, (const char *)utf16_path,
-			       uni_path_len);
-			uni_path_len = copy_size;
-			/* free before overwriting resource */
-			kfree(utf16_path);
-			utf16_path = copy_path;
+		copy_size = round_up(uni_path_len, 8);
+		copy_path = kzalloc(copy_size, GFP_KERNEL);
+		if (!copy_path) {
+			rc = -ENOMEM;
+			goto err_free_req;
 		}
+		memcpy((char *)copy_path, (const char *)utf16_path, uni_path_len);
+		uni_path_len = copy_size;
+		/* free before overwriting resource */
+		kfree(utf16_path);
+		utf16_path = copy_path;
 	}
 
 	iov[1].iov_len = uni_path_len;
@@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 		uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
 		/* MUST set path len (NameLength) to 0 opening root of share */
 		req->NameLength = cpu_to_le16(uni_path_len - 2);
-		copy_size = uni_path_len;
-		if (copy_size % 8 != 0)
-			copy_size = roundup(copy_size, 8);
+		copy_size = round_up(uni_path_len, 8);
 		copy_path = kzalloc(copy_size, GFP_KERNEL);
 		if (!copy_path)
 			return -ENOMEM;
@@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
 	if (request_type & CHAINED_REQUEST) {
 		if (!(request_type & END_OF_CHAIN)) {
 			/* next 8-byte aligned request */
-			*total_len = DIV_ROUND_UP(*total_len, 8) * 8;
+			*total_len = ALIGN(*total_len, 8);
 			shdr->NextCommand = cpu_to_le32(*total_len);
 		} else /* END_OF_CHAIN */
 			shdr->NextCommand = 0;
-- 
2.35.3



* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-06  1:30 [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up() Enzo Matsumiya
@ 2022-09-06  2:36 ` ronnie sahlberg
  2022-09-06 14:41   ` Enzo Matsumiya
  2022-09-07  8:37 ` Aurélien Aptel
  1 sibling, 1 reply; 10+ messages in thread
From: ronnie sahlberg @ 2022-09-06  2:36 UTC (permalink / raw)
  To: Enzo Matsumiya; +Cc: linux-cifs, smfrench, pc, nspmangalore

Very nice. The performance change cannot be explained, but it's a nice cleanup.

Reviewed by me

On Tue, 6 Sept 2022 at 11:31, Enzo Matsumiya <ematsumiya@suse.de> wrote:
>
> Replace hardcoded alignment computations (e.g. (len + 7) & ~0x7) of
> (mostly) SMB2 messages length by ALIGN()/IS_ALIGNED() macros (right
> tools for the job, less prone to errors).
>
> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
> if not optimized by the compiler, has the overhead of a multiplication
> and a division. Do the same for roundup() by replacing it by round_up()
> (division-less version, but requires the multiple to be a power of 2,
> which is always the case for us).
>
> Also remove some unecessary checks where !IS_ALIGNED() would fit, but
> calling round_up() directly is just fine as it'll be a no-op if the
> value is already aligned.
>
> Afraid this was a useless micro-optimization, I ran some tests with a
> very light workload, and I observed a ~50% perf improvement already:
>
> Record all cifs.ko functions:
>   # trace-cmd record --module cifs -p function_graph
>
> (test-dir has ~50MB with ~4000 files)
> Test commands after share is mounted and with no activity:
>   # cp -r test-dir /mnt/test
>   # sync
>   # umount /mnt/test
>
> Number of traced functions:
>   # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | wc -l
> - unpatched: 307746
> - patched: 313199
>
> Measuring the average latency of all traced functions:
>   # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | jq -s add/length
> - unpatched: 27105.577791262822 us
> - patched: 14548.665733635338 us
>
> So even though the patched version made 5k+ more function calls (for
> whatever reason), it still did it with ~50% reduced latency.
>
> On a larger scale, given the affected code paths, this patch should
> show a relevant improvement.
>
> Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
> ---
> Please let me know if my measurements are enough/valid.
>
>  fs/cifs/cifssmb.c   |  7 +++---
>  fs/cifs/connect.c   | 11 ++++++--
>  fs/cifs/sess.c      | 18 +++++--------
>  fs/cifs/smb2inode.c |  4 +--
>  fs/cifs/smb2misc.c  |  2 +-
>  fs/cifs/smb2pdu.c   | 61 +++++++++++++++++++--------------------------
>  6 files changed, 47 insertions(+), 56 deletions(-)
>
> diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
> index 7aa91e272027..addf3fc62aef 100644
> --- a/fs/cifs/cifssmb.c
> +++ b/fs/cifs/cifssmb.c
> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
>                                         remap);
>         }
>         rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
> -       count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
> +       count = sizeof(struct set_file_rename) + (2 * len_of_str);
>         byte_count += count;
>         pSMB->DataCount = cpu_to_le16(count);
>         pSMB->TotalDataCount = pSMB->DataCount;
> @@ -2796,7 +2796,7 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
>                 cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
>                 goto qreparse_out;
>         }
> -       end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
> +       end_of_smb = sizeof(__le16) + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
>         reparse_buf = (struct reparse_symlink_data *)
>                                 ((char *)&pSMBr->hdr.Protocol + data_offset);
>         if ((char *)reparse_buf >= end_of_smb) {
> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
>         pSMBr = (struct smb_com_ntransact_rsp *)buf;
>
>         bcc = get_bcc(&pSMBr->hdr);
> -       end_of_smb = 2 /* sizeof byte count */ + bcc +
> -                       (char *)&pSMBr->ByteCount;
> +       end_of_smb = sizeof(__le16) + bcc + (char *)&pSMBr->ByteCount;
>
>         data_offset = le32_to_cpu(pSMBr->DataOffset);
>         data_count = le32_to_cpu(pSMBr->DataCount);
> diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
> index a0a06b6f252b..389127e21563 100644
> --- a/fs/cifs/connect.c
> +++ b/fs/cifs/connect.c
> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
>          * sessinit is sent but no second negprot
>          */
>         struct rfc1002_session_packet *ses_init_buf;
> +       unsigned int req_noscope_len;
>         struct smb_hdr *smb_buf;
> +
>         ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
>                                GFP_KERNEL);
> +
>         if (ses_init_buf) {
>                 ses_init_buf->trailer.session_req.called_len = 32;
>
> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
>                 ses_init_buf->trailer.session_req.scope2 = 0;
>                 smb_buf = (struct smb_hdr *)ses_init_buf;
>
> -               /* sizeof RFC1002_SESSION_REQUEST with no scope */
> -               smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
> +               /* sizeof RFC1002_SESSION_REQUEST with no scopes */
> +               req_noscope_len = sizeof(struct rfc1002_session_packet) - 2;
> +
> +               /* == cpu_to_be32(0x81000044) */
> +               smb_buf->smb_buf_length =
> +                       cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len);
>                 rc = smb_send(server, smb_buf, 0x44);
>                 kfree(ses_init_buf);
>                 /*
> diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
> index 3af3b05b6c74..951874928d70 100644
> --- a/fs/cifs/sess.c
> +++ b/fs/cifs/sess.c
> @@ -601,11 +601,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
>         /* BB FIXME add check that strings total less
>         than 335 or will need to send them as arrays */
>
> -       /* unicode strings, must be word aligned before the call */
> -/*     if ((long) bcc_ptr % 2) {
> -               *bcc_ptr = 0;
> -               bcc_ptr++;
> -       } */
>         /* copy user */
>         if (ses->user_name == NULL) {
>                 /* null user mount */
> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>         }
>
>         if (ses->capabilities & CAP_UNICODE) {
> -               if (sess_data->iov[0].iov_len % 2) {
> +               if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) {
>                         *bcc_ptr = 0;
>                         bcc_ptr++;
>                 }
> @@ -1358,7 +1353,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>                 /* no string area to decode, do nothing */
>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>                 /* unicode string area must be word-aligned */
> -               if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
>                         ++bcc_ptr;
>                         --bytes_remaining;
>                 }
> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>
>         if (ses->capabilities & CAP_UNICODE) {
>                 /* unicode strings must be word aligned */
> -               if ((sess_data->iov[0].iov_len
> -                       + sess_data->iov[1].iov_len) % 2) {
> +               if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
>                         *bcc_ptr = 0;
>                         bcc_ptr++;
>                 }
> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>                 /* no string area to decode, do nothing */
>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>                 /* unicode string area must be word-aligned */
> -               if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
>                         ++bcc_ptr;
>                         --bytes_remaining;
>                 }
> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
>
>         bcc_ptr = sess_data->iov[2].iov_base;
>         /* unicode strings must be word aligned */
> -       if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
> +       if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
>                 *bcc_ptr = 0;
>                 bcc_ptr++;
>         }
> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
>                 /* no string area to decode, do nothing */
>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>                 /* unicode string area must be word-aligned */
> -               if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
>                         ++bcc_ptr;
>                         --bytes_remaining;
>                 }
> diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
> index b83f59051b26..4eefbe574b82 100644
> --- a/fs/cifs/smb2inode.c
> +++ b/fs/cifs/smb2inode.c
> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
>                 rqst[num_rqst].rq_iov = &vars->si_iov[0];
>                 rqst[num_rqst].rq_nvec = 1;
>
> -               size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
> +               size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
>                 data[0] = &delete_pending[0];
>
>                 rc = SMB2_set_info_init(tcon, server,
> @@ -225,7 +225,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
>                 rqst[num_rqst].rq_iov = &vars->si_iov[0];
>                 rqst[num_rqst].rq_nvec = 1;
>
> -               size[0] = 8; /* sizeof __le64 */
> +               size[0] = sizeof(__le64);
>                 data[0] = ptr;
>
>                 rc = SMB2_set_info_init(tcon, server,
> diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
> index d73e5672aac4..258b01306d85 100644
> --- a/fs/cifs/smb2misc.c
> +++ b/fs/cifs/smb2misc.c
> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
>                  * Some windows servers (win2016) will pad also the final
>                  * PDU in a compound to 8 bytes.
>                  */
> -               if (((calc_len + 7) & ~7) == len)
> +               if (ALIGN(calc_len, 8) == len)
>                         return 0;
>
>                 /*
> diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
> index 6352ab32c7e7..5da0b596c8a0 100644
> --- a/fs/cifs/smb2pdu.c
> +++ b/fs/cifs/smb2pdu.c
> @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
>         /*
>          * Context Data length must be rounded to multiple of 8 for some servers
>          */
> -       pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
> -                               sizeof(struct smb2_signing_capabilities) -
> -                               sizeof(struct smb2_neg_context) +
> -                               (num_algs * 2 /* sizeof u16 */), 8) * 8);
> +       pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) -
> +                                           sizeof(struct smb2_neg_context) +
> +                                           (num_algs * sizeof(u16)), 8));
>         pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
>         pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC);
>
> -       ctxt_len += 2 /* sizeof le16 */ * num_algs;
> -       ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
> +       ctxt_len += sizeof(__le16) * num_algs;
> +       ctxt_len = ALIGN(ctxt_len, 8);
>         return ctxt_len;
>         /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */
>  }
> @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
>         /* copy up to max of first 100 bytes of server name to NetName field */
>         pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
>         /* context size is DataLength + minimal smb2_neg_context */
> -       return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
> -                       sizeof(struct smb2_neg_context), 8) * 8;
> +       return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8);
>  }
>
>  static void
> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
>          * round up total_len of fixed part of SMB3 negotiate request to 8
>          * byte boundary before adding negotiate contexts
>          */
> -       *total_len = roundup(*total_len, 8);
> +       *total_len = ALIGN(*total_len, 8);
>
>         pneg_ctxt = (*total_len) + (char *)req;
>         req->NegotiateContextOffset = cpu_to_le32(*total_len);
>
>         build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
> -       ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
> +       ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8);
>         *total_len += ctxt_len;
>         pneg_ctxt += ctxt_len;
>
>         build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
> -       ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
> +       ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8);
>         *total_len += ctxt_len;
>         pneg_ctxt += ctxt_len;
>
> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
>         if (server->compress_algorithm) {
>                 build_compression_ctxt((struct smb2_compression_capabilities_context *)
>                                 pneg_ctxt);
> -               ctxt_len = DIV_ROUND_UP(
> -                       sizeof(struct smb2_compression_capabilities_context),
> -                               8) * 8;
> +               ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
>                 *total_len += ctxt_len;
>                 pneg_ctxt += ctxt_len;
>                 neg_context_count++;
> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
>                 if (rc)
>                         break;
>                 /* offsets must be 8 byte aligned */
> -               clen = (clen + 7) & ~0x7;
> +               clen = ALIGN(clen, 8);
>                 offset += clen + sizeof(struct smb2_neg_context);
>                 len_of_ctxts -= clen;
>         }
> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
>         unsigned int group_offset = 0;
>         struct smb3_acl acl;
>
> -       *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
> +       *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
>
>         if (set_owner) {
>                 /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
>         memcpy(aclptr, &acl, sizeof(struct smb3_acl));
>
>         buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
> -       *len = roundup(ptr - (__u8 *)buf, 8);
> +       *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8);
>
>         return buf;
>  }
> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
>          * final path needs to be 8-byte aligned as specified in
>          * MS-SMB2 2.2.13 SMB2 CREATE Request.
>          */
> -       *out_size = roundup(*out_len * sizeof(__le16), 8);
> +       *out_size = round_up(*out_len * sizeof(__le16), 8);
>         *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
>         if (!*out_path)
>                 return -ENOMEM;
> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
>                 uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path, PATH_MAX)) + 2;
>                 /* MUST set path len (NameLength) to 0 opening root of share */
>                 req->NameLength = cpu_to_le16(uni_path_len - 2);
> -               if (uni_path_len % 8 != 0) {
> -                       copy_size = roundup(uni_path_len, 8);
> -                       copy_path = kzalloc(copy_size, GFP_KERNEL);
> -                       if (!copy_path) {
> -                               rc = -ENOMEM;
> -                               goto err_free_req;
> -                       }
> -                       memcpy((char *)copy_path, (const char *)utf16_path,
> -                              uni_path_len);
> -                       uni_path_len = copy_size;
> -                       /* free before overwriting resource */
> -                       kfree(utf16_path);
> -                       utf16_path = copy_path;
> +               copy_size = round_up(uni_path_len, 8);
> +               copy_path = kzalloc(copy_size, GFP_KERNEL);
> +               if (!copy_path) {
> +                       rc = -ENOMEM;
> +                       goto err_free_req;
>                 }
> +               memcpy((char *)copy_path, (const char *)utf16_path, uni_path_len);
> +               uni_path_len = copy_size;
> +               /* free before overwriting resource */
> +               kfree(utf16_path);
> +               utf16_path = copy_path;
>         }
>
>         iov[1].iov_len = uni_path_len;
> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
>                 uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
>                 /* MUST set path len (NameLength) to 0 opening root of share */
>                 req->NameLength = cpu_to_le16(uni_path_len - 2);
> -               copy_size = uni_path_len;
> -               if (copy_size % 8 != 0)
> -                       copy_size = roundup(copy_size, 8);
> +               copy_size = round_up(uni_path_len, 8);
>                 copy_path = kzalloc(copy_size, GFP_KERNEL);
>                 if (!copy_path)
>                         return -ENOMEM;
> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
>         if (request_type & CHAINED_REQUEST) {
>                 if (!(request_type & END_OF_CHAIN)) {
>                         /* next 8-byte aligned request */
> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
> +                       *total_len = ALIGN(*total_len, 8);
>                         shdr->NextCommand = cpu_to_le32(*total_len);
>                 } else /* END_OF_CHAIN */
>                         shdr->NextCommand = 0;
> --
> 2.35.3
>


* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-06  2:36 ` ronnie sahlberg
@ 2022-09-06 14:41   ` Enzo Matsumiya
  2022-09-06 15:47     ` Tom Talpey
  0 siblings, 1 reply; 10+ messages in thread
From: Enzo Matsumiya @ 2022-09-06 14:41 UTC (permalink / raw)
  To: ronnie sahlberg; +Cc: linux-cifs, smfrench, pc, nspmangalore

On 09/06, ronnie sahlberg wrote:
>Very nice. The performance change can not be explained, but nice cleanup.
>
>Reviewed by me

Thanks, Ronnie. Multiplication and division operations have a higher CPU
cost (cycles) than simple bitwise operations; on x86-64, multiplications
can cost ~5x more and divisions ~20x more (there's a huge variation from
system to system, but those numbers are a reasonable guesstimate). So the
ALIGN()/round_up() macros, which don't do any mult/div, are faster, and in
cases like ours, where they'll be called billions of times on any
non-trivial workload, the cycles saved add up and performance improves.
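
To put rough numbers on it, rounding len = 13 up to an 8-byte boundary
with the (simplified) expansions of those macros gives:

  DIV_ROUND_UP(13, 8) * 8 = ((13 + 7) / 8) * 8 = 16   /* add, div, mul */
  ALIGN(13, 8)            = (13 + 7) & ~7      = 16   /* add, and */
  round_up(13, 8)         = ((13 - 1) | 7) + 1 = 16   /* sub, or, add */

Same result, but the bitwise forms never touch the div/mul units (unless
the compiler already strength-reduced the constant-divisor case, hence
the "if not optimized by the compiler" caveat in the commit message).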


Cheers,

Enzo

>
>On Tue, 6 Sept 2022 at 11:31, Enzo Matsumiya <ematsumiya@suse.de> wrote:
>>
>> Replace hardcoded alignment computations (e.g. (len + 7) & ~0x7) of
>> (mostly) SMB2 messages length by ALIGN()/IS_ALIGNED() macros (right
>> tools for the job, less prone to errors).
>>
>> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
>> if not optimized by the compiler, has the overhead of a multiplication
>> and a division. Do the same for roundup() by replacing it by round_up()
>> (division-less version, but requires the multiple to be a power of 2,
>> which is always the case for us).
>>
>> Also remove some unecessary checks where !IS_ALIGNED() would fit, but
>> calling round_up() directly is just fine as it'll be a no-op if the
>> value is already aligned.
>>
>> Afraid this was a useless micro-optimization, I ran some tests with a
>> very light workload, and I observed a ~50% perf improvement already:
>>
>> Record all cifs.ko functions:
>>   # trace-cmd record --module cifs -p function_graph
>>
>> (test-dir has ~50MB with ~4000 files)
>> Test commands after share is mounted and with no activity:
>>   # cp -r test-dir /mnt/test
>>   # sync
>>   # umount /mnt/test
>>
>> Number of traced functions:
>>   # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | wc -l
>> - unpatched: 307746
>> - patched: 313199
>>
>> Measuring the average latency of all traced functions:
>>   # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | jq -s add/length
>> - unpatched: 27105.577791262822 us
>> - patched: 14548.665733635338 us
>>
>> So even though the patched version made 5k+ more function calls (for
>> whatever reason), it still did it with ~50% reduced latency.
>>
>> On a larger scale, given the affected code paths, this patch should
>> show a relevant improvement.
>>
>> Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
>> ---
>> Please let me know if my measurements are enough/valid.
>>
>>  fs/cifs/cifssmb.c   |  7 +++---
>>  fs/cifs/connect.c   | 11 ++++++--
>>  fs/cifs/sess.c      | 18 +++++--------
>>  fs/cifs/smb2inode.c |  4 +--
>>  fs/cifs/smb2misc.c  |  2 +-
>>  fs/cifs/smb2pdu.c   | 61 +++++++++++++++++++--------------------------
>>  6 files changed, 47 insertions(+), 56 deletions(-)
>>
>> diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
>> index 7aa91e272027..addf3fc62aef 100644
>> --- a/fs/cifs/cifssmb.c
>> +++ b/fs/cifs/cifssmb.c
>> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
>>                                         remap);
>>         }
>>         rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
>> -       count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
>> +       count = sizeof(struct set_file_rename) + (2 * len_of_str);
>>         byte_count += count;
>>         pSMB->DataCount = cpu_to_le16(count);
>>         pSMB->TotalDataCount = pSMB->DataCount;
>> @@ -2796,7 +2796,7 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
>>                 cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
>>                 goto qreparse_out;
>>         }
>> -       end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
>> +       end_of_smb = sizeof(__le16) + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
>>         reparse_buf = (struct reparse_symlink_data *)
>>                                 ((char *)&pSMBr->hdr.Protocol + data_offset);
>>         if ((char *)reparse_buf >= end_of_smb) {
>> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
>>         pSMBr = (struct smb_com_ntransact_rsp *)buf;
>>
>>         bcc = get_bcc(&pSMBr->hdr);
>> -       end_of_smb = 2 /* sizeof byte count */ + bcc +
>> -                       (char *)&pSMBr->ByteCount;
>> +       end_of_smb = sizeof(__le16) + bcc + (char *)&pSMBr->ByteCount;
>>
>>         data_offset = le32_to_cpu(pSMBr->DataOffset);
>>         data_count = le32_to_cpu(pSMBr->DataCount);
>> diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
>> index a0a06b6f252b..389127e21563 100644
>> --- a/fs/cifs/connect.c
>> +++ b/fs/cifs/connect.c
>> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
>>          * sessinit is sent but no second negprot
>>          */
>>         struct rfc1002_session_packet *ses_init_buf;
>> +       unsigned int req_noscope_len;
>>         struct smb_hdr *smb_buf;
>> +
>>         ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
>>                                GFP_KERNEL);
>> +
>>         if (ses_init_buf) {
>>                 ses_init_buf->trailer.session_req.called_len = 32;
>>
>> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
>>                 ses_init_buf->trailer.session_req.scope2 = 0;
>>                 smb_buf = (struct smb_hdr *)ses_init_buf;
>>
>> -               /* sizeof RFC1002_SESSION_REQUEST with no scope */
>> -               smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
>> +               /* sizeof RFC1002_SESSION_REQUEST with no scopes */
>> +               req_noscope_len = sizeof(struct rfc1002_session_packet) - 2;
>> +
>> +               /* == cpu_to_be32(0x81000044) */
>> +               smb_buf->smb_buf_length =
>> +                       cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len);
>>                 rc = smb_send(server, smb_buf, 0x44);
>>                 kfree(ses_init_buf);
>>                 /*
>> diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
>> index 3af3b05b6c74..951874928d70 100644
>> --- a/fs/cifs/sess.c
>> +++ b/fs/cifs/sess.c
>> @@ -601,11 +601,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
>>         /* BB FIXME add check that strings total less
>>         than 335 or will need to send them as arrays */
>>
>> -       /* unicode strings, must be word aligned before the call */
>> -/*     if ((long) bcc_ptr % 2) {
>> -               *bcc_ptr = 0;
>> -               bcc_ptr++;
>> -       } */
>>         /* copy user */
>>         if (ses->user_name == NULL) {
>>                 /* null user mount */
>> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>>         }
>>
>>         if (ses->capabilities & CAP_UNICODE) {
>> -               if (sess_data->iov[0].iov_len % 2) {
>> +               if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) {
>>                         *bcc_ptr = 0;
>>                         bcc_ptr++;
>>                 }
>> @@ -1358,7 +1353,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>>                 /* no string area to decode, do nothing */
>>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>>                 /* unicode string area must be word-aligned */
>> -               if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
>> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
>>                         ++bcc_ptr;
>>                         --bytes_remaining;
>>                 }
>> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>>
>>         if (ses->capabilities & CAP_UNICODE) {
>>                 /* unicode strings must be word aligned */
>> -               if ((sess_data->iov[0].iov_len
>> -                       + sess_data->iov[1].iov_len) % 2) {
>> +               if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
>>                         *bcc_ptr = 0;
>>                         bcc_ptr++;
>>                 }
>> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>>                 /* no string area to decode, do nothing */
>>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>>                 /* unicode string area must be word-aligned */
>> -               if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
>> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
>>                         ++bcc_ptr;
>>                         --bytes_remaining;
>>                 }
>> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
>>
>>         bcc_ptr = sess_data->iov[2].iov_base;
>>         /* unicode strings must be word aligned */
>> -       if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
>> +       if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
>>                 *bcc_ptr = 0;
>>                 bcc_ptr++;
>>         }
>> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
>>                 /* no string area to decode, do nothing */
>>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>>                 /* unicode string area must be word-aligned */
>> -               if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
>> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) {
>>                         ++bcc_ptr;
>>                         --bytes_remaining;
>>                 }
>> diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
>> index b83f59051b26..4eefbe574b82 100644
>> --- a/fs/cifs/smb2inode.c
>> +++ b/fs/cifs/smb2inode.c
>> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
>>                 rqst[num_rqst].rq_iov = &vars->si_iov[0];
>>                 rqst[num_rqst].rq_nvec = 1;
>>
>> -               size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
>> +               size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
>>                 data[0] = &delete_pending[0];
>>
>>                 rc = SMB2_set_info_init(tcon, server,
>> @@ -225,7 +225,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
>>                 rqst[num_rqst].rq_iov = &vars->si_iov[0];
>>                 rqst[num_rqst].rq_nvec = 1;
>>
>> -               size[0] = 8; /* sizeof __le64 */
>> +               size[0] = sizeof(__le64);
>>                 data[0] = ptr;
>>
>>                 rc = SMB2_set_info_init(tcon, server,
>> diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
>> index d73e5672aac4..258b01306d85 100644
>> --- a/fs/cifs/smb2misc.c
>> +++ b/fs/cifs/smb2misc.c
>> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
>>                  * Some windows servers (win2016) will pad also the final
>>                  * PDU in a compound to 8 bytes.
>>                  */
>> -               if (((calc_len + 7) & ~7) == len)
>> +               if (ALIGN(calc_len, 8) == len)
>>                         return 0;
>>
>>                 /*
>> diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
>> index 6352ab32c7e7..5da0b596c8a0 100644
>> --- a/fs/cifs/smb2pdu.c
>> +++ b/fs/cifs/smb2pdu.c
>> @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
>>         /*
>>          * Context Data length must be rounded to multiple of 8 for some servers
>>          */
>> -       pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
>> -                               sizeof(struct smb2_signing_capabilities) -
>> -                               sizeof(struct smb2_neg_context) +
>> -                               (num_algs * 2 /* sizeof u16 */), 8) * 8);
>> +       pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) -
>> +                                           sizeof(struct smb2_neg_context) +
>> +                                           (num_algs * sizeof(u16)), 8));
>>         pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
>>         pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC);
>>
>> -       ctxt_len += 2 /* sizeof le16 */ * num_algs;
>> -       ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
>> +       ctxt_len += sizeof(__le16) * num_algs;
>> +       ctxt_len = ALIGN(ctxt_len, 8);
>>         return ctxt_len;
>>         /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */
>>  }
>> @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
>>         /* copy up to max of first 100 bytes of server name to NetName field */
>>         pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
>>         /* context size is DataLength + minimal smb2_neg_context */
>> -       return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
>> -                       sizeof(struct smb2_neg_context), 8) * 8;
>> +       return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8);
>>  }
>>
>>  static void
>> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
>>          * round up total_len of fixed part of SMB3 negotiate request to 8
>>          * byte boundary before adding negotiate contexts
>>          */
>> -       *total_len = roundup(*total_len, 8);
>> +       *total_len = ALIGN(*total_len, 8);
>>
>>         pneg_ctxt = (*total_len) + (char *)req;
>>         req->NegotiateContextOffset = cpu_to_le32(*total_len);
>>
>>         build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt);
>> -       ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8;
>> +       ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8);
>>         *total_len += ctxt_len;
>>         pneg_ctxt += ctxt_len;
>>
>>         build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt);
>> -       ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8;
>> +       ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8);
>>         *total_len += ctxt_len;
>>         pneg_ctxt += ctxt_len;
>>
>> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
>>         if (server->compress_algorithm) {
>>                 build_compression_ctxt((struct smb2_compression_capabilities_context *)
>>                                 pneg_ctxt);
>> -               ctxt_len = DIV_ROUND_UP(
>> -                       sizeof(struct smb2_compression_capabilities_context),
>> -                               8) * 8;
>> +               ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
>>                 *total_len += ctxt_len;
>>                 pneg_ctxt += ctxt_len;
>>                 neg_context_count++;
>> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
>>                 if (rc)
>>                         break;
>>                 /* offsets must be 8 byte aligned */
>> -               clen = (clen + 7) & ~0x7;
>> +               clen = ALIGN(clen, 8);
>>                 offset += clen + sizeof(struct smb2_neg_context);
>>                 len_of_ctxts -= clen;
>>         }
>> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
>>         unsigned int group_offset = 0;
>>         struct smb3_acl acl;
>>
>> -       *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
>> +       *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
>>
>>         if (set_owner) {
>>                 /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */
>> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
>>         memcpy(aclptr, &acl, sizeof(struct smb3_acl));
>>
>>         buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
>> -       *len = roundup(ptr - (__u8 *)buf, 8);
>> +       *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8);
>>
>>         return buf;
>>  }
>> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
>>          * final path needs to be 8-byte aligned as specified in
>>          * MS-SMB2 2.2.13 SMB2 CREATE Request.
>>          */
>> -       *out_size = roundup(*out_len * sizeof(__le16), 8);
>> +       *out_size = round_up(*out_len * sizeof(__le16), 8);
>>         *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL);
>>         if (!*out_path)
>>                 return -ENOMEM;
>> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
>>                 uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path, PATH_MAX)) + 2;
>>                 /* MUST set path len (NameLength) to 0 opening root of share */
>>                 req->NameLength = cpu_to_le16(uni_path_len - 2);
>> -               if (uni_path_len % 8 != 0) {
>> -                       copy_size = roundup(uni_path_len, 8);
>> -                       copy_path = kzalloc(copy_size, GFP_KERNEL);
>> -                       if (!copy_path) {
>> -                               rc = -ENOMEM;
>> -                               goto err_free_req;
>> -                       }
>> -                       memcpy((char *)copy_path, (const char *)utf16_path,
>> -                              uni_path_len);
>> -                       uni_path_len = copy_size;
>> -                       /* free before overwriting resource */
>> -                       kfree(utf16_path);
>> -                       utf16_path = copy_path;
>> +               copy_size = round_up(uni_path_len, 8);
>> +               copy_path = kzalloc(copy_size, GFP_KERNEL);
>> +               if (!copy_path) {
>> +                       rc = -ENOMEM;
>> +                       goto err_free_req;
>>                 }
>> +               memcpy((char *)copy_path, (const char *)utf16_path, uni_path_len);
>> +               uni_path_len = copy_size;
>> +               /* free before overwriting resource */
>> +               kfree(utf16_path);
>> +               utf16_path = copy_path;
>>         }
>>
>>         iov[1].iov_len = uni_path_len;
>> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
>>                 uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2;
>>                 /* MUST set path len (NameLength) to 0 opening root of share */
>>                 req->NameLength = cpu_to_le16(uni_path_len - 2);
>> -               copy_size = uni_path_len;
>> -               if (copy_size % 8 != 0)
>> -                       copy_size = roundup(copy_size, 8);
>> +               copy_size = round_up(uni_path_len, 8);
>>                 copy_path = kzalloc(copy_size, GFP_KERNEL);
>>                 if (!copy_path)
>>                         return -ENOMEM;
>> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
>>         if (request_type & CHAINED_REQUEST) {
>>                 if (!(request_type & END_OF_CHAIN)) {
>>                         /* next 8-byte aligned request */
>> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
>> +                       *total_len = ALIGN(*total_len, 8);
>>                         shdr->NextCommand = cpu_to_le32(*total_len);
>>                 } else /* END_OF_CHAIN */
>>                         shdr->NextCommand = 0;
>> --
>> 2.35.3
>>


* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-06 14:41   ` Enzo Matsumiya
@ 2022-09-06 15:47     ` Tom Talpey
  2022-09-06 16:05       ` Enzo Matsumiya
  0 siblings, 1 reply; 10+ messages in thread
From: Tom Talpey @ 2022-09-06 15:47 UTC (permalink / raw)
  To: Enzo Matsumiya, ronnie sahlberg; +Cc: linux-cifs, smfrench, pc, nspmangalore

On 9/6/2022 10:41 AM, Enzo Matsumiya wrote:
> On 09/06, ronnie sahlberg wrote:
>> Very nice. The performance change can not be explained, but nice cleanup.
>>
>> Reviewed by me
> 
> Thanks, Ronnie. Multiplication and division operations have higher CPU
> cost (cycles) than simple bitwise operations; in x86-64, multiplications
> can cost ~5x more, and divisions ~20x more (there's actually a huge
> variation from system to system, but I'd say those numbers are a good
> guesstimate). So the ALIGN()/round_up() macros, that don't do any mult/div,
> are faster, and in cases like ours, where they'll be called billions of
> times on any non-trivial workload, the number of cycles saved adds up
> and performance improves.

In addition to polluting fewer registers and generally being more
lightweight to frob just a few low bits.

I am a bit shocked at the improvement though. It's almost worth a
fully profiled before/after analysis.

Reviewed-by: Tom Talpey <tom@talpey.com>


> Cheers,
> 
> Enzo
> 
>>
>> On Tue, 6 Sept 2022 at 11:31, Enzo Matsumiya <ematsumiya@suse.de> wrote:
>>>
>>> Replace hardcoded alignment computations (e.g. (len + 7) & ~0x7) of
>>> (mostly) SMB2 messages length by ALIGN()/IS_ALIGNED() macros (right
>>> tools for the job, less prone to errors).
>>>
>>> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
>>> if not optimized by the compiler, has the overhead of a multiplication
>>> and a division. Do the same for roundup() by replacing it by round_up()
>>> (division-less version, but requires the multiple to be a power of 2,
>>> which is always the case for us).
>>>
>>> Also remove some unecessary checks where !IS_ALIGNED() would fit, but
>>> calling round_up() directly is just fine as it'll be a no-op if the
>>> value is already aligned.
>>>
>>> Afraid this was a useless micro-optimization, I ran some tests with a
>>> very light workload, and I observed a ~50% perf improvement already:
>>>
>>> Record all cifs.ko functions:
>>>   # trace-cmd record --module cifs -p function_graph
>>>
>>> (test-dir has ~50MB with ~4000 files)
>>> Test commands after share is mounted and with no activity:
>>>   # cp -r test-dir /mnt/test
>>>   # sync
>>>   # umount /mnt/test
>>>
>>> Number of traced functions:
>>>   # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | wc -l
>>> - unpatched: 307746
>>> - patched: 313199
>>>
>>> Measuring the average latency of all traced functions:
>>>   # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | jq 
>>> -s add/length
>>> - unpatched: 27105.577791262822 us
>>> - patched: 14548.665733635338 us
>>>
>>> So even though the patched version made 5k+ more function calls (for
>>> whatever reason), it still did it with ~50% reduced latency.
>>>
>>> On a larger scale, given the affected code paths, this patch should
>>> show a relevant improvement.
>>>
>>> Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
>>> ---
>>> Please let me know if my measurements are enough/valid.
>>>
>>>  fs/cifs/cifssmb.c   |  7 +++---
>>>  fs/cifs/connect.c   | 11 ++++++--
>>>  fs/cifs/sess.c      | 18 +++++--------
>>>  fs/cifs/smb2inode.c |  4 +--
>>>  fs/cifs/smb2misc.c  |  2 +-
>>>  fs/cifs/smb2pdu.c   | 61 +++++++++++++++++++--------------------------
>>>  6 files changed, 47 insertions(+), 56 deletions(-)
>>>
>>> diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
>>> index 7aa91e272027..addf3fc62aef 100644
>>> --- a/fs/cifs/cifssmb.c
>>> +++ b/fs/cifs/cifssmb.c
>>> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int 
>>> xid, struct cifs_tcon *pTcon,
>>>                                         remap);
>>>         }
>>>         rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
>>> -       count = 12 /* sizeof(struct set_file_rename) */ + (2 * 
>>> len_of_str);
>>> +       count = sizeof(struct set_file_rename) + (2 * len_of_str);
>>>         byte_count += count;
>>>         pSMB->DataCount = cpu_to_le16(count);
>>>         pSMB->TotalDataCount = pSMB->DataCount;
>>> @@ -2796,7 +2796,7 @@ CIFSSMBQuerySymLink(const unsigned int xid, 
>>> struct cifs_tcon *tcon,
>>>                 cifs_dbg(FYI, "Invalid return data count on get 
>>> reparse info ioctl\n");
>>>                 goto qreparse_out;
>>>         }
>>> -       end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char 
>>> *)&pSMBr->ByteCount;
>>> +       end_of_smb = sizeof(__le16) + get_bcc(&pSMBr->hdr) + (char 
>>> *)&pSMBr->ByteCount;
>>>         reparse_buf = (struct reparse_symlink_data *)
>>>                                 ((char *)&pSMBr->hdr.Protocol + 
>>> data_offset);
>>>         if ((char *)reparse_buf >= end_of_smb) {
>>> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, 
>>> char **ppdata,
>>>         pSMBr = (struct smb_com_ntransact_rsp *)buf;
>>>
>>>         bcc = get_bcc(&pSMBr->hdr);
>>> -       end_of_smb = 2 /* sizeof byte count */ + bcc +
>>> -                       (char *)&pSMBr->ByteCount;
>>> +       end_of_smb = sizeof(__le16) + bcc + (char *)&pSMBr->ByteCount;
>>>
>>>         data_offset = le32_to_cpu(pSMBr->DataOffset);
>>>         data_count = le32_to_cpu(pSMBr->DataCount);
>>> diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
>>> index a0a06b6f252b..389127e21563 100644
>>> --- a/fs/cifs/connect.c
>>> +++ b/fs/cifs/connect.c
>>> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info 
>>> *server)
>>>          * sessinit is sent but no second negprot
>>>          */
>>>         struct rfc1002_session_packet *ses_init_buf;
>>> +       unsigned int req_noscope_len;
>>>         struct smb_hdr *smb_buf;
>>> +
>>>         ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet),
>>>                                GFP_KERNEL);
>>> +
>>>         if (ses_init_buf) {
>>>                 ses_init_buf->trailer.session_req.called_len = 32;
>>>
>>> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info 
>>> *server)
>>>                 ses_init_buf->trailer.session_req.scope2 = 0;
>>>                 smb_buf = (struct smb_hdr *)ses_init_buf;
>>>
>>> -               /* sizeof RFC1002_SESSION_REQUEST with no scope */
>>> -               smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
>>> +               /* sizeof RFC1002_SESSION_REQUEST with no scopes */
>>> +               req_noscope_len = sizeof(struct 
>>> rfc1002_session_packet) - 2;
>>> +
>>> +               /* == cpu_to_be32(0x81000044) */
>>> +               smb_buf->smb_buf_length =
>>> +                       cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | 
>>> req_noscope_len);
>>>                 rc = smb_send(server, smb_buf, 0x44);
>>>                 kfree(ses_init_buf);
>>>                 /*
>>> diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
>>> index 3af3b05b6c74..951874928d70 100644
>>> --- a/fs/cifs/sess.c
>>> +++ b/fs/cifs/sess.c
>>> @@ -601,11 +601,6 @@ static void unicode_ssetup_strings(char 
>>> **pbcc_area, struct cifs_ses *ses,
>>>         /* BB FIXME add check that strings total less
>>>         than 335 or will need to send them as arrays */
>>>
>>> -       /* unicode strings, must be word aligned before the call */
>>> -/*     if ((long) bcc_ptr % 2) {
>>> -               *bcc_ptr = 0;
>>> -               bcc_ptr++;
>>> -       } */
>>>         /* copy user */
>>>         if (ses->user_name == NULL) {
>>>                 /* null user mount */
>>> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>>>         }
>>>
>>>         if (ses->capabilities & CAP_UNICODE) {
>>> -               if (sess_data->iov[0].iov_len % 2) {
>>> +               if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) {
>>>                         *bcc_ptr = 0;
>>>                         bcc_ptr++;
>>>                 }
>>> @@ -1358,7 +1353,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>>>                 /* no string area to decode, do nothing */
>>>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>>>                 /* unicode string area must be word-aligned */
>>> -               if (((unsigned long) bcc_ptr - (unsigned long) 
>>> smb_buf) % 2) {
>>> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned 
>>> long)smb_buf, 2)) {
>>>                         ++bcc_ptr;
>>>                         --bytes_remaining;
>>>                 }
>>> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>>>
>>>         if (ses->capabilities & CAP_UNICODE) {
>>>                 /* unicode strings must be word aligned */
>>> -               if ((sess_data->iov[0].iov_len
>>> -                       + sess_data->iov[1].iov_len) % 2) {
>>> +               if (!IS_ALIGNED(sess_data->iov[0].iov_len + 
>>> sess_data->iov[1].iov_len, 2)) {
>>>                         *bcc_ptr = 0;
>>>                         bcc_ptr++;
>>>                 }
>>> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>>>                 /* no string area to decode, do nothing */
>>>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>>>                 /* unicode string area must be word-aligned */
>>> -               if (((unsigned long) bcc_ptr - (unsigned long) 
>>> smb_buf) % 2) {
>>> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned 
>>> long)smb_buf, 2)) {
>>>                         ++bcc_ptr;
>>>                         --bytes_remaining;
>>>                 }
>>> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct 
>>> sess_data *sess_data)
>>>
>>>         bcc_ptr = sess_data->iov[2].iov_base;
>>>         /* unicode strings must be word aligned */
>>> -       if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 
>>> 2) {
>>> +       if (!IS_ALIGNED(sess_data->iov[0].iov_len + 
>>> sess_data->iov[1].iov_len, 2)) {
>>>                 *bcc_ptr = 0;
>>>                 bcc_ptr++;
>>>         }
>>> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct 
>>> sess_data *sess_data)
>>>                 /* no string area to decode, do nothing */
>>>         } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
>>>                 /* unicode string area must be word-aligned */
>>> -               if (((unsigned long) bcc_ptr - (unsigned long) 
>>> smb_buf) % 2) {
>>> +               if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned 
>>> long)smb_buf, 2)) {
>>>                         ++bcc_ptr;
>>>                         --bytes_remaining;
>>>                 }
>>> diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
>>> index b83f59051b26..4eefbe574b82 100644
>>> --- a/fs/cifs/smb2inode.c
>>> +++ b/fs/cifs/smb2inode.c
>>> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct 
>>> cifs_tcon *tcon,
>>>                 rqst[num_rqst].rq_iov = &vars->si_iov[0];
>>>                 rqst[num_rqst].rq_nvec = 1;
>>>
>>> -               size[0] = 1; /* sizeof __u8 See MS-FSCC section 
>>> 2.4.11 */
>>> +               size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
>>>                 data[0] = &delete_pending[0];
>>>
>>>                 rc = SMB2_set_info_init(tcon, server,
>>> @@ -225,7 +225,7 @@ smb2_compound_op(const unsigned int xid, struct 
>>> cifs_tcon *tcon,
>>>                 rqst[num_rqst].rq_iov = &vars->si_iov[0];
>>>                 rqst[num_rqst].rq_nvec = 1;
>>>
>>> -               size[0] = 8; /* sizeof __le64 */
>>> +               size[0] = sizeof(__le64);
>>>                 data[0] = ptr;
>>>
>>>                 rc = SMB2_set_info_init(tcon, server,
>>> diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
>>> index d73e5672aac4..258b01306d85 100644
>>> --- a/fs/cifs/smb2misc.c
>>> +++ b/fs/cifs/smb2misc.c
>>> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, 
>>> struct TCP_Server_Info *server)
>>>                  * Some windows servers (win2016) will pad also the 
>>> final
>>>                  * PDU in a compound to 8 bytes.
>>>                  */
>>> -               if (((calc_len + 7) & ~7) == len)
>>> +               if (ALIGN(calc_len, 8) == len)
>>>                         return 0;
>>>
>>>                 /*
>>> diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
>>> index 6352ab32c7e7..5da0b596c8a0 100644
>>> --- a/fs/cifs/smb2pdu.c
>>> +++ b/fs/cifs/smb2pdu.c
>>> @@ -466,15 +466,14 @@ build_signing_ctxt(struct 
>>> smb2_signing_capabilities *pneg_ctxt)
>>>         /*
>>>          * Context Data length must be rounded to multiple of 8 for 
>>> some servers
>>>          */
>>> -       pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP(
>>> -                               sizeof(struct 
>>> smb2_signing_capabilities) -
>>> -                               sizeof(struct smb2_neg_context) +
>>> -                               (num_algs * 2 /* sizeof u16 */), 8) * 
>>> 8);
>>> +       pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct 
>>> smb2_signing_capabilities) -
>>> +                                           sizeof(struct 
>>> smb2_neg_context) +
>>> +                                           (num_algs * sizeof(u16)), 
>>> 8));
>>>         pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs);
>>>         pneg_ctxt->SigningAlgorithms[0] = 
>>> cpu_to_le16(SIGNING_ALG_AES_CMAC);
>>>
>>> -       ctxt_len += 2 /* sizeof le16 */ * num_algs;
>>> -       ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8;
>>> +       ctxt_len += sizeof(__le16) * num_algs;
>>> +       ctxt_len = ALIGN(ctxt_len, 8);
>>>         return ctxt_len;
>>>         /* TBD add SIGNING_ALG_AES_GMAC and/or 
>>> SIGNING_ALG_HMAC_SHA256 */
>>>  }
>>> @@ -511,8 +510,7 @@ build_netname_ctxt(struct 
>>> smb2_netname_neg_context *pneg_ctxt, char *hostname)
>>>         /* copy up to max of first 100 bytes of server name to 
>>> NetName field */
>>>         pneg_ctxt->DataLength = cpu_to_le16(2 * 
>>> cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp));
>>>         /* context size is DataLength + minimal smb2_neg_context */
>>> -       return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) +
>>> -                       sizeof(struct smb2_neg_context), 8) * 8;
>>> +       return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + 
>>> sizeof(struct smb2_neg_context), 8);
>>>  }
>>>
>>>  static void
>>> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req 
>>> *req,
>>>          * round up total_len of fixed part of SMB3 negotiate request 
>>> to 8
>>>          * byte boundary before adding negotiate contexts
>>>          */
>>> -       *total_len = roundup(*total_len, 8);
>>> +       *total_len = ALIGN(*total_len, 8);
>>>
>>>         pneg_ctxt = (*total_len) + (char *)req;
>>>         req->NegotiateContextOffset = cpu_to_le32(*total_len);
>>>
>>>         build_preauth_ctxt((struct smb2_preauth_neg_context 
>>> *)pneg_ctxt);
>>> -       ctxt_len = DIV_ROUND_UP(sizeof(struct 
>>> smb2_preauth_neg_context), 8) * 8;
>>> +       ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8);
>>>         *total_len += ctxt_len;
>>>         pneg_ctxt += ctxt_len;
>>>
>>>         build_encrypt_ctxt((struct smb2_encryption_neg_context 
>>> *)pneg_ctxt);
>>> -       ctxt_len = DIV_ROUND_UP(sizeof(struct 
>>> smb2_encryption_neg_context), 8) * 8;
>>> +       ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8);
>>>         *total_len += ctxt_len;
>>>         pneg_ctxt += ctxt_len;
>>>
>>> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req 
>>> *req,
>>>         if (server->compress_algorithm) {
>>>                 build_compression_ctxt((struct 
>>> smb2_compression_capabilities_context *)
>>>                                 pneg_ctxt);
>>> -               ctxt_len = DIV_ROUND_UP(
>>> -                       sizeof(struct 
>>> smb2_compression_capabilities_context),
>>> -                               8) * 8;
>>> +               ctxt_len = ALIGN(sizeof(struct 
>>> smb2_compression_capabilities_context), 8);
>>>                 *total_len += ctxt_len;
>>>                 pneg_ctxt += ctxt_len;
>>>                 neg_context_count++;
>>> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct 
>>> smb2_negotiate_rsp *rsp,
>>>                 if (rc)
>>>                         break;
>>>                 /* offsets must be 8 byte aligned */
>>> -               clen = (clen + 7) & ~0x7;
>>> +               clen = ALIGN(clen, 8);
>>>                 offset += clen + sizeof(struct smb2_neg_context);
>>>                 len_of_ctxts -= clen;
>>>         }
>>> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, 
>>> unsigned int *len)
>>>         unsigned int group_offset = 0;
>>>         struct smb3_acl acl;
>>>
>>> -       *len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct 
>>> cifs_ace) * 4), 8);
>>> +       *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct 
>>> cifs_ace) * 4), 8);
>>>
>>>         if (set_owner) {
>>>                 /* sizeof(struct owner_group_sids) is already 
>>> multiple of 8 so no need to round */
>>> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, 
>>> unsigned int *len)
>>>         memcpy(aclptr, &acl, sizeof(struct smb3_acl));
>>>
>>>         buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
>>> -       *len = roundup(ptr - (__u8 *)buf, 8);
>>> +       *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8);
>>>
>>>         return buf;
>>>  }
>>> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, 
>>> int *out_size, int *out_len,
>>>          * final path needs to be 8-byte aligned as specified in
>>>          * MS-SMB2 2.2.13 SMB2 CREATE Request.
>>>          */
>>> -       *out_size = roundup(*out_len * sizeof(__le16), 8);
>>> +       *out_size = round_up(*out_len * sizeof(__le16), 8);
>>>         *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, 
>>> GFP_KERNEL);
>>>         if (!*out_path)
>>>                 return -ENOMEM;
>>> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int 
>>> xid, struct inode *inode,
>>>                 uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path, 
>>> PATH_MAX)) + 2;
>>>                 /* MUST set path len (NameLength) to 0 opening root 
>>> of share */
>>>                 req->NameLength = cpu_to_le16(uni_path_len - 2);
>>> -               if (uni_path_len % 8 != 0) {
>>> -                       copy_size = roundup(uni_path_len, 8);
>>> -                       copy_path = kzalloc(copy_size, GFP_KERNEL);
>>> -                       if (!copy_path) {
>>> -                               rc = -ENOMEM;
>>> -                               goto err_free_req;
>>> -                       }
>>> -                       memcpy((char *)copy_path, (const char 
>>> *)utf16_path,
>>> -                              uni_path_len);
>>> -                       uni_path_len = copy_size;
>>> -                       /* free before overwriting resource */
>>> -                       kfree(utf16_path);
>>> -                       utf16_path = copy_path;
>>> +               copy_size = round_up(uni_path_len, 8);
>>> +               copy_path = kzalloc(copy_size, GFP_KERNEL);
>>> +               if (!copy_path) {
>>> +                       rc = -ENOMEM;
>>> +                       goto err_free_req;
>>>                 }
>>> +               memcpy((char *)copy_path, (const char *)utf16_path, 
>>> uni_path_len);
>>> +               uni_path_len = copy_size;
>>> +               /* free before overwriting resource */
>>> +               kfree(utf16_path);
>>> +               utf16_path = copy_path;
>>>         }
>>>
>>>         iov[1].iov_len = uni_path_len;
>>> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct 
>>> TCP_Server_Info *server,
>>>                 uni_path_len = (2 * UniStrnlen((wchar_t *)path, 
>>> PATH_MAX)) + 2;
>>>                 /* MUST set path len (NameLength) to 0 opening root 
>>> of share */
>>>                 req->NameLength = cpu_to_le16(uni_path_len - 2);
>>> -               copy_size = uni_path_len;
>>> -               if (copy_size % 8 != 0)
>>> -                       copy_size = roundup(copy_size, 8);
>>> +               copy_size = round_up(uni_path_len, 8);
>>>                 copy_path = kzalloc(copy_size, GFP_KERNEL);
>>>                 if (!copy_path)
>>>                         return -ENOMEM;
>>> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int 
>>> *total_len,
>>>         if (request_type & CHAINED_REQUEST) {
>>>                 if (!(request_type & END_OF_CHAIN)) {
>>>                         /* next 8-byte aligned request */
>>> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
>>> +                       *total_len = ALIGN(*total_len, 8);
>>>                         shdr->NextCommand = cpu_to_le32(*total_len);
>>>                 } else /* END_OF_CHAIN */
>>>                         shdr->NextCommand = 0;
>>> -- 
>>> 2.35.3
>>>
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-06 15:47     ` Tom Talpey
@ 2022-09-06 16:05       ` Enzo Matsumiya
  0 siblings, 0 replies; 10+ messages in thread
From: Enzo Matsumiya @ 2022-09-06 16:05 UTC (permalink / raw)
  To: Tom Talpey; +Cc: ronnie sahlberg, linux-cifs, smfrench, pc, nspmangalore

On 09/06, Tom Talpey wrote:
>In addition to polluting fewer registers and generally being more
>lightweight to frob just four low bits.
>
>I am a bit shocked at the improvement though. It's almost worth a
>fully profiled before/after analysis.
>
>Reviewed-by: Tom Talpey <tom@talpey.com>

Thanks, Tom. (see below)

>>>>Afraid this was a useless micro-optimization, I ran some tests with a
>>>>very light workload, and I observed a ~50% perf improvement already:
>>>>
>>>>Record all cifs.ko functions:
>>>>  # trace-cmd record --module cifs -p function_graph
>>>>
>>>>(test-dir has ~50MB with ~4000 files)
>>>>Test commands after share is mounted and with no activity:
>>>>  # cp -r test-dir /mnt/test
>>>>  # sync
>>>>  # umount /mnt/test
>>>>
>>>>Number of traced functions:
>>>>  # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | wc -l
>>>>- unpatched: 307746
>>>>- patched: 313199
>>>>
>>>>Measuring the average latency of all traced functions:
>>>>  # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | 
>>>>jq -s add/length
>>>>- unpatched: 27105.577791262822 us
>>>>- patched: 14548.665733635338 us
>>>>
>>>>So even though the patched version made 5k+ more function calls (for
>>>>whatever reason), it still did it with ~50% reduced latency.
>>>>
>>>>On a larger scale, given the affected code paths, this patch should
>>>>show a relevant improvement.

So I ran these tests several times to confirm what I was seeing. Again,
these might look like (and actually are) micro-optimizations at first
sight. But the improvement comes from the fact that cifs.ko calls the
affected functions many, many times, even for such a light (50MB copy)
workload.

On those tests of mine, the user-perceived (i.e. wall clock) improvement
showed up as a 0.02 - 0.05 second difference -- still minor for, e.g.,
NAS boxes that do backups once a week, but for 24/7, 90%-loaded
datacenter systems my expectation is that this will count as a real
improvement.


Cheers,

Enzo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-06  1:30 [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up() Enzo Matsumiya
  2022-09-06  2:36 ` ronnie sahlberg
@ 2022-09-07  8:37 ` Aurélien Aptel
  2022-09-07 18:15   ` Enzo Matsumiya
  2022-09-07 20:41   ` Enzo Matsumiya
  1 sibling, 2 replies; 10+ messages in thread
From: Aurélien Aptel @ 2022-09-07  8:37 UTC (permalink / raw)
  To: Enzo Matsumiya; +Cc: linux-cifs, smfrench, pc, ronniesahlberg, nspmangalore

Hi,

Changes are good from a readability standpoint, but like the others
I'm very skeptical about the perf improvement claims.

On Tue, Sep 6, 2022 at 3:34 AM Enzo Matsumiya <ematsumiya@suse.de> wrote:
> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
> if not optimized by the compiler, has the overhead of a multiplication
> and a division. Do the same for roundup() by replacing it by round_up()
> (division-less version, but requires the multiple to be a power of 2,
> which is always the case for us).

Many of these compile to the same division-less instructions
especially if any of the values are known at compile time.
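
For instance, a minimal standalone sketch (macros paraphrased from the
kernel headers; struct and function names made up just for illustration)
where the old and new spellings fold to the same constant at -O2:

#include <stddef.h>

/* paraphrased from the kernel headers */
#define DIV_ROUND_UP(n, d)           (((n) + (d) - 1) / (d))
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define ALIGN(x, a)                  __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)

/* stand-in for a fixed-size on-the-wire struct (6 bytes) */
struct fake_neg_context { unsigned short a, b, c; };

size_t ctxt_len_old(void)
{
    /* DIV_ROUND_UP(6, 8) * 8 == 8, folded at compile time */
    return DIV_ROUND_UP(sizeof(struct fake_neg_context), 8) * 8;
}

size_t ctxt_len_new(void)
{
    /* ALIGN(6, 8) == 8, also folded at compile time */
    return ALIGN(sizeof(struct fake_neg_context), 8);
}

Both compile down to just loading the constant 8, so for those call
sites the change can only help readability, not speed.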

> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,

smb1 and computed at compile time

> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,

smb1

> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)

connect time

> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)

connect time

> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
> -               size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
> +               size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
> -               size[0] = 8; /* sizeof __le64 */
> +               size[0] = sizeof(__le64);

Hot path but no functional change

> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
>                  * Some windows servers (win2016) will pad also the final
>                  * PDU in a compound to 8 bytes.
>                  */
> -               if (((calc_len + 7) & ~7) == len)
> +               if (ALIGN(calc_len, 8) == len)

Hot path but should compile to the same thing

> @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
> @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,

connect time

> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)

only relevant if mounted with some acl flags

> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
> -       *out_size = roundup(*out_len * sizeof(__le16), 8);
> +       *out_size = round_up(*out_len * sizeof(__le16), 8);

Hot path but from my experiments, round_up() compiles to an *extra*
instruction. See below.

> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,

Only relevant on posix mounts.

> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
> -               copy_size = uni_path_len;
> -               if (copy_size % 8 != 0)
> -                       copy_size = roundup(copy_size, 8);
> +               copy_size = round_up(uni_path_len, 8);
> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
> +                       *total_len = ALIGN(*total_len, 8);

These 2 are also hot paths, but skeptical about the optimizations.

I've looked at those macros in Compiler Explorer and sure enough they
compile to the same thing on x86_64.
Even worse, the optimized versions compile with extra instructions for some:

https://godbolt.org/z/z1xhhW9sj

size_t mod2(size_t num) {
    return num%2;
}

size_t is_aligned2(size_t num) {
    return IS_ALIGNED(num, 2);
}

mod2:
        mov     rax, rdi
        and     eax, 1
        ret

is_aligned2:
        mov     rax, rdi
        not     rax  <=== extra
        and     eax, 1
        ret
--------------------------

size_t align8(size_t num) {
    return ALIGN(num, 8);
}

size_t align_andshift8(size_t num) {
    return ((num + 7) & ~7);
}


align8:
        lea     rax, [rdi+7]
        and     rax, -8
        ret
align_andshift8:
        lea     rax, [rdi+7]
        and     rax, -8
        ret

same code
--------------------------

size_t dru8(size_t num) {
    return DIV_ROUND_UP(num, 8)*8;
}

size_t rnup8_a(size_t num) {
    return round_up(num, 8);
}

size_t rnup8_b(size_t num) {
    return roundup(num, 8);
}

dru8:
        lea     rax, [rdi+7]
        and     rax, -8
        ret
rnup8_a:
        lea     rax, [rdi-1]
        or      rax, 7 <==== extra
        add     rax, 1
        ret
rnup8_b:
        lea     rax, [rdi+7]
        and     rax, -8
        ret

round_up has an extra instruction
--------------------------
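
For reference, the relevant definitions, paraphrased here from the
kernel headers (double-check against your tree), show where those extra
instructions come from:

#include <stddef.h>

/* the "== 0" is what becomes the extra "not" in is_aligned2 above;
 * note that num % 2 and IS_ALIGNED(num, 2) have opposite boolean
 * sense, which is why the patch uses !IS_ALIGNED() at the call sites */
#define IS_ALIGNED(x, a)    (((x) & ((__typeof__(x))(a) - 1)) == 0)

/* power-of-2 multiples only; the ((x - 1) | mask) + 1 shape is the
 * extra "or" in rnup8_a above */
#define __round_mask(x, y)  ((__typeof__(x))((y) - 1))
#define round_up(x, y)      ((((x) - 1) | __round_mask(x, y)) + 1)

/* generic divide/multiply version; with a power-of-2 constant the
 * compiler folds it back to the same lea/and as ALIGN() */
#define roundup(x, y)       ((((x) + ((y) - 1)) / (y)) * (y))

size_t k_is_aligned2(size_t num) { return IS_ALIGNED(num, 2); }
size_t k_round_up8(size_t num)   { return round_up(num, 8); }
size_t k_roundup8(size_t num)    { return roundup(num, 8); }

So none of them end up dividing for a constant power-of-2 multiple; the
differences above are just which canonical form gcc happens to emit.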

I suspect the improvements are more likely to be related to caches,
system load, server load, ...
You can try to use perf to make a flamegraph and compare.

Cheers,

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-07  8:37 ` Aurélien Aptel
@ 2022-09-07 18:15   ` Enzo Matsumiya
  2022-09-07 20:41   ` Enzo Matsumiya
  1 sibling, 0 replies; 10+ messages in thread
From: Enzo Matsumiya @ 2022-09-07 18:15 UTC (permalink / raw)
  To: Aurélien Aptel
  Cc: linux-cifs, smfrench, pc, ronniesahlberg, nspmangalore

Hi Aurelien,

On 09/07, Aurélien Aptel wrote:
>Hi,
>
>Changes are good from a readability standpoint, but like the others
>I'm very skeptical about the perf improvement claims.

I also am/was. That's why I ran each test 5 times after sending the
patch. For each run, I made sure the mount point was clean, both server
and client were freshly booted, and, aside from the patch, the build
env/compile options were exactly the same.

The server had no parallel workload at all as it's only a test server.

>Many of these compile to the same division-less instructions
>especially if any of the values are known at compile time.

<snip>

>> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
>> -               copy_size = uni_path_len;
>> -               if (copy_size % 8 != 0)
>> -                       copy_size = roundup(copy_size, 8);
>> +               copy_size = round_up(uni_path_len, 8);
>> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
>> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
>> +                       *total_len = ALIGN(*total_len, 8);
>
>These 2 are also hot paths, but skeptical about the optimizations.

As you point out, SMB2_open_init() was indeed the function with the
greatest improvement and, as per my measurements, the one that actually
impacted the overall latency.

>I've looked at those macros in Compiler Explorer and sure enough they
>compile to the same thing on x86_64.
>Even worse, the optimized versions compile with extra instructions for some:
>
>https://godbolt.org/z/z1xhhW9sj

I did the same comparison on a userspace program and got similar
results, but I didn't bother to check the kernel objects, as testing was
quicker for me. But I'll do it today.

>I suspect the improvements are more likely to be related to caches,
>system load, server load, ...

See above.

>You can try to use perf to make a flamegraph and compare.
>
>Cheers,

Not really used to perf, but will check it out, thanks!


Cheers,

Enzo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-07  8:37 ` Aurélien Aptel
  2022-09-07 18:15   ` Enzo Matsumiya
@ 2022-09-07 20:41   ` Enzo Matsumiya
  2022-09-14  2:50     ` Steve French
  1 sibling, 1 reply; 10+ messages in thread
From: Enzo Matsumiya @ 2022-09-07 20:41 UTC (permalink / raw)
  To: Aurélien Aptel
  Cc: linux-cifs, smfrench, pc, ronniesahlberg, nspmangalore

On 09/07, Aurélien Aptel wrote:
>Hi,
>
>Changes are good from a readability standpoint, but like the others
>I'm very skeptical about the perf improvement claims.

Just to be clear: re-reading the commit message, I realize I might have
sounded a tad sensationalist; the "~50% improvement" was measured from
the average latency of a single copy operation. As I replied to Tom Talpey in
https://lore.kernel.org/linux-cifs/20220906160508.x4ahjzo3tr34w6k5@cyberdelia/
the actual perceptible gain was only 0.02 to 0.05 seconds in my tests.

Pardon me for any confusion.


Enzo

>On Tue, Sep 6, 2022 at 3:34 AM Enzo Matsumiya <ematsumiya@suse.de> wrote:
>> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
>> if not optimized by the compiler, has the overhead of a multiplication
>> and a division. Do the same for roundup() by replacing it by round_up()
>> (division-less version, but requires the multiple to be a power of 2,
>> which is always the case for us).
>
>Many of these compile to the same division-less instructions
>especially if any of the values are known at compile time.
>
>> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
>
>smb1 and computed at compile time
>
>> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
>
>smb1
>
>> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
>> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
>
>connect time
>
>> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
>> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
>> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
>> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
>
>connect time
>
>> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
>> -               size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
>> +               size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
>> -               size[0] = 8; /* sizeof __le64 */
>> +               size[0] = sizeof(__le64);
>
>Hot path but no functional change
>
>> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
>>                  * Some windows servers (win2016) will pad also the final
>>                  * PDU in a compound to 8 bytes.
>>                  */
>> -               if (((calc_len + 7) & ~7) == len)
>> +               if (ALIGN(calc_len, 8) == len)
>
>Hot path but should compile to the same thing
>
>> @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
>> @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
>> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
>> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
>> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
>
>connect time
>
>> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
>> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
>
>only relevant if mounted with some acl flags
>
>> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
>> -       *out_size = roundup(*out_len * sizeof(__le16), 8);
>> +       *out_size = round_up(*out_len * sizeof(__le16), 8);
>
>Hot path but from my experiments, round_up() compiles to an *extra*
>instruction. See below.
>
>> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
>
>Only relevant on posix mounts.
>
>> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
>> -               copy_size = uni_path_len;
>> -               if (copy_size % 8 != 0)
>> -                       copy_size = roundup(copy_size, 8);
>> +               copy_size = round_up(uni_path_len, 8);
>> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
>> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
>> +                       *total_len = ALIGN(*total_len, 8);
>
>These 2 are also hot paths, but skeptical about the optimizations.
>
>I've looked at those macros in Compiler Explorer and sure enough they
>compile to the same thing on x86_64.
>Even worse, the optimized versions compile with extra instructions for some:
>
>https://godbolt.org/z/z1xhhW9sj
>
>size_t mod2(size_t num) {
>    return num%2;
>}
>
>size_t is_aligned2(size_t num) {
>    return IS_ALIGNED(num, 2);
>}
>
>mod2:
>        mov     rax, rdi
>        and     eax, 1
>        ret
>
>is_aligned2:
>        mov     rax, rdi
>        not     rax  <=== extra
>        and     eax, 1
>        ret
>--------------------------
>
>size_t align8(size_t num) {
>    return ALIGN(num, 8);
>}
>
>size_t align_andshift8(size_t num) {
>    return ((num + 7) & ~7);
>}
>
>
>align8:
>        lea     rax, [rdi+7]
>        and     rax, -8
>        ret
>align_andshift8:
>        lea     rax, [rdi+7]
>        and     rax, -8
>        ret
>
>same code
>--------------------------
>
>size_t dru8(size_t num) {
>    return DIV_ROUND_UP(num, 8)*8;
>}
>
>size_t rnup8_a(size_t num) {
>    return round_up(num, 8);
>}
>
>size_t rnup8_b(size_t num) {
>    return roundup(num, 8);
>}
>
>dru8:
>        lea     rax, [rdi+7]
>        and     rax, -8
>        ret
>rnup8_a:
>        lea     rax, [rdi-1]
>        or      rax, 7 <==== extra
>        add     rax, 1
>        ret
>rnup8_b:
>        lea     rax, [rdi+7]
>        and     rax, -8
>        ret
>
>round_up has an extra instruction
>--------------------------
>
>I suspect the improvements are more likely to be related to caches,
>system load, server load, ...
>You can try to use perf to make a flamegraph and compare.
>
>Cheers,

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-07 20:41   ` Enzo Matsumiya
@ 2022-09-14  2:50     ` Steve French
  2022-09-14 14:38       ` Enzo Matsumiya
  0 siblings, 1 reply; 10+ messages in thread
From: Steve French @ 2022-09-14  2:50 UTC (permalink / raw)
  To: Enzo Matsumiya
  Cc: Aurélien Aptel, linux-cifs, pc, ronniesahlberg, nspmangalore

Most of the changes look ok to me, although not much point in changing
the lines like:

     x = 2 /* sizeof(__u16) */ ....
to
     x = sizeof(__u16) ...

not sure that those  particular kinds of changes help enough with
readability (but they make backports harder) - and they have 0 impact
on performance.   So even if that type of change helps readability a
small amount, it could be split out from the things which could help
performance (and/or clearly improve readability).

The one area that looked confusing is this part. Can you explain the
reasoning behind this part of the change?

@@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid,
struct inode *inode,
                uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path,
PATH_MAX)) + 2;
                /* MUST set path len (NameLength) to 0 opening root of share */
                req->NameLength = cpu_to_le16(uni_path_len - 2);
-               if (uni_path_len % 8 != 0) {
-                       copy_size = roundup(uni_path_len, 8);
-                       copy_path = kzalloc(copy_size, GFP_KERNEL);
-                       if (!copy_path) {
-                               rc = -ENOMEM;
-                               goto err_free_req;
-                       }
-                       memcpy((char *)copy_path, (const char *)utf16_path,
-                              uni_path_len);
-                       uni_path_len = copy_size;
-                       /* free before overwriting resource */
-                       kfree(utf16_path);
-                       utf16_path = copy_path;
+               copy_size = round_up(uni_path_len, 8);
+               copy_path = kzalloc(copy_size, GFP_KERNEL);
+               if (!copy_path) {
+                       rc = -ENOMEM;
+                       goto err_free_req;
                }
+               memcpy((char *)copy_path, (const char *)utf16_path,
uni_path_len);
+               uni_path_len = copy_size;
+               /* free before overwriting resource */
+               kfree(utf16_path);
+               utf16_path = copy_path;
        }


On Wed, Sep 7, 2022 at 3:41 PM Enzo Matsumiya <ematsumiya@suse.de> wrote:
>
> On 09/07, Aurélien Aptel wrote:
> >Hi,
> >
> >Changes are good from a readability standpoint, but like the others
> >I'm very skeptical about the perf improvement claims.
>
> Just to be clear: re-reading the commit message, I realize I might have
> sounded a tad sensationalist; the "~50% improvement" was measured from
> the average latency of a single copy operation. As I replied to Tom Talpey in
> https://lore.kernel.org/linux-cifs/20220906160508.x4ahjzo3tr34w6k5@cyberdelia/
> the actual perceptible gain was only 0.02 to 0.05 seconds in my tests.
>
> Pardon me for any confusion.
>
>
> Enzo
>
> >On Tue, Sep 6, 2022 at 3:34 AM Enzo Matsumiya <ematsumiya@suse.de> wrote:
> >> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
> >> if not optimized by the compiler, has the overhead of a multiplication
> >> and a division. Do the same for roundup() by replacing it by round_up()
> >> (division-less version, but requires the multiple to be a power of 2,
> >> which is always the case for us).
> >
> >Many of these compile to the same division-less instructions
> >especially if any of the values are known at compile time.
> >
> >> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
> >
> >smb1 and computed at compile time
> >
> >> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
> >
> >smb1
> >
> >> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
> >> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
> >
> >connect time
> >
> >> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
> >> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
> >> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
> >> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
> >> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
> >
> >connect time
> >
> >> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
> >> -               size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
> >> +               size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
> >> -               size[0] = 8; /* sizeof __le64 */
> >> +               size[0] = sizeof(__le64);
> >
> >Hot path but no functional change
> >
> >> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
> >>                  * Some windows servers (win2016) will pad also the final
> >>                  * PDU in a compound to 8 bytes.
> >>                  */
> >> -               if (((calc_len + 7) & ~7) == len)
> >> +               if (ALIGN(calc_len, 8) == len)
> >
> >Hot path but should compile to the same thing
> >
> >> @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
> >> @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
> >> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
> >> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
> >> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
> >
> >connect time
> >
> >> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
> >> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
> >
> >only relevant if mounted with some acl flags
> >
> >> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
> >> -       *out_size = roundup(*out_len * sizeof(__le16), 8);
> >> +       *out_size = round_up(*out_len * sizeof(__le16), 8);
> >
> >Hot path but from my experiments, round_up() compiles to an *extra*
> >instruction. See below.
> >
> >> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
> >
> >Only relevant on posix mounts.
> >
> >> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
> >> -               copy_size = uni_path_len;
> >> -               if (copy_size % 8 != 0)
> >> -                       copy_size = roundup(copy_size, 8);
> >> +               copy_size = round_up(uni_path_len, 8);
> >> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
> >> -                       *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
> >> +                       *total_len = ALIGN(*total_len, 8);
> >
> >These 2 are also hot paths, but skeptical about the optimizations.
> >
> >I've looked at those macros in Compiler Explorer and sure enough they
> >compile to the same thing on x86_64.
> >Even worse, the optimized versions compile with extra instructions for some:
> >
> >https://godbolt.org/z/z1xhhW9sj
> >
> >size_t mod2(size_t num) {
> >    return num%2;
> >}
> >
> >size_t is_aligned2(size_t num) {
> >    return IS_ALIGNED(num, 2);
> >}
> >
> >mod2:
> >        mov     rax, rdi
> >        and     eax, 1
> >        ret
> >
> >is_aligned2:
> >        mov     rax, rdi
> >        not     rax  <=== extra
> >        and     eax, 1
> >        ret
> >--------------------------
> >
> >size_t align8(size_t num) {
> >    return ALIGN(num, 8);
> >}
> >
> >size_t align_andshift8(size_t num) {
> >    return ((num + 7) & ~7);
> >}
> >
> >
> >align8:
> >        lea     rax, [rdi+7]
> >        and     rax, -8
> >        ret
> >align_andshift8:
> >        lea     rax, [rdi+7]
> >        and     rax, -8
> >        ret
> >
> >same code
> >--------------------------
> >
> >size_t dru8(size_t num) {
> >    return DIV_ROUND_UP(num, 8)*8;
> >}
> >
> >size_t rnup8_a(size_t num) {
> >    return round_up(num, 8);
> >}
> >
> >size_t rnup8_b(size_t num) {
> >    return roundup(num, 8);
> >}
> >
> >dru8:
> >        lea     rax, [rdi+7]
> >        and     rax, -8
> >        ret
> >rnup8_a:
> >        lea     rax, [rdi-1]
> >        or      rax, 7 <==== extra
> >        add     rax, 1
> >        ret
> >rnup8_b:
> >        lea     rax, [rdi+7]
> >        and     rax, -8
> >        ret
> >
> >round_up has an extra instruction
> >--------------------------
> >
> >I suspect the improvements are more likely to be related to caches,
> >system load, server load, ...
> >You can try to use perf to make a flamegraph and compare.
> >
> >Cheers,



-- 
Thanks,

Steve

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up()
  2022-09-14  2:50     ` Steve French
@ 2022-09-14 14:38       ` Enzo Matsumiya
  0 siblings, 0 replies; 10+ messages in thread
From: Enzo Matsumiya @ 2022-09-14 14:38 UTC (permalink / raw)
  To: Steve French
  Cc: Aurélien Aptel, linux-cifs, pc, ronniesahlberg, nspmangalore

On 09/13, Steve French wrote:
>Most of the changes look ok to me, although not much point in changing
>the lines like:
>
>     x = 2 /* sizeof(__u16) */ ....
>to
>     x = sizeof(__u16) ...
>
>not sure that those  particular kinds of changes help enough with
>readability (but they make backports harder) - and they have 0 impact
>on performance.   So even if that type of change helps readability a
>small amount, it could be split out from the things which could help
>performance (and/or clearly improve readability).

Those kinds of changes were not aimed at performance at all. IMHO they
improve readability because they a) remove a hardcoded constant, and b)
remove the need for an inline comment.

I can drop those changes if you'd like.

>The one area that looked confusing is this part. Can you explain the
>reasoning behind this part of the change?
>
>@@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid,
>struct inode *inode,
>                uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path,
>PATH_MAX)) + 2;
>                /* MUST set path len (NameLength) to 0 opening root of share */
>                req->NameLength = cpu_to_le16(uni_path_len - 2);
>-               if (uni_path_len % 8 != 0) {
>-                       copy_size = roundup(uni_path_len, 8);
>-                       copy_path = kzalloc(copy_size, GFP_KERNEL);
>-                       if (!copy_path) {
>-                               rc = -ENOMEM;
>-                               goto err_free_req;
>-                       }
>-                       memcpy((char *)copy_path, (const char *)utf16_path,
>-                              uni_path_len);
>-                       uni_path_len = copy_size;
>-                       /* free before overwriting resource */
>-                       kfree(utf16_path);
>-                       utf16_path = copy_path;
>+               copy_size = round_up(uni_path_len, 8);
>+               copy_path = kzalloc(copy_size, GFP_KERNEL);
>+               if (!copy_path) {
>+                       rc = -ENOMEM;
>+                       goto err_free_req;
>                }
>+               memcpy((char *)copy_path, (const char *)utf16_path,
>uni_path_len);
>+               uni_path_len = copy_size;
>+               /* free before overwriting resource */
>+               kfree(utf16_path);
>+               utf16_path = copy_path;
>        }

This change only removes the check "if (uni_path_len % 8 != 0)", because
round_up() will align uni_path_len if it is unaligned and is a no-op if
it is already aligned; hence there is no need to check first. Unless I
missed something?
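
To illustrate, a quick userspace check (round_up() paraphrased from the
kernel headers; the values are picked arbitrarily):

#include <assert.h>

/* paraphrased from the kernel headers; power-of-2 multiples only */
#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
#define round_up(x, y)     ((((x) - 1) | __round_mask(x, y)) + 1)

int main(void)
{
    /* already 8-byte aligned: round_up() leaves the value untouched */
    assert(round_up(16, 8) == 16);
    /* unaligned: rounded up to the next multiple of 8 */
    assert(round_up(10, 8) == 16);
    return 0;
}

So for an already-aligned path length the rounding itself changes
nothing.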


Cheers,

Enzo

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2022-09-14 14:39 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-06  1:30 [PATCH] cifs: perf improvement - use faster macros ALIGN() and round_up() Enzo Matsumiya
2022-09-06  2:36 ` ronnie sahlberg
2022-09-06 14:41   ` Enzo Matsumiya
2022-09-06 15:47     ` Tom Talpey
2022-09-06 16:05       ` Enzo Matsumiya
2022-09-07  8:37 ` Aurélien Aptel
2022-09-07 18:15   ` Enzo Matsumiya
2022-09-07 20:41   ` Enzo Matsumiya
2022-09-14  2:50     ` Steve French
2022-09-14 14:38       ` Enzo Matsumiya

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).