From: Uros Bizjak <ubizjak@gmail.com>
To: kvm@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
Sean Christopherson <seanjc@google.com>
Subject: [RFC/RFT PATCH] Introduce try_cmpxchg64 and use it in vmx/posted_intr.c
Date: Wed, 2 Feb 2022 14:15:59 +0100 [thread overview]
Message-ID: <CAFULd4bQa-EkiTc06VysryKRb+zXBTRZFPkEJe7wCUu3RyON+g@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 3067 bytes --]
Hello!
Attached RFC patch introduces try_cmpxchg64 and uses it in
vmx/posted_intr.c. While the resulting code improvements on x86_64 are
somewhat minor (a compare and a move saved), the improvements on x86_32
are quite noticeable. The code around cmpxchg8b improves from:
84: 89 74 24 30 mov %esi,0x30(%esp)
88: 89 fe mov %edi,%esi
8a: 0f b7 0c 02 movzwl (%edx,%eax,1),%ecx
8e: c1 e1 08 shl $0x8,%ecx
91: 0f b7 c9 movzwl %cx,%ecx
94: 89 4c 24 34 mov %ecx,0x34(%esp)
98: 8b 96 24 1e 00 00 mov 0x1e24(%esi),%edx
9e: 8b 86 20 1e 00 00 mov 0x1e20(%esi),%eax
a4: 8b 5c 24 34 mov 0x34(%esp),%ebx
a8: 8b 7c 24 30 mov 0x30(%esp),%edi
ac: 89 44 24 38 mov %eax,0x38(%esp)
b0: 0f b6 44 24 38 movzbl 0x38(%esp),%eax
b5: 8b 4c 24 38 mov 0x38(%esp),%ecx
b9: 89 54 24 3c mov %edx,0x3c(%esp)
bd: 83 e0 fd and $0xfffffffd,%eax
c0: 89 5c 24 64 mov %ebx,0x64(%esp)
c4: 8b 54 24 3c mov 0x3c(%esp),%edx
c8: 89 4c 24 60 mov %ecx,0x60(%esp)
cc: 8b 4c 24 34 mov 0x34(%esp),%ecx
d0: 88 44 24 60 mov %al,0x60(%esp)
d4: 8b 44 24 38 mov 0x38(%esp),%eax
d8: c6 44 24 62 f2 movb $0xf2,0x62(%esp)
dd: 8b 5c 24 60 mov 0x60(%esp),%ebx
e1: f0 0f c7 0f lock cmpxchg8b (%edi)
e5: 89 d1 mov %edx,%ecx
e7: 8b 54 24 3c mov 0x3c(%esp),%edx
eb: 33 44 24 38 xor 0x38(%esp),%eax
ef: 31 ca xor %ecx,%edx
f1: 09 c2 or %eax,%edx
f3: 75 a3 jne 98 <vmx_vcpu_pi_load+0x98>
to:
84: 0f b7 0c 02 movzwl (%edx,%eax,1),%ecx
88: c1 e1 08 shl $0x8,%ecx
8b: 0f b7 c9 movzwl %cx,%ecx
8e: 8b 86 20 1e 00 00 mov 0x1e20(%esi),%eax
94: 8b 96 24 1e 00 00 mov 0x1e24(%esi),%edx
9a: 89 4c 24 64 mov %ecx,0x64(%esp)
9e: 89 c3 mov %eax,%ebx
a0: 89 44 24 60 mov %eax,0x60(%esp)
a4: 83 e3 fd and $0xfffffffd,%ebx
a7: c6 44 24 62 f2 movb $0xf2,0x62(%esp)
ac: 88 5c 24 60 mov %bl,0x60(%esp)
b0: 8b 5c 24 60 mov 0x60(%esp),%ebx
b4: f0 0f c7 0f lock cmpxchg8b (%edi)
b8: 75 d4 jne 8e <vmx_vcpu_pi_load+0x8e>
The patch was only lightly tested, so I would like to ask someone to
spare a few cycles for a thorough test on 64bit and 32bit targets. As
shown above, the try_cmpxchg64 functions should be generally usable
for x86 targets, so I plan to propose a patch introducing these to
the x86 maintainers.
Uros.
[-- Attachment #2: try_cmpxchg-4.diff.txt --]
[-- Type: text/plain, Size: 6896 bytes --]
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 0a7fe0321613..e874ff7f7529 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
#define arch_cmpxchg64_local(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
(unsigned long long)(n)))
+#define arch_try_cmpxchg64(ptr, po, n) \
+ ((__typeof__(*(ptr)))__try_cmpxchg64((ptr), (unsigned long long *)(po), \
+ (unsigned long long)(n)))
#endif
static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new)
@@ -70,6 +73,25 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
return prev;
}
+static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new)
+{
+ bool success;
+ u64 prev;
+ asm volatile(LOCK_PREFIX "cmpxchg8b %2"
+ CC_SET(z)
+ : CC_OUT(z) (success),
+ "=A" (prev),
+ "+m" (*ptr)
+ : "b" ((u32)new),
+ "c" ((u32)(new >> 32)),
+ "1" (*pold)
+ : "memory");
+
+ if (unlikely(!success))
+ *pold = prev;
+ return success;
+}
+
#ifndef CONFIG_X86_CMPXCHG64
/*
* Building a kernel capable running on 80386 and 80486. It may be necessary
@@ -108,6 +130,27 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new)
: "memory"); \
__ret; })
+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ bool success; \
+ __typeof__(*(ptr)) __prev; \
+ __typeof__(ptr) _old = (__typeof__(ptr))(po); \
+ __typeof__(*(ptr)) __old = *_old; \
+ __typeof__(*(ptr)) __new = (n); \
+ alternative_io(LOCK_PREFIX_HERE \
+ "call cmpxchg8b_emu", \
+ "lock; cmpxchg8b (%%esi)" , \
+ X86_FEATURE_CX8, \
+ "=A" (__prev), \
+ "S" ((ptr)), "0" (__old), \
+ "b" ((unsigned int)__new), \
+ "c" ((unsigned int)(__new>>32)) \
+ : "memory"); \
+ success = (__prev == __old); \
+ if (unlikely(!success)) \
+ *_old = __prev; \
+ likely(success); \
+})
#endif
#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8)
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 072e5459fe2f..250187ac8248 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
arch_cmpxchg_local((ptr), (o), (n)); \
})
+#define arch_try_cmpxchg64(ptr, po, n) \
+({ \
+ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
+ arch_try_cmpxchg((ptr), (po), (n)); \
+})
+
#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16)
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index aa1fe9085d77..5ce185caa92c 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -34,7 +34,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
-static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
{
/*
* PID.ON can be set at any time by a different vCPU or by hardware,
@@ -42,7 +42,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
* update must be retried with a fresh snapshot an ON change causes
* the cmpxchg to fail.
*/
- if (cmpxchg64(&pi_desc->control, old, new) != old)
+ if (!try_cmpxchg64(&pi_desc->control, pold, new))
return -EBUSY;
return 0;
@@ -111,7 +111,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* descriptor was modified on "put" to use the wakeup vector.
*/
new.nv = POSTED_INTR_VECTOR;
- } while (pi_try_set_control(pi_desc, old.control, new.control));
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
local_irq_restore(flags);
@@ -161,7 +161,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (pi_try_set_control(pi_desc, old.control, new.control));
+ } while (pi_try_set_control(pi_desc, &old.control, new.control));
/*
* Send a wakeup IPI to this CPU if an interrupt may have been posted
diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h
index 5d69b143c28e..7a139ec030b0 100644
--- a/include/linux/atomic/atomic-instrumented.h
+++ b/include/linux/atomic/atomic-instrumented.h
@@ -2006,6 +2006,44 @@ atomic_long_dec_if_positive(atomic_long_t *v)
arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})
+#define try_cmpxchg64(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ kcsan_mb(); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_acquire(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_release(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ kcsan_release(); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
+#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
+({ \
+ typeof(ptr) __ai_ptr = (ptr); \
+ typeof(oldp) __ai_oldp = (oldp); \
+ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \
+ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \
+ arch_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
+})
+
#define cmpxchg_local(ptr, ...) \
({ \
typeof(ptr) __ai_ptr = (ptr); \
@@ -2045,4 +2083,4 @@ atomic_long_dec_if_positive(atomic_long_t *v)
})
#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
-// 87c974b93032afd42143613434d1a7788fa598f9
+// 764f741eb77a7ad565dc8d99ce2837d5542e8aee
diff --git a/scripts/atomic/gen-atomic-instrumented.sh b/scripts/atomic/gen-atomic-instrumented.sh
index 68f902731d01..77c06526a574 100755
--- a/scripts/atomic/gen-atomic-instrumented.sh
+++ b/scripts/atomic/gen-atomic-instrumented.sh
@@ -166,7 +166,7 @@ grep '^[a-z]' "$1" | while read name meta args; do
done
-for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg"; do
+for xchg in "xchg" "cmpxchg" "cmpxchg64" "try_cmpxchg" "try_cmpxchg64"; do
for order in "" "_acquire" "_release" "_relaxed"; do
gen_xchg "${xchg}" "${order}" ""
printf "\n"
reply other threads:[~2022-02-02 13:16 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAFULd4bQa-EkiTc06VysryKRb+zXBTRZFPkEJe7wCUu3RyON+g@mail.gmail.com \
--to=ubizjak@gmail.com \
--cc=kvm@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=seanjc@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.