All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] x86/microcode: Attempt late loading only when new microcode is present
@ 2018-03-14 18:36 Borislav Petkov
  2018-03-14 18:36 ` [PATCH 2/2] x86/microcode: Fix CPU synchronization routine Borislav Petkov
  2018-03-16 20:00 ` [tip:x86/pti] x86/microcode: Attempt late loading only when new microcode is present tip-bot for Borislav Petkov
  0 siblings, 2 replies; 9+ messages in thread
From: Borislav Petkov @ 2018-03-14 18:36 UTC (permalink / raw)
  To: X86 ML; +Cc: Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML

From: Borislav Petkov <bp@suse.de>

Return UCODE_NEW from the scanning functions to denote that new
microcode was found and only then attempt the expensive synchronization
dance.

Reported-and-tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/microcode.h      |  1 +
 arch/x86/kernel/cpu/microcode/amd.c   | 34 +++++++++++++++++++++-------------
 arch/x86/kernel/cpu/microcode/core.c  |  8 +++-----
 arch/x86/kernel/cpu/microcode/intel.c |  4 +++-
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 871714e2e4c6..2b7cc5397f80 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -25,6 +25,7 @@ struct device;
 
 enum ucode_state {
 	UCODE_OK	= 0,
+	UCODE_NEW,
 	UCODE_UPDATED,
 	UCODE_NFOUND,
 	UCODE_ERROR,
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a998e1a7d46f..48179928ff38 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -339,7 +339,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
 		return -EINVAL;
 
 	ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
-	if (ret != UCODE_OK)
+	if (ret > UCODE_UPDATED)
 		return -EINVAL;
 
 	return 0;
@@ -683,27 +683,35 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
 static enum ucode_state
 load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
 {
+	struct ucode_patch *p;
 	enum ucode_state ret;
 
 	/* free old equiv table */
 	free_equiv_cpu_table();
 
 	ret = __load_microcode_amd(family, data, size);
-
-	if (ret != UCODE_OK)
+	if (ret != UCODE_OK) {
 		cleanup();
+		return ret;
+	}
 
-#ifdef CONFIG_X86_32
-	/* save BSP's matching patch for early load */
-	if (save) {
-		struct ucode_patch *p = find_patch(0);
-		if (p) {
-			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
-			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
-							       PATCH_MAX_SIZE));
-		}
+	p = find_patch(0);
+	if (!p) {
+		return ret;
+	} else {
+		if (boot_cpu_data.microcode == p->patch_id)
+			return ret;
+
+		ret = UCODE_NEW;
 	}
-#endif
+
+	/* save BSP's matching patch for early load */
+	if (!save)
+		return ret;
+
+	memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+	memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+
 	return ret;
 }
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 70ecbc8099c9..9f0fe5bb450d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -607,7 +607,7 @@ static ssize_t reload_store(struct device *dev,
 		return size;
 
 	tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
-	if (tmp_ret != UCODE_OK)
+	if (tmp_ret != UCODE_NEW)
 		return size;
 
 	get_online_cpus();
@@ -691,10 +691,8 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
 	if (system_state != SYSTEM_RUNNING)
 		return UCODE_NFOUND;
 
-	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
-						     refresh_fw);
-
-	if (ustate == UCODE_OK) {
+	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
+	if (ustate == UCODE_NEW) {
 		pr_debug("CPU%d updated upon init\n", cpu);
 		apply_microcode_on_target(cpu);
 	}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 2aded9db1d42..32b8e5724f96 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -862,6 +862,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	unsigned int leftover = size;
 	unsigned int curr_mc_size = 0, new_mc_size = 0;
 	unsigned int csig, cpf;
+	enum ucode_state ret = UCODE_OK;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -903,6 +904,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			new_mc  = mc;
 			new_mc_size = mc_size;
 			mc = NULL;	/* trigger new vmalloc */
+			ret = UCODE_NEW;
 		}
 
 		ucode_ptr += mc_size;
@@ -932,7 +934,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
 		 cpu, new_rev, uci->cpu_sig.rev);
 
-	return UCODE_OK;
+	return ret;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)
-- 
2.13.0

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/2] x86/microcode: Fix CPU synchronization routine
  2018-03-14 18:36 [PATCH 1/2] x86/microcode: Attempt late loading only when new microcode is present Borislav Petkov
@ 2018-03-14 18:36 ` Borislav Petkov
  2018-03-15  1:00   ` Henrique de Moraes Holschuh
  2018-03-16 20:01   ` [tip:x86/pti] " tip-bot for Borislav Petkov
  2018-03-16 20:00 ` [tip:x86/pti] x86/microcode: Attempt late loading only when new microcode is present tip-bot for Borislav Petkov
  1 sibling, 2 replies; 9+ messages in thread
From: Borislav Petkov @ 2018-03-14 18:36 UTC (permalink / raw)
  To: X86 ML; +Cc: Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML

From: Borislav Petkov <bp@suse.de>

Emanuel reported an issue with a hang during microcode update because my
dumb idea to use one atomic synchronization variable for both rendezvous
- before and after update - was simply bollocks:

  microcode: microcode_reload_late: late_cpus: 4
  microcode: __reload_late: cpu 2 entered
  microcode: __reload_late: cpu 1 entered
  microcode: __reload_late: cpu 3 entered
  microcode: __reload_late: cpu 0 entered
  microcode: __reload_late: cpu 1 left
  microcode: Timeout while waiting for CPUs rendezvous, remaining: 1

CPU1 above would finish, leave and the others will still spin waiting
for it to join.

So do two synchronization atomics instead. Which makes the code a lot
more straightforward.

Also, since the update is serialized and it also takes a couple of
thousand cycles per microcode engine, increase the exit timeout by the
number of CPUs on the system.

That's ok because the moment all CPUs are done, that timeout will be cut
short.

Furthermore, panic when some of the CPUs timeout when returning from a
microcode update: we can't allow a system with not all cores updated.

Also, as an optimization, do not do the exit sync if microcode wasn't
updated.

Reported-and-tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/cpu/microcode/core.c | 68 ++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9f0fe5bb450d..10c4fc2c91f8 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -517,7 +517,29 @@ static int check_online_cpus(void)
 	return -EINVAL;
 }
 
-static atomic_t late_cpus;
+static atomic_t late_cpus_in;
+static atomic_t late_cpus_out;
+
+static int __wait_for_cpus(atomic_t *t, long long timeout)
+{
+	int all_cpus = num_online_cpus();
+
+	atomic_inc(t);
+
+	while (atomic_read(t) < all_cpus) {
+		if (timeout < SPINUNIT) {
+			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+				all_cpus - atomic_read(t));
+			return 1;
+		}
+
+		ndelay(SPINUNIT);
+		timeout -= SPINUNIT;
+
+		touch_nmi_watchdog();
+	}
+	return 0;
+}
 
 /*
  * Returns:
@@ -527,30 +549,16 @@ static atomic_t late_cpus;
  */
 static int __reload_late(void *info)
 {
-	unsigned int timeout = NSEC_PER_SEC;
-	int all_cpus = num_online_cpus();
 	int cpu = smp_processor_id();
 	enum ucode_state err;
 	int ret = 0;
 
-	atomic_dec(&late_cpus);
-
 	/*
 	 * Wait for all CPUs to arrive. A load will not be attempted unless all
 	 * CPUs show up.
 	 * */
-	while (atomic_read(&late_cpus)) {
-		if (timeout < SPINUNIT) {
-			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
-				atomic_read(&late_cpus));
-			return -1;
-		}
-
-		ndelay(SPINUNIT);
-		timeout -= SPINUNIT;
-
-		touch_nmi_watchdog();
-	}
+	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
+		return -1;
 
 	spin_lock(&update_lock);
 	apply_microcode_local(&err);
@@ -558,15 +566,22 @@ static int __reload_late(void *info)
 
 	if (err > UCODE_NFOUND) {
 		pr_warn("Error reloading microcode on CPU %d\n", cpu);
-		ret = -1;
-	} else if (err == UCODE_UPDATED) {
+		return -1;
+	/* siblings return UCODE_OK because their engine got updated already */
+	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
 		ret = 1;
+	} else {
+		return ret;
 	}
 
-	atomic_inc(&late_cpus);
-
-	while (atomic_read(&late_cpus) != all_cpus)
-		cpu_relax();
+	/*
+	 * Increase the wait timeout to a safe value here since we're
+	 * serializing the microcode update and that could take a while on a
+	 * large number of CPUs. And that is fine as the *actual* timeout will
+	 * be determined by the last CPU finished updating and thus cut short.
+	 */
+	if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+		panic("Timeout during microcode update!\n");
 
 	return ret;
 }
@@ -579,12 +594,11 @@ static int microcode_reload_late(void)
 {
 	int ret;
 
-	atomic_set(&late_cpus, num_online_cpus());
+	atomic_set(&late_cpus_in,  0);
+	atomic_set(&late_cpus_out, 0);
 
 	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0)
+	if (ret > 0)
 		microcode_check();
 
 	return ret;
-- 
2.13.0

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] x86/microcode: Fix CPU synchronization routine
  2018-03-14 18:36 ` [PATCH 2/2] x86/microcode: Fix CPU synchronization routine Borislav Petkov
@ 2018-03-15  1:00   ` Henrique de Moraes Holschuh
  2018-03-15  1:01     ` Borislav Petkov
  2018-03-16 20:01   ` [tip:x86/pti] " tip-bot for Borislav Petkov
  1 sibling, 1 reply; 9+ messages in thread
From: Henrique de Moraes Holschuh @ 2018-03-15  1:00 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: X86 ML, Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML

On Wed, 14 Mar 2018, Borislav Petkov wrote:
> Also, since the update is serialized and it also takes a couple of
> thousand cycles per microcode engine, increase the exit timeout by the
> number of CPUs on the system.

Maybe on AMD it is that fast ;-)

Intel takes anything from twenty thousand cycles to several *million*
cycles per core, proportional to microcode update size.

-- 
  Henrique Holschuh

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] x86/microcode: Fix CPU synchronization routine
  2018-03-15  1:00   ` Henrique de Moraes Holschuh
@ 2018-03-15  1:01     ` Borislav Petkov
  2018-03-15  4:01       ` Henrique de Moraes Holschuh
  0 siblings, 1 reply; 9+ messages in thread
From: Borislav Petkov @ 2018-03-15  1:01 UTC (permalink / raw)
  To: Henrique de Moraes Holschuh
  Cc: X86 ML, Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML

On Wed, Mar 14, 2018 at 10:00:14PM -0300, Henrique de Moraes Holschuh wrote:
> Intel takes anything from twenty thousand cycles to several *million*
> cycles per core, proportional to microcode update size.

Got any hard data to back that claim up?

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] x86/microcode: Fix CPU synchronization routine
  2018-03-15  1:01     ` Borislav Petkov
@ 2018-03-15  4:01       ` Henrique de Moraes Holschuh
  2018-03-15  9:58         ` Borislav Petkov
  0 siblings, 1 reply; 9+ messages in thread
From: Henrique de Moraes Holschuh @ 2018-03-15  4:01 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: X86 ML, Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML

On Thu, 15 Mar 2018, Borislav Petkov wrote:
> On Wed, Mar 14, 2018 at 10:00:14PM -0300, Henrique de Moraes Holschuh wrote:
> > Intel takes anything from twenty thousand cycles to several *million*
> > cycles per core, proportional to microcode update size.
> 
> Got any hard data to back that claim up?

A reasonably well-known paper on intel microcode updates[1] profiled
that very well, years ago (2013).  The information about a linear
increase in update time versus update size comes from that paper (I did
not attempt to reproduce his findings, though).

When I measured my Xeon X5550 workstation doing an early update, the
Xeon took about 1M cycles for the BSP, and 800k cycles for the APs (see
below).

To measure that, as far as I recall I just did a rdtsc right before the
wrmsr, and another right after, and stashed the result somewhere to be
able to print it out later in the BSP's case.  I repeated the process
(by rebooting) a few times.  There was a *lot* of variation, but not
enough to get it wrong by an order of magnitude.

I am surprised that this would be news to you, though.  It is not like I
have been quiet about how expensive these updates are on Intel over the
past years every time I sent you a patch related to this...

Anyway, here's my measurement data from 2013:

Xeon X5550:
microcode_early: CPU0: entire core updated early to revision 0x19, in 1016168 cycles
microcode_early: CPU1: entire core updated early to revision 0x19, in 842264 cycles
microcode_early: CPU2: entire core updated early to revision 0x19, in 846784 cycles
microcode_early: CPU3: entire core updated early to revision 0x19, in 838196 cycles


[1] HAWKES, Ben. "Notes on Intel Microcode Updates", March 2013.

-- 
  Henrique Holschuh

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] x86/microcode: Fix CPU synchronization routine
  2018-03-15  4:01       ` Henrique de Moraes Holschuh
@ 2018-03-15  9:58         ` Borislav Petkov
  2018-03-15 14:54           ` Henrique de Moraes Holschuh
  0 siblings, 1 reply; 9+ messages in thread
From: Borislav Petkov @ 2018-03-15  9:58 UTC (permalink / raw)
  To: Henrique de Moraes Holschuh
  Cc: X86 ML, Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML, Arjan Van De Ven

+ Arjan.

On Thu, Mar 15, 2018 at 01:01:32AM -0300, Henrique de Moraes Holschuh wrote:
> A reasonably well-known paper on intel microcode updates[1] profiled
> that very well, years ago (2013).  The information about a linear
> increase in update time versus update size comes from that paper (I did
> not attempt to reproduce his findings, though).

WTF?

If I read this paper correctly:

http://www.inertiawar.com/microcode/

it is injecting faults and attempting to manipulate some size field -
I'm guessing the encrypted data size. And I'm also guessing that if you
manipulate that size, it would simply take a lot longer to attempt to
decrypt and verify that it is broken microcode and reject it. So it is
not actually a real update - it is just taking a lot longer to reject
it.

Now, I'm talking about genuine microcode updates. And that paper also
claims that they take thousands of cycles.

Now let's look at your previous, hm, "statement":

> Intel takes anything from twenty thousand cycles to several *million*
> cycles per core, proportional to microcode update size.

So you took a fault injection measurement out of context to claim that
*regular* microcode updates take millions of cycles.

So you had to say something - doesn't matter if it is apples and oranges
- as long as it is dramatic. Fuck the truth.

> When I measured my Xeon X5550 workstation doing an early update, the
> Xeon took about 1M cycles for the BSP, and 800k cycles for the APs (see
> below).
>
> To measure that, as far as I recall I just did a rdtsc right before the

RDTSC gets executed speculatively, so you need barriers around it. I
hope you added them.

> wrmsr, and another right after, and stashed the result somewhere to be
> able to print it out later in the BSP's case.  I repeated the process
> (by rebooting) a few times.  There was a *lot* of variation, but not
> enough to get it wrong by an order of magnitude.
> 
> I am surprised that this would be news to you, though.  It is not like I
> have been quiet about how expensive these updates are on Intel over the
> past years every time I sent you a patch related to this...

Frankly, I don't take you seriously. Even less so after this.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 2/2] x86/microcode: Fix CPU synchronization routine
  2018-03-15  9:58         ` Borislav Petkov
@ 2018-03-15 14:54           ` Henrique de Moraes Holschuh
  0 siblings, 0 replies; 9+ messages in thread
From: Henrique de Moraes Holschuh @ 2018-03-15 14:54 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: X86 ML, Emanuel Czirai, Ashok Raj, Tom Lendacky, LKML, Arjan Van De Ven

On Thu, 15 Mar 2018, Borislav Petkov wrote:
> it is injecting faults and attempting to manipulate some size field -
> I'm guessing the encrypted data size. And I'm also guessing that if you
> manipulate that size, it would simply take a lot longer to attempt to
> decrypt and verify that it is broken microcode and reject it. So it is
> not actually a real update - it is just taking a lot longer to reject
> it.

That paper measures the successful updates too (see below).

That the fault injection tests took fewer cycles than the successful
updates did, is not explicitly written in the paper.  IMO, it is implied
by "fig 7" and also by the text nearby and immediately after "fig 8"
(looking at the html version of the paper), though.

> Now, I'm talking about genuine microcode updates. And that paper also
> claims that they take thousands of cycles.

"Observation #4" in the paper is the successful non-fault-injection, i.e.
regular microcode update measurement.  Here are the numbers from the paper
for a regular, successful update on a Core i5 M460 (microcode revisions
were not disclosed by the paper):

Average:  488953 cycles
Std. Dev:  12270 cycles.

What I observed on my Xeon X5550 (signature 0x106a5, microcode update is
~10KiB in size) matches what the paper describes: the Xeon X5550 has
microcode updates that are a little bigger than the ones for the Core i5
M460 (signature 0x20655, microcode update is ~3KiB in size).

This is not a potshot at Intel.  Their microcode update loader does a
lot of work, so it would make sense for it to take a lot of cycles to do
it.  AFAIK, it is doing RSA-2048 decryption in microcode, updating
several internal subsystems as well as the MCRAM, etc.

> Now let's look at your previous, hm, "statement":
> 
> > Intel takes anything from twenty thousand cycles to several *million*
> > cycles per core, proportional to microcode update size.
> 
> So you took a fault injection measurement out of context to claim that
> *regular* microcode updates take millions of cycles.

I tested a *few* iterations of successful, regular microcode updates on
a Xeon X5550, and it matched the magnitude of the number of cycles given
in the paper for a successful, regular microcode update.

I claimed an intel microcode update would take from 20000 to millions of
cycles depending on update size, based on:

1. the paper's description and measurements of how the loader does the
   signature validation;

2. the fact that a 10KiB update took ~800000 cycles on a Xeon X5550 core
   (the BSP, data I measured myself), and that a 3KiB update took ~480000
cycles on average on a Core i5 M460 (data from the inertiawar paper).

Now, this might be comparing two different subspecies of oranges, so I
did try to run my measurement patch at a Core i5-2500 I had access to,
so as to compare *regular*, successful microcode early updates using the
same kernel on two different processors.

I recall the results on the i5-2500 were coherent with the paper's
findings and the results for the Xeon X5550.  Unfortunately, I have
misplaced the results for the i5-2500, or I would have already provided
them.  According to the notes I found, these tests were done circa
august 2014, and I was not exactly considering them to be important
stuff that needs to be documented and archived properly.

> So you had to say something - doesn't matter if it is apples and oranges
> - as long as it is dramatic. Fuck the truth.

I do believe that [intel microcode updates on recent processors take
hundreds of thousands of cycles up to millions of cycles in the worst
case] to be the truth from both my reading of that paper, and my own
simple attempts to verify the time it took for a successful, regular
microcode update.

If I am wrong about this, it will be because I measured things
incorrectly, and that would also be true for the inertiawar paper as far
as I can tell.

I am certainly not doing anything out of malice, or trying to be
dramatic.  Please consider that you presently believe that an Intel
microcode update [on recent processors] takes on the order of thousands
of cycles per core, where I presently believe it takes at least a
hundred times more cycles than that.

Wouldn't that difference in beliefs, regardless of which one is correct,
account for any of my comments related to microcode update cycle cost
appearing to be overly dramatic to you?

It was not my intention to annoy or upset you, and I had no idea we
expected intel microcode updates to have cycle costs that are two to
three orders of magnitude apart, so I never took that into account on
any of my comments.

> > When I measured my Xeon X5550 workstation doing an early update, the
> > Xeon took about 1M cycles for the BSP, and 800k cycles for the APs (see
> > below).
> >
> > To measure that, as far as I recall I just did a rdtsc right before the
> 
> RDTSC gets executed speculatively, so you need barriers around it. I
> hope you added them.

I searched for, and found some old notes I took at the time.  FWIW:

I did not have speculation barriers before the WRMSR that does the
microcode update.  There was a compiler barrier() only, as in
"barrier(); time1 = get_cycles(); WRMSR".  But that WRMSR is supposed to
be a serializing instruction, and it does seem to behave like that.

I did have a compiler barrier() right after the second rdtsc, and a
sync_core() fully serializing barrier a bit later in the code flow, but
before I used any of the rdtsc results.

I did look at the resulting object code at the time, and the rdtsc calls
were at the proper place before and after the wrmsr instruction, as
expected given the use of barrier() to get the compiler to behave.  And
the CPUID implementing sync_core() was between the rdtsc, and the code
that used the result.

From my notes, I did not use local_irq_disable(), since I was
instrumenting the early loader.  That might have skewed the results for
the BSP if interrupts were already being serviced at that point.

The fact that I did not have a sync_core() right after the second rdtsc
might have added some systematic error, but I would not expect it to be
large enough to matter since the noise was already above 10000 cycles.

Sibling hyperthreads were properly skipped by the early loader as being
already up-to-date.  The cpu ordering on that box also ensured the other
hyperthread of a core was still "offline" (in whatever state it was left
by the BIOS -- no UEFI on that box) while the first was doing the
microcode update: CPU 0-3 were the first hyperthreads of cores 0 to 3,
and CPU 4-7 were the second hyperthreads of cores 0 to 3.

-- 
  Henrique Holschuh

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tip:x86/pti] x86/microcode: Attempt late loading only when new microcode is present
  2018-03-14 18:36 [PATCH 1/2] x86/microcode: Attempt late loading only when new microcode is present Borislav Petkov
  2018-03-14 18:36 ` [PATCH 2/2] x86/microcode: Fix CPU synchronization routine Borislav Petkov
@ 2018-03-16 20:00 ` tip-bot for Borislav Petkov
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot for Borislav Petkov @ 2018-03-16 20:00 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, tglx, bp, xftroxgpx, mingo, ashok.raj, hpa,
	thomas.lendacky

Commit-ID:  2613f36ed965d0e5a595a1d931fd3b480e82d6fd
Gitweb:     https://git.kernel.org/tip/2613f36ed965d0e5a595a1d931fd3b480e82d6fd
Author:     Borislav Petkov <bp@suse.de>
AuthorDate: Wed, 14 Mar 2018 19:36:14 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 16 Mar 2018 20:55:51 +0100

x86/microcode: Attempt late loading only when new microcode is present

Return UCODE_NEW from the scanning functions to denote that new microcode
was found and only then attempt the expensive synchronization dance.

Reported-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lkml.kernel.org/r/20180314183615.17629-1-bp@alien8.de

---
 arch/x86/include/asm/microcode.h      |  1 +
 arch/x86/kernel/cpu/microcode/amd.c   | 34 +++++++++++++++++++++-------------
 arch/x86/kernel/cpu/microcode/core.c  |  8 +++-----
 arch/x86/kernel/cpu/microcode/intel.c |  4 +++-
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 7fb1047d61c7..6cf0e4cb7b97 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -39,6 +39,7 @@ struct device;
 
 enum ucode_state {
 	UCODE_OK	= 0,
+	UCODE_NEW,
 	UCODE_UPDATED,
 	UCODE_NFOUND,
 	UCODE_ERROR,
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index a998e1a7d46f..48179928ff38 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -339,7 +339,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
 		return -EINVAL;
 
 	ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
-	if (ret != UCODE_OK)
+	if (ret > UCODE_UPDATED)
 		return -EINVAL;
 
 	return 0;
@@ -683,27 +683,35 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
 static enum ucode_state
 load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
 {
+	struct ucode_patch *p;
 	enum ucode_state ret;
 
 	/* free old equiv table */
 	free_equiv_cpu_table();
 
 	ret = __load_microcode_amd(family, data, size);
-
-	if (ret != UCODE_OK)
+	if (ret != UCODE_OK) {
 		cleanup();
+		return ret;
+	}
 
-#ifdef CONFIG_X86_32
-	/* save BSP's matching patch for early load */
-	if (save) {
-		struct ucode_patch *p = find_patch(0);
-		if (p) {
-			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
-			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
-							       PATCH_MAX_SIZE));
-		}
+	p = find_patch(0);
+	if (!p) {
+		return ret;
+	} else {
+		if (boot_cpu_data.microcode == p->patch_id)
+			return ret;
+
+		ret = UCODE_NEW;
 	}
-#endif
+
+	/* save BSP's matching patch for early load */
+	if (!save)
+		return ret;
+
+	memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+	memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+
 	return ret;
 }
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 70ecbc8099c9..9f0fe5bb450d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -607,7 +607,7 @@ static ssize_t reload_store(struct device *dev,
 		return size;
 
 	tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
-	if (tmp_ret != UCODE_OK)
+	if (tmp_ret != UCODE_NEW)
 		return size;
 
 	get_online_cpus();
@@ -691,10 +691,8 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
 	if (system_state != SYSTEM_RUNNING)
 		return UCODE_NFOUND;
 
-	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
-						     refresh_fw);
-
-	if (ustate == UCODE_OK) {
+	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
+	if (ustate == UCODE_NEW) {
 		pr_debug("CPU%d updated upon init\n", cpu);
 		apply_microcode_on_target(cpu);
 	}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 2aded9db1d42..32b8e5724f96 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -862,6 +862,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	unsigned int leftover = size;
 	unsigned int curr_mc_size = 0, new_mc_size = 0;
 	unsigned int csig, cpf;
+	enum ucode_state ret = UCODE_OK;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -903,6 +904,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			new_mc  = mc;
 			new_mc_size = mc_size;
 			mc = NULL;	/* trigger new vmalloc */
+			ret = UCODE_NEW;
 		}
 
 		ucode_ptr += mc_size;
@@ -932,7 +934,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
 		 cpu, new_rev, uci->cpu_sig.rev);
 
-	return UCODE_OK;
+	return ret;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [tip:x86/pti] x86/microcode: Fix CPU synchronization routine
  2018-03-14 18:36 ` [PATCH 2/2] x86/microcode: Fix CPU synchronization routine Borislav Petkov
  2018-03-15  1:00   ` Henrique de Moraes Holschuh
@ 2018-03-16 20:01   ` tip-bot for Borislav Petkov
  1 sibling, 0 replies; 9+ messages in thread
From: tip-bot for Borislav Petkov @ 2018-03-16 20:01 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: ashok.raj, thomas.lendacky, linux-kernel, tglx, hpa, mingo,
	xftroxgpx, bp

Commit-ID:  bb8c13d61a629276a162c1d2b1a20a815cbcfbb7
Gitweb:     https://git.kernel.org/tip/bb8c13d61a629276a162c1d2b1a20a815cbcfbb7
Author:     Borislav Petkov <bp@suse.de>
AuthorDate: Wed, 14 Mar 2018 19:36:15 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Fri, 16 Mar 2018 20:55:51 +0100

x86/microcode: Fix CPU synchronization routine

Emanuel reported an issue with a hang during microcode update because my
dumb idea to use one atomic synchronization variable for both rendezvous
- before and after update - was simply bollocks:

  microcode: microcode_reload_late: late_cpus: 4
  microcode: __reload_late: cpu 2 entered
  microcode: __reload_late: cpu 1 entered
  microcode: __reload_late: cpu 3 entered
  microcode: __reload_late: cpu 0 entered
  microcode: __reload_late: cpu 1 left
  microcode: Timeout while waiting for CPUs rendezvous, remaining: 1

CPU1 above would finish, leave and the others will still spin waiting for
it to join.

So do two synchronization atomics instead, which makes the code a lot more
straightforward.

Also, since the update is serialized and it also takes quite some time per
microcode engine, increase the exit timeout by the number of CPUs on the
system.

That's ok because the moment all CPUs are done, that timeout will be cut
short.

Furthermore, panic when some of the CPUs timeout when returning from a
microcode update: we can't allow a system with not all cores updated.

Also, as an optimization, do not do the exit sync if microcode wasn't
updated.

Reported-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Emanuel Czirai <xftroxgpx@protonmail.com>
Tested-by: Ashok Raj <ashok.raj@intel.com>
Tested-by: Tom Lendacky <thomas.lendacky@amd.com>
Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@alien8.de

---
 arch/x86/kernel/cpu/microcode/core.c | 68 ++++++++++++++++++++++--------------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 9f0fe5bb450d..10c4fc2c91f8 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -517,7 +517,29 @@ static int check_online_cpus(void)
 	return -EINVAL;
 }
 
-static atomic_t late_cpus;
+static atomic_t late_cpus_in;
+static atomic_t late_cpus_out;
+
+static int __wait_for_cpus(atomic_t *t, long long timeout)
+{
+	int all_cpus = num_online_cpus();
+
+	atomic_inc(t);
+
+	while (atomic_read(t) < all_cpus) {
+		if (timeout < SPINUNIT) {
+			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+				all_cpus - atomic_read(t));
+			return 1;
+		}
+
+		ndelay(SPINUNIT);
+		timeout -= SPINUNIT;
+
+		touch_nmi_watchdog();
+	}
+	return 0;
+}
 
 /*
  * Returns:
@@ -527,30 +549,16 @@ static atomic_t late_cpus;
  */
 static int __reload_late(void *info)
 {
-	unsigned int timeout = NSEC_PER_SEC;
-	int all_cpus = num_online_cpus();
 	int cpu = smp_processor_id();
 	enum ucode_state err;
 	int ret = 0;
 
-	atomic_dec(&late_cpus);
-
 	/*
 	 * Wait for all CPUs to arrive. A load will not be attempted unless all
 	 * CPUs show up.
 	 * */
-	while (atomic_read(&late_cpus)) {
-		if (timeout < SPINUNIT) {
-			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
-				atomic_read(&late_cpus));
-			return -1;
-		}
-
-		ndelay(SPINUNIT);
-		timeout -= SPINUNIT;
-
-		touch_nmi_watchdog();
-	}
+	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
+		return -1;
 
 	spin_lock(&update_lock);
 	apply_microcode_local(&err);
@@ -558,15 +566,22 @@ static int __reload_late(void *info)
 
 	if (err > UCODE_NFOUND) {
 		pr_warn("Error reloading microcode on CPU %d\n", cpu);
-		ret = -1;
-	} else if (err == UCODE_UPDATED) {
+		return -1;
+	/* siblings return UCODE_OK because their engine got updated already */
+	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
 		ret = 1;
+	} else {
+		return ret;
 	}
 
-	atomic_inc(&late_cpus);
-
-	while (atomic_read(&late_cpus) != all_cpus)
-		cpu_relax();
+	/*
+	 * Increase the wait timeout to a safe value here since we're
+	 * serializing the microcode update and that could take a while on a
+	 * large number of CPUs. And that is fine as the *actual* timeout will
+	 * be determined by the last CPU finished updating and thus cut short.
+	 */
+	if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
+		panic("Timeout during microcode update!\n");
 
 	return ret;
 }
@@ -579,12 +594,11 @@ static int microcode_reload_late(void)
 {
 	int ret;
 
-	atomic_set(&late_cpus, num_online_cpus());
+	atomic_set(&late_cpus_in,  0);
+	atomic_set(&late_cpus_out, 0);
 
 	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
-	if (ret < 0)
-		return ret;
-	else if (ret > 0)
+	if (ret > 0)
 		microcode_check();
 
 	return ret;

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2018-03-16 20:01 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-14 18:36 [PATCH 1/2] x86/microcode: Attempt late loading only when new microcode is present Borislav Petkov
2018-03-14 18:36 ` [PATCH 2/2] x86/microcode: Fix CPU synchronization routine Borislav Petkov
2018-03-15  1:00   ` Henrique de Moraes Holschuh
2018-03-15  1:01     ` Borislav Petkov
2018-03-15  4:01       ` Henrique de Moraes Holschuh
2018-03-15  9:58         ` Borislav Petkov
2018-03-15 14:54           ` Henrique de Moraes Holschuh
2018-03-16 20:01   ` [tip:x86/pti] " tip-bot for Borislav Petkov
2018-03-16 20:00 ` [tip:x86/pti] x86/microcode: Attempt late loading only when new microcode is present tip-bot for Borislav Petkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.