All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Roger Pau Monné" <roger.pau@citrix.com>
To: Chao Gao <chao.gao@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>, Wei Liu <wei.liu2@citrix.com>,
	Jan Beulich <jbeulich@suse.com>,
	Andrew Cooper <andrew.cooper3@citrix.com>,
	Jun Nakajima <jun.nakajima@intel.com>,
	xen-devel@lists.xenproject.org,
	Thomas Gleixner <tglx@linutronix.de>,
	Borislav Petkov <bp@suse.de>, Ashok Raj <ashok.raj@intel.com>
Subject: Re: [PATCH v4 6/6] x86/microcode: Synchronize late microcode loading
Date: Wed, 28 Nov 2018 16:22:09 +0100	[thread overview]
Message-ID: <20181128152209.hqpgqtuy2mx7ytk6@mac> (raw)
In-Reply-To: <1543383256-12371-7-git-send-email-chao.gao@intel.com>

On Wed, Nov 28, 2018 at 01:34:16PM +0800, Chao Gao wrote:
> This patch ports microcode improvement patches from linux kernel.
> 
> Before you read any further: the early loading method is still the
> preferred one and you should always do that. The following patch is
> improving the late loading mechanism for long running jobs and cloud use
> cases.
> 
> Gather all cores and serialize the microcode update on them by doing it
> one-by-one to make the late update process as reliable as possible and
> avoid potential issues caused by the microcode update.
> 
> Signed-off-by: Chao Gao <chao.gao@intel.com>
> Tested-by: Chao Gao <chao.gao@intel.com>
> [linux commit: a5321aec6412b20b5ad15db2d6b916c05349dbff]
> [linux commit: bb8c13d61a629276a162c1d2b1a20a815cbcfbb7]

If this patch is the squash of two Linux commits, please post the
ported versions of the two commits separately.

> Cc: Kevin Tian <kevin.tian@intel.com>
> Cc: Jun Nakajima <jun.nakajima@intel.com>
> Cc: Ashok Raj <ashok.raj@intel.com>
> Cc: Borislav Petkov <bp@suse.de>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> ---
>  xen/arch/x86/microcode.c | 123 +++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 97 insertions(+), 26 deletions(-)
> 
> diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
> index 0b435f4..d5a2a94 100644
> --- a/xen/arch/x86/microcode.c
> +++ b/xen/arch/x86/microcode.c
> @@ -22,6 +22,7 @@
>   */
>  
>  #include <xen/cpu.h>
> +#include <xen/cpumask.h>
>  #include <xen/lib.h>
>  #include <xen/kernel.h>
>  #include <xen/init.h>
> @@ -30,18 +31,25 @@
>  #include <xen/smp.h>
>  #include <xen/softirq.h>
>  #include <xen/spinlock.h>
> +#include <xen/stop_machine.h>
>  #include <xen/tasklet.h>
>  #include <xen/guest_access.h>
>  #include <xen/earlycpio.h>
> +#include <xen/watchdog.h>
>  
> +#include <asm/delay.h>
>  #include <asm/msr.h>
>  #include <asm/processor.h>
>  #include <asm/setup.h>
>  #include <asm/microcode.h>
>  
> +/* By default, wait for 30000us */
> +#define MICROCODE_DEFAULT_TIMEOUT_US 30000
> +
>  static module_t __initdata ucode_mod;
>  static signed int __initdata ucode_mod_idx;
>  static bool_t __initdata ucode_mod_forced;
> +static unsigned int nr_cores;
>  
>  /*
>   * If we scan the initramfs.cpio for the early microcode code
> @@ -189,8 +197,7 @@ static DEFINE_SPINLOCK(microcode_mutex);
>  DEFINE_PER_CPU(struct ucode_cpu_info, ucode_cpu_info);
>  
>  struct microcode_info {
> -    unsigned int cpu;
> -    int error;
> +    atomic_t cpu_in, cpu_out;

Can you make this variables global to the file and just remove
microcode_info?

>  };
>  
>  static void __microcode_fini_cpu(unsigned int cpu)
> @@ -242,31 +249,62 @@ static int microcode_update_cpu(void)
>      return err;
>  }
>  
> -static long do_microcode_update(void *_info)
> +/* Wait for all CPUs to rendezvous with a timeout (us) */
> +static int wait_for_cpus(atomic_t *cnt, unsigned int timeout)
>  {
> -    struct microcode_info *info = _info;
> -    int error;
> +    unsigned int cpus = num_online_cpus();
>  
> -    BUG_ON(info->cpu != smp_processor_id());
> +    atomic_inc(cnt);
>  
> -    error = microcode_update_cpu();
> -    if ( error )
> -        info->error = error;
> +    while ( atomic_read(cnt) != cpus )
> +    {
> +        if ( timeout <= 0 )
> +        {
> +            printk("Timeout when waiting for CPUs calling in\n");
> +            return -EBUSY;
> +        }
> +        udelay(1);
> +        timeout--;
> +    }
>  
> -    info->cpu = cpumask_next(info->cpu, &cpu_online_map);
> -    if ( info->cpu < nr_cpu_ids )
> -        return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
> +    return 0;
> +}
>  
> -    error = info->error;
> -    xfree(info);
> -    return error;
> +static int do_microcode_update(void *_info)
> +{
> +    struct microcode_info *info = _info;
> +    unsigned int cpu = smp_processor_id();
> +    int ret;
> +
> +    ret = wait_for_cpus(&info->cpu_in, MICROCODE_DEFAULT_TIMEOUT_US);
> +    if ( ret )
> +        return ret;
> +
> +    /*
> +     * Initiate an update on all processors which don't have an online sibling
> +     * thread with a lower thread id. Other sibling threads just await the
> +     * completion of microcode update.
> +     */
> +    if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
> +        ret = microcode_update_cpu();
> +    /*
> +     * Increase the wait timeout to a safe value here since we're serializing
> +     * the microcode update and that could take a while on a large number of
> +     * CPUs. And that is fine as the *actual* timeout will be determined by
> +     * the last CPU finished updating and thus cut short
> +     */
> +    if ( wait_for_cpus(&info->cpu_out, MICROCODE_DEFAULT_TIMEOUT_US *
> +                                       nr_cores) )

Isn't this likely to trigger the watchdog on big systems? Oh I see
below that you disable the watchdog.

> +        panic("Timeout when finishing updating microcode");
> +
> +    return ret;
>  }
>  
>  int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
>  {
>      int ret;
> -    struct microcode_info *info;
>      unsigned int cpu = smp_processor_id();
> +    struct microcode_info *info;
>      struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
>      void * buffer;
>  
> @@ -283,19 +321,20 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
>  
>      ret = copy_from_guest(buffer, buf, len);
>      if ( ret != 0 )
> +        goto free;
> +
> +    /* cpu_online_map must not change during update */
> +    if ( !get_cpu_maps() )
>      {
> -        xfree(info);
> -        return ret;
> +        ret = -EBUSY;
> +        goto free;
>      }
>  
>      if ( microcode_ops->start_update )
>      {
>          ret = microcode_ops->start_update();
>          if ( ret != 0 )
> -        {
> -            xfree(info);
> -            return ret;
> -        }
> +            goto put;
>      }
>  
>      spin_lock(&microcode_mutex);
> @@ -311,13 +350,45 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
>      if ( ret <= 0 )
>      {
>          printk("No valid or newer microcode found. Update abort!\n");
> -        return -EINVAL;
> +        ret = -EINVAL;
> +        goto put;
>      }
>  
> -    info->error = 0;
> -    info->cpu = cpumask_first(&cpu_online_map);
> +    atomic_set(&info->cpu_in, 0);
> +    atomic_set(&info->cpu_out, 0);
> +
> +    /* Calculate the number of online CPU core */
> +    nr_cores = 0;
> +    for_each_online_cpu(cpu)
> +        if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
> +            nr_cores++;
> +
> +    printk("%d cores are to update its microcode\n", nr_cores);
>  
> -    return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
> +    /*
> +     * We intend to disable interrupt for long time, which may lead to
> +     * watchdog timeout.
> +     */
> +    watchdog_disable();
> +    /*
> +     * Late loading dance. Why the heavy-handed stop_machine effort?
> +     *
> +     * - HT siblings must be idle and not execute other code while the other
> +     *   sibling is loading microcode in order to avoid any negative
> +     *   interactions cause by the loading.

Well, the HT siblings will be executing code, since they are in a
while loop waiting for the non-siblings cores to finish updating.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

  reply	other threads:[~2018-11-28 15:27 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-28  5:34 [PATCH v4 0/6] improve late microcode loading Chao Gao
2018-11-28  5:34 ` [PATCH v4 1/6] microcode/intel: extend microcode_update_match() Chao Gao
2018-11-28 10:58   ` Roger Pau Monné
2018-11-29  2:00     ` Chao Gao
2018-11-29  9:14       ` Roger Pau Monné
2018-11-28  5:34 ` [PATCH v4 2/6] microcode: save all microcodes which pass sanity check Chao Gao
2018-11-28 12:00   ` Roger Pau Monné
2018-11-29  2:40     ` Chao Gao
2018-11-29  9:22       ` Roger Pau Monné
2018-11-30  7:55         ` Chao Gao
2018-11-30  9:32           ` Jan Beulich
2019-01-15 15:07             ` Andrew Cooper
2018-12-04 22:39         ` Woods, Brian
2018-12-05  7:38           ` Chao Gao
2018-11-29 10:19       ` Jan Beulich
2019-01-15 15:15         ` Andrew Cooper
2018-11-28  5:34 ` [PATCH v4 3/6] microcode: delete 'mc' field from struct ucode_cpu_info Chao Gao
2018-11-28 12:32   ` Roger Pau Monné
2018-11-28  5:34 ` [PATCH v4 4/6] microcode: don't call apply_microcode() in cpu_request_microcode() Chao Gao
2018-11-28 15:02   ` Roger Pau Monné
2018-11-29  4:28     ` Chao Gao
2018-11-29  9:46       ` Roger Pau Monné
2018-11-30  8:57         ` Chao Gao
2018-11-30  9:38           ` Jan Beulich
2018-11-28  5:34 ` [PATCH v4 5/6] microcode: delete microcode pointer and size from microcode_info Chao Gao
2018-11-28 15:04   ` Roger Pau Monné
2018-11-28  5:34 ` [PATCH v4 6/6] x86/microcode: Synchronize late microcode loading Chao Gao
2018-11-28 15:22   ` Roger Pau Monné [this message]
2018-11-29  4:43     ` Chao Gao
2018-11-29  9:56       ` Roger Pau Monné
2018-11-29 22:43         ` Boris Ostrovsky
2018-11-30  9:46           ` Jan Beulich
2018-11-30 16:49             ` Boris Ostrovsky
2018-11-30  9:01         ` Chao Gao
2019-01-15 15:24           ` Andrew Cooper
2019-01-15 16:24             ` Roger Pau Monné
2018-12-11 17:01   ` Jan Beulich
2018-12-11 18:16     ` Raj, Ashok
2018-12-12  7:26       ` Jan Beulich
2018-12-13  2:10         ` Boris Ostrovsky
2018-12-12  4:53     ` Chao Gao

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181128152209.hqpgqtuy2mx7ytk6@mac \
    --to=roger.pau@citrix.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=ashok.raj@intel.com \
    --cc=bp@suse.de \
    --cc=chao.gao@intel.com \
    --cc=jbeulich@suse.com \
    --cc=jun.nakajima@intel.com \
    --cc=kevin.tian@intel.com \
    --cc=tglx@linutronix.de \
    --cc=wei.liu2@citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.