All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
@ 2011-05-08 11:23 Liu, Jinsong
  2011-05-13 14:34 ` Liu, Jinsong
  0 siblings, 1 reply; 5+ messages in thread
From: Liu, Jinsong @ 2011-05-08 11:23 UTC (permalink / raw)
  To: Keir Fraser, xen-devel; +Cc: Jiang, Yunhong, Li, Xin

[-- Attachment #1: Type: text/plain, Size: 11053 bytes --]

MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension

This patch simplify mce_action(). Basically the 2 set of mce status flags MCA_... and MCER_... are highly functional similar. This patch remove the redundant middle-level flag MCA_..., and its related data structure and function, so that mce_action() logic is more clean and simpler.
This patch also update mem err handler. intel_memerr_dhandler() will be commonly used by SRAO and SRAR, so for the extension of future SRAR case, this patch remove the default fail flag from intel_memerr_dhandler() outside to SRAO/SRAR level, since only SRAO/SRAR handler itself has the knowledge of how to handle the failure when mem err handler fail.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>

diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun May 08 13:39:40 2011 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun May 08 18:15:13 2011 +0800
@@ -172,23 +172,14 @@ static unsigned int __read_mostly mce_dh
 static unsigned int __read_mostly mce_dhandler_num;
 static unsigned int __read_mostly mce_uhandler_num;
 
-enum mce_result
-{
-    MCER_NOERROR,
-    MCER_RECOVERED,
-    /* Not recoverd, but can continue */
-    MCER_CONTINUE,
-    MCER_RESET,
-};
-
 /* Maybe called in MCE context, no lock, no printk */
 static enum mce_result mce_action(struct cpu_user_regs *regs,
                       mctelem_cookie_t mctc)
 {
     struct mc_info *local_mi;
-    enum mce_result ret = MCER_NOERROR;
+    enum mce_result bank_result = MCER_NOERROR;
+    enum mce_result worst_result = MCER_NOERROR;
     struct mcinfo_common *mic = NULL;
-    struct mca_handle_result mca_res;
     struct mca_binfo binfo;
     const struct mca_error_handler *handlers = mce_dhandlers;
     unsigned int i, handler_num = mce_dhandler_num;
@@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
     /* Processing bank information */
     x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
 
-    for ( ; ret != MCER_RESET && mic && mic->size;
+    for ( ; bank_result != MCER_RESET && mic && mic->size;
           mic = x86_mcinfo_next(mic) )
     {
         if (mic->type != MC_TYPE_BANK) {
@@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
         }
         binfo.mib = (struct mcinfo_bank*)mic;
         binfo.bank = binfo.mib->mc_bank;
-        memset(&mca_res, 0x0f, sizeof(mca_res));
+        bank_result = MCER_NOERROR;
         for ( i = 0; i < handler_num; i++ ) {
             if (handlers[i].owned_error(binfo.mib->mc_status))
             {
-                handlers[i].recovery_handler(&binfo, &mca_res);
-
-                if (mca_res.result & MCA_OWNER)
-                    binfo.mib->mc_domid = mca_res.owner;
-
-                if (mca_res.result == MCA_NEED_RESET)
-                    ret = MCER_RESET;
-                else if (mca_res.result == MCA_RECOVERED)
-                {
-                    if (ret < MCER_RECOVERED)
-                        ret = MCER_RECOVERED;
-                }
-                else if (mca_res.result == MCA_NO_ACTION)
-                {
-                    if (ret < MCER_CONTINUE)
-                        ret = MCER_CONTINUE;
-                }
+                handlers[i].recovery_handler(&binfo, &bank_result);
+                if (worst_result < bank_result)
+                    worst_result = bank_result;
                 break;
             }
         }
         ASSERT(i != handler_num);
     }
 
-    return ret;
+    return worst_result;
 }
 
 /*
@@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
 
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     struct mcinfo_bank *bank = binfo->mib;
     struct mcinfo_global *global = binfo->mig;
@@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
     uint64_t mc_status, mc_misc;
 
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
-    result->result = MCA_NEED_RESET;
 
     mc_status = bank->mc_status;
     mc_misc = bank->mc_misc;
@@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
         !(mc_status & MCi_STATUS_MISCV) ||
         ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
     {
-        result->result |= MCA_NO_ACTION;
         dprintk(XENLOG_WARNING,
             "No physical address provided for memory error\n");
         return;
@@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
 
     /* This is free page */
     if (status & PG_OFFLINE_OFFLINED)
-        result->result = MCA_RECOVERED;
+        *result = MCER_RECOVERED;
     else if (status & PG_OFFLINE_PENDING) {
         /* This page has owner */
         if (status & PG_OFFLINE_OWNED) {
-            result->result |= MCA_OWNER;
-            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
+            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
             mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-              " by DOM %d\n", result->owner);
-            /* Fill vMCE# injection and vMCE# MSR virtualization "
-             * "related data */
-            bank->mc_domid = result->owner;
+              " by DOM %d\n", bank->mc_domid);
             /* XXX: Cannot handle shared pages yet 
              * (this should identify all domains and gfn mapping to
              *  the mfn in question) */
-            BUG_ON( result->owner == DOMID_COW );
-            if ( result->owner != DOMID_XEN ) {
-                d = get_domain_by_id(result->owner);
+            BUG_ON( bank->mc_domid == DOMID_COW );
+            if ( bank->mc_domid != DOMID_XEN ) {
+                d = get_domain_by_id(bank->mc_domid);
                 ASSERT(d);
                 gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
 
@@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
                       global->mc_gstatus) == -1 )
                 {
                     mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
-                      "failed\n", result->owner);
+                      "failed\n", bank->mc_domid);
                     goto vmce_failed;
                 }
 
@@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
                  * For xen, it has contained the error and finished
                  * its own recovery job.
                  */
-                result->result = MCA_RECOVERED;
+                *result = MCER_RECOVERED;
                 put_domain(d);
 
                 return;
@@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
 
 static void intel_srao_dhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     uint64_t status = binfo->mib->mc_status;
 
     /* For unknown srao error code, no action required */
+    *result = MCER_CONTINUE;
+
     if ( status & MCi_STATUS_VAL )
     {
         switch ( status & INTEL_MCCOD_MASK )
@@ -744,7 +717,7 @@ static int intel_default_check(uint64_t 
 
 static void intel_default_mce_dhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
     type = intel_check_mce_type(status);
 
     if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
-        result->result = MCA_NEED_RESET;
-    else if (type == intel_mce_ucr_srao)
-        result->result = MCA_NO_ACTION;
+        *result = MCER_RESET;
+    else
+        *result = MCER_CONTINUE;
 }
 
 static const struct mca_error_handler intel_mce_dhandlers[] = {
@@ -764,7 +737,7 @@ static const struct mca_error_handler in
 
 static void intel_default_mce_uhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
     /* Panic if no handler for SRAR error */
     case intel_mce_ucr_srar:
     case intel_mce_fatal:
-        result->result = MCA_NEED_RESET;
+        *result = MCER_RESET;
         break;
     default:
-        result->result = MCA_NO_ACTION;
+        *result = MCER_CONTINUE;
         break;
     }
 }
diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun May 08 13:39:40 2011 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun May 08 18:15:13 2011 +0800
@@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
 void mcabanks_free(struct mca_banks *banks);
 extern struct mca_banks *mca_allbanks;
 
-/* Below interfaces are defined for MCA internal processing:
- * a. pre_handler will be called early in MCA ISR context, mainly for early
- *    need_reset detection for avoiding log missing. Also, it is used to judge
- *    impacted DOMAIN if possible.
- * b. mca_error_handler is actually a (error_action_index,
- *    recovery_hanlder pointer) pair. The defined recovery_handler
- *    performs the actual recovery operations such as page_offline, cpu_offline
- *    in softIRQ context when the per_bank MCA error matching the corresponding
- *    mca_code index. If pre_handler can't judge the impacted domain,
- *    recovery_handler must figure it out.
-*/
-
-/* MCA error has been recovered successfully by the recovery action*/
-#define MCA_RECOVERED (0x1 << 0)
-/* MCA error impact the specified DOMAIN in owner field below */
-#define MCA_OWNER (0x1 << 1)
-/* MCA error can't be recovered and need reset */
-#define MCA_NEED_RESET (0x1 << 2)
-/* MCA error did not have any action yet */
-#define MCA_NO_ACTION (0x1 << 3)
-
-struct mca_handle_result
-{
-    uint32_t result;
-    /* Used one result & MCA_OWNER */
-    domid_t owner;
-    /* Used by mca_error_handler, result & MCA_RECOVRED */
-    struct recovery_action *action;
-};
-
 /*Keep bank so that we can get staus even if mib is NULL */
 struct mca_binfo {
     int bank;
@@ -163,8 +133,14 @@ struct mca_binfo {
     struct cpu_user_regs *regs;
 };
 
-extern void (*mca_prehandler)( struct cpu_user_regs *regs,
-                        struct mca_handle_result *result);
+enum mce_result
+{
+    MCER_NOERROR,
+    MCER_RECOVERED,
+    /* Not recoverd, but can continue */
+    MCER_CONTINUE,
+    MCER_RESET,
+};
 
 struct mca_error_handler
 {
@@ -175,7 +151,7 @@ struct mca_error_handler
     */
     int (*owned_error)(uint64_t status);
     void (*recovery_handler)(struct mca_binfo *binfo,
-                    struct mca_handle_result *result);
+                    enum mce_result *result);
 };
 
 /* Global variables */

[-- Attachment #2: mca-cleanup-7.patch --]
[-- Type: application/octet-stream, Size: 10772 bytes --]

MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension

This patch simplify mce_action(). Basically the 2 set of mce status flags MCA_... and MCER_... are highly functional similar. This patch remove the redundant middle-level flag MCA_..., and its related data structure and function, so that mce_action() logic is more clean and simpler.
This patch also update mem err handler. intel_memerr_dhandler() will be commonly used by SRAO and SRAR, so for the extension of future SRAR case, this patch remove the default fail flag from intel_memerr_dhandler() outside to SRAO/SRAR level, since only SRAO/SRAR handler itself has the knowledge of how to handle the failure when mem err handler fail.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>

diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun May 08 13:39:40 2011 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun May 08 18:15:13 2011 +0800
@@ -172,23 +172,14 @@ static unsigned int __read_mostly mce_dh
 static unsigned int __read_mostly mce_dhandler_num;
 static unsigned int __read_mostly mce_uhandler_num;
 
-enum mce_result
-{
-    MCER_NOERROR,
-    MCER_RECOVERED,
-    /* Not recoverd, but can continue */
-    MCER_CONTINUE,
-    MCER_RESET,
-};
-
 /* Maybe called in MCE context, no lock, no printk */
 static enum mce_result mce_action(struct cpu_user_regs *regs,
                       mctelem_cookie_t mctc)
 {
     struct mc_info *local_mi;
-    enum mce_result ret = MCER_NOERROR;
+    enum mce_result bank_result = MCER_NOERROR;
+    enum mce_result worst_result = MCER_NOERROR;
     struct mcinfo_common *mic = NULL;
-    struct mca_handle_result mca_res;
     struct mca_binfo binfo;
     const struct mca_error_handler *handlers = mce_dhandlers;
     unsigned int i, handler_num = mce_dhandler_num;
@@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
     /* Processing bank information */
     x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
 
-    for ( ; ret != MCER_RESET && mic && mic->size;
+    for ( ; bank_result != MCER_RESET && mic && mic->size;
           mic = x86_mcinfo_next(mic) )
     {
         if (mic->type != MC_TYPE_BANK) {
@@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
         }
         binfo.mib = (struct mcinfo_bank*)mic;
         binfo.bank = binfo.mib->mc_bank;
-        memset(&mca_res, 0x0f, sizeof(mca_res));
+        bank_result = MCER_NOERROR;
         for ( i = 0; i < handler_num; i++ ) {
             if (handlers[i].owned_error(binfo.mib->mc_status))
             {
-                handlers[i].recovery_handler(&binfo, &mca_res);
-
-                if (mca_res.result & MCA_OWNER)
-                    binfo.mib->mc_domid = mca_res.owner;
-
-                if (mca_res.result == MCA_NEED_RESET)
-                    ret = MCER_RESET;
-                else if (mca_res.result == MCA_RECOVERED)
-                {
-                    if (ret < MCER_RECOVERED)
-                        ret = MCER_RECOVERED;
-                }
-                else if (mca_res.result == MCA_NO_ACTION)
-                {
-                    if (ret < MCER_CONTINUE)
-                        ret = MCER_CONTINUE;
-                }
+                handlers[i].recovery_handler(&binfo, &bank_result);
+                if (worst_result < bank_result)
+                    worst_result = bank_result;
                 break;
             }
         }
         ASSERT(i != handler_num);
     }
 
-    return ret;
+    return worst_result;
 }
 
 /*
@@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
 
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     struct mcinfo_bank *bank = binfo->mib;
     struct mcinfo_global *global = binfo->mig;
@@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
     uint64_t mc_status, mc_misc;
 
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
-    result->result = MCA_NEED_RESET;
 
     mc_status = bank->mc_status;
     mc_misc = bank->mc_misc;
@@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
         !(mc_status & MCi_STATUS_MISCV) ||
         ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
     {
-        result->result |= MCA_NO_ACTION;
         dprintk(XENLOG_WARNING,
             "No physical address provided for memory error\n");
         return;
@@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
 
     /* This is free page */
     if (status & PG_OFFLINE_OFFLINED)
-        result->result = MCA_RECOVERED;
+        *result = MCER_RECOVERED;
     else if (status & PG_OFFLINE_PENDING) {
         /* This page has owner */
         if (status & PG_OFFLINE_OWNED) {
-            result->result |= MCA_OWNER;
-            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
+            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
             mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-              " by DOM %d\n", result->owner);
-            /* Fill vMCE# injection and vMCE# MSR virtualization "
-             * "related data */
-            bank->mc_domid = result->owner;
+              " by DOM %d\n", bank->mc_domid);
             /* XXX: Cannot handle shared pages yet 
              * (this should identify all domains and gfn mapping to
              *  the mfn in question) */
-            BUG_ON( result->owner == DOMID_COW );
-            if ( result->owner != DOMID_XEN ) {
-                d = get_domain_by_id(result->owner);
+            BUG_ON( bank->mc_domid == DOMID_COW );
+            if ( bank->mc_domid != DOMID_XEN ) {
+                d = get_domain_by_id(bank->mc_domid);
                 ASSERT(d);
                 gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
 
@@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
                       global->mc_gstatus) == -1 )
                 {
                     mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
-                      "failed\n", result->owner);
+                      "failed\n", bank->mc_domid);
                     goto vmce_failed;
                 }
 
@@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
                  * For xen, it has contained the error and finished
                  * its own recovery job.
                  */
-                result->result = MCA_RECOVERED;
+                *result = MCER_RECOVERED;
                 put_domain(d);
 
                 return;
@@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
 
 static void intel_srao_dhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     uint64_t status = binfo->mib->mc_status;
 
     /* For unknown srao error code, no action required */
+    *result = MCER_CONTINUE;
+
     if ( status & MCi_STATUS_VAL )
     {
         switch ( status & INTEL_MCCOD_MASK )
@@ -744,7 +717,7 @@ static int intel_default_check(uint64_t 
 
 static void intel_default_mce_dhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
     type = intel_check_mce_type(status);
 
     if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
-        result->result = MCA_NEED_RESET;
-    else if (type == intel_mce_ucr_srao)
-        result->result = MCA_NO_ACTION;
+        *result = MCER_RESET;
+    else
+        *result = MCER_CONTINUE;
 }
 
 static const struct mca_error_handler intel_mce_dhandlers[] = {
@@ -764,7 +737,7 @@ static const struct mca_error_handler in
 
 static void intel_default_mce_uhandler(
              struct mca_binfo *binfo,
-             struct mca_handle_result *result)
+             enum mce_result *result)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
     /* Panic if no handler for SRAR error */
     case intel_mce_ucr_srar:
     case intel_mce_fatal:
-        result->result = MCA_NEED_RESET;
+        *result = MCER_RESET;
         break;
     default:
-        result->result = MCA_NO_ACTION;
+        *result = MCER_CONTINUE;
         break;
     }
 }
diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun May 08 13:39:40 2011 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun May 08 18:15:13 2011 +0800
@@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
 void mcabanks_free(struct mca_banks *banks);
 extern struct mca_banks *mca_allbanks;
 
-/* Below interfaces are defined for MCA internal processing:
- * a. pre_handler will be called early in MCA ISR context, mainly for early
- *    need_reset detection for avoiding log missing. Also, it is used to judge
- *    impacted DOMAIN if possible.
- * b. mca_error_handler is actually a (error_action_index,
- *    recovery_hanlder pointer) pair. The defined recovery_handler
- *    performs the actual recovery operations such as page_offline, cpu_offline
- *    in softIRQ context when the per_bank MCA error matching the corresponding
- *    mca_code index. If pre_handler can't judge the impacted domain,
- *    recovery_handler must figure it out.
-*/
-
-/* MCA error has been recovered successfully by the recovery action*/
-#define MCA_RECOVERED (0x1 << 0)
-/* MCA error impact the specified DOMAIN in owner field below */
-#define MCA_OWNER (0x1 << 1)
-/* MCA error can't be recovered and need reset */
-#define MCA_NEED_RESET (0x1 << 2)
-/* MCA error did not have any action yet */
-#define MCA_NO_ACTION (0x1 << 3)
-
-struct mca_handle_result
-{
-    uint32_t result;
-    /* Used one result & MCA_OWNER */
-    domid_t owner;
-    /* Used by mca_error_handler, result & MCA_RECOVRED */
-    struct recovery_action *action;
-};
-
 /*Keep bank so that we can get staus even if mib is NULL */
 struct mca_binfo {
     int bank;
@@ -163,8 +133,14 @@ struct mca_binfo {
     struct cpu_user_regs *regs;
 };
 
-extern void (*mca_prehandler)( struct cpu_user_regs *regs,
-                        struct mca_handle_result *result);
+enum mce_result
+{
+    MCER_NOERROR,
+    MCER_RECOVERED,
+    /* Not recoverd, but can continue */
+    MCER_CONTINUE,
+    MCER_RESET,
+};
 
 struct mca_error_handler
 {
@@ -175,7 +151,7 @@ struct mca_error_handler
     */
     int (*owned_error)(uint64_t status);
     void (*recovery_handler)(struct mca_binfo *binfo,
-                    struct mca_handle_result *result);
+                    enum mce_result *result);
 };
 
 /* Global variables */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
  2011-05-08 11:23 [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension Liu, Jinsong
@ 2011-05-13 14:34 ` Liu, Jinsong
  2011-05-14  9:45   ` Keir Fraser
  0 siblings, 1 reply; 5+ messages in thread
From: Liu, Jinsong @ 2011-05-13 14:34 UTC (permalink / raw)
  To: Jan Beulich, Keir Fraser, xen-devel; +Cc: Jiang, Yunhong, Li, Xin

Jan and Keir,

Any comments?

Thanks
Jinsong


Liu, Jinsong wrote:
> MCA: simplify mce actoin, and some update of mem err handler for
> future SRAR extension 
> 
> This patch simplify mce_action(). Basically the 2 set of mce status
> flags MCA_... and MCER_... are highly functional similar. This patch
> remove the redundant middle-level flag MCA_..., and its related data
> structure and function, so that mce_action() logic is more clean and
> simpler.    
> This patch also update mem err handler. intel_memerr_dhandler() will
> be commonly used by SRAO and SRAR, so for the extension of future
> SRAR case, this patch remove the default fail flag from
> intel_memerr_dhandler() outside to SRAO/SRAR level, since only
> SRAO/SRAR handler itself has the knowledge of how to handle the
> failure when mem err handler fail.     
> 
> Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
> 
> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun May 08 13:39:40 2011
> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c	Sun May 08 18:15:13
> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>  static unsigned int __read_mostly mce_uhandler_num;
> 
> -enum mce_result
> -{
> -    MCER_NOERROR,
> -    MCER_RECOVERED,
> -    /* Not recoverd, but can continue */
> -    MCER_CONTINUE,
> -    MCER_RESET,
> -};
> -
>  /* Maybe called in MCE context, no lock, no printk */
>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>                        mctelem_cookie_t mctc)
>  {
>      struct mc_info *local_mi;
> -    enum mce_result ret = MCER_NOERROR;
> +    enum mce_result bank_result = MCER_NOERROR;
> +    enum mce_result worst_result = MCER_NOERROR;
>      struct mcinfo_common *mic = NULL;
> -    struct mca_handle_result mca_res;
>      struct mca_binfo binfo;
>      const struct mca_error_handler *handlers = mce_dhandlers;
>      unsigned int i, handler_num = mce_dhandler_num;
> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>      /* Processing bank information */
>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
> 
> -    for ( ; ret != MCER_RESET && mic && mic->size;
> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>            mic = x86_mcinfo_next(mic) )
>      {
>          if (mic->type != MC_TYPE_BANK) {
> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
>          }
>          binfo.mib = (struct mcinfo_bank*)mic;
>          binfo.bank = binfo.mib->mc_bank;
> -        memset(&mca_res, 0x0f, sizeof(mca_res));
> +        bank_result = MCER_NOERROR;
>          for ( i = 0; i < handler_num; i++ ) {
>              if (handlers[i].owned_error(binfo.mib->mc_status))
>              {
> -                handlers[i].recovery_handler(&binfo, &mca_res);
> -
> -                if (mca_res.result & MCA_OWNER)
> -                    binfo.mib->mc_domid = mca_res.owner;
> -
> -                if (mca_res.result == MCA_NEED_RESET)
> -                    ret = MCER_RESET;
> -                else if (mca_res.result == MCA_RECOVERED)
> -                {
> -                    if (ret < MCER_RECOVERED)
> -                        ret = MCER_RECOVERED;
> -                }
> -                else if (mca_res.result == MCA_NO_ACTION)
> -                {
> -                    if (ret < MCER_CONTINUE)
> -                        ret = MCER_CONTINUE;
> -                }
> +                handlers[i].recovery_handler(&binfo, &bank_result);
> +                if (worst_result < bank_result)
> +                    worst_result = bank_result;
>                  break;
>              }
>          }
>          ASSERT(i != handler_num);
>      }
> 
> -    return ret;
> +    return worst_result;
>  }
> 
>  /*
> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
> 
>  static void intel_memerr_dhandler(
>               struct mca_binfo *binfo,
> -             struct mca_handle_result *result)
> +             enum mce_result *result)
>  {
>      struct mcinfo_bank *bank = binfo->mib;
>      struct mcinfo_global *global = binfo->mig;
> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>      uint64_t mc_status, mc_misc;
> 
>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
> -    result->result = MCA_NEED_RESET;
> 
>      mc_status = bank->mc_status;
>      mc_misc = bank->mc_misc;
> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>          !(mc_status & MCi_STATUS_MISCV) ||
>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>      {
> -        result->result |= MCA_NO_ACTION;
>          dprintk(XENLOG_WARNING,
>              "No physical address provided for memory error\n");
>          return;
> @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
> 
>      /* This is free page */
>      if (status & PG_OFFLINE_OFFLINED)
> -        result->result = MCA_RECOVERED;
> +        *result = MCER_RECOVERED;
>      else if (status & PG_OFFLINE_PENDING) {
>          /* This page has owner */
>          if (status & PG_OFFLINE_OWNED) {
> -            result->result |= MCA_OWNER;
> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
> -              " by DOM %d\n", result->owner);
> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
> -             * "related data */
> -            bank->mc_domid = result->owner;
> +              " by DOM %d\n", bank->mc_domid);
>              /* XXX: Cannot handle shared pages yet
>               * (this should identify all domains and gfn mapping to
>               *  the mfn in question) */
> -            BUG_ON( result->owner == DOMID_COW );
> -            if ( result->owner != DOMID_XEN ) {
> -                d = get_domain_by_id(result->owner);
> +            BUG_ON( bank->mc_domid == DOMID_COW );
> +            if ( bank->mc_domid != DOMID_XEN ) {
> +                d = get_domain_by_id(bank->mc_domid);
>                  ASSERT(d);
>                  gfn = get_gpfn_from_mfn((bank->mc_addr) >>
> PAGE_SHIFT); 
> 
> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>                        global->mc_gstatus) == -1 )
>                  {
>                      mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d
> " -                      "failed\n", result->owner);
> +                      "failed\n", bank->mc_domid);
>                      goto vmce_failed;
>                  }
> 
> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>                   * For xen, it has contained the error and finished
>                   * its own recovery job.
>                   */
> -                result->result = MCA_RECOVERED;
> +                *result = MCER_RECOVERED;
>                  put_domain(d);
> 
>                  return;
> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
> 
>  static void intel_srao_dhandler(
>               struct mca_binfo *binfo,
> -             struct mca_handle_result *result)
> +             enum mce_result *result)
>  {
>      uint64_t status = binfo->mib->mc_status;
> 
>      /* For unknown srao error code, no action required */
> +    *result = MCER_CONTINUE;
> +
>      if ( status & MCi_STATUS_VAL )
>      {
>          switch ( status & INTEL_MCCOD_MASK )
> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
> 
>  static void intel_default_mce_dhandler(
>               struct mca_binfo *binfo,
> -             struct mca_handle_result *result)
> +             enum mce_result *result)
>  {
>      uint64_t status = binfo->mib->mc_status;
>      enum intel_mce_type type;
> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>      type = intel_check_mce_type(status);
> 
>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
> -        result->result = MCA_NEED_RESET;
> -    else if (type == intel_mce_ucr_srao)
> -        result->result = MCA_NO_ACTION;
> +        *result = MCER_RESET;
> +    else
> +        *result = MCER_CONTINUE;
>  }
> 
>  static const struct mca_error_handler intel_mce_dhandlers[] = {
> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
> 
>  static void intel_default_mce_uhandler(
>               struct mca_binfo *binfo,
> -             struct mca_handle_result *result)
> +             enum mce_result *result)
>  {
>      uint64_t status = binfo->mib->mc_status;
>      enum intel_mce_type type;
> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>      /* Panic if no handler for SRAR error */
>      case intel_mce_ucr_srar:
>      case intel_mce_fatal:
> -        result->result = MCA_NEED_RESET;
> +        *result = MCER_RESET;
>          break;
>      default:
> -        result->result = MCA_NO_ACTION;
> +        *result = MCER_CONTINUE;
>          break;
>      }
>  }
> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun May 08 13:39:40 2011 +0800
> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Sun May 08 18:15:13 2011 +0800
> @@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
>  void mcabanks_free(struct mca_banks *banks);
>  extern struct mca_banks *mca_allbanks;
> 
> -/* Below interfaces are defined for MCA internal processing:
> - * a. pre_handler will be called early in MCA ISR context, mainly
> for early 
> - *    need_reset detection for avoiding log missing. Also, it is
> used to judge 
> - *    impacted DOMAIN if possible.
> - * b. mca_error_handler is actually a (error_action_index,
> - *    recovery_hanlder pointer) pair. The defined recovery_handler
> - *    performs the actual recovery operations such as page_offline,
> cpu_offline 
> - *    in softIRQ context when the per_bank MCA error matching the
> corresponding 
> - *    mca_code index. If pre_handler can't judge the impacted domain,
> - *    recovery_handler must figure it out.
> -*/
> -
> -/* MCA error has been recovered successfully by the recovery action*/
> -#define MCA_RECOVERED (0x1 << 0)
> -/* MCA error impact the specified DOMAIN in owner field below */
> -#define MCA_OWNER (0x1 << 1)
> -/* MCA error can't be recovered and need reset */
> -#define MCA_NEED_RESET (0x1 << 2)
> -/* MCA error did not have any action yet */
> -#define MCA_NO_ACTION (0x1 << 3)
> -
> -struct mca_handle_result
> -{
> -    uint32_t result;
> -    /* Used one result & MCA_OWNER */
> -    domid_t owner;
> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
> -    struct recovery_action *action;
> -};
> -
>  /*Keep bank so that we can get staus even if mib is NULL */
>  struct mca_binfo {
>      int bank;
> @@ -163,8 +133,14 @@ struct mca_binfo {
>      struct cpu_user_regs *regs;
>  };
> 
> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
> -                        struct mca_handle_result *result);
> +enum mce_result
> +{
> +    MCER_NOERROR,
> +    MCER_RECOVERED,
> +    /* Not recoverd, but can continue */
> +    MCER_CONTINUE,
> +    MCER_RESET,
> +};
> 
>  struct mca_error_handler
>  {
> @@ -175,7 +151,7 @@ struct mca_error_handler
>      */
>      int (*owned_error)(uint64_t status);
>      void (*recovery_handler)(struct mca_binfo *binfo,
> -                    struct mca_handle_result *result);
> +                    enum mce_result *result);
>  };
> 
>  /* Global variables */

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
  2011-05-13 14:34 ` Liu, Jinsong
@ 2011-05-14  9:45   ` Keir Fraser
  2011-05-14 10:01     ` Keir Fraser
  0 siblings, 1 reply; 5+ messages in thread
From: Keir Fraser @ 2011-05-14  9:45 UTC (permalink / raw)
  To: Liu, Jinsong, Jan Beulich, xen-devel; +Cc: Jiang, Yunhong, Li, Xin

I don't seem to have this one in my queue. You should resend.

 -- Keir

On 13/05/2011 15:34, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:

> Jan and Keir,
> 
> Any comments?
> 
> Thanks
> Jinsong
> 
> 
> Liu, Jinsong wrote:
>> MCA: simplify mce actoin, and some update of mem err handler for
>> future SRAR extension
>> 
>> This patch simplify mce_action(). Basically the 2 set of mce status
>> flags MCA_... and MCER_... are highly functional similar. This patch
>> remove the redundant middle-level flag MCA_..., and its related data
>> structure and function, so that mce_action() logic is more clean and
>> simpler.    
>> This patch also update mem err handler. intel_memerr_dhandler() will
>> be commonly used by SRAO and SRAR, so for the extension of future
>> SRAR case, this patch remove the default fail flag from
>> intel_memerr_dhandler() outside to SRAO/SRAR level, since only
>> SRAO/SRAR handler itself has the knowledge of how to handle the
>> failure when mem err handler fail.
>> 
>> Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
>> 
>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 13:39:40 2011
>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 18:15:13
>> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>>  static unsigned int __read_mostly mce_uhandler_num;
>> 
>> -enum mce_result
>> -{
>> -    MCER_NOERROR,
>> -    MCER_RECOVERED,
>> -    /* Not recoverd, but can continue */
>> -    MCER_CONTINUE,
>> -    MCER_RESET,
>> -};
>> -
>>  /* Maybe called in MCE context, no lock, no printk */
>>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>>                        mctelem_cookie_t mctc)
>>  {
>>      struct mc_info *local_mi;
>> -    enum mce_result ret = MCER_NOERROR;
>> +    enum mce_result bank_result = MCER_NOERROR;
>> +    enum mce_result worst_result = MCER_NOERROR;
>>      struct mcinfo_common *mic = NULL;
>> -    struct mca_handle_result mca_res;
>>      struct mca_binfo binfo;
>>      const struct mca_error_handler *handlers = mce_dhandlers;
>>      unsigned int i, handler_num = mce_dhandler_num;
>> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>>      /* Processing bank information */
>>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
>> 
>> -    for ( ; ret != MCER_RESET && mic && mic->size;
>> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>>            mic = x86_mcinfo_next(mic) )
>>      {
>>          if (mic->type != MC_TYPE_BANK) {
>> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
>>          }
>>          binfo.mib = (struct mcinfo_bank*)mic;
>>          binfo.bank = binfo.mib->mc_bank;
>> -        memset(&mca_res, 0x0f, sizeof(mca_res));
>> +        bank_result = MCER_NOERROR;
>>          for ( i = 0; i < handler_num; i++ ) {
>>              if (handlers[i].owned_error(binfo.mib->mc_status))
>>              {
>> -                handlers[i].recovery_handler(&binfo, &mca_res);
>> -
>> -                if (mca_res.result & MCA_OWNER)
>> -                    binfo.mib->mc_domid = mca_res.owner;
>> -
>> -                if (mca_res.result == MCA_NEED_RESET)
>> -                    ret = MCER_RESET;
>> -                else if (mca_res.result == MCA_RECOVERED)
>> -                {
>> -                    if (ret < MCER_RECOVERED)
>> -                        ret = MCER_RECOVERED;
>> -                }
>> -                else if (mca_res.result == MCA_NO_ACTION)
>> -                {
>> -                    if (ret < MCER_CONTINUE)
>> -                        ret = MCER_CONTINUE;
>> -                }
>> +                handlers[i].recovery_handler(&binfo, &bank_result);
>> +                if (worst_result < bank_result)
>> +                    worst_result = bank_result;
>>                  break;
>>              }
>>          }
>>          ASSERT(i != handler_num);
>>      }
>> 
>> -    return ret;
>> +    return worst_result;
>>  }
>> 
>>  /*
>> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
>> 
>>  static void intel_memerr_dhandler(
>>               struct mca_binfo *binfo,
>> -             struct mca_handle_result *result)
>> +             enum mce_result *result)
>>  {
>>      struct mcinfo_bank *bank = binfo->mib;
>>      struct mcinfo_global *global = binfo->mig;
>> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>>      uint64_t mc_status, mc_misc;
>> 
>>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
>> -    result->result = MCA_NEED_RESET;
>> 
>>      mc_status = bank->mc_status;
>>      mc_misc = bank->mc_misc;
>> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>>          !(mc_status & MCi_STATUS_MISCV) ||
>>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>>      {
>> -        result->result |= MCA_NO_ACTION;
>>          dprintk(XENLOG_WARNING,
>>              "No physical address provided for memory error\n");
>>          return;
>> @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
>> 
>>      /* This is free page */
>>      if (status & PG_OFFLINE_OFFLINED)
>> -        result->result = MCA_RECOVERED;
>> +        *result = MCER_RECOVERED;
>>      else if (status & PG_OFFLINE_PENDING) {
>>          /* This page has owner */
>>          if (status & PG_OFFLINE_OWNED) {
>> -            result->result |= MCA_OWNER;
>> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
>> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
>> -              " by DOM %d\n", result->owner);
>> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
>> -             * "related data */
>> -            bank->mc_domid = result->owner;
>> +              " by DOM %d\n", bank->mc_domid);
>>              /* XXX: Cannot handle shared pages yet
>>               * (this should identify all domains and gfn mapping to
>>               *  the mfn in question) */
>> -            BUG_ON( result->owner == DOMID_COW );
>> -            if ( result->owner != DOMID_XEN ) {
>> -                d = get_domain_by_id(result->owner);
>> +            BUG_ON( bank->mc_domid == DOMID_COW );
>> +            if ( bank->mc_domid != DOMID_XEN ) {
>> +                d = get_domain_by_id(bank->mc_domid);
>>                  ASSERT(d);
>>                  gfn = get_gpfn_from_mfn((bank->mc_addr) >>
>> PAGE_SHIFT); 
>> 
>> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>>                        global->mc_gstatus) == -1 )
>>                  {
>>                      mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d
>> " -                      "failed\n", result->owner);
>> +                      "failed\n", bank->mc_domid);
>>                      goto vmce_failed;
>>                  }
>> 
>> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>>                   * For xen, it has contained the error and finished
>>                   * its own recovery job.
>>                   */
>> -                result->result = MCA_RECOVERED;
>> +                *result = MCER_RECOVERED;
>>                  put_domain(d);
>> 
>>                  return;
>> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
>> 
>>  static void intel_srao_dhandler(
>>               struct mca_binfo *binfo,
>> -             struct mca_handle_result *result)
>> +             enum mce_result *result)
>>  {
>>      uint64_t status = binfo->mib->mc_status;
>> 
>>      /* For unknown srao error code, no action required */
>> +    *result = MCER_CONTINUE;
>> +
>>      if ( status & MCi_STATUS_VAL )
>>      {
>>          switch ( status & INTEL_MCCOD_MASK )
>> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
>> 
>>  static void intel_default_mce_dhandler(
>>               struct mca_binfo *binfo,
>> -             struct mca_handle_result *result)
>> +             enum mce_result *result)
>>  {
>>      uint64_t status = binfo->mib->mc_status;
>>      enum intel_mce_type type;
>> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>>      type = intel_check_mce_type(status);
>> 
>>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
>> -        result->result = MCA_NEED_RESET;
>> -    else if (type == intel_mce_ucr_srao)
>> -        result->result = MCA_NO_ACTION;
>> +        *result = MCER_RESET;
>> +    else
>> +        *result = MCER_CONTINUE;
>>  }
>> 
>>  static const struct mca_error_handler intel_mce_dhandlers[] = {
>> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
>> 
>>  static void intel_default_mce_uhandler(
>>               struct mca_binfo *binfo,
>> -             struct mca_handle_result *result)
>> +             enum mce_result *result)
>>  {
>>      uint64_t status = binfo->mib->mc_status;
>>      enum intel_mce_type type;
>> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>>      /* Panic if no handler for SRAR error */
>>      case intel_mce_ucr_srar:
>>      case intel_mce_fatal:
>> -        result->result = MCA_NEED_RESET;
>> +        *result = MCER_RESET;
>>          break;
>>      default:
>> -        result->result = MCA_NO_ACTION;
>> +        *result = MCER_CONTINUE;
>>          break;
>>      }
>>  }
>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
>> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 13:39:40 2011 +0800
>> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 18:15:13 2011 +0800
>> @@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
>>  void mcabanks_free(struct mca_banks *banks);
>>  extern struct mca_banks *mca_allbanks;
>> 
>> -/* Below interfaces are defined for MCA internal processing:
>> - * a. pre_handler will be called early in MCA ISR context, mainly
>> for early 
>> - *    need_reset detection for avoiding log missing. Also, it is
>> used to judge 
>> - *    impacted DOMAIN if possible.
>> - * b. mca_error_handler is actually a (error_action_index,
>> - *    recovery_hanlder pointer) pair. The defined recovery_handler
>> - *    performs the actual recovery operations such as page_offline,
>> cpu_offline 
>> - *    in softIRQ context when the per_bank MCA error matching the
>> corresponding 
>> - *    mca_code index. If pre_handler can't judge the impacted domain,
>> - *    recovery_handler must figure it out.
>> -*/
>> -
>> -/* MCA error has been recovered successfully by the recovery action*/
>> -#define MCA_RECOVERED (0x1 << 0)
>> -/* MCA error impact the specified DOMAIN in owner field below */
>> -#define MCA_OWNER (0x1 << 1)
>> -/* MCA error can't be recovered and need reset */
>> -#define MCA_NEED_RESET (0x1 << 2)
>> -/* MCA error did not have any action yet */
>> -#define MCA_NO_ACTION (0x1 << 3)
>> -
>> -struct mca_handle_result
>> -{
>> -    uint32_t result;
>> -    /* Used one result & MCA_OWNER */
>> -    domid_t owner;
>> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
>> -    struct recovery_action *action;
>> -};
>> -
>>  /*Keep bank so that we can get staus even if mib is NULL */
>>  struct mca_binfo {
>>      int bank;
>> @@ -163,8 +133,14 @@ struct mca_binfo {
>>      struct cpu_user_regs *regs;
>>  };
>> 
>> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
>> -                        struct mca_handle_result *result);
>> +enum mce_result
>> +{
>> +    MCER_NOERROR,
>> +    MCER_RECOVERED,
>> +    /* Not recoverd, but can continue */
>> +    MCER_CONTINUE,
>> +    MCER_RESET,
>> +};
>> 
>>  struct mca_error_handler
>>  {
>> @@ -175,7 +151,7 @@ struct mca_error_handler
>>      */
>>      int (*owned_error)(uint64_t status);
>>      void (*recovery_handler)(struct mca_binfo *binfo,
>> -                    struct mca_handle_result *result);
>> +                    enum mce_result *result);
>>  };
>> 
>>  /* Global variables */
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
  2011-05-14  9:45   ` Keir Fraser
@ 2011-05-14 10:01     ` Keir Fraser
  2011-05-15  5:08       ` Liu, Jinsong
  0 siblings, 1 reply; 5+ messages in thread
From: Keir Fraser @ 2011-05-14 10:01 UTC (permalink / raw)
  To: Liu, Jinsong, Jan Beulich, xen-devel; +Cc: Jiang, Yunhong, Li, Xin

Ah no, I found it! I'll take a look at it on Monday.

 -- Keir

On 14/05/2011 10:45, "Keir Fraser" <keir@xen.org> wrote:

> I don't seem to have this one in my queue. You should resend.
> 
>  -- Keir
> 
> On 13/05/2011 15:34, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
> 
>> Jan and Keir,
>> 
>> Any comments?
>> 
>> Thanks
>> Jinsong
>> 
>> 
>> Liu, Jinsong wrote:
>>> MCA: simplify mce actoin, and some update of mem err handler for
>>> future SRAR extension
>>> 
>>> This patch simplify mce_action(). Basically the 2 set of mce status
>>> flags MCA_... and MCER_... are highly functional similar. This patch
>>> remove the redundant middle-level flag MCA_..., and its related data
>>> structure and function, so that mce_action() logic is more clean and
>>> simpler.    
>>> This patch also update mem err handler. intel_memerr_dhandler() will
>>> be commonly used by SRAO and SRAR, so for the extension of future
>>> SRAR case, this patch remove the default fail flag from
>>> intel_memerr_dhandler() outside to SRAO/SRAR level, since only
>>> SRAO/SRAR handler itself has the knowledge of how to handle the
>>> failure when mem err handler fail.
>>> 
>>> Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
>>> 
>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
>>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 13:39:40 2011
>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 18:15:13
>>> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>>>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>>>  static unsigned int __read_mostly mce_uhandler_num;
>>> 
>>> -enum mce_result
>>> -{
>>> -    MCER_NOERROR,
>>> -    MCER_RECOVERED,
>>> -    /* Not recoverd, but can continue */
>>> -    MCER_CONTINUE,
>>> -    MCER_RESET,
>>> -};
>>> -
>>>  /* Maybe called in MCE context, no lock, no printk */
>>>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>>>                        mctelem_cookie_t mctc)
>>>  {
>>>      struct mc_info *local_mi;
>>> -    enum mce_result ret = MCER_NOERROR;
>>> +    enum mce_result bank_result = MCER_NOERROR;
>>> +    enum mce_result worst_result = MCER_NOERROR;
>>>      struct mcinfo_common *mic = NULL;
>>> -    struct mca_handle_result mca_res;
>>>      struct mca_binfo binfo;
>>>      const struct mca_error_handler *handlers = mce_dhandlers;
>>>      unsigned int i, handler_num = mce_dhandler_num;
>>> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>>>      /* Processing bank information */
>>>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
>>> 
>>> -    for ( ; ret != MCER_RESET && mic && mic->size;
>>> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>>>            mic = x86_mcinfo_next(mic) )
>>>      {
>>>          if (mic->type != MC_TYPE_BANK) {
>>> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct
>>>          }
>>>          binfo.mib = (struct mcinfo_bank*)mic;
>>>          binfo.bank = binfo.mib->mc_bank;
>>> -        memset(&mca_res, 0x0f, sizeof(mca_res));
>>> +        bank_result = MCER_NOERROR;
>>>          for ( i = 0; i < handler_num; i++ ) {
>>>              if (handlers[i].owned_error(binfo.mib->mc_status))
>>>              {
>>> -                handlers[i].recovery_handler(&binfo, &mca_res);
>>> -
>>> -                if (mca_res.result & MCA_OWNER)
>>> -                    binfo.mib->mc_domid = mca_res.owner;
>>> -
>>> -                if (mca_res.result == MCA_NEED_RESET)
>>> -                    ret = MCER_RESET;
>>> -                else if (mca_res.result == MCA_RECOVERED)
>>> -                {
>>> -                    if (ret < MCER_RECOVERED)
>>> -                        ret = MCER_RECOVERED;
>>> -                }
>>> -                else if (mca_res.result == MCA_NO_ACTION)
>>> -                {
>>> -                    if (ret < MCER_CONTINUE)
>>> -                        ret = MCER_CONTINUE;
>>> -                }
>>> +                handlers[i].recovery_handler(&binfo, &bank_result);
>>> +                if (worst_result < bank_result)
>>> +                    worst_result = bank_result;
>>>                  break;
>>>              }
>>>          }
>>>          ASSERT(i != handler_num);
>>>      }
>>> 
>>> -    return ret;
>>> +    return worst_result;
>>>  }
>>> 
>>>  /*
>>> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
>>> 
>>>  static void intel_memerr_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      struct mcinfo_bank *bank = binfo->mib;
>>>      struct mcinfo_global *global = binfo->mig;
>>> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>>>      uint64_t mc_status, mc_misc;
>>> 
>>>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
>>> -    result->result = MCA_NEED_RESET;
>>> 
>>>      mc_status = bank->mc_status;
>>>      mc_misc = bank->mc_misc;
>>> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>>>          !(mc_status & MCi_STATUS_MISCV) ||
>>>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>>>      {
>>> -        result->result |= MCA_NO_ACTION;
>>>          dprintk(XENLOG_WARNING,
>>>              "No physical address provided for memory error\n");
>>>          return;
>>> @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
>>> 
>>>      /* This is free page */
>>>      if (status & PG_OFFLINE_OFFLINED)
>>> -        result->result = MCA_RECOVERED;
>>> +        *result = MCER_RECOVERED;
>>>      else if (status & PG_OFFLINE_PENDING) {
>>>          /* This page has owner */
>>>          if (status & PG_OFFLINE_OWNED) {
>>> -            result->result |= MCA_OWNER;
>>> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
>>> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>>>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
>>> -              " by DOM %d\n", result->owner);
>>> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
>>> -             * "related data */
>>> -            bank->mc_domid = result->owner;
>>> +              " by DOM %d\n", bank->mc_domid);
>>>              /* XXX: Cannot handle shared pages yet
>>>               * (this should identify all domains and gfn mapping to
>>>               *  the mfn in question) */
>>> -            BUG_ON( result->owner == DOMID_COW );
>>> -            if ( result->owner != DOMID_XEN ) {
>>> -                d = get_domain_by_id(result->owner);
>>> +            BUG_ON( bank->mc_domid == DOMID_COW );
>>> +            if ( bank->mc_domid != DOMID_XEN ) {
>>> +                d = get_domain_by_id(bank->mc_domid);
>>>                  ASSERT(d);
>>>                  gfn = get_gpfn_from_mfn((bank->mc_addr) >>
>>> PAGE_SHIFT); 
>>> 
>>> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>>>                        global->mc_gstatus) == -1 )
>>>                  {
>>>                      mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d
>>> " -                      "failed\n", result->owner);
>>> +                      "failed\n", bank->mc_domid);
>>>                      goto vmce_failed;
>>>                  }
>>> 
>>> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>>>                   * For xen, it has contained the error and finished
>>>                   * its own recovery job.
>>>                   */
>>> -                result->result = MCA_RECOVERED;
>>> +                *result = MCER_RECOVERED;
>>>                  put_domain(d);
>>> 
>>>                  return;
>>> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
>>> 
>>>  static void intel_srao_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>> 
>>>      /* For unknown srao error code, no action required */
>>> +    *result = MCER_CONTINUE;
>>> +
>>>      if ( status & MCi_STATUS_VAL )
>>>      {
>>>          switch ( status & INTEL_MCCOD_MASK )
>>> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
>>> 
>>>  static void intel_default_mce_dhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>>      enum intel_mce_type type;
>>> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>>>      type = intel_check_mce_type(status);
>>> 
>>>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
>>> -        result->result = MCA_NEED_RESET;
>>> -    else if (type == intel_mce_ucr_srao)
>>> -        result->result = MCA_NO_ACTION;
>>> +        *result = MCER_RESET;
>>> +    else
>>> +        *result = MCER_CONTINUE;
>>>  }
>>> 
>>>  static const struct mca_error_handler intel_mce_dhandlers[] = {
>>> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
>>> 
>>>  static void intel_default_mce_uhandler(
>>>               struct mca_binfo *binfo,
>>> -             struct mca_handle_result *result)
>>> +             enum mce_result *result)
>>>  {
>>>      uint64_t status = binfo->mib->mc_status;
>>>      enum intel_mce_type type;
>>> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>>>      /* Panic if no handler for SRAR error */
>>>      case intel_mce_ucr_srar:
>>>      case intel_mce_fatal:
>>> -        result->result = MCA_NEED_RESET;
>>> +        *result = MCER_RESET;
>>>          break;
>>>      default:
>>> -        result->result = MCA_NO_ACTION;
>>> +        *result = MCER_CONTINUE;
>>>          break;
>>>      }
>>>  }
>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
>>> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 13:39:40 2011 +0800
>>> +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 18:15:13 2011 +0800
>>> @@ -124,36 +124,6 @@ void mcabanks_free(struct mca_banks *ban
>>>  void mcabanks_free(struct mca_banks *banks);
>>>  extern struct mca_banks *mca_allbanks;
>>> 
>>> -/* Below interfaces are defined for MCA internal processing:
>>> - * a. pre_handler will be called early in MCA ISR context, mainly
>>> for early 
>>> - *    need_reset detection for avoiding log missing. Also, it is
>>> used to judge 
>>> - *    impacted DOMAIN if possible.
>>> - * b. mca_error_handler is actually a (error_action_index,
>>> - *    recovery_hanlder pointer) pair. The defined recovery_handler
>>> - *    performs the actual recovery operations such as page_offline,
>>> cpu_offline 
>>> - *    in softIRQ context when the per_bank MCA error matching the
>>> corresponding 
>>> - *    mca_code index. If pre_handler can't judge the impacted domain,
>>> - *    recovery_handler must figure it out.
>>> -*/
>>> -
>>> -/* MCA error has been recovered successfully by the recovery action*/
>>> -#define MCA_RECOVERED (0x1 << 0)
>>> -/* MCA error impact the specified DOMAIN in owner field below */
>>> -#define MCA_OWNER (0x1 << 1)
>>> -/* MCA error can't be recovered and need reset */
>>> -#define MCA_NEED_RESET (0x1 << 2)
>>> -/* MCA error did not have any action yet */
>>> -#define MCA_NO_ACTION (0x1 << 3)
>>> -
>>> -struct mca_handle_result
>>> -{
>>> -    uint32_t result;
>>> -    /* Used one result & MCA_OWNER */
>>> -    domid_t owner;
>>> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
>>> -    struct recovery_action *action;
>>> -};
>>> -
>>>  /*Keep bank so that we can get staus even if mib is NULL */
>>>  struct mca_binfo {
>>>      int bank;
>>> @@ -163,8 +133,14 @@ struct mca_binfo {
>>>      struct cpu_user_regs *regs;
>>>  };
>>> 
>>> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
>>> -                        struct mca_handle_result *result);
>>> +enum mce_result
>>> +{
>>> +    MCER_NOERROR,
>>> +    MCER_RECOVERED,
>>> +    /* Not recoverd, but can continue */
>>> +    MCER_CONTINUE,
>>> +    MCER_RESET,
>>> +};
>>> 
>>>  struct mca_error_handler
>>>  {
>>> @@ -175,7 +151,7 @@ struct mca_error_handler
>>>      */
>>>      int (*owned_error)(uint64_t status);
>>>      void (*recovery_handler)(struct mca_binfo *binfo,
>>> -                    struct mca_handle_result *result);
>>> +                    enum mce_result *result);
>>>  };
>>> 
>>>  /* Global variables */
>> 
> 
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension
  2011-05-14 10:01     ` Keir Fraser
@ 2011-05-15  5:08       ` Liu, Jinsong
  0 siblings, 0 replies; 5+ messages in thread
From: Liu, Jinsong @ 2011-05-15  5:08 UTC (permalink / raw)
  To: Keir Fraser, Jan Beulich, xen-devel; +Cc: Jiang, Yunhong, Li, Xin

OK, thanks :)

Jinsong

Keir Fraser wrote:
> Ah no, I found it! I'll take a look at it on Monday.
> 
>  -- Keir
> 
> On 14/05/2011 10:45, "Keir Fraser" <keir@xen.org> wrote:
> 
>> I don't seem to have this one in my queue. You should resend.
>> 
>>  -- Keir
>> 
>> On 13/05/2011 15:34, "Liu, Jinsong" <jinsong.liu@intel.com> wrote:
>> 
>>> Jan and Keir,
>>> 
>>> Any comments?
>>> 
>>> Thanks
>>> Jinsong
>>> 
>>> 
>>> Liu, Jinsong wrote:
>>>> MCA: simplify mce actoin, and some update of mem err handler for
>>>> future SRAR extension 
>>>> 
>>>> This patch simplify mce_action(). Basically the 2 set of mce status
>>>> flags MCA_... and MCER_... are highly functional similar. This
>>>> patch remove the redundant middle-level flag MCA_..., and its
>>>> related data structure and function, so that mce_action() logic is
>>>> more clean and simpler. This patch also update mem err handler.
>>>> intel_memerr_dhandler() will be commonly used by SRAO and SRAR, so
>>>> for the extension of future SRAR case, this patch remove the
>>>> default fail flag from intel_memerr_dhandler() outside to
>>>> SRAO/SRAR level, since only SRAO/SRAR handler itself has the
>>>> knowledge of how to handle the failure when mem err handler fail.
>>>> 
>>>> Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
>>>> 
>>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/mce_intel.c
>>>> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 13:39:40 2011
>>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun May 08 18:15:13
>>>> 2011 +0800 @@ -172,23 +172,14 @@ static unsigned int __read_mostly
>>>>  mce_dh static unsigned int __read_mostly mce_dhandler_num;
>>>>  static unsigned int __read_mostly mce_uhandler_num;
>>>> 
>>>> -enum mce_result
>>>> -{
>>>> -    MCER_NOERROR,
>>>> -    MCER_RECOVERED,
>>>> -    /* Not recoverd, but can continue */
>>>> -    MCER_CONTINUE,
>>>> -    MCER_RESET,
>>>> -};
>>>> -
>>>>  /* Maybe called in MCE context, no lock, no printk */
>>>>  static enum mce_result mce_action(struct cpu_user_regs *regs,
>>>>                        mctelem_cookie_t mctc)
>>>>  {
>>>>      struct mc_info *local_mi;
>>>> -    enum mce_result ret = MCER_NOERROR;
>>>> +    enum mce_result bank_result = MCER_NOERROR;
>>>> +    enum mce_result worst_result = MCER_NOERROR;
>>>>      struct mcinfo_common *mic = NULL;
>>>> -    struct mca_handle_result mca_res;
>>>>      struct mca_binfo binfo;
>>>>      const struct mca_error_handler *handlers = mce_dhandlers;
>>>>      unsigned int i, handler_num = mce_dhandler_num;
>>>> @@ -217,7 +208,7 @@ static enum mce_result mce_action(struct
>>>>      /* Processing bank information */
>>>>      x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
>>>> 
>>>> -    for ( ; ret != MCER_RESET && mic && mic->size;
>>>> +    for ( ; bank_result != MCER_RESET && mic && mic->size;
>>>>            mic = x86_mcinfo_next(mic) )
>>>>      {
>>>>          if (mic->type != MC_TYPE_BANK) {
>>>> @@ -225,34 +216,20 @@ static enum mce_result mce_action(struct    
>>>>          } binfo.mib = (struct mcinfo_bank*)mic;
>>>>          binfo.bank = binfo.mib->mc_bank;
>>>> -        memset(&mca_res, 0x0f, sizeof(mca_res));
>>>> +        bank_result = MCER_NOERROR;
>>>>          for ( i = 0; i < handler_num; i++ ) {
>>>>              if (handlers[i].owned_error(binfo.mib->mc_status))   
>>>> { 
>>>> -                handlers[i].recovery_handler(&binfo, &mca_res); -
>>>> -                if (mca_res.result & MCA_OWNER)
>>>> -                    binfo.mib->mc_domid = mca_res.owner; -
>>>> -                if (mca_res.result == MCA_NEED_RESET)
>>>> -                    ret = MCER_RESET;
>>>> -                else if (mca_res.result == MCA_RECOVERED)
>>>> -                {
>>>> -                    if (ret < MCER_RECOVERED)
>>>> -                        ret = MCER_RECOVERED;
>>>> -                }
>>>> -                else if (mca_res.result == MCA_NO_ACTION)
>>>> -                {
>>>> -                    if (ret < MCER_CONTINUE)
>>>> -                        ret = MCER_CONTINUE;
>>>> -                }
>>>> +                handlers[i].recovery_handler(&binfo,
>>>> &bank_result); +                if (worst_result < bank_result)
>>>> +                    worst_result = bank_result;
>>>>                  break;
>>>>              }
>>>>          }
>>>>          ASSERT(i != handler_num);
>>>>      }
>>>> 
>>>> -    return ret;
>>>> +    return worst_result;
>>>>  }
>>>> 
>>>>  /*
>>>> @@ -608,7 +585,7 @@ struct mcinfo_recovery *mci_add_pageoff_
>>>> 
>>>>  static void intel_memerr_dhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      struct mcinfo_bank *bank = binfo->mib;
>>>>      struct mcinfo_global *global = binfo->mig;
>>>> @@ -618,7 +595,6 @@ static void intel_memerr_dhandler(
>>>>      uint64_t mc_status, mc_misc;
>>>> 
>>>>      mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
>>>> -    result->result = MCA_NEED_RESET;
>>>> 
>>>>      mc_status = bank->mc_status;
>>>>      mc_misc = bank->mc_misc;
>>>> @@ -626,7 +602,6 @@ static void intel_memerr_dhandler(
>>>>          !(mc_status & MCi_STATUS_MISCV) ||
>>>>          ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
>>>> { -        result->result |= MCA_NO_ACTION;
>>>>          dprintk(XENLOG_WARNING,
>>>>              "No physical address provided for memory error\n");  
>>>> return; @@ -644,23 +619,19 @@ static void intel_memerr_dhandler(
>>>> 
>>>>      /* This is free page */
>>>>      if (status & PG_OFFLINE_OFFLINED)
>>>> -        result->result = MCA_RECOVERED;
>>>> +        *result = MCER_RECOVERED;
>>>>      else if (status & PG_OFFLINE_PENDING) {
>>>>          /* This page has owner */
>>>>          if (status & PG_OFFLINE_OWNED) {
>>>> -            result->result |= MCA_OWNER;
>>>> -            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
>>>> +            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
>>>>              mce_printk(MCE_QUIET, "MCE: This error page is ownded"
>>>> -              " by DOM %d\n", result->owner);
>>>> -            /* Fill vMCE# injection and vMCE# MSR virtualization "
>>>> -             * "related data */
>>>> -            bank->mc_domid = result->owner;
>>>> +              " by DOM %d\n", bank->mc_domid);
>>>>              /* XXX: Cannot handle shared pages yet
>>>>               * (this should identify all domains and gfn mapping
>>>> to 
>>>>               *  the mfn in question) */
>>>> -            BUG_ON( result->owner == DOMID_COW );
>>>> -            if ( result->owner != DOMID_XEN ) {
>>>> -                d = get_domain_by_id(result->owner);
>>>> +            BUG_ON( bank->mc_domid == DOMID_COW );
>>>> +            if ( bank->mc_domid != DOMID_XEN ) {
>>>> +                d = get_domain_by_id(bank->mc_domid);            
>>>>                  ASSERT(d); gfn =
>>>> get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT); 
>>>> 
>>>> @@ -683,7 +654,7 @@ static void intel_memerr_dhandler(
>>>>                        global->mc_gstatus) == -1 )
>>>>                  {
>>>>                      mce_printk(MCE_QUIET, "Fill vMCE# data for
>>>> DOM%d " -                      "failed\n", result->owner);
>>>> +                      "failed\n", bank->mc_domid);
>>>>                      goto vmce_failed;
>>>>                  }
>>>> 
>>>> @@ -699,7 +670,7 @@ static void intel_memerr_dhandler(
>>>>                   * For xen, it has contained the error and
>>>> finished 
>>>>                   * its own recovery job.
>>>>                   */
>>>> -                result->result = MCA_RECOVERED;
>>>> +                *result = MCER_RECOVERED;
>>>>                  put_domain(d);
>>>> 
>>>>                  return;
>>>> @@ -718,11 +689,13 @@ static int intel_srao_check(uint64_t sta
>>>> 
>>>>  static void intel_srao_dhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      uint64_t status = binfo->mib->mc_status;
>>>> 
>>>>      /* For unknown srao error code, no action required */ +   
>>>> *result = MCER_CONTINUE; +
>>>>      if ( status & MCi_STATUS_VAL )
>>>>      {
>>>>          switch ( status & INTEL_MCCOD_MASK )
>>>> @@ -744,7 +717,7 @@ static int intel_default_check(uint64_t
>>>> 
>>>>  static void intel_default_mce_dhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      uint64_t status = binfo->mib->mc_status;
>>>>      enum intel_mce_type type;
>>>> @@ -752,9 +725,9 @@ static void intel_default_mce_dhandler(
>>>>      type = intel_check_mce_type(status);
>>>> 
>>>>      if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
>>>> -        result->result = MCA_NEED_RESET;
>>>> -    else if (type == intel_mce_ucr_srao)
>>>> -        result->result = MCA_NO_ACTION;
>>>> +        *result = MCER_RESET;
>>>> +    else
>>>> +        *result = MCER_CONTINUE;
>>>>  }
>>>> 
>>>>  static const struct mca_error_handler intel_mce_dhandlers[] = {
>>>> @@ -764,7 +737,7 @@ static const struct mca_error_handler in
>>>> 
>>>>  static void intel_default_mce_uhandler(
>>>>               struct mca_binfo *binfo,
>>>> -             struct mca_handle_result *result)
>>>> +             enum mce_result *result)
>>>>  {
>>>>      uint64_t status = binfo->mib->mc_status;
>>>>      enum intel_mce_type type;
>>>> @@ -776,10 +749,10 @@ static void intel_default_mce_uhandler(
>>>>      /* Panic if no handler for SRAR error */
>>>>      case intel_mce_ucr_srar:
>>>>      case intel_mce_fatal:
>>>> -        result->result = MCA_NEED_RESET;
>>>> +        *result = MCER_RESET;
>>>>          break;
>>>>      default:
>>>> -        result->result = MCA_NO_ACTION;
>>>> +        *result = MCER_CONTINUE;
>>>>          break;
>>>>      }
>>>>  }
>>>> diff -r e7cd79cd6626 xen/arch/x86/cpu/mcheck/x86_mca.h
>>>> --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 13:39:40 2011
>>>> +0800 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun May 08 18:15:13
>>>> 2011 +0800 @@ -124,36 +124,6 @@ void mcabanks_free(struct
>>>>  mca_banks *ban void mcabanks_free(struct mca_banks *banks);
>>>>  extern struct mca_banks *mca_allbanks;
>>>> 
>>>> -/* Below interfaces are defined for MCA internal processing:
>>>> - * a. pre_handler will be called early in MCA ISR context, mainly
>>>> for early 
>>>> - *    need_reset detection for avoiding log missing. Also, it is
>>>> used to judge 
>>>> - *    impacted DOMAIN if possible.
>>>> - * b. mca_error_handler is actually a (error_action_index,
>>>> - *    recovery_hanlder pointer) pair. The defined recovery_handler
>>>> - *    performs the actual recovery operations such as
>>>> page_offline, cpu_offline 
>>>> - *    in softIRQ context when the per_bank MCA error matching the
>>>> corresponding 
>>>> - *    mca_code index. If pre_handler can't judge the impacted
>>>> domain, 
>>>> - *    recovery_handler must figure it out.
>>>> -*/
>>>> -
>>>> -/* MCA error has been recovered successfully by the recovery
>>>> action*/ 
>>>> -#define MCA_RECOVERED (0x1 << 0)
>>>> -/* MCA error impact the specified DOMAIN in owner field below */
>>>> -#define MCA_OWNER (0x1 << 1)
>>>> -/* MCA error can't be recovered and need reset */
>>>> -#define MCA_NEED_RESET (0x1 << 2)
>>>> -/* MCA error did not have any action yet */
>>>> -#define MCA_NO_ACTION (0x1 << 3)
>>>> -
>>>> -struct mca_handle_result
>>>> -{
>>>> -    uint32_t result;
>>>> -    /* Used one result & MCA_OWNER */
>>>> -    domid_t owner;
>>>> -    /* Used by mca_error_handler, result & MCA_RECOVRED */
>>>> -    struct recovery_action *action;
>>>> -};
>>>> -
>>>>  /*Keep bank so that we can get staus even if mib is NULL */ 
>>>>      struct mca_binfo { int bank;
>>>> @@ -163,8 +133,14 @@ struct mca_binfo {
>>>>      struct cpu_user_regs *regs;
>>>>  };
>>>> 
>>>> -extern void (*mca_prehandler)( struct cpu_user_regs *regs,
>>>> -                        struct mca_handle_result *result); +enum
>>>> mce_result +{
>>>> +    MCER_NOERROR,
>>>> +    MCER_RECOVERED,
>>>> +    /* Not recoverd, but can continue */
>>>> +    MCER_CONTINUE,
>>>> +    MCER_RESET,
>>>> +};
>>>> 
>>>>  struct mca_error_handler
>>>>  {
>>>> @@ -175,7 +151,7 @@ struct mca_error_handler
>>>>      */
>>>>      int (*owned_error)(uint64_t status);
>>>>      void (*recovery_handler)(struct mca_binfo *binfo,
>>>> -                    struct mca_handle_result *result);
>>>> +                    enum mce_result *result);
>>>>  };
>>>> 
>>>>  /* Global variables */

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2011-05-15  5:08 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-08 11:23 [PATCH 7] MCA: simplify mce actoin, and some update of mem err handler for future SRAR extension Liu, Jinsong
2011-05-13 14:34 ` Liu, Jinsong
2011-05-14  9:45   ` Keir Fraser
2011-05-14 10:01     ` Keir Fraser
2011-05-15  5:08       ` Liu, Jinsong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.