Hi,

I have tried the following patch and I am still getting the same kernel panic.

-------------X++++++++++++++++++++X---------------------

diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c
index 0f4554e..05592aa 100644
--- a/drivers/pci/pcie/aer/aerdrv_core.c
+++ b/drivers/pci/pcie/aer/aerdrv_core.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/kfifo.h>
 #include "aerdrv.h"
+#include "../../pci.h"

 static bool forceload;
 static bool nosourceid;
@@ -82,7 +82,7 @@ EXPORT_SYMBOL_GPL(pci_cleanup_aer_uncorrect_error_status);
 static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
 {
  if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
- e_info->dev[e_info->error_dev_num] = dev;
+ e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
  e_info->error_dev_num++;
  return 0;
  }
@@ -659,6 +659,9 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
  if (!pos)
  return 1;

+        if (pci_dev_is_disconnected(dev))
+                return 0;
+
  if (info->severity == AER_CORRECTABLE) {
  pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS,
  &info->status);
@@ -710,6 +713,8 @@ static inline void aer_process_err_devices(struct pcie_device *p_device,
  for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
  if (get_device_error_info(e_info->dev[i], e_info))
  handle_error_source(p_device, e_info->dev[i], e_info);
+
+                pci_dev_put(e_info->dev[i]);
  }
 }
-------------X++++++++++++++++++++X---------------------


Note: I have configured CONFIG_HOTPLUG_PCI_PCIE and CONFIG_HOTPLUG_PCI as modules and load them at startup using a script.

root@/proc/:~# cat config | grep -i HOT
CONFIG_TICK_ONESHOT=y
CONFIG_HOTPLUG=y
# CONFIG_MEMORY_HOTPLUG is not set
CONFIG_HOTPLUG_CPU=y
# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set
# CONFIG_DEBUG_HOTPLUG_CPU0 is not set
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_HOTPLUG_PCI_PCIE=m
CONFIG_HOTPLUG_PCI=m
# CONFIG_HOTPLUG_PCI_CPCI is not set
# CONFIG_HOTPLUG_PCI_SHPC is not set
CONFIG_DM_SNAPSHOT=y
# CONFIG_USB_STORAGE_JUMPSHOT is not set
# CONFIG_TRACER_SNAPSHOT is not set
root@/proc/:~#

Panic back trace :
crash> bt
PID: 24     TASK: ffff880274ac0000  CPU: 0   COMMAND: "kworker/0:1"
 #0 [ffff880274abbac8] machine_kexec at ffffffff8102cf18
 #1 [ffff880274abbb28] crash_kexec at ffffffff810a6b05
 #2 [ffff880274abbbf0] oops_end at ffffffff8176d8a0
 #3 [ffff880274abbc18] die at ffffffff810060db
 #4 [ffff880274abbc48] do_general_protection at ffffffff8176d392
 #5 [ffff880274abbc70] general_protection at ffffffff8176cd32
    [exception RIP: pci_bus_read_config_dword+100]
    RIP: ffffffff813405f4  RSP: ffff880274abbd20  RFLAGS: 00010046
    RAX: 435f494350006963  RBX: ffff880274891800  RCX: 0000000000000004
    RDX: 0000000000000ffc  RSI: 0000000000000060  RDI: ffff880274891800
    RBP: ffff880274abbd48   R8: ffff880274abbd2c   R9: 00000000000002b8
    R10: ffff880274340000  R11: 0000000000000246  R12: ffff880274abbd5c
    R13: 0000000000000246  R14: 0000000000000000  R15: ffff880274920000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #6 [ffff880274abbd50] pci_find_next_ext_capability at ffffffff81345db6
 #7 [ffff880274abbd90] pci_find_ext_capability at ffffffff81347225
 #8 [ffff880274abbda0] get_device_error_info at ffffffff81356c4d
 #9 [ffff880274abbdd0] aer_isr at ffffffff81357ab0
#10 [ffff880274abbe28] process_one_work at ffffffff8105d4c0
#11 [ffff880274abbe70] worker_thread at ffffffff8105e251
#12 [ffff880274abbed0] kthread at ffffffff81064260
#13 [ffff880274abbf50] ret_from_fork at ffffffff81773978
crash>


Regards,
Gokul 

On Thu, Aug 2, 2018 at 10:39 PM, Thomas Tai <thomas.tai@oracle.com> wrote:

On 08/02/2018 11:07 AM, Lukas Wunner wrote:
[cc += Thomas Tai]

Hi Lukas,
Thank you very much for cc'ing me.

On Thu, Aug 02, 2018 at 10:46:57AM +0200, Lukas Wunner wrote:
On Thu, Aug 02, 2018 at 12:59:18PM +0530, gokul cg wrote:
I suspect a possible race condition in the kernel between the PCI driver
and AER handling.

The solution is to acquire a ref on each device in add_error_device().
Then release the ref in aer_process_err_devices() by calling pci_dev_put().

So in case it wasn't clear, the below is what I had in mind.
Completely untested though.  Does this work for you?

For v3.10 compatibility, cherry-pick 89ee9f768003 (or alternatively
cherry-pick 8496e85c20e7 and replace pci_dev_is_disconnected(dev)
with !pci_device_is_present(dev)).

-- >8 --
Subject: [PATCH] PCI/AER: Fix use-after-free on surprise removal

The work item to consume errors, aer_isr(), walks the hierarchy using
pci_walk_bus() and stores a pointer to PCI devices which reported an
error in an array.  As long as pci_walk_bus() runs, those pointers are
valid because pci_bus_sem is held.  But once pci_walk_bus() finishes,
nothing prevents the pointers from becoming invalid, e.g. through
unplugging of the PCI devices.  The unprotected pointers are then
dereferenced in aer_process_err_devices(), which may oops:

I like your idea of incrementing the refcount during pci_walk_bus(); that should fix the use-after-free issue. We just need Gokul to confirm whether it fixes his issue.

Thanks,
Thomas



   #5  general_protection at ffffffff8176cdf2
       [exception RIP: pci_bus_read_config_dword+100]
   #6  pci_find_next_ext_capability at ffffffff81345d7b
   #7  pci_find_ext_capability at ffffffff81347225
   #8  get_device_error_info at ffffffff81356c4d
   #9  aer_isr at ffffffff81357a38

Fix by holding a ref on the devices until they have been processed.
Skip processing of unplugged devices.

Reported-by: gokul cg <gokuljnpr@gmail.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
---
  drivers/pci/pcie/aer.c | 6 +++++-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index a2e8838..937592e 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -657,7 +657,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
  static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
  {
        if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
-               e_info->dev[e_info->error_dev_num] = dev;
+               e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
                e_info->error_dev_num++;
                return 0;
        }
@@ -898,6 +898,9 @@ static int get_device_error_info(struct pci_dev *dev, struct aer_err_info *info)
        if (!pos)
                return 0;
  +     if (pci_dev_is_disconnected(dev))
+               return 0;
+
        if (info->severity == AER_CORRECTABLE) {
                pci_read_config_dword(dev, pos + PCI_ERR_COR_STATUS,
                        &info->status);
@@ -948,6 +951,7 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
        for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++) {
                if (get_device_error_info(e_info->dev[i], e_info))
                        handle_error_source(e_info->dev[i], e_info);
+               pci_dev_put(e_info->dev[i]);
        }
  }