[PATCH 04/11] megaraid_sas : Firmware crash dump feature support

* [PATCH 04/11] megaraid_sas : Firmware crash dump feature support
@ 2014-09-06 13:25 Sumit.Saxena
  2014-09-09 15:54 ` Tomas Henzl
  0 siblings, 1 reply; 14+ messages in thread
From: Sumit.Saxena @ 2014-09-06 13:25 UTC (permalink / raw)
  To: linux-scsi
  Cc: thenzl, martin.petersen, hch, jbottomley, kashyap.desai, aradford

This feature will provide similar interface as kernel crash dump feature.
When megaraid firmware encounter any crash, driver will collect the firmware raw image and 
dump it into pre-configured location.

Driver will allocate two different segment of memory. 
#1 Non-DMA able large buffer (will be allocated on demand) to capture actual FW crash dump.
#2 DMA buffer (persistence allocation) just to do a arbitrator job. 

Firmware will keep writing Crash dump data in chucks of DMA buffer size into #2, 
which will be copy back by driver to the host memory as described in #1.

Driver-Firmware interface:
==================
A.) Host driver can allocate maximum 512MB Host memory to store crash dump data. 

This memory will be internal to the host and will not be exposed to the Firmware.
Driver may not be able to allocate 512 MB. In that case, driver will do possible memory 
(available at run time) allocation to store crash dump data. 

Let’s call this buffer as Host Crash Buffer. 

Host Crash buffer will not be contigious as a whole, but it will have multiple chunk of contigious memory. 
This will be internal to driver and firmware/application are unaware of it. 
Partial allocation of Host Crash buffer may have valid information to debug depending upon 
what was collected in that buffer and depending on nature of failure. 

Complete Crash dump is the best case, but we do want to capture partial buffer just to grab something rather than nothing.
Host Crash buffer will be allocated only when FW Crash dump data is available, 
and will be deallocated once application copy Host Crash buffer to the file. 
Host Crash buffer size can be anything between 1MB to 512MB. (It will be multiple of 1MBs)


B.) Irrespective of underlying Firmware capability of crash dump support, 
driver will allocate DMA buffer at start of the day for each MR controllers. 
Let’s call this buffer as “DMA Crash Buffer”.

For this feature, size of DMA crash buffer will be 1MB. 
(We will not gain much even if DMA buffer size is increased.) 

C.) Driver will now read Controller Info sending existing dcmd “MR_DCMD_CTRL_GET_INFO”. 
Driver should extract the information from ctrl info provided by firmware and 
figure out if firmware support crash dump feature or not.

Driver will enable crash dump feature only if
“Firmware support Crash dump” +
“Driver was able to create DMA Crash Buffer”.

If either one from above is not set, Crash dump feature should be disable in driver.
Firmware will enable crash dump feature only if “Driver Send DCMD- MR_DCMD_SET_CRASH_BUF_PARA with MR_CRASH_BUF_TURN_ON”

Helper application/script should use sysfs parameter fw_crash_xxx to actually copy data from
host memory to the filesystem.

Signed-off-by: Sumit Saxena <sumit.saxena@avagotech.com>
Signed-off-by: Kashyap Desai <kashyap.desai@avagotech.com>
---
 drivers/scsi/megaraid/megaraid_sas.h        |  58 +++++-
 drivers/scsi/megaraid/megaraid_sas_base.c   | 292 +++++++++++++++++++++++++++-
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 172 +++++++++++++++-
 3 files changed, 517 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h
index bc7adcf..e0f03e2 100644
--- a/drivers/scsi/megaraid/megaraid_sas.h
+++ b/drivers/scsi/megaraid/megaraid_sas.h
@@ -105,6 +105,9 @@
 #define MFI_STATE_READY				0xB0000000
 #define MFI_STATE_OPERATIONAL			0xC0000000
 #define MFI_STATE_FAULT				0xF0000000
+#define MFI_STATE_FORCE_OCR			0x00000080
+#define MFI_STATE_DMADONE			0x00000008
+#define MFI_STATE_CRASH_DUMP_DONE		0x00000004
 #define MFI_RESET_REQUIRED			0x00000001
 #define MFI_RESET_ADAPTER			0x00000002
 #define MEGAMFI_FRAME_SIZE			64
@@ -191,6 +194,9 @@
 #define MR_DCMD_CLUSTER_RESET_LD		0x08010200
 #define MR_DCMD_PD_LIST_QUERY                   0x02010100
 
+#define MR_DCMD_CTRL_SET_CRASH_DUMP_PARAMS	0x01190100
+#define MR_DRIVER_SET_APP_CRASHDUMP_MODE	(0xF0010000 | 0x0600)
+
 /*
  * Global functions
  */
@@ -264,6 +270,25 @@ enum MFI_STAT {
 };
 
 /*
+ * Crash dump related defines
+ */
+#define MAX_CRASH_DUMP_SIZE 512
+#define CRASH_DMA_BUF_SIZE  (1024 * 1024)
+
+enum MR_FW_CRASH_DUMP_STATE {
+	UNAVAILABLE = 0,
+	AVAILABLE = 1,
+	COPYING = 2,
+	COPIED = 3,
+	COPY_ERROR = 4,
+};
+
+enum _MR_CRASH_BUF_STATUS {
+	MR_CRASH_BUF_TURN_OFF = 0,
+	MR_CRASH_BUF_TURN_ON = 1,
+};
+
+/*
  * Number of mailbox bytes in DCMD message frame
  */
 #define MFI_MBOX_SIZE				12
@@ -933,7 +958,19 @@ struct megasas_ctrl_info {
 		u8  reserved;                   /*0x7E7*/
 	} iov;
 
-	u8          pad[0x800-0x7E8];           /*0x7E8 pad to 2k */
+	struct {
+#if defined(__BIG_ENDIAN_BITFIELD)
+		u32     reserved:25;
+		u32     supportCrashDump:1;
+		u32     reserved1:6;
+#else
+		u32     reserved1:6;
+		u32     supportCrashDump:1;
+		u32     reserved:25;
+#endif
+	} adapterOperations3;
+
+	u8          pad[0x800-0x7EC];
 } __packed;
 
 /*
@@ -1559,6 +1596,20 @@ struct megasas_instance {
 	u32 *reply_queue;
 	dma_addr_t reply_queue_h;
 
+	u32 *crash_dump_buf;
+	dma_addr_t crash_dump_h;
+	void *crash_buf[MAX_CRASH_DUMP_SIZE];
+	u32 crash_buf_pages;
+	unsigned int    fw_crash_buffer_size;
+	unsigned int    fw_crash_state;
+	unsigned int    fw_crash_buffer_offset;
+	u32 drv_buf_index;
+	u32 drv_buf_alloc;
+	u32 crash_dump_fw_support;
+	u32 crash_dump_drv_support;
+	u32 crash_dump_app_support;
+	spinlock_t crashdump_lock;
+
 	struct megasas_register_set __iomem *reg_set;
 	u32 *reply_post_host_index_addr[MR_MAX_MSIX_REG_ARRAY];
 	struct megasas_pd_list          pd_list[MEGASAS_MAX_PD];
@@ -1606,6 +1657,7 @@ struct megasas_instance {
 	struct megasas_instance_template *instancet;
 	struct tasklet_struct isr_tasklet;
 	struct work_struct work_init;
+	struct work_struct crash_init;
 
 	u8 flag;
 	u8 unload;
@@ -1830,4 +1882,8 @@ u16 MR_LdSpanArrayGet(u32 ld, u32 span, struct MR_FW_RAID_MAP_ALL *map);
 u16 MR_PdDevHandleGet(u32 pd, struct MR_FW_RAID_MAP_ALL *map);
 u16 MR_GetLDTgtId(u32 ld, struct MR_FW_RAID_MAP_ALL *map);
 
+int megasas_set_crash_dump_params(struct megasas_instance *instance,
+		u8 crash_buf_state);
+void megasas_free_host_crash_buffer(struct megasas_instance *instance);
+void megasas_fusion_crash_dump_wq(struct work_struct *work);
 #endif				/*LSI_MEGARAID_SAS_H */
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index a894f13..5b58e39d 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -2560,6 +2560,152 @@ static int megasas_change_queue_depth(struct scsi_device *sdev,
 	return queue_depth;
 }
 
+static ssize_t
+megasas_fw_crash_buffer_store(struct device *cdev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct Scsi_Host *shost = class_to_shost(cdev);
+	struct megasas_instance *instance =
+		(struct megasas_instance *) shost->hostdata;
+	int val = 0;
+	unsigned long flags;
+
+	if (kstrtoint(buf, 0, &val) != 0)
+		return -EINVAL;
+
+	spin_lock_irqsave(&instance->crashdump_lock, flags);
+	instance->fw_crash_buffer_offset = val;
+	spin_unlock_irqrestore(&instance->crashdump_lock, flags);
+	return strlen(buf);
+}
+
+static ssize_t
+megasas_fw_crash_buffer_show(struct device *cdev,
+	struct device_attribute *attr, char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(cdev);
+	struct megasas_instance *instance =
+		(struct megasas_instance *) shost->hostdata;
+	u32 size;
+	unsigned long buff_addr;
+	unsigned long dmachunk = CRASH_DMA_BUF_SIZE;
+	unsigned long src_addr;
+	unsigned long flags;
+	u32 buff_offset;
+
+	buff_offset = instance->fw_crash_buffer_offset;
+	spin_lock_irqsave(&instance->crashdump_lock, flags);
+	if (!instance->crash_dump_buf &&
+		!((instance->fw_crash_state == AVAILABLE) ||
+		(instance->fw_crash_state == COPYING))) {
+		dev_err(&instance->pdev->dev,
+			"Firmware crash dump is not available\n");
+		spin_unlock_irqrestore(&instance->crashdump_lock, flags);
+		return -EINVAL;
+	}
+
+	buff_addr = (unsigned long) buf;
+
+	if (buff_offset >
+		(instance->fw_crash_buffer_size * dmachunk)) {
+		dev_err(&instance->pdev->dev,
+			"Firmware crash dump offset is out of range\n");
+		spin_unlock_irqrestore(&instance->crashdump_lock, flags);
+		return 0;
+	}
+
+	size = (instance->fw_crash_buffer_size * dmachunk) - buff_offset;
+	size = (size >= PAGE_SIZE) ? (PAGE_SIZE - 1) : size;
+
+	src_addr = (unsigned long)instance->crash_buf[buff_offset / dmachunk] +
+		(buff_offset % dmachunk);
+	memcpy(buf, (void *)src_addr,  size);
+	spin_unlock_irqrestore(&instance->crashdump_lock, flags);
+
+	return size;
+}
+
+static ssize_t
+megasas_fw_crash_buffer_size_show(struct device *cdev,
+	struct device_attribute *attr, char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(cdev);
+	struct megasas_instance *instance =
+		(struct megasas_instance *) shost->hostdata;
+
+	return snprintf(buf, PAGE_SIZE, "%ld\n", (unsigned long)
+		((instance->fw_crash_buffer_size) * 1024 * 1024)/PAGE_SIZE);
+}
+
+static ssize_t
+megasas_fw_crash_state_store(struct device *cdev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct Scsi_Host *shost = class_to_shost(cdev);
+	struct megasas_instance *instance =
+		(struct megasas_instance *) shost->hostdata;
+	int val = 0;
+	unsigned long flags;
+
+	if (kstrtoint(buf, 0, &val) != 0)
+		return -EINVAL;
+
+	if ((val <= AVAILABLE || val > COPY_ERROR)) {
+		dev_err(&instance->pdev->dev, "application updates invalid "
+			"firmware crash state\n");
+		return -EINVAL;
+	}
+
+	instance->fw_crash_state = val;
+
+	if ((val == COPIED) || (val == COPY_ERROR)) {
+		spin_lock_irqsave(&instance->crashdump_lock, flags);
+		megasas_free_host_crash_buffer(instance);
+		spin_unlock_irqrestore(&instance->crashdump_lock, flags);
+		if (val == COPY_ERROR)
+			dev_info(&instance->pdev->dev, "application failed to "
+				"copy Firmware crash dump\n");
+		else
+			dev_info(&instance->pdev->dev, "Firmware crash dump "
+				"copied successfully\n");
+	}
+	return strlen(buf);
+}
+
+static ssize_t
+megasas_fw_crash_state_show(struct device *cdev,
+	struct device_attribute *attr, char *buf)
+{
+	struct Scsi_Host *shost = class_to_shost(cdev);
+	struct megasas_instance *instance =
+		(struct megasas_instance *) shost->hostdata;
+	return snprintf(buf, PAGE_SIZE, "%d\n", instance->fw_crash_state);
+}
+
+static ssize_t
+megasas_page_size_show(struct device *cdev,
+	struct device_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%ld\n", (unsigned long)PAGE_SIZE - 1);
+}
+
+static DEVICE_ATTR(fw_crash_buffer, S_IRUGO | S_IWUSR,
+	megasas_fw_crash_buffer_show, megasas_fw_crash_buffer_store);
+static DEVICE_ATTR(fw_crash_buffer_size, S_IRUGO,
+	megasas_fw_crash_buffer_size_show, NULL);
+static DEVICE_ATTR(fw_crash_state, S_IRUGO | S_IWUSR,
+	megasas_fw_crash_state_show, megasas_fw_crash_state_store);
+static DEVICE_ATTR(page_size, S_IRUGO,
+	megasas_page_size_show, NULL);
+
+struct device_attribute *megaraid_host_attrs[] = {
+	&dev_attr_fw_crash_buffer_size,
+	&dev_attr_fw_crash_buffer,
+	&dev_attr_fw_crash_state,
+	&dev_attr_page_size,
+	NULL,
+};
+
 /*
  * Scsi host template for megaraid_sas driver
  */
@@ -2575,6 +2721,7 @@ static struct scsi_host_template megasas_template = {
 	.eh_bus_reset_handler = megasas_reset_bus_host,
 	.eh_host_reset_handler = megasas_reset_bus_host,
 	.eh_timed_out = megasas_reset_timer,
+	.shost_attrs = megaraid_host_attrs,
 	.bios_param = megasas_bios_param,
 	.use_clustering = ENABLE_CLUSTERING,
 	.change_queue_depth = megasas_change_queue_depth,
@@ -3887,6 +4034,59 @@ megasas_get_ctrl_info(struct megasas_instance *instance,
 	return ret;
 }
 
+/*
+ * megasas_set_crash_dump_params -	Sends address of crash dump DMA buffer
+ *					to firmware
+ *
+ * @instance:				Adapter soft state
+ * @crash_buf_state		-	tell FW to turn ON/OFF crash dump feature
+					MR_CRASH_BUF_TURN_OFF = 0
+					MR_CRASH_BUF_TURN_ON = 1
+ * @return 0 on success non-zero on failure.
+ * Issues an internal command (DCMD) to set parameters for crash dump feature.
+ * Driver will send address of crash dump DMA buffer and set mbox to tell FW
+ * that driver supports crash dump feature. This DCMD will be sent only if
+ * crash dump feature is supported by the FW.
+ *
+ */
+int megasas_set_crash_dump_params(struct megasas_instance *instance,
+	u8 crash_buf_state)
+{
+	int ret = 0;
+	struct megasas_cmd *cmd;
+	struct megasas_dcmd_frame *dcmd;
+
+	cmd = megasas_get_cmd(instance);
+
+	if (!cmd) {
+		dev_err(&instance->pdev->dev, "Failed to get a free cmd\n");
+		return -ENOMEM;
+	}
+
+
+	dcmd = &cmd->frame->dcmd;
+
+	memset(dcmd->mbox.b, 0, MFI_MBOX_SIZE);
+	dcmd->mbox.b[0] = crash_buf_state;
+	dcmd->cmd = MFI_CMD_DCMD;
+	dcmd->cmd_status = 0xFF;
+	dcmd->sge_count = 1;
+	dcmd->flags = cpu_to_le16(MFI_FRAME_DIR_NONE);
+	dcmd->timeout = 0;
+	dcmd->pad_0 = 0;
+	dcmd->data_xfer_len = cpu_to_le32(CRASH_DMA_BUF_SIZE);
+	dcmd->opcode = cpu_to_le32(MR_DCMD_CTRL_SET_CRASH_DUMP_PARAMS);
+	dcmd->sgl.sge32[0].phys_addr = cpu_to_le32(instance->crash_dump_h);
+	dcmd->sgl.sge32[0].length = cpu_to_le32(CRASH_DMA_BUF_SIZE);
+
+	if (!megasas_issue_polled(instance, cmd))
+		ret = 0;
+	else
+		ret = -1;
+	megasas_return_cmd(instance, cmd);
+	return ret;
+}
+
 /**
  * megasas_issue_init_mfi -	Initializes the FW
  * @instance:		Adapter soft state
@@ -4272,6 +4472,27 @@ static int megasas_init_fw(struct megasas_instance *instance)
 			printk(KERN_WARNING "megaraid_sas: I am VF "
 			       "requestorId %d\n", instance->requestorId);
 		}
+
+		le32_to_cpus((u32 *)&ctrl_info->adapterOperations3);
+		instance->crash_dump_fw_support =
+			ctrl_info->adapterOperations3.supportCrashDump;
+		instance->crash_dump_drv_support =
+			(instance->crash_dump_fw_support &&
+			instance->crash_dump_buf);
+		if (instance->crash_dump_drv_support) {
+			dev_info(&instance->pdev->dev, "Firmware Crash dump "
+				"feature is supported\n");
+			megasas_set_crash_dump_params(instance,
+				MR_CRASH_BUF_TURN_OFF);
+
+		} else {
+			if (instance->crash_dump_buf)
+				pci_free_consistent(instance->pdev,
+					CRASH_DMA_BUF_SIZE,
+					instance->crash_dump_buf,
+					instance->crash_dump_h);
+			instance->crash_dump_buf = NULL;
+		}
 	}
 	instance->max_sectors_per_req = instance->max_num_sge *
 						PAGE_SIZE / 512;
@@ -4791,6 +5012,21 @@ static int megasas_probe_one(struct pci_dev *pdev,
 		break;
 	}
 
+	/* Crash dump feature related initialisation*/
+	instance->drv_buf_index = 0;
+	instance->drv_buf_alloc = 0;
+	instance->crash_dump_fw_support = 0;
+	instance->crash_dump_app_support = 0;
+	instance->fw_crash_state = UNAVAILABLE;
+	spin_lock_init(&instance->crashdump_lock);
+
+	instance->crash_dump_buf = pci_alloc_consistent(pdev,
+						CRASH_DMA_BUF_SIZE,
+						&instance->crash_dump_h);
+	if (!instance->crash_dump_buf)
+		dev_err(&instance->pdev->dev, "Can't allocate Firmware "
+			"crash dump DMA buffer\n");
+
 	megasas_poll_wait_aen = 0;
 	instance->flag_ieee = 0;
 	instance->ev = NULL;
@@ -4852,9 +5088,10 @@ static int megasas_probe_one(struct pci_dev *pdev,
 	if ((instance->pdev->device == PCI_DEVICE_ID_LSI_FUSION) ||
 	    (instance->pdev->device == PCI_DEVICE_ID_LSI_PLASMA) ||
 	    (instance->pdev->device == PCI_DEVICE_ID_LSI_INVADER) ||
-	    (instance->pdev->device == PCI_DEVICE_ID_LSI_FURY))
+	    (instance->pdev->device == PCI_DEVICE_ID_LSI_FURY)) {
 		INIT_WORK(&instance->work_init, megasas_fusion_ocr_wq);
-	else
+		INIT_WORK(&instance->crash_init, megasas_fusion_crash_dump_wq);
+	} else
 		INIT_WORK(&instance->work_init, process_fw_state_change_wq);
 
 	/*
@@ -5342,6 +5579,8 @@ static void megasas_detach_one(struct pci_dev *pdev)
 	if (instance->requestorId && !instance->skip_heartbeat_timer_del)
 		del_timer_sync(&instance->sriov_heartbeat_timer);
 
+	if (instance->fw_crash_state != UNAVAILABLE)
+		megasas_free_host_crash_buffer(instance);
 	scsi_remove_host(instance->host);
 	megasas_flush_cache(instance);
 	megasas_shutdown_controller(instance, MR_DCMD_CTRL_SHUTDOWN);
@@ -5432,6 +5671,10 @@ static void megasas_detach_one(struct pci_dev *pdev)
 				    instance->hb_host_mem,
 				    instance->hb_host_mem_h);
 
+	if (instance->crash_dump_buf)
+		pci_free_consistent(pdev, CRASH_DMA_BUF_SIZE,
+			    instance->crash_dump_buf, instance->crash_dump_h);
+
 	scsi_host_put(host);
 
 	pci_disable_device(pdev);
@@ -5523,6 +5766,45 @@ static unsigned int megasas_mgmt_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+/*
+ * megasas_set_crash_dump_params_ioctl:
+ *		Send CRASH_DUMP_MODE DCMD to all controllers
+ * @cmd:	MFI command frame
+ */
+
+static int megasas_set_crash_dump_params_ioctl(
+	struct megasas_cmd *cmd)
+{
+	struct megasas_instance *local_instance;
+	int i, error = 0;
+	int crash_support;
+
+	crash_support = cmd->frame->dcmd.mbox.w[0];
+
+	for (i = 0; i < megasas_mgmt_info.max_index; i++) {
+		local_instance = megasas_mgmt_info.instance[i];
+		if (local_instance && local_instance->crash_dump_drv_support) {
+			if ((local_instance->adprecovery ==
+				MEGASAS_HBA_OPERATIONAL) &&
+				!megasas_set_crash_dump_params(local_instance,
+					crash_support)) {
+				local_instance->crash_dump_app_support =
+					crash_support;
+				dev_info(&local_instance->pdev->dev,
+					"Application firmware crash "
+					"dump mode set success\n");
+				error = 0;
+			} else {
+				dev_info(&local_instance->pdev->dev,
+					"Application firmware crash "
+					"dump mode set failed\n");
+				error = -1;
+			}
+		}
+	}
+	return error;
+}
+
 /**
  * megasas_mgmt_fw_ioctl -	Issues management ioctls to FW
  * @instance:			Adapter soft state
@@ -5569,6 +5851,12 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance,
 					       MFI_FRAME_SGL64 |
 					       MFI_FRAME_SENSE64));
 
+	if (cmd->frame->dcmd.opcode == MR_DRIVER_SET_APP_CRASHDUMP_MODE) {
+		error = megasas_set_crash_dump_params_ioctl(cmd);
+		megasas_return_cmd(instance, cmd);
+		return error;
+	}
+
 	/*
 	 * The management interface between applications and the fw uses
 	 * MFI frames. E.g, RAID configuration changes, LD property changes
diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index f30297d..aaba2a7 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -91,6 +91,8 @@ void megasas_start_timer(struct megasas_instance *instance,
 extern struct megasas_mgmt_info megasas_mgmt_info;
 extern int resetwaittime;
 
+
+
 /**
  * megasas_enable_intr_fusion -	Enables interrupts
  * @regs:			MFI register set
@@ -2057,7 +2059,7 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp)
 {
 	struct megasas_irq_context *irq_context = devp;
 	struct megasas_instance *instance = irq_context->instance;
-	u32 mfiStatus, fw_state;
+	u32 mfiStatus, fw_state, dma_state;
 
 	if (instance->mask_interrupts)
 		return IRQ_NONE;
@@ -2079,7 +2081,16 @@ irqreturn_t megasas_isr_fusion(int irq, void *devp)
 		/* If we didn't complete any commands, check for FW fault */
 		fw_state = instance->instancet->read_fw_status_reg(
 			instance->reg_set) & MFI_STATE_MASK;
-		if (fw_state == MFI_STATE_FAULT) {
+		dma_state = instance->instancet->read_fw_status_reg
+			(instance->reg_set) & MFI_STATE_DMADONE;
+		if (instance->crash_dump_drv_support &&
+			instance->crash_dump_app_support) {
+			/* Start collecting crash, if DMA bit is done */
+			if ((fw_state == MFI_STATE_FAULT) && dma_state)
+				schedule_work(&instance->crash_init);
+			else if (fw_state == MFI_STATE_FAULT)
+				schedule_work(&instance->work_init);
+		} else if (fw_state == MFI_STATE_FAULT) {
 			printk(KERN_WARNING "megaraid_sas: Iop2SysDoorbellInt"
 			       "for scsi%d\n", instance->host->host_no);
 			schedule_work(&instance->work_init);
@@ -2232,6 +2243,49 @@ megasas_read_fw_status_reg_fusion(struct megasas_register_set __iomem *regs)
 }
 
 /**
+ * megasas_alloc_host_crash_buffer -	Host buffers for Crash dump collection from Firmware
+ * @instance:				Controller's soft instance
+ * return:			        Number of allocated host crash buffers
+ */
+static void
+megasas_alloc_host_crash_buffer(struct megasas_instance *instance)
+{
+	unsigned int i;
+
+	instance->crash_buf_pages = get_order(CRASH_DMA_BUF_SIZE);
+	for (i = 0; i < MAX_CRASH_DUMP_SIZE; i++) {
+		instance->crash_buf[i] = (void	*)__get_free_pages(GFP_KERNEL,
+				instance->crash_buf_pages);
+		if (!instance->crash_buf[i]) {
+			dev_info(&instance->pdev->dev, "Firmware crash dump "
+				"memory allocation failed at index %d\n", i);
+			break;
+		}
+	}
+	instance->drv_buf_alloc = i;
+}
+
+/**
+ * megasas_free_host_crash_buffer -	Host buffers for Crash dump collection from Firmware
+ * @instance:				Controller's soft instance
+ */
+void
+megasas_free_host_crash_buffer(struct megasas_instance *instance)
+{
+	unsigned int i
+;
+	for (i = 0; i < MAX_CRASH_DUMP_SIZE; i++) {
+		if (instance->crash_buf[i])
+			free_pages((ulong)instance->crash_buf[i],
+					instance->crash_buf_pages);
+	}
+	instance->drv_buf_index = 0;
+	instance->drv_buf_alloc = 0;
+	instance->fw_crash_state = UNAVAILABLE;
+	instance->fw_crash_buffer_size = 0;
+}
+
+/**
  * megasas_adp_reset_fusion -	For controller reset
  * @regs:				MFI register set
  */
@@ -2374,6 +2428,7 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
 	struct megasas_cmd *cmd_mfi;
 	union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;
 	u32 host_diag, abs_state, status_reg, reset_adapter;
+	u32 io_timeout_in_crash_mode = 0;
 
 	instance = (struct megasas_instance *)shost->hostdata;
 	fusion = instance->ctrl_context;
@@ -2387,6 +2442,42 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
 		mutex_unlock(&instance->reset_mutex);
 		return FAILED;
 	}
+	status_reg = instance->instancet->read_fw_status_reg(instance->reg_set);
+	abs_state = status_reg & MFI_STATE_MASK;
+
+	/* IO timeout detected, forcibly put FW in FAULT state */
+	if (abs_state != MFI_STATE_FAULT && instance->crash_dump_buf &&
+		instance->crash_dump_app_support && iotimeout) {
+		dev_info(&instance->pdev->dev, "IO timeout is detected, "
+			"forcibly FAULT Firmware\n");
+		instance->adprecovery = MEGASAS_ADPRESET_SM_INFAULT;
+		status_reg = readl(&instance->reg_set->doorbell);
+		writel(status_reg | MFI_STATE_FORCE_OCR,
+			&instance->reg_set->doorbell);
+		readl(&instance->reg_set->doorbell);
+		mutex_unlock(&instance->reset_mutex);
+		do {
+			ssleep(3);
+			io_timeout_in_crash_mode++;
+			dev_dbg(&instance->pdev->dev, "waiting for [%d] "
+				"seconds for crash dump collection and OCR "
+				"to be done\n", (io_timeout_in_crash_mode * 3));
+		} while ((instance->adprecovery != MEGASAS_HBA_OPERATIONAL) &&
+			(io_timeout_in_crash_mode < 80));
+
+		if (instance->adprecovery == MEGASAS_HBA_OPERATIONAL) {
+			dev_info(&instance->pdev->dev, "OCR done for IO "
+				"timeout case\n");
+			retval = SUCCESS;
+		} else {
+			dev_info(&instance->pdev->dev, "Controller is not "
+				"operational after 240 seconds wait for IO "
+				"timeout case in FW crash dump mode\n do "
+				"OCR/kill adapter\n");
+			retval = megasas_reset_fusion(shost, 0);
+		}
+		return retval;
+	}
 
 	if (instance->requestorId && !instance->skip_heartbeat_timer_del)
 		del_timer_sync(&instance->sriov_heartbeat_timer);
@@ -2653,6 +2744,15 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int iotimeout)
 			printk(KERN_WARNING "megaraid_sas: Reset "
 			       "successful for scsi%d.\n",
 				instance->host->host_no);
+
+			if (instance->crash_dump_drv_support) {
+				if (instance->crash_dump_app_support)
+					megasas_set_crash_dump_params(instance,
+						MR_CRASH_BUF_TURN_ON);
+				else
+					megasas_set_crash_dump_params(instance,
+						MR_CRASH_BUF_TURN_OFF);
+			}
 			retval = SUCCESS;
 			goto out;
 		}
@@ -2681,6 +2781,74 @@ out:
 	return retval;
 }
 
+/* Fusion Crash dump collection work queue */
+void  megasas_fusion_crash_dump_wq(struct work_struct *work)
+{
+	struct megasas_instance *instance =
+		container_of(work, struct megasas_instance, crash_init);
+	u32 status_reg;
+	u8 partial_copy = 0;
+
+
+	status_reg = instance->instancet->read_fw_status_reg(instance->reg_set);
+
+	/*
+	 * Allocate host crash buffers to copy data from 1 MB DMA crash buffer
+	 * to host crash buffers
+	 */
+	if (instance->drv_buf_index == 0) {
+		/* Buffer is already allocated for old Crash dump.
+		 * Do OCR and do not wait for crash dump collection
+		 */
+		if (instance->drv_buf_alloc) {
+			dev_info(&instance->pdev->dev, "earlier crash dump is "
+				"not yet copied by application, ignoring this "
+				"crash dump and initiating OCR\n");
+			status_reg |= MFI_STATE_CRASH_DUMP_DONE;
+			writel(status_reg,
+				&instance->reg_set->outbound_scratch_pad);
+			readl(&instance->reg_set->outbound_scratch_pad);
+			return;
+		}
+		megasas_alloc_host_crash_buffer(instance);
+		dev_info(&instance->pdev->dev, "Number of host crash buffers "
+			"allocated: %d\n", instance->drv_buf_alloc);
+	}
+
+	/*
+	 * Driver has allocated max buffers, which can be allocated
+	 * and FW has more crash dump data, then driver will
+	 * ignore the data.
+	 */
+	if (instance->drv_buf_index >= (instance->drv_buf_alloc)) {
+		dev_info(&instance->pdev->dev, "Driver is done copying "
+			"the buffer: %d\n", instance->drv_buf_alloc);
+		status_reg |= MFI_STATE_CRASH_DUMP_DONE;
+		partial_copy = 1;
+	} else {
+		memcpy(instance->crash_buf[instance->drv_buf_index],
+			instance->crash_dump_buf, CRASH_DMA_BUF_SIZE);
+		instance->drv_buf_index++;
+		status_reg &= ~MFI_STATE_DMADONE;
+	}
+
+	if (status_reg & MFI_STATE_CRASH_DUMP_DONE) {
+		dev_info(&instance->pdev->dev, "Crash Dump is available,number "
+			"of copied buffers: %d\n", instance->drv_buf_index);
+		instance->fw_crash_buffer_size =  instance->drv_buf_index;
+		instance->fw_crash_state = AVAILABLE;
+		instance->drv_buf_index = 0;
+		writel(status_reg, &instance->reg_set->outbound_scratch_pad);
+		readl(&instance->reg_set->outbound_scratch_pad);
+		if (!partial_copy)
+			megasas_reset_fusion(instance->host, 0);
+	} else {
+		writel(status_reg, &instance->reg_set->outbound_scratch_pad);
+		readl(&instance->reg_set->outbound_scratch_pad);
+	}
+}
+
+
 /* Fusion OCR work queue */
 void megasas_fusion_ocr_wq(struct work_struct *work)
 {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 14+ messages in thread