> > +		if (reply_dma)
> > +			mpi3mr_repost_reply_buf(mrioc, reply_dma);
> > +		num_op_reply++;
> > +
> > +		if (++reply_ci == op_reply_q->num_replies) {
> > +			reply_ci = 0;
> > +			exp_phase ^= 1;
> > +		}
> > +
> > +		reply_desc = mpi3mr_get_reply_desc(op_reply_q, reply_ci);
> > +
> > +		if ((le16_to_cpu(reply_desc->reply_flags) &
> > +		    MPI3_REPLY_DESCRIPT_FLAGS_PHASE_MASK) != exp_phase)
> > +			break;
> > +
> > +	} while (1);
> > +
>
> Is this loop bounded by something?
> The way it looks, we might end up having to process a lot of replies,
> causing a bogus hangcheck trigger.
> Have you checked for that condition?

Hi Hannes -

You are correct, there can be a bogus CPU lockup issue here. We could have
used the irq_poll interface as we did in the mpt3sas and megaraid_sas
drivers, but since mpi3mr uses a hybrid threaded ISR, we cover this
scenario in the threaded ISR instead. See patch #0017.

We have the below check to handle CPU lockup -

	if (num_op_reply > mrioc->max_host_ios) {
		intr_info->op_reply_q->enable_irq_poll = true;
		break;
	}

> > +	writel(reply_ci,
> > +	    &mrioc->sysif_regs->oper_queue_indexes[reply_qidx].consumer_index);
> > +	op_reply_q->ci = reply_ci;
> > +	op_reply_q->ephase = exp_phase;
> > +
> > +	return num_op_reply;
> > +}
> > +
> >  static irqreturn_t mpi3mr_isr_primary(int irq, void *privdata)
> >  {
> >  	struct mpi3mr_intr_info *intr_info = privdata;
> > @@ -1302,6 +1395,74 @@ static int mpi3mr_create_op_queues(struct mpi3mr_ioc *mrioc)
> >  	return retval;
> >  }
> >
> > +/**
> > + * mpi3mr_op_request_post - Post request to operational queue
> > + * @mrioc: Adapter reference
> > + * @op_req_q: Operational request queue info
> > + * @req: MPI3 request
> > + *
> > + * Post the MPI3 request into operational request queue and
> > + * inform the controller, if the queue is full return
> > + * appropriate error.
> > + *
> > + * Return: 0 on success, non-zero on failure.
> > + */
> > +int mpi3mr_op_request_post(struct mpi3mr_ioc *mrioc,
> > +	struct op_req_qinfo *op_req_q, u8 *req)
> > +{
> > +	u16 pi = 0, max_entries, reply_qidx = 0, midx;
> > +	int retval = 0;
> > +	unsigned long flags;
> > +	u8 *req_entry;
> > +	void *segment_base_addr;
> > +	u16 req_sz = mrioc->facts.op_req_sz;
> > +	struct segments *segments = op_req_q->q_segments;
> > +
> > +	reply_qidx = op_req_q->reply_qid - 1;
> > +
> > +	if (mrioc->unrecoverable)
> > +		return -EFAULT;
> > +
> > +	spin_lock_irqsave(&op_req_q->q_lock, flags);
> > +	pi = op_req_q->pi;
> > +	max_entries = op_req_q->num_requests;
> > +
> > +	if (mpi3mr_check_req_qfull(op_req_q)) {
> > +		midx = REPLY_QUEUE_IDX_TO_MSIX_IDX(
> > +		    reply_qidx, mrioc->op_reply_q_offset);
> > +		mpi3mr_process_op_reply_q(mrioc, &mrioc->intr_info[midx]);
> > +
> > +		if (mpi3mr_check_req_qfull(op_req_q)) {
> > +			retval = -EAGAIN;
> > +			goto out;
> > +		}
> > +	}
> > +
> > +	if (mrioc->reset_in_progress) {
> > +		ioc_err(mrioc, "OpReqQ submit reset in progress\n");
> > +		retval = -EAGAIN;
> > +		goto out;
> > +	}
> > +
>
> Have you considered a different error code here?
> Reset in progress and queue full are two different scenarios; especially
> the latter might warrant some further action like decreasing the queue
> depth, which gets hard if you have the same error ...

There is really no functional difference today, because the caller of
mpi3mr_op_request_post() only checks for zero vs. non-zero. But I take your
point that we should return a better error code (if possible) when the
controller is in reset, for example along the lines of the sketch below.
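
For illustration only (this is not what the posted patch does, and the
exact error codes are open for discussion), the reset-in-progress case
could return a code distinct from the queue-full case so the caller can
react differently, e.g. only consider reducing the queue depth on
queue-full:

	if (mpi3mr_check_req_qfull(op_req_q)) {
		/* queue full: caller may retry or reduce queue depth */
		retval = -EAGAIN;
		goto out;
	}

	if (mrioc->reset_in_progress) {
		/* controller reset: retry only after the reset completes */
		ioc_err(mrioc, "OpReqQ submit reset in progress\n");
		retval = -EBUSY;
		goto out;
	}
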
I am thinking of using the scsi_block_requests()/scsi_unblock_requests()
pair in the controller reset path. We want to get a stable first version of
mpi3mr out, and such changes require additional testing, so I will
accommodate changes in this area in the next series (a rough sketch of the
idea is at the bottom of this mail). Is that OK with you?

> > +	switch (ioc_status) {
> > +	case MPI3_IOCSTATUS_BUSY:
> > +	case MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES:
> > +		scmd->result = SAM_STAT_BUSY;
> > +		break;
> > +	case MPI3_IOCSTATUS_SCSI_DEVICE_NOT_THERE:
> > +		scmd->result = DID_NO_CONNECT << 16;
> > +		break;
> > +	case MPI3_IOCSTATUS_SCSI_IOC_TERMINATED:
> > +		scmd->result = DID_SOFT_ERROR << 16;
> > +		break;
> > +	case MPI3_IOCSTATUS_SCSI_TASK_TERMINATED:
> > +	case MPI3_IOCSTATUS_SCSI_EXT_TERMINATED:
> > +		scmd->result = DID_RESET << 16;
> > +		break;
> > +	case MPI3_IOCSTATUS_SCSI_RESIDUAL_MISMATCH:
> > +		if ((xfer_count == 0) || (scmd->underflow > xfer_count))
> > +			scmd->result = DID_SOFT_ERROR << 16;
> > +		else
> > +			scmd->result = (DID_OK << 16) | scsi_status;
> > +		break;
> > +	case MPI3_IOCSTATUS_SCSI_DATA_UNDERRUN:
> > +		scmd->result = (DID_OK << 16) | scsi_status;
> > +		if (sense_state == MPI3_SCSI_STATE_SENSE_VALID)
> > +			break;
> > +		if (xfer_count < scmd->underflow) {
> > +			if (scsi_status == SAM_STAT_BUSY)
> > +				scmd->result = SAM_STAT_BUSY;
> > +			else
> > +				scmd->result = DID_SOFT_ERROR << 16;
> > +		} else if ((scsi_state & (MPI3_SCSI_STATE_NO_SCSI_STATUS)) ||
> > +		    (sense_state != MPI3_SCSI_STATE_SENSE_NOT_AVAILABLE))
> > +			scmd->result = DID_SOFT_ERROR << 16;
> > +		else if (scsi_state & MPI3_SCSI_STATE_TERMINATED)
> > +			scmd->result = DID_RESET << 16;
> > +		else if (!xfer_count && scmd->cmnd[0] == REPORT_LUNS) {
> > +			scsi_status = SAM_STAT_CHECK_CONDITION;
> > +			scmd->result = (DRIVER_SENSE << 24) |
> > +			    SAM_STAT_CHECK_CONDITION;
> > +			scmd->sense_buffer[0] = 0x70;
> > +			scmd->sense_buffer[2] = ILLEGAL_REQUEST;
> > +			scmd->sense_buffer[12] = 0x20;
> > +			scmd->sense_buffer[13] = 0;
> > +		}
>
> Huh? A separate error handling just for REPORT LUNS?
> Did you mess up your firmware?
> And if you know REPORT LUNS is not supported, why bother sending it to the
> firmware and not completing it straightaway in queuecommand()?

This special-case handling was ported from the solution we used in mpt3sas.
I am not sure of the exact history, but it came from a mix of firmware and
kernel behavior; some older kernels logged a lot of messages without this
special handling in the driver. We will remove this check from the mpi3mr
driver and, if an actual issue shows up, add a fix in the future. I will
handle it in the V6 submission.

Kashyap
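
For reference, roughly what I have in mind for the reset path (illustrative
only; the exact function and placement of the calls may differ in the actual
patch, and this assumes the Scsi_Host pointer is reachable as mrioc->shost):

	/* in the controller soft reset handler: stop new I/O from the
	 * midlayer before tearing down the operational queues */
	scsi_block_requests(mrioc->shost);

	/* ... issue the soft reset and reinitialize the queues ... */

	/* let I/O flow again once the controller is back */
	scsi_unblock_requests(mrioc->shost);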