Hi,

Ming Lei posted the patch series below, and performance improved for the megaraid_sas driver. I used the same kernel base and found some more possible performance improvements in the block layer. This RFC improves both performance and CPU utilization. If this patch fits the design of blk-mq and scsi-mq, I can convert it into a PATCH and submit the same/a modified version.

https://marc.info/?l=linux-block&m=153062994403732&w=2

Description of change -

Do not insert the request into the software queue if BLK_MQ_F_NO_SCHED is set. Submit the request from blk_mq_make_request() directly to the low-level driver, via the call chain below.

blk_mq_try_issue_directly
  __blk_mq_try_issue_directly
    scsi_queue_rq

A low-level driver attached to scsi-mq can set BLK_MQ_F_NO_SCHED if it does not want the benefit of an I/O scheduler (e.g. SSDs connected to an IT/MR controller). For HDDs connected to an HBA, the driver can leave BLK_MQ_F_NO_SCHED unset so that the default elevator is set to mq-deadline.
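
For illustration, here is a minimal sketch of how a blk-mq driver could opt out of the I/O scheduler at tag-set allocation time (drv, drv_mq_ops and the queue depth are placeholders, not taken from any real driver):

    /* Hypothetical driver init: request scheduler bypass via the tag set. */
    struct blk_mq_tag_set *set = &drv->tag_set;

    set->ops = &drv_mq_ops;         /* placeholder queue_rq ops */
    set->nr_hw_queues = 1;
    set->queue_depth = 256;
    set->numa_node = NUMA_NO_NODE;
    set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_NO_SCHED;

    ret = blk_mq_alloc_tag_set(set);
    if (ret)
        return ret;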

Setup and performance details are listed below -

I created one RAID-0 VD consisting of 8 SSDs on a MegaRAID adapter.
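
For reference, a representative fio invocation for this kind of test (all parameters below are assumptions; the exact command line used is not reproduced here):

    # hypothetical workload; adjust device and job count to the setup
    fio --name=randread --filename=/dev/sdX --direct=1 --rw=randread \
        --bs=4k --ioengine=libaio --iodepth=32 --numjobs=8 \
        --runtime=60 --time_based --group_reporting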

Without the RFC - IOPS reach 840K and CPU utilization goes up to 11%. Below is the perf top output:

   5.17%  [kernel]                 [k] _raw_spin_lock
   4.62%  [kernel]                 [k] try_to_grab_pending
   2.29%  [kernel]                 [k] syscall_return_via_sysret
   1.37%  [kernel]                 [k] blk_mq_flush_busy_ctxs
   1.29%  [kernel]                 [k] kobject_get
   1.27%  fio                      [.] axmap_isset
   1.25%  [kernel]                 [k] flush_busy_ctx
   1.20%  [kernel]                 [k] scsi_dispatch_cmd
   1.18%  [kernel]                 [k] blk_mq_get_request
   1.16%  [kernel]                 [k] blk_mq_hctx_mark_pending.isra.45
   1.09%  [kernel]                 [k] irq_entries_start
   0.94%  [kernel]                 [k] del_timer
   0.91%  [kernel]                 [k] scsi_softirq_done
   0.90%  [kernel]                 [k] sbitmap_any_bit_set
   0.83%  [kernel]                 [k] blk_mq_free_request
   0.82%  [kernel]                 [k] kobject_put
   0.81%  [sd_mod]                 [k] sd_setup_read_write_cmnd
   0.80%  [kernel]                 [k] scsi_mq_get_budget
   0.79%  [kernel]                 [k] blk_mq_get_tag
   0.70%  [kernel]                 [k] blk_mq_dispatch_rq_list
   0.61%  [kernel]                 [k] bt_iter
   0.60%  fio                      [.] __fio_gettime
   0.59%  [kernel]                 [k] blk_mq_complete_request
   0.59%  [kernel]                 [k] gup_pgd_range
   0.57%  [kernel]                 [k] scsi_queue_rq


After applying the RFC - IOPS reach 1066K (a ~27% improvement) and CPU utilization goes up to only 6%. Below is the perf top output:

   2.56%  [kernel]             [k] syscall_return_via_sysret
   2.46%  [kernel]             [k] irq_entries_start
   2.43%  [kernel]             [k] kobject_get
   2.40%  [kernel]             [k] bt_iter
   2.16%  fio                  [.] axmap_isset
   2.06%  [kernel]             [k] _raw_spin_lock
   1.76%  [kernel]             [k] __audit_syscall_exit
   1.51%  [kernel]             [k] scsi_dispatch_cmd
   1.49%  [kernel]             [k] blk_mq_free_request
   1.49%  [sd_mod]             [k] sd_setup_read_write_cmnd
   1.45%  [kernel]             [k] scsi_softirq_done
   1.32%  [kernel]             [k] switch_mm_irqs_off
   1.28%  [kernel]             [k] scsi_mq_get_budget
   1.22%  [kernel]             [k] blk_mq_check_inflight
   1.13%  [kernel]             [k] kobject_put
   1.11%  fio                  [.] __fio_gettime
   0.95%  [kernel]             [k] gup_pgd_range
   0.90%  [kernel]             [k] blk_mq_get_tag
   0.88%  [kernel]             [k] read_tsc
   0.85%  [kernel]             [k] scsi_end_request
   0.85%  fio                  [.] get_io_u
   0.84%  [kernel]             [k] lookup_ioctx
   0.81%  [kernel]             [k] blk_mq_complete_request
   0.80%  [kernel]             [k] blk_mq_get_request
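
Note that the software queue flush paths (blk_mq_flush_busy_ctxs, flush_busy_ctx, blk_mq_hctx_mark_pending) no longer show up in the profile, which is consistent with requests bypassing the per-CPU software queues.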

Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com>
---

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4d1c048..ab27788 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1811,32 +1811,35 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
         blk_insert_flush(rq);
         blk_mq_run_hw_queue(data.hctx, true);
     } else if (plug && q->nr_hw_queues == 1) {
-        struct request *last = NULL;
-
         blk_mq_put_ctx(data.ctx);
         blk_mq_bio_to_request(rq, bio);
+        /* Bypass the scheduler if BLK_MQ_F_NO_SCHED is set */
+        if (q->tag_set->flags & BLK_MQ_F_NO_SCHED) {
+            blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+        } else {
+            struct request *last = NULL;
+            /*
+             * @request_count may become stale because of schedule
+             * out, so check the list again.
+             */
+            if (list_empty(&plug->mq_list))
+                request_count = 0;
+            else if (blk_queue_nomerges(q))
+                request_count = blk_plug_queued_count(q);
+
+            if (!request_count)
+                trace_block_plug(q);
+            else
+                last = list_entry_rq(plug->mq_list.prev);
+
+            if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+                blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
+                blk_flush_plug_list(plug, false);
+                trace_block_plug(q);
+            }
 
-        /*
-         * @request_count may become stale because of schedule
-         * out, so check the list again.
-         */
-        if (list_empty(&plug->mq_list))
-            request_count = 0;
-        else if (blk_queue_nomerges(q))
-            request_count = blk_plug_queued_count(q);
-
-        if (!request_count)
-            trace_block_plug(q);
-        else
-            last = list_entry_rq(plug->mq_list.prev);
-
-        if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
-            blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
-            blk_flush_plug_list(plug, false);
-            trace_block_plug(q);
+            list_add_tail(&rq->queuelist, &plug->mq_list);
         }
-
-        list_add_tail(&rq->queuelist, &plug->mq_list);
     } else if (plug && !blk_queue_nomerges(q)) {
         blk_mq_bio_to_request(rq, bio);