I/O performance with block sizes > 128k

* I/O performance with block sizes > 128k
@ 2020-03-09 21:49 Bijan Mottahedeh
  2020-03-09 22:47 ` Keith Busch
  2020-03-10 17:08 ` Christoph Hellwig
  0 siblings, 2 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-03-09 21:49 UTC (permalink / raw)
  To: linux-nvme

I'm seeing a sizeable drop in perf with polled fio tests for block sizes 
 > 128k:

filename=/dev/nvme0n1
rw=randread
direct=1
time_based=1
randrepeat=1
gtod_reduce=1

fio --readonly --ioengine=io_uring --iodepth 1024 --fixedbufs --hipri 
--numjob

The problem seems to be related to switching from prp_small_pool to 
prp_page_pool; the former is optimized for I/O between 4k and 128k.

Expanding the small pool size to cover up to 256k increases the 
performance.  I'm not sure however if this is the proper and general 
fix.  For one thing, expanding the pool size bumps the numbers for a 
short burst test (10 sec) but the numbers drop again significantly 
during a longer test.  The behavior is not unique to io_uring either.  
Included below are a couple of 256k fio pvsync2 tests as well.

A related question, is it required for dma pool allocations to use 
GFP_ATOMIC?  Looks they can only be called from nvme_queue_rq().Just as 
a test, I changed the flag to GFP_NOWAIT, and that seems to be ok.

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d3f23d6..16ae0d7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -40,6 +40,8 @@
  #define NVME_MAX_KB_SZ 4096
  #define NVME_MAX_SEGS  127

+#define        PRP_SMALL_SZ    512
+
  static int use_threaded_interrupts;
  module_param(use_threaded_interrupts, int, 0);

@@ -618,7 +620,7 @@ static blk_status_t nvme_pci_setup_prps(struct 
nvme_dev *dev
         }

         nprps = DIV_ROUND_UP(length, page_size);
-       if (nprps <= (256 / 8)) {
+       if (nprps <= (PRP_SMALL_SZ / 8)) {
                 pool = dev->prp_small_pool;
                 iod->npages = 0;
         } else {
@@ -626,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct 
nvme_dev *dev
                 iod->npages = 1;
         }

-       prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+       prp_list = dma_pool_alloc(pool, GFP_NOWAIT, &prp_dma);
         if (!prp_list) {
                 iod->first_dma = dma_addr;
                 iod->npages = -1;
@@ -638,7 +640,7 @@ static blk_status_t nvme_pci_setup_prps(struct 
nvme_dev *dev
         for (;;) {
                 if (i == page_size >> 3) {
                         __le64 *old_prp_list = prp_list;
-                       prp_list = dma_pool_alloc(pool, GFP_ATOMIC, 
&prp_dma);
+                       prp_list = dma_pool_alloc(pool, GFP_NOWAIT, 
&prp_dma);
                         if (!prp_list)
                                 return BLK_STS_RESOURCE;
                         list[iod->npages++] = prp_list;
@@ -713,7 +715,7 @@ static blk_status_t nvme_pci_setup_sgls(struct 
nvme_dev *dev
                 return BLK_STS_OK;
         }

-       if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+       if (entries <= (PRP_SMALL_SZ / sizeof(struct nvme_sgl_desc))) {
                 pool = dev->prp_small_pool;
                 iod->npages = 0;
         } else {
@@ -721,7 +723,7 @@ static blk_status_t nvme_pci_setup_sgls(struct 
nvme_dev *dev
                 iod->npages = 1;
         }

-       sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+       sg_list = dma_pool_alloc(pool, GFP_NOWAIT, &sgl_dma);
         if (!sg_list) {
                 iod->npages = -1;
                 return BLK_STS_RESOURCE;
@@ -737,7 +739,7 @@ static blk_status_t nvme_pci_setup_sgls(struct 
nvme_dev *dev
                         struct nvme_sgl_desc *old_sg_desc = sg_list;
                         struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

-                       sg_list = dma_pool_alloc(pool, GFP_ATOMIC, 
&sgl_dma);
+                       sg_list = dma_pool_alloc(pool, GFP_NOWAIT, 
&sgl_dma);
                         if (!sg_list)
                                 return BLK_STS_RESOURCE;

@@ -814,7 +816,7 @@ static blk_status_t nvme_map_data(struct nvme_dev 
*dev, stru
         }

         iod->dma_len = 0;
-       iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+       iod->sg = mempool_alloc(dev->iod_mempool, GFP_NOWAIT);
         if (!iod->sg)
                 return BLK_STS_RESOURCE;
         sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
@@ -2475,9 +2477,9 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev)
         if (!dev->prp_page_pool)
                 return -ENOMEM;

-       /* Optimisation for I/Os between 4k and 128k */
-       dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
-                                               256, 256, 0);
+       /* Optimisation for I/Os between 4k and 256k */
+       dev->prp_small_pool = dma_pool_create("prp list small", dev->dev,
+ PRP_SMALL_SZ, PRP_SMALL_SZ, 0);
         if (!dev->prp_small_pool) {
                 dma_pool_destroy(dev->prp_page_pool);
                 return -ENOMEM;


prp_pool 256
bw=3338MiB/s (3500MB/s) 128k, 10sec
bw=868MiB/s (911MB/s)   144k, 10sec
bw=1607MiB/s (1685MB/s) 256k, 10sec

prp_pool 512
bw=3345MiB/s (3507MB/s) 128k, 10sec
bw=3453MiB/s (3621MB/s) 144k, 10sec
bw=4603MiB/s (4826MB/s) 256k, 10sec
bw=2982MiB/s (3127MB/s) 128k, 5min
bw=1762MiB/s (1847MB/s) 256k, 5min
bw=3579MiB/s (3753MB/s) 256k, 10sec, pvsync2
bw=1752MiB/s (1837MB/s) 256k, 5min,  pvsync2

prp_pool 512 , GFP_NOWAIT
bw=3295MiB/s (3455MB/s) 128k, 10sec
bw=3467MiB/s (3635MB/s) 144k, 10sec
bw=4486MiB/s (4704MB/s) 256k, 10sec
bw=3013MiB/s (3159MB/s) 128k, 5min
bw=1764MiB/s (1849MB/s) 256k, 5min


--bijan

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 10+ messages in thread