* I/O performance with block sizes > 128k
@ 2020-03-09 21:49 Bijan Mottahedeh
  2020-03-09 22:47 ` Keith Busch
  2020-03-10 17:08 ` Christoph Hellwig
  0 siblings, 2 replies; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-03-09 21:49 UTC (permalink / raw)
  To: linux-nvme

I'm seeing a sizeable drop in perf with polled fio tests for block sizes > 128k:

filename=/dev/nvme0n1
rw=randread
direct=1
time_based=1
randrepeat=1
gtod_reduce=1

fio --readonly --ioengine=io_uring --iodepth 1024 --fixedbufs --hipri --numjob

The problem seems to be related to switching from prp_small_pool to 
prp_page_pool; the former is optimized for I/O between 4k and 128k.

Expanding the small pool size to cover up to 256k increases the 
performance.  I'm not sure, however, whether this is the proper and 
general fix.  For one thing, expanding the pool size bumps the numbers 
for a short burst test (10 sec), but the numbers drop again 
significantly during a longer test.  The behavior is not unique to 
io_uring either.  Included below are a couple of 256k fio pvsync2 tests 
as well.
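For reference, here is a minimal standalone sketch of the arithmetic (my 
own illustration, not driver code), assuming a 4k controller page size 
and a page-aligned buffer; it mirrors the nprps check in 
nvme_pci_setup_prps():

/*
 * Sketch: how many PRP list entries an I/O needs and which pool the
 * driver would pick with the current 256-byte small pool.
 */
#include <stdio.h>

#define CTRL_PAGE_SIZE  4096u   /* ctrl->page_size on this setup */
#define SMALL_POOL_SZ   256u    /* bytes per prp_small_pool element */
#define PRP_ENTRY_SZ    8u      /* each PRP entry is a 64-bit address */

int main(void)
{
        unsigned int io_sizes[] = { 128u << 10, 144u << 10, 256u << 10 };

        for (unsigned int i = 0; i < 3; i++) {
                /* The first 4k is covered by PRP1 in the command itself. */
                unsigned int length = io_sizes[i] - CTRL_PAGE_SIZE;
                unsigned int nprps =
                        (length + CTRL_PAGE_SIZE - 1) / CTRL_PAGE_SIZE;
                const char *pool =
                        nprps <= SMALL_POOL_SZ / PRP_ENTRY_SZ ?
                        "prp_small_pool" : "prp_page_pool";

                printf("%3uk: %2u list entries -> %s\n",
                       io_sizes[i] >> 10, nprps, pool);
        }
        return 0;   /* 128k: 31 (small pool); 144k: 35, 256k: 63 (page pool) */
}

With a 512-byte small pool, 512 / 8 = 64 entries fit, which is why an 
aligned 256k I/O (63 list entries) stays in the small pool after the 
change below.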

A related question: is it required for dma pool allocations to use 
GFP_ATOMIC?  It looks like they can only be called from 
nvme_queue_rq().  Just as a test, I changed the flag to GFP_NOWAIT, and 
that seems to be ok.


diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d3f23d6..16ae0d7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -40,6 +40,8 @@
 #define NVME_MAX_KB_SZ  4096
 #define NVME_MAX_SEGS   127
 
+#define PRP_SMALL_SZ    512
+
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
@@ -618,7 +620,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev
         }
 
         nprps = DIV_ROUND_UP(length, page_size);
-        if (nprps <= (256 / 8)) {
+        if (nprps <= (PRP_SMALL_SZ / 8)) {
                 pool = dev->prp_small_pool;
                 iod->npages = 0;
         } else {
@@ -626,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev
                 iod->npages = 1;
         }
 
-        prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+        prp_list = dma_pool_alloc(pool, GFP_NOWAIT, &prp_dma);
         if (!prp_list) {
                 iod->first_dma = dma_addr;
                 iod->npages = -1;
@@ -638,7 +640,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev
         for (;;) {
                 if (i == page_size >> 3) {
                         __le64 *old_prp_list = prp_list;
-                        prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+                        prp_list = dma_pool_alloc(pool, GFP_NOWAIT, &prp_dma);
                         if (!prp_list)
                                 return BLK_STS_RESOURCE;
                         list[iod->npages++] = prp_list;
@@ -713,7 +715,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev
                 return BLK_STS_OK;
         }
 
-        if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+        if (entries <= (PRP_SMALL_SZ / sizeof(struct nvme_sgl_desc))) {
                 pool = dev->prp_small_pool;
                 iod->npages = 0;
         } else {
@@ -721,7 +723,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev
                 iod->npages = 1;
         }
 
-        sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+        sg_list = dma_pool_alloc(pool, GFP_NOWAIT, &sgl_dma);
         if (!sg_list) {
                 iod->npages = -1;
                 return BLK_STS_RESOURCE;
@@ -737,7 +739,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev
                         struct nvme_sgl_desc *old_sg_desc = sg_list;
                         struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
 
-                        sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+                        sg_list = dma_pool_alloc(pool, GFP_NOWAIT, &sgl_dma);
                         if (!sg_list)
                                 return BLK_STS_RESOURCE;
 
@@ -814,7 +816,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, stru
         }
 
         iod->dma_len = 0;
-        iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+        iod->sg = mempool_alloc(dev->iod_mempool, GFP_NOWAIT);
         if (!iod->sg)
                 return BLK_STS_RESOURCE;
         sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
@@ -2475,9 +2477,9 @@ static int nvme_setup_prp_pools(struct nvme_dev *dev)
         if (!dev->prp_page_pool)
                 return -ENOMEM;
 
-        /* Optimisation for I/Os between 4k and 128k */
-        dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
-                                                256, 256, 0);
+        /* Optimisation for I/Os between 4k and 256k */
+        dev->prp_small_pool = dma_pool_create("prp list small", dev->dev,
+                                                PRP_SMALL_SZ, PRP_SMALL_SZ, 0);
         if (!dev->prp_small_pool) {
                 dma_pool_destroy(dev->prp_page_pool);
                 return -ENOMEM;


prp_pool 256
bw=3338MiB/s (3500MB/s) 128k, 10sec
bw=868MiB/s (911MB/s)   144k, 10sec
bw=1607MiB/s (1685MB/s) 256k, 10sec

prp_pool 512
bw=3345MiB/s (3507MB/s) 128k, 10sec
bw=3453MiB/s (3621MB/s) 144k, 10sec
bw=4603MiB/s (4826MB/s) 256k, 10sec
bw=2982MiB/s (3127MB/s) 128k, 5min
bw=1762MiB/s (1847MB/s) 256k, 5min
bw=3579MiB/s (3753MB/s) 256k, 10sec, pvsync2
bw=1752MiB/s (1837MB/s) 256k, 5min,  pvsync2

prp_pool 512 , GFP_NOWAIT
bw=3295MiB/s (3455MB/s) 128k, 10sec
bw=3467MiB/s (3635MB/s) 144k, 10sec
bw=4486MiB/s (4704MB/s) 256k, 10sec
bw=3013MiB/s (3159MB/s) 128k, 5min
bw=1764MiB/s (1849MB/s) 256k, 5min


--bijan


* Re: I/O performance with block sizes > 128k
  2020-03-09 21:49 I/O performance with block sizes > 128k Bijan Mottahedeh
@ 2020-03-09 22:47 ` Keith Busch
  2020-03-09 23:11   ` Bijan Mottahedeh
  2020-03-10 17:08 ` Christoph Hellwig
  1 sibling, 1 reply; 10+ messages in thread
From: Keith Busch @ 2020-03-09 22:47 UTC (permalink / raw)
  To: Bijan Mottahedeh; +Cc: linux-nvme

On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
> I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
> 128k:
> 
> filename=/dev/nvme0n1
> rw=randread
> direct=1
> time_based=1
> randrepeat=1
> gtod_reduce=1
> 
> fio --readonly --ioengine=io_uring --iodepth 1024 --fixedbufs --hipri
> --numjob
> 
> The problem seems to be related to switching from prp_small_pool to
> prp_page_pool; the former is optimized for I/O between 4k and 128k.
> 
> Expanding the small pool size to cover up to 256k increases the
> performance.  I'm not sure however if this is the proper and general fix. 
> For one thing, expanding the pool size bumps the numbers for a short burst
> test (10 sec) but the numbers drop again significantly during a longer
> test.  The behavior is not unique to io_uring either.  Included below are a
> couple of 256k fio pvsync2 tests as well.

I am surprised you're seeing such a drop just from which prp pool is used.

What CPU architecture are you using? Reason I ask: the driver allocates
PAGE_SIZE for the large prp pool, but we really want ctrl->page_size
(always 4k). If your CPU architecture has a PAGE_SIZE larger than 4k,
could you try the following?

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index da392b50f73e..6ed07164d1e7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2458,7 +2458,7 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
 static int nvme_setup_prp_pools(struct nvme_dev *dev)
 {
 	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
-						PAGE_SIZE, PAGE_SIZE, 0);
+						4096, 4096, 0);
 	if (!dev->prp_page_pool)
 		return -ENOMEM;
 
--
 
> A related question, is it required for dma pool allocations to use
> GFP_ATOMIC?  Looks they can only be called from nvme_queue_rq().Just as a
> test, I changed the flag to GFP_NOWAIT, and that seems to be ok.

Yes, the atomic alloc is left over from a time when this happened under
a spinlock. NOWAIT should be fine here.


* Re: I/O performance with block sizes > 128k
  2020-03-09 22:47 ` Keith Busch
@ 2020-03-09 23:11   ` Bijan Mottahedeh
  2020-03-10  3:56     ` Ming Lei
  0 siblings, 1 reply; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-03-09 23:11 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme

On 3/9/2020 3:47 PM, Keith Busch wrote:
> On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
>> I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
>> 128k:
>>
>> filename=/dev/nvme0n1
>> rw=randread
>> direct=1
>> time_based=1
>> randrepeat=1
>> gtod_reduce=1
>>
>> fio --readonly --ioengine=io_uring --iodepth 1024 --fixedbufs --hipri
>> --numjob
>>
>> The problem seems to be related to switching from prp_small_pool to
>> prp_page_pool; the former is optimized for I/O between 4k and 128k.
>>
>> Expanding the small pool size to cover up to 256k increases the
>> performance.  I'm not sure however if this is the proper and general fix.
>> For one thing, expanding the pool size bumps the numbers for a short burst
>> test (10 sec) but the numbers drop again significantly during a longer
>> test.  The behavior is not unique to io_uring either.  Included below are a
>> couple of 256k fio pvsync2 tests as well.
> I am surprised you're seeing such a drop just from the prp pool used.
>
> What CPU architecture are you using? Reason I ask: the driver allocates
> PAGE_SIZE for the large prp pool, but we really want ctrl->page_size
> (always 4k). If your CPU architecture has a PAGE_SIZE larger than 4k,
> could you try the following?

It's an x86_64 vm with 8GB of memory.  Is the 4k pool size meant to 
support up to a 2MB i/o size then?

The main issue seems to be lock contention; this is what I see with the 
256k test after running for a while.  However, I can't pinpoint the 
lock with perf lock; is there a better way to do that?

     65.08%  [kernel]       [k] __pv_queued_spin_lock_slowpath
      2.43%  [kernel]       [k] mutex_spin_on_owner

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              16
On-line CPU(s) list: 0-15
Thread(s) per core:  1
Core(s) per socket:  1
Socket(s):           16
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               63
Model name:          Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz
Stepping:            2
CPU MHz:             2294.876
BogoMIPS:            4589.75
Virtualization:      VT-x
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            4096K
L3 cache:            16384K
NUMA node0 CPU(s):   0-15
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr 
pge mca cr

>
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index da392b50f73e..6ed07164d1e7 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2458,7 +2458,7 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
>   static int nvme_setup_prp_pools(struct nvme_dev *dev)
>   {
>   	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
> -						PAGE_SIZE, PAGE_SIZE, 0);
> +						4096, 4096, 0);
>   	if (!dev->prp_page_pool)
>   		return -ENOMEM;
>   
> --
>   
>> A related question, is it required for dma pool allocations to use
>> GFP_ATOMIC?  Looks they can only be called from nvme_queue_rq().Just as a
>> test, I changed the flag to GFP_NOWAIT, and that seems to be ok.
> Yes, the atomic alloc is left over from a time when this happened under
> a spinlock. NOWAIT should be fine here.



* Re: I/O performance with block sizes > 128k
  2020-03-09 23:11   ` Bijan Mottahedeh
@ 2020-03-10  3:56     ` Ming Lei
  0 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2020-03-10  3:56 UTC (permalink / raw)
  To: Bijan Mottahedeh; +Cc: Keith Busch, linux-nvme

On Tue, Mar 10, 2020 at 7:11 AM Bijan Mottahedeh
<bijan.mottahedeh@oracle.com> wrote:
>
> On 3/9/2020 3:47 PM, Keith Busch wrote:
> > On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
> >> I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
> >> 128k:
> >>
> >> filename=/dev/nvme0n1
> >> rw=randread
> >> direct=1
> >> time_based=1
> >> randrepeat=1
> >> gtod_reduce=1
> >>
> >> fio --readonly --ioengine=io_uring --iodepth 1024 --fixedbufs --hipri
> >> --numjob
> >>
> >> The problem seems to be related to switching from prp_small_pool to
> >> prp_page_pool; the former is optimized for I/O between 4k and 128k.
> >>
> >> Expanding the small pool size to cover up to 256k increases the
> >> performance.  I'm not sure however if this is the proper and general fix.
> >> For one thing, expanding the pool size bumps the numbers for a short burst
> >> test (10 sec) but the numbers drop again significantly during a longer
> >> test.  The behavior is not unique to io_uring either.  Included below are a
> >> couple of 256k fio pvsync2 tests as well.
> > I am surprised you're seeing such a drop just from the prp pool used.
> >
> > What CPU architecture are you using? Reason I ask: the driver allocates
> > PAGE_SIZE for the large prp pool, but we really want ctrl->page_size
> > (always 4k). If your CPU architecture has a PAGE_SIZE larger than 4k,
> > could you try the following?
>
> It's an x86_64 vm with 8GB of memory.  Is the 4k pool size meant to
> support up to a 2MB i/o size then?
>
> The main seems to be a lock contention, this what I see with the 256k
> test after running a while.  However,  I can't pinpoint the lock with
> perf lock; is there a better way to do that?
>
>      65.08%  [kernel]       [k] __pv_queued_spin_lock_slowpath
>       2.43%  [kernel]       [k] mutex_spin_on_owner

You should be able to figure that out with 'perf record -g'.

>
> Architecture:        x86_64
> CPU op-mode(s):      32-bit, 64-bit
> Byte Order:          Little Endian
> CPU(s):              16
> On-line CPU(s) list: 0-15
> Thread(s) per core:  1
> Core(s) per socket:  1
> Socket(s):           16

The processor emulation looks weird: it says you have 16 sockets, and 
each socket has just one CPU core. Maybe this is the real problem.


Thanks,
Ming Lei


* Re: I/O performance with block sizes > 128k
  2020-03-09 21:49 I/O performance with block sizes > 128k Bijan Mottahedeh
  2020-03-09 22:47 ` Keith Busch
@ 2020-03-10 17:08 ` Christoph Hellwig
  2020-03-10 19:44   ` Keith Busch
  1 sibling, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2020-03-10 17:08 UTC (permalink / raw)
  To: Bijan Mottahedeh; +Cc: linux-nvme

On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
> I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
> 128k:

Try using a controller with SGL support.  NVMe PRPs are unfortunately a 
completely brain-dead scheme once you use non-tiny I/O sizes (and 
actually really painful even for those, for other reasons).


* Re: I/O performance with block sizes > 128k
  2020-03-10 17:08 ` Christoph Hellwig
@ 2020-03-10 19:44   ` Keith Busch
  2020-03-10 20:43     ` Chaitanya Kulkarni
  2020-03-11  0:52     ` Bijan Mottahedeh
  0 siblings, 2 replies; 10+ messages in thread
From: Keith Busch @ 2020-03-10 19:44 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Bijan Mottahedeh, linux-nvme

On Tue, Mar 10, 2020 at 10:08:18AM -0700, Christoph Hellwig wrote:
> On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
> > I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
> > 128k:
> 
> Try using a controller with SGL support.  NVMe PRP unfortunately are
> a completely brain dead scheme once you use non-tiny I/O sizes (
> and actually really painful even for those for other reasons).

SGLs would really help if you have physical contiguity, since then the
driver never needs to allocate from the dma pools. If most pages in the
transfer are not physically contiguous, though, PRP is still more memory
efficient. But yes, the PRP format is ... unique. :)

FWIW, I couldn't measure a performance drop on real hardware comparing
the large pool vs the small one.


* Re: I/O performance with block sizes > 128k
  2020-03-10 19:44   ` Keith Busch
@ 2020-03-10 20:43     ` Chaitanya Kulkarni
  2020-03-11  0:52     ` Bijan Mottahedeh
  1 sibling, 0 replies; 10+ messages in thread
From: Chaitanya Kulkarni @ 2020-03-10 20:43 UTC (permalink / raw)
  To: Keith Busch, Christoph Hellwig; +Cc: Bijan Mottahedeh, linux-nvme

On 03/10/2020 12:44 PM, Keith Busch wrote:
>> >Try using a controller with SGL support.  NVMe PRP unfortunately are
>> >a completely brain dead scheme once you use non-tiny I/O sizes (
>> >and actually really painful even for those for other reasons).
> SGL would really help if you have phyiscal continuity, and never needs
> to allocate from the dma pools. If most pages in the transfer are not
> physically contiguous, though, PRP is still more memory efficient. But
> yes, the PRP format is ... unique.:)
>

When we added SGL support, I remember seeing a ~5% increase on a 
controller that supported SGLs with fio block sizes > 32K; there was 
also a performance drop when SGLs were used for block sizes < 32k
(i.e. 4k, 8k, 16k).

Maybe it is worth taking numbers on the controller with varying block 
sizes, PRPs vs SGLs (4k, 8k, 16k, 32k, 64k), using just random reads to 
avoid any GC effect?
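
A sweep along those lines could look something like the job file below 
(just a sketch with made-up job names, reusing the options from the 
original report).  If I remember correctly, the nvme module's 
sgl_threshold parameter can be used to steer this: 0 disables SGLs 
entirely (PRPs only), while a low value makes the driver prefer SGLs at 
smaller block sizes.

[global]
filename=/dev/nvme0n1
ioengine=io_uring
rw=randread
direct=1
time_based=1
runtime=60
randrepeat=1
gtod_reduce=1
iodepth=32
hipri
stonewall

[bs-4k]
bs=4k

[bs-8k]
bs=8k

[bs-16k]
bs=16k

[bs-32k]
bs=32k

[bs-64k]
bs=64k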

> FWIW, I couldn't measure a performance drop on real hardware comparing
> the large pool vs the small one.



* Re: I/O performance with block sizes > 128k
  2020-03-10 19:44   ` Keith Busch
  2020-03-10 20:43     ` Chaitanya Kulkarni
@ 2020-03-11  0:52     ` Bijan Mottahedeh
  2020-03-11  1:27       ` Keith Busch
  1 sibling, 1 reply; 10+ messages in thread
From: Bijan Mottahedeh @ 2020-03-11  0:52 UTC (permalink / raw)
  To: Keith Busch, Christoph Hellwig; +Cc: Jens Axboe, linux-nvme

On 3/10/2020 12:44 PM, Keith Busch wrote:
> On Tue, Mar 10, 2020 at 10:08:18AM -0700, Christoph Hellwig wrote:
>> On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
>>> I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
>>> 128k:
>> Try using a controller with SGL support.  NVMe PRP unfortunately are
>> a completely brain dead scheme once you use non-tiny I/O sizes (
>> and actually really painful even for those for other reasons).
> SGL would really help if you have phyiscal continuity, and never needs
> to allocate from the dma pools. If most pages in the transfer are not
> physically contiguous, though, PRP is still more memory efficient. But
> yes, the PRP format is ... unique. :)
So with SGLs you can potentially bypass dma pool allocation, but with 
PRPs you always have to allocate, regardless of contiguity?

>
> FWIW, I couldn't measure a performance drop on real hardware comparing
> the large pool vs the small one.

I took out my workaround.

I can't see a big drop on h/w with 256k either; however, I can't push 
the tests far enough because they hang depending on the number of fio 
jobs and iodepth.

They seem to be stuck in an io_uring poll loop:

     58.91%  [kernel]          [k] blk_poll
     27.01%  [kernel]          [k] io_iopoll_getevents
      8.56%  [kernel]          [k] blkdev_iopoll

I had to kill the fio jobs and got a bunch of this output:

Jobs: 103 (f=103): 
[r(4),_(1),r(12),_(1),r(2),_(1),r(10),_(1),r(2),_(1),r(1),_(1),r(7),_(1),r(1),_(2),r(16),_(1),r(5),_(1),r(6),_(2),r(2),_(1),r(2),_(1),r(2),_(1),r(3),_(1),r(9),_(1),r(4),_(2),r(5),_(1),r(3),_(1),r(1),_(1),r(4),_(2),r(2)][0fio: 
job 'fiotest' (state=5) hasn't exited in 300 seconds, it appears to be 
stuck. Doing forceful exit of this job.

Hopefully Jens can comment what this means.

--bijan


* Re: I/O performance with block sizes > 128k
  2020-03-11  0:52     ` Bijan Mottahedeh
@ 2020-03-11  1:27       ` Keith Busch
  2020-03-11  1:29         ` Keith Busch
  0 siblings, 1 reply; 10+ messages in thread
From: Keith Busch @ 2020-03-11  1:27 UTC (permalink / raw)
  To: Bijan Mottahedeh; +Cc: Christoph Hellwig, Jens Axboe, linux-nvme

On Tue, Mar 10, 2020 at 05:52:02PM -0700, Bijan Mottahedeh wrote:
> On 3/10/2020 12:44 PM, Keith Busch wrote:
> > On Tue, Mar 10, 2020 at 10:08:18AM -0700, Christoph Hellwig wrote:
> > > On Mon, Mar 09, 2020 at 02:49:10PM -0700, Bijan Mottahedeh wrote:
> > > > I'm seeing a sizeable drop in perf with polled fio tests for block sizes >
> > > > 128k:
> > > Try using a controller with SGL support.  NVMe PRP unfortunately are
> > > a completely brain dead scheme once you use non-tiny I/O sizes (
> > > and actually really painful even for those for other reasons).
> > SGL would really help if you have phyiscal continuity, and never needs
> > to allocate from the dma pools. If most pages in the transfer are not
> > physically contiguous, though, PRP is still more memory efficient. But
> > yes, the PRP format is ... unique. :)
>
> So with SGL you can potentially bypass dma pool allocation but with PRP you
> always have to regardless of contiguity?

Yes, if the payload can be expressed as a single address range, an
SGL-capable controller can describe that in the NVMe command itself,
without allocating a scatter list payload to accompany the command.

A single PRP entry covers at most 4k of data (the controller page
size). If you're transferring 256k per command, that will require at
least 64 PRP entries regardless of the number of ranges in the
DMA-mapped scatter-list.
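
For a rough sense of the descriptor overhead involved, here is a small 
back-of-the-envelope sketch (illustrative only, assuming a 4k controller 
page size and 16-byte SGL descriptors):

/*
 * Descriptor overhead for one 256k transfer: PRP entries vs. SGL
 * descriptors, for the contiguous and fully discontiguous cases.
 */
#include <stdio.h>

int main(void)
{
        unsigned int xfer = 256u << 10;
        unsigned int page = 4096u;
        unsigned int prps = xfer / page;        /* 64 entries, 8 bytes each */
        unsigned int sgl_desc_sz = 16u;         /* sizeof(struct nvme_sgl_desc) */

        printf("PRP: %u entries = %u bytes of PRP lists\n", prps, prps * 8);
        printf("SGL, one contiguous range: 1 descriptor inline in the command, "
               "no pool allocation\n");
        printf("SGL, %u discontiguous 4k pages: %u bytes of descriptors\n",
               prps, prps * sgl_desc_sz);
        return 0;
}

So a fully contiguous buffer is where SGLs win outright, while a badly 
fragmented one costs roughly twice the descriptor bytes of PRPs.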

> > FWIW, I couldn't measure a performance drop on real hardware comparing
> > the large pool vs the small one.
> 
> I took out my workaround.
> 
> I can't see a big drop on h/w with 256k either; however, I can't push the
> tests far enough because they hang depending on the number of fio jobs and
> iodepth.
> 
> They seem to be stuck in an io_uring poll loop:
> 
>     58.91%  [kernel]          [k] blk_poll
>     27.01%  [kernel]          [k] io_iopoll_getevents
>      8.56%  [kernel]          [k] blkdev_iopoll

That looks pretty normal for polled io.
 
> I had to kill the fio jobs and got a bunch of this output:
> 
> Jobs: 103 (f=103): [r(4),_(1),r(12),_(1),r(2),_(1),r(10),_(1),r(2),_(1),r(1),_(1),r(7),_(1),r(1),_(2),r(16),_(1),r(5),_(1),r(6),_(2),r(2),_(1),r(2),_(1),r(2),_(1),r(3),_(1),r(9),_(1),r(4),_(2),r(5),_(1),r(3),_(1),r(1),_(1),r(4),_(2),r(2)][0fio:
> job 'fiotest' (state=5) hasn't exited in 300 seconds, it appears to be
> stuck. Doing forceful exit of this job.
> 
> Hopefully Jens can comment what this means.

What kernel are you using? A bug observed with multiple threads polling
the same queue was just fixed in the 5.6-rc5 release (also in 5.5.8,
5.4.24 stables).


* Re: I/O performance with block sizes > 128k
  2020-03-11  1:27       ` Keith Busch
@ 2020-03-11  1:29         ` Keith Busch
  0 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2020-03-11  1:29 UTC (permalink / raw)
  To: Bijan Mottahedeh; +Cc: Christoph Hellwig, Jens Axboe, linux-nvme

On Tue, Mar 10, 2020 at 06:27:01PM -0700, Keith Busch wrote:
> On Tue, Mar 10, 2020 at 05:52:02PM -0700, Bijan Mottahedeh wrote:
> > I had to kill the fio jobs and got a bunch of this output:
> > 
> > Jobs: 103 (f=103): [r(4),_(1),r(12),_(1),r(2),_(1),r(10),_(1),r(2),_(1),r(1),_(1),r(7),_(1),r(1),_(2),r(16),_(1),r(5),_(1),r(6),_(2),r(2),_(1),r(2),_(1),r(2),_(1),r(3),_(1),r(9),_(1),r(4),_(2),r(5),_(1),r(3),_(1),r(1),_(1),r(4),_(2),r(2)][0fio:
> > job 'fiotest' (state=5) hasn't exited in 300 seconds, it appears to be
> > stuck. Doing forceful exit of this job.
> > 
> > Hopefully Jens can comment what this means.
> 
> What kernel are you using? A bug observed with multiple threads polling
> the same queue was just fixed in the 5.6-rc5 release


Oops, I suppose you already know that ... you fixed it! :)

