* Re: [PATCH] NVMe: nvme_queue made cache friendly.
  2015-05-20 18:01 ` Parav Pandit
@ 2015-05-20 13:20   ` Matthew Wilcox
  0 siblings, 0 replies; 6+ messages in thread
From: Matthew Wilcox @ 2015-05-20 13:20 UTC
  To: Parav Pandit; +Cc: linux-nvme, axboe, linux-kernel

On Wed, May 20, 2015 at 02:01:03PM -0400, Parav Pandit wrote:
> The nvme_queue structure is made 64B cache friendly so that the
> majority of the data elements used in the IO submission and completion
> paths fall within a single 64B cache line; previously they spanned
> more than one cache line.

Have you done any performance measurements on this?  I find it hard to
believe that moving q_lock to the second 64B cache line results in a
performance improvement.  Seems to me it would result in a performance
loss, since you have to grab the lock before operating on the queue,
and cache line prefetching tends to prefetch the _next_ line, not the
_previous_ line.
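
For concreteness, the pattern being described -- take q_lock, then
operate on the queue -- looks roughly like the sketch below. This is a
simplified illustration, not the exact driver code; the function name is
hypothetical and completion handling and error paths are omitted.

	static void example_submit(struct nvme_queue *nvmeq,
				   struct nvme_command *cmd)
	{
		spin_lock_irq(&nvmeq->q_lock);		/* the lock is touched first */
		memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
		if (++nvmeq->sq_tail == nvmeq->q_depth)	/* then the queue indices */
			nvmeq->sq_tail = 0;
		writel(nvmeq->sq_tail, nvmeq->q_db);	/* and the doorbell register */
		spin_unlock_irq(&nvmeq->q_lock);
	}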

> @@ -98,23 +98,23 @@ struct async_cmd_info {
>  struct nvme_queue {
>  	struct device *q_dmadev;
>  	struct nvme_dev *dev;
> -	char irqname[24];	/* nvme4294967295-65535\0 */
> -	spinlock_t q_lock;
>  	struct nvme_command *sq_cmds;
> +	struct blk_mq_hw_ctx *hctx;
>  	volatile struct nvme_completion *cqes;
> -	dma_addr_t sq_dma_addr;
> -	dma_addr_t cq_dma_addr;
>  	u32 __iomem *q_db;
>  	u16 q_depth;
> -	s16 cq_vector;
>  	u16 sq_head;
>  	u16 sq_tail;
>  	u16 cq_head;
>  	u16 qid;
> +	s16 cq_vector;
>  	u8 cq_phase;
>  	u8 cqe_seen;
> +	spinlock_t q_lock;
>  	struct async_cmd_info cmdinfo;
> -	struct blk_mq_hw_ctx *hctx;
> +	char irqname[24];	/* nvme4294967295-65535\0 */
> +	dma_addr_t sq_dma_addr;
> +	dma_addr_t cq_dma_addr;
>  };
>  
>  /*
> -- 
> 1.8.3.1

* Re: [PATCH] NVMe: nvme_queue made cache friendly.
  2015-05-20 13:20   ` Matthew Wilcox
@ 2015-05-20 13:34     ` Parav Pandit
  0 siblings, 0 replies; 6+ messages in thread
From: Parav Pandit @ 2015-05-20 13:34 UTC
  To: Matthew Wilcox; +Cc: linux-nvme, axboe, linux-kernel

On Wed, May 20, 2015 at 6:50 PM, Matthew Wilcox <willy@linux.intel.com> wrote:
> On Wed, May 20, 2015 at 02:01:03PM -0400, Parav Pandit wrote:
>> The nvme_queue structure is made 64B cache friendly so that the
>> majority of the data elements used in the IO submission and completion
>> paths fall within a single 64B cache line; previously they spanned
>> more than one cache line.
>
> Have you done any performance measurements on this?

I have not done performance testing yet.

>  I find it hard to
> believe that moving q_lock to the second 64B cache line results in a
> performance improvement.

The newly arranged structure, including q_lock, actually fits completely
within the first 64 bytes. Did I miss anything in the calculation?
q_lock appears to be taken at the end of the IO processing, which means
that by then sq_cmds, hctx, and the other fields in the same line have
already been accessed.
Maybe I should move it after q_db instead of leaving it as the last
element?
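
One way to double-check the arithmetic is to mirror the proposed layout
in a small standalone program and print the field offsets. This is only
a sketch with stand-in types -- in particular sizeof(spinlock_t) depends
on the kernel configuration (lock debugging and the like), so the real
layout still has to be confirmed against an actual build.

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Stand-ins for the kernel types, assuming a 64-bit build and a
	 * non-debug 4-byte spinlock_t; lock debugging makes the lock larger
	 * and changes the answer.
	 */
	typedef struct { uint32_t raw; } spinlock_stub_t;

	/* Fields up to q_lock, in the order proposed by the patch. */
	struct nvme_queue_layout {
		void *q_dmadev;
		void *dev;
		void *sq_cmds;
		void *hctx;
		void *cqes;
		void *q_db;
		uint16_t q_depth;
		uint16_t sq_head;
		uint16_t sq_tail;
		uint16_t cq_head;
		uint16_t qid;
		int16_t cq_vector;
		uint8_t cq_phase;
		uint8_t cqe_seen;
		spinlock_stub_t q_lock;
	};

	int main(void)
	{
		printf("cqe_seen at byte %zu, q_lock at bytes %zu..%zu\n",
		       offsetof(struct nvme_queue_layout, cqe_seen),
		       offsetof(struct nvme_queue_layout, q_lock),
		       offsetof(struct nvme_queue_layout, q_lock) +
		       sizeof(spinlock_stub_t) - 1);
		return 0;
	}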


> Seems to me it would result in a performance
> loss, since you have to grab the lock before operating on the queue,
> and cache line prefetching tends to prefetch the _next_ line, not the
> _previous_ line.
>
>> @@ -98,23 +98,23 @@ struct async_cmd_info {
>>  struct nvme_queue {
>>       struct device *q_dmadev;
>>       struct nvme_dev *dev;
>> -     char irqname[24];       /* nvme4294967295-65535\0 */
>> -     spinlock_t q_lock;
>>       struct nvme_command *sq_cmds;
>> +     struct blk_mq_hw_ctx *hctx;
>>       volatile struct nvme_completion *cqes;
>> -     dma_addr_t sq_dma_addr;
>> -     dma_addr_t cq_dma_addr;
>>       u32 __iomem *q_db;
>>       u16 q_depth;
>> -     s16 cq_vector;
>>       u16 sq_head;
>>       u16 sq_tail;
>>       u16 cq_head;
>>       u16 qid;
>> +     s16 cq_vector;
>>       u8 cq_phase;
>>       u8 cqe_seen;
>> +     spinlock_t q_lock;
>>       struct async_cmd_info cmdinfo;
>> -     struct blk_mq_hw_ctx *hctx;
>> +     char irqname[24];       /* nvme4294967295-65535\0 */
>> +     dma_addr_t sq_dma_addr;
>> +     dma_addr_t cq_dma_addr;
>>  };
>>
>>  /*
>> --
>> 1.8.3.1

* [PATCH] NVMe: nvme_queue made cache friendly.
@ 2015-05-20 18:01 ` Parav Pandit
  0 siblings, 0 replies; 6+ messages in thread
From: Parav Pandit @ 2015-05-20 18:01 UTC
  To: linux-nvme, willy; +Cc: parav.pandit, axboe, linux-kernel

The nvme_queue structure is made 64B cache friendly so that the
majority of the data elements used in the IO submission and completion
paths fall within a single 64B cache line; previously they spanned
more than one cache line.

With this reordering, most of the frequently accessed fields are placed
at the start of the structure, and the elements which are not used in
the frequent IO path are moved to the end of the structure.

Signed-off-by: Parav Pandit <parav.pandit@avagotech.com>
---
 drivers/block/nvme-core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index b9ba36f..1585d7d 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -98,23 +98,23 @@ struct async_cmd_info {
 struct nvme_queue {
 	struct device *q_dmadev;
 	struct nvme_dev *dev;
-	char irqname[24];	/* nvme4294967295-65535\0 */
-	spinlock_t q_lock;
 	struct nvme_command *sq_cmds;
+	struct blk_mq_hw_ctx *hctx;
 	volatile struct nvme_completion *cqes;
-	dma_addr_t sq_dma_addr;
-	dma_addr_t cq_dma_addr;
 	u32 __iomem *q_db;
 	u16 q_depth;
-	s16 cq_vector;
 	u16 sq_head;
 	u16 sq_tail;
 	u16 cq_head;
 	u16 qid;
+	s16 cq_vector;
 	u8 cq_phase;
 	u8 cqe_seen;
+	spinlock_t q_lock;
 	struct async_cmd_info cmdinfo;
-	struct blk_mq_hw_ctx *hctx;
+	char irqname[24];	/* nvme4294967295-65535\0 */
+	dma_addr_t sq_dma_addr;
+	dma_addr_t cq_dma_addr;
 };
 
 /*
-- 
1.8.3.1
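
For reference, the resulting layout can be inspected with pahole on an
object built with debug info (for example, pahole -C nvme_queue
drivers/block/nvme-core.o). A compile-time guard along the following
lines -- hypothetical, not part of this patch -- could also document the
intent that the hot fields stay within the first cache line:

	/*
	 * Hypothetical layout guard, e.g. dropped into nvme_alloc_queue():
	 * fail the build if the fields intended to be hot no longer fit in
	 * the first 64B line. Whether q_lock itself fits depends on
	 * sizeof(spinlock_t) for the configuration in use.
	 */
	BUILD_BUG_ON(offsetof(struct nvme_queue, q_db) >= 64);
	BUILD_BUG_ON(offsetof(struct nvme_queue, cqe_seen) >= 64);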

