linux-nvme.lists.infradead.org archive mirror
* [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
@ 2020-04-10  9:57 Weiping Zhang
  2020-04-12 12:38 ` Max Gurtovoy
  0 siblings, 1 reply; 10+ messages in thread
From: Weiping Zhang @ 2020-04-10  9:57 UTC (permalink / raw)
  To: hch, axboe, kbusch, sagi; +Cc: linux-nvme

Since commit 147b27e4bd0 ("nvme-pci: allocate device queues storage space
at probe"), nvme_alloc_queue() no longer allocates struct nvme_queue itself.
If the user raises write_queues/poll_queues above the number of queues
allocated in nvme_probe(), nvme_alloc_queue() will touch memory beyond the
end of that allocation.

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
---
 drivers/nvme/host/pci.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4e79e412b276..cc10258e578e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -89,6 +89,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
  */
 struct nvme_dev {
 	struct nvme_queue *queues;
+	int nr_allocated_queue;
 	struct blk_mq_tag_set tagset;
 	struct blk_mq_tag_set admin_tagset;
 	u32 __iomem *dbs;
@@ -2074,6 +2075,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	unsigned long size;
 
 	nr_io_queues = max_io_queues();
+	if (nr_io_queues > dev->nr_allocated_queue - 1)
+		nr_io_queues = dev->nr_allocated_queue - 1;
 
 	/*
 	 * If tags are shared with admin queue (Apple bug), then
@@ -2742,7 +2745,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
 
 static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-	int node, result = -ENOMEM;
+	int node, nr_queue, result = -ENOMEM;
 	struct nvme_dev *dev;
 	unsigned long quirks = id->driver_data;
 	size_t alloc_size;
@@ -2755,11 +2758,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (!dev)
 		return -ENOMEM;
 
-	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
+	nr_queue = max_queue_count();
+	dev->queues = kcalloc_node(nr_queue, sizeof(struct nvme_queue),
 					GFP_KERNEL, node);
 	if (!dev->queues)
 		goto free;
 
+	dev->nr_allocated_queue = nr_queue;
+
 	dev->dev = get_device(&pdev->dev);
 	pci_set_drvdata(pdev, dev);
 
-- 
2.18.1
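For context, a stand-alone user-space sketch of the failure mode the clamp in
the first hunk guards against; the types and numbers below are simplified
mock-ups for illustration, not the driver code itself:

#include <stdio.h>
#include <stdlib.h>

struct nvme_queue { int qid; };

struct nvme_dev {
	struct nvme_queue *queues;
	int nr_allocated_queue;		/* what this patch records */
};

int main(void)
{
	struct nvme_dev dev;
	int qid;

	dev.nr_allocated_queue = 1 + 4;		/* sized once at "probe" */
	dev.queues = calloc(dev.nr_allocated_queue, sizeof(*dev.queues));
	if (!dev.queues)
		return 1;

	/* Raising write_queues/poll_queues afterwards would make
	 * nvme_alloc_queue() index past the array sized above: */
	qid = 7;
	if (qid > dev.nr_allocated_queue - 1)
		printf("qid %d exceeds the %d entries allocated at probe\n",
		       qid, dev.nr_allocated_queue);

	free(dev.queues);
	return 0;
}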


_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-10  9:57 [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe Weiping Zhang
@ 2020-04-12 12:38 ` Max Gurtovoy
  2020-04-13  1:01   ` Weiping Zhang
  0 siblings, 1 reply; 10+ messages in thread
From: Max Gurtovoy @ 2020-04-12 12:38 UTC (permalink / raw)
  To: Weiping Zhang, hch, axboe, kbusch, sagi; +Cc: linux-nvme

hi,

how about the following minor update:

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4e79e41..46ab28b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -89,6 +89,7 @@
   */
  struct nvme_dev {
         struct nvme_queue *queues;
+       int nr_allocated_queue;
         struct blk_mq_tag_set tagset;
         struct blk_mq_tag_set admin_tagset;
         u32 __iomem *dbs;
@@ -209,15 +210,15 @@ struct nvme_iod {
         struct scatterlist *sg;
  };

-static unsigned int max_io_queues(void)
+static unsigned int nr_dev_io_queues(struct nvme_dev *dev)
  {
-       return num_possible_cpus() + write_queues + poll_queues;
+       return dev->nr_allocated_queue - 1;
  }

  static unsigned int max_queue_count(void)
  {
         /* IO queues + admin queue */
-       return 1 + max_io_queues();
+       return 1 + num_possible_cpus() + write_queues + poll_queues;
  }

  static inline unsigned int nvme_dbbuf_size(u32 stride)
@@ -2073,7 +2074,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
         int result, nr_io_queues;
         unsigned long size;

-       nr_io_queues = max_io_queues();
+       nr_io_queues = nr_dev_io_queues(dev);

         /*
          * If tags are shared with admin queue (Apple bug), then
@@ -2742,7 +2743,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)

  static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
  {
-       int node, result = -ENOMEM;
+       int node, nr_queues, result = -ENOMEM;
         struct nvme_dev *dev;
         unsigned long quirks = id->driver_data;
         size_t alloc_size;
@@ -2755,11 +2756,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
         if (!dev)
                 return -ENOMEM;

-       dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
-                                       GFP_KERNEL, node);
+       nr_queues = max_queue_count();
+       dev->queues = kcalloc_node(nr_queues, sizeof(struct nvme_queue),
+                                  GFP_KERNEL, node);
         if (!dev->queues)
                 goto free;

+       dev->nr_allocated_queue = nr_queues;
+
         dev->dev = get_device(&pdev->dev);
         pci_set_drvdata(pdev, dev);


-Max

On 4/10/2020 12:57 PM, Weiping Zhang wrote:
> Since the commit 147b27e4bd0 "nvme-pci: allocate device queues storage space at probe"
> nvme_alloc_queue will not alloc struct nvme_queue any more.
> If user change write/poll_queues to larger than the number of
> allocated queue in nvme_probe, nvme_alloc_queue will touch
> the memory out of boundary.
>
> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
> ---
>   drivers/nvme/host/pci.c | 10 ++++++++--
>   1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4e79e412b276..cc10258e578e 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -89,6 +89,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
>    */
>   struct nvme_dev {
>   	struct nvme_queue *queues;
> +	int nr_allocated_queue;
>   	struct blk_mq_tag_set tagset;
>   	struct blk_mq_tag_set admin_tagset;
>   	u32 __iomem *dbs;
> @@ -2074,6 +2075,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>   	unsigned long size;
>   
>   	nr_io_queues = max_io_queues();
> +	if (nr_io_queues > dev->nr_allocated_queue - 1)
> +		nr_io_queues = dev->nr_allocated_queue - 1;
>   
>   	/*
>   	 * If tags are shared with admin queue (Apple bug), then
> @@ -2742,7 +2745,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
>   
>   static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   {
> -	int node, result = -ENOMEM;
> +	int node, nr_queue, result = -ENOMEM;
>   	struct nvme_dev *dev;
>   	unsigned long quirks = id->driver_data;
>   	size_t alloc_size;
> @@ -2755,11 +2758,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>   	if (!dev)
>   		return -ENOMEM;
>   
> -	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
> +	nr_queue = max_queue_count();
> +	dev->queues = kcalloc_node(nr_queue, sizeof(struct nvme_queue),
>   					GFP_KERNEL, node);
>   	if (!dev->queues)
>   		goto free;
>   
> +	dev->nr_allocated_queue = nr_queue;
> +
>   	dev->dev = get_device(&pdev->dev);
>   	pci_set_drvdata(pdev, dev);
>   

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-12 12:38 ` Max Gurtovoy
@ 2020-04-13  1:01   ` Weiping Zhang
  2020-04-13  9:37     ` Max Gurtovoy
  0 siblings, 1 reply; 10+ messages in thread
From: Weiping Zhang @ 2020-04-13  1:01 UTC (permalink / raw)
  To: Max Gurtovoy
  Cc: Jens Axboe, sagi, Weiping Zhang, linux-nvme, Christoph Hellwig,
	Keith Busch

On Sun, Apr 12, 2020 at 8:38 PM Max Gurtovoy <maxg@mellanox.com> wrote:
>
Hi Max,

> hi,
>
> how about the following minor update:
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 4e79e41..46ab28b 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -89,6 +89,7 @@
>    */
>   struct nvme_dev {
>          struct nvme_queue *queues;
> +       int nr_allocated_queue;
>          struct blk_mq_tag_set tagset;
>          struct blk_mq_tag_set admin_tagset;
>          u32 __iomem *dbs;
> @@ -209,15 +210,15 @@ struct nvme_iod {
>          struct scatterlist *sg;
>   };
>
> -static unsigned int max_io_queues(void)
> +static unsigned int nr_dev_io_queues(struct nvme_dev *dev)
>   {
> -       return num_possible_cpus() + write_queues + poll_queues;
> +       return dev->nr_allocated_queue - 1;
>   }
>
>   static unsigned int max_queue_count(void)
>   {
>          /* IO queues + admin queue */
> -       return 1 + max_io_queues();
> +       return 1 + num_possible_cpus() + write_queues + poll_queues;
>   }
>
>   static inline unsigned int nvme_dbbuf_size(u32 stride)
> @@ -2073,7 +2074,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>          int result, nr_io_queues;
>          unsigned long size;
>
> -       nr_io_queues = max_io_queues();
> +       nr_io_queues = nr_dev_io_queues(dev);
>
This may cause a problem when the user decreases the queue count for one of
the tagset maps. For example, with 96 CPUs in the system and 128 IO queues in
total:

insmod nvme write_queues=32

nvme_probe will allocate 129 queues (128 IO + 1 admin), so nr_allocated_queue = 129.
If the user then decreases the queue count:

echo 2 > /sys/module/nvme/parameters/write_queues
echo 1 > /sys/block/nvme0n1/device/reset_controller

nvme_setup_io_queues should use 96 (num_possible_cpus) + 2 (write_queues) = 98
IO queues, not the 128 (nr_allocated_queue - 1) that nr_dev_io_queues() returns.
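A minimal stand-alone sketch of the arithmetic in this example (the numbers
come from the example above, not from a real system):

#include <stdio.h>

int main(void)
{
	unsigned int nr_cpus = 96;		/* num_possible_cpus() */
	unsigned int write_queues = 32, poll_queues = 0;
	unsigned int nr_allocated_queue = 1 + nr_cpus + write_queues + poll_queues;

	printf("allocated at probe: %u\n", nr_allocated_queue);	/* 129 */

	/* after "echo 2 > .../write_queues" and a controller reset: */
	write_queues = 2;
	printf("IO queues wanted:   %u\n",
	       nr_cpus + write_queues + poll_queues);		/* 98, not 128 */

	return 0;
}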

>          /*
>           * If tags are shared with admin queue (Apple bug), then
> @@ -2742,7 +2743,7 @@ static void nvme_async_probe(void *data,
> async_cookie_t cookie)
>
>   static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id
> *id)
>   {
> -       int node, result = -ENOMEM;
> +       int node, nr_queues, result = -ENOMEM;
>          struct nvme_dev *dev;
>          unsigned long quirks = id->driver_data;
>          size_t alloc_size;
> @@ -2755,11 +2756,14 @@ static int nvme_probe(struct pci_dev *pdev,
> const struct pci_device_id *id)
>          if (!dev)
>                  return -ENOMEM;
>
> -       dev->queues = kcalloc_node(max_queue_count(), sizeof(struct
> nvme_queue),
> -                                       GFP_KERNEL, node);
> +       nr_queues =  max_queue_count();
> +       dev->queues = kcalloc_node(nr_queues, sizeof(struct nvme_queue),
> +                                  GFP_KERNEL, node);
>          if (!dev->queues)
>                  goto free;
>
> +       dev->nr_allocated_queue = nr_queues;
> +
>          dev->dev = get_device(&pdev->dev);
>          pci_set_drvdata(pdev, dev);
>
>
> -Max
>
> On 4/10/2020 12:57 PM, Weiping Zhang wrote:
> > Since the commit 147b27e4bd0 "nvme-pci: allocate device queues storage space at probe"
> > nvme_alloc_queue will not alloc struct nvme_queue any more.
> > If user change write/poll_queues to larger than the number of
> > allocated queue in nvme_probe, nvme_alloc_queue will touch
> > the memory out of boundary.
> >
> > Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
> > ---
> >   drivers/nvme/host/pci.c | 10 ++++++++--
> >   1 file changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index 4e79e412b276..cc10258e578e 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -89,6 +89,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
> >    */
> >   struct nvme_dev {
> >       struct nvme_queue *queues;
> > +     int nr_allocated_queue;
> >       struct blk_mq_tag_set tagset;
> >       struct blk_mq_tag_set admin_tagset;
> >       u32 __iomem *dbs;
> > @@ -2074,6 +2075,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
> >       unsigned long size;
> >
> >       nr_io_queues = max_io_queues();
> > +     if (nr_io_queues > dev->nr_allocated_queue - 1)
> > +             nr_io_queues = dev->nr_allocated_queue - 1;
> >
> >       /*
> >        * If tags are shared with admin queue (Apple bug), then
> > @@ -2742,7 +2745,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
> >
> >   static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >   {
> > -     int node, result = -ENOMEM;
> > +     int node, nr_queue, result = -ENOMEM;
> >       struct nvme_dev *dev;
> >       unsigned long quirks = id->driver_data;
> >       size_t alloc_size;
> > @@ -2755,11 +2758,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> >       if (!dev)
> >               return -ENOMEM;
> >
> > -     dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
> > +     nr_queue = max_queue_count();
> > +     dev->queues = kcalloc_node(nr_queue, sizeof(struct nvme_queue),
> >                                       GFP_KERNEL, node);
> >       if (!dev->queues)
> >               goto free;
> >
> > +     dev->nr_allocated_queue = nr_queue;
> > +
> >       dev->dev = get_device(&pdev->dev);
> >       pci_set_drvdata(pdev, dev);
> >
>
> _______________________________________________
> linux-nvme mailing list
> linux-nvme@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme

Thanks

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-13  1:01   ` Weiping Zhang
@ 2020-04-13  9:37     ` Max Gurtovoy
  2020-04-13 12:00       ` Weiping Zhang
  0 siblings, 1 reply; 10+ messages in thread
From: Max Gurtovoy @ 2020-04-13  9:37 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: Jens Axboe, sagi, Weiping Zhang, linux-nvme, Christoph Hellwig,
	Keith Busch


On 4/13/2020 4:01 AM, Weiping Zhang wrote:
> On Sun, Apr 12, 2020 at 8:38 PM Max Gurtovoy <maxg@mellanox.com> wrote:
> Hi Max,
>
>> hi,
>>
>> how about the following minor update:
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4e79e41..46ab28b 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -89,6 +89,7 @@
>>     */
>>    struct nvme_dev {
>>           struct nvme_queue *queues;
>> +       int nr_allocated_queue;
>>           struct blk_mq_tag_set tagset;
>>           struct blk_mq_tag_set admin_tagset;
>>           u32 __iomem *dbs;
>> @@ -209,15 +210,15 @@ struct nvme_iod {
>>           struct scatterlist *sg;
>>    };
>>
>> -static unsigned int max_io_queues(void)
>> +static unsigned int nr_dev_io_queues(struct nvme_dev *dev)
>>    {
>> -       return num_possible_cpus() + write_queues + poll_queues;
>> +       return dev->nr_allocated_queue - 1;
>>    }
>>
>>    static unsigned int max_queue_count(void)
>>    {
>>           /* IO queues + admin queue */
>> -       return 1 + max_io_queues();
>> +       return 1 + num_possible_cpus() + write_queues + poll_queues;
>>    }
>>
>>    static inline unsigned int nvme_dbbuf_size(u32 stride)
>> @@ -2073,7 +2074,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>>           int result, nr_io_queues;
>>           unsigned long size;
>>
>> -       nr_io_queues = max_io_queues();
>> +       nr_io_queues = nr_dev_io_queues(dev);
>>
> It may have some problem when user decrease queue count for multiple tagset map.
> For example, there are total 128 IO and 96 cpus(system),
> insmod nvme write_queues=32
> nvme_probe will allocate 129(128io + 1 admin), nr_allocated_queue=129;
> and then user decrease queue count
> echo 2 > /sys/module/nvme/parameters/write_queues
> echo 1 > /sys/block/nvme0n1/device/reset_controller.
> nvme_setup_io_queues should use
> 96(num_possible_cpus) + 2(write_queues) instead of 129(nr_allocated_queue).

Any change you try to make (increase/decrease) will not take effect.

If you want it to take effect, you need to make nvme_probe run again.

I don't see the value in only keeping the code from crashing without really
affecting the queue count.

write_queues and poll_queues shouldn't be writable IMO.

Since nvme_dbbuf_dma_alloc/nvme_dbbuf_dma_free also call max_queue_count(),
which uses the writable module params, we can save these values locally or
make them read-only params.


>>           /*
>>            * If tags are shared with admin queue (Apple bug), then
>> @@ -2742,7 +2743,7 @@ static void nvme_async_probe(void *data,
>> async_cookie_t cookie)
>>
>>    static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id
>> *id)
>>    {
>> -       int node, result = -ENOMEM;
>> +       int node, nr_queues, result = -ENOMEM;
>>           struct nvme_dev *dev;
>>           unsigned long quirks = id->driver_data;
>>           size_t alloc_size;
>> @@ -2755,11 +2756,14 @@ static int nvme_probe(struct pci_dev *pdev,
>> const struct pci_device_id *id)
>>           if (!dev)
>>                   return -ENOMEM;
>>
>> -       dev->queues = kcalloc_node(max_queue_count(), sizeof(struct
>> nvme_queue),
>> -                                       GFP_KERNEL, node);
>> +       nr_queues =  max_queue_count();
>> +       dev->queues = kcalloc_node(nr_queues, sizeof(struct nvme_queue),
>> +                                  GFP_KERNEL, node);
>>           if (!dev->queues)
>>                   goto free;
>>
>> +       dev->nr_allocated_queue = nr_queues;
>> +
>>           dev->dev = get_device(&pdev->dev);
>>           pci_set_drvdata(pdev, dev);
>>
>>
>> -Max
>>
>> On 4/10/2020 12:57 PM, Weiping Zhang wrote:
>>> Since the commit 147b27e4bd0 "nvme-pci: allocate device queues storage space at probe"
>>> nvme_alloc_queue will not alloc struct nvme_queue any more.
>>> If user change write/poll_queues to larger than the number of
>>> allocated queue in nvme_probe, nvme_alloc_queue will touch
>>> the memory out of boundary.
>>>
>>> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
>>> ---
>>>    drivers/nvme/host/pci.c | 10 ++++++++--
>>>    1 file changed, 8 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>>> index 4e79e412b276..cc10258e578e 100644
>>> --- a/drivers/nvme/host/pci.c
>>> +++ b/drivers/nvme/host/pci.c
>>> @@ -89,6 +89,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
>>>     */
>>>    struct nvme_dev {
>>>        struct nvme_queue *queues;
>>> +     int nr_allocated_queue;
>>>        struct blk_mq_tag_set tagset;
>>>        struct blk_mq_tag_set admin_tagset;
>>>        u32 __iomem *dbs;
>>> @@ -2074,6 +2075,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>>>        unsigned long size;
>>>
>>>        nr_io_queues = max_io_queues();
>>> +     if (nr_io_queues > dev->nr_allocated_queue - 1)
>>> +             nr_io_queues = dev->nr_allocated_queue - 1;
>>>
>>>        /*
>>>         * If tags are shared with admin queue (Apple bug), then
>>> @@ -2742,7 +2745,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
>>>
>>>    static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>>    {
>>> -     int node, result = -ENOMEM;
>>> +     int node, nr_queue, result = -ENOMEM;
>>>        struct nvme_dev *dev;
>>>        unsigned long quirks = id->driver_data;
>>>        size_t alloc_size;
>>> @@ -2755,11 +2758,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>>        if (!dev)
>>>                return -ENOMEM;
>>>
>>> -     dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
>>> +     nr_queue = max_queue_count();
>>> +     dev->queues = kcalloc_node(nr_queue, sizeof(struct nvme_queue),
>>>                                        GFP_KERNEL, node);
>>>        if (!dev->queues)
>>>                goto free;
>>>
>>> +     dev->nr_allocated_queue = nr_queue;
>>> +
>>>        dev->dev = get_device(&pdev->dev);
>>>        pci_set_drvdata(pdev, dev);
>>>
>> _______________________________________________
>> linux-nvme mailing list
>> linux-nvme@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-nvme
> Thanks

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-13  9:37     ` Max Gurtovoy
@ 2020-04-13 12:00       ` Weiping Zhang
  2020-04-14 12:59         ` Max Gurtovoy
  0 siblings, 1 reply; 10+ messages in thread
From: Weiping Zhang @ 2020-04-13 12:00 UTC (permalink / raw)
  To: Max Gurtovoy
  Cc: Jens Axboe, sagi, Weiping Zhang, linux-nvme, Christoph Hellwig,
	Keith Busch

On Mon, Apr 13, 2020 at 5:37 PM Max Gurtovoy <maxg@mellanox.com> wrote:
>
>
> On 4/13/2020 4:01 AM, Weiping Zhang wrote:
> > On Sun, Apr 12, 2020 at 8:38 PM Max Gurtovoy <maxg@mellanox.com> wrote:
> > Hi Max,
> >
> >> hi,
> >>
> >> how about the following minor update:
> >>
> >> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> >> index 4e79e41..46ab28b 100644
> >> --- a/drivers/nvme/host/pci.c
> >> +++ b/drivers/nvme/host/pci.c
> >> @@ -89,6 +89,7 @@
> >>     */
> >>    struct nvme_dev {
> >>           struct nvme_queue *queues;
> >> +       int nr_allocated_queue;
> >>           struct blk_mq_tag_set tagset;
> >>           struct blk_mq_tag_set admin_tagset;
> >>           u32 __iomem *dbs;
> >> @@ -209,15 +210,15 @@ struct nvme_iod {
> >>           struct scatterlist *sg;
> >>    };
> >>
> >> -static unsigned int max_io_queues(void)
> >> +static unsigned int nr_dev_io_queues(struct nvme_dev *dev)
> >>    {
> >> -       return num_possible_cpus() + write_queues + poll_queues;
> >> +       return dev->nr_allocated_queue - 1;
> >>    }
> >>
> >>    static unsigned int max_queue_count(void)
> >>    {
> >>           /* IO queues + admin queue */
> >> -       return 1 + max_io_queues();
> >> +       return 1 + num_possible_cpus() + write_queues + poll_queues;
> >>    }
> >>
> >>    static inline unsigned int nvme_dbbuf_size(u32 stride)
> >> @@ -2073,7 +2074,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
> >>           int result, nr_io_queues;
> >>           unsigned long size;
> >>
> >> -       nr_io_queues = max_io_queues();
> >> +       nr_io_queues = nr_dev_io_queues(dev);
> >>
> > It may have some problem when user decrease queue count for multiple tagset map.
> > For example, there are total 128 IO and 96 cpus(system),
> > insmod nvme write_queues=32
> > nvme_probe will allocate 129(128io + 1 admin), nr_allocated_queue=129;
> > and then user decrease queue count
> > echo 2 > /sys/module/nvme/parameters/write_queues
> > echo 1 > /sys/block/nvme0n1/device/reset_controller.
> > nvme_setup_io_queues should use
> > 96(num_possible_cpus) + 2(write_queues) instead of 129(nr_allocated_queue).
>
> Any change that you will try to do (increase/decrease) will not effect.
>
> If you want it to effect, you need to make nvme_probe to run.
>
> I don't see a value only for making the code not to crash but not really
> effect the queue count.
>
> write_queues and poll queues shouldn't be writable IMO.
>
I think we can keep it writable; the use case is to set up as many IO queues
as possible when loading the nvme module, and then change the queue count for
each tagset map dynamically.

> Since nvme_dbbuf_dma_alloc/nvme_dbbuf_dma_free also call
> max_queue_count() that uses writable module params.
>
> we can save this values locally or make it read-only param.

It's not safe to use max_queue_count() for these two functions, and there is
also a problem in nvme_dbbuf_dma_alloc():

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
        unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

        if (dev->dbbuf_dbs)
                return 0;
        ...

It is not aware of whether the queue count has changed.

But it can be fixed with:

unsigned int mem_size = dev->nr_allocated_queue * 8 * dev->db_stride;
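As a rough, stand-alone sketch of that sizing change (mock types only; in the
real driver the count is derived inside nvme_dbbuf_size() in pci.c):

#include <stdio.h>

struct nvme_dev {
	int nr_allocated_queue;		/* recorded once in nvme_probe() */
	unsigned int db_stride;
};

/* Size the doorbell buffer from what was actually allocated rather than
 * from the writable module parameters, as suggested above. */
static unsigned int dbbuf_size(const struct nvme_dev *dev)
{
	return dev->nr_allocated_queue * 8 * dev->db_stride;
}

int main(void)
{
	struct nvme_dev dev = { .nr_allocated_queue = 129, .db_stride = 1 };

	printf("dbbuf size: %u bytes\n", dbbuf_size(&dev));	/* 1032 */
	return 0;
}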

Thanks

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-13 12:00       ` Weiping Zhang
@ 2020-04-14 12:59         ` Max Gurtovoy
  2020-04-22  8:37           ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: Max Gurtovoy @ 2020-04-14 12:59 UTC (permalink / raw)
  To: Weiping Zhang
  Cc: Jens Axboe, sagi, Weiping Zhang, linux-nvme, Christoph Hellwig,
	Keith Busch


On 4/13/2020 3:00 PM, Weiping Zhang wrote:
> On Mon, Apr 13, 2020 at 5:37 PM Max Gurtovoy <maxg@mellanox.com> wrote:
>>
>> On 4/13/2020 4:01 AM, Weiping Zhang wrote:
>>> On Sun, Apr 12, 2020 at 8:38 PM Max Gurtovoy <maxg@mellanox.com> wrote:
>>> Hi Max,
>>>
>>>> hi,
>>>>
>>>> how about the following minor update:
>>>>
>>>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>>>> index 4e79e41..46ab28b 100644
>>>> --- a/drivers/nvme/host/pci.c
>>>> +++ b/drivers/nvme/host/pci.c
>>>> @@ -89,6 +89,7 @@
>>>>      */
>>>>     struct nvme_dev {
>>>>            struct nvme_queue *queues;
>>>> +       int nr_allocated_queue;
>>>>            struct blk_mq_tag_set tagset;
>>>>            struct blk_mq_tag_set admin_tagset;
>>>>            u32 __iomem *dbs;
>>>> @@ -209,15 +210,15 @@ struct nvme_iod {
>>>>            struct scatterlist *sg;
>>>>     };
>>>>
>>>> -static unsigned int max_io_queues(void)
>>>> +static unsigned int nr_dev_io_queues(struct nvme_dev *dev)
>>>>     {
>>>> -       return num_possible_cpus() + write_queues + poll_queues;
>>>> +       return dev->nr_allocated_queue - 1;
>>>>     }
>>>>
>>>>     static unsigned int max_queue_count(void)
>>>>     {
>>>>            /* IO queues + admin queue */
>>>> -       return 1 + max_io_queues();
>>>> +       return 1 + num_possible_cpus() + write_queues + poll_queues;
>>>>     }
>>>>
>>>>     static inline unsigned int nvme_dbbuf_size(u32 stride)
>>>> @@ -2073,7 +2074,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>>>>            int result, nr_io_queues;
>>>>            unsigned long size;
>>>>
>>>> -       nr_io_queues = max_io_queues();
>>>> +       nr_io_queues = nr_dev_io_queues(dev);
>>>>
>>> It may have some problem when user decrease queue count for multiple tagset map.
>>> For example, there are total 128 IO and 96 cpus(system),
>>> insmod nvme write_queues=32
>>> nvme_probe will allocate 129(128io + 1 admin), nr_allocated_queue=129;
>>> and then user decrease queue count
>>> echo 2 > /sys/module/nvme/parameters/write_queues
>>> echo 1 > /sys/block/nvme0n1/device/reset_controller.
>>> nvme_setup_io_queues should use
>>> 96(num_possible_cpus) + 2(write_queues) instead of 129(nr_allocated_queue).
>> Any change that you will try to do (increase/decrease) will not effect.
>>
>> If you want it to effect, you need to make nvme_probe to run.
>>
>> I don't see a value only for making the code not to crash but not really
>> effect the queue count.
>>
>> write_queues and poll queues shouldn't be writable IMO.
>>
> I think we can keep it writeable, the user case is that setup as many io
> queues as possible when load nvme module, then change queue count
> for each tag set map dynamically.

We can keep it writable, but I prefer not to change the controller's initial
queue count on a reset-controller operation.

So we can keep dev->write_queues and dev->poll_queues counts for that.

You can use the writable param if you aim to hotplug a new device and want it
to probe with fewer/more queues.

IMO this feature should somehow be configured using nvme-cli, as we do with
fabrics controllers, where we never change these values after the initial
connection.

Keith/Christoph,

what is the right approach in your opinion?


>
>> Since nvme_dbbuf_dma_alloc/nvme_dbbuf_dma_free also call
>> max_queue_count() that uses writable module params.
>>
>> we can save this values locally or make it read-only param.
> It's not safe to use max_queue_count() for these two function,
> and there is also a problem in nvme_dbbuf_dma_alloc,
> static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
> {
>          unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
>
>          if (dev->dbbuf_dbs)
>                  return 0;
> it does not aware queue count was changed or not.
>
> But it can be fixed by
> unsigned int mem_size = nvme_dev->nr_allocated_queue * 8 * dev->db_stride;
>
> Thanks

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-14 12:59         ` Max Gurtovoy
@ 2020-04-22  8:37           ` Christoph Hellwig
  2020-04-22  9:24             ` weiping zhang
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2020-04-22  8:37 UTC (permalink / raw)
  To: Max Gurtovoy
  Cc: Jens Axboe, Weiping Zhang, Weiping Zhang, linux-nvme,
	Christoph Hellwig, Keith Busch, sagi

On Tue, Apr 14, 2020 at 03:59:12PM +0300, Max Gurtovoy wrote:
> > > write_queues and poll queues shouldn't be writable IMO.
> > > 
> > I think we can keep it writeable, the user case is that setup as many io
> > queues as possible when load nvme module, then change queue count
> > for each tag set map dynamically.
> 
> We can keep it writable but I prefer not change the controller initial queue
> count after reset controller operation.
> 
> So we can keep dev->write_queues and dev->poll_queues count for that.
> 
> You can use the writable param in case you aim to hotplug a new device and
> you want it to probe with less/more queues.
> 
> IMO this feature should've somehow configured using nvme-cli as we do with
> fabrics controllers that we never change this values after initial
> connection.
> 
> Keith/Christoph,
> 
> what is the right approach in your opinion ?

The problem with PCIe is that we only have a per-controller interface
once the controller is probed.  So a global parameter that can be
changed, but is only sampled once at probe time, seems the easiest to
me.  We could also allow a per-controller sysfs file that only takes
effect after a reset, which seems a little nicer, but adds a lot of
boilerplate for just being a little nicer, so I'm not entirely sure
it is worth the effort.

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-22  8:37           ` Christoph Hellwig
@ 2020-04-22  9:24             ` weiping zhang
  2020-04-22 16:57               ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: weiping zhang @ 2020-04-22  9:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Weiping Zhang, Weiping Zhang, linux-nvme,
	Keith Busch, Max Gurtovoy, sagi

On Wed, Apr 22, 2020 at 01:37:47AM -0700, Christoph Hellwig wrote:
> On Tue, Apr 14, 2020 at 03:59:12PM +0300, Max Gurtovoy wrote:
> > > > write_queues and poll queues shouldn't be writable IMO.
> > > > 
> > > I think we can keep it writeable, the user case is that setup as many io
> > > queues as possible when load nvme module, then change queue count
> > > for each tag set map dynamically.
> > 
> > We can keep it writable but I prefer not change the controller initial queue
> > count after reset controller operation.
> > 
> > So we can keep dev->write_queues and dev->poll_queues count for that.
> > 
> > You can use the writable param in case you aim to hotplug a new device and
> > you want it to probe with less/more queues.
> > 
> > IMO this feature should've somehow configured using nvme-cli as we do with
> > fabrics controllers that we never change this values after initial
> > connection.
> > 
> > Keith/Christoph,
> > 
> > what is the right approach in your opinion ?
> 
> The problem with PCIe is that we only have a per-controller interface
> once the controller is probed.  So a global paramter that can be
> changed, but only is sampled once at probe time seems the easiest to
> me.  We could also allow a per-controller sysfs file that only takes
> effect after a reset, which seems a little nicer, but adds a lot of
> boilerplate for just being a little nicer, so I'm not entirely sure
> if it is worth the effort.
Hi Christoph,

Because in real use cases the number of queues of each type may not be
suitable, we need the ability to adjust them without a hotplug.

If so, nvme_dev needs to record how many write/poll queues were saved in
nvme_probe, and then use them in the reset flow:

struct nvme_dev {
...
	unsigned int write_queues;
	unsigned int poll_queues;
};


How about adding a sysfs file

/sys/block/nvme0n1/device/io_queues_reload

and then checking it in nvme_setup_io_queues: if it's true, use the module
parameters, otherwise use the values saved in nvme_probe.
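A stand-alone mock-up of that proposal (io_queues_reload is the hypothetical
sysfs knob named above; everything else here is simplified for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Writable module parameters (global, as today). */
static unsigned int write_queues = 2;
static unsigned int poll_queues;

struct nvme_dev {
	unsigned int write_queues;	/* values saved at probe time */
	unsigned int poll_queues;
	bool io_queues_reload;		/* proposed per-device sysfs flag */
};

static unsigned int io_queues_wanted(struct nvme_dev *dev, unsigned int nr_cpus)
{
	if (dev->io_queues_reload) {
		/* re-sample the module parameters on this reset */
		dev->write_queues = write_queues;
		dev->poll_queues = poll_queues;
	}
	return nr_cpus + dev->write_queues + dev->poll_queues;
}

int main(void)
{
	struct nvme_dev dev = {
		.write_queues = 32,	/* as probed */
		.poll_queues = 0,
		.io_queues_reload = true,
	};

	printf("IO queues wanted after reset: %u\n", io_queues_wanted(&dev, 96));
	return 0;
}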

Thanks

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-22  9:24             ` weiping zhang
@ 2020-04-22 16:57               ` Christoph Hellwig
  2020-04-23  7:59                 ` Weiping Zhang
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2020-04-22 16:57 UTC (permalink / raw)
  To: weiping zhang
  Cc: Jens Axboe, Weiping Zhang, Weiping Zhang, linux-nvme,
	Christoph Hellwig, Keith Busch, Max Gurtovoy, sagi

On Wed, Apr 22, 2020 at 05:24:27PM +0800, weiping zhang wrote:
> > The problem with PCIe is that we only have a per-controller interface
> > once the controller is probed.  So a global paramter that can be
> > changed, but only is sampled once at probe time seems the easiest to
> > me.  We could also allow a per-controller sysfs file that only takes
> > effect after a reset, which seems a little nicer, but adds a lot of
> > boilerplate for just being a little nicer, so I'm not entirely sure
> > if it is worth the effort.
> Hi Christoph,
> 
> Because in the real user case, the number of each type queue may
> not very suitable, it needs a ability to adjust them without hotplug.
> 
> If so, the nvme_dev needs record how many write/poll
> queues saved in nvme_probe, and then use them in reset flow.

When I wrote probe above I meant to include reset, sorry.

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe
  2020-04-22 16:57               ` Christoph Hellwig
@ 2020-04-23  7:59                 ` Weiping Zhang
  0 siblings, 0 replies; 10+ messages in thread
From: Weiping Zhang @ 2020-04-23  7:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, sagi, Weiping Zhang, linux-nvme, weiping zhang,
	Keith Busch, Max Gurtovoy

On Thu, Apr 23, 2020 at 12:57 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Wed, Apr 22, 2020 at 05:24:27PM +0800, weiping zhang wrote:
> > > The problem with PCIe is that we only have a per-controller interface
> > > once the controller is probed.  So a global paramter that can be
> > > changed, but only is sampled once at probe time seems the easiest to
> > > me.  We could also allow a per-controller sysfs file that only takes
> > > effect after a reset, which seems a little nicer, but adds a lot of
> > > boilerplate for just being a little nicer, so I'm not entirely sure
> > > if it is worth the effort.
> > Hi Christoph,
> >
> > Because in the real user case, the number of each type queue may
> > not very suitable, it needs a ability to adjust them without hotplug.
> >
> > If so, the nvme_dev needs record how many write/poll
> > queues saved in nvme_probe, and then use them in reset flow.
>
> When I wrote probe above I meant to include reset, sorry.

Hi Christoph,

I have sent V2, based on nvme-5.8 0e6e74230.

Thanks

_______________________________________________
linux-nvme mailing list
linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2020-04-23  7:59 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-10  9:57 [PATCH] nvme: align io queue count with allocated nvme_queue in nvme_probe Weiping Zhang
2020-04-12 12:38 ` Max Gurtovoy
2020-04-13  1:01   ` Weiping Zhang
2020-04-13  9:37     ` Max Gurtovoy
2020-04-13 12:00       ` Weiping Zhang
2020-04-14 12:59         ` Max Gurtovoy
2020-04-22  8:37           ` Christoph Hellwig
2020-04-22  9:24             ` weiping zhang
2020-04-22 16:57               ` Christoph Hellwig
2020-04-23  7:59                 ` Weiping Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).