From: Max Gurtovoy <maxg@mellanox.com>
To: Weiping Zhang <zwp10758@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>,
sagi@grimberg.me, Weiping Zhang <zhangweiping@didiglobal.com>,
linux-nvme@lists.infradead.org,
Christoph Hellwig <hch@infradead.org>,
Keith Busch <kbusch@kernel.org>
Subject: Re: [PATCH] nvme: align io queue count with allocted nvme_queue in nvme_probe
Date: Mon, 13 Apr 2020 12:37:00 +0300 [thread overview]
Message-ID: <66add5c2-62b9-5c2d-977b-0499834b2b7a@mellanox.com> (raw)
In-Reply-To: <CAA70yB63bHGcFOzKPrLz+-bjHoEMQWiZEpFyoZ72rzDy9ZaO3Q@mail.gmail.com>
On 4/13/2020 4:01 AM, Weiping Zhang wrote:
> On Sun, Apr 12, 2020 at 8:38 PM Max Gurtovoy <maxg@mellanox.com> wrote:
> Hi Max,
>
>> hi,
>>
>> how about the following minor update:
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 4e79e41..46ab28b 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -89,6 +89,7 @@
>> */
>> struct nvme_dev {
>> struct nvme_queue *queues;
>> + int nr_allocated_queue;
>> struct blk_mq_tag_set tagset;
>> struct blk_mq_tag_set admin_tagset;
>> u32 __iomem *dbs;
>> @@ -209,15 +210,15 @@ struct nvme_iod {
>> struct scatterlist *sg;
>> };
>>
>> -static unsigned int max_io_queues(void)
>> +static unsigned int nr_dev_io_queues(struct nvme_dev *dev)
>> {
>> - return num_possible_cpus() + write_queues + poll_queues;
>> + return dev->nr_allocated_queue - 1;
>> }
>>
>> static unsigned int max_queue_count(void)
>> {
>> /* IO queues + admin queue */
>> - return 1 + max_io_queues();
>> + return 1 + num_possible_cpus() + write_queues + poll_queues;
>> }
>>
>> static inline unsigned int nvme_dbbuf_size(u32 stride)
>> @@ -2073,7 +2074,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>> int result, nr_io_queues;
>> unsigned long size;
>>
>> - nr_io_queues = max_io_queues();
>> + nr_io_queues = nr_dev_io_queues(dev);
>>
> It may cause a problem when the user decreases the queue count for a
> multiple-tagset map.
> For example, with 128 I/O queues in total and 96 CPUs in the system:
> insmod nvme write_queues=32
> nvme_probe will allocate 129 queues (128 I/O + 1 admin), so nr_allocated_queue=129;
> then the user decreases the queue count:
> echo 2 > /sys/module/nvme/parameters/write_queues
> echo 1 > /sys/block/nvme0n1/device/reset_controller
> nvme_setup_io_queues should use
> 96 (num_possible_cpus) + 2 (write_queues) instead of 129 (nr_allocated_queue).
Any change you make (increase or decrease) will not take effect.
For it to take effect, nvme_probe would have to run again.
I don't see the value in only keeping the code from crashing without
actually affecting the queue count.
write_queues and poll_queues shouldn't be writable, IMO.
Since nvme_dbbuf_dma_alloc/nvme_dbbuf_dma_free also call
max_queue_count(), which uses the writable module params,
we can either save these values locally or make the params read-only.
>> /*
>> * If tags are shared with admin queue (Apple bug), then
>> @@ -2742,7 +2743,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
>>
>> static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>> {
>> - int node, result = -ENOMEM;
>> + int node, nr_queues, result = -ENOMEM;
>> struct nvme_dev *dev;
>> unsigned long quirks = id->driver_data;
>> size_t alloc_size;
>> @@ -2755,11 +2756,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>> if (!dev)
>> return -ENOMEM;
>>
>> - dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
>> - GFP_KERNEL, node);
>> + nr_queues = max_queue_count();
>> + dev->queues = kcalloc_node(nr_queues, sizeof(struct nvme_queue),
>> + GFP_KERNEL, node);
>> if (!dev->queues)
>> goto free;
>>
>> + dev->nr_allocated_queue = nr_queues;
>> +
>> dev->dev = get_device(&pdev->dev);
>> pci_set_drvdata(pdev, dev);
>>
>>
>> -Max
>>
>> On 4/10/2020 12:57 PM, Weiping Zhang wrote:
>>> Since commit 147b27e4bd0 ("nvme-pci: allocate device queues storage space at probe"),
>>> nvme_alloc_queue no longer allocates struct nvme_queue itself.
>>> If the user raises write_queues/poll_queues above the number of
>>> queues allocated in nvme_probe, nvme_alloc_queue will access
>>> memory out of bounds.
>>>
>>> Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
>>> ---
>>> drivers/nvme/host/pci.c | 10 ++++++++--
>>> 1 file changed, 8 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>>> index 4e79e412b276..cc10258e578e 100644
>>> --- a/drivers/nvme/host/pci.c
>>> +++ b/drivers/nvme/host/pci.c
>>> @@ -89,6 +89,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
>>> */
>>> struct nvme_dev {
>>> struct nvme_queue *queues;
>>> + int nr_allocated_queue;
>>> struct blk_mq_tag_set tagset;
>>> struct blk_mq_tag_set admin_tagset;
>>> u32 __iomem *dbs;
>>> @@ -2074,6 +2075,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>>> unsigned long size;
>>>
>>> nr_io_queues = max_io_queues();
>>> + if (nr_io_queues > dev->nr_allocated_queue - 1)
>>> + nr_io_queues = dev->nr_allocated_queue - 1;
>>>
>>> /*
>>> * If tags are shared with admin queue (Apple bug), then
>>> @@ -2742,7 +2745,7 @@ static void nvme_async_probe(void *data, async_cookie_t cookie)
>>>
>>> static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>> {
>>> - int node, result = -ENOMEM;
>>> + int node, nr_queue, result = -ENOMEM;
>>> struct nvme_dev *dev;
>>> unsigned long quirks = id->driver_data;
>>> size_t alloc_size;
>>> @@ -2755,11 +2758,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>> if (!dev)
>>> return -ENOMEM;
>>>
>>> - dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
>>> + nr_queue = max_queue_count();
>>> + dev->queues = kcalloc_node(nr_queue, sizeof(struct nvme_queue),
>>> GFP_KERNEL, node);
>>> if (!dev->queues)
>>> goto free;
>>>
>>> + dev->nr_allocated_queue = nr_queue;
>>> +
>>> dev->dev = get_device(&pdev->dev);
>>> pci_set_drvdata(pdev, dev);
>>>
>> _______________________________________________
>> linux-nvme mailing list
>> linux-nvme@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-nvme
> Thanks
Thread overview: 10+ messages
2020-04-10 9:57 [PATCH] nvme: align io queue count with allocted nvme_queue in nvme_probe Weiping Zhang
2020-04-12 12:38 ` Max Gurtovoy
2020-04-13 1:01 ` Weiping Zhang
2020-04-13 9:37 ` Max Gurtovoy [this message]
2020-04-13 12:00 ` Weiping Zhang
2020-04-14 12:59 ` Max Gurtovoy
2020-04-22 8:37 ` Christoph Hellwig
2020-04-22 9:24 ` weiping zhang
2020-04-22 16:57 ` Christoph Hellwig
2020-04-23 7:59 ` Weiping Zhang