amd-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] drm/amdgpu: Fix incorrect return value
@ 2024-04-03  7:06 YiPeng Chai
  2024-04-03 10:35 ` Zhou1, Tao
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: YiPeng Chai @ 2024-04-03  7:06 UTC (permalink / raw)
  To: amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang, YiPeng Chai

[Why]
  After calling amdgpu_vram_mgr_reserve_range
multiple times with the same address, calling
amdgpu_vram_mgr_query_page_status will always
return -EBUSY.
  From the second call to amdgpu_vram_mgr_reserve_range,
the same address will be added to the reservations_pending
list again and is never moved to the reserved_pages list
because the address had been reserved.

[How]
  First add the address status check before calling
amdgpu_vram_mgr_do_reserve, if the address is already
reserved, do nothing; If the address is already in the
reservations_pending list, directly reserve memory;
only add new nodes for the addresses that are not in the
reserved_pages list and reservations_pending list.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28 +++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 1e36c428d254..0bf3f4092900 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
 
 		dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
 			rsv->start, rsv->size);
-
 		vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
 		atomic64_add(vis_usage, &mgr->vis_usage);
 		spin_lock(&man->bdev->lru_lock);
@@ -340,19 +339,30 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr,
 				  uint64_t start, uint64_t size)
 {
 	struct amdgpu_vram_reservation *rsv;
+	int ret = 0;
 
-	rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
-	if (!rsv)
-		return -ENOMEM;
+	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
+	if (!ret)
+		return 0;
+
+	if (ret == -ENOENT) {
+		rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
+		if (!rsv)
+			return -ENOMEM;
 
-	INIT_LIST_HEAD(&rsv->allocated);
-	INIT_LIST_HEAD(&rsv->blocks);
+		INIT_LIST_HEAD(&rsv->allocated);
+		INIT_LIST_HEAD(&rsv->blocks);
 
-	rsv->start = start;
-	rsv->size = size;
+		rsv->start = start;
+		rsv->size = size;
+
+		mutex_lock(&mgr->lock);
+		list_add_tail(&rsv->blocks, &mgr->reservations_pending);
+		mutex_unlock(&mgr->lock);
+
+	}
 
 	mutex_lock(&mgr->lock);
-	list_add_tail(&rsv->blocks, &mgr->reservations_pending);
 	amdgpu_vram_mgr_do_reserve(&mgr->manager);
 	mutex_unlock(&mgr->lock);
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-03  7:06 [PATCH] drm/amdgpu: Fix incorrect return value YiPeng Chai
@ 2024-04-03 10:35 ` Zhou1, Tao
  2024-04-07  2:20   ` Chai, Thomas
  2024-04-09  2:51 ` Zhou1, Tao
  2024-04-12  9:22 ` Christian König
  2 siblings, 1 reply; 8+ messages in thread
From: Zhou1, Tao @ 2024-04-03 10:35 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, April 3, 2024 3:07 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [Why]
>   After calling amdgpu_vram_mgr_reserve_range multiple times with the same
> address, calling amdgpu_vram_mgr_query_page_status will always return -
> EBUSY.

[Tao] could you explain why we call amdgpu_vram_mgr_reserve_range multiple times with the same
 address? IIRC, we skip duplicate address before reserve memory.

>   From the second call to amdgpu_vram_mgr_reserve_range, the same address
> will be added to the reservations_pending list again and is never moved to the
> reserved_pages list because the address had been reserved.
>
> [How]
>   First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do nothing; If
> the address is already in the reservations_pending list, directly reserve memory;
> only add new nodes for the addresses that are not in the reserved_pages list and
> reservations_pending list.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28 +++++++++++++-------
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>               dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>                       rsv->start, rsv->size);
> -
>               vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>               atomic64_add(vis_usage, &mgr->vis_usage);
>               spin_lock(&man->bdev->lru_lock);
> @@ -340,19 +339,30 @@ int amdgpu_vram_mgr_reserve_range(struct
> amdgpu_vram_mgr *mgr,
>                                 uint64_t start, uint64_t size)
>  {
>       struct amdgpu_vram_reservation *rsv;
> +     int ret = 0;
>
> -     rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> -     if (!rsv)
> -             return -ENOMEM;
> +     ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> +     if (!ret)
> +             return 0;
> +
> +     if (ret == -ENOENT) {
> +             rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> +             if (!rsv)
> +                     return -ENOMEM;
>
> -     INIT_LIST_HEAD(&rsv->allocated);
> -     INIT_LIST_HEAD(&rsv->blocks);
> +             INIT_LIST_HEAD(&rsv->allocated);
> +             INIT_LIST_HEAD(&rsv->blocks);
>
> -     rsv->start = start;
> -     rsv->size = size;
> +             rsv->start = start;
> +             rsv->size = size;
> +
> +             mutex_lock(&mgr->lock);
> +             list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> +             mutex_unlock(&mgr->lock);
> +
> +     }
>
>       mutex_lock(&mgr->lock);
> -     list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>       amdgpu_vram_mgr_do_reserve(&mgr->manager);
>       mutex_unlock(&mgr->lock);
>
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-03 10:35 ` Zhou1, Tao
@ 2024-04-07  2:20   ` Chai, Thomas
  2024-04-08  8:40     ` Zhou1, Tao
  0 siblings, 1 reply; 8+ messages in thread
From: Chai, Thomas @ 2024-04-07  2:20 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Wednesday, April 3, 2024 6:36 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, April 3, 2024 3:07 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> <YiPeng.Chai@amd.com>
> Subject: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [Why]
>   After calling amdgpu_vram_mgr_reserve_range multiple times with the
> same address, calling amdgpu_vram_mgr_query_page_status will always
> return - EBUSY.

>[Tao] could you explain why we call amdgpu_vram_mgr_reserve_range multiple times with the same  address? IIRC, we skip duplicate address before reserve memory.

[Thomas]
   When poison creation interrupt is received, since some poisoning addresses may have been allocated by some processes, reserving these memories will fail.
These memory will be tried to reserve again after killing the poisoned process in the subsequent poisoning consumption interrupt handler.
so amdgpu_vram_mgr_reserve_range needs to be called multiple times with the same address.

>   From the second call to amdgpu_vram_mgr_reserve_range, the same
> address will be added to the reservations_pending list again and is
> never moved to the reserved_pages list because the address had been reserved.
>
> [How]
>   First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> nothing; If the address is already in the reservations_pending list,
> directly reserve memory; only add new nodes for the addresses that are
> not in the reserved_pages list and reservations_pending list.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> +++++++++++++-------
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>               dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>                       rsv->start, rsv->size);
> -
>               vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>               atomic64_add(vis_usage, &mgr->vis_usage);
>               spin_lock(&man->bdev->lru_lock); @@ -340,19 +339,30 @@
> int amdgpu_vram_mgr_reserve_range(struct
> amdgpu_vram_mgr *mgr,
>                                 uint64_t start, uint64_t size)  {
>       struct amdgpu_vram_reservation *rsv;
> +     int ret = 0;
>
> -     rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> -     if (!rsv)
> -             return -ENOMEM;
> +     ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> +     if (!ret)
> +             return 0;
> +
> +     if (ret == -ENOENT) {
> +             rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> +             if (!rsv)
> +                     return -ENOMEM;
>
> -     INIT_LIST_HEAD(&rsv->allocated);
> -     INIT_LIST_HEAD(&rsv->blocks);
> +             INIT_LIST_HEAD(&rsv->allocated);
> +             INIT_LIST_HEAD(&rsv->blocks);
>
> -     rsv->start = start;
> -     rsv->size = size;
> +             rsv->start = start;
> +             rsv->size = size;
> +
> +             mutex_lock(&mgr->lock);
> +             list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> +             mutex_unlock(&mgr->lock);
> +
> +     }
>
>       mutex_lock(&mgr->lock);
> -     list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>       amdgpu_vram_mgr_do_reserve(&mgr->manager);
>       mutex_unlock(&mgr->lock);
>
> --
> 2.34.1



^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-07  2:20   ` Chai, Thomas
@ 2024-04-08  8:40     ` Zhou1, Tao
  2024-04-08  9:35       ` Chai, Thomas
  0 siblings, 1 reply; 8+ messages in thread
From: Zhou1, Tao @ 2024-04-08  8:40 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Sunday, April 7, 2024 10:21 AM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [AMD Official Use Only - General]
>
> -----------------
> Best Regards,
> Thomas
>
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Wednesday, April 3, 2024 6:36 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [AMD Official Use Only - General]
>
> > -----Original Message-----
> > From: Chai, Thomas <YiPeng.Chai@amd.com>
> > Sent: Wednesday, April 3, 2024 3:07 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> > <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> > <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> > Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> > <YiPeng.Chai@amd.com>
> > Subject: [PATCH] drm/amdgpu: Fix incorrect return value
> >
> > [Why]
> >   After calling amdgpu_vram_mgr_reserve_range multiple times with the
> > same address, calling amdgpu_vram_mgr_query_page_status will always
> > return - EBUSY.
>
> >[Tao] could you explain why we call amdgpu_vram_mgr_reserve_range multiple
> times with the same  address? IIRC, we skip duplicate address before reserve
> memory.
>
> [Thomas]
>    When poison creation interrupt is received, since some poisoning addresses may
> have been allocated by some processes, reserving these memories will fail.
> These memory will be tried to reserve again after killing the poisoned process in
> the subsequent poisoning consumption interrupt handler.
> so amdgpu_vram_mgr_reserve_range needs to be called multiple times with the
> same address.
>
> >   From the second call to amdgpu_vram_mgr_reserve_range, the same
> > address will be added to the reservations_pending list again and is
> > never moved to the reserved_pages list because the address had been
> reserved.

[Tao] but if a page is added to reservations_pending list, it should also be put in data->bps array, and when we call amdgpu_ras_add_bad_pages again, amdgpu_ras_check_bad_page_unlock could ignore this page.
So except for amdgpu_ras_add_bad_pages, would you like to call amdgpu_vram_mgr_reserve_range in other place?

> >
> > [How]
> >   First add the address status check before calling
> > amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> > nothing; If the address is already in the reservations_pending list,
> > directly reserve memory; only add new nodes for the addresses that are
> > not in the reserved_pages list and reservations_pending list.
> >
> > Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> > +++++++++++++-------
> >  1 file changed, 19 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > index 1e36c428d254..0bf3f4092900 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> > ttm_resource_manager *man)
> >
> >               dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
> >                       rsv->start, rsv->size);
> > -
> >               vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
> >               atomic64_add(vis_usage, &mgr->vis_usage);
> >               spin_lock(&man->bdev->lru_lock); @@ -340,19 +339,30 @@
> > int amdgpu_vram_mgr_reserve_range(struct
> > amdgpu_vram_mgr *mgr,
> >                                 uint64_t start, uint64_t size)  {
> >       struct amdgpu_vram_reservation *rsv;
> > +     int ret = 0;
> >
> > -     rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> > -     if (!rsv)
> > -             return -ENOMEM;
> > +     ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> > +     if (!ret)
> > +             return 0;
> > +
> > +     if (ret == -ENOENT) {
> > +             rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> > +             if (!rsv)
> > +                     return -ENOMEM;
> >
> > -     INIT_LIST_HEAD(&rsv->allocated);
> > -     INIT_LIST_HEAD(&rsv->blocks);
> > +             INIT_LIST_HEAD(&rsv->allocated);
> > +             INIT_LIST_HEAD(&rsv->blocks);
> >
> > -     rsv->start = start;
> > -     rsv->size = size;
> > +             rsv->start = start;
> > +             rsv->size = size;
> > +
> > +             mutex_lock(&mgr->lock);
> > +             list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> > +             mutex_unlock(&mgr->lock);
> > +
> > +     }
> >
> >       mutex_lock(&mgr->lock);
> > -     list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> >       amdgpu_vram_mgr_do_reserve(&mgr->manager);
> >       mutex_unlock(&mgr->lock);
> >
> > --
> > 2.34.1
>
>


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-08  8:40     ` Zhou1, Tao
@ 2024-04-08  9:35       ` Chai, Thomas
  0 siblings, 0 replies; 8+ messages in thread
From: Chai, Thomas @ 2024-04-08  9:35 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Monday, April 8, 2024 4:41 PM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Sunday, April 7, 2024 10:21 AM
> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> Yang, Stanley <Stanley.Yang@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [AMD Official Use Only - General]
>
> -----------------
> Best Regards,
> Thomas
>
> -----Original Message-----
> From: Zhou1, Tao <Tao.Zhou1@amd.com>
> Sent: Wednesday, April 3, 2024 6:36 PM
> To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> Yang, Stanley <Stanley.Yang@amd.com>
> Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [AMD Official Use Only - General]
>
> > -----Original Message-----
> > From: Chai, Thomas <YiPeng.Chai@amd.com>
> > Sent: Wednesday, April 3, 2024 3:07 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> > <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> > <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> > Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> > <YiPeng.Chai@amd.com>
> > Subject: [PATCH] drm/amdgpu: Fix incorrect return value
> >
> > [Why]
> >   After calling amdgpu_vram_mgr_reserve_range multiple times with
> > the same address, calling amdgpu_vram_mgr_query_page_status will
> > always return - EBUSY.
>
> >[Tao] could you explain why we call amdgpu_vram_mgr_reserve_range
> >multiple
> times with the same  address? IIRC, we skip duplicate address before
> reserve memory.
>
> [Thomas]
>    When poison creation interrupt is received, since some poisoning
> addresses may have been allocated by some processes, reserving these memories will fail.
> These memory will be tried to reserve again after killing the poisoned
> process in the subsequent poisoning consumption interrupt handler.
> so amdgpu_vram_mgr_reserve_range needs to be called multiple times
> with the same address.
>
> >   From the second call to amdgpu_vram_mgr_reserve_range, the same
> > address will be added to the reservations_pending list again and is
> > never moved to the reserved_pages list because the address had been
> reserved.

>[Tao] but if a page is added to reservations_pending list, it should also be put in data->bps array, and when we call amdgpu_ras_add_bad_pages again, amdgpu_ras_check_bad_page_unlock could ignore this page.
So except for amdgpu_ras_add_bad_pages, would you like to call amdgpu_vram_mgr_reserve_range in other place?

[Thomas] Yes,  Since after amdgpu_ras_add_bad_pages is called, the bad pages will be saved to eeprom. When a large number of bad pages need to be reserved, this will delay subsequent memory reservation.
I want to call amdgpu_vram_mgr_reserve_range to reserve memory immediately when driver receives poison creation interrupt, this can reduce the probability of bad memory pages being allocated and storing the bad pages to eeprom can be done slowly.

> >
> > [How]
> >   First add the address status check before calling
> > amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> > nothing; If the address is already in the reservations_pending list,
> > directly reserve memory; only add new nodes for the addresses that
> > are not in the reserved_pages list and reservations_pending list.
> >
> > Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> > +++++++++++++-------
> >  1 file changed, 19 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > index 1e36c428d254..0bf3f4092900 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> > ttm_resource_manager *man)
> >
> >               dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
> >                       rsv->start, rsv->size);
> > -
> >               vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
> >               atomic64_add(vis_usage, &mgr->vis_usage);
> >               spin_lock(&man->bdev->lru_lock); @@ -340,19 +339,30 @@
> > int amdgpu_vram_mgr_reserve_range(struct
> > amdgpu_vram_mgr *mgr,
> >                                 uint64_t start, uint64_t size)  {
> >       struct amdgpu_vram_reservation *rsv;
> > +     int ret = 0;
> >
> > -     rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> > -     if (!rsv)
> > -             return -ENOMEM;
> > +     ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> > +     if (!ret)
> > +             return 0;
> > +
> > +     if (ret == -ENOENT) {
> > +             rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> > +             if (!rsv)
> > +                     return -ENOMEM;
> >
> > -     INIT_LIST_HEAD(&rsv->allocated);
> > -     INIT_LIST_HEAD(&rsv->blocks);
> > +             INIT_LIST_HEAD(&rsv->allocated);
> > +             INIT_LIST_HEAD(&rsv->blocks);
> >
> > -     rsv->start = start;
> > -     rsv->size = size;
> > +             rsv->start = start;
> > +             rsv->size = size;
> > +
> > +             mutex_lock(&mgr->lock);
> > +             list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> > +             mutex_unlock(&mgr->lock);
> > +
> > +     }
> >
> >       mutex_lock(&mgr->lock);
> > -     list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> >       amdgpu_vram_mgr_do_reserve(&mgr->manager);
> >       mutex_unlock(&mgr->lock);
> >
> > --
> > 2.34.1
>
>



^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-03  7:06 [PATCH] drm/amdgpu: Fix incorrect return value YiPeng Chai
  2024-04-03 10:35 ` Zhou1, Tao
@ 2024-04-09  2:51 ` Zhou1, Tao
  2024-04-09  9:28   ` Chai, Thomas
  2024-04-12  9:22 ` Christian König
  2 siblings, 1 reply; 8+ messages in thread
From: Zhou1, Tao @ 2024-04-09  2:51 UTC (permalink / raw)
  To: Chai, Thomas, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, April 3, 2024 3:07 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang,
> Stanley <Stanley.Yang@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>
> Subject: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [Why]
>   After calling amdgpu_vram_mgr_reserve_range multiple times with the same
> address, calling amdgpu_vram_mgr_query_page_status will always return -
> EBUSY.
>   From the second call to amdgpu_vram_mgr_reserve_range, the same address
> will be added to the reservations_pending list again and is never moved to the
> reserved_pages list because the address had been reserved.
>
> [How]
>   First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do nothing; If
> the address is already in the reservations_pending list, directly reserve memory;
> only add new nodes for the addresses that are not in the reserved_pages list and
> reservations_pending list.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28 +++++++++++++-------
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>               dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>                       rsv->start, rsv->size);
> -
>               vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>               atomic64_add(vis_usage, &mgr->vis_usage);
>               spin_lock(&man->bdev->lru_lock);
> @@ -340,19 +339,30 @@ int amdgpu_vram_mgr_reserve_range(struct
> amdgpu_vram_mgr *mgr,
>                                 uint64_t start, uint64_t size)
>  {
>       struct amdgpu_vram_reservation *rsv;
> +     int ret = 0;
>
> -     rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> -     if (!rsv)
> -             return -ENOMEM;
> +     ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> +     if (!ret)
> +             return 0;
> +
> +     if (ret == -ENOENT) {
> +             rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> +             if (!rsv)
> +                     return -ENOMEM;
>
> -     INIT_LIST_HEAD(&rsv->allocated);
> -     INIT_LIST_HEAD(&rsv->blocks);
> +             INIT_LIST_HEAD(&rsv->allocated);
> +             INIT_LIST_HEAD(&rsv->blocks);
>
> -     rsv->start = start;
> -     rsv->size = size;
> +             rsv->start = start;
> +             rsv->size = size;
> +
> +             mutex_lock(&mgr->lock);
> +             list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> +             mutex_unlock(&mgr->lock);

[Tao] we can drop the mutex_unlock and add if (ret != -ENOENT) for the second mutex_lock to avoid unlocking/locking repeatedly.

> +
> +     }
>
>       mutex_lock(&mgr->lock);
> -     list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>       amdgpu_vram_mgr_do_reserve(&mgr->manager);
>       mutex_unlock(&mgr->lock);
>
> --
> 2.34.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-09  2:51 ` Zhou1, Tao
@ 2024-04-09  9:28   ` Chai, Thomas
  0 siblings, 0 replies; 8+ messages in thread
From: Chai, Thomas @ 2024-04-09  9:28 UTC (permalink / raw)
  To: Zhou1, Tao, amd-gfx
  Cc: Zhang, Hawking, Li, Candice, Wang, Yang(Kevin), Yang, Stanley

[AMD Official Use Only - General]

OK


-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@amd.com>
Sent: Tuesday, April 9, 2024 10:52 AM
To: Chai, Thomas <YiPeng.Chai@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang@amd.com>; Li, Candice <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>; Yang, Stanley <Stanley.Yang@amd.com>
Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value

[AMD Official Use Only - General]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@amd.com>
> Sent: Wednesday, April 3, 2024 3:07 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai@amd.com>; Zhang, Hawking
> <Hawking.Zhang@amd.com>; Zhou1, Tao <Tao.Zhou1@amd.com>; Li, Candice
> <Candice.Li@amd.com>; Wang, Yang(Kevin) <KevinYang.Wang@amd.com>;
> Yang, Stanley <Stanley.Yang@amd.com>; Chai, Thomas
> <YiPeng.Chai@amd.com>
> Subject: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [Why]
>   After calling amdgpu_vram_mgr_reserve_range multiple times with the
> same address, calling amdgpu_vram_mgr_query_page_status will always
> return - EBUSY.
>   From the second call to amdgpu_vram_mgr_reserve_range, the same
> address will be added to the reservations_pending list again and is
> never moved to the reserved_pages list because the address had been reserved.
>
> [How]
>   First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> nothing; If the address is already in the reservations_pending list,
> directly reserve memory; only add new nodes for the addresses that are
> not in the reserved_pages list and reservations_pending list.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> +++++++++++++-------
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>               dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>                       rsv->start, rsv->size);
> -
>               vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>               atomic64_add(vis_usage, &mgr->vis_usage);
>               spin_lock(&man->bdev->lru_lock); @@ -340,19 +339,30 @@
> int amdgpu_vram_mgr_reserve_range(struct
> amdgpu_vram_mgr *mgr,
>                                 uint64_t start, uint64_t size)  {
>       struct amdgpu_vram_reservation *rsv;
> +     int ret = 0;
>
> -     rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> -     if (!rsv)
> -             return -ENOMEM;
> +     ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> +     if (!ret)
> +             return 0;
> +
> +     if (ret == -ENOENT) {
> +             rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> +             if (!rsv)
> +                     return -ENOMEM;
>
> -     INIT_LIST_HEAD(&rsv->allocated);
> -     INIT_LIST_HEAD(&rsv->blocks);
> +             INIT_LIST_HEAD(&rsv->allocated);
> +             INIT_LIST_HEAD(&rsv->blocks);
>
> -     rsv->start = start;
> -     rsv->size = size;
> +             rsv->start = start;
> +             rsv->size = size;
> +
> +             mutex_lock(&mgr->lock);
> +             list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> +             mutex_unlock(&mgr->lock);

[Tao] we can drop the mutex_unlock and add if (ret != -ENOENT) for the second mutex_lock to avoid unlocking/locking repeatedly.

> +
> +     }
>
>       mutex_lock(&mgr->lock);
> -     list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>       amdgpu_vram_mgr_do_reserve(&mgr->manager);
>       mutex_unlock(&mgr->lock);
>
> --
> 2.34.1



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] drm/amdgpu: Fix incorrect return value
  2024-04-03  7:06 [PATCH] drm/amdgpu: Fix incorrect return value YiPeng Chai
  2024-04-03 10:35 ` Zhou1, Tao
  2024-04-09  2:51 ` Zhou1, Tao
@ 2024-04-12  9:22 ` Christian König
  2 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2024-04-12  9:22 UTC (permalink / raw)
  To: YiPeng Chai, amd-gfx
  Cc: yipechai, Hawking.Zhang, Tao.Zhou1, Candice.Li, KevinYang.Wang,
	Stanley.Yang

Am 03.04.24 um 09:06 schrieb YiPeng Chai:
> [Why]
>    After calling amdgpu_vram_mgr_reserve_range
> multiple times with the same address, calling
> amdgpu_vram_mgr_query_page_status will always
> return -EBUSY.
>    From the second call to amdgpu_vram_mgr_reserve_range,
> the same address will be added to the reservations_pending
> list again and is never moved to the reserved_pages list
> because the address had been reserved.

Well that sounds like a really bad idea to me. Why is the function 
called multiple times with the same address in the first place ?

Apart from that a note on the coding style below.

>
> [How]
>    First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already
> reserved, do nothing; If the address is already in the
> reservations_pending list, directly reserve memory;
> only add new nodes for the addresses that are not in the
> reserved_pages list and reservations_pending list.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28 +++++++++++++-------
>   1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
>   
>   		dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>   			rsv->start, rsv->size);
> -
>   		vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>   		atomic64_add(vis_usage, &mgr->vis_usage);
>   		spin_lock(&man->bdev->lru_lock);
> @@ -340,19 +339,30 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr,
>   				  uint64_t start, uint64_t size)
>   {
>   	struct amdgpu_vram_reservation *rsv;
> +	int ret = 0;

Don't initialize local variables when it isn't necessary.

>   
> -	rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> -	if (!rsv)
> -		return -ENOMEM;
> +	ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> +	if (!ret)
> +		return 0;
> +
> +	if (ret == -ENOENT) {
> +		rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> +		if (!rsv)
> +			return -ENOMEM;
>   
> -	INIT_LIST_HEAD(&rsv->allocated);
> -	INIT_LIST_HEAD(&rsv->blocks);
> +		INIT_LIST_HEAD(&rsv->allocated);
> +		INIT_LIST_HEAD(&rsv->blocks);
>   
> -	rsv->start = start;
> -	rsv->size = size;
> +		rsv->start = start;
> +		rsv->size = size;
> +
> +		mutex_lock(&mgr->lock);
> +		list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> +		mutex_unlock(&mgr->lock);
> +
> +	}

You should probably not lock/unlock here.

Regards,
Christian.

>   
>   	mutex_lock(&mgr->lock);
> -	list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>   	amdgpu_vram_mgr_do_reserve(&mgr->manager);
>   	mutex_unlock(&mgr->lock);
>   


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-04-12  9:22 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-03  7:06 [PATCH] drm/amdgpu: Fix incorrect return value YiPeng Chai
2024-04-03 10:35 ` Zhou1, Tao
2024-04-07  2:20   ` Chai, Thomas
2024-04-08  8:40     ` Zhou1, Tao
2024-04-08  9:35       ` Chai, Thomas
2024-04-09  2:51 ` Zhou1, Tao
2024-04-09  9:28   ` Chai, Thomas
2024-04-12  9:22 ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).