All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19  2:28 ` xinhui pan
  0 siblings, 0 replies; 30+ messages in thread
From: xinhui pan @ 2021-05-19  2:28 UTC (permalink / raw)
  To: amd-gfx
  Cc: Felix.Kuehling, xinhui pan, dri-devel, alexander.deucher,
	christian.koenig

cpu 1                                           cpu 2
kfd alloc BO A(userptr)                         alloc BO B(GTT)
    ->init -> validate				-> init -> validate -> populate
    init_user_pages				  -> swapout BO A //hit ttm pages limit
	-> get_user_pages (fill up ttm->pages)
	 -> validate -> populate
          -> swapin BO A // Now hit the BUG

We know that get_user_pages may race with swapout on same BO.
Threre are some issues I have met.
1) memory corruption.
This is because we do a swap before memory is setup. ttm_tt_swapout()
just create a swap_storage with its content being 0x0. So when we setup
memory after the swapout. The following swapin makes the memory
corrupted.

2) panic
When swapout happes with get_user_pages, they touch ttm->pages without
anylock. It causes memory corruption too. But I hit page fault mostly.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 928e8d57cd08..42460e4480f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 	struct amdkfd_process_info *process_info = mem->process_info;
 	struct amdgpu_bo *bo = mem->bo;
 	struct ttm_operation_ctx ctx = { true, false };
+	struct page **pages;
 	int ret = 0;
 
 	mutex_lock(&process_info->lock);
@@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 		goto out;
 	}
 
-	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+	pages = kvmalloc_array(bo->tbo.ttm->num_pages,
+			sizeof(struct page *),
+			GFP_KERNEL | __GFP_ZERO);
+	if (!pages)
+		goto unregister_out;
+
+	ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
 	if (ret) {
 		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
 		goto unregister_out;
@@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 		pr_err("%s: Failed to reserve BO\n", __func__);
 		goto release_out;
 	}
+
+	WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
+
+	memcpy(bo->tbo.ttm->pages,
+			pages,
+			sizeof(struct page*) * bo->tbo.ttm->num_pages);
 	amdgpu_bo_placement_from_domain(bo, mem->domain);
 	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 	if (ret)
@@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 release_out:
 	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
 unregister_out:
+	kvfree(pages);
 	if (ret)
 		amdgpu_mn_unregister(bo);
 out:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19  2:28 ` xinhui pan
  0 siblings, 0 replies; 30+ messages in thread
From: xinhui pan @ 2021-05-19  2:28 UTC (permalink / raw)
  To: amd-gfx
  Cc: Felix.Kuehling, xinhui pan, dri-devel, daniel, alexander.deucher,
	christian.koenig

cpu 1                                           cpu 2
kfd alloc BO A(userptr)                         alloc BO B(GTT)
    ->init -> validate				-> init -> validate -> populate
    init_user_pages				  -> swapout BO A //hit ttm pages limit
	-> get_user_pages (fill up ttm->pages)
	 -> validate -> populate
          -> swapin BO A // Now hit the BUG

We know that get_user_pages may race with swapout on same BO.
Threre are some issues I have met.
1) memory corruption.
This is because we do a swap before memory is setup. ttm_tt_swapout()
just create a swap_storage with its content being 0x0. So when we setup
memory after the swapout. The following swapin makes the memory
corrupted.

2) panic
When swapout happes with get_user_pages, they touch ttm->pages without
anylock. It causes memory corruption too. But I hit page fault mostly.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 928e8d57cd08..42460e4480f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 	struct amdkfd_process_info *process_info = mem->process_info;
 	struct amdgpu_bo *bo = mem->bo;
 	struct ttm_operation_ctx ctx = { true, false };
+	struct page **pages;
 	int ret = 0;
 
 	mutex_lock(&process_info->lock);
@@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 		goto out;
 	}
 
-	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+	pages = kvmalloc_array(bo->tbo.ttm->num_pages,
+			sizeof(struct page *),
+			GFP_KERNEL | __GFP_ZERO);
+	if (!pages)
+		goto unregister_out;
+
+	ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
 	if (ret) {
 		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
 		goto unregister_out;
@@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 		pr_err("%s: Failed to reserve BO\n", __func__);
 		goto release_out;
 	}
+
+	WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
+
+	memcpy(bo->tbo.ttm->pages,
+			pages,
+			sizeof(struct page*) * bo->tbo.ttm->num_pages);
 	amdgpu_bo_placement_from_domain(bo, mem->domain);
 	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
 	if (ret)
@@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 release_out:
 	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
 unregister_out:
+	kvfree(pages);
 	if (ret)
 		amdgpu_mn_unregister(bo);
 out:
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.
  2021-05-19  2:28 ` xinhui pan
@ 2021-05-19  2:28   ` xinhui pan
  -1 siblings, 0 replies; 30+ messages in thread
From: xinhui pan @ 2021-05-19  2:28 UTC (permalink / raw)
  To: amd-gfx
  Cc: Felix.Kuehling, xinhui pan, dri-devel, alexander.deucher,
	christian.koenig

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/ttm/ttm_device.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 510e3e001dab..a9772fcc8f9c 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -146,6 +146,7 @@ int ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx,
 				uint32_t num_pages;
 
 				if (!bo->ttm ||
+				    !bo->ttm->pages || !bo->ttm->pages[0] ||
 				    bo->ttm->page_flags & TTM_PAGE_FLAG_SG ||
 				    bo->ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)
 					continue;
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.
@ 2021-05-19  2:28   ` xinhui pan
  0 siblings, 0 replies; 30+ messages in thread
From: xinhui pan @ 2021-05-19  2:28 UTC (permalink / raw)
  To: amd-gfx
  Cc: Felix.Kuehling, xinhui pan, dri-devel, daniel, alexander.deucher,
	christian.koenig

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/ttm/ttm_device.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 510e3e001dab..a9772fcc8f9c 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -146,6 +146,7 @@ int ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx,
 				uint32_t num_pages;
 
 				if (!bo->ttm ||
+				    !bo->ttm->pages || !bo->ttm->pages[0] ||
 				    bo->ttm->page_flags & TTM_PAGE_FLAG_SG ||
 				    bo->ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)
 					continue;
-- 
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19  2:28 ` xinhui pan
@ 2021-05-19  2:39   ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-19  2:39 UTC (permalink / raw)
  To: amd-gfx; +Cc: Deucher, Alexander, Kuehling, Felix, Koenig, Christian, dri-devel

[AMD Official Use Only]

To observe the issue. I made one kfdtest case for debug.
It just alloc a userptr memory and detect if memory is corrupted.
I can hit this failure in 2 minutes. :(

diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp
index 70c8033..a72f53f 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -584,6 +584,32 @@ TEST_F(KFDMemoryTest, ZeroMemorySizeAlloc) {
     TEST_END
 }

+TEST_F(KFDMemoryTest, swap) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    unsigned int size = 128<<20;
+    unsigned int*tmp = (unsigned int *)mmap(0,
+                   size,
+                   PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE,
+                   -1,
+                   0);
+    EXPECT_NE(tmp, MAP_FAILED);
+
+    LOG() << "pls run this with KFDMemoryTest.LargestSysBufferTest" << std::endl;
+    do {
+           memset(tmp, 0xcc, size);
+
+           HsaMemoryBuffer buf(tmp, size);
+           sleep(1);
+           EXPECT_EQ(tmp[0], 0xcccccccc);
+    } while (true);
+
+    munmap(tmp, size);
+
+    TEST_END
+}
+
 // Basic test for hsaKmtAllocMemory
 TEST_F(KFDMemoryTest, MemoryAlloc) {
     TEST_START(TESTPROFILE_RUNALL)
--
2.25.1

________________________________________
发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
发送时间: 2021年5月19日 10:28
收件人: amd-gfx@lists.freedesktop.org
抄送: Kuehling, Felix; Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch; Pan, Xinhui
主题: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

cpu 1                                           cpu 2
kfd alloc BO A(userptr)                         alloc BO B(GTT)
    ->init -> validate                          -> init -> validate -> populate
    init_user_pages                               -> swapout BO A //hit ttm pages limit
        -> get_user_pages (fill up ttm->pages)
         -> validate -> populate
          -> swapin BO A // Now hit the BUG

We know that get_user_pages may race with swapout on same BO.
Threre are some issues I have met.
1) memory corruption.
This is because we do a swap before memory is setup. ttm_tt_swapout()
just create a swap_storage with its content being 0x0. So when we setup
memory after the swapout. The following swapin makes the memory
corrupted.

2) panic
When swapout happes with get_user_pages, they touch ttm->pages without
anylock. It causes memory corruption too. But I hit page fault mostly.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 928e8d57cd08..42460e4480f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
        struct amdkfd_process_info *process_info = mem->process_info;
        struct amdgpu_bo *bo = mem->bo;
        struct ttm_operation_ctx ctx = { true, false };
+       struct page **pages;
        int ret = 0;

        mutex_lock(&process_info->lock);
@@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
                goto out;
        }

-       ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+       pages = kvmalloc_array(bo->tbo.ttm->num_pages,
+                       sizeof(struct page *),
+                       GFP_KERNEL | __GFP_ZERO);
+       if (!pages)
+               goto unregister_out;
+
+       ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
        if (ret) {
                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
                goto unregister_out;
@@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
                pr_err("%s: Failed to reserve BO\n", __func__);
                goto release_out;
        }
+
+       WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
+
+       memcpy(bo->tbo.ttm->pages,
+                       pages,
+                       sizeof(struct page*) * bo->tbo.ttm->num_pages);
        amdgpu_bo_placement_from_domain(bo, mem->domain);
        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
        if (ret)
@@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 release_out:
        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
 unregister_out:
+       kvfree(pages);
        if (ret)
                amdgpu_mn_unregister(bo);
 out:
--
2.25.1


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19  2:39   ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-19  2:39 UTC (permalink / raw)
  To: amd-gfx
  Cc: Deucher, Alexander, daniel, Kuehling, Felix, Koenig, Christian,
	dri-devel

[AMD Official Use Only]

To observe the issue. I made one kfdtest case for debug.
It just alloc a userptr memory and detect if memory is corrupted.
I can hit this failure in 2 minutes. :(

diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp
index 70c8033..a72f53f 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -584,6 +584,32 @@ TEST_F(KFDMemoryTest, ZeroMemorySizeAlloc) {
     TEST_END
 }

+TEST_F(KFDMemoryTest, swap) {
+    TEST_START(TESTPROFILE_RUNALL)
+
+    unsigned int size = 128<<20;
+    unsigned int*tmp = (unsigned int *)mmap(0,
+                   size,
+                   PROT_READ | PROT_WRITE,
+                   MAP_ANONYMOUS | MAP_PRIVATE,
+                   -1,
+                   0);
+    EXPECT_NE(tmp, MAP_FAILED);
+
+    LOG() << "pls run this with KFDMemoryTest.LargestSysBufferTest" << std::endl;
+    do {
+           memset(tmp, 0xcc, size);
+
+           HsaMemoryBuffer buf(tmp, size);
+           sleep(1);
+           EXPECT_EQ(tmp[0], 0xcccccccc);
+    } while (true);
+
+    munmap(tmp, size);
+
+    TEST_END
+}
+
 // Basic test for hsaKmtAllocMemory
 TEST_F(KFDMemoryTest, MemoryAlloc) {
     TEST_START(TESTPROFILE_RUNALL)
--
2.25.1

________________________________________
发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
发送时间: 2021年5月19日 10:28
收件人: amd-gfx@lists.freedesktop.org
抄送: Kuehling, Felix; Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch; Pan, Xinhui
主题: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

cpu 1                                           cpu 2
kfd alloc BO A(userptr)                         alloc BO B(GTT)
    ->init -> validate                          -> init -> validate -> populate
    init_user_pages                               -> swapout BO A //hit ttm pages limit
        -> get_user_pages (fill up ttm->pages)
         -> validate -> populate
          -> swapin BO A // Now hit the BUG

We know that get_user_pages may race with swapout on same BO.
Threre are some issues I have met.
1) memory corruption.
This is because we do a swap before memory is setup. ttm_tt_swapout()
just create a swap_storage with its content being 0x0. So when we setup
memory after the swapout. The following swapin makes the memory
corrupted.

2) panic
When swapout happes with get_user_pages, they touch ttm->pages without
anylock. It causes memory corruption too. But I hit page fault mostly.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 928e8d57cd08..42460e4480f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
        struct amdkfd_process_info *process_info = mem->process_info;
        struct amdgpu_bo *bo = mem->bo;
        struct ttm_operation_ctx ctx = { true, false };
+       struct page **pages;
        int ret = 0;

        mutex_lock(&process_info->lock);
@@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
                goto out;
        }

-       ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
+       pages = kvmalloc_array(bo->tbo.ttm->num_pages,
+                       sizeof(struct page *),
+                       GFP_KERNEL | __GFP_ZERO);
+       if (!pages)
+               goto unregister_out;
+
+       ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
        if (ret) {
                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
                goto unregister_out;
@@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
                pr_err("%s: Failed to reserve BO\n", __func__);
                goto release_out;
        }
+
+       WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
+
+       memcpy(bo->tbo.ttm->pages,
+                       pages,
+                       sizeof(struct page*) * bo->tbo.ttm->num_pages);
        amdgpu_bo_placement_from_domain(bo, mem->domain);
        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
        if (ret)
@@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
 release_out:
        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
 unregister_out:
+       kvfree(pages);
        if (ret)
                amdgpu_mn_unregister(bo);
 out:
--
2.25.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19  2:28 ` xinhui pan
@ 2021-05-19  3:29   ` Felix Kuehling
  -1 siblings, 0 replies; 30+ messages in thread
From: Felix Kuehling @ 2021-05-19  3:29 UTC (permalink / raw)
  To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig, dri-devel

Swapping SG BOs makes no sense, because TTM doesn't own the pages of 
this type of BO.

Last I checked, userptr BOs (and other SG BOs) were protected from 
swapout by the fact that they would not be added to the swap-LRU. But it 
looks like Christian just removed the swap-LRU. I guess this broke that 
protection:

commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
Author: Christian König <christian.koenig@amd.com>
Date:   Tue Oct 6 16:30:09 2020 +0200

     drm/ttm: remove swap LRU v3

     Instead evict round robin from each devices SYSTEM and TT domain.

     v2: reorder num_pages access reported by Dan's script
     v3: fix rebase fallout, num_pages should be 32bit

     Signed-off-by: Christian König <christian.koenig@amd.com>
     Tested-by: Nirmoy Das <nirmoy.das@amd.com>
     Reviewed-by: Huang Rui <ray.huang@amd.com>
     Reviewed-by: Matthew Auld <matthew.auld@intel.com>
     Link: https://patchwork.freedesktop.org/patch/424009/

Regards,
   Felix


On 2021-05-18 10:28 p.m., xinhui pan wrote:
> cpu 1                                           cpu 2
> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>      ->init -> validate				-> init -> validate -> populate
>      init_user_pages				  -> swapout BO A //hit ttm pages limit
> 	-> get_user_pages (fill up ttm->pages)
> 	 -> validate -> populate
>            -> swapin BO A // Now hit the BUG
>
> We know that get_user_pages may race with swapout on same BO.
> Threre are some issues I have met.
> 1) memory corruption.
> This is because we do a swap before memory is setup. ttm_tt_swapout()
> just create a swap_storage with its content being 0x0. So when we setup
> memory after the swapout. The following swapin makes the memory
> corrupted.
>
> 2) panic
> When swapout happes with get_user_pages, they touch ttm->pages without
> anylock. It causes memory corruption too. But I hit page fault mostly.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 928e8d57cd08..42460e4480f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   	struct amdkfd_process_info *process_info = mem->process_info;
>   	struct amdgpu_bo *bo = mem->bo;
>   	struct ttm_operation_ctx ctx = { true, false };
> +	struct page **pages;
>   	int ret = 0;
>   
>   	mutex_lock(&process_info->lock);
> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   		goto out;
>   	}
>   
> -	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> +	pages = kvmalloc_array(bo->tbo.ttm->num_pages,
> +			sizeof(struct page *),
> +			GFP_KERNEL | __GFP_ZERO);
> +	if (!pages)
> +		goto unregister_out;
> +
> +	ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>   	if (ret) {
>   		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>   		goto unregister_out;
> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   		pr_err("%s: Failed to reserve BO\n", __func__);
>   		goto release_out;
>   	}
> +
> +	WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
> +
> +	memcpy(bo->tbo.ttm->pages,
> +			pages,
> +			sizeof(struct page*) * bo->tbo.ttm->num_pages);
>   	amdgpu_bo_placement_from_domain(bo, mem->domain);
>   	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>   	if (ret)
> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   release_out:
>   	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   unregister_out:
> +	kvfree(pages);
>   	if (ret)
>   		amdgpu_mn_unregister(bo);
>   out:

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19  3:29   ` Felix Kuehling
  0 siblings, 0 replies; 30+ messages in thread
From: Felix Kuehling @ 2021-05-19  3:29 UTC (permalink / raw)
  To: xinhui pan, amd-gfx
  Cc: alexander.deucher, daniel, christian.koenig, dri-devel

Swapping SG BOs makes no sense, because TTM doesn't own the pages of 
this type of BO.

Last I checked, userptr BOs (and other SG BOs) were protected from 
swapout by the fact that they would not be added to the swap-LRU. But it 
looks like Christian just removed the swap-LRU. I guess this broke that 
protection:

commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
Author: Christian König <christian.koenig@amd.com>
Date:   Tue Oct 6 16:30:09 2020 +0200

     drm/ttm: remove swap LRU v3

     Instead evict round robin from each devices SYSTEM and TT domain.

     v2: reorder num_pages access reported by Dan's script
     v3: fix rebase fallout, num_pages should be 32bit

     Signed-off-by: Christian König <christian.koenig@amd.com>
     Tested-by: Nirmoy Das <nirmoy.das@amd.com>
     Reviewed-by: Huang Rui <ray.huang@amd.com>
     Reviewed-by: Matthew Auld <matthew.auld@intel.com>
     Link: https://patchwork.freedesktop.org/patch/424009/

Regards,
   Felix


On 2021-05-18 10:28 p.m., xinhui pan wrote:
> cpu 1                                           cpu 2
> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>      ->init -> validate				-> init -> validate -> populate
>      init_user_pages				  -> swapout BO A //hit ttm pages limit
> 	-> get_user_pages (fill up ttm->pages)
> 	 -> validate -> populate
>            -> swapin BO A // Now hit the BUG
>
> We know that get_user_pages may race with swapout on same BO.
> Threre are some issues I have met.
> 1) memory corruption.
> This is because we do a swap before memory is setup. ttm_tt_swapout()
> just create a swap_storage with its content being 0x0. So when we setup
> memory after the swapout. The following swapin makes the memory
> corrupted.
>
> 2) panic
> When swapout happes with get_user_pages, they touch ttm->pages without
> anylock. It causes memory corruption too. But I hit page fault mostly.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 928e8d57cd08..42460e4480f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   	struct amdkfd_process_info *process_info = mem->process_info;
>   	struct amdgpu_bo *bo = mem->bo;
>   	struct ttm_operation_ctx ctx = { true, false };
> +	struct page **pages;
>   	int ret = 0;
>   
>   	mutex_lock(&process_info->lock);
> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   		goto out;
>   	}
>   
> -	ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> +	pages = kvmalloc_array(bo->tbo.ttm->num_pages,
> +			sizeof(struct page *),
> +			GFP_KERNEL | __GFP_ZERO);
> +	if (!pages)
> +		goto unregister_out;
> +
> +	ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>   	if (ret) {
>   		pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>   		goto unregister_out;
> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   		pr_err("%s: Failed to reserve BO\n", __func__);
>   		goto release_out;
>   	}
> +
> +	WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
> +
> +	memcpy(bo->tbo.ttm->pages,
> +			pages,
> +			sizeof(struct page*) * bo->tbo.ttm->num_pages);
>   	amdgpu_bo_placement_from_domain(bo, mem->domain);
>   	ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>   	if (ret)
> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   release_out:
>   	amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   unregister_out:
> +	kvfree(pages);
>   	if (ret)
>   		amdgpu_mn_unregister(bo);
>   out:
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19  3:29   ` Felix Kuehling
@ 2021-05-19  4:09     ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-19  4:09 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian, dri-devel

[AMD Official Use Only]

yes, we really dont swapout SG BOs.
The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.

we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.

________________________________________
发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
发送时间: 2021年5月19日 11:29
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Swapping SG BOs makes no sense, because TTM doesn't own the pages of
this type of BO.

Last I checked, userptr BOs (and other SG BOs) were protected from
swapout by the fact that they would not be added to the swap-LRU. But it
looks like Christian just removed the swap-LRU. I guess this broke that
protection:

commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
Author: Christian König <christian.koenig@amd.com>
Date:   Tue Oct 6 16:30:09 2020 +0200

     drm/ttm: remove swap LRU v3

     Instead evict round robin from each devices SYSTEM and TT domain.

     v2: reorder num_pages access reported by Dan's script
     v3: fix rebase fallout, num_pages should be 32bit

     Signed-off-by: Christian König <christian.koenig@amd.com>
     Tested-by: Nirmoy Das <nirmoy.das@amd.com>
     Reviewed-by: Huang Rui <ray.huang@amd.com>
     Reviewed-by: Matthew Auld <matthew.auld@intel.com>
     Link: https://patchwork.freedesktop.org/patch/424009/

Regards,
   Felix


On 2021-05-18 10:28 p.m., xinhui pan wrote:
> cpu 1                                           cpu 2
> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>      ->init -> validate                               -> init -> validate -> populate
>      init_user_pages                            -> swapout BO A //hit ttm pages limit
>       -> get_user_pages (fill up ttm->pages)
>        -> validate -> populate
>            -> swapin BO A // Now hit the BUG
>
> We know that get_user_pages may race with swapout on same BO.
> Threre are some issues I have met.
> 1) memory corruption.
> This is because we do a swap before memory is setup. ttm_tt_swapout()
> just create a swap_storage with its content being 0x0. So when we setup
> memory after the swapout. The following swapin makes the memory
> corrupted.
>
> 2) panic
> When swapout happes with get_user_pages, they touch ttm->pages without
> anylock. It causes memory corruption too. But I hit page fault mostly.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 928e8d57cd08..42460e4480f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>       struct amdkfd_process_info *process_info = mem->process_info;
>       struct amdgpu_bo *bo = mem->bo;
>       struct ttm_operation_ctx ctx = { true, false };
> +     struct page **pages;
>       int ret = 0;
>
>       mutex_lock(&process_info->lock);
> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               goto out;
>       }
>
> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
> +                     sizeof(struct page *),
> +                     GFP_KERNEL | __GFP_ZERO);
> +     if (!pages)
> +             goto unregister_out;
> +
> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>       if (ret) {
>               pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>               goto unregister_out;
> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               pr_err("%s: Failed to reserve BO\n", __func__);
>               goto release_out;
>       }
> +
> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
> +
> +     memcpy(bo->tbo.ttm->pages,
> +                     pages,
> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>       amdgpu_bo_placement_from_domain(bo, mem->domain);
>       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>       if (ret)
> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   release_out:
>       amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   unregister_out:
> +     kvfree(pages);
>       if (ret)
>               amdgpu_mn_unregister(bo);
>   out:

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19  4:09     ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-19  4:09 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, daniel, Koenig, Christian, dri-devel

[AMD Official Use Only]

yes, we really dont swapout SG BOs.
The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.

we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.

________________________________________
发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
发送时间: 2021年5月19日 11:29
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Swapping SG BOs makes no sense, because TTM doesn't own the pages of
this type of BO.

Last I checked, userptr BOs (and other SG BOs) were protected from
swapout by the fact that they would not be added to the swap-LRU. But it
looks like Christian just removed the swap-LRU. I guess this broke that
protection:

commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
Author: Christian König <christian.koenig@amd.com>
Date:   Tue Oct 6 16:30:09 2020 +0200

     drm/ttm: remove swap LRU v3

     Instead evict round robin from each devices SYSTEM and TT domain.

     v2: reorder num_pages access reported by Dan's script
     v3: fix rebase fallout, num_pages should be 32bit

     Signed-off-by: Christian König <christian.koenig@amd.com>
     Tested-by: Nirmoy Das <nirmoy.das@amd.com>
     Reviewed-by: Huang Rui <ray.huang@amd.com>
     Reviewed-by: Matthew Auld <matthew.auld@intel.com>
     Link: https://patchwork.freedesktop.org/patch/424009/

Regards,
   Felix


On 2021-05-18 10:28 p.m., xinhui pan wrote:
> cpu 1                                           cpu 2
> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>      ->init -> validate                               -> init -> validate -> populate
>      init_user_pages                            -> swapout BO A //hit ttm pages limit
>       -> get_user_pages (fill up ttm->pages)
>        -> validate -> populate
>            -> swapin BO A // Now hit the BUG
>
> We know that get_user_pages may race with swapout on same BO.
> Threre are some issues I have met.
> 1) memory corruption.
> This is because we do a swap before memory is setup. ttm_tt_swapout()
> just create a swap_storage with its content being 0x0. So when we setup
> memory after the swapout. The following swapin makes the memory
> corrupted.
>
> 2) panic
> When swapout happes with get_user_pages, they touch ttm->pages without
> anylock. It causes memory corruption too. But I hit page fault mostly.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 928e8d57cd08..42460e4480f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>       struct amdkfd_process_info *process_info = mem->process_info;
>       struct amdgpu_bo *bo = mem->bo;
>       struct ttm_operation_ctx ctx = { true, false };
> +     struct page **pages;
>       int ret = 0;
>
>       mutex_lock(&process_info->lock);
> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               goto out;
>       }
>
> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
> +                     sizeof(struct page *),
> +                     GFP_KERNEL | __GFP_ZERO);
> +     if (!pages)
> +             goto unregister_out;
> +
> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>       if (ret) {
>               pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>               goto unregister_out;
> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               pr_err("%s: Failed to reserve BO\n", __func__);
>               goto release_out;
>       }
> +
> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
> +
> +     memcpy(bo->tbo.ttm->pages,
> +                     pages,
> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>       amdgpu_bo_placement_from_domain(bo, mem->domain);
>       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>       if (ret)
> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   release_out:
>       amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   unregister_out:
> +     kvfree(pages);
>       if (ret)
>               amdgpu_mn_unregister(bo);
>   out:
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19  4:09     ` Pan, Xinhui
@ 2021-05-19  5:07       ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-19  5:07 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx; +Cc: Deucher, Alexander, Koenig, Christian, dri-devel

[AMD Official Use Only]

I have reverted Chris'  patch, still hit this failure.
Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.

-       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
[snip]
+       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {


________________________________________
发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
发送时间: 2021年5月19日 12:09
收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

yes, we really dont swapout SG BOs.
The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.

we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.

________________________________________
发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
发送时间: 2021年5月19日 11:29
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Swapping SG BOs makes no sense, because TTM doesn't own the pages of
this type of BO.

Last I checked, userptr BOs (and other SG BOs) were protected from
swapout by the fact that they would not be added to the swap-LRU. But it
looks like Christian just removed the swap-LRU. I guess this broke that
protection:

commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
Author: Christian König <christian.koenig@amd.com>
Date:   Tue Oct 6 16:30:09 2020 +0200

     drm/ttm: remove swap LRU v3

     Instead evict round robin from each devices SYSTEM and TT domain.

     v2: reorder num_pages access reported by Dan's script
     v3: fix rebase fallout, num_pages should be 32bit

     Signed-off-by: Christian König <christian.koenig@amd.com>
     Tested-by: Nirmoy Das <nirmoy.das@amd.com>
     Reviewed-by: Huang Rui <ray.huang@amd.com>
     Reviewed-by: Matthew Auld <matthew.auld@intel.com>
     Link: https://patchwork.freedesktop.org/patch/424009/

Regards,
   Felix


On 2021-05-18 10:28 p.m., xinhui pan wrote:
> cpu 1                                           cpu 2
> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>      ->init -> validate                               -> init -> validate -> populate
>      init_user_pages                            -> swapout BO A //hit ttm pages limit
>       -> get_user_pages (fill up ttm->pages)
>        -> validate -> populate
>            -> swapin BO A // Now hit the BUG
>
> We know that get_user_pages may race with swapout on same BO.
> Threre are some issues I have met.
> 1) memory corruption.
> This is because we do a swap before memory is setup. ttm_tt_swapout()
> just create a swap_storage with its content being 0x0. So when we setup
> memory after the swapout. The following swapin makes the memory
> corrupted.
>
> 2) panic
> When swapout happes with get_user_pages, they touch ttm->pages without
> anylock. It causes memory corruption too. But I hit page fault mostly.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 928e8d57cd08..42460e4480f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>       struct amdkfd_process_info *process_info = mem->process_info;
>       struct amdgpu_bo *bo = mem->bo;
>       struct ttm_operation_ctx ctx = { true, false };
> +     struct page **pages;
>       int ret = 0;
>
>       mutex_lock(&process_info->lock);
> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               goto out;
>       }
>
> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
> +                     sizeof(struct page *),
> +                     GFP_KERNEL | __GFP_ZERO);
> +     if (!pages)
> +             goto unregister_out;
> +
> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>       if (ret) {
>               pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>               goto unregister_out;
> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               pr_err("%s: Failed to reserve BO\n", __func__);
>               goto release_out;
>       }
> +
> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
> +
> +     memcpy(bo->tbo.ttm->pages,
> +                     pages,
> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>       amdgpu_bo_placement_from_domain(bo, mem->domain);
>       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>       if (ret)
> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   release_out:
>       amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   unregister_out:
> +     kvfree(pages);
>       if (ret)
>               amdgpu_mn_unregister(bo);
>   out:

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19  5:07       ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-19  5:07 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, daniel, Koenig, Christian, dri-devel

[AMD Official Use Only]

I have reverted Chris'  patch, still hit this failure.
Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.

-       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
-               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
[snip]
+       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
+               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {


________________________________________
发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
发送时间: 2021年5月19日 12:09
收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

yes, we really dont swapout SG BOs.
The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.

we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.

________________________________________
发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
发送时间: 2021年5月19日 11:29
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Swapping SG BOs makes no sense, because TTM doesn't own the pages of
this type of BO.

Last I checked, userptr BOs (and other SG BOs) were protected from
swapout by the fact that they would not be added to the swap-LRU. But it
looks like Christian just removed the swap-LRU. I guess this broke that
protection:

commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
Author: Christian König <christian.koenig@amd.com>
Date:   Tue Oct 6 16:30:09 2020 +0200

     drm/ttm: remove swap LRU v3

     Instead evict round robin from each devices SYSTEM and TT domain.

     v2: reorder num_pages access reported by Dan's script
     v3: fix rebase fallout, num_pages should be 32bit

     Signed-off-by: Christian König <christian.koenig@amd.com>
     Tested-by: Nirmoy Das <nirmoy.das@amd.com>
     Reviewed-by: Huang Rui <ray.huang@amd.com>
     Reviewed-by: Matthew Auld <matthew.auld@intel.com>
     Link: https://patchwork.freedesktop.org/patch/424009/

Regards,
   Felix


On 2021-05-18 10:28 p.m., xinhui pan wrote:
> cpu 1                                           cpu 2
> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>      ->init -> validate                               -> init -> validate -> populate
>      init_user_pages                            -> swapout BO A //hit ttm pages limit
>       -> get_user_pages (fill up ttm->pages)
>        -> validate -> populate
>            -> swapin BO A // Now hit the BUG
>
> We know that get_user_pages may race with swapout on same BO.
> Threre are some issues I have met.
> 1) memory corruption.
> This is because we do a swap before memory is setup. ttm_tt_swapout()
> just create a swap_storage with its content being 0x0. So when we setup
> memory after the swapout. The following swapin makes the memory
> corrupted.
>
> 2) panic
> When swapout happes with get_user_pages, they touch ttm->pages without
> anylock. It causes memory corruption too. But I hit page fault mostly.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 928e8d57cd08..42460e4480f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>       struct amdkfd_process_info *process_info = mem->process_info;
>       struct amdgpu_bo *bo = mem->bo;
>       struct ttm_operation_ctx ctx = { true, false };
> +     struct page **pages;
>       int ret = 0;
>
>       mutex_lock(&process_info->lock);
> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               goto out;
>       }
>
> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
> +                     sizeof(struct page *),
> +                     GFP_KERNEL | __GFP_ZERO);
> +     if (!pages)
> +             goto unregister_out;
> +
> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>       if (ret) {
>               pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>               goto unregister_out;
> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>               pr_err("%s: Failed to reserve BO\n", __func__);
>               goto release_out;
>       }
> +
> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
> +
> +     memcpy(bo->tbo.ttm->pages,
> +                     pages,
> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>       amdgpu_bo_placement_from_domain(bo, mem->domain);
>       ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>       if (ret)
> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>   release_out:
>       amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>   unregister_out:
> +     kvfree(pages);
>       if (ret)
>               amdgpu_mn_unregister(bo);
>   out:
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19  5:07       ` Pan, Xinhui
@ 2021-05-19 10:01         ` Christian König
  -1 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-19 10:01 UTC (permalink / raw)
  To: Pan, Xinhui, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

I'm scratching my head how that is even possible.

See when a BO is created in the system domain it is just an empty hull, 
e.g. without backing store and allocated pages.

So the swapout function will just ignore it.

Christian.

Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> I have reverted Chris'  patch, still hit this failure.
> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>
> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> [snip]
> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>
>
> ________________________________________
> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
> 发送时间: 2021年5月19日 12:09
> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> yes, we really dont swapout SG BOs.
> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>
> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>
> ________________________________________
> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
> 发送时间: 2021年5月19日 11:29
> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
> this type of BO.
>
> Last I checked, userptr BOs (and other SG BOs) were protected from
> swapout by the fact that they would not be added to the swap-LRU. But it
> looks like Christian just removed the swap-LRU. I guess this broke that
> protection:
>
> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
> Author: Christian König <christian.koenig@amd.com>
> Date:   Tue Oct 6 16:30:09 2020 +0200
>
>       drm/ttm: remove swap LRU v3
>
>       Instead evict round robin from each devices SYSTEM and TT domain.
>
>       v2: reorder num_pages access reported by Dan's script
>       v3: fix rebase fallout, num_pages should be 32bit
>
>       Signed-off-by: Christian König <christian.koenig@amd.com>
>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>       Link: https://patchwork.freedesktop.org/patch/424009/
>
> Regards,
>     Felix
>
>
> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>> cpu 1                                           cpu 2
>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>       ->init -> validate                               -> init -> validate -> populate
>>       init_user_pages                            -> swapout BO A //hit ttm pages limit
>>        -> get_user_pages (fill up ttm->pages)
>>         -> validate -> populate
>>             -> swapin BO A // Now hit the BUG
>>
>> We know that get_user_pages may race with swapout on same BO.
>> Threre are some issues I have met.
>> 1) memory corruption.
>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>> just create a swap_storage with its content being 0x0. So when we setup
>> memory after the swapout. The following swapin makes the memory
>> corrupted.
>>
>> 2) panic
>> When swapout happes with get_user_pages, they touch ttm->pages without
>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 928e8d57cd08..42460e4480f8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>        struct amdkfd_process_info *process_info = mem->process_info;
>>        struct amdgpu_bo *bo = mem->bo;
>>        struct ttm_operation_ctx ctx = { true, false };
>> +     struct page **pages;
>>        int ret = 0;
>>
>>        mutex_lock(&process_info->lock);
>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                goto out;
>>        }
>>
>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>> +                     sizeof(struct page *),
>> +                     GFP_KERNEL | __GFP_ZERO);
>> +     if (!pages)
>> +             goto unregister_out;
>> +
>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>        if (ret) {
>>                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>                goto unregister_out;
>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>                goto release_out;
>>        }
>> +
>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>> +
>> +     memcpy(bo->tbo.ttm->pages,
>> +                     pages,
>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>        if (ret)
>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>    release_out:
>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>    unregister_out:
>> +     kvfree(pages);
>>        if (ret)
>>                amdgpu_mn_unregister(bo);
>>    out:
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19 10:01         ` Christian König
  0 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-19 10:01 UTC (permalink / raw)
  To: Pan, Xinhui, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

I'm scratching my head how that is even possible.

See when a BO is created in the system domain it is just an empty hull, 
e.g. without backing store and allocated pages.

So the swapout function will just ignore it.

Christian.

Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> I have reverted Chris'  patch, still hit this failure.
> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>
> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> [snip]
> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>
>
> ________________________________________
> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
> 发送时间: 2021年5月19日 12:09
> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> yes, we really dont swapout SG BOs.
> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>
> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>
> ________________________________________
> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
> 发送时间: 2021年5月19日 11:29
> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
> this type of BO.
>
> Last I checked, userptr BOs (and other SG BOs) were protected from
> swapout by the fact that they would not be added to the swap-LRU. But it
> looks like Christian just removed the swap-LRU. I guess this broke that
> protection:
>
> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
> Author: Christian König <christian.koenig@amd.com>
> Date:   Tue Oct 6 16:30:09 2020 +0200
>
>       drm/ttm: remove swap LRU v3
>
>       Instead evict round robin from each devices SYSTEM and TT domain.
>
>       v2: reorder num_pages access reported by Dan's script
>       v3: fix rebase fallout, num_pages should be 32bit
>
>       Signed-off-by: Christian König <christian.koenig@amd.com>
>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>       Link: https://patchwork.freedesktop.org/patch/424009/
>
> Regards,
>     Felix
>
>
> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>> cpu 1                                           cpu 2
>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>       ->init -> validate                               -> init -> validate -> populate
>>       init_user_pages                            -> swapout BO A //hit ttm pages limit
>>        -> get_user_pages (fill up ttm->pages)
>>         -> validate -> populate
>>             -> swapin BO A // Now hit the BUG
>>
>> We know that get_user_pages may race with swapout on same BO.
>> Threre are some issues I have met.
>> 1) memory corruption.
>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>> just create a swap_storage with its content being 0x0. So when we setup
>> memory after the swapout. The following swapin makes the memory
>> corrupted.
>>
>> 2) panic
>> When swapout happes with get_user_pages, they touch ttm->pages without
>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 928e8d57cd08..42460e4480f8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>        struct amdkfd_process_info *process_info = mem->process_info;
>>        struct amdgpu_bo *bo = mem->bo;
>>        struct ttm_operation_ctx ctx = { true, false };
>> +     struct page **pages;
>>        int ret = 0;
>>
>>        mutex_lock(&process_info->lock);
>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                goto out;
>>        }
>>
>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>> +                     sizeof(struct page *),
>> +                     GFP_KERNEL | __GFP_ZERO);
>> +     if (!pages)
>> +             goto unregister_out;
>> +
>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>        if (ret) {
>>                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>                goto unregister_out;
>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>                goto release_out;
>>        }
>> +
>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>> +
>> +     memcpy(bo->tbo.ttm->pages,
>> +                     pages,
>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>        if (ret)
>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>    release_out:
>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>    unregister_out:
>> +     kvfree(pages);
>>        if (ret)
>>                amdgpu_mn_unregister(bo);
>>    out:
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19 10:01         ` Christian König
@ 2021-05-19 15:11           ` Felix Kuehling
  -1 siblings, 0 replies; 30+ messages in thread
From: Felix Kuehling @ 2021-05-19 15:11 UTC (permalink / raw)
  To: Christian König, Pan, Xinhui, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

Looks like we're creating the userptr BO as ttm_bo_type_device. I guess
we should be using ttm_bo_type_sg? BTW, amdgpu_gem_userptr_ioctl also
uses ttm_bo_type_device.

Regards,
  Felix


Am 2021-05-19 um 6:01 a.m. schrieb Christian König:
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty
> hull, e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be
>> swapout first. That is why we hit this issue frequently now. But the
>> bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this
>> BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too
>> late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I
>> can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>       drm/ttm: remove swap LRU v3
>>
>>       Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>       v2: reorder num_pages access reported by Dan's script
>>       v3: fix rebase fallout, num_pages should be 32bit
>>
>>       Signed-off-by: Christian König <christian.koenig@amd.com>
>>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>       Link: https://patchwork.freedesktop.org/patch/424009/
>>
>> Regards,
>>     Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>       ->init -> validate                               -> init ->
>>> validate -> populate
>>>       init_user_pages                            -> swapout BO A
>>> //hit ttm pages limit
>>>        -> get_user_pages (fill up ttm->pages)
>>>         -> validate -> populate
>>>             -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
>>> +++++++++++++++-
>>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>        struct amdkfd_process_info *process_info = mem->process_info;
>>>        struct amdgpu_bo *bo = mem->bo;
>>>        struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>        int ret = 0;
>>>
>>>        mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                goto out;
>>>        }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>        if (ret) {
>>>                pr_err("%s: Failed to get user pages: %d\n",
>>> __func__, ret);
>>>                goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>>                goto release_out;
>>>        }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>        if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>    release_out:
>>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>    unregister_out:
>>> +     kvfree(pages);
>>>        if (ret)
>>>                amdgpu_mn_unregister(bo);
>>>    out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-19 15:11           ` Felix Kuehling
  0 siblings, 0 replies; 30+ messages in thread
From: Felix Kuehling @ 2021-05-19 15:11 UTC (permalink / raw)
  To: Christian König, Pan, Xinhui, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

Looks like we're creating the userptr BO as ttm_bo_type_device. I guess
we should be using ttm_bo_type_sg? BTW, amdgpu_gem_userptr_ioctl also
uses ttm_bo_type_device.

Regards,
  Felix


Am 2021-05-19 um 6:01 a.m. schrieb Christian König:
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty
> hull, e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be
>> swapout first. That is why we hit this issue frequently now. But the
>> bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this
>> BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too
>> late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I
>> can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>       drm/ttm: remove swap LRU v3
>>
>>       Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>       v2: reorder num_pages access reported by Dan's script
>>       v3: fix rebase fallout, num_pages should be 32bit
>>
>>       Signed-off-by: Christian König <christian.koenig@amd.com>
>>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>       Link: https://patchwork.freedesktop.org/patch/424009/
>>
>> Regards,
>>     Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>       ->init -> validate                               -> init ->
>>> validate -> populate
>>>       init_user_pages                            -> swapout BO A
>>> //hit ttm pages limit
>>>        -> get_user_pages (fill up ttm->pages)
>>>         -> validate -> populate
>>>             -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
>>> +++++++++++++++-
>>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>        struct amdkfd_process_info *process_info = mem->process_info;
>>>        struct amdgpu_bo *bo = mem->bo;
>>>        struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>        int ret = 0;
>>>
>>>        mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                goto out;
>>>        }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>        if (ret) {
>>>                pr_err("%s: Failed to get user pages: %d\n",
>>> __func__, ret);
>>>                goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>>                goto release_out;
>>>        }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>        if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>    release_out:
>>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>    unregister_out:
>>> +     kvfree(pages);
>>>        if (ret)
>>>                amdgpu_mn_unregister(bo);
>>>    out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19 10:01         ` Christian König
@ 2021-05-20  2:55           ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  2:55 UTC (permalink / raw)
  To: Christian König, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

[AMD Official Use Only]

swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.

swapin function is called during populate as TTM_PAGE_FLAG_SWAPPED is set.
Now here is the problem, we swapin data to ttm bakend memory from swap storage. That just causes the memory been overwritten.

________________________________________
发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
发送时间: 2021年5月19日 18:01
收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

I'm scratching my head how that is even possible.

See when a BO is created in the system domain it is just an empty hull,
e.g. without backing store and allocated pages.

So the swapout function will just ignore it.

Christian.

Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> I have reverted Chris'  patch, still hit this failure.
> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>
> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> [snip]
> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>
>
> ________________________________________
> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
> 发送时间: 2021年5月19日 12:09
> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> yes, we really dont swapout SG BOs.
> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>
> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>
> ________________________________________
> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
> 发送时间: 2021年5月19日 11:29
> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
> this type of BO.
>
> Last I checked, userptr BOs (and other SG BOs) were protected from
> swapout by the fact that they would not be added to the swap-LRU. But it
> looks like Christian just removed the swap-LRU. I guess this broke that
> protection:
>
> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
> Author: Christian König <christian.koenig@amd.com>
> Date:   Tue Oct 6 16:30:09 2020 +0200
>
>       drm/ttm: remove swap LRU v3
>
>       Instead evict round robin from each devices SYSTEM and TT domain.
>
>       v2: reorder num_pages access reported by Dan's script
>       v3: fix rebase fallout, num_pages should be 32bit
>
>       Signed-off-by: Christian König <christian.koenig@amd.com>
>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>       Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2F424009%2F&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=K3%2FnFpN56y8L49UuYRM6SqefVFLnqIwpDAtWpS1XvnQ%3D&amp;reserved=0
>
> Regards,
>     Felix
>
>
> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>> cpu 1                                           cpu 2
>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>       ->init -> validate                               -> init -> validate -> populate
>>       init_user_pages                            -> swapout BO A //hit ttm pages limit
>>        -> get_user_pages (fill up ttm->pages)
>>         -> validate -> populate
>>             -> swapin BO A // Now hit the BUG
>>
>> We know that get_user_pages may race with swapout on same BO.
>> Threre are some issues I have met.
>> 1) memory corruption.
>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>> just create a swap_storage with its content being 0x0. So when we setup
>> memory after the swapout. The following swapin makes the memory
>> corrupted.
>>
>> 2) panic
>> When swapout happes with get_user_pages, they touch ttm->pages without
>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 928e8d57cd08..42460e4480f8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>        struct amdkfd_process_info *process_info = mem->process_info;
>>        struct amdgpu_bo *bo = mem->bo;
>>        struct ttm_operation_ctx ctx = { true, false };
>> +     struct page **pages;
>>        int ret = 0;
>>
>>        mutex_lock(&process_info->lock);
>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                goto out;
>>        }
>>
>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>> +                     sizeof(struct page *),
>> +                     GFP_KERNEL | __GFP_ZERO);
>> +     if (!pages)
>> +             goto unregister_out;
>> +
>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>        if (ret) {
>>                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>                goto unregister_out;
>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>                goto release_out;
>>        }
>> +
>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>> +
>> +     memcpy(bo->tbo.ttm->pages,
>> +                     pages,
>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>        if (ret)
>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>    release_out:
>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>    unregister_out:
>> +     kvfree(pages);
>>        if (ret)
>>                amdgpu_mn_unregister(bo);
>>    out:
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=QOuCH8r6DMdp%2Bvr6LAPhDw0BUCKPzK%2Bbv6MoHx4AmMo%3D&amp;reserved=0


^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-20  2:55           ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  2:55 UTC (permalink / raw)
  To: Christian König, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

[AMD Official Use Only]

swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.

swapin function is called during populate as TTM_PAGE_FLAG_SWAPPED is set.
Now here is the problem, we swapin data to ttm bakend memory from swap storage. That just causes the memory been overwritten.

________________________________________
发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
发送时间: 2021年5月19日 18:01
收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

I'm scratching my head how that is even possible.

See when a BO is created in the system domain it is just an empty hull,
e.g. without backing store and allocated pages.

So the swapout function will just ignore it.

Christian.

Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> I have reverted Chris'  patch, still hit this failure.
> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>
> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> [snip]
> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>
>
> ________________________________________
> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
> 发送时间: 2021年5月19日 12:09
> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> yes, we really dont swapout SG BOs.
> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>
> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>
> ________________________________________
> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
> 发送时间: 2021年5月19日 11:29
> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
> this type of BO.
>
> Last I checked, userptr BOs (and other SG BOs) were protected from
> swapout by the fact that they would not be added to the swap-LRU. But it
> looks like Christian just removed the swap-LRU. I guess this broke that
> protection:
>
> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
> Author: Christian König <christian.koenig@amd.com>
> Date:   Tue Oct 6 16:30:09 2020 +0200
>
>       drm/ttm: remove swap LRU v3
>
>       Instead evict round robin from each devices SYSTEM and TT domain.
>
>       v2: reorder num_pages access reported by Dan's script
>       v3: fix rebase fallout, num_pages should be 32bit
>
>       Signed-off-by: Christian König <christian.koenig@amd.com>
>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>       Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2F424009%2F&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=K3%2FnFpN56y8L49UuYRM6SqefVFLnqIwpDAtWpS1XvnQ%3D&amp;reserved=0
>
> Regards,
>     Felix
>
>
> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>> cpu 1                                           cpu 2
>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>       ->init -> validate                               -> init -> validate -> populate
>>       init_user_pages                            -> swapout BO A //hit ttm pages limit
>>        -> get_user_pages (fill up ttm->pages)
>>         -> validate -> populate
>>             -> swapin BO A // Now hit the BUG
>>
>> We know that get_user_pages may race with swapout on same BO.
>> Threre are some issues I have met.
>> 1) memory corruption.
>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>> just create a swap_storage with its content being 0x0. So when we setup
>> memory after the swapout. The following swapin makes the memory
>> corrupted.
>>
>> 2) panic
>> When swapout happes with get_user_pages, they touch ttm->pages without
>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 928e8d57cd08..42460e4480f8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>        struct amdkfd_process_info *process_info = mem->process_info;
>>        struct amdgpu_bo *bo = mem->bo;
>>        struct ttm_operation_ctx ctx = { true, false };
>> +     struct page **pages;
>>        int ret = 0;
>>
>>        mutex_lock(&process_info->lock);
>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                goto out;
>>        }
>>
>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>> +                     sizeof(struct page *),
>> +                     GFP_KERNEL | __GFP_ZERO);
>> +     if (!pages)
>> +             goto unregister_out;
>> +
>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>        if (ret) {
>>                pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>                goto unregister_out;
>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>                goto release_out;
>>        }
>> +
>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>> +
>> +     memcpy(bo->tbo.ttm->pages,
>> +                     pages,
>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>        if (ret)
>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>    release_out:
>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>    unregister_out:
>> +     kvfree(pages);
>>        if (ret)
>>                amdgpu_mn_unregister(bo);
>>    out:
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=QOuCH8r6DMdp%2Bvr6LAPhDw0BUCKPzK%2Bbv6MoHx4AmMo%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-19 15:11           ` Felix Kuehling
@ 2021-05-20  2:58             ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  2:58 UTC (permalink / raw)
  To: Kuehling, Felix, Christian König, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

[AMD Official Use Only]

I am not sure if we can create a ttm_bo_type_sg bo for userptr. But I have another idea now. we can use flag AMDGPU_AMDKFD_CREATE_USERPTR_BO to create the userptr bo.
________________________________________
发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
发送时间: 2021年5月19日 23:11
收件人: Christian König; Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Looks like we're creating the userptr BO as ttm_bo_type_device. I guess
we should be using ttm_bo_type_sg? BTW, amdgpu_gem_userptr_ioctl also
uses ttm_bo_type_device.

Regards,
  Felix


Am 2021-05-19 um 6:01 a.m. schrieb Christian König:
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty
> hull, e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be
>> swapout first. That is why we hit this issue frequently now. But the
>> bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this
>> BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too
>> late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I
>> can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>       drm/ttm: remove swap LRU v3
>>
>>       Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>       v2: reorder num_pages access reported by Dan's script
>>       v3: fix rebase fallout, num_pages should be 32bit
>>
>>       Signed-off-by: Christian König <christian.koenig@amd.com>
>>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>       Link: https://patchwork.freedesktop.org/patch/424009/
>>
>> Regards,
>>     Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>       ->init -> validate                               -> init ->
>>> validate -> populate
>>>       init_user_pages                            -> swapout BO A
>>> //hit ttm pages limit
>>>        -> get_user_pages (fill up ttm->pages)
>>>         -> validate -> populate
>>>             -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
>>> +++++++++++++++-
>>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>        struct amdkfd_process_info *process_info = mem->process_info;
>>>        struct amdgpu_bo *bo = mem->bo;
>>>        struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>        int ret = 0;
>>>
>>>        mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                goto out;
>>>        }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>        if (ret) {
>>>                pr_err("%s: Failed to get user pages: %d\n",
>>> __func__, ret);
>>>                goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>>                goto release_out;
>>>        }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>        if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>    release_out:
>>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>    unregister_out:
>>> +     kvfree(pages);
>>>        if (ret)
>>>                amdgpu_mn_unregister(bo);
>>>    out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-20  2:58             ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  2:58 UTC (permalink / raw)
  To: Kuehling, Felix, Christian König, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

[AMD Official Use Only]

I am not sure if we can create a ttm_bo_type_sg bo for userptr. But I have another idea now. we can use flag AMDGPU_AMDKFD_CREATE_USERPTR_BO to create the userptr bo.
________________________________________
发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
发送时间: 2021年5月19日 23:11
收件人: Christian König; Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

Looks like we're creating the userptr BO as ttm_bo_type_device. I guess
we should be using ttm_bo_type_sg? BTW, amdgpu_gem_userptr_ioctl also
uses ttm_bo_type_device.

Regards,
  Felix


Am 2021-05-19 um 6:01 a.m. schrieb Christian König:
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty
> hull, e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be
>> swapout first. That is why we hit this issue frequently now. But the
>> bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this
>> BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too
>> late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I
>> can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian;
>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>> swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>       drm/ttm: remove swap LRU v3
>>
>>       Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>       v2: reorder num_pages access reported by Dan's script
>>       v3: fix rebase fallout, num_pages should be 32bit
>>
>>       Signed-off-by: Christian König <christian.koenig@amd.com>
>>       Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>       Reviewed-by: Huang Rui <ray.huang@amd.com>
>>       Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>       Link: https://patchwork.freedesktop.org/patch/424009/
>>
>> Regards,
>>     Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>       ->init -> validate                               -> init ->
>>> validate -> populate
>>>       init_user_pages                            -> swapout BO A
>>> //hit ttm pages limit
>>>        -> get_user_pages (fill up ttm->pages)
>>>         -> validate -> populate
>>>             -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
>>> +++++++++++++++-
>>>    1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>        struct amdkfd_process_info *process_info = mem->process_info;
>>>        struct amdgpu_bo *bo = mem->bo;
>>>        struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>        int ret = 0;
>>>
>>>        mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                goto out;
>>>        }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>        if (ret) {
>>>                pr_err("%s: Failed to get user pages: %d\n",
>>> __func__, ret);
>>>                goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>                pr_err("%s: Failed to reserve BO\n", __func__);
>>>                goto release_out;
>>>        }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>        amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>        ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>        if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>> uint64_t user_addr)
>>>    release_out:
>>>        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>    unregister_out:
>>> +     kvfree(pages);
>>>        if (ret)
>>>                amdgpu_mn_unregister(bo);
>>>    out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-20  2:58             ` Pan, Xinhui
@ 2021-05-20  6:38               ` Christian König
  -1 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-20  6:38 UTC (permalink / raw)
  To: Pan, Xinhui, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

The problem is that ttm_bo_type_sg doesn't allocate a page array for the 
TT object.

Christian.

Am 20.05.21 um 04:58 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> I am not sure if we can create a ttm_bo_type_sg bo for userptr. But I have another idea now. we can use flag AMDGPU_AMDKFD_CREATE_USERPTR_BO to create the userptr bo.
> ________________________________________
> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
> 发送时间: 2021年5月19日 23:11
> 收件人: Christian König; Pan, Xinhui; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
> 主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> Looks like we're creating the userptr BO as ttm_bo_type_device. I guess
> we should be using ttm_bo_type_sg? BTW, amdgpu_gem_userptr_ioctl also
> uses ttm_bo_type_device.
>
> Regards,
>    Felix
>
>
> Am 2021-05-19 um 6:01 a.m. schrieb Christian König:
>> I'm scratching my head how that is even possible.
>>
>> See when a BO is created in the system domain it is just an empty
>> hull, e.g. without backing store and allocated pages.
>>
>> So the swapout function will just ignore it.
>>
>> Christian.
>>
>> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>>> [AMD Official Use Only]
>>>
>>> I have reverted Chris'  patch, still hit this failure.
>>> Just see two lines in Chris' patch. Any BO in cpu domian would be
>>> swapout first. That is why we hit this issue frequently now. But the
>>> bug is there long time ago.
>>>
>>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> [snip]
>>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>>
>>>
>>> ________________________________________
>>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>>> 发送时间: 2021年5月19日 12:09
>>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>>> 抄送: Deucher, Alexander; Koenig, Christian;
>>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>>> swapout and swapin
>>>
>>> yes, we really dont swapout SG BOs.
>>> The problems is that before we validate a userptr BO, we create this
>>> BO in CPU domain by default. So this BO has chance to swapout.
>>>
>>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too
>>> late.
>>> I have not try to revert Chris' patch as I think it desnt help. Or I
>>> can have a try later.
>>>
>>> ________________________________________
>>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>>> 发送时间: 2021年5月19日 11:29
>>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>>> 抄送: Deucher, Alexander; Koenig, Christian;
>>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>>> swapout and swapin
>>>
>>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>>> this type of BO.
>>>
>>> Last I checked, userptr BOs (and other SG BOs) were protected from
>>> swapout by the fact that they would not be added to the swap-LRU. But it
>>> looks like Christian just removed the swap-LRU. I guess this broke that
>>> protection:
>>>
>>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>>> Author: Christian König <christian.koenig@amd.com>
>>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>>
>>>        drm/ttm: remove swap LRU v3
>>>
>>>        Instead evict round robin from each devices SYSTEM and TT domain.
>>>
>>>        v2: reorder num_pages access reported by Dan's script
>>>        v3: fix rebase fallout, num_pages should be 32bit
>>>
>>>        Signed-off-by: Christian König <christian.koenig@amd.com>
>>>        Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>>        Reviewed-by: Huang Rui <ray.huang@amd.com>
>>>        Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>>        Link: https://patchwork.freedesktop.org/patch/424009/
>>>
>>> Regards,
>>>      Felix
>>>
>>>
>>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>>> cpu 1                                           cpu 2
>>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>>        ->init -> validate                               -> init ->
>>>> validate -> populate
>>>>        init_user_pages                            -> swapout BO A
>>>> //hit ttm pages limit
>>>>         -> get_user_pages (fill up ttm->pages)
>>>>          -> validate -> populate
>>>>              -> swapin BO A // Now hit the BUG
>>>>
>>>> We know that get_user_pages may race with swapout on same BO.
>>>> Threre are some issues I have met.
>>>> 1) memory corruption.
>>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>>> just create a swap_storage with its content being 0x0. So when we setup
>>>> memory after the swapout. The following swapin makes the memory
>>>> corrupted.
>>>>
>>>> 2) panic
>>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>>
>>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
>>>> +++++++++++++++-
>>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> index 928e8d57cd08..42460e4480f8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>         struct amdkfd_process_info *process_info = mem->process_info;
>>>>         struct amdgpu_bo *bo = mem->bo;
>>>>         struct ttm_operation_ctx ctx = { true, false };
>>>> +     struct page **pages;
>>>>         int ret = 0;
>>>>
>>>>         mutex_lock(&process_info->lock);
>>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>                 goto out;
>>>>         }
>>>>
>>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>>> +                     sizeof(struct page *),
>>>> +                     GFP_KERNEL | __GFP_ZERO);
>>>> +     if (!pages)
>>>> +             goto unregister_out;
>>>> +
>>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>>         if (ret) {
>>>>                 pr_err("%s: Failed to get user pages: %d\n",
>>>> __func__, ret);
>>>>                 goto unregister_out;
>>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>                 pr_err("%s: Failed to reserve BO\n", __func__);
>>>>                 goto release_out;
>>>>         }
>>>> +
>>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>>> +
>>>> +     memcpy(bo->tbo.ttm->pages,
>>>> +                     pages,
>>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>>         amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>>         ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>>         if (ret)
>>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>     release_out:
>>>>         amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>>     unregister_out:
>>>> +     kvfree(pages);
>>>>         if (ret)
>>>>                 amdgpu_mn_unregister(bo);
>>>>     out:
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-20  6:38               ` Christian König
  0 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-20  6:38 UTC (permalink / raw)
  To: Pan, Xinhui, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

The problem is that ttm_bo_type_sg doesn't allocate a page array for the 
TT object.

Christian.

Am 20.05.21 um 04:58 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> I am not sure if we can create a ttm_bo_type_sg bo for userptr. But I have another idea now. we can use flag AMDGPU_AMDKFD_CREATE_USERPTR_BO to create the userptr bo.
> ________________________________________
> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
> 发送时间: 2021年5月19日 23:11
> 收件人: Christian König; Pan, Xinhui; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
> 主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> Looks like we're creating the userptr BO as ttm_bo_type_device. I guess
> we should be using ttm_bo_type_sg? BTW, amdgpu_gem_userptr_ioctl also
> uses ttm_bo_type_device.
>
> Regards,
>    Felix
>
>
> Am 2021-05-19 um 6:01 a.m. schrieb Christian König:
>> I'm scratching my head how that is even possible.
>>
>> See when a BO is created in the system domain it is just an empty
>> hull, e.g. without backing store and allocated pages.
>>
>> So the swapout function will just ignore it.
>>
>> Christian.
>>
>> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>>> [AMD Official Use Only]
>>>
>>> I have reverted Chris'  patch, still hit this failure.
>>> Just see two lines in Chris' patch. Any BO in cpu domian would be
>>> swapout first. That is why we hit this issue frequently now. But the
>>> bug is there long time ago.
>>>
>>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> [snip]
>>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>>
>>>
>>> ________________________________________
>>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>>> 发送时间: 2021年5月19日 12:09
>>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>>> 抄送: Deucher, Alexander; Koenig, Christian;
>>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>>> swapout and swapin
>>>
>>> yes, we really dont swapout SG BOs.
>>> The problems is that before we validate a userptr BO, we create this
>>> BO in CPU domain by default. So this BO has chance to swapout.
>>>
>>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too
>>> late.
>>> I have not try to revert Chris' patch as I think it desnt help. Or I
>>> can have a try later.
>>>
>>> ________________________________________
>>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>>> 发送时间: 2021年5月19日 11:29
>>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>>> 抄送: Deucher, Alexander; Koenig, Christian;
>>> dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to
>>> swapout and swapin
>>>
>>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>>> this type of BO.
>>>
>>> Last I checked, userptr BOs (and other SG BOs) were protected from
>>> swapout by the fact that they would not be added to the swap-LRU. But it
>>> looks like Christian just removed the swap-LRU. I guess this broke that
>>> protection:
>>>
>>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>>> Author: Christian König <christian.koenig@amd.com>
>>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>>
>>>        drm/ttm: remove swap LRU v3
>>>
>>>        Instead evict round robin from each devices SYSTEM and TT domain.
>>>
>>>        v2: reorder num_pages access reported by Dan's script
>>>        v3: fix rebase fallout, num_pages should be 32bit
>>>
>>>        Signed-off-by: Christian König <christian.koenig@amd.com>
>>>        Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>>        Reviewed-by: Huang Rui <ray.huang@amd.com>
>>>        Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>>        Link: https://patchwork.freedesktop.org/patch/424009/
>>>
>>> Regards,
>>>      Felix
>>>
>>>
>>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>>> cpu 1                                           cpu 2
>>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>>        ->init -> validate                               -> init ->
>>>> validate -> populate
>>>>        init_user_pages                            -> swapout BO A
>>>> //hit ttm pages limit
>>>>         -> get_user_pages (fill up ttm->pages)
>>>>          -> validate -> populate
>>>>              -> swapin BO A // Now hit the BUG
>>>>
>>>> We know that get_user_pages may race with swapout on same BO.
>>>> Threre are some issues I have met.
>>>> 1) memory corruption.
>>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>>> just create a swap_storage with its content being 0x0. So when we setup
>>>> memory after the swapout. The following swapin makes the memory
>>>> corrupted.
>>>>
>>>> 2) panic
>>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>>
>>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16
>>>> +++++++++++++++-
>>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> index 928e8d57cd08..42460e4480f8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>         struct amdkfd_process_info *process_info = mem->process_info;
>>>>         struct amdgpu_bo *bo = mem->bo;
>>>>         struct ttm_operation_ctx ctx = { true, false };
>>>> +     struct page **pages;
>>>>         int ret = 0;
>>>>
>>>>         mutex_lock(&process_info->lock);
>>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>                 goto out;
>>>>         }
>>>>
>>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>>> +                     sizeof(struct page *),
>>>> +                     GFP_KERNEL | __GFP_ZERO);
>>>> +     if (!pages)
>>>> +             goto unregister_out;
>>>> +
>>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>>         if (ret) {
>>>>                 pr_err("%s: Failed to get user pages: %d\n",
>>>> __func__, ret);
>>>>                 goto unregister_out;
>>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>                 pr_err("%s: Failed to reserve BO\n", __func__);
>>>>                 goto release_out;
>>>>         }
>>>> +
>>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>>> +
>>>> +     memcpy(bo->tbo.ttm->pages,
>>>> +                     pages,
>>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>>         amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>>         ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>>         if (ret)
>>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem,
>>>> uint64_t user_addr)
>>>>     release_out:
>>>>         amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>>     unregister_out:
>>>> +     kvfree(pages);
>>>>         if (ret)
>>>>                 amdgpu_mn_unregister(bo);
>>>>     out:
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-20  2:55           ` Pan, Xinhui
@ 2021-05-20  6:39             ` Christian König
  -1 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-20  6:39 UTC (permalink / raw)
  To: Pan, Xinhui, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.

That's the fundamental problem. A TT object which isn't populated 
shouldn't be considered for swapout nor eviction in the first place.

I'm going to take a look later today.

Christian.

Am 20.05.21 um 04:55 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.
>
> swapin function is called during populate as TTM_PAGE_FLAG_SWAPPED is set.
> Now here is the problem, we swapin data to ttm bakend memory from swap storage. That just causes the memory been overwritten.
>
> ________________________________________
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2021年5月19日 18:01
> 收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
> 主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty hull,
> e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>        drm/ttm: remove swap LRU v3
>>
>>        Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>        v2: reorder num_pages access reported by Dan's script
>>        v3: fix rebase fallout, num_pages should be 32bit
>>
>>        Signed-off-by: Christian König <christian.koenig@amd.com>
>>        Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>        Reviewed-by: Huang Rui <ray.huang@amd.com>
>>        Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>        Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2F424009%2F&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=K3%2FnFpN56y8L49UuYRM6SqefVFLnqIwpDAtWpS1XvnQ%3D&amp;reserved=0
>>
>> Regards,
>>      Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>        ->init -> validate                               -> init -> validate -> populate
>>>        init_user_pages                            -> swapout BO A //hit ttm pages limit
>>>         -> get_user_pages (fill up ttm->pages)
>>>          -> validate -> populate
>>>              -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>         struct amdkfd_process_info *process_info = mem->process_info;
>>>         struct amdgpu_bo *bo = mem->bo;
>>>         struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>         int ret = 0;
>>>
>>>         mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 goto out;
>>>         }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>         if (ret) {
>>>                 pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>>                 goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 pr_err("%s: Failed to reserve BO\n", __func__);
>>>                 goto release_out;
>>>         }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>         amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>         ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>         if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>     release_out:
>>>         amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>     unregister_out:
>>> +     kvfree(pages);
>>>         if (ret)
>>>                 amdgpu_mn_unregister(bo);
>>>     out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=QOuCH8r6DMdp%2Bvr6LAPhDw0BUCKPzK%2Bbv6MoHx4AmMo%3D&amp;reserved=0


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-20  6:39             ` Christian König
  0 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-20  6:39 UTC (permalink / raw)
  To: Pan, Xinhui, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.

That's the fundamental problem. A TT object which isn't populated 
shouldn't be considered for swapout nor eviction in the first place.

I'm going to take a look later today.

Christian.

Am 20.05.21 um 04:55 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.
>
> swapin function is called during populate as TTM_PAGE_FLAG_SWAPPED is set.
> Now here is the problem, we swapin data to ttm bakend memory from swap storage. That just causes the memory been overwritten.
>
> ________________________________________
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2021年5月19日 18:01
> 收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
> 主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty hull,
> e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>        drm/ttm: remove swap LRU v3
>>
>>        Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>        v2: reorder num_pages access reported by Dan's script
>>        v3: fix rebase fallout, num_pages should be 32bit
>>
>>        Signed-off-by: Christian König <christian.koenig@amd.com>
>>        Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>        Reviewed-by: Huang Rui <ray.huang@amd.com>
>>        Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>        Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2F424009%2F&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=K3%2FnFpN56y8L49UuYRM6SqefVFLnqIwpDAtWpS1XvnQ%3D&amp;reserved=0
>>
>> Regards,
>>      Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>        ->init -> validate                               -> init -> validate -> populate
>>>        init_user_pages                            -> swapout BO A //hit ttm pages limit
>>>         -> get_user_pages (fill up ttm->pages)
>>>          -> validate -> populate
>>>              -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>         struct amdkfd_process_info *process_info = mem->process_info;
>>>         struct amdgpu_bo *bo = mem->bo;
>>>         struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>         int ret = 0;
>>>
>>>         mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 goto out;
>>>         }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>         if (ret) {
>>>                 pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>>                 goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 pr_err("%s: Failed to reserve BO\n", __func__);
>>>                 goto release_out;
>>>         }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>         amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>         ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>         if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>     release_out:
>>>         amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>     unregister_out:
>>> +     kvfree(pages);
>>>         if (ret)
>>>                 amdgpu_mn_unregister(bo);
>>>     out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7Cb4422d8b3e4947cd243c08d91aad14c3%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570152942496679%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=QOuCH8r6DMdp%2Bvr6LAPhDw0BUCKPzK%2Bbv6MoHx4AmMo%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-20  6:39             ` Christian König
@ 2021-05-20  7:30               ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  7:30 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx, Christian König
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

[-- Attachment #1: Type: text/plain, Size: 155 bytes --]

I just sent out patch below yesterday.  swapping unpopulated bo is useless indeed.

[RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.

[-- Attachment #2: Type: text/html, Size: 697 bytes --]

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-20  7:30               ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  7:30 UTC (permalink / raw)
  To: Kuehling, Felix, amd-gfx, Christian König
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel


[-- Attachment #1.1: Type: text/plain, Size: 155 bytes --]

I just sent out patch below yesterday.  swapping unpopulated bo is useless indeed.

[RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.

[-- Attachment #1.2: Type: text/html, Size: 697 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
  2021-05-20  6:39             ` Christian König
@ 2021-05-20  7:32               ` Pan, Xinhui
  -1 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  7:32 UTC (permalink / raw)
  To: Christian König, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian

[AMD Official Use Only]

I just sent out patch below yesterday.  swapping unpopulated bo is useless indeed.

[RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.

________________________________________
发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
发送时间: 2021年5月20日 14:39
收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
主题: Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.

That's the fundamental problem. A TT object which isn't populated
shouldn't be considered for swapout nor eviction in the first place.

I'm going to take a look later today.

Christian.

Am 20.05.21 um 04:55 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.
>
> swapin function is called during populate as TTM_PAGE_FLAG_SWAPPED is set.
> Now here is the problem, we swapin data to ttm bakend memory from swap storage. That just causes the memory been overwritten.
>
> ________________________________________
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2021年5月19日 18:01
> 收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
> 主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty hull,
> e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>        drm/ttm: remove swap LRU v3
>>
>>        Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>        v2: reorder num_pages access reported by Dan's script
>>        v3: fix rebase fallout, num_pages should be 32bit
>>
>>        Signed-off-by: Christian König <christian.koenig@amd.com>
>>        Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>        Reviewed-by: Huang Rui <ray.huang@amd.com>
>>        Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>        Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2F424009%2F&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7C2f5f749422eb44efa95408d91b5a029c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570895679516712%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=I9K%2FsTdCESLlS9P5rLOGUqEGCcxi0Jc7aaEOltyBP4M%3D&amp;reserved=0
>>
>> Regards,
>>      Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>        ->init -> validate                               -> init -> validate -> populate
>>>        init_user_pages                            -> swapout BO A //hit ttm pages limit
>>>         -> get_user_pages (fill up ttm->pages)
>>>          -> validate -> populate
>>>              -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>         struct amdkfd_process_info *process_info = mem->process_info;
>>>         struct amdgpu_bo *bo = mem->bo;
>>>         struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>         int ret = 0;
>>>
>>>         mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 goto out;
>>>         }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>         if (ret) {
>>>                 pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>>                 goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 pr_err("%s: Failed to reserve BO\n", __func__);
>>>                 goto release_out;
>>>         }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>         amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>         ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>         if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>     release_out:
>>>         amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>     unregister_out:
>>> +     kvfree(pages);
>>>         if (ret)
>>>                 amdgpu_mn_unregister(bo);
>>>     out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7C2f5f749422eb44efa95408d91b5a029c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570895679516712%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=aZE0mxygik02mv9JUstV2BDKJl6zb1BNLggHq5WLhKY%3D&amp;reserved=0


^ permalink raw reply	[flat|nested] 30+ messages in thread

* 回复: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
@ 2021-05-20  7:32               ` Pan, Xinhui
  0 siblings, 0 replies; 30+ messages in thread
From: Pan, Xinhui @ 2021-05-20  7:32 UTC (permalink / raw)
  To: Christian König, Kuehling, Felix, amd-gfx
  Cc: Deucher, Alexander, dri-devel, Koenig, Christian, daniel

[AMD Official Use Only]

I just sent out patch below yesterday.  swapping unpopulated bo is useless indeed.

[RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.

________________________________________
发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
发送时间: 2021年5月20日 14:39
收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
主题: Re: 回复: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin

> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.

That's the fundamental problem. A TT object which isn't populated
shouldn't be considered for swapout nor eviction in the first place.

I'm going to take a look later today.

Christian.

Am 20.05.21 um 04:55 schrieb Pan, Xinhui:
> [AMD Official Use Only]
>
> swapout function create one swap storage which is filled with zero. And set ttm->page_flags as TTM_PAGE_FLAG_SWAPPED.  Just because ttm has no backend page this time, no real data is swapout to this swap storage.
>
> swapin function is called during populate as TTM_PAGE_FLAG_SWAPPED is set.
> Now here is the problem, we swapin data to ttm bakend memory from swap storage. That just causes the memory been overwritten.
>
> ________________________________________
> 发件人: Christian König <ckoenig.leichtzumerken@gmail.com>
> 发送时间: 2021年5月19日 18:01
> 收件人: Pan, Xinhui; Kuehling, Felix; amd-gfx@lists.freedesktop.org
> 抄送: Deucher, Alexander; daniel@ffwll.ch; Koenig, Christian; dri-devel@lists.freedesktop.org
> 主题: Re: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>
> I'm scratching my head how that is even possible.
>
> See when a BO is created in the system domain it is just an empty hull,
> e.g. without backing store and allocated pages.
>
> So the swapout function will just ignore it.
>
> Christian.
>
> Am 19.05.21 um 07:07 schrieb Pan, Xinhui:
>> [AMD Official Use Only]
>>
>> I have reverted Chris'  patch, still hit this failure.
>> Just see two lines in Chris' patch. Any BO in cpu domian would be swapout first. That is why we hit this issue frequently now. But the bug is there long time ago.
>>
>> -       for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>> -               list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> [snip]
>> +       for (i = TTM_PL_SYSTEM; i < TTM_NUM_MEM_TYPES; ++i) {
>> +               for (j = 0; j < TTM_MAX_BO_PRIORITY; ++j) {
>>
>>
>> ________________________________________
>> 发件人: Pan, Xinhui <Xinhui.Pan@amd.com>
>> 发送时间: 2021年5月19日 12:09
>> 收件人: Kuehling, Felix; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> yes, we really dont swapout SG BOs.
>> The problems is that before we validate a userptr BO, we create this BO in CPU domain by default. So this BO has chance to swapout.
>>
>> we set flag TTM_PAGE_FLAG_SG on userptr BO in popluate() which is too late.
>> I have not try to revert Chris' patch as I think it desnt help. Or I can have a try later.
>>
>> ________________________________________
>> 发件人: Kuehling, Felix <Felix.Kuehling@amd.com>
>> 发送时间: 2021年5月19日 11:29
>> 收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
>> 抄送: Deucher, Alexander; Koenig, Christian; dri-devel@lists.freedesktop.org; daniel@ffwll.ch
>> 主题: Re: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin
>>
>> Swapping SG BOs makes no sense, because TTM doesn't own the pages of
>> this type of BO.
>>
>> Last I checked, userptr BOs (and other SG BOs) were protected from
>> swapout by the fact that they would not be added to the swap-LRU. But it
>> looks like Christian just removed the swap-LRU. I guess this broke that
>> protection:
>>
>> commit 2cb51d22d70b18eaf339abf9758bf0b7608da65c
>> Author: Christian König <christian.koenig@amd.com>
>> Date:   Tue Oct 6 16:30:09 2020 +0200
>>
>>        drm/ttm: remove swap LRU v3
>>
>>        Instead evict round robin from each devices SYSTEM and TT domain.
>>
>>        v2: reorder num_pages access reported by Dan's script
>>        v3: fix rebase fallout, num_pages should be 32bit
>>
>>        Signed-off-by: Christian König <christian.koenig@amd.com>
>>        Tested-by: Nirmoy Das <nirmoy.das@amd.com>
>>        Reviewed-by: Huang Rui <ray.huang@amd.com>
>>        Reviewed-by: Matthew Auld <matthew.auld@intel.com>
>>        Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2F424009%2F&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7C2f5f749422eb44efa95408d91b5a029c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570895679516712%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=I9K%2FsTdCESLlS9P5rLOGUqEGCcxi0Jc7aaEOltyBP4M%3D&amp;reserved=0
>>
>> Regards,
>>      Felix
>>
>>
>> On 2021-05-18 10:28 p.m., xinhui pan wrote:
>>> cpu 1                                           cpu 2
>>> kfd alloc BO A(userptr)                         alloc BO B(GTT)
>>>        ->init -> validate                               -> init -> validate -> populate
>>>        init_user_pages                            -> swapout BO A //hit ttm pages limit
>>>         -> get_user_pages (fill up ttm->pages)
>>>          -> validate -> populate
>>>              -> swapin BO A // Now hit the BUG
>>>
>>> We know that get_user_pages may race with swapout on same BO.
>>> Threre are some issues I have met.
>>> 1) memory corruption.
>>> This is because we do a swap before memory is setup. ttm_tt_swapout()
>>> just create a swap_storage with its content being 0x0. So when we setup
>>> memory after the swapout. The following swapin makes the memory
>>> corrupted.
>>>
>>> 2) panic
>>> When swapout happes with get_user_pages, they touch ttm->pages without
>>> anylock. It causes memory corruption too. But I hit page fault mostly.
>>>
>>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 16 +++++++++++++++-
>>>     1 file changed, 15 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> index 928e8d57cd08..42460e4480f8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>>> @@ -835,6 +835,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>         struct amdkfd_process_info *process_info = mem->process_info;
>>>         struct amdgpu_bo *bo = mem->bo;
>>>         struct ttm_operation_ctx ctx = { true, false };
>>> +     struct page **pages;
>>>         int ret = 0;
>>>
>>>         mutex_lock(&process_info->lock);
>>> @@ -852,7 +853,13 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 goto out;
>>>         }
>>>
>>> -     ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
>>> +     pages = kvmalloc_array(bo->tbo.ttm->num_pages,
>>> +                     sizeof(struct page *),
>>> +                     GFP_KERNEL | __GFP_ZERO);
>>> +     if (!pages)
>>> +             goto unregister_out;
>>> +
>>> +     ret = amdgpu_ttm_tt_get_user_pages(bo, pages);
>>>         if (ret) {
>>>                 pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
>>>                 goto unregister_out;
>>> @@ -863,6 +870,12 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>                 pr_err("%s: Failed to reserve BO\n", __func__);
>>>                 goto release_out;
>>>         }
>>> +
>>> +     WARN_ON_ONCE(bo->tbo.ttm->page_flags & TTM_PAGE_FLAG_SWAPPED);
>>> +
>>> +     memcpy(bo->tbo.ttm->pages,
>>> +                     pages,
>>> +                     sizeof(struct page*) * bo->tbo.ttm->num_pages);
>>>         amdgpu_bo_placement_from_domain(bo, mem->domain);
>>>         ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
>>>         if (ret)
>>> @@ -872,6 +885,7 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
>>>     release_out:
>>>         amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
>>>     unregister_out:
>>> +     kvfree(pages);
>>>         if (ret)
>>>                 amdgpu_mn_unregister(bo);
>>>     out:
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7CXinhui.Pan%40amd.com%7C2f5f749422eb44efa95408d91b5a029c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570895679516712%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=aZE0mxygik02mv9JUstV2BDKJl6zb1BNLggHq5WLhKY%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.
  2021-05-19  2:28   ` xinhui pan
@ 2021-05-20 11:00     ` Christian König
  -1 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-20 11:00 UTC (permalink / raw)
  To: xinhui pan, amd-gfx
  Cc: alexander.deucher, Felix.Kuehling, dri-devel, christian.koenig

Am 19.05.21 um 04:28 schrieb xinhui pan:
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_device.c | 1 +
>   1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
> index 510e3e001dab..a9772fcc8f9c 100644
> --- a/drivers/gpu/drm/ttm/ttm_device.c
> +++ b/drivers/gpu/drm/ttm/ttm_device.c
> @@ -146,6 +146,7 @@ int ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx,
>   				uint32_t num_pages;
>   
>   				if (!bo->ttm ||
> +				    !bo->ttm->pages || !bo->ttm->pages[0] ||

This should call ttm_tt_is_populated() instead.

But apart from that it sounds like the right approach to fix this.

Thanks,
Christian.

>   				    bo->ttm->page_flags & TTM_PAGE_FLAG_SG ||
>   				    bo->ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)
>   					continue;


^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page.
@ 2021-05-20 11:00     ` Christian König
  0 siblings, 0 replies; 30+ messages in thread
From: Christian König @ 2021-05-20 11:00 UTC (permalink / raw)
  To: xinhui pan, amd-gfx
  Cc: alexander.deucher, Felix.Kuehling, daniel, dri-devel, christian.koenig

Am 19.05.21 um 04:28 schrieb xinhui pan:
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_device.c | 1 +
>   1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
> index 510e3e001dab..a9772fcc8f9c 100644
> --- a/drivers/gpu/drm/ttm/ttm_device.c
> +++ b/drivers/gpu/drm/ttm/ttm_device.c
> @@ -146,6 +146,7 @@ int ttm_device_swapout(struct ttm_device *bdev, struct ttm_operation_ctx *ctx,
>   				uint32_t num_pages;
>   
>   				if (!bo->ttm ||
> +				    !bo->ttm->pages || !bo->ttm->pages[0] ||

This should call ttm_tt_is_populated() instead.

But apart from that it sounds like the right approach to fix this.

Thanks,
Christian.

>   				    bo->ttm->page_flags & TTM_PAGE_FLAG_SG ||
>   				    bo->ttm->page_flags & TTM_PAGE_FLAG_SWAPPED)
>   					continue;

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2021-05-20 11:01 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-19  2:28 [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin xinhui pan
2021-05-19  2:28 ` xinhui pan
2021-05-19  2:28 ` [RFC PATCH 2/2] drm/ttm: skip swapout when ttm has no backend page xinhui pan
2021-05-19  2:28   ` xinhui pan
2021-05-20 11:00   ` Christian König
2021-05-20 11:00     ` Christian König
2021-05-19  2:39 ` 回复: [RFC PATCH 1/2] drm/amdgpu: Fix memory corruption due to swapout and swapin Pan, Xinhui
2021-05-19  2:39   ` Pan, Xinhui
2021-05-19  3:29 ` Felix Kuehling
2021-05-19  3:29   ` Felix Kuehling
2021-05-19  4:09   ` 回复: " Pan, Xinhui
2021-05-19  4:09     ` Pan, Xinhui
2021-05-19  5:07     ` Pan, Xinhui
2021-05-19  5:07       ` Pan, Xinhui
2021-05-19 10:01       ` Christian König
2021-05-19 10:01         ` Christian König
2021-05-19 15:11         ` Felix Kuehling
2021-05-19 15:11           ` Felix Kuehling
2021-05-20  2:58           ` 回复: " Pan, Xinhui
2021-05-20  2:58             ` Pan, Xinhui
2021-05-20  6:38             ` Christian König
2021-05-20  6:38               ` Christian König
2021-05-20  2:55         ` Pan, Xinhui
2021-05-20  2:55           ` Pan, Xinhui
2021-05-20  6:39           ` Christian König
2021-05-20  6:39             ` Christian König
2021-05-20  7:30             ` Pan, Xinhui
2021-05-20  7:30               ` Pan, Xinhui
2021-05-20  7:32             ` 回复: " Pan, Xinhui
2021-05-20  7:32               ` Pan, Xinhui

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.