dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
@ 2018-01-12 22:28 Andrey Grodzovsky
  2018-01-12 22:28 ` [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering Andrey Grodzovsky
  2018-01-16  6:02 ` [PATCH " He, Roger
  0 siblings, 2 replies; 13+ messages in thread
From: Andrey Grodzovsky @ 2018-01-12 22:28 UTC (permalink / raw)
  To: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Andrey Grodzovsky, Hongbo.He-5C7GfCeVMHo, Christian.Koenig-5C7GfCeVMHo

This is to allow drivers to choose to avoid invoking the OOM killer and
handle page allocation failures themselves instead.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c             |  3 +++
 drivers/gpu/drm/ttm/ttm_page_alloc.c     |  6 ++++++
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c |  3 +++
 drivers/gpu/drm/ttm/ttm_tt.c             | 13 +++++++++++--
 include/drm/ttm/ttm_bo_api.h             |  1 +
 include/drm/ttm/ttm_bo_driver.h          |  4 ++++
 6 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 2eb71ff..f32aab1 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -234,6 +234,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 	if (bdev->need_dma32)
 		page_flags |= TTM_PAGE_FLAG_DMA32;
 
+	if (bdev->no_retry)
+		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
+
 	switch (bo->type) {
 	case ttm_bo_type_device:
 		if (zero_alloc)
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 0eab24e..f34c843 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -741,6 +741,9 @@ static int ttm_page_pool_get_pages(struct ttm_page_pool *pool,
 		if (ttm_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (ttm_flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		/* ttm_alloc_new_pages doesn't reference pool so we can run
 		 * multiple requests in parallel.
 		 **/
@@ -893,6 +896,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		if (flags & TTM_PAGE_FLAG_DMA32)
 			gfp_flags |= GFP_DMA32;
 		else
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index c7f01a4..6949ef7 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -920,6 +920,9 @@ static gfp_t ttm_dma_pool_gfp_flags(struct ttm_dma_tt *ttm_dma, bool huge)
 		gfp_flags &= ~__GFP_COMP;
 	}
 
+	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
+		gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 	return gfp_flags;
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 5a046a3..9e4d43d 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -301,7 +301,11 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
-		from_page = shmem_read_mapping_page(swap_space, i);
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+		from_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
+
 		if (IS_ERR(from_page)) {
 			ret = PTR_ERR(from_page);
 			goto out_err;
@@ -350,10 +354,15 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+
 		from_page = ttm->pages[i];
 		if (unlikely(from_page == NULL))
 			continue;
-		to_page = shmem_read_mapping_page(swap_space, i);
+
+		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
 		if (IS_ERR(to_page)) {
 			ret = PTR_ERR(to_page);
 			goto out_err;
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h
index 2cd025c..099f24b 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -176,6 +176,7 @@ struct ttm_buffer_object {
 	unsigned long num_pages;
 	size_t acc_size;
 
+
 	/**
 	* Members not needing protection.
 	*/
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 94064b1..9b417eb 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -86,6 +86,7 @@ struct ttm_backend_func {
 #define TTM_PAGE_FLAG_ZERO_ALLOC      (1 << 6)
 #define TTM_PAGE_FLAG_DMA32           (1 << 7)
 #define TTM_PAGE_FLAG_SG              (1 << 8)
+#define TTM_PAGE_FLAG_NO_RETRY	       (1 << 9)
 
 enum ttm_caching_state {
 	tt_uncached,
@@ -556,6 +557,7 @@ struct ttm_bo_global {
  * @dev_mapping: A pointer to the struct address_space representing the
  * device address space.
  * @wq: Work queue structure for the delayed delete workqueue.
+ * @no_retry: Don't retry allocation if it fails
  *
  */
 
@@ -592,6 +594,8 @@ struct ttm_bo_device {
 	struct delayed_work wq;
 
 	bool need_dma32;
+
+	bool no_retry;
 };
 
 /**
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
  2018-01-12 22:28 [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
@ 2018-01-12 22:28 ` Andrey Grodzovsky
  2018-01-16  6:18   ` He, Roger
  2018-01-16  6:02 ` [PATCH " He, Roger
  1 sibling, 1 reply; 13+ messages in thread
From: Andrey Grodzovsky @ 2018-01-12 22:28 UTC (permalink / raw)
  To: dri-devel, amd-gfx; +Cc: Hongbo.He, Christian.Koenig

This adds a load-time option to avoid triggering the OOM killer on RAM allocations.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b7c181e..1387239 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -127,6 +127,7 @@ extern int amdgpu_job_hang_limit;
 extern int amdgpu_lbpw;
 extern int amdgpu_compute_multipipe;
 extern int amdgpu_gpu_recovery;
+extern int amdgpu_alloc_no_oom;
 
 #ifdef CONFIG_DRM_AMDGPU_SI
 extern int amdgpu_si_support;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d96f9ac..6e98189 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -130,6 +130,7 @@ int amdgpu_job_hang_limit = 0;
 int amdgpu_lbpw = -1;
 int amdgpu_compute_multipipe = -1;
 int amdgpu_gpu_recovery = -1; /* auto */
+int amdgpu_alloc_no_oom = -1; /* auto */
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -285,6 +286,9 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);
 MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto");
 module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
+MODULE_PARM_DESC(alloc_no_oom, "Allocate RAM without triggering OOM killer, (1 = enable, 0 = disable, -1 = auto");
+module_param_named(alloc_no_oom, amdgpu_alloc_no_oom, int, 0444);
+
 #ifdef CONFIG_DRM_AMDGPU_SI
 
 #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 5c4c3e0..fc27164 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 #endif
 
 	bo->tbo.bdev = &adev->mman.bdev;
+
+	if (amdgpu_alloc_no_oom == 1)
+		bo->tbo.bdev->no_retry = true;
+
 	amdgpu_ttm_placement_from_domain(bo, domain);
 
 	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* RE: [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
  2018-01-12 22:28 [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
  2018-01-12 22:28 ` [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering Andrey Grodzovsky
@ 2018-01-16  6:02 ` He, Roger
  2018-01-16  8:53   ` Christian König
  1 sibling, 1 reply; 13+ messages in thread
From: He, Roger @ 2018-01-16  6:02 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx; +Cc: Koenig, Christian



-----Original Message-----
From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com] 
Sent: Saturday, January 13, 2018 6:29 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM..

This to allow drivers to choose to avoid OOM invocation and handle page allocation failures instead.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c             |  3 +++
 drivers/gpu/drm/ttm/ttm_page_alloc.c     |  6 ++++++
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c |  3 +++
 drivers/gpu/drm/ttm/ttm_tt.c             | 13 +++++++++++--
 include/drm/ttm/ttm_bo_api.h             |  1 +
 include/drm/ttm/ttm_bo_driver.h          |  4 ++++
 6 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 2eb71ff..f32aab1 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -234,6 +234,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 	if (bdev->need_dma32)
 		page_flags |= TTM_PAGE_FLAG_DMA32;
 
+	if (bdev->no_retry)
+		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
+
 	switch (bo->type) {
 	case ttm_bo_type_device:
 		if (zero_alloc)
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 0eab24e..f34c843 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -741,6 +741,9 @@ static int ttm_page_pool_get_pages(struct ttm_page_pool *pool,
 		if (ttm_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (ttm_flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		/* ttm_alloc_new_pages doesn't reference pool so we can run
 		 * multiple requests in parallel.
 		 **/
@@ -893,6 +896,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		if (flags & TTM_PAGE_FLAG_DMA32)
 			gfp_flags |= GFP_DMA32;
 		else
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index c7f01a4..6949ef7 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -920,6 +920,9 @@ static gfp_t ttm_dma_pool_gfp_flags(struct ttm_dma_tt *ttm_dma, bool huge)
 		gfp_flags &= ~__GFP_COMP;
 	}
 
+	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
+		gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 	return gfp_flags;
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 5a046a3..9e4d43d 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -301,7 +301,11 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
-		from_page = shmem_read_mapping_page(swap_space, i);
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+		from_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
+
 		if (IS_ERR(from_page)) {
 			ret = PTR_ERR(from_page);
 			goto out_err;
@@ -350,10 +354,15 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? 
+__GFP_RETRY_MAYFAIL : 0);
+
 		from_page = ttm->pages[i];
 		if (unlikely(from_page == NULL))
 			continue;
-		to_page = shmem_read_mapping_page(swap_space, i);
+
+		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
 		if (IS_ERR(to_page)) {
 			ret = PTR_ERR(to_page);
 			goto out_err;
diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 2cd025c..099f24b 100644
--- a/include/drm/ttm/ttm_bo_api.h
+++ b/include/drm/ttm/ttm_bo_api.h
@@ -176,6 +176,7 @@ struct ttm_buffer_object {
 	unsigned long num_pages;
 	size_t acc_size;
 
+
Please remove this newline here.
Apart from that,   this patch is Reviewed-by: Roger He <Hongbo.He@amd.com>

Thanks
Roger(Hongbo.He)

 	/**
 	* Members not needing protection.
 	*/
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 94064b1..9b417eb 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -86,6 +86,7 @@ struct ttm_backend_func {
 #define TTM_PAGE_FLAG_ZERO_ALLOC      (1 << 6)
 #define TTM_PAGE_FLAG_DMA32           (1 << 7)
 #define TTM_PAGE_FLAG_SG              (1 << 8)
+#define TTM_PAGE_FLAG_NO_RETRY	       (1 << 9)
 
 enum ttm_caching_state {
 	tt_uncached,
@@ -556,6 +557,7 @@ struct ttm_bo_global {
  * @dev_mapping: A pointer to the struct address_space representing the
  * device address space.
  * @wq: Work queue structure for the delayed delete workqueue.
+ * @no_retry: Don't retry allocation if it fails
  *
  */
 
@@ -592,6 +594,8 @@ struct ttm_bo_device {
 	struct delayed_work wq;
 
 	bool need_dma32;
+
+	bool no_retry;
 };
 
 /**
--
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
  2018-01-12 22:28 ` [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering Andrey Grodzovsky
@ 2018-01-16  6:18   ` He, Roger
       [not found]     ` <MWHPR1201MB012784C4B5F4E7E90FE49F2FFDEA0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 13+ messages in thread
From: He, Roger @ 2018-01-16  6:18 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx; +Cc: Koenig, Christian


-----Original Message-----
From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com] 
Sent: Saturday, January 13, 2018 6:29 AM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
Subject: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.

This to have a load time option to avoid OOM on RAM allocations.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
 3 files changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index b7c181e..1387239 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -127,6 +127,7 @@ extern int amdgpu_job_hang_limit;  extern int amdgpu_lbpw;  extern int amdgpu_compute_multipipe;  extern int amdgpu_gpu_recovery;
+extern int amdgpu_alloc_no_oom;
 
 #ifdef CONFIG_DRM_AMDGPU_SI
 extern int amdgpu_si_support;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index d96f9ac..6e98189 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -130,6 +130,7 @@ int amdgpu_job_hang_limit = 0;  int amdgpu_lbpw = -1;  int amdgpu_compute_multipipe = -1;  int amdgpu_gpu_recovery = -1; /* auto */
+int amdgpu_alloc_no_oom = -1; /* auto */

How about turning it on by default?

Thanks
Roger(Hongbo.He)

MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");  module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); @@ -285,6 +286,9 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);  MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto");  module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
 
+MODULE_PARM_DESC(alloc_no_oom, "Allocate RAM without triggering OOM 
+killer, (1 = enable, 0 = disable, -1 = auto"); 
+module_param_named(alloc_no_oom, amdgpu_alloc_no_oom, int, 0444);
+
 #ifdef CONFIG_DRM_AMDGPU_SI
 
 #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 5c4c3e0..fc27164 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,  #endif
 
 	bo->tbo.bdev = &adev->mman.bdev;
+
+	if (amdgpu_alloc_no_oom == 1)
+		bo->tbo.bdev->no_retry = true;
+
 	amdgpu_ttm_placement_from_domain(bo, domain);
 
 	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
--
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
  2018-01-16  6:02 ` [PATCH " He, Roger
@ 2018-01-16  8:53   ` Christian König
  0 siblings, 0 replies; 13+ messages in thread
From: Christian König @ 2018-01-16  8:53 UTC (permalink / raw)
  To: He, Roger, Grodzovsky, Andrey, dri-devel, amd-gfx; +Cc: Koenig, Christian

Am 16.01.2018 um 07:02 schrieb He, Roger:
>
> -----Original Message-----
> From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com]
> Sent: Saturday, January 13, 2018 6:29 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
>
> This to allow drivers to choose to avoid OOM invocation and handle page allocation failures instead.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c             |  3 +++
>   drivers/gpu/drm/ttm/ttm_page_alloc.c     |  6 ++++++
>   drivers/gpu/drm/ttm/ttm_page_alloc_dma.c |  3 +++
>   drivers/gpu/drm/ttm/ttm_tt.c             | 13 +++++++++++--
>   include/drm/ttm/ttm_bo_api.h             |  1 +
>   include/drm/ttm/ttm_bo_driver.h          |  4 ++++
>   6 files changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 2eb71ff..f32aab1 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -234,6 +234,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
>   	if (bdev->need_dma32)
>   		page_flags |= TTM_PAGE_FLAG_DMA32;
>   
> +	if (bdev->no_retry)
> +		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> +
>   	switch (bo->type) {
>   	case ttm_bo_type_device:
>   		if (zero_alloc)
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> index 0eab24e..f34c843 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> @@ -741,6 +741,9 @@ static int ttm_page_pool_get_pages(struct ttm_page_pool *pool,
>   		if (ttm_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
>   			gfp_flags |= __GFP_ZERO;
>   
> +		if (ttm_flags & TTM_PAGE_FLAG_NO_RETRY)
> +			gfp_flags |= __GFP_RETRY_MAYFAIL;
> +
>   		/* ttm_alloc_new_pages doesn't reference pool so we can run
>   		 * multiple requests in parallel.
>   		 **/
> @@ -893,6 +896,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
>   		if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
>   			gfp_flags |= __GFP_ZERO;
>   
> +		if (flags & TTM_PAGE_FLAG_NO_RETRY)
> +			gfp_flags |= __GFP_RETRY_MAYFAIL;
> +
>   		if (flags & TTM_PAGE_FLAG_DMA32)
>   			gfp_flags |= GFP_DMA32;
>   		else
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> index c7f01a4..6949ef7 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> @@ -920,6 +920,9 @@ static gfp_t ttm_dma_pool_gfp_flags(struct ttm_dma_tt *ttm_dma, bool huge)
>   		gfp_flags &= ~__GFP_COMP;
>   	}
>   
> +	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
> +		gfp_flags |= __GFP_RETRY_MAYFAIL;
> +
>   	return gfp_flags;
>   }
>   
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 5a046a3..9e4d43d 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -301,7 +301,11 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
>   	swap_space = swap_storage->f_mapping;
>   
>   	for (i = 0; i < ttm->num_pages; ++i) {
> -		from_page = shmem_read_mapping_page(swap_space, i);
> +		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
> +
> +		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
> +		from_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
> +
>   		if (IS_ERR(from_page)) {
>   			ret = PTR_ERR(from_page);
>   			goto out_err;
> @@ -350,10 +354,15 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
>   	swap_space = swap_storage->f_mapping;
>   
>   	for (i = 0; i < ttm->num_pages; ++i) {
> +		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
> +
> +		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ?
> +__GFP_RETRY_MAYFAIL : 0);
> +
>   		from_page = ttm->pages[i];
>   		if (unlikely(from_page == NULL))
>   			continue;
> -		to_page = shmem_read_mapping_page(swap_space, i);
> +
> +		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
>   		if (IS_ERR(to_page)) {
>   			ret = PTR_ERR(to_page);
>   			goto out_err;
> diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 2cd025c..099f24b 100644
> --- a/include/drm/ttm/ttm_bo_api.h
> +++ b/include/drm/ttm/ttm_bo_api.h
> @@ -176,6 +176,7 @@ struct ttm_buffer_object {
>   	unsigned long num_pages;
>   	size_t acc_size;
>   
> +
> Please remove this newline here.
> Apart from that,   this patch is Reviewed-by: Roger He <Hongbo.He@amd.com>

Yup, agreed; apart from the nitpick, the patch is Reviewed-by: Christian 
König <christian.koenig@amd.com> as well.

Christian.

>
> Thanks
> Roger(Hongbo.He)
>
>   	/**
>   	* Members not needing protection.
>   	*/
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 94064b1..9b417eb 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -86,6 +86,7 @@ struct ttm_backend_func {
>   #define TTM_PAGE_FLAG_ZERO_ALLOC      (1 << 6)
>   #define TTM_PAGE_FLAG_DMA32           (1 << 7)
>   #define TTM_PAGE_FLAG_SG              (1 << 8)
> +#define TTM_PAGE_FLAG_NO_RETRY	       (1 << 9)
>   
>   enum ttm_caching_state {
>   	tt_uncached,
> @@ -556,6 +557,7 @@ struct ttm_bo_global {
>    * @dev_mapping: A pointer to the struct address_space representing the
>    * device address space.
>    * @wq: Work queue structure for the delayed delete workqueue.
> + * @no_retry: Don't retry allocation if it fails
>    *
>    */
>   
> @@ -592,6 +594,8 @@ struct ttm_bo_device {
>   	struct delayed_work wq;
>   
>   	bool need_dma32;
> +
> +	bool no_retry;
>   };
>   
>   /**
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
       [not found]     ` <MWHPR1201MB012784C4B5F4E7E90FE49F2FFDEA0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2018-01-16  8:54       ` Christian König
  2018-01-16 12:43         ` Andrey Grodzovsky
  2018-01-16 15:18         ` [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
  0 siblings, 2 replies; 13+ messages in thread
From: Christian König @ 2018-01-16  8:54 UTC (permalink / raw)
  To: He, Roger, Grodzovsky, Andrey,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Koenig, Christian

Am 16.01.2018 um 07:18 schrieb He, Roger:
> -----Original Message-----
> From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com]
> Sent: Saturday, January 13, 2018 6:29 AM
> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Subject: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
>
> This to have a load time option to avoid OOM on RAM allocations.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
>   3 files changed, 9 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b7c181e..1387239 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -127,6 +127,7 @@ extern int amdgpu_job_hang_limit;  extern int amdgpu_lbpw;  extern int amdgpu_compute_multipipe;  extern int amdgpu_gpu_recovery;
> +extern int amdgpu_alloc_no_oom;
>   
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   extern int amdgpu_si_support;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d96f9ac..6e98189 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -130,6 +130,7 @@ int amdgpu_job_hang_limit = 0;  int amdgpu_lbpw = -1;  int amdgpu_compute_multipipe = -1;  int amdgpu_gpu_recovery = -1; /* auto */
> +int amdgpu_alloc_no_oom = -1; /* auto */
>
> How about turn it on as default?

I think we can even go a step further, drop the module parameter and 
just turn it always on for amdgpu.

Christian.

>
> Thanks
> Roger(Hongbo.He)
>
> MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");  module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); @@ -285,6 +286,9 @@ module_param_named(compute_multipipe, amdgpu_compute_multipipe, int, 0444);  MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto");  module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444);
>   
> +MODULE_PARM_DESC(alloc_no_oom, "Allocate RAM without triggering OOM
> +killer, (1 = enable, 0 = disable, -1 = auto");
> +module_param_named(alloc_no_oom, amdgpu_alloc_no_oom, int, 0444);
> +
>   #ifdef CONFIG_DRM_AMDGPU_SI
>   
>   #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 5c4c3e0..fc27164 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,  #endif
>   
>   	bo->tbo.bdev = &adev->mman.bdev;
> +
> +	if (amdgpu_alloc_no_oom == 1)
> +		bo->tbo.bdev->no_retry = true;
> +
>   	amdgpu_ttm_placement_from_domain(bo, domain);
>   
>   	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
> --
> 2.7.4
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
  2018-01-16  8:54       ` Christian König
@ 2018-01-16 12:43         ` Andrey Grodzovsky
       [not found]           ` <2fb8142b-99c2-c682-d867-83da7327030b-5C7GfCeVMHo@public.gmane.org>
  2018-01-16 15:18         ` [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
  1 sibling, 1 reply; 13+ messages in thread
From: Andrey Grodzovsky @ 2018-01-16 12:43 UTC (permalink / raw)
  To: christian.koenig, He, Roger, dri-devel, amd-gfx



On 01/16/2018 03:54 AM, Christian König wrote:
> Am 16.01.2018 um 07:18 schrieb He, Roger:
>> -----Original Message-----
>> From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com]
>> Sent: Saturday, January 13, 2018 6:29 AM
>> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>> Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger 
>> <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Subject: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM 
>> triggering.
>>
>> This to have a load time option to avoid OOM on RAM allocations.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
>>   3 files changed, 9 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b7c181e..1387239 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -127,6 +127,7 @@ extern int amdgpu_job_hang_limit;  extern int 
>> amdgpu_lbpw;  extern int amdgpu_compute_multipipe;  extern int 
>> amdgpu_gpu_recovery;
>> +extern int amdgpu_alloc_no_oom;
>>     #ifdef CONFIG_DRM_AMDGPU_SI
>>   extern int amdgpu_si_support;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index d96f9ac..6e98189 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -130,6 +130,7 @@ int amdgpu_job_hang_limit = 0;  int amdgpu_lbpw = 
>> -1;  int amdgpu_compute_multipipe = -1;  int amdgpu_gpu_recovery = 
>> -1; /* auto */
>> +int amdgpu_alloc_no_oom = -1; /* auto */
>>
>> How about turn it on as default?
>
> I think we can even go a step further, drop the module parameter and 
> just turn it always on for amdgpu.
>
> Christian.

Will fix. Just a reminder that Roger's patches:
[PATCH 1/2] drm/ttm: don't update global memory count for some special cases
[PATCH 2/2] drm/ttm: only free pages rather than update global memory 
count together

need to be merged before my patches, since they fix a TTM bug on 
allocation failure.

Thanks,
Andrey

>
>>
>> Thanks
>> Roger(Hongbo.He)
>>
>> MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in 
>> megabytes");  module_param_named(vramlimit, amdgpu_vram_limit, int, 
>> 0600); @@ -285,6 +286,9 @@ module_param_named(compute_multipipe, 
>> amdgpu_compute_multipipe, int, 0444);  MODULE_PARM_DESC(gpu_recovery, 
>> "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = 
>> auto"); module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 
>> 0444);
>>   +MODULE_PARM_DESC(alloc_no_oom, "Allocate RAM without triggering OOM
>> +killer, (1 = enable, 0 = disable, -1 = auto");
>> +module_param_named(alloc_no_oom, amdgpu_alloc_no_oom, int, 0444);
>> +
>>   #ifdef CONFIG_DRM_AMDGPU_SI
>>     #if defined(CONFIG_DRM_RADEON) || 
>> defined(CONFIG_DRM_RADEON_MODULE) diff --git 
>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index 5c4c3e0..fc27164 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct 
>> amdgpu_device *adev,  #endif
>>         bo->tbo.bdev = &adev->mman.bdev;
>> +
>> +    if (amdgpu_alloc_no_oom == 1)
>> +        bo->tbo.bdev->no_retry = true;
>> +
>>       amdgpu_ttm_placement_from_domain(bo, domain);
>>         r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
>> -- 
>> 2.7.4
>>
>> _______________________________________________
>> dri-devel mailing list
>> dri-devel@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
       [not found]           ` <2fb8142b-99c2-c682-d867-83da7327030b-5C7GfCeVMHo@public.gmane.org>
@ 2018-01-16 12:46             ` Christian König
  2018-01-17  2:09               ` He, Roger
  0 siblings, 1 reply; 13+ messages in thread
From: Christian König @ 2018-01-16 12:46 UTC (permalink / raw)
  To: Andrey Grodzovsky, christian.koenig-5C7GfCeVMHo, He, Roger,
	dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 16.01.2018 um 13:43 schrieb Andrey Grodzovsky:
>
>
> On 01/16/2018 03:54 AM, Christian König wrote:
>> Am 16.01.2018 um 07:18 schrieb He, Roger:
>>> -----Original Message-----
>>> From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com]
>>> Sent: Saturday, January 13, 2018 6:29 AM
>>> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>> Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger 
>>> <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Subject: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM 
>>> triggering.
>>>
>>> This to have a load time option to avoid OOM on RAM allocations.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
>>>   3 files changed, 9 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b7c181e..1387239 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -127,6 +127,7 @@ extern int amdgpu_job_hang_limit;  extern int 
>>> amdgpu_lbpw;  extern int amdgpu_compute_multipipe;  extern int 
>>> amdgpu_gpu_recovery;
>>> +extern int amdgpu_alloc_no_oom;
>>>     #ifdef CONFIG_DRM_AMDGPU_SI
>>>   extern int amdgpu_si_support;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index d96f9ac..6e98189 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -130,6 +130,7 @@ int amdgpu_job_hang_limit = 0;  int amdgpu_lbpw 
>>> = -1;  int amdgpu_compute_multipipe = -1;  int amdgpu_gpu_recovery = 
>>> -1; /* auto */
>>> +int amdgpu_alloc_no_oom = -1; /* auto */
>>>
>>> How about turn it on as default?
>>
>> I think we can even go a step further, drop the module parameter and 
>> just turn it always on for amdgpu.
>>
>> Christian.
>
> Will fix, just a reminder that Roger's patches -
> [PATCH 1/2] drm/ttm: don't update global memory count for some special 
> cases
> [PATCH 2/2] drm/ttm: only free pages rather than update global memory 
> count together
>
> Needs to be merged before my patches since the fix a TTM bug on 
> allocation failure.

The second is merged, but I had some comments on the first and Roger 
hasn't replied yet.

Roger what's the status on that one?

Regards,
Christian.

>
> Thanks,
> Andrey
>
>>
>>>
>>> Thanks
>>> Roger(Hongbo.He)
>>>
>>> MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in 
>>> megabytes");  module_param_named(vramlimit, amdgpu_vram_limit, int, 
>>> 0600); @@ -285,6 +286,9 @@ module_param_named(compute_multipipe, 
>>> amdgpu_compute_multipipe, int, 0444); MODULE_PARM_DESC(gpu_recovery, 
>>> "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = 
>>> auto"); module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 
>>> 0444);
>>>   +MODULE_PARM_DESC(alloc_no_oom, "Allocate RAM without triggering OOM
>>> +killer, (1 = enable, 0 = disable, -1 = auto");
>>> +module_param_named(alloc_no_oom, amdgpu_alloc_no_oom, int, 0444);
>>> +
>>>   #ifdef CONFIG_DRM_AMDGPU_SI
>>>     #if defined(CONFIG_DRM_RADEON) || 
>>> defined(CONFIG_DRM_RADEON_MODULE) diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 5c4c3e0..fc27164 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct 
>>> amdgpu_device *adev,  #endif
>>>         bo->tbo.bdev = &adev->mman.bdev;
>>> +
>>> +    if (amdgpu_alloc_no_oom == 1)
>>> +        bo->tbo.bdev->no_retry = true;
>>> +
>>>       amdgpu_ttm_placement_from_domain(bo, domain);
>>>         r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, 
>>> type,
>>> -- 
>>> 2.7.4
>>>
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
  2018-01-16  8:54       ` Christian König
  2018-01-16 12:43         ` Andrey Grodzovsky
@ 2018-01-16 15:18         ` Andrey Grodzovsky
       [not found]           ` <1516115906-26095-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2018-01-17  2:21           ` He, Roger
  1 sibling, 2 replies; 13+ messages in thread
From: Andrey Grodzovsky @ 2018-01-16 15:18 UTC (permalink / raw)
  To: dri-devel, amd-gfx, Christian.Koenig; +Cc: Hongbo.He

This is to allow drivers to choose to avoid OOM invocation and handle
page allocation failures instead.

v2:
Remove extra new lines.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c             |  3 +++
 drivers/gpu/drm/ttm/ttm_page_alloc.c     |  6 ++++++
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c |  3 +++
 drivers/gpu/drm/ttm/ttm_tt.c             | 13 +++++++++++--
 include/drm/ttm/ttm_bo_driver.h          |  4 ++++
 5 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 2eb71ff..f32aab1 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -234,6 +234,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 	if (bdev->need_dma32)
 		page_flags |= TTM_PAGE_FLAG_DMA32;
 
+	if (bdev->no_retry)
+		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
+
 	switch (bo->type) {
 	case ttm_bo_type_device:
 		if (zero_alloc)
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 0eab24e..f34c843 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -741,6 +741,9 @@ static int ttm_page_pool_get_pages(struct ttm_page_pool *pool,
 		if (ttm_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (ttm_flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		/* ttm_alloc_new_pages doesn't reference pool so we can run
 		 * multiple requests in parallel.
 		 **/
@@ -893,6 +896,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		if (flags & TTM_PAGE_FLAG_DMA32)
 			gfp_flags |= GFP_DMA32;
 		else
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index c7f01a4..6949ef7 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -920,6 +920,9 @@ static gfp_t ttm_dma_pool_gfp_flags(struct ttm_dma_tt *ttm_dma, bool huge)
 		gfp_flags &= ~__GFP_COMP;
 	}
 
+	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
+		gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 	return gfp_flags;
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 5a046a3..9e4d43d 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -301,7 +301,11 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
-		from_page = shmem_read_mapping_page(swap_space, i);
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+		from_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
+
 		if (IS_ERR(from_page)) {
 			ret = PTR_ERR(from_page);
 			goto out_err;
@@ -350,10 +354,15 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+
 		from_page = ttm->pages[i];
 		if (unlikely(from_page == NULL))
 			continue;
-		to_page = shmem_read_mapping_page(swap_space, i);
+
+		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
 		if (IS_ERR(to_page)) {
 			ret = PTR_ERR(to_page);
 			goto out_err;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 94064b1..9b417eb 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -86,6 +86,7 @@ struct ttm_backend_func {
 #define TTM_PAGE_FLAG_ZERO_ALLOC      (1 << 6)
 #define TTM_PAGE_FLAG_DMA32           (1 << 7)
 #define TTM_PAGE_FLAG_SG              (1 << 8)
+#define TTM_PAGE_FLAG_NO_RETRY	       (1 << 9)
 
 enum ttm_caching_state {
 	tt_uncached,
@@ -556,6 +557,7 @@ struct ttm_bo_global {
  * @dev_mapping: A pointer to the struct address_space representing the
  * device address space.
  * @wq: Work queue structure for the delayed delete workqueue.
+ * @no_retry: Don't retry allocation if it fails
  *
  */
 
@@ -592,6 +594,8 @@ struct ttm_bo_device {
 	struct delayed_work wq;
 
 	bool need_dma32;
+
+	bool no_retry;
 };
 
 /**
-- 
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH v2 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
       [not found]           ` <1516115906-26095-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2018-01-16 15:18             ` Andrey Grodzovsky
  2018-01-16 15:22             ` [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM Christian König
  1 sibling, 0 replies; 13+ messages in thread
From: Andrey Grodzovsky @ 2018-01-16 15:18 UTC (permalink / raw)
  To: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	Christian.Koenig-5C7GfCeVMHo
  Cc: Andrey Grodzovsky, Hongbo.He-5C7GfCeVMHo

Avoid OOM on system page allocations.

v2:
Remove the modprobe parameter, making this behaviour the only option.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 5c4c3e0..b4dc3bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
 #endif
 
 	bo->tbo.bdev = &adev->mman.bdev;
+
+	/* We opt to avoid OOM on system pages allocations */
+	bo->tbo.bdev->no_retry = true;
+
 	amdgpu_ttm_placement_from_domain(bo, domain);
 
 	r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, type,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
       [not found]           ` <1516115906-26095-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2018-01-16 15:18             ` [PATCH v2 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering Andrey Grodzovsky
@ 2018-01-16 15:22             ` Christian König
  1 sibling, 0 replies; 13+ messages in thread
From: Christian König @ 2018-01-16 15:22 UTC (permalink / raw)
  To: Andrey Grodzovsky, dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Hongbo.He-5C7GfCeVMHo

Am 16.01.2018 um 16:18 schrieb Andrey Grodzovsky:
> This to allow drivers to choose to avoid OOM invocation and handle
> page allocation failures instead.
>
> v2:
> Remove extra new lines.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com> for the series.

> ---
>   drivers/gpu/drm/ttm/ttm_bo.c             |  3 +++
>   drivers/gpu/drm/ttm/ttm_page_alloc.c     |  6 ++++++
>   drivers/gpu/drm/ttm/ttm_page_alloc_dma.c |  3 +++
>   drivers/gpu/drm/ttm/ttm_tt.c             | 13 +++++++++++--
>   include/drm/ttm/ttm_bo_driver.h          |  4 ++++
>   5 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 2eb71ff..f32aab1 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -234,6 +234,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
>   	if (bdev->need_dma32)
>   		page_flags |= TTM_PAGE_FLAG_DMA32;
>   
> +	if (bdev->no_retry)
> +		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
> +
>   	switch (bo->type) {
>   	case ttm_bo_type_device:
>   		if (zero_alloc)
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> index 0eab24e..f34c843 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
> @@ -741,6 +741,9 @@ static int ttm_page_pool_get_pages(struct ttm_page_pool *pool,
>   		if (ttm_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
>   			gfp_flags |= __GFP_ZERO;
>   
> +		if (ttm_flags & TTM_PAGE_FLAG_NO_RETRY)
> +			gfp_flags |= __GFP_RETRY_MAYFAIL;
> +
>   		/* ttm_alloc_new_pages doesn't reference pool so we can run
>   		 * multiple requests in parallel.
>   		 **/
> @@ -893,6 +896,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
>   		if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
>   			gfp_flags |= __GFP_ZERO;
>   
> +		if (flags & TTM_PAGE_FLAG_NO_RETRY)
> +			gfp_flags |= __GFP_RETRY_MAYFAIL;
> +
>   		if (flags & TTM_PAGE_FLAG_DMA32)
>   			gfp_flags |= GFP_DMA32;
>   		else
> diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> index c7f01a4..6949ef7 100644
> --- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> +++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
> @@ -920,6 +920,9 @@ static gfp_t ttm_dma_pool_gfp_flags(struct ttm_dma_tt *ttm_dma, bool huge)
>   		gfp_flags &= ~__GFP_COMP;
>   	}
>   
> +	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
> +		gfp_flags |= __GFP_RETRY_MAYFAIL;
> +
>   	return gfp_flags;
>   }
>   
> diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> index 5a046a3..9e4d43d 100644
> --- a/drivers/gpu/drm/ttm/ttm_tt.c
> +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> @@ -301,7 +301,11 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
>   	swap_space = swap_storage->f_mapping;
>   
>   	for (i = 0; i < ttm->num_pages; ++i) {
> -		from_page = shmem_read_mapping_page(swap_space, i);
> +		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
> +
> +		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
> +		from_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
> +
>   		if (IS_ERR(from_page)) {
>   			ret = PTR_ERR(from_page);
>   			goto out_err;
> @@ -350,10 +354,15 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
>   	swap_space = swap_storage->f_mapping;
>   
>   	for (i = 0; i < ttm->num_pages; ++i) {
> +		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
> +
> +		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
> +
>   		from_page = ttm->pages[i];
>   		if (unlikely(from_page == NULL))
>   			continue;
> -		to_page = shmem_read_mapping_page(swap_space, i);
> +
> +		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
>   		if (IS_ERR(to_page)) {
>   			ret = PTR_ERR(to_page);
>   			goto out_err;
> diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
> index 94064b1..9b417eb 100644
> --- a/include/drm/ttm/ttm_bo_driver.h
> +++ b/include/drm/ttm/ttm_bo_driver.h
> @@ -86,6 +86,7 @@ struct ttm_backend_func {
>   #define TTM_PAGE_FLAG_ZERO_ALLOC      (1 << 6)
>   #define TTM_PAGE_FLAG_DMA32           (1 << 7)
>   #define TTM_PAGE_FLAG_SG              (1 << 8)
> +#define TTM_PAGE_FLAG_NO_RETRY	       (1 << 9)
>   
>   enum ttm_caching_state {
>   	tt_uncached,
> @@ -556,6 +557,7 @@ struct ttm_bo_global {
>    * @dev_mapping: A pointer to the struct address_space representing the
>    * device address space.
>    * @wq: Work queue structure for the delayed delete workqueue.
> + * @no_retry: Don't retry allocation if it fails
>    *
>    */
>   
> @@ -592,6 +594,8 @@ struct ttm_bo_device {
>   	struct delayed_work wq;
>   
>   	bool need_dma32;
> +
> +	bool no_retry;
>   };
>   
>   /**

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.
  2018-01-16 12:46             ` Christian König
@ 2018-01-17  2:09               ` He, Roger
  0 siblings, 0 replies; 13+ messages in thread
From: He, Roger @ 2018-01-17  2:09 UTC (permalink / raw)
  To: Koenig, Christian, Grodzovsky, Andrey, dri-devel, amd-gfx



-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: Tuesday, January 16, 2018 8:46 PM
To: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>; He, Roger <Hongbo.He@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering.

Am 16.01.2018 um 13:43 schrieb Andrey Grodzovsky:
>
>
> On 01/16/2018 03:54 AM, Christian König wrote:
>> Am 16.01.2018 um 07:18 schrieb He, Roger:
>>> -----Original Message-----
>>> From: Andrey Grodzovsky [mailto:andrey.grodzovsky@amd.com]
>>> Sent: Saturday, January 13, 2018 6:29 AM
>>> To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
>>> Cc: Koenig, Christian <Christian.Koenig@amd.com>; He, Roger 
>>> <Hongbo.He@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>>> Subject: [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM 
>>> triggering.
>>>
>>> This to have a load time option to avoid OOM on RAM allocations.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 4 ++++
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++++
>>>   3 files changed, 9 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index b7c181e..1387239 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -127,6 +127,7 @@ extern int amdgpu_job_hang_limit;  extern int 
>>> amdgpu_lbpw;  extern int amdgpu_compute_multipipe;  extern int 
>>> amdgpu_gpu_recovery;
>>> +extern int amdgpu_alloc_no_oom;
>>>     #ifdef CONFIG_DRM_AMDGPU_SI
>>>   extern int amdgpu_si_support;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index d96f9ac..6e98189 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -130,6 +130,7 @@ int amdgpu_job_hang_limit = 0;  int amdgpu_lbpw 
>>> = -1;  int amdgpu_compute_multipipe = -1;  int amdgpu_gpu_recovery = 
>>> -1; /* auto */
>>> +int amdgpu_alloc_no_oom = -1; /* auto */
>>>
>>> How about turn it on as default?
>>
>> I think we can even go a step further, drop the module parameter and 
>> just turn it always on for amdgpu.
>>
>> Christian.
>
> Will fix, just a reminder that Roger's patches - [PATCH 1/2] drm/ttm: 
> don't update global memory count for some special cases [PATCH 2/2] 
> drm/ttm: only free pages rather than update global memory count 
> together
>
> Needs to be merged before my patches since the fix a TTM bug on 
> allocation failure.

	The second is merged, but I had some comments on the first and Roger hasn't replied yet.

	Roger what's the status on that one?

Already fixed locally, but not tested yet.  Try to send out today.

Thanks
Roger(Hongbo.He)

>
> Thanks,
> Andrey
>
>>
>>>
>>> Thanks
>>> Roger(Hongbo.He)
>>>
>>> MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in 
>>> megabytes");  module_param_named(vramlimit, amdgpu_vram_limit, int, 
>>> 0600); @@ -285,6 +286,9 @@ module_param_named(compute_multipipe,
>>> amdgpu_compute_multipipe, int, 0444); MODULE_PARM_DESC(gpu_recovery, 
>>> "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = 
>>> auto"); module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 
>>> 0444);
>>>   +MODULE_PARM_DESC(alloc_no_oom, "Allocate RAM without triggering 
>>> OOM
>>> +killer, (1 = enable, 0 = disable, -1 = auto"); 
>>> +module_param_named(alloc_no_oom, amdgpu_alloc_no_oom, int, 0444);
>>> +
>>>   #ifdef CONFIG_DRM_AMDGPU_SI
>>>     #if defined(CONFIG_DRM_RADEON) ||
>>> defined(CONFIG_DRM_RADEON_MODULE) diff --git 
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index 5c4c3e0..fc27164 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -420,6 +420,10 @@ static int amdgpu_bo_do_create(struct 
>>> amdgpu_device *adev,  #endif
>>>         bo->tbo.bdev = &adev->mman.bdev;
>>> +
>>> +    if (amdgpu_alloc_no_oom == 1)
>>> +        bo->tbo.bdev->no_retry = true;
>>> +
>>>       amdgpu_ttm_placement_from_domain(bo, domain);
>>>         r = ttm_bo_init_reserved(&adev->mman.bdev, &bo->tbo, size, 
>>> type,
>>> --
>>> 2.7.4
>>>
>>> _______________________________________________
>>> dri-devel mailing list
>>> dri-devel@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>>
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM..
  2018-01-16 15:18         ` [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
       [not found]           ` <1516115906-26095-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2018-01-17  2:21           ` He, Roger
  1 sibling, 0 replies; 13+ messages in thread
From: He, Roger @ 2018-01-17  2:21 UTC (permalink / raw)
  To: Grodzovsky, Andrey, dri-devel, amd-gfx, Koenig, Christian


Reviewed-by: Roger He <Hongbo.He@amd.com>

Thanks
Roger(Hongbo.He)
-----Original Message-----
From: dri-devel [mailto:dri-devel-bounces@lists.freedesktop.org] On Behalf Of Andrey Grodzovsky
Sent: Tuesday, January 16, 2018 11:18 PM
To: dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org; Koenig, Christian <Christian.Koenig@amd.com>
Cc: He, Roger <Hongbo.He@amd.com>
Subject: [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM..

This to allow drivers to choose to avoid OOM invocation and handle page allocation failures instead.

v2:
Remove extra new lines.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c             |  3 +++
 drivers/gpu/drm/ttm/ttm_page_alloc.c     |  6 ++++++
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c |  3 +++
 drivers/gpu/drm/ttm/ttm_tt.c             | 13 +++++++++++--
 include/drm/ttm/ttm_bo_driver.h          |  4 ++++
 5 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 2eb71ff..f32aab1 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -234,6 +234,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 	if (bdev->need_dma32)
 		page_flags |= TTM_PAGE_FLAG_DMA32;
 
+	if (bdev->no_retry)
+		page_flags |= TTM_PAGE_FLAG_NO_RETRY;
+
 	switch (bo->type) {
 	case ttm_bo_type_device:
 		if (zero_alloc)
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index 0eab24e..f34c843 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -741,6 +741,9 @@ static int ttm_page_pool_get_pages(struct ttm_page_pool *pool,
 		if (ttm_flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (ttm_flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		/* ttm_alloc_new_pages doesn't reference pool so we can run
 		 * multiple requests in parallel.
 		 **/
@@ -893,6 +896,9 @@ static int ttm_get_pages(struct page **pages, unsigned npages, int flags,
 		if (flags & TTM_PAGE_FLAG_ZERO_ALLOC)
 			gfp_flags |= __GFP_ZERO;
 
+		if (flags & TTM_PAGE_FLAG_NO_RETRY)
+			gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 		if (flags & TTM_PAGE_FLAG_DMA32)
 			gfp_flags |= GFP_DMA32;
 		else
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index c7f01a4..6949ef7 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -920,6 +920,9 @@ static gfp_t ttm_dma_pool_gfp_flags(struct ttm_dma_tt *ttm_dma, bool huge)
 		gfp_flags &= ~__GFP_COMP;
 	}
 
+	if (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY)
+		gfp_flags |= __GFP_RETRY_MAYFAIL;
+
 	return gfp_flags;
 }
 
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 5a046a3..9e4d43d 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -301,7 +301,11 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
-		from_page = shmem_read_mapping_page(swap_space, i);
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+		from_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
+
 		if (IS_ERR(from_page)) {
 			ret = PTR_ERR(from_page);
 			goto out_err;
@@ -350,10 +354,15 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
 	swap_space = swap_storage->f_mapping;
 
 	for (i = 0; i < ttm->num_pages; ++i) {
+		gfp_t gfp_mask = mapping_gfp_mask(swap_space);
+
+		gfp_mask |= (ttm->page_flags & TTM_PAGE_FLAG_NO_RETRY ? __GFP_RETRY_MAYFAIL : 0);
+
 		from_page = ttm->pages[i];
 		if (unlikely(from_page == NULL))
 			continue;
-		to_page = shmem_read_mapping_page(swap_space, i);
+
+		to_page = shmem_read_mapping_page_gfp(swap_space, i, gfp_mask);
 		if (IS_ERR(to_page)) {
 			ret = PTR_ERR(to_page);
 			goto out_err;
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index 94064b1..9b417eb 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -86,6 +86,7 @@ struct ttm_backend_func {
 #define TTM_PAGE_FLAG_ZERO_ALLOC      (1 << 6)
 #define TTM_PAGE_FLAG_DMA32           (1 << 7)
 #define TTM_PAGE_FLAG_SG              (1 << 8)
+#define TTM_PAGE_FLAG_NO_RETRY	       (1 << 9)
 
 enum ttm_caching_state {
 	tt_uncached,
@@ -556,6 +557,7 @@ struct ttm_bo_global {
  * @dev_mapping: A pointer to the struct address_space representing the
  * device address space.
  * @wq: Work queue structure for the delayed delete workqueue.
+ * @no_retry: Don't retry allocation if it fails
  *
  */
 
@@ -592,6 +594,8 @@ struct ttm_bo_device {
 	struct delayed_work wq;
 
 	bool need_dma32;
+
+	bool no_retry;
 };
 
 /**
--
2.7.4

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-01-17  2:21 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-12 22:28 [PATCH 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
2018-01-12 22:28 ` [PATCH 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering Andrey Grodzovsky
2018-01-16  6:18   ` He, Roger
     [not found]     ` <MWHPR1201MB012784C4B5F4E7E90FE49F2FFDEA0-3iK1xFAIwjq9imrIu4W8xGrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2018-01-16  8:54       ` Christian König
2018-01-16 12:43         ` Andrey Grodzovsky
     [not found]           ` <2fb8142b-99c2-c682-d867-83da7327030b-5C7GfCeVMHo@public.gmane.org>
2018-01-16 12:46             ` Christian König
2018-01-17  2:09               ` He, Roger
2018-01-16 15:18         ` [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM Andrey Grodzovsky
     [not found]           ` <1516115906-26095-1-git-send-email-andrey.grodzovsky-5C7GfCeVMHo@public.gmane.org>
2018-01-16 15:18             ` [PATCH v2 2/2] drm/amdgpu: Use new TTM flag to avoid OOM triggering Andrey Grodzovsky
2018-01-16 15:22             ` [PATCH v2 1/2] drm/ttm: Allow page allocations w/o triggering OOM Christian König
2018-01-17  2:21           ` He, Roger
2018-01-16  6:02 ` [PATCH " He, Roger
2018-01-16  8:53   ` Christian König

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).