From 6ef6fbf384808e11490ff77b3933aa27feb63f7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Tue, 18 Jul 2017 16:08:44 -0400
Subject: [PATCH] radeonsi: set a per-buffer flag that disables inter-process
 sharing (v3)

For lower overhead in the CS ioctl.
Winsys allocators are not used with interprocess-sharable resources.

v2: It shouldn't crash anymore, but the kernel will reject the new flag.
v3 (christian): Rename the flag, avoid sending those buffers in the BO list.
---
 src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
 src/gallium/drivers/radeon/radeon_winsys.h      | 20 ++++++++++---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 37 ++++++++++++++++---------
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.h       |  2 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c       | 25 +++++++++++++----
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 ++++++++++--------
 6 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index dd1c209..2747ac4 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -167,6 +167,13 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
 			RADEON_FLAG_GTT_WC;
 	}
 
+	/* Only displayable single-sample textures can be shared between
+	 * processes. */
+	if (res->b.b.target == PIPE_BUFFER ||
+	    res->b.b.nr_samples >= 2 ||
+	    rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
+		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
 	/* If VRAM is just stolen system memory, allow both VRAM and
 	 * GTT, whichever has free space. If a buffer is evicted from
 	 * VRAM to GTT, it will stay there.
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index b00b144..f0a0a92 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -54,6 +54,7 @@ enum radeon_bo_flag { /* bitfield */
     RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
     RADEON_FLAG_NO_SUBALLOC = (1 << 2),
     RADEON_FLAG_SPARSE = (1 << 3),
+    RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
 };
 
 enum radeon_bo_usage { /* bitfield */
@@ -661,14 +662,19 @@ static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
 {
     switch (heap) {
     case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
-        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
+        return RADEON_FLAG_GTT_WC |
+               RADEON_FLAG_NO_CPU_ACCESS |
+               RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
     case RADEON_HEAP_VRAM:
     case RADEON_HEAP_VRAM_GTT:
     case RADEON_HEAP_GTT_WC:
-        return RADEON_FLAG_GTT_WC;
+        return RADEON_FLAG_GTT_WC |
+               RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
     case RADEON_HEAP_GTT:
     default:
-        return 0;
+        return RADEON_FLAG_NO_INTERPROCESS_SHARING;
     }
 }
 
@@ -700,8 +706,14 @@ static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
     /* NO_CPU_ACCESS implies VRAM only. */
     assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
 
+    /* Resources with interprocess sharing don't use any winsys allocators. */
+    if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
+        return -1;
+
     /* Unsupported flags: NO_SUBALLOC, SPARSE. */
-    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
+    if (flags & ~(RADEON_FLAG_GTT_WC |
+                  RADEON_FLAG_NO_CPU_ACCESS |
+                  RADEON_FLAG_NO_INTERPROCESS_SHARING))
         return -1;
 
     switch (domain) {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 97bbe23..d76d441 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -38,6 +38,10 @@
 #include <stdio.h>
 #include <inttypes.h>
 
+#ifndef AMDGPU_GEM_CREATE_LOCAL
+#define AMDGPU_GEM_CREATE_LOCAL (1 << 6)
+#endif
+
 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
 #define DEBUG_SPARSE_COMMITS 0
 
@@ -402,6 +406,8 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
       request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
    if (flags & RADEON_FLAG_GTT_WC)
       request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
+      request.flags |= AMDGPU_GEM_CREATE_LOCAL;
 
    r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
    if (r) {
@@ -435,6 +441,7 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
    bo->u.real.va_handle = va_handle;
    bo->initial_domain = initial_domain;
    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
+   bo->is_local = !!(request.flags & AMDGPU_GEM_CREATE_LOCAL);
 
    if (initial_domain & RADEON_DOMAIN_VRAM)
       ws->allocated_vram += align64(size, ws->info.gart_page_size);
@@ -1134,7 +1141,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
-   unsigned usage = 0, pb_cache_bucket;
+   unsigned usage = 0, pb_cache_bucket = 0;
 
    /* VRAM implies WC. This is not optional. */
    assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
@@ -1189,19 +1196,23 @@ no_slab:
    size = align64(size, ws->info.gart_page_size);
    alignment = align(alignment, ws->info.gart_page_size);
 
-   int heap = radeon_get_heap_index(domain, flags);
-   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
-   usage = 1 << heap; /* Only set one usage bit for each heap. */
+   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
 
-   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
-   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+   if (use_reusable_pool) {
+      int heap = radeon_get_heap_index(domain, flags);
+      assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+      usage = 1 << heap; /* Only set one usage bit for each heap. */
 
-   /* Get a buffer from the cache. */
-   bo = (struct amdgpu_winsys_bo*)
-        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
-                                pb_cache_bucket);
-   if (bo)
-      return &bo->base;
+      pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
+      assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+      /* Get a buffer from the cache. */
+      bo = (struct amdgpu_winsys_bo*)
+           pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
+                                   pb_cache_bucket);
+      if (bo)
+         return &bo->base;
+   }
 
    /* Create a new one. */
    bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
@@ -1216,7 +1227,7 @@ no_slab:
          return NULL;
    }
 
-   bo->u.real.use_reusable_pool = true;
+   bo->u.real.use_reusable_pool = use_reusable_pool;
    return &bo->base;
 }
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
index 1311344..10b095d 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h
@@ -115,6 +115,8 @@ struct amdgpu_winsys_bo {
    unsigned num_fences;
    unsigned max_fences;
    struct pipe_fence_handle **fences;
+
+   bool is_local;
 };
 
 struct amdgpu_slab {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 9cadfc4..7ec33c5 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -362,8 +362,9 @@ amdgpu_lookup_or_add_real_buffer(struct amdgpu_cs *acs, struct amdgpu_winsys_bo
 {
    struct amdgpu_cs_context *cs = acs->csc;
    unsigned hash;
-   int idx = amdgpu_lookup_buffer(cs, bo);
+   int idx;
 
+   idx = amdgpu_lookup_buffer(cs, bo);
    if (idx >= 0)
       return idx;
 
@@ -1123,6 +1124,8 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
       free(handles);
       mtx_unlock(&ws->global_bo_list_lock);
    } else {
+      unsigned num_handles;
+
      if (!amdgpu_add_sparse_backing_buffers(cs)) {
         r = -ENOMEM;
         goto bo_list_error;
@@ -1142,21 +1145,31 @@ void amdgpu_cs_submit_ib(void *job, int thread_index)
         }
      }
 
+      num_handles = 0;
      for (i = 0; i < cs->num_real_buffers; ++i) {
         struct amdgpu_cs_buffer *buffer = &cs->real_buffers[i];
 
+         if (buffer->bo->is_local)
+            continue;
+
         assert(buffer->u.real.priority_usage != 0);
 
-         cs->handles[i] = buffer->bo->bo;
-         cs->flags[i] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4;
+         cs->handles[num_handles] = buffer->bo->bo;
+         cs->flags[num_handles] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4;
+         ++num_handles;
      }
 
      if (acs->ring_type == RING_GFX)
         ws->gfx_bo_list_counter += cs->num_real_buffers;
 
-      r = amdgpu_bo_list_create(ws->dev, cs->num_real_buffers,
-                                cs->handles, cs->flags,
-                                &cs->request.resources);
+      if (num_handles) {
+         r = amdgpu_bo_list_create(ws->dev, num_handles,
+                                   cs->handles, cs->flags,
+                                   &cs->request.resources);
+      } else {
+         r = 0;
+         cs->request.resources = 0;
+      }
   }
 bo_list_error:
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 8027a5f..15e9d38 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -914,7 +914,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
     struct radeon_bo *bo;
-    unsigned usage = 0, pb_cache_bucket;
+    unsigned usage = 0, pb_cache_bucket = 0;
 
     assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
 
@@ -969,17 +969,22 @@ no_slab:
     size = align(size, ws->info.gart_page_size);
     alignment = align(alignment, ws->info.gart_page_size);
 
-    int heap = radeon_get_heap_index(domain, flags);
-    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
-    usage = 1 << heap; /* Only set one usage bit for each heap. */
+    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
 
-    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
-    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+    /* Shared resources don't use cached heaps. */
+    if (use_reusable_pool) {
+        int heap = radeon_get_heap_index(domain, flags);
+        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+        usage = 1 << heap; /* Only set one usage bit for each heap. */
 
-    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
-                                           usage, pb_cache_bucket));
-    if (bo)
-        return &bo->base;
+        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
+        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
+                                               usage, pb_cache_bucket));
+        if (bo)
+            return &bo->base;
+    }
 
     bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
                           pb_cache_bucket);
@@ -994,7 +999,7 @@ no_slab:
         return NULL;
     }
 
-    bo->u.real.use_reusable_pool = true;
+    bo->u.real.use_reusable_pool = use_reusable_pool;
 
     mtx_lock(&ws->bo_handles_mutex);
     util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
-- 
2.7.4
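
Aside, for anyone who wants to exercise the kernel side of this in isolation:
the sketch below allocates a single BO with AMDGPU_GEM_CREATE_LOCAL through
libdrm's amdgpu wrapper, mirroring what amdgpu_create_bo() above does for
buffers flagged RADEON_FLAG_NO_INTERPROCESS_SHARING. It is a minimal sketch,
not part of the patch: the render-node path is just an example, and the
fallback #define copies the patch's own, since the flag's final name and
value depend on what the kernel merges (per the v2 note, older kernels
reject it).

/* Minimal sketch: allocate a process-local BO via libdrm.
 * Assumptions: /dev/dri/renderD128 is an AMDGPU render node, and
 * AMDGPU_GEM_CREATE_LOCAL is (1 << 6) as proposed by this patch. */
#include <amdgpu.h>
#include <amdgpu_drm.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#ifndef AMDGPU_GEM_CREATE_LOCAL
#define AMDGPU_GEM_CREATE_LOCAL (1 << 6)
#endif

int main(void)
{
   struct amdgpu_bo_alloc_request request = {0};
   amdgpu_device_handle dev;
   amdgpu_bo_handle bo;
   uint32_t major, minor;
   int fd, r;

   fd = open("/dev/dri/renderD128", O_RDWR); /* example render node */
   if (fd < 0 || amdgpu_device_initialize(fd, &major, &minor, &dev))
      return 1;

   request.alloc_size = 64 * 1024;
   request.phys_alignment = 4096;
   request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
   /* Same bit amdgpu_create_bo() sets for NO_INTERPROCESS_SHARING:
    * the BO never has to be sent in a per-submission BO list and is
    * never shared with another process. */
   request.flags = AMDGPU_GEM_CREATE_LOCAL;

   r = amdgpu_bo_alloc(dev, &request, &bo);
   printf("amdgpu_bo_alloc: %d%s\n", r,
          r ? " (kernel probably lacks AMDGPU_GEM_CREATE_LOCAL)" : "");
   if (!r)
      amdgpu_bo_free(bo);

   amdgpu_device_deinitialize(dev);
   close(fd);
   return r ? 1 : 0;
}

On a kernel without the flag, amdgpu_bo_alloc() should simply fail, which is
the behavior the v2 note describes; on a kernel that accepts it, the buffer
can then be omitted from the per-submission BO list exactly as
amdgpu_cs_submit_ib() does above.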