* [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id
@ 2022-10-26 9:03 Ma Jun
2022-10-26 9:03 ` [PATCH v2 2/2] drm/amdkfd: Fix the warning of array-index-out-of-bounds Ma Jun
2022-10-26 19:07 ` [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Felix Kuehling
0 siblings, 2 replies; 4+ messages in thread
From: Ma Jun @ 2022-10-26 9:03 UTC (permalink / raw)
To: amd-gfx, felix.kuehling, alexander.deucher; +Cc: guchun.chen
Initialize and save the base CU processor ID for later use.
Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 20 +++++++++++++-------
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++
2 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index d25ac9cbe5b2..35d09e29aafb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -41,13 +41,18 @@ static uint32_t gpu_processor_id_low = 0x80001000;
* @total_cu_count - Total CUs present in the GPU including ones
* masked off
*/
-static inline unsigned int get_and_inc_gpu_processor_id(
- unsigned int total_cu_count)
+static inline void assign_gpu_processor_id(struct kfd_dev *kfd)
{
- int current_id = gpu_processor_id_low;
+ struct amdgpu_device *adev = kfd->adev;
+ unsigned int array_count = 0;
+ unsigned int total_cu_count = 0;
+
+ kfd->processor_id_low = gpu_processor_id_low;
+
+ array_count = adev->gfx.config.max_sh_per_se * adev->gfx.config.max_shader_engines;
+ total_cu_count = array_count * adev->gfx.config.max_cu_per_sh;
gpu_processor_id_low += total_cu_count;
- return current_id;
}
/* Static table to describe GPU Cache information */
@@ -2223,7 +2228,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
struct crat_subtype_computeunit *cu;
struct kfd_cu_info cu_info;
int avail_size = *size;
- uint32_t total_num_of_cu;
int num_of_cache_entries = 0;
int cache_mem_filled = 0;
uint32_t nid = 0;
@@ -2275,8 +2279,10 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
cu->wave_front_size = cu_info.wave_front_size;
cu->array_count = cu_info.num_shader_arrays_per_engine *
cu_info.num_shader_engines;
- total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
- cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
+
+ assign_gpu_processor_id(kdev);
+ cu->processor_id_low = kdev->processor_id_low;
+
cu->num_cu_per_array = cu_info.num_cu_per_sh;
cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
cu->num_banks = cu_info.num_shader_engines;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 182eb67edbc5..4c06b233472f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -370,6 +370,9 @@ struct kfd_dev {
/* Track per device allocated watch points. */
uint32_t alloc_watch_ids;
+
+ /* cu processor id base */
+ unsigned int processor_id_low;
};
struct kfd_ipc_obj;
--
2.25.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH v2 2/2] drm/amdkfd: Fix the warning of array-index-out-of-bounds
2022-10-26 9:03 [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Ma Jun
@ 2022-10-26 9:03 ` Ma Jun
2022-10-26 19:07 ` [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Felix Kuehling
1 sibling, 0 replies; 4+ messages in thread
From: Ma Jun @ 2022-10-26 9:03 UTC (permalink / raw)
To: amd-gfx, felix.kuehling, alexander.deucher; +Cc: guchun.chen
For some GPUs with more CUs, the original sibling_map[32]
in struct crat_subtype_cache is not large enough
to hold the cache information when creating the VCRAT table.
So skip filling in struct crat_subtype_cache and instead
fill struct kfd_cache_properties directly to fix this problem.
v2:
- Remove the unnecessary sysfs interface "cache_ext"
Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 307 +++-------------------
drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 12 +
drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 238 ++++++++++++++++-
drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 5 +-
4 files changed, 278 insertions(+), 284 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 35d09e29aafb..561376d207c5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -55,16 +55,6 @@ static inline void assign_gpu_processor_id(struct kfd_dev *kfd)
gpu_processor_id_low += total_cu_count;
}
-/* Static table to describe GPU Cache information */
-struct kfd_gpu_cache_info {
- uint32_t cache_size;
- uint32_t cache_level;
- uint32_t flags;
- /* Indicates how many Compute Units share this cache
- * within a SA. Value = 1 indicates the cache is not shared
- */
- uint32_t num_cu_shared;
-};
static struct kfd_gpu_cache_info kaveri_cache_info[] = {
{
@@ -1028,9 +1018,13 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
props->cachelines_per_tag = cache->lines_per_tag;
props->cache_assoc = cache->associativity;
props->cache_latency = cache->cache_latency;
+
memcpy(props->sibling_map, cache->sibling_map,
sizeof(props->sibling_map));
+ /* set the sibling_map_size as 32 for CRAT from ACPI */
+ props->sibling_map_size = CRAT_SIBLINGMAP_SIZE;
+
if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
props->cache_type |= HSA_CACHE_TYPE_DATA;
if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
@@ -1248,124 +1242,6 @@ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
return ret;
}
-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
-static int fill_in_l1_pcache(struct crat_subtype_cache *pcache,
- struct kfd_gpu_cache_info *pcache_info,
- struct kfd_cu_info *cu_info,
- int mem_available,
- int cu_bitmask,
- int cache_type, unsigned int cu_processor_id,
- int cu_block)
-{
- unsigned int cu_sibling_map_mask;
- int first_active_cu;
-
- /* First check if enough memory is available */
- if (sizeof(struct crat_subtype_cache) > mem_available)
- return -ENOMEM;
-
- cu_sibling_map_mask = cu_bitmask;
- cu_sibling_map_mask >>= cu_block;
- cu_sibling_map_mask &=
- ((1 << pcache_info[cache_type].num_cu_shared) - 1);
- first_active_cu = ffs(cu_sibling_map_mask);
-
- /* CU could be inactive. In case of shared cache find the first active
- * CU. and incase of non-shared cache check if the CU is inactive. If
- * inactive active skip it
- */
- if (first_active_cu) {
- memset(pcache, 0, sizeof(struct crat_subtype_cache));
- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
- pcache->length = sizeof(struct crat_subtype_cache);
- pcache->flags = pcache_info[cache_type].flags;
- pcache->processor_id_low = cu_processor_id
- + (first_active_cu - 1);
- pcache->cache_level = pcache_info[cache_type].cache_level;
- pcache->cache_size = pcache_info[cache_type].cache_size;
-
- /* Sibling map is w.r.t processor_id_low, so shift out
- * inactive CU
- */
- cu_sibling_map_mask =
- cu_sibling_map_mask >> (first_active_cu - 1);
-
- pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
- pcache->sibling_map[1] =
- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
- pcache->sibling_map[2] =
- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
- pcache->sibling_map[3] =
- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
- return 0;
- }
- return 1;
-}
-
-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
-static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
- struct kfd_gpu_cache_info *pcache_info,
- struct kfd_cu_info *cu_info,
- int mem_available,
- int cache_type, unsigned int cu_processor_id)
-{
- unsigned int cu_sibling_map_mask;
- int first_active_cu;
- int i, j, k;
-
- /* First check if enough memory is available */
- if (sizeof(struct crat_subtype_cache) > mem_available)
- return -ENOMEM;
-
- cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
- cu_sibling_map_mask &=
- ((1 << pcache_info[cache_type].num_cu_shared) - 1);
- first_active_cu = ffs(cu_sibling_map_mask);
-
- /* CU could be inactive. In case of shared cache find the first active
- * CU. and incase of non-shared cache check if the CU is inactive. If
- * inactive active skip it
- */
- if (first_active_cu) {
- memset(pcache, 0, sizeof(struct crat_subtype_cache));
- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
- pcache->length = sizeof(struct crat_subtype_cache);
- pcache->flags = pcache_info[cache_type].flags;
- pcache->processor_id_low = cu_processor_id
- + (first_active_cu - 1);
- pcache->cache_level = pcache_info[cache_type].cache_level;
- pcache->cache_size = pcache_info[cache_type].cache_size;
-
- /* Sibling map is w.r.t processor_id_low, so shift out
- * inactive CU
- */
- cu_sibling_map_mask =
- cu_sibling_map_mask >> (first_active_cu - 1);
- k = 0;
- for (i = 0; i < cu_info->num_shader_engines; i++) {
- for (j = 0; j < cu_info->num_shader_arrays_per_engine;
- j++) {
- pcache->sibling_map[k] =
- (uint8_t)(cu_sibling_map_mask & 0xFF);
- pcache->sibling_map[k+1] =
- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
- pcache->sibling_map[k+2] =
- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
- pcache->sibling_map[k+3] =
- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
- k += 4;
- cu_sibling_map_mask =
- cu_info->cu_bitmap[i % 4][j + i / 4];
- cu_sibling_map_mask &= (
- (1 << pcache_info[cache_type].num_cu_shared)
- - 1);
- }
- }
- return 0;
- }
- return 1;
-}
-
#define KFD_MAX_CACHE_TYPES 6
static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
@@ -1440,225 +1316,129 @@ static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
return i;
}
-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
- * tables
- *
- * @kdev - [IN] GPU device
- * @gpu_processor_id - [IN] GPU processor ID to which these caches
- * associate
- * @available_size - [IN] Amount of memory available in pcache
- * @cu_info - [IN] Compute Unit info obtained from KGD
- * @pcache - [OUT] memory into which cache data is to be filled in.
- * @size_filled - [OUT] amount of data used up in pcache.
- * @num_of_entries - [OUT] number of caches added
- */
-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
- int gpu_processor_id,
- int available_size,
- struct kfd_cu_info *cu_info,
- struct crat_subtype_cache *pcache,
- int *size_filled,
- int *num_of_entries)
+int get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info)
{
- struct kfd_gpu_cache_info *pcache_info;
struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
int num_of_cache_types = 0;
- int i, j, k;
- int ct = 0;
- int mem_available = available_size;
- unsigned int cu_processor_id;
- int ret;
- unsigned int num_cu_shared;
switch (kdev->adev->asic_type) {
case CHIP_KAVERI:
- pcache_info = kaveri_cache_info;
+ *pcache_info = kaveri_cache_info;
num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
break;
case CHIP_HAWAII:
- pcache_info = hawaii_cache_info;
+ *pcache_info = hawaii_cache_info;
num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
break;
case CHIP_CARRIZO:
- pcache_info = carrizo_cache_info;
+ *pcache_info = carrizo_cache_info;
num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
break;
case CHIP_TONGA:
- pcache_info = tonga_cache_info;
+ *pcache_info = tonga_cache_info;
num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
break;
case CHIP_FIJI:
- pcache_info = fiji_cache_info;
+ *pcache_info = fiji_cache_info;
num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
break;
case CHIP_POLARIS10:
- pcache_info = polaris10_cache_info;
+ *pcache_info = polaris10_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
break;
case CHIP_POLARIS11:
- pcache_info = polaris11_cache_info;
+ *pcache_info = polaris11_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
break;
case CHIP_POLARIS12:
- pcache_info = polaris12_cache_info;
+ *pcache_info = polaris12_cache_info;
num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
break;
case CHIP_VEGAM:
- pcache_info = vegam_cache_info;
+ *pcache_info = vegam_cache_info;
num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
break;
default:
switch (KFD_GC_VERSION(kdev)) {
case IP_VERSION(9, 0, 1):
- pcache_info = vega10_cache_info;
+ *pcache_info = vega10_cache_info;
num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
break;
case IP_VERSION(9, 2, 1):
- pcache_info = vega12_cache_info;
+ *pcache_info = vega12_cache_info;
num_of_cache_types = ARRAY_SIZE(vega12_cache_info);
break;
case IP_VERSION(9, 4, 0):
case IP_VERSION(9, 4, 1):
- pcache_info = vega20_cache_info;
+ *pcache_info = vega20_cache_info;
num_of_cache_types = ARRAY_SIZE(vega20_cache_info);
break;
case IP_VERSION(9, 4, 2):
- pcache_info = aldebaran_cache_info;
+ *pcache_info = aldebaran_cache_info;
num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info);
break;
case IP_VERSION(9, 1, 0):
case IP_VERSION(9, 2, 2):
- pcache_info = raven_cache_info;
+ *pcache_info = raven_cache_info;
num_of_cache_types = ARRAY_SIZE(raven_cache_info);
break;
case IP_VERSION(9, 3, 0):
- pcache_info = renoir_cache_info;
+ *pcache_info = renoir_cache_info;
num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
break;
case IP_VERSION(10, 1, 10):
case IP_VERSION(10, 1, 2):
case IP_VERSION(10, 1, 3):
case IP_VERSION(10, 1, 4):
- pcache_info = navi10_cache_info;
+ *pcache_info = navi10_cache_info;
num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
break;
case IP_VERSION(10, 1, 1):
- pcache_info = navi14_cache_info;
+ *pcache_info = navi14_cache_info;
num_of_cache_types = ARRAY_SIZE(navi14_cache_info);
break;
case IP_VERSION(10, 3, 0):
- pcache_info = sienna_cichlid_cache_info;
+ *pcache_info = sienna_cichlid_cache_info;
num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info);
break;
case IP_VERSION(10, 3, 2):
- pcache_info = navy_flounder_cache_info;
+ *pcache_info = navy_flounder_cache_info;
num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info);
break;
case IP_VERSION(10, 3, 4):
- pcache_info = dimgrey_cavefish_cache_info;
+ *pcache_info = dimgrey_cavefish_cache_info;
num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info);
break;
case IP_VERSION(10, 3, 1):
- pcache_info = vangogh_cache_info;
+ *pcache_info = vangogh_cache_info;
num_of_cache_types = ARRAY_SIZE(vangogh_cache_info);
break;
case IP_VERSION(10, 3, 5):
- pcache_info = beige_goby_cache_info;
+ *pcache_info = beige_goby_cache_info;
num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info);
break;
case IP_VERSION(10, 3, 3):
case IP_VERSION(10, 3, 7): /* TODO: Double check these on production silicon */
- pcache_info = yellow_carp_cache_info;
+ *pcache_info = yellow_carp_cache_info;
num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
break;
case IP_VERSION(10, 3, 6):
- pcache_info = gc_10_3_6_cache_info;
+ *pcache_info = gc_10_3_6_cache_info;
num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info);
break;
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 1):
case IP_VERSION(11, 0, 2):
case IP_VERSION(11, 0, 3):
- pcache_info = cache_info;
+ *pcache_info = cache_info;
num_of_cache_types =
- kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
+ kfd_fill_gpu_cache_info_from_gfx_config(kdev, *pcache_info);
break;
default:
return -EINVAL;
}
}
-
- *size_filled = 0;
- *num_of_entries = 0;
-
- /* For each type of cache listed in the kfd_gpu_cache_info table,
- * go through all available Compute Units.
- * The [i,j,k] loop will
- * if kfd_gpu_cache_info.num_cu_shared = 1
- * will parse through all available CU
- * If (kfd_gpu_cache_info.num_cu_shared != 1)
- * then it will consider only one CU from
- * the shared unit
- */
-
- for (ct = 0; ct < num_of_cache_types; ct++) {
- cu_processor_id = gpu_processor_id;
- if (pcache_info[ct].cache_level == 1) {
- for (i = 0; i < cu_info->num_shader_engines; i++) {
- for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
- for (k = 0; k < cu_info->num_cu_per_sh;
- k += pcache_info[ct].num_cu_shared) {
- ret = fill_in_l1_pcache(pcache,
- pcache_info,
- cu_info,
- mem_available,
- cu_info->cu_bitmap[i % 4][j + i / 4],
- ct,
- cu_processor_id,
- k);
-
- if (ret < 0)
- break;
-
- if (!ret) {
- pcache++;
- (*num_of_entries)++;
- mem_available -= sizeof(*pcache);
- (*size_filled) += sizeof(*pcache);
- }
-
- /* Move to next CU block */
- num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
- cu_info->num_cu_per_sh) ?
- pcache_info[ct].num_cu_shared :
- (cu_info->num_cu_per_sh - k);
- cu_processor_id += num_cu_shared;
- }
- }
- }
- } else {
- ret = fill_in_l2_l3_pcache(pcache,
- pcache_info,
- cu_info,
- mem_available,
- ct,
- cu_processor_id);
-
- if (ret < 0)
- break;
-
- if (!ret) {
- pcache++;
- (*num_of_entries)++;
- mem_available -= sizeof(*pcache);
- (*size_filled) += sizeof(*pcache);
- }
- }
- }
-
- pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
-
- return 0;
+ return num_of_cache_types;
}
static bool kfd_ignore_crat(void)
@@ -2228,8 +2008,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
struct crat_subtype_computeunit *cu;
struct kfd_cu_info cu_info;
int avail_size = *size;
- int num_of_cache_entries = 0;
- int cache_mem_filled = 0;
uint32_t nid = 0;
int ret = 0;
@@ -2332,31 +2110,12 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
crat_table->length += sizeof(struct crat_subtype_memory);
crat_table->total_entries++;
- /* TODO: Fill in cache information. This information is NOT readily
- * available in KGD
- */
- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
- sub_type_hdr->length);
- ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
- avail_size,
- &cu_info,
- (struct crat_subtype_cache *)sub_type_hdr,
- &cache_mem_filled,
- &num_of_cache_entries);
-
- if (ret < 0)
- return ret;
-
- crat_table->length += cache_mem_filled;
- crat_table->total_entries += num_of_cache_entries;
- avail_size -= cache_mem_filled;
-
/* Fill in Subtype: IO_LINKS
* Only direct links are added here which is Link from GPU to
* its NUMA node. Indirect links are added by userspace.
*/
sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
- cache_mem_filled);
+ sub_type_hdr->length);
ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index fbb5f6c32ef5..bfd3cdeff608 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -317,6 +317,18 @@ struct cdit_header {
struct kfd_dev;
+/* Static table to describe GPU Cache information */
+struct kfd_gpu_cache_info {
+ uint32_t cache_size;
+ uint32_t cache_level;
+ uint32_t flags;
+ /* Indicates how many Compute Units share this cache
+ * within a SA. Value = 1 indicates the cache is not shared
+ */
+ uint32_t num_cu_shared;
+};
+int get_gpu_cache_info(struct kfd_dev *kdev, struct kfd_gpu_cache_info **pcache_info);
+
#ifdef CONFIG_ACPI
int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
#endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index e0680d265a66..5fc0c2c1467b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -387,7 +387,6 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
/* Making sure that the buffer is an empty string */
buffer[0] = 0;
-
cache = container_of(attr, struct kfd_cache_properties, attr);
if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu))
return -EPERM;
@@ -402,12 +401,13 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc);
sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency);
sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type);
+
offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map ");
- for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
+ for (i = 0; i < cache->sibling_map_size; i++)
for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++)
/* Check each bit */
offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,",
- (cache->sibling_map[i] >> j) & 1);
+ (cache->sibling_map[i] >> j) & 1);
/* Replace the last "," with end of line */
buffer[offs-1] = '\n';
@@ -1721,6 +1721,217 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev)
dev->node_props.capability |= HSA_CAP_TRAP_DEBUG_FIRMWARE_SUPPORTED;
}
+
+/* Helper for kfd_fill_cache_non_crat_info: fills one L1 cache properties entry */
+static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
+ struct kfd_gpu_cache_info *pcache_info,
+ struct kfd_cu_info *cu_info,
+ int cu_bitmask,
+ int cache_type, unsigned int cu_processor_id,
+ int cu_block)
+{
+ unsigned int cu_sibling_map_mask;
+ int first_active_cu;
+ struct kfd_cache_properties *pcache = NULL;
+
+ cu_sibling_map_mask = cu_bitmask;
+ cu_sibling_map_mask >>= cu_block;
+ cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+ first_active_cu = ffs(cu_sibling_map_mask);
+
+ /* A CU could be inactive. For a shared cache, find the first active
+ * CU; for a non-shared cache, check whether the CU is inactive and,
+ * if so, skip it.
+ */
+ if (first_active_cu) {
+ pcache = kfd_alloc_struct(pcache);
+ if (!pcache)
+ return -ENOMEM;
+
+ memset(pcache, 0, sizeof(struct kfd_cache_properties));
+ pcache->processor_id_low = cu_processor_id + (first_active_cu - 1);
+ pcache->cache_level = pcache_info[cache_type].cache_level;
+ pcache->cache_size = pcache_info[cache_type].cache_size;
+
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_DATA;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_CPU;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+ /* Sibling map is w.r.t processor_id_low, so shift out
+ * inactive CU
+ */
+ cu_sibling_map_mask =
+ cu_sibling_map_mask >> (first_active_cu - 1);
+
+ pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+ pcache->sibling_map[1] =
+ (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+ pcache->sibling_map[2] =
+ (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+ pcache->sibling_map[3] =
+ (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+
+ pcache->sibling_map_size = 4;
+ *props_ext = pcache;
+
+ return 0;
+ }
+ return 1;
+}
+
+/* Helper for kfd_fill_cache_non_crat_info: fills one L2/L3 cache properties entry */
+static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
+ struct kfd_gpu_cache_info *pcache_info,
+ struct kfd_cu_info *cu_info,
+ int cache_type, unsigned int cu_processor_id)
+{
+ unsigned int cu_sibling_map_mask;
+ int first_active_cu;
+ int i, j, k;
+ struct kfd_cache_properties *pcache = NULL;
+
+ cu_sibling_map_mask = cu_info->cu_bitmap[0][0];
+ cu_sibling_map_mask &=
+ ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+ first_active_cu = ffs(cu_sibling_map_mask);
+
+ /* A CU could be inactive. For a shared cache, find the first active
+ * CU; for a non-shared cache, check whether the CU is inactive and,
+ * if so, skip it.
+ */
+ if (first_active_cu) {
+ pcache = kfd_alloc_struct(pcache);
+ if (!pcache)
+ return -ENOMEM;
+
+ memset(pcache, 0, sizeof(struct kfd_cache_properties));
+ pcache->processor_id_low = cu_processor_id
+ + (first_active_cu - 1);
+ pcache->cache_level = pcache_info[cache_type].cache_level;
+ pcache->cache_size = pcache_info[cache_type].cache_size;
+
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_DATA;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_INST_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_CPU;
+ if (pcache_info[cache_type].flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+ pcache->cache_type |= HSA_CACHE_TYPE_HSACU;
+
+ /* Sibling map is w.r.t processor_id_low, so shift out
+ * inactive CU
+ */
+ cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
+ k = 0;
+
+ for (i = 0; i < cu_info->num_shader_engines; i++) {
+ for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
+ pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+ pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+ pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+ pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+ k += 4;
+
+ cu_sibling_map_mask = cu_info->cu_bitmap[i % 4][j + i / 4];
+ cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+ }
+ }
+ pcache->sibling_map_size = k;
+ *props_ext = pcache;
+ return 0;
+ }
+ return 1;
+}
+
+/* kfd_fill_cache_non_crat_info - Fill GPU cache info using kfd_gpu_cache_info
+ * tables
+ */
+void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_dev *kdev)
+{
+ struct kfd_gpu_cache_info *pcache_info = NULL;
+ int i, j, k;
+ int ct = 0;
+ unsigned int cu_processor_id;
+ int ret;
+ unsigned int num_cu_shared;
+ struct kfd_cu_info cu_info;
+ struct kfd_cu_info *pcu_info;
+ int gpu_processor_id;
+ struct kfd_cache_properties *props_ext;
+ int num_of_entries = 0;
+ int num_of_cache_types = 0;
+
+ amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
+ pcu_info = &cu_info;
+
+ gpu_processor_id = kdev->processor_id_low;
+
+ num_of_cache_types = get_gpu_cache_info(kdev, &pcache_info);
+ if (!num_of_cache_types) {
+ pr_warn("no cache info found\n");
+ return;
+ }
+
+ /* For each type of cache listed in the kfd_gpu_cache_info table,
+ * go through all available Compute Units.
+ * The [i,j,k] loop will
+ * if kfd_gpu_cache_info.num_cu_shared = 1
+ * will parse through all available CU
+ * If (kfd_gpu_cache_info.num_cu_shared != 1)
+ * then it will consider only one CU from
+ * the shared unit
+ */
+ for (ct = 0; ct < num_of_cache_types; ct++) {
+ cu_processor_id = gpu_processor_id;
+ if (pcache_info[ct].cache_level == 1) {
+ for (i = 0; i < pcu_info->num_shader_engines; i++) {
+ for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
+ for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
+
+ ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
+ pcu_info->cu_bitmap[i % 4][j + i / 4], ct,
+ cu_processor_id, k);
+
+ if (ret < 0)
+ break;
+
+ if (!ret) {
+ num_of_entries++;
+ list_add_tail(&props_ext->list, &dev->cache_props);
+ }
+
+ /* Move to next CU block */
+ num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
+ pcu_info->num_cu_per_sh) ?
+ pcache_info[ct].num_cu_shared :
+ (pcu_info->num_cu_per_sh - k);
+ cu_processor_id += num_cu_shared;
+ }
+ }
+ }
+ } else {
+ ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
+ pcu_info, ct, cu_processor_id);
+
+ if (ret < 0)
+ break;
+
+ if (!ret) {
+ num_of_entries++;
+ list_add_tail(&props_ext->list, &dev->cache_props);
+ }
+ }
+ }
+ dev->node_props.caches_count += num_of_entries;
+ pr_debug("Added [%d] GPU cache entries\n", num_of_entries);
+}
+
int kfd_topology_add_device(struct kfd_dev *gpu)
{
uint32_t gpu_id;
@@ -1759,6 +1970,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
topology_crat_proximity_domain--;
return res;
}
+
res = kfd_parse_crat_table(crat_image,
&temp_topology_device_list,
proximity_domain);
@@ -1771,23 +1983,31 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
kfd_topology_update_device_list(&temp_topology_device_list,
&topology_device_list);
+ up_write(&topology_lock);
+
+ dev = kfd_assign_gpu(gpu);
+ if (WARN_ON(!dev)) {
+ res = -ENODEV;
+ goto err;
+ }
+
+ down_write(&topology_lock);
+
+ /* Fill the cache affinity information here for the GPUs
+ * using VCRAT
+ */
+ kfd_fill_cache_non_crat_info(dev, gpu);
/* Update the SYSFS tree, since we added another topology
* device
*/
res = kfd_topology_update_sysfs();
up_write(&topology_lock);
-
if (!res)
sys_props.generation_count++;
else
pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
gpu_id, res);
- dev = kfd_assign_gpu(gpu);
- if (WARN_ON(!dev)) {
- res = -ENODEV;
- goto err;
- }
}
dev->gpu_id = gpu_id;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index dc4e239c8f8f..3e8ac87f0ac9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -87,6 +87,8 @@ struct kfd_mem_properties {
struct attribute attr_used;
};
+#define CACHE_SIBLINGMAP_SIZE 64
+
struct kfd_cache_properties {
struct list_head list;
uint32_t processor_id_low;
@@ -97,10 +99,11 @@ struct kfd_cache_properties {
uint32_t cache_assoc;
uint32_t cache_latency;
uint32_t cache_type;
- uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
+ uint8_t sibling_map[CACHE_SIBLINGMAP_SIZE];
struct kfd_dev *gpu;
struct kobject *kobj;
struct attribute attr;
+ uint32_t sibling_map_size;
};
struct kfd_iolink_properties {
--
2.25.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id
2022-10-26 9:03 [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Ma Jun
2022-10-26 9:03 ` [PATCH v2 2/2] drm/amdkfd: Fix the warning of array-index-out-of-bounds Ma Jun
@ 2022-10-26 19:07 ` Felix Kuehling
2022-10-27 5:16 ` Ma, Jun
1 sibling, 1 reply; 4+ messages in thread
From: Felix Kuehling @ 2022-10-26 19:07 UTC (permalink / raw)
To: Ma Jun, amd-gfx, alexander.deucher; +Cc: guchun.chen
On 2022-10-26 05:03, Ma Jun wrote:
> Init and save the base cu processor id for later use
>
> Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 20 +++++++++++++-------
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++
> 2 files changed, 16 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index d25ac9cbe5b2..35d09e29aafb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -41,13 +41,18 @@ static uint32_t gpu_processor_id_low = 0x80001000;
> * @total_cu_count - Total CUs present in the GPU including ones
> * masked off
> */
> -static inline unsigned int get_and_inc_gpu_processor_id(
> - unsigned int total_cu_count)
> +static inline void assign_gpu_processor_id(struct kfd_dev *kfd)
> {
> - int current_id = gpu_processor_id_low;
> + struct amdgpu_device *adev = kfd->adev;
> + unsigned int array_count = 0;
> + unsigned int total_cu_count = 0;
> +
> + kfd->processor_id_low = gpu_processor_id_low;
This still seems backwards. And I don't think you need it. You can
access this after the topology has been parsed from the
kfd_topology_device (dev->node_props.simd_id_base). I see you're using
it in the next patch in kfd_fill_cache_non_crat_info, which has a
kfd_topology_device parameter. This should run after parsing the CRAT
tables, so the info should be available at that time.
Regards,
Felix
> +
> + array_count = adev->gfx.config.max_sh_per_se * adev->gfx.config.max_shader_engines;
> + total_cu_count = array_count * adev->gfx.config.max_cu_per_sh;
>
> gpu_processor_id_low += total_cu_count;
> - return current_id;
> }
>
> /* Static table to describe GPU Cache information */
> @@ -2223,7 +2228,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
> struct crat_subtype_computeunit *cu;
> struct kfd_cu_info cu_info;
> int avail_size = *size;
> - uint32_t total_num_of_cu;
> int num_of_cache_entries = 0;
> int cache_mem_filled = 0;
> uint32_t nid = 0;
> @@ -2275,8 +2279,10 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
> cu->wave_front_size = cu_info.wave_front_size;
> cu->array_count = cu_info.num_shader_arrays_per_engine *
> cu_info.num_shader_engines;
> - total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
> - cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
> +
> + assign_gpu_processor_id(kdev);
> + cu->processor_id_low = kdev->processor_id_low;
> +
> cu->num_cu_per_array = cu_info.num_cu_per_sh;
> cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
> cu->num_banks = cu_info.num_shader_engines;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 182eb67edbc5..4c06b233472f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -370,6 +370,9 @@ struct kfd_dev {
>
> /* Track per device allocated watch points. */
> uint32_t alloc_watch_ids;
> +
> + /* cu processor id base */
> + unsigned int processor_id_low;
> };
>
> struct kfd_ipc_obj;
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id
2022-10-26 19:07 ` [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Felix Kuehling
@ 2022-10-27 5:16 ` Ma, Jun
0 siblings, 0 replies; 4+ messages in thread
From: Ma, Jun @ 2022-10-27 5:16 UTC (permalink / raw)
To: Felix Kuehling, Ma Jun, amd-gfx, alexander.deucher; +Cc: guchun.chen
Hi Felix,
On 10/27/2022 3:07 AM, Felix Kuehling wrote:
> On 2022-10-26 05:03, Ma Jun wrote:
>> Init and save the base cu processor id for later use
>>
>> Signed-off-by: Ma Jun <Jun.Ma2@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 20 +++++++++++++-------
>> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++
>> 2 files changed, 16 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> index d25ac9cbe5b2..35d09e29aafb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> @@ -41,13 +41,18 @@ static uint32_t gpu_processor_id_low = 0x80001000;
>> * @total_cu_count - Total CUs present in the GPU including ones
>> * masked off
>> */
>> -static inline unsigned int get_and_inc_gpu_processor_id(
>> - unsigned int total_cu_count)
>> +static inline void assign_gpu_processor_id(struct kfd_dev *kfd)
>> {
>> - int current_id = gpu_processor_id_low;
>> + struct amdgpu_device *adev = kfd->adev;
>> + unsigned int array_count = 0;
>> + unsigned int total_cu_count = 0;
>> +
>> + kfd->processor_id_low = gpu_processor_id_low;
>
> This still seems backwards. And I don't think you need it. You can
> access this after the topology has been parsed from the
> kfd_topology_device (dev->node_props.simd_id_base). I see you're using
> it in the next patch in kfd_fill_cache_non_crat_info, which has a
> kfd_topology_device parameter. This should run after parsing the CRAT
> tables, so the info should be available at that time.
Thanks. I will use dev->node_props.simd_id_base in the second patch and drop
this one.
Regards
Ma Jun
>
> Regards,
> Felix
>
>
>> +
>> + array_count = adev->gfx.config.max_sh_per_se * adev->gfx.config.max_shader_engines;
>> + total_cu_count = array_count * adev->gfx.config.max_cu_per_sh;
>>
>> gpu_processor_id_low += total_cu_count;
>> - return current_id;
>> }
>>
>> /* Static table to describe GPU Cache information */
>> @@ -2223,7 +2228,6 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>> struct crat_subtype_computeunit *cu;
>> struct kfd_cu_info cu_info;
>> int avail_size = *size;
>> - uint32_t total_num_of_cu;
>> int num_of_cache_entries = 0;
>> int cache_mem_filled = 0;
>> uint32_t nid = 0;
>> @@ -2275,8 +2279,10 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
>> cu->wave_front_size = cu_info.wave_front_size;
>> cu->array_count = cu_info.num_shader_arrays_per_engine *
>> cu_info.num_shader_engines;
>> - total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
>> - cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
>> +
>> + assign_gpu_processor_id(kdev);
>> + cu->processor_id_low = kdev->processor_id_low;
>> +
>> cu->num_cu_per_array = cu_info.num_cu_per_sh;
>> cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
>> cu->num_banks = cu_info.num_shader_engines;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 182eb67edbc5..4c06b233472f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -370,6 +370,9 @@ struct kfd_dev {
>>
>> /* Track per device allocated watch points. */
>> uint32_t alloc_watch_ids;
>> +
>> + /* cu processor id base */
>> + unsigned int processor_id_low;
>> };
>>
>> struct kfd_ipc_obj;
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-10-27 5:17 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-26 9:03 [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Ma Jun
2022-10-26 9:03 ` [PATCH v2 2/2] drm/amdkfd: Fix the warning of array-index-out-of-bounds Ma Jun
2022-10-26 19:07 ` [PATCH v2 1/2] drm/amdkfd: Init the base cu processor id Felix Kuehling
2022-10-27 5:16 ` Ma, Jun
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.