All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH libdrm 2/3] amdgpu: add ras tests
@ 2019-03-19  3:46 Pan, Xinhui
  0 siblings, 0 replies; only message in thread
From: Pan, Xinhui @ 2019-03-19  3:46 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Deucher, Alexander, Xu, Feifei, Zhang, Hawking

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
---
 tests/amdgpu/Makefile.am   |   3 +-
 tests/amdgpu/amdgpu_test.c |  11 +
 tests/amdgpu/amdgpu_test.h |  22 ++
 tests/amdgpu/meson.build   |   2 +-
 tests/amdgpu/ras_tests.c   | 594 +++++++++++++++++++++++++++++++++++++
 5 files changed, 630 insertions(+), 2 deletions(-)
 create mode 100644 tests/amdgpu/ras_tests.c

diff --git a/tests/amdgpu/Makefile.am b/tests/amdgpu/Makefile.am
index 447ff217..48278848 100644
--- a/tests/amdgpu/Makefile.am
+++ b/tests/amdgpu/Makefile.am
@@ -33,4 +33,5 @@ amdgpu_test_SOURCES = \
 	vcn_tests.c \
 	uve_ib.h \
 	deadlock_tests.c \
-	vm_tests.c
+	vm_tests.c	\
+	ras_tests.c
diff --git a/tests/amdgpu/amdgpu_test.c b/tests/amdgpu/amdgpu_test.c
index a793ca7d..8fc7a0b9 100644
--- a/tests/amdgpu/amdgpu_test.c
+++ b/tests/amdgpu/amdgpu_test.c
@@ -56,6 +56,7 @@
 #define UVD_ENC_TESTS_STR "UVD ENC Tests"
 #define DEADLOCK_TESTS_STR "Deadlock Tests"
 #define VM_TESTS_STR "VM Tests"
+#define RAS_TESTS_STR "RAS Tests"
 
 /**
  *  Open handles for amdgpu devices
@@ -116,6 +117,12 @@ static CU_SuiteInfo suites[] = {
 		.pCleanupFunc = suite_vm_tests_clean,
 		.pTests = vm_tests,
 	},
+	{
+		.pName = RAS_TESTS_STR,
+		.pInitFunc = suite_ras_tests_init,
+		.pCleanupFunc = suite_ras_tests_clean,
+		.pTests = ras_tests,
+	},
 
 	CU_SUITE_INFO_NULL,
 };
@@ -165,6 +172,10 @@ static Suites_Active_Status suites_active_stat[] = {
 			.pName = VM_TESTS_STR,
 			.pActive = suite_vm_tests_enable,
 		},
+		{
+			.pName = RAS_TESTS_STR,
+			.pActive = suite_ras_tests_enable,
+		},
 };
 
 
diff --git a/tests/amdgpu/amdgpu_test.h b/tests/amdgpu/amdgpu_test.h
index af81eea8..bcd0bc7e 100644
--- a/tests/amdgpu/amdgpu_test.h
+++ b/tests/amdgpu/amdgpu_test.h
@@ -194,6 +194,28 @@ CU_BOOL suite_vm_tests_enable(void);
  */
 extern CU_TestInfo vm_tests[];
 
+
+/**
+ * Initialize ras test suite
+ */
+int suite_ras_tests_init();
+
+/**
+ * Deinitialize ras test suite
+ */
+int suite_ras_tests_clean();
+
+/**
+ * Decide if the suite is enabled by default or not.
+ */
+CU_BOOL suite_ras_tests_enable(void);
+
+/**
+ * Tests in ras test suite
+ */
+extern CU_TestInfo ras_tests[];
+
+
 /**
  * Helper functions
  */
diff --git a/tests/amdgpu/meson.build b/tests/amdgpu/meson.build
index 4c1237c6..95ed9305 100644
--- a/tests/amdgpu/meson.build
+++ b/tests/amdgpu/meson.build
@@ -24,7 +24,7 @@ if dep_cunit.found()
     files(
       'amdgpu_test.c', 'basic_tests.c', 'bo_tests.c', 'cs_tests.c',
       'vce_tests.c', 'uvd_enc_tests.c', 'vcn_tests.c', 'deadlock_tests.c',
-      'vm_tests.c',
+      'vm_tests.c', 'ras_tests.c',
     ),
     dependencies : [dep_cunit, dep_threads],
     include_directories : [inc_root, inc_drm, include_directories('../../amdgpu')],
diff --git a/tests/amdgpu/ras_tests.c b/tests/amdgpu/ras_tests.c
new file mode 100644
index 00000000..989eb153
--- /dev/null
+++ b/tests/amdgpu/ras_tests.c
@@ -0,0 +1,594 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+*/
+
+#include "CUnit/Basic.h"
+
+#include "amdgpu_test.h"
+#include "amdgpu_drm.h"
+#include "amdgpu_internal.h"
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include "xf86drm.h"
+
+/*
+ * Lower-case block names, indexed by enum amdgpu_ras_block. They are used
+ * (through ras_block_str()) to build the per-block "<name>_err_count" and
+ * "<name>_err_inject" file names.
+ */
+const char *ras_block_string[] = {
+	"umc",
+	"sdma",
+	"gfx",
+	"mmhub",
+	"athub",
+	"pcie_bif",
+	"hdp",
+	"xgmi_wafl",
+	"df",
+	"smn",
+	"sem",
+	"mp0",
+	"mp1",
+	"fuse",
+};
+
+#define ras_block_str(i) (ras_block_string[i])
+
+/*
+ * RAS blocks. The numeric value of each enumerator is used both as the bit
+ * position in the feature/capability masks ((1 << block)) and as the index
+ * into ras_block_string[].
+ */
+enum amdgpu_ras_block {
+	AMDGPU_RAS_BLOCK__UMC = 0,
+	AMDGPU_RAS_BLOCK__SDMA,
+	AMDGPU_RAS_BLOCK__GFX,
+	AMDGPU_RAS_BLOCK__MMHUB,
+	AMDGPU_RAS_BLOCK__ATHUB,
+	AMDGPU_RAS_BLOCK__PCIE_BIF,
+	AMDGPU_RAS_BLOCK__HDP,
+	AMDGPU_RAS_BLOCK__XGMI_WAFL,
+	AMDGPU_RAS_BLOCK__DF,
+	AMDGPU_RAS_BLOCK__SMN,
+	AMDGPU_RAS_BLOCK__SEM,
+	AMDGPU_RAS_BLOCK__MP0,
+	AMDGPU_RAS_BLOCK__MP1,
+	AMDGPU_RAS_BLOCK__FUSE,
+
+	/* not a block: number of blocks, used as the loop bound */
+	AMDGPU_RAS_BLOCK__LAST
+};
+
+#define AMDGPU_RAS_BLOCK_COUNT  AMDGPU_RAS_BLOCK__LAST
+#define AMDGPU_RAS_BLOCK_MASK   ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
+
+/*
+ * Error type stored in ras_common_if.type. The tests in this file only use
+ * AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE.
+ * NOTE(review): values presumably mirror the kernel side definition - confirm.
+ */
+enum amdgpu_ras_error_type {
+	AMDGPU_RAS_ERROR__NONE				= 0,
+	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE		= 2,
+	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE		= 4,
+	AMDGPU_RAS_ERROR__POISON			= 8,
+};
+
+/* Common request header: which block and error type a request refers to. */
+struct ras_common_if {
+	enum amdgpu_ras_block block;
+	enum amdgpu_ras_error_type type;
+	/* always 0 in the tests of this file */
+	uint32_t sub_block_index;
+	/* always the empty string in the tests of this file */
+	char name[32];
+};
+
+/* Error injection request: the common header plus an address/value pair. */
+struct ras_inject_if {
+	struct ras_common_if head;
+	uint64_t address;
+	uint64_t value;
+};
+
+/*
+ * Request written as raw bytes to the debugfs "ras_ctrl" file by
+ * amdgpu_ras_invoke().
+ */
+struct ras_debug_if {
+	union {
+		struct ras_common_if head;
+		struct ras_inject_if inject;
+	};
+	/* as used by the tests in this file: 0 = disable, 1 = enable, 2 = inject */
+	int op;
+};
+/* for now, only umc, gfx and sdma are implemented. */
+static uint32_t ras_block_mask_inject_query = (1 << AMDGPU_RAS_BLOCK__UMC);
+
+static uint32_t ras_block_mask_basic = (1 << AMDGPU_RAS_BLOCK__UMC)
+				| (1 << AMDGPU_RAS_BLOCK__SDMA)
+				| (1 << AMDGPU_RAS_BLOCK__GFX);
+
+/* Per-device state collected by suite_ras_tests_init(). */
+struct amdgpu_ras_data {
+	/* handle obtained from amdgpu_device_initialize() */
+	amdgpu_device_handle device_handle;
+	/* index returned by amdgpu_ras_lookup_id(); used to build the sysfs/debugfs paths */
+	uint32_t  id;
+	/* mask returned by amdgpu_ras_lookup_capability(), one bit per block */
+	uint32_t  capability;
+};
+
+/* all devices that support ras */
+static struct amdgpu_ras_data devices[MAX_CARDS_SUPPORTED];
+static int devices_count;
+
+/**
+ * Query AMDGPU_INFO_RAS_ENABLED_FEATURES and return the "supported" half of
+ * the 64 bit answer. Returns 0 when the query fails.
+ */
+static uint32_t amdgpu_ras_lookup_capability(amdgpu_device_handle device_handle)
+{
+	/* the 64 bit answer viewed as two 32 bit words */
+	union {
+		uint64_t feature_mask;
+		struct {
+			uint32_t enabled_features;
+			uint32_t supported_features;
+		};
+	} info = { 0 };
+
+	if (amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
+			sizeof(info), &info) != 0)
+		return 0;
+
+	return info.supported_features;
+}
+
+static int get_file_contents(char *file, char *buf, int size);
+
+/**
+ * Find the debugfs dri index that belongs to a PCI device.
+ *
+ * Reads /sys/kernel/debug/dri/<i>/name for every i below MAX_CARDS_SUPPORTED
+ * and parses the "amdgpu dev=<domain>:<bus>:<dev>.<func>" text found there.
+ *
+ * device: DRM device whose businfo.pci is matched against the parsed address.
+ *
+ * Returns the matching index, or -1 when no entry matches.
+ */
+static int amdgpu_ras_lookup_id(drmDevicePtr device)
+{
+	char path[1024];
+	char str[128];
+	drmPciBusInfo info;
+	int i;
+	int ret;
+
+	for (i = 0; i < MAX_CARDS_SUPPORTED; i++) {
+		memset(str, 0, sizeof(str));
+		snprintf(path, sizeof(path), "/sys/kernel/debug/dri/%d/name", i);
+		/* read one byte less than the buffer so str stays NUL terminated */
+		if (get_file_contents(path, str, sizeof(str) - 1) <= 0)
+			continue;
+
+		ret = sscanf(str, "amdgpu dev=%04hx:%02hhx:%02hhx.%01hhx",
+				&info.domain, &info.bus, &info.dev, &info.func);
+		if (ret != 4)
+			continue;
+
+		/*
+		 * Compare field by field: info is an uninitialized local, so a
+		 * memcmp() over the whole struct would also compare any bytes
+		 * that sscanf() never wrote.
+		 */
+		if (info.domain == device->businfo.pci->domain &&
+		    info.bus == device->businfo.pci->bus &&
+		    info.dev == device->businfo.pci->dev &&
+		    info.func == device->businfo.pci->func)
+			return i;
+	}
+	return -1;
+}
+
+/**
+ * Decide whether the RAS suite is enabled.
+ *
+ * Returns CU_TRUE as soon as one opened amdgpu device sits on the PCI bus and
+ * reports a non-zero RAS capability, CU_FALSE otherwise. Every device handle
+ * and DRM device obtained while probing is released again before returning.
+ */
+CU_BOOL suite_ras_tests_enable(void)
+{
+	amdgpu_device_handle device_handle;
+	uint32_t  major_version;
+	uint32_t  minor_version;
+	int i;
+	drmDevicePtr device;
+	CU_BOOL enable = CU_FALSE;
+
+	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
+		if (amdgpu_device_initialize(drm_amdgpu[i], &major_version,
+					&minor_version, &device_handle))
+			continue;
+
+		if (drmGetDevice2(drm_amdgpu[i],
+					DRM_DEVICE_GET_PCI_REVISION,
+					&device) == 0) {
+			if (device->bustype == DRM_BUS_PCI &&
+					amdgpu_ras_lookup_capability(device_handle))
+				enable = CU_TRUE;
+			/* drmGetDevice2() allocated device; give it back */
+			drmFreeDevice(&device);
+		}
+
+		/* release the handle on every path, including a failed drmGetDevice2() */
+		amdgpu_device_deinitialize(device_handle);
+
+		if (enable)
+			break;
+	}
+
+	return enable;
+}
+
+/**
+ * Initialize ras test suite.
+ *
+ * Fills devices[] with every opened amdgpu device that is on the PCI bus,
+ * reports a non-zero RAS capability and can be matched to a debugfs dri
+ * index. The device handles of accepted devices stay open until
+ * suite_ras_tests_clean().
+ *
+ * Returns CUE_SUCCESS when at least one device was accepted,
+ * CUE_SINIT_FAILED otherwise.
+ */
+int suite_ras_tests_init(void)
+{
+	drmDevicePtr device;
+	amdgpu_device_handle device_handle;
+	uint32_t  major_version;
+	uint32_t  minor_version;
+	uint32_t  capability;
+	int id;
+	int i;
+	int r;
+
+	/* start from an empty table so a repeated init cannot overflow devices[] */
+	devices_count = 0;
+
+	for (i = 0; i < MAX_CARDS_SUPPORTED && drm_amdgpu[i] >= 0; i++) {
+		r = amdgpu_device_initialize(drm_amdgpu[i], &major_version,
+				&minor_version, &device_handle);
+		if (r)
+			continue;
+
+		if (drmGetDevice2(drm_amdgpu[i],
+					DRM_DEVICE_GET_PCI_REVISION,
+					&device)) {
+			amdgpu_device_deinitialize(device_handle);
+			continue;
+		}
+
+		/* id stays -1 unless the device is a RAS capable PCI device */
+		id = -1;
+		capability = 0;
+		if (device->bustype == DRM_BUS_PCI) {
+			capability = amdgpu_ras_lookup_capability(device_handle);
+			if (capability != 0)
+				id = amdgpu_ras_lookup_id(device);
+		}
+
+		/* device is not needed past this point; drmGetDevice2() allocated it */
+		drmFreeDevice(&device);
+
+		if (id == -1) {
+			amdgpu_device_deinitialize(device_handle);
+			continue;
+		}
+
+		devices[devices_count++] = (struct amdgpu_ras_data) {
+			device_handle, id, capability
+		};
+	}
+
+	if (devices_count == 0)
+		return CUE_SINIT_FAILED;
+
+	return CUE_SUCCESS;
+}
+
+/**
+ * Deinitialize ras test suite.
+ *
+ * Deinitializes every device handle stored in devices[] and empties the
+ * table afterwards.
+ *
+ * Returns CUE_SUCCESS, or CUE_SCLEAN_FAILED when at least one
+ * amdgpu_device_deinitialize() call failed.
+ */
+int suite_ras_tests_clean(void)
+{
+	int r;
+	int i;
+	int ret = CUE_SUCCESS;
+
+	for (i = 0; i < devices_count; i++) {
+		r = amdgpu_device_deinitialize(devices[i].device_handle);
+		if (r)
+			ret = CUE_SCLEAN_FAILED;
+	}
+
+	/* the stored handles are gone; make sure nobody iterates them again */
+	devices_count = 0;
+
+	return ret;
+}
+
+static void amdgpu_ras_disable_test(void);
+static void amdgpu_ras_enable_test(void);
+static void amdgpu_ras_inject_test(void);
+static void amdgpu_ras_query_test(void);
+static void amdgpu_ras_basic_test(void);
+
+/*
+ * Tests of the RAS suite, in the order they are listed to CUnit.
+ * The enable test is compiled out of the table by the #if 0 below.
+ */
+CU_TestInfo ras_tests[] = {
+	{ "ras basic test",	amdgpu_ras_basic_test },
+	{ "ras query test",	amdgpu_ras_query_test },
+	{ "ras inject test",	amdgpu_ras_inject_test },
+	{ "ras disable test",	amdgpu_ras_disable_test },
+#if 0
+	{ "ras enable test",	amdgpu_ras_enable_test },
+#endif
+	CU_TEST_INFO_NULL,
+};
+
+//helpers
+
+static int test_card;
+static char sysfs_path[1024];
+static char debugfs_path[1024];
+static uint32_t ras_mask;
+static amdgpu_device_handle device_handle;
+
+/**
+ * Select the device the helpers below operate on.
+ *
+ * Updates test_card, the sysfs/debugfs ras directory paths, ras_mask and
+ * device_handle from devices[card].
+ *
+ * card: index into devices[], must be in [0, devices_count).
+ *
+ * Returns 0 on success, -1 when card is out of range (nothing is changed).
+ */
+static int set_test_card(int card)
+{
+	if (card < 0 || card >= devices_count)
+		return -1;
+
+	test_card = card;
+	/* id is a uint32_t, hence %u; bounded writes keep the paths inside their buffers */
+	snprintf(sysfs_path, sizeof(sysfs_path),
+			"/sys/class/drm/card%u/device/ras/", devices[card].id);
+	snprintf(debugfs_path, sizeof(debugfs_path),
+			"/sys/kernel/debug/dri/%u/ras/", devices[card].id);
+	ras_mask = devices[card].capability;
+	device_handle = devices[card].device_handle;
+
+	return 0;
+}
+
+/* Return the sysfs ras directory path (with trailing '/') set by set_test_card(). */
+static const char *get_ras_sysfs_root(void)
+{
+	return sysfs_path;
+}
+
+/* Return the debugfs ras directory path (with trailing '/') set by set_test_card(). */
+static const char *get_ras_debugfs_root(void)
+{
+	return debugfs_path;
+}
+
+/**
+ * Open file write-only and write size bytes from buf with a single write().
+ *
+ * Returns the value of write(), or -1 when the file cannot be opened.
+ */
+static int set_file_contents(char *file, char *buf, int size)
+{
+	int written;
+	int fd = open(file, O_WRONLY);
+
+	if (fd < 0)
+		return -1;
+
+	written = write(fd, buf, size);
+	close(fd);
+
+	return written;
+}
+
+/**
+ * Open file read-only and read up to size bytes into buf with a single
+ * read(). The buffer is not NUL terminated by this helper.
+ *
+ * Returns the value of read(), or -1 when the file cannot be opened.
+ */
+static int get_file_contents(char *file, char *buf, int size)
+{
+	int got;
+	int fd = open(file, O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	got = read(fd, buf, size);
+	close(fd);
+
+	return got;
+}
+
+/**
+ * Check that file can be opened with the given open() flags.
+ *
+ * Returns 0 when open() succeeds (the descriptor is closed again), -1 otherwise.
+ */
+static int is_file_ok(char *file, int flags)
+{
+	int fd = open(file, flags);
+
+	if (fd != -1) {
+		close(fd);
+		return 0;
+	}
+
+	return -1;
+}
+
+/**
+ * Tell whether a RAS block is currently enabled on the selected device.
+ *
+ * block: block whose bit is tested in the queried feature mask.
+ *
+ * Returns 1 when the block's bit is set, 0 when it is clear and -1 when the
+ * AMDGPU_INFO_RAS_ENABLED_FEATURES query fails. The result is normalized to
+ * 0/1 so that callers can compare it directly against an enable flag.
+ */
+static int amdgpu_ras_is_feature_enabled(enum amdgpu_ras_block block)
+{
+	uint32_t feature_mask = 0;
+	int ret;
+
+	ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
+			sizeof(feature_mask), &feature_mask);
+	if (ret)
+		return -1;
+
+	return !!(feature_mask & (1u << block));
+}
+
+/**
+ * Return the block's bit of ras_mask (non-zero when the selected device
+ * reports the block as supported, 0 otherwise).
+ */
+static int amdgpu_ras_is_feature_supported(enum amdgpu_ras_block block)
+{
+	uint32_t block_bit = 1 << block;
+
+	return block_bit & ras_mask;
+}
+
+/**
+ * Send a request to the selected device by writing the raw struct to the
+ * debugfs "ras_ctrl" file.
+ *
+ * data: request to write.
+ *
+ * Returns 0 when the whole struct was written, a negative value otherwise.
+ */
+static int amdgpu_ras_invoke(struct ras_debug_if *data)
+{
+	char path[1024];
+	int ret;
+
+	snprintf(path, sizeof(path), "%s%s", get_ras_debugfs_root(), "ras_ctrl");
+
+	ret = set_file_contents(path, (char *)data, sizeof(*data));
+
+	/* keep the subtraction signed; mixing int and size_t would wrap around */
+	return ret - (int)sizeof(*data);
+}
+
+/**
+ * Read the error counters of a block from its sysfs "<block>_err_count" file.
+ *
+ * block: block to query.
+ * ue:    receives the "ue:" counter; set to 0 on every failure path.
+ * ce:    receives the "ce:" counter; set to 0 on every failure path.
+ *
+ * Returns 0 on success, and also (with both counters 0) when the file cannot
+ * be opened; -1 when the block is not supported or the file content cannot
+ * be read or parsed.
+ */
+static int amdgpu_ras_query_err_count(enum amdgpu_ras_block block,
+		unsigned long *ue, unsigned long *ce)
+{
+	char buf[64];
+	char name[1024];
+	int n;
+
+	*ue = *ce = 0;
+
+	if (amdgpu_ras_is_feature_supported(block) <= 0)
+		return -1;
+
+	snprintf(name, sizeof(name), "%s%s%s", get_ras_sysfs_root(),
+			ras_block_str(block), "_err_count");
+
+	/* NOTE(review): a missing file is reported as success - confirm this is intended */
+	if (is_file_ok(name, O_RDONLY))
+		return 0;
+
+	/* leave room for the terminator: read() does not NUL terminate buf */
+	n = get_file_contents(name, buf, sizeof(buf) - 1);
+	if (n <= 0)
+		return -1;
+	buf[n] = '\0';
+
+	if (sscanf(buf, "ue: %lu\nce: %lu", ue, ce) != 2)
+		return -1;
+
+	return 0;
+}
+
+//tests
+/**
+ * Enable or disable every supported block of the selected device and check
+ * that the enabled-features query reflects the new state.
+ *
+ * enable: 1 to enable the blocks, 0 to disable them; also used as the op
+ *         code of the request.
+ */
+static void amdgpu_ras_features_test(int enable)
+{
+	struct ras_debug_if data;
+	int ret;
+	int i;
+
+	/* the whole struct is written to ras_ctrl; do not send stack garbage */
+	memset(&data, 0, sizeof(data));
+	data.op = enable;
+	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
+		struct ras_common_if head = {
+			.block = i,
+			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+			.sub_block_index = 0,
+			.name = "",
+		};
+
+		if (amdgpu_ras_is_feature_supported(i) <= 0)
+			continue;
+
+		data.head = head;
+
+		ret = amdgpu_ras_invoke(&data);
+		CU_ASSERT_EQUAL(ret, 0);
+
+		if (ret)
+			continue;
+
+		/*
+		 * Compare as booleans: the query result may be a bit mask, which
+		 * only equals 1 for block 0. A failed query (negative value) is
+		 * kept as-is so that it fails the check as well.
+		 */
+		ret = amdgpu_ras_is_feature_enabled(i);
+		CU_ASSERT_EQUAL(ret < 0 ? ret : !!ret, !!enable);
+	}
+}
+
+/* Run the feature test with enable == 0 on every device of the suite. */
+static void amdgpu_ras_disable_test(void)
+{
+	int card = 0;
+
+	while (card < devices_count) {
+		set_test_card(card);
+		amdgpu_ras_features_test(0);
+		card++;
+	}
+}
+
+/* Run the feature test with enable == 1 on every device of the suite. */
+static void amdgpu_ras_enable_test(void)
+{
+	int card = 0;
+
+	while (card < devices_count) {
+		set_test_card(card);
+		amdgpu_ras_features_test(1);
+		card++;
+	}
+}
+
+/**
+ * Inject one multi-uncorrectable error into every enabled block listed in
+ * ras_block_mask_inject_query on the selected device, then poll the block's
+ * error counters and expect "ue" to grow by one while "ce" stays unchanged.
+ */
+static void __amdgpu_ras_inject_test(void)
+{
+	struct ras_debug_if data;
+	int ret;
+	int i;
+	unsigned long ue = 0, ce = 0, ue_old, ce_old;
+
+	/* the whole struct is written to ras_ctrl; do not send stack garbage */
+	memset(&data, 0, sizeof(data));
+	data.op = 2;
+	for (i = 0; i < AMDGPU_RAS_BLOCK__LAST; i++) {
+		int timeout = 3;
+		struct ras_inject_if inject = {
+			.head = {
+				.block = i,
+				.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
+				.sub_block_index = 0,
+				.name = "",
+			},
+			.address = 0,
+			.value = 0,
+		};
+
+		if (amdgpu_ras_is_feature_enabled(i) <= 0)
+			continue;
+
+		if (!((1 << i) & ras_block_mask_inject_query))
+			continue;
+
+		data.inject = inject;
+
+		ret = amdgpu_ras_query_err_count(i, &ue_old, &ce_old);
+		CU_ASSERT_EQUAL(ret, 0);
+
+		if (ret)
+			continue;
+
+		ret = amdgpu_ras_invoke(&data);
+		CU_ASSERT_EQUAL(ret, 0);
+
+		if (ret)
+			continue;
+
+		/* poll once per second, at most `timeout` times */
+		while (timeout > 0) {
+			ret = amdgpu_ras_query_err_count(i, &ue, &ce);
+			CU_ASSERT_EQUAL(ret, 0);
+
+			/*
+			 * Give up on a failing query; retrying here without
+			 * consuming the timeout would spin forever.
+			 */
+			if (ret)
+				break;
+			if (ue_old != ue) {
+				/*recovery takes ~10s*/
+				sleep(10);
+				break;
+			}
+
+			sleep(1);
+			timeout -= 1;
+		}
+
+		CU_ASSERT_EQUAL(ue_old + 1, ue);
+		CU_ASSERT_EQUAL(ce_old, ce);
+	}
+}
+
+/* Run the error injection test on every device of the suite. */
+static void amdgpu_ras_inject_test(void)
+{
+	int card = 0;
+
+	while (card < devices_count) {
+		set_test_card(card);
+		__amdgpu_ras_inject_test();
+		card++;
+	}
+}
+
+/**
+ * Query the error counters of every supported block listed in
+ * ras_block_mask_inject_query on the selected device and expect each query
+ * to succeed.
+ */
+static void __amdgpu_ras_query_test(void)
+{
+	int block;
+
+	for (block = 0; block < AMDGPU_RAS_BLOCK__LAST; block++) {
+		unsigned long ue, ce;
+		int ret;
+
+		/* skip blocks that are unsupported or not part of the query mask */
+		if (amdgpu_ras_is_feature_supported(block) <= 0 ||
+		    !((1 << block) & ras_block_mask_inject_query))
+			continue;
+
+		ret = amdgpu_ras_query_err_count(block, &ue, &ce);
+		CU_ASSERT_EQUAL(ret, 0);
+	}
+}
+
+/* Run the error counter query test on every device of the suite. */
+static void amdgpu_ras_query_test(void)
+{
+	int card = 0;
+
+	while (card < devices_count) {
+		set_test_card(card);
+		__amdgpu_ras_query_test();
+		card++;
+	}
+}
+
+/**
+ * Check that the RAS interfaces exist.
+ *
+ * Expects the amdgpu "ras_mask" module parameter to be readable and, for
+ * every device of the suite: the enabled-features query to succeed, the
+ * debugfs "ras_ctrl" file to be writable, the sysfs "features" file to be
+ * readable and, for each supported block in ras_block_mask_basic, the
+ * "<block>_err_count" (sysfs, readable) and "<block>_err_inject" (debugfs,
+ * writable) files to exist.
+ */
+static void amdgpu_ras_basic_test(void)
+{
+	int ret;
+	int i;
+	int j;
+	uint32_t features;
+	char path[1024];
+
+	ret = is_file_ok("/sys/module/amdgpu/parameters/ras_mask", O_RDONLY);
+	CU_ASSERT_EQUAL(ret, 0);
+
+	for (i = 0; i < devices_count; i++) {
+		set_test_card(i);
+
+		/* only the success of the query is checked, not its value */
+		ret = amdgpu_query_info(device_handle, AMDGPU_INFO_RAS_ENABLED_FEATURES,
+				sizeof(features), &features);
+		CU_ASSERT_EQUAL(ret, 0);
+
+		snprintf(path, sizeof(path), "%s%s", get_ras_debugfs_root(), "ras_ctrl");
+		ret = is_file_ok(path, O_WRONLY);
+		CU_ASSERT_EQUAL(ret, 0);
+
+		snprintf(path, sizeof(path), "%s%s", get_ras_sysfs_root(), "features");
+		ret = is_file_ok(path, O_RDONLY);
+		CU_ASSERT_EQUAL(ret, 0);
+
+		for (j = 0; j < AMDGPU_RAS_BLOCK__LAST; j++) {
+			ret = amdgpu_ras_is_feature_supported(j);
+			if (ret <= 0)
+				continue;
+
+			if (!((1 << j) & ras_block_mask_basic))
+				continue;
+
+			snprintf(path, sizeof(path), "%s%s%s", get_ras_sysfs_root(),
+					ras_block_str(j), "_err_count");
+			ret = is_file_ok(path, O_RDONLY);
+			CU_ASSERT_EQUAL(ret, 0);
+
+			snprintf(path, sizeof(path), "%s%s%s", get_ras_debugfs_root(),
+					ras_block_str(j), "_err_inject");
+			ret = is_file_ok(path, O_WRONLY);
+			CU_ASSERT_EQUAL(ret, 0);
+		}
+	}
+}
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2019-03-19  3:46 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-19  3:46 [PATCH libdrm 2/3] amdgpu: add ras tests Pan, Xinhui

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.