All of lore.kernel.org
 help / color / mirror / Atom feed
From: Davidlohr Bueso <dave@stgolabs.net>
To: "alison.schofield@intel.com" <alison.schofield@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>,
	Ira Weiny <ira.weiny@intel.com>,
	Vishal Verma <vishal.l.verma@intel.com>,
	Ben Widawsky <bwidawsk@kernel.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	Ingo Molnar <mingo@redhat.com>,
	linux-cxl@vger.kernel.org, linux-kernel@vger.kernel.org,
	a.manzanares@samsung.com
Subject: Re: [PATCH 2/3] cxl/mbox: Add GET_POISON_LIST mailbox command support
Date: Thu, 16 Jun 2022 12:43:34 -0700	[thread overview]
Message-ID: <20220616194334.pvorvoozt4rrzr66@offworld> (raw)
In-Reply-To: <382a9c35ef43e89db85670637d88371f9197b7a2.1655250669.git.alison.schofield@intel.com>

On Tue, 14 Jun 2022, alison.schofield@intel.com wrote:

>From: Alison Schofield <alison.schofield@intel.com>
>
>CXL devices that support persistent memory maintain a list of locations
>that are poisoned or result in poison if the addresses are accessed by
>the host.
>
>Per the spec (CXL 2.0 8.2.8.5.4.1), the device returns this Poison
>list as a set of  Media Error Records that include the source of the
>error, the starting device physical address and length. The length is
>the number of adjacent DPAs in the record and is in units of 64 bytes.
>
>Retrieve the list and log each Media Error Record as a trace event of
>type cxl_poison_list.
>
>Signed-off-by: Alison Schofield <alison.schofield@intel.com>
>---
> drivers/cxl/cxlmem.h    | 43 +++++++++++++++++++++++
> drivers/cxl/core/mbox.c | 75 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 118 insertions(+)
>
>diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
>index 60d10ee1e7fc..29cf0459b44a 100644
>--- a/drivers/cxl/cxlmem.h
>+++ b/drivers/cxl/cxlmem.h
>@@ -174,6 +174,7 @@ struct cxl_endpoint_dvsec_info {
>  *                (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
>  * @lsa_size: Size of Label Storage Area
>  *                (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
>+ * @poison_max_mer: maximum Media Error Records tracked in Poison List
>  * @mbox_mutex: Mutex to synchronize mailbox access.
>  * @firmware_version: Firmware version for the memory device.
>  * @enabled_cmds: Hardware commands found enabled in CEL.
>@@ -204,6 +205,7 @@ struct cxl_dev_state {
>
>	size_t payload_size;
>	size_t lsa_size;
>+	u32 poison_max;
>	struct mutex mbox_mutex; /* Protects device mailbox and firmware */
>	char firmware_version[0x10];
>	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
>@@ -317,6 +319,46 @@ struct cxl_mbox_set_partition_info {
>
> #define  CXL_SET_PARTITION_IMMEDIATE_FLAG	BIT(0)
>
>+struct cxl_mbox_poison_payload_in {
>+	__le64 offset;
>+	__le64 length;
>+} __packed;
>+
>+struct cxl_mbox_poison_payload_out {
>+	u8 flags;
>+	u8 rsvd1;
>+	__le64 overflow_timestamp;
>+	__le16 count;
>+	u8 rsvd2[0x14];
>+	struct cxl_poison_record {
>+		__le64 address;
>+		__le32 length;
>+		__le32 rsvd;
>+	} __packed record[];
>+} __packed;
>+
>+/* CXL 8.2.9.5.4.1 Get Poison List: payload out flags: */
>+#define CXL_POISON_FLAG_MORE            BIT(0)
>+#define CXL_POISON_FLAG_OVERFLOW        BIT(1)
>+#define CXL_POISON_FLAG_SCANNING        BIT(2)
>+
>+/* CXL 8.2.9.5.4.1 Get Poison List: Error is encoded in record.address[2:0] */
>+#define CXL_POISON_SOURCE_MASK		GENMASK(2, 0)
>+#define	CXL_POISON_SOURCE_UNKNOWN	0
>+#define	CXL_POISON_SOURCE_EXTERNAL	1
>+#define	CXL_POISON_SOURCE_INTERNAL	2
>+#define	CXL_POISON_SOURCE_INJECTED	3
>+#define	CXL_POISON_SOURCE_VENDOR	7
>+
>+/* Software define */
>+#define	CXL_POISON_SOURCE_INVALID	99
>+#define CXL_POISON_SOURCE_VALID(x)		\
>+	(((x) == CXL_POISON_SOURCE_UNKNOWN)  ||	\
>+	 ((x) == CXL_POISON_SOURCE_EXTERNAL) ||	\
>+	 ((x) == CXL_POISON_SOURCE_INTERNAL) ||	\
>+	 ((x) == CXL_POISON_SOURCE_INJECTED) ||	\
>+	 ((x) == CXL_POISON_SOURCE_VENDOR))
>+
> /**
>  * struct cxl_mem_command - Driver representation of a memory device command
>  * @info: Command information as it exists for the UAPI
>@@ -351,6 +393,7 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds);
> struct cxl_dev_state *cxl_dev_state_create(struct device *dev);
> void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
> void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds);
>+int cxl_mem_get_poison_list(struct device *dev);
> #ifdef CONFIG_CXL_SUSPEND
> void cxl_mem_active_inc(void);
> void cxl_mem_active_dec(void);
>diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
>index 54f434733b56..c10c7020ebc2 100644
>--- a/drivers/cxl/core/mbox.c
>+++ b/drivers/cxl/core/mbox.c
>@@ -9,6 +9,9 @@
>
> #include "core.h"
>
>+#define CREATE_TRACE_POINTS
>+#include <trace/events/cxl.h>
>+
> static bool cxl_raw_allow_all;
>
> /**
>@@ -755,6 +758,7 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
> {
>	/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
>	struct cxl_mbox_identify id;
>+	__le32 val = 0;
>	int rc;
>
>	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
>@@ -783,6 +787,9 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
>	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
>	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
>
>+	memcpy(&val, id.poison_list_max_mer, 3);
>+	cxlds->poison_max = le32_to_cpu(val);
>+
>	return 0;
> }
> EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
>@@ -826,6 +833,74 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
> }
> EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
>
>+int cxl_mem_get_poison_list(struct device *dev)
>+{
>+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
>+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>+	struct cxl_mbox_poison_payload_out *po;
>+	struct cxl_mbox_poison_payload_in pi;
>+	int nr_records = 0;
>+	int rc, i;
>+
>+	if (range_len(&cxlds->pmem_range)) {
>+		pi.offset = cpu_to_le64(cxlds->pmem_range.start);
>+		pi.length = cpu_to_le64(range_len(&cxlds->pmem_range));

Do you ever see this changing to not always use the full pmem DPA range
but allow arbitrary ones? I also assume this is the reason why you don't
check the range vs cxlds->ram_range to prevent any overlaps, no?

Thanks,
Davidlohr

>+	} else {
>+		return -ENXIO;
>+	}
>+
>+	po = kvmalloc(cxlds->payload_size, GFP_KERNEL);
>+	if (!po)
>+		return -ENOMEM;
>+
>+	do {
>+		rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_POISON, &pi,
>+				       sizeof(pi), po, cxlds->payload_size);
>+		if (rc)
>+			goto out;
>+
>+		if (po->flags & CXL_POISON_FLAG_OVERFLOW) {
>+			time64_t o_time = le64_to_cpu(po->overflow_timestamp);
>+
>+			dev_err(dev, "Poison list overflow at %ptTs UTC\n",
>+				&o_time);
>+			rc = -ENXIO;
>+			goto out;
>+		}
>+
>+		if (po->flags & CXL_POISON_FLAG_SCANNING) {
>+			dev_err(dev, "Scan Media in Progress\n");
>+			rc = -EBUSY;
>+			goto out;
>+		}
>+
>+		for (i = 0; i < le16_to_cpu(po->count); i++) {
>+			u64 addr = le64_to_cpu(po->record[i].address);
>+			u32 len = le32_to_cpu(po->record[i].length);
>+			int source = FIELD_GET(CXL_POISON_SOURCE_MASK, addr);
>+
>+			if (!CXL_POISON_SOURCE_VALID(source)) {
>+				dev_dbg(dev, "Invalid poison source %d",
>+					source);
>+				source = CXL_POISON_SOURCE_INVALID;
>+			}
>+
>+			trace_cxl_poison_list(dev, source, addr, len);
>+		}
>+
>+		/* Protect against an uncleared _FLAG_MORE */
>+		nr_records = nr_records + le16_to_cpu(po->count);
>+		if (nr_records >= cxlds->poison_max)
>+			goto out;
>+
>+	} while (po->flags & CXL_POISON_FLAG_MORE);
>+
>+out:
>+	kvfree(po);
>+	return rc;
>+}
>+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison_list, CXL);
>+
> struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
> {
>	struct cxl_dev_state *cxlds;
>
>--
>2.31.1
>

  parent reply	other threads:[~2022-06-16 19:43 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-15  0:10 [PATCH 0/3] CXL Poison List Retrieval & Tracing alison.schofield
2022-06-15  0:10 ` [PATCH 1/3] trace, cxl: Introduce a TRACE_EVENT for CXL Poison Records alison.schofield
2022-06-15  1:15   ` Steven Rostedt
2022-06-16 19:45   ` Davidlohr Bueso
2022-06-17 16:17   ` Jonathan Cameron
2022-06-17 18:04   ` Dan Williams
2022-06-15  0:10 ` [PATCH 2/3] cxl/mbox: Add GET_POISON_LIST mailbox command support alison.schofield
2022-06-15  3:22   ` Ira Weiny
2022-06-15  5:07     ` Alison Schofield
2022-06-15 15:01       ` Ira Weiny
2022-06-15 17:19         ` Alison Schofield
2022-06-16 19:43   ` Davidlohr Bueso [this message]
2022-06-16 20:34     ` Alison Schofield
2022-06-16 21:47       ` Davidlohr Bueso
2022-06-16 22:10         ` Alison Schofield
2022-06-16 22:20           ` Davidlohr Bueso
2022-06-16 22:45       ` Davidlohr Bueso
2022-06-16 23:15         ` Alison Schofield
2022-06-16 23:44           ` Verma, Vishal L
2022-06-17  0:03             ` Davidlohr Bueso
2022-06-17 19:02       ` Dan Williams
2022-06-20 10:53         ` Jonathan Cameron
2022-06-17 13:01   ` Jonathan Cameron
2022-06-17 14:05   ` Jonathan Cameron
2022-06-17 16:29     ` Alison Schofield
2022-06-17 17:29       ` Davidlohr Bueso
2022-06-17 19:32       ` Dan Williams
2022-06-20 10:56       ` Jonathan Cameron
2022-06-17 19:27     ` Dan Williams
2022-06-20 11:30       ` Jonathan Cameron
2022-06-17 18:26   ` Dan Williams
2022-06-15  0:10 ` [PATCH 3/3] cxl/core: Add sysfs attribute get_poison for list retrieval alison.schofield
2022-06-15  3:30   ` Ira Weiny
2022-06-16 15:04   ` Jonathan Cameron
2022-06-16 20:39     ` Alison Schofield
2022-06-17 18:42   ` Dan Williams
2022-06-18  0:21     ` Alison Schofield
2022-06-18  1:08       ` Dan Williams
2022-06-18  1:35         ` Alison Schofield
2022-06-17 17:52 ` [PATCH 0/3] CXL Poison List Retrieval & Tracing Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220616194334.pvorvoozt4rrzr66@offworld \
    --to=dave@stgolabs.net \
    --cc=a.manzanares@samsung.com \
    --cc=alison.schofield@intel.com \
    --cc=bwidawsk@kernel.org \
    --cc=dan.j.williams@intel.com \
    --cc=ira.weiny@intel.com \
    --cc=linux-cxl@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=rostedt@goodmis.org \
    --cc=vishal.l.verma@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.