linux-cxl.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] cxl/region: Fix region setup/teardown for RCDs
@ 2023-03-28 18:36 Dan Williams
  2023-03-29 17:39 ` Dave Jiang
  2023-03-31  5:12 ` Ira Weiny
  0 siblings, 2 replies; 3+ messages in thread
From: Dan Williams @ 2023-03-28 18:36 UTC (permalink / raw)
  To: linux-cxl
  Cc: vishal.l.verma, ira.weiny, dave.jiang, alison.schofield,
	Jonathan.Cameron

RCDs (CXL memory devices that link train without VH capability and show
up as root complex integrated endpoints), hide the presence of the link
between the endpoint and the host-bridge. The CXL region setup/teardown
paths assume that a link hop is present and go looking for at least one
'struct cxl_port' instance between the CXL root port-object and an
endpoint port-object leading to crashes of the form:

    BUG: kernel NULL pointer dereference, address: 0000000000000008
    [..]
    RIP: 0010:cxl_region_setup_targets+0x3e9/0xae0 [cxl_core]
    [..]
    Call Trace:
     <TASK>
     cxl_region_attach+0x46c/0x7a0 [cxl_core]
     cxl_create_region+0x20b/0x270 [cxl_core]
     cxl_mock_mem_probe+0x641/0x800 [cxl_mock_mem]
     platform_probe+0x5b/0xb0

Detect RCDs explicitly and skip walking the non-existent port hierarchy
between root and endpoint in that case.

While this has been a problem since:

commit 0a19bfc8de93 ("cxl/port: Add RCD endpoint port enumeration")

...it becomes a more reliable crash scenario with the new autodiscovery
implementation.

Fixes: a32320b71f08 ("cxl/region: Add region autodiscovery")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/region.c |   28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 808f23ec4e2b..52bbf6268d5f 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -134,9 +134,13 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
 		struct cxl_endpoint_decoder *cxled = p->targets[i];
 		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
 		struct cxl_port *iter = cxled_to_port(cxled);
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
 		struct cxl_ep *ep;
 		int rc = 0;
 
+		if (cxlds->rcd)
+			goto endpoint_reset;
+
 		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
 			iter = to_cxl_port(iter->dev.parent);
 
@@ -153,6 +157,7 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
 				return rc;
 		}
 
+endpoint_reset:
 		rc = cxled->cxld.reset(&cxled->cxld);
 		if (rc)
 			return rc;
@@ -1199,6 +1204,7 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
 {
 	struct cxl_region_params *p = &cxlr->params;
 	struct cxl_endpoint_decoder *cxled;
+	struct cxl_dev_state *cxlds;
 	struct cxl_memdev *cxlmd;
 	struct cxl_port *iter;
 	struct cxl_ep *ep;
@@ -1214,6 +1220,10 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
 	for (i = 0; i < p->nr_targets; i++) {
 		cxled = p->targets[i];
 		cxlmd = cxled_to_memdev(cxled);
+		cxlds = cxlmd->cxlds;
+
+		if (cxlds->rcd)
+			continue;
 
 		iter = cxled_to_port(cxled);
 		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
@@ -1229,14 +1239,24 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
 {
 	struct cxl_region_params *p = &cxlr->params;
 	struct cxl_endpoint_decoder *cxled;
+	struct cxl_dev_state *cxlds;
+	int i, rc, rch = 0, vh = 0;
 	struct cxl_memdev *cxlmd;
 	struct cxl_port *iter;
 	struct cxl_ep *ep;
-	int i, rc;
 
 	for (i = 0; i < p->nr_targets; i++) {
 		cxled = p->targets[i];
 		cxlmd = cxled_to_memdev(cxled);
+		cxlds = cxlmd->cxlds;
+
+		/* validate that all targets agree on topology */
+		if (!cxlds->rcd) {
+			vh++;
+		} else {
+			rch++;
+			continue;
+		}
 
 		iter = cxled_to_port(cxled);
 		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
@@ -1256,6 +1276,12 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
 		}
 	}
 
+	if (rch && vh) {
+		dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
+		cxl_region_teardown_targets(cxlr);
+		return -ENXIO;
+	}
+
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] cxl/region: Fix region setup/teardown for RCDs
  2023-03-28 18:36 [PATCH] cxl/region: Fix region setup/teardown for RCDs Dan Williams
@ 2023-03-29 17:39 ` Dave Jiang
  2023-03-31  5:12 ` Ira Weiny
  1 sibling, 0 replies; 3+ messages in thread
From: Dave Jiang @ 2023-03-29 17:39 UTC (permalink / raw)
  To: Dan Williams, linux-cxl
  Cc: vishal.l.verma, ira.weiny, alison.schofield, Jonathan.Cameron



On 3/28/23 11:36 AM, Dan Williams wrote:
> RCDs (CXL memory devices that link train without VH capability and show
> up as root complex integrated endpoints), hide the presence of the link
> between the endpoint and the host-bridge. The CXL region setup/teardown
> paths assume that a link hop is present and go looking for at least one
> 'struct cxl_port' instance between the CXL root port-object and an
> endpoint port-object leading to crashes of the form:
> 
>      BUG: kernel NULL pointer dereference, address: 0000000000000008
>      [..]
>      RIP: 0010:cxl_region_setup_targets+0x3e9/0xae0 [cxl_core]
>      [..]
>      Call Trace:
>       <TASK>
>       cxl_region_attach+0x46c/0x7a0 [cxl_core]
>       cxl_create_region+0x20b/0x270 [cxl_core]
>       cxl_mock_mem_probe+0x641/0x800 [cxl_mock_mem]
>       platform_probe+0x5b/0xb0
> 
> Detect RCDs explicitly and skip walking the non-existent port hierarchy
> between root and endpoint in that case.
> 
> While this has been a problem since:
> 
> commit 0a19bfc8de93 ("cxl/port: Add RCD endpoint port enumeration")
> 
> ...it becomes a more reliable crash scenario with the new autodiscovery
> implementation.
> 
> Fixes: a32320b71f08 ("cxl/region: Add region autodiscovery")
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Reviewed-by: Dave Jiang <dave.jiang@intel.com>

> ---
>   drivers/cxl/core/region.c |   28 +++++++++++++++++++++++++++-
>   1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 808f23ec4e2b..52bbf6268d5f 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -134,9 +134,13 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
>   		struct cxl_endpoint_decoder *cxled = p->targets[i];
>   		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>   		struct cxl_port *iter = cxled_to_port(cxled);
> +		struct cxl_dev_state *cxlds = cxlmd->cxlds;
>   		struct cxl_ep *ep;
>   		int rc = 0;
>   
> +		if (cxlds->rcd)
> +			goto endpoint_reset;
> +
>   		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
>   			iter = to_cxl_port(iter->dev.parent);
>   
> @@ -153,6 +157,7 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
>   				return rc;
>   		}
>   
> +endpoint_reset:
>   		rc = cxled->cxld.reset(&cxled->cxld);
>   		if (rc)
>   			return rc;
> @@ -1199,6 +1204,7 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
>   {
>   	struct cxl_region_params *p = &cxlr->params;
>   	struct cxl_endpoint_decoder *cxled;
> +	struct cxl_dev_state *cxlds;
>   	struct cxl_memdev *cxlmd;
>   	struct cxl_port *iter;
>   	struct cxl_ep *ep;
> @@ -1214,6 +1220,10 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
>   	for (i = 0; i < p->nr_targets; i++) {
>   		cxled = p->targets[i];
>   		cxlmd = cxled_to_memdev(cxled);
> +		cxlds = cxlmd->cxlds;
> +
> +		if (cxlds->rcd)
> +			continue;
>   
>   		iter = cxled_to_port(cxled);
>   		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
> @@ -1229,14 +1239,24 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
>   {
>   	struct cxl_region_params *p = &cxlr->params;
>   	struct cxl_endpoint_decoder *cxled;
> +	struct cxl_dev_state *cxlds;
> +	int i, rc, rch = 0, vh = 0;
>   	struct cxl_memdev *cxlmd;
>   	struct cxl_port *iter;
>   	struct cxl_ep *ep;
> -	int i, rc;
>   
>   	for (i = 0; i < p->nr_targets; i++) {
>   		cxled = p->targets[i];
>   		cxlmd = cxled_to_memdev(cxled);
> +		cxlds = cxlmd->cxlds;
> +
> +		/* validate that all targets agree on topology */
> +		if (!cxlds->rcd) {
> +			vh++;
> +		} else {
> +			rch++;
> +			continue;
> +		}
>   
>   		iter = cxled_to_port(cxled);
>   		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
> @@ -1256,6 +1276,12 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
>   		}
>   	}
>   
> +	if (rch && vh) {
> +		dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
> +		cxl_region_teardown_targets(cxlr);
> +		return -ENXIO;
> +	}
> +
>   	return 0;
>   }
>   
> 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] cxl/region: Fix region setup/teardown for RCDs
  2023-03-28 18:36 [PATCH] cxl/region: Fix region setup/teardown for RCDs Dan Williams
  2023-03-29 17:39 ` Dave Jiang
@ 2023-03-31  5:12 ` Ira Weiny
  1 sibling, 0 replies; 3+ messages in thread
From: Ira Weiny @ 2023-03-31  5:12 UTC (permalink / raw)
  To: Dan Williams, linux-cxl
  Cc: vishal.l.verma, ira.weiny, dave.jiang, alison.schofield,
	Jonathan.Cameron

Dan Williams wrote:
> RCDs (CXL memory devices that link train without VH capability and show
> up as root complex integrated endpoints), hide the presence of the link
> between the endpoint and the host-bridge. The CXL region setup/teardown
> paths assume that a link hop is present and go looking for at least one
> 'struct cxl_port' instance between the CXL root port-object and an
> endpoint port-object leading to crashes of the form:
> 
>     BUG: kernel NULL pointer dereference, address: 0000000000000008
>     [..]
>     RIP: 0010:cxl_region_setup_targets+0x3e9/0xae0 [cxl_core]
>     [..]
>     Call Trace:
>      <TASK>
>      cxl_region_attach+0x46c/0x7a0 [cxl_core]
>      cxl_create_region+0x20b/0x270 [cxl_core]
>      cxl_mock_mem_probe+0x641/0x800 [cxl_mock_mem]
>      platform_probe+0x5b/0xb0
> 
> Detect RCDs explicitly and skip walking the non-existent port hierarchy
> between root and endpoint in that case.
> 
> While this has been a problem since:
> 
> commit 0a19bfc8de93 ("cxl/port: Add RCD endpoint port enumeration")
> 
> ...it becomes a more reliable crash scenario with the new autodiscovery
> implementation.
> 
> Fixes: a32320b71f08 ("cxl/region: Add region autodiscovery")
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Reviewed-by: Ira Weiny <ira.weiny@intel.com>

> ---
>  drivers/cxl/core/region.c |   28 +++++++++++++++++++++++++++-
>  1 file changed, 27 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 808f23ec4e2b..52bbf6268d5f 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -134,9 +134,13 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
>  		struct cxl_endpoint_decoder *cxled = p->targets[i];
>  		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>  		struct cxl_port *iter = cxled_to_port(cxled);
> +		struct cxl_dev_state *cxlds = cxlmd->cxlds;
>  		struct cxl_ep *ep;
>  		int rc = 0;
>  
> +		if (cxlds->rcd)
> +			goto endpoint_reset;
> +
>  		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
>  			iter = to_cxl_port(iter->dev.parent);
>  
> @@ -153,6 +157,7 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
>  				return rc;
>  		}
>  
> +endpoint_reset:
>  		rc = cxled->cxld.reset(&cxled->cxld);
>  		if (rc)
>  			return rc;
> @@ -1199,6 +1204,7 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
>  {
>  	struct cxl_region_params *p = &cxlr->params;
>  	struct cxl_endpoint_decoder *cxled;
> +	struct cxl_dev_state *cxlds;
>  	struct cxl_memdev *cxlmd;
>  	struct cxl_port *iter;
>  	struct cxl_ep *ep;
> @@ -1214,6 +1220,10 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
>  	for (i = 0; i < p->nr_targets; i++) {
>  		cxled = p->targets[i];
>  		cxlmd = cxled_to_memdev(cxled);
> +		cxlds = cxlmd->cxlds;
> +
> +		if (cxlds->rcd)
> +			continue;
>  
>  		iter = cxled_to_port(cxled);
>  		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
> @@ -1229,14 +1239,24 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
>  {
>  	struct cxl_region_params *p = &cxlr->params;
>  	struct cxl_endpoint_decoder *cxled;
> +	struct cxl_dev_state *cxlds;
> +	int i, rc, rch = 0, vh = 0;
>  	struct cxl_memdev *cxlmd;
>  	struct cxl_port *iter;
>  	struct cxl_ep *ep;
> -	int i, rc;
>  
>  	for (i = 0; i < p->nr_targets; i++) {
>  		cxled = p->targets[i];
>  		cxlmd = cxled_to_memdev(cxled);
> +		cxlds = cxlmd->cxlds;
> +
> +		/* validate that all targets agree on topology */
> +		if (!cxlds->rcd) {
> +			vh++;
> +		} else {
> +			rch++;
> +			continue;
> +		}
>  
>  		iter = cxled_to_port(cxled);
>  		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
> @@ -1256,6 +1276,12 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
>  		}
>  	}
>  
> +	if (rch && vh) {
> +		dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
> +		cxl_region_teardown_targets(cxlr);
> +		return -ENXIO;
> +	}
> +
>  	return 0;
>  }
>  
> 



^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-03-31  5:12 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-28 18:36 [PATCH] cxl/region: Fix region setup/teardown for RCDs Dan Williams
2023-03-29 17:39 ` Dave Jiang
2023-03-31  5:12 ` Ira Weiny

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).