Re: [PATCH 16/19] cxl/hdm: Define a driver interface for DPA allocation

From: Dave Jiang <dave.jiang@intel.com>
To: Dan Williams <dan.j.williams@intel.com>, <linux-cxl@vger.kernel.org>
Cc: <ira.weiny@intel.com>, <navneet.singh@intel.com>
Subject: Re: [PATCH 16/19] cxl/hdm: Define a driver interface for DPA allocation
Date: Tue, 13 Jun 2023 16:53:03 -0700	[thread overview]
Message-ID: <98b1f61a-e6c2-71d4-c368-50d958501b0c@intel.com> (raw)
In-Reply-To: <168592158743.1948938.7622563891193802610.stgit@dwillia2-xfh.jf.intel.com>

On 6/4/23 16:33, Dan Williams wrote:
> Region creation involves finding available DPA (device-physical-address)
> capacity to map into HPA (host-physical-address) space. Given the HPA
> capacity constraint, define an API, cxl_request_dpa(), that has the
> flexibility to map the minimum amount of memory the driver needs to
> operate vs the total possible that can be mapped given HPA availability.
> 
> Factor out the core of cxl_dpa_alloc(), that does free space scanning,
> into a cxl_dpa_freespace() helper, and use that to balance the capacity
> available to map vs the @min and @max arguments to cxl_request_dpa().
> 
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>   drivers/cxl/core/hdm.c |  140 +++++++++++++++++++++++++++++++++++++++++-------
>   drivers/cxl/cxl.h      |    6 ++
>   drivers/cxl/cxlmem.h   |    4 +
>   3 files changed, 131 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 91ab3033c781..514d30131d92 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -464,30 +464,17 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>   	return rc;
>   }
>   
> -int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
> +static resource_size_t cxl_dpa_freespace(struct cxl_endpoint_decoder *cxled,

This function name reads odd for me. Maybe cxl_dpa_reserve_freespace()?

DJ

> +					 resource_size_t *start_out,
> +					 resource_size_t *skip_out)
>   {
>   	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>   	resource_size_t free_ram_start, free_pmem_start;
> -	struct cxl_port *port = cxled_to_port(cxled);
>   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> -	struct device *dev = &cxled->cxld.dev;
>   	resource_size_t start, avail, skip;
>   	struct resource *p, *last;
> -	int rc;
>   
> -	down_write(&cxl_dpa_rwsem);
> -	if (cxled->cxld.region) {
> -		dev_dbg(dev, "decoder attached to %s\n",
> -			dev_name(&cxled->cxld.region->dev));
> -		rc = -EBUSY;
> -		goto out;
> -	}
> -
> -	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
> -		dev_dbg(dev, "decoder enabled\n");
> -		rc = -EBUSY;
> -		goto out;
> -	}
> +	lockdep_assert_held(&cxl_dpa_rwsem);
>   
>   	for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling)
>   		last = p;
> @@ -525,11 +512,42 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>   			skip_end = start - 1;
>   		skip = skip_end - skip_start + 1;
>   	} else {
> -		dev_dbg(dev, "mode not set\n");
> -		rc = -EINVAL;
> +		dev_dbg(cxled_dev(cxled), "mode not set\n");
> +		avail = 0;
> +	}
> +
> +	if (!avail)
> +		return 0;
> +	if (start_out)
> +		*start_out = start;
> +	if (skip_out)
> +		*skip_out = skip;
> +	return avail;
> +}
> +
> +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
> +{
> +	struct cxl_port *port = cxled_to_port(cxled);
> +	struct device *dev = &cxled->cxld.dev;
> +	resource_size_t start, avail, skip;
> +	int rc;
> +
> +	down_write(&cxl_dpa_rwsem);
> +	if (cxled->cxld.region) {
> +		dev_dbg(dev, "decoder attached to %s\n",
> +			dev_name(&cxled->cxld.region->dev));
> +		rc = -EBUSY;
> +		goto out;
> +	}
> +
> +	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
> +		dev_dbg(dev, "decoder enabled\n");
> +		rc = -EBUSY;
>   		goto out;
>   	}
>   
> +	avail = cxl_dpa_freespace(cxled, &start, &skip);
> +
>   	if (size > avail) {
>   		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
>   			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
> @@ -548,6 +566,90 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>   	return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
>   }
>   
> +static int find_free_decoder(struct device *dev, void *data)
> +{
> +	struct cxl_endpoint_decoder *cxled;
> +	struct cxl_port *port;
> +
> +	if (!is_endpoint_decoder(dev))
> +		return 0;
> +
> +	cxled = to_cxl_endpoint_decoder(dev);
> +	port = cxled_to_port(cxled);
> +
> +	if (cxled->cxld.id != port->hdm_end + 1)
> +		return 0;
> +	return 1;
> +}
> +
> +/**
> + * cxl_request_dpa - search and reserve DPA given input constraints
> + * @endpoint: an endpoint port with available decoders
> + * @mode: DPA operation mode (ram vs pmem)
> + * @min: the minimum amount of capacity the call needs
> + * @max: extra capacity to allocate after min is satisfied
> + *
> + * Given that a region needs to allocate from limited HPA capacity it
> + * may be the case that a device has more mappable DPA capacity than
> + * available HPA. So, the expectation is that @min is a driver known
> + * value for how much capacity is needed, and @max is based the limit of
> + * how much HPA space is available for a new region.
> + *
> + * Returns a pinned cxl_decoder with at least @min bytes of capacity
> + * reserved, or an error pointer. The caller is also expected to own the
> + * lifetime of the memdev registration associated with the endpoint to
> + * pin the decoder registered as well.
> + */
> +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_port *endpoint,
> +					     enum cxl_decoder_mode mode,
> +					     resource_size_t min,
> +					     resource_size_t max)
> +{
> +	struct cxl_endpoint_decoder *cxled;
> +	struct device *cxled_dev;
> +	resource_size_t alloc;
> +	int rc;
> +
> +	if (!IS_ALIGNED(min | max, SZ_256M))
> +		return ERR_PTR(-EINVAL);
> +
> +	down_read(&cxl_dpa_rwsem);
> +	cxled_dev = device_find_child(&endpoint->dev, NULL, find_free_decoder);
> +	if (!cxled_dev)
> +		cxled = ERR_PTR(-ENXIO);
> +	else
> +		cxled = to_cxl_endpoint_decoder(cxled_dev);
> +	up_read(&cxl_dpa_rwsem);
> +
> +	if (IS_ERR(cxled))
> +		return cxled;
> +
> +	rc = cxl_dpa_set_mode(cxled, mode);
> +	if (rc)
> +		goto err;
> +
> +	down_read(&cxl_dpa_rwsem);
> +	alloc = cxl_dpa_freespace(cxled, NULL, NULL);
> +	up_read(&cxl_dpa_rwsem);
> +
> +	if (max)
> +		alloc = min(max, alloc);
> +	if (alloc < min) {
> +		rc = -ENOMEM;
> +		goto err;
> +	}
> +
> +	rc = cxl_dpa_alloc(cxled, alloc);
> +	if (rc)
> +		goto err;
> +
> +	return cxled;
> +err:
> +	put_device(cxled_dev);
> +	return ERR_PTR(rc);
> +}
> +EXPORT_SYMBOL_NS_GPL(cxl_request_dpa, CXL);
> +
>   static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl)
>   {
>   	u16 eig;
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 258c90727dd2..55808697773f 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -680,6 +680,12 @@ struct cxl_decoder *to_cxl_decoder(struct device *dev);
>   struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
>   struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
>   struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev);
> +
> +static inline struct device *cxled_dev(struct cxl_endpoint_decoder *cxled)
> +{
> +	return &cxled->cxld.dev;
> +}
> +
>   bool is_root_decoder(struct device *dev);
>   bool is_switch_decoder(struct device *dev);
>   bool is_endpoint_decoder(struct device *dev);
> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index e3bcd6d12a1c..8ec5c305d186 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -89,6 +89,10 @@ struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds);
>   int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   			 resource_size_t base, resource_size_t len,
>   			 resource_size_t skipped);
> +struct cxl_endpoint_decoder *cxl_request_dpa(struct cxl_port *endpoint,
> +					     enum cxl_decoder_mode mode,
> +					     resource_size_t min,
> +					     resource_size_t max);
>   
>   static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
>   					 struct cxl_memdev *cxlmd)
>