All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted
@ 2015-11-06 21:55 yu.dai
  2015-11-06 22:07 ` Chris Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: yu.dai @ 2015-11-06 21:55 UTC (permalink / raw)
  To: intel-gfx

From: Alex Dai <yu.dai@intel.com>

We keep a copy of GuC fw in a GEM obj. However its content is lost
if the GEM obj is evicted (igt/gem_evict_*). Therefore, the later
fw loading during GPU reset will fail.

Now we keep the copy in a memory block rather than a GEM objet.
During fw loading, we will wrap up a GEM obj with fw data for DMA
copy. The GEM obj will be freed DMA is done.

Signed-off-by: Alex Dai <yu.dai@intel.com>
---
 drivers/gpu/drm/i915/intel_guc.h        |  2 +-
 drivers/gpu/drm/i915/intel_guc_loader.c | 84 +++++++++++++++------------------
 2 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 5ba5866..2714106 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -68,7 +68,7 @@ struct intel_guc_fw {
 	struct drm_device *		guc_dev;
 	const char *			guc_fw_path;
 	size_t				guc_fw_size;
-	struct drm_i915_gem_object *	guc_fw_obj;
+	uint32_t			*guc_fw_data;
 	enum intel_guc_fw_status	guc_fw_fetch_status;
 	enum intel_guc_fw_status	guc_fw_load_status;
 
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c
index 550921f..ebf15e8 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -218,26 +218,45 @@ static inline bool guc_ucode_response(struct drm_i915_private *dev_priv,
 static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv)
 {
 	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
-	struct drm_i915_gem_object *fw_obj = guc_fw->guc_fw_obj;
+	struct drm_i915_gem_object *fw_obj;
 	unsigned long offset;
-	struct sg_table *sg = fw_obj->pages;
-	u32 status, rsa[UOS_RSA_SCRATCH_MAX_COUNT];
+	u32 status, *rsa;
 	int i, ret = 0;
 
 	/* where RSA signature starts */
-	offset = guc_fw->rsa_offset;
+	rsa = (u32 *)((u8 *)guc_fw->guc_fw_data + guc_fw->rsa_offset);
 
 	/* Copy RSA signature from the fw image to HW for verification */
-	sg_pcopy_to_buffer(sg->sgl, sg->nents, rsa, sizeof(rsa), offset);
 	for (i = 0; i < UOS_RSA_SCRATCH_MAX_COUNT; i++)
-		I915_WRITE(UOS_RSA_SCRATCH(i), rsa[i]);
+		I915_WRITE(UOS_RSA_SCRATCH(i), *(rsa + i));
+
+	/* Wrap up a gem obj filled with header + ucode */
+	fw_obj = i915_gem_object_create_from_data(dev_priv->dev,
+			(u8 *)guc_fw->guc_fw_data + guc_fw->header_offset,
+			guc_fw->header_size + guc_fw->ucode_size);
+	if (IS_ERR_OR_NULL(fw_obj)) {
+		ret = fw_obj ? PTR_ERR(fw_obj) : -ENOMEM;
+		return ret;
+	}
+
+	ret = i915_gem_object_set_to_gtt_domain(fw_obj, false);
+	if (ret) {
+		DRM_DEBUG_DRIVER("set-domain failed %d\n", ret);
+		return ret;
+	}
+
+	ret = i915_gem_obj_ggtt_pin(fw_obj, 0, 0);
+	if (ret) {
+		DRM_DEBUG_DRIVER("pin failed %d\n", ret);
+		return ret;
+	}
 
 	/* The header plus uCode will be copied to WOPCM via DMA, excluding any
 	 * other components */
 	I915_WRITE(DMA_COPY_SIZE, guc_fw->header_size + guc_fw->ucode_size);
 
 	/* Set the source address for the new blob */
-	offset = i915_gem_obj_ggtt_offset(fw_obj) + guc_fw->header_offset;
+	offset = i915_gem_obj_ggtt_offset(fw_obj);
 	I915_WRITE(DMA_ADDR_0_LOW, lower_32_bits(offset));
 	I915_WRITE(DMA_ADDR_0_HIGH, upper_32_bits(offset) & 0xFFFF);
 
@@ -269,6 +288,10 @@ static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv)
 		ret = -ENOEXEC;
 	}
 
+	/* We can free it now that DMA has completed */
+	i915_gem_object_ggtt_unpin(fw_obj);
+	drm_gem_object_unreference(&fw_obj->base);
+
 	DRM_DEBUG_DRIVER("returning %d\n", ret);
 
 	return ret;
@@ -279,22 +302,9 @@ static int guc_ucode_xfer_dma(struct drm_i915_private *dev_priv)
  */
 static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
 {
-	struct intel_guc_fw *guc_fw = &dev_priv->guc.guc_fw;
 	struct drm_device *dev = dev_priv->dev;
 	int ret;
 
-	ret = i915_gem_object_set_to_gtt_domain(guc_fw->guc_fw_obj, false);
-	if (ret) {
-		DRM_DEBUG_DRIVER("set-domain failed %d\n", ret);
-		return ret;
-	}
-
-	ret = i915_gem_obj_ggtt_pin(guc_fw->guc_fw_obj, 0, 0);
-	if (ret) {
-		DRM_DEBUG_DRIVER("pin failed %d\n", ret);
-		return ret;
-	}
-
 	/* Invalidate GuC TLB to let GuC take the latest updates to GTT. */
 	I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
 
@@ -337,12 +347,6 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
 
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 
-	/*
-	 * We keep the object pages for reuse during resume. But we can unpin it
-	 * now that DMA has completed, so it doesn't continue to take up space.
-	 */
-	i915_gem_object_ggtt_unpin(guc_fw->guc_fw_obj);
-
 	return ret;
 }
 
@@ -444,7 +448,6 @@ fail:
 
 static void guc_fw_fetch(struct drm_device *dev, struct intel_guc_fw *guc_fw)
 {
-	struct drm_i915_gem_object *obj;
 	const struct firmware *fw;
 	struct guc_css_header *css;
 	size_t size;
@@ -530,34 +533,27 @@ static void guc_fw_fetch(struct drm_device *dev, struct intel_guc_fw *guc_fw)
 			guc_fw->guc_fw_major_found, guc_fw->guc_fw_minor_found,
 			guc_fw->guc_fw_major_wanted, guc_fw->guc_fw_minor_wanted);
 
-	mutex_lock(&dev->struct_mutex);
-	obj = i915_gem_object_create_from_data(dev, fw->data, fw->size);
-	mutex_unlock(&dev->struct_mutex);
-	if (IS_ERR_OR_NULL(obj)) {
-		err = obj ? PTR_ERR(obj) : -ENOMEM;
+	guc_fw->guc_fw_data = kmalloc(fw->size, GFP_KERNEL);
+	if (guc_fw->guc_fw_data == NULL) {
+		err = -ENOMEM;
 		goto fail;
 	}
 
-	guc_fw->guc_fw_obj = obj;
+	memcpy(guc_fw->guc_fw_data, fw->data, fw->size);
 	guc_fw->guc_fw_size = fw->size;
 
-	DRM_DEBUG_DRIVER("GuC fw fetch status SUCCESS, obj %p\n",
-			guc_fw->guc_fw_obj);
+	DRM_DEBUG_DRIVER("GuC fw fetch status SUCCESS\n");
 
 	release_firmware(fw);
 	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_SUCCESS;
 	return;
 
 fail:
-	DRM_DEBUG_DRIVER("GuC fw fetch status FAIL; err %d, fw %p, obj %p\n",
-		err, fw, guc_fw->guc_fw_obj);
+	DRM_DEBUG_DRIVER("GuC fw fetch status FAIL; err %d, fw %p\n", err, fw);
 	DRM_ERROR("Failed to fetch GuC firmware from %s (error %d)\n",
 		  guc_fw->guc_fw_path, err);
 
-	obj = guc_fw->guc_fw_obj;
-	if (obj)
-		drm_gem_object_unreference(&obj->base);
-	guc_fw->guc_fw_obj = NULL;
+	guc_fw->guc_fw_data = NULL;
 
 	release_firmware(fw);		/* OK even if fw is NULL */
 	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_FAIL;
@@ -627,11 +623,7 @@ void intel_guc_ucode_fini(struct drm_device *dev)
 	direct_interrupts_to_host(dev_priv);
 	i915_guc_submission_fini(dev);
 
-	mutex_lock(&dev->struct_mutex);
-	if (guc_fw->guc_fw_obj)
-		drm_gem_object_unreference(&guc_fw->guc_fw_obj->base);
-	guc_fw->guc_fw_obj = NULL;
-	mutex_unlock(&dev->struct_mutex);
+	kfree(guc_fw->guc_fw_data);
 
 	guc_fw->guc_fw_fetch_status = GUC_FIRMWARE_NONE;
 }
-- 
2.5.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-06 21:55 [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted yu.dai
@ 2015-11-06 22:07 ` Chris Wilson
  2015-11-06 23:18   ` Yu Dai
  0 siblings, 1 reply; 18+ messages in thread
From: Chris Wilson @ 2015-11-06 22:07 UTC (permalink / raw)
  To: yu.dai; +Cc: intel-gfx

On Fri, Nov 06, 2015 at 01:55:27PM -0800, yu.dai@intel.com wrote:
> From: Alex Dai <yu.dai@intel.com>
> 
> We keep a copy of GuC fw in a GEM obj. However its content is lost
> if the GEM obj is evicted (igt/gem_evict_*). Therefore, the later
> fw loading during GPU reset will fail.

No, it's not. The bug is in sg_copy_buffer called by
i915_gem_object_create_from_data introduced by yourselves.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-06 22:07 ` Chris Wilson
@ 2015-11-06 23:18   ` Yu Dai
  2015-11-09 10:15     ` Chris Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Yu Dai @ 2015-11-06 23:18 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx



On 11/06/2015 02:07 PM, Chris Wilson wrote:
> On Fri, Nov 06, 2015 at 01:55:27PM -0800, yu.dai@intel.com wrote:
> > From: Alex Dai <yu.dai@intel.com>
> >
> > We keep a copy of GuC fw in a GEM obj. However its content is lost
> > if the GEM obj is evicted (igt/gem_evict_*). Therefore, the later
> > fw loading during GPU reset will fail.
>
> No, it's not. The bug is in sg_copy_buffer called by
> i915_gem_object_create_from_data introduced by yourselves.
>

My understanding is that sg_copy_from_buffer is used to copy data. Can 
you clarify why using this will cause such issue? I also did another 
experiment that always leaving the GEM obj pinned, then it's content 
never changed.

Alex
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-06 23:18   ` Yu Dai
@ 2015-11-09 10:15     ` Chris Wilson
  2015-11-10 23:27       ` [PATCH v1] " yu.dai
  2015-11-10 23:29       ` [PATCH] " Yu Dai
  0 siblings, 2 replies; 18+ messages in thread
From: Chris Wilson @ 2015-11-09 10:15 UTC (permalink / raw)
  To: Yu Dai; +Cc: intel-gfx

On Fri, Nov 06, 2015 at 03:18:37PM -0800, Yu Dai wrote:
> 
> 
> On 11/06/2015 02:07 PM, Chris Wilson wrote:
> >On Fri, Nov 06, 2015 at 01:55:27PM -0800, yu.dai@intel.com wrote:
> >> From: Alex Dai <yu.dai@intel.com>
> >>
> >> We keep a copy of GuC fw in a GEM obj. However its content is lost
> >> if the GEM obj is evicted (igt/gem_evict_*). Therefore, the later
> >> fw loading during GPU reset will fail.
> >
> >No, it's not. The bug is in sg_copy_buffer called by
> >i915_gem_object_create_from_data introduced by yourselves.
> >
> 
> My understanding is that sg_copy_from_buffer is used to copy data.
> Can you clarify why using this will cause such issue?

"However its content is lost if the GEM obj is evicted (igt/gem_evict_*)."

is not strictly true. The content is lost if the object is swapped
because the page is never marked as dirty. sg_copy_buffer() is copying
into the page but never marks said page as dirty.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-09 10:15     ` Chris Wilson
@ 2015-11-10 23:27       ` yu.dai
  2015-11-11  9:07         ` Chris Wilson
  2015-11-24 15:47         ` Dave Gordon
  2015-11-10 23:29       ` [PATCH] " Yu Dai
  1 sibling, 2 replies; 18+ messages in thread
From: yu.dai @ 2015-11-10 23:27 UTC (permalink / raw)
  To: intel-gfx

From: Alex Dai <yu.dai@intel.com>

We keep a copy of GuC fw in a GEM obj. However its content is lost
if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
fw loading during GPU reset will fail. Mark the obj dirty after
copying data into the pages. So its content will be kept during
swapped out.

Signed-off-by: Alex Dai <yu.dai@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f1e3fde..3b15167 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
 	i915_gem_object_pin_pages(obj);
 	sg = obj->pages;
 	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
+	obj->dirty = 1;
 	i915_gem_object_unpin_pages(obj);
 
 	if (WARN_ON(bytes != size)) {
-- 
2.5.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-09 10:15     ` Chris Wilson
  2015-11-10 23:27       ` [PATCH v1] " yu.dai
@ 2015-11-10 23:29       ` Yu Dai
  1 sibling, 0 replies; 18+ messages in thread
From: Yu Dai @ 2015-11-10 23:29 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx



On 11/09/2015 02:15 AM, Chris Wilson wrote:
> On Fri, Nov 06, 2015 at 03:18:37PM -0800, Yu Dai wrote:
> >
> >
> > On 11/06/2015 02:07 PM, Chris Wilson wrote:
> > >On Fri, Nov 06, 2015 at 01:55:27PM -0800, yu.dai@intel.com wrote:
> > >> From: Alex Dai <yu.dai@intel.com>
> > >>
> > >> We keep a copy of GuC fw in a GEM obj. However its content is lost
> > >> if the GEM obj is evicted (igt/gem_evict_*). Therefore, the later
> > >> fw loading during GPU reset will fail.
> > >
> > >No, it's not. The bug is in sg_copy_buffer called by
> > >i915_gem_object_create_from_data introduced by yourselves.
> > >
> >
> > My understanding is that sg_copy_from_buffer is used to copy data.
> > Can you clarify why using this will cause such issue?
>
> "However its content is lost if the GEM obj is evicted (igt/gem_evict_*)."
>
> is not strictly true. The content is lost if the object is swapped
> because the page is never marked as dirty. sg_copy_buffer() is copying
> into the page but never marks said page as dirty.
> -Chris
>
Thanks Chris. Now the patch is one line change. :)

Alex
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-10 23:27       ` [PATCH v1] " yu.dai
@ 2015-11-11  9:07         ` Chris Wilson
  2015-11-11 19:01           ` Yu Dai
  2015-11-23  9:18           ` akash goel
  2015-11-24 15:47         ` Dave Gordon
  1 sibling, 2 replies; 18+ messages in thread
From: Chris Wilson @ 2015-11-11  9:07 UTC (permalink / raw)
  To: yu.dai; +Cc: intel-gfx

On Tue, Nov 10, 2015 at 03:27:36PM -0800, yu.dai@intel.com wrote:
> From: Alex Dai <yu.dai@intel.com>
> 
> We keep a copy of GuC fw in a GEM obj. However its content is lost
> if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
> fw loading during GPU reset will fail. Mark the obj dirty after
> copying data into the pages. So its content will be kept during
> swapped out.
> 
> Signed-off-by: Alex Dai <yu.dai@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f1e3fde..3b15167 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
>  	i915_gem_object_pin_pages(obj);
>  	sg = obj->pages;
>  	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
> +	obj->dirty = 1;

That's one way of doing it, but atypical for our CPU access to the pages.
The root cause is still that sg_copy_from_buffer() isn't calling
set_page_dirty() and so there will be other places in the kernel that
fall foul of it. Also, not that we could have written this to not require
the whole object to be pinned at once as well.

I would prefer this to be fixed in sg_copy_from_buffer() for the reason
that all callers are susceptible to this bug.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-11  9:07         ` Chris Wilson
@ 2015-11-11 19:01           ` Yu Dai
  2015-11-23  9:18           ` akash goel
  1 sibling, 0 replies; 18+ messages in thread
From: Yu Dai @ 2015-11-11 19:01 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx



On 11/11/2015 01:07 AM, Chris Wilson wrote:
> On Tue, Nov 10, 2015 at 03:27:36PM -0800, yu.dai@intel.com wrote:
> > From: Alex Dai <yu.dai@intel.com>
> >
> > We keep a copy of GuC fw in a GEM obj. However its content is lost
> > if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
> > fw loading during GPU reset will fail. Mark the obj dirty after
> > copying data into the pages. So its content will be kept during
> > swapped out.
> >
> > Signed-off-by: Alex Dai <yu.dai@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index f1e3fde..3b15167 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
> >  	i915_gem_object_pin_pages(obj);
> >  	sg = obj->pages;
> >  	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
> > +	obj->dirty = 1;
>
> That's one way of doing it, but atypical for our CPU access to the pages.
> The root cause is still that sg_copy_from_buffer() isn't calling
> set_page_dirty() and so there will be other places in the kernel that
> fall foul of it. Also, not that we could have written this to not require
> the whole object to be pinned at once as well.
>
> I would prefer this to be fixed in sg_copy_from_buffer() for the reason
> that all callers are susceptible to this bug.
> -Chris
>
Makes sense. I will keep this as it for now. We want to set this GEM obj 
dirty too in addition of marking each page dirty. A change in 
sg_copy_buffer will have big impact on kernel though.

Thanks,
Alex
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-11  9:07         ` Chris Wilson
  2015-11-11 19:01           ` Yu Dai
@ 2015-11-23  9:18           ` akash goel
  2015-11-23  9:37             ` Chris Wilson
  1 sibling, 1 reply; 18+ messages in thread
From: akash goel @ 2015-11-23  9:18 UTC (permalink / raw)
  To: Chris Wilson, yu.dai, intel-gfx; +Cc: Goel, Akash

On Wed, Nov 11, 2015 at 2:37 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Tue, Nov 10, 2015 at 03:27:36PM -0800, yu.dai@intel.com wrote:
>> From: Alex Dai <yu.dai@intel.com>
>>
>> We keep a copy of GuC fw in a GEM obj. However its content is lost
>> if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
>> fw loading during GPU reset will fail. Mark the obj dirty after
>> copying data into the pages. So its content will be kept during
>> swapped out.
>>
>> Signed-off-by: Alex Dai <yu.dai@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_gem.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>> index f1e3fde..3b15167 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
>>       i915_gem_object_pin_pages(obj);
>>       sg = obj->pages;
>>       bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
>> +     obj->dirty = 1;
>
> That's one way of doing it, but atypical for our CPU access to the pages.
> The root cause is still that sg_copy_from_buffer() isn't calling
> set_page_dirty() and so there will be other places in the kernel that
> fall foul of it. Also, not that we could have written this to not require
> the whole object to be pinned at once as well.
>
> I would prefer this to be fixed in sg_copy_from_buffer() for the reason
> that all callers are susceptible to this bug.
> -Chris

Probably the sg_copy_from_buffer is written with an assumption that
the pages it is supposed to write to,
would always be the kernel pages (GFP_KERNEL), which will always be
resident in RAM. Thus no real need to mark the pages as dirty.
Or the Clients/Users of  sg_copy_from_buffer function are expected to
explicitly mark the pages as dirty.

Best regards
Akash

>
> --
> Chris Wilson, Intel Open Source Technology Centre
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-23  9:18           ` akash goel
@ 2015-11-23  9:37             ` Chris Wilson
  0 siblings, 0 replies; 18+ messages in thread
From: Chris Wilson @ 2015-11-23  9:37 UTC (permalink / raw)
  To: akash goel; +Cc: intel-gfx, Goel, Akash

On Mon, Nov 23, 2015 at 02:48:24PM +0530, akash goel wrote:
> On Wed, Nov 11, 2015 at 2:37 PM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > On Tue, Nov 10, 2015 at 03:27:36PM -0800, yu.dai@intel.com wrote:
> >> From: Alex Dai <yu.dai@intel.com>
> >>
> >> We keep a copy of GuC fw in a GEM obj. However its content is lost
> >> if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
> >> fw loading during GPU reset will fail. Mark the obj dirty after
> >> copying data into the pages. So its content will be kept during
> >> swapped out.
> >>
> >> Signed-off-by: Alex Dai <yu.dai@intel.com>
> >> ---
> >>  drivers/gpu/drm/i915/i915_gem.c | 1 +
> >>  1 file changed, 1 insertion(+)
> >>
> >> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >> index f1e3fde..3b15167 100644
> >> --- a/drivers/gpu/drm/i915/i915_gem.c
> >> +++ b/drivers/gpu/drm/i915/i915_gem.c
> >> @@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
> >>       i915_gem_object_pin_pages(obj);
> >>       sg = obj->pages;
> >>       bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
> >> +     obj->dirty = 1;
> >
> > That's one way of doing it, but atypical for our CPU access to the pages.
> > The root cause is still that sg_copy_from_buffer() isn't calling
> > set_page_dirty() and so there will be other places in the kernel that
> > fall foul of it. Also, not that we could have written this to not require
> > the whole object to be pinned at once as well.
> >
> > I would prefer this to be fixed in sg_copy_from_buffer() for the reason
> > that all callers are susceptible to this bug.
> > -Chris
> 
> Probably the sg_copy_from_buffer is written with an assumption that
> the pages it is supposed to write to,

Seems unlikely considering it is supposed to be a generic library
helper.

> would always be the kernel pages (GFP_KERNEL), which will always be
> resident in RAM. Thus no real need to mark the pages as dirty.
> Or the Clients/Users of  sg_copy_from_buffer function are expected to
> explicitly mark the pages as dirty.

In which case, we shouldn't be using it. So back to making the library
function correct, or using a better interface to write the shmemfs pages
directly (which would avoid the extra clears, or for more points an
internal object which we can write and avoid even the kmalloc through
the generic firmware loader).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-10 23:27       ` [PATCH v1] " yu.dai
  2015-11-11  9:07         ` Chris Wilson
@ 2015-11-24 15:47         ` Dave Gordon
  2015-11-24 16:04           ` Daniel Vetter
  1 sibling, 1 reply; 18+ messages in thread
From: Dave Gordon @ 2015-11-24 15:47 UTC (permalink / raw)
  To: yu.dai, intel-gfx

On 10/11/15 23:27, yu.dai@intel.com wrote:
> From: Alex Dai <yu.dai@intel.com>
>
> We keep a copy of GuC fw in a GEM obj. However its content is lost
> if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
> fw loading during GPU reset will fail. Mark the obj dirty after
> copying data into the pages. So its content will be kept during
> swapped out.
>
> Signed-off-by: Alex Dai <yu.dai@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 1 +
>   1 file changed, 1 insertion(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f1e3fde..3b15167 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
>   	i915_gem_object_pin_pages(obj);
>   	sg = obj->pages;
>   	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
> +	obj->dirty = 1;
>   	i915_gem_object_unpin_pages(obj);
>
>   	if (WARN_ON(bytes != size)) {

Hi,

although I liked this at first, on further checking I've found that 
there are other cases where the CPU writes on a GEM object without 
marking it dirty, so in theory all of those could also be subject to the 
same issue.

Functions that may suffer from this problem include copy_batch(),
relocate_entry_cpu(), and render_state_setup(); hence, the objects that 
might be affected (CPU updates discarded) would in general be batch buffers.

Obviously also i915_gem_object_create_from_data() currently used only 
for firmware objects; and populate_lr_context() also modifies a GEM 
object using the CPU, but it already explicitly marks the object dirty 
and so avoid any problems.

At present, setting an object into the GTT domain with intent-to-write 
causes the object to be marked as dirty, whereas setting it into the CPU 
domain with intent-to-write doesn't. Since the obj->dirty flag doesn't 
mean "CPU and GPU views differ" but rather "In-memory version is newer 
than backing store", setting it should depend only on the 
intent-to-write flag, not on which domain it's being put into.

So my preferred solution would be to set the obj->dirty flag inside 
i915_gem_object_set_to_cpu_domain(obj, true). Then objects that have 
been written by the CPU would be swapped out to backing store rather 
than discarded if the shrinker decides to reclaim the memory.

To make this work properly, we also need to audit all the callers of 
i915_gem_object_set_to_cpu_domain(), because I think in several places 
the callers pass the second argument as true where it should be false; 
this would cause objects to unnecessarily be marked dirty, increasing 
swap requirements and impacting performance.

I'll follow up with an actual patch once people have had an opportunity 
to check that the above analysis is accurate.

.Dave.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-24 15:47         ` Dave Gordon
@ 2015-11-24 16:04           ` Daniel Vetter
  2015-11-24 17:47             ` Dave Gordon
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Vetter @ 2015-11-24 16:04 UTC (permalink / raw)
  To: Dave Gordon; +Cc: intel-gfx

On Tue, Nov 24, 2015 at 03:47:02PM +0000, Dave Gordon wrote:
> On 10/11/15 23:27, yu.dai@intel.com wrote:
> >From: Alex Dai <yu.dai@intel.com>
> >
> >We keep a copy of GuC fw in a GEM obj. However its content is lost
> >if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
> >fw loading during GPU reset will fail. Mark the obj dirty after
> >copying data into the pages. So its content will be kept during
> >swapped out.
> >
> >Signed-off-by: Alex Dai <yu.dai@intel.com>
> >---
> >  drivers/gpu/drm/i915/i915_gem.c | 1 +
> >  1 file changed, 1 insertion(+)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >index f1e3fde..3b15167 100644
> >--- a/drivers/gpu/drm/i915/i915_gem.c
> >+++ b/drivers/gpu/drm/i915/i915_gem.c
> >@@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
> >  	i915_gem_object_pin_pages(obj);
> >  	sg = obj->pages;
> >  	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
> >+	obj->dirty = 1;
> >  	i915_gem_object_unpin_pages(obj);
> >
> >  	if (WARN_ON(bytes != size)) {
> 
> Hi,
> 
> although I liked this at first, on further checking I've found that there
> are other cases where the CPU writes on a GEM object without marking it
> dirty, so in theory all of those could also be subject to the same issue.
> 
> Functions that may suffer from this problem include copy_batch(),
> relocate_entry_cpu(), and render_state_setup(); hence, the objects that
> might be affected (CPU updates discarded) would in general be batch buffers.
> 
> Obviously also i915_gem_object_create_from_data() currently used only for
> firmware objects; and populate_lr_context() also modifies a GEM object using
> the CPU, but it already explicitly marks the object dirty and so avoid any
> problems.
> 
> At present, setting an object into the GTT domain with intent-to-write
> causes the object to be marked as dirty, whereas setting it into the CPU
> domain with intent-to-write doesn't. Since the obj->dirty flag doesn't mean
> "CPU and GPU views differ" but rather "In-memory version is newer than
> backing store", setting it should depend only on the intent-to-write flag,
> not on which domain it's being put into.
> 
> So my preferred solution would be to set the obj->dirty flag inside
> i915_gem_object_set_to_cpu_domain(obj, true). Then objects that have been
> written by the CPU would be swapped out to backing store rather than
> discarded if the shrinker decides to reclaim the memory.
> 
> To make this work properly, we also need to audit all the callers of
> i915_gem_object_set_to_cpu_domain(), because I think in several places the
> callers pass the second argument as true where it should be false; this
> would cause objects to unnecessarily be marked dirty, increasing swap
> requirements and impacting performance.
> 
> I'll follow up with an actual patch once people have had an opportunity to
> check that the above analysis is accurate.

set_to_cpu_domain won't cover everyone, since some callers like
shmem_pwrite/pread do clever tricks with inline flushing.

Also there's a fundamental difference between writing through GTT (where
we always pin all the pages, even though now we do have partial views),
and CPU writes, which in most places access the bo page-by-page.

Given that I'm still slightly leaning towards a "cpu writes need to mark
pages dirty themselves" world-view. At least teaching this to
set_to_cpu_domain won't be a fully effective magic bullet.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-24 16:04           ` Daniel Vetter
@ 2015-11-24 17:47             ` Dave Gordon
  2015-11-24 18:06               ` Daniel Vetter
  0 siblings, 1 reply; 18+ messages in thread
From: Dave Gordon @ 2015-11-24 17:47 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On 24/11/15 16:04, Daniel Vetter wrote:
> On Tue, Nov 24, 2015 at 03:47:02PM +0000, Dave Gordon wrote:
>> On 10/11/15 23:27, yu.dai@intel.com wrote:
>>> From: Alex Dai <yu.dai@intel.com>
>>>
>>> We keep a copy of GuC fw in a GEM obj. However its content is lost
>>> if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
>>> fw loading during GPU reset will fail. Mark the obj dirty after
>>> copying data into the pages. So its content will be kept during
>>> swapped out.
>>>
>>> Signed-off-by: Alex Dai <yu.dai@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gem.c | 1 +
>>>   1 file changed, 1 insertion(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index f1e3fde..3b15167 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
>>>   	i915_gem_object_pin_pages(obj);
>>>   	sg = obj->pages;
>>>   	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
>>> +	obj->dirty = 1;
>>>   	i915_gem_object_unpin_pages(obj);
>>>
>>>   	if (WARN_ON(bytes != size)) {
>>
>> Hi,
>>
>> although I liked this at first, on further checking I've found that there
>> are other cases where the CPU writes on a GEM object without marking it
>> dirty, so in theory all of those could also be subject to the same issue.
>>
>> Functions that may suffer from this problem include copy_batch(),
>> relocate_entry_cpu(), and render_state_setup(); hence, the objects that
>> might be affected (CPU updates discarded) would in general be batch buffers.
>>
>> Obviously also i915_gem_object_create_from_data() currently used only for
>> firmware objects; and populate_lr_context() also modifies a GEM object using
>> the CPU, but it already explicitly marks the object dirty and so avoid any
>> problems.
>>
>> At present, setting an object into the GTT domain with intent-to-write
>> causes the object to be marked as dirty, whereas setting it into the CPU
>> domain with intent-to-write doesn't. Since the obj->dirty flag doesn't mean
>> "CPU and GPU views differ" but rather "In-memory version is newer than
>> backing store", setting it should depend only on the intent-to-write flag,
>> not on which domain it's being put into.
>>
>> So my preferred solution would be to set the obj->dirty flag inside
>> i915_gem_object_set_to_cpu_domain(obj, true). Then objects that have been
>> written by the CPU would be swapped out to backing store rather than
>> discarded if the shrinker decides to reclaim the memory.
>>
>> To make this work properly, we also need to audit all the callers of
>> i915_gem_object_set_to_cpu_domain(), because I think in several places the
>> callers pass the second argument as true where it should be false; this
>> would cause objects to unnecessarily be marked dirty, increasing swap
>> requirements and impacting performance.
>>
>> I'll follow up with an actual patch once people have had an opportunity to
>> check that the above analysis is accurate.
>
> set_to_cpu_domain won't cover everyone, since some callers like
> shmem_pwrite/pread do clever tricks with inline flushing.
>
> Also there's a fundamental difference between writing through GTT (where
> we always pin all the pages, even though now we do have partial views),
> and CPU writes, which in most places access the bo page-by-page.
>
> Given that I'm still slightly leaning towards a "cpu writes need to mark
> pages dirty themselves" world-view. At least teaching this to
> set_to_cpu_domain won't be a fully effective magic bullet.
> -Daniel

shmem_pwrite doesn't call set_to_cpu_domain, and it already explicitly 
marks the GEM object as dirty; shmem_pread obviously doesn't need to as 
it's a read operation and doesn't change the object contents.

There's no difference (as far as backing store is concerned) whether the 
object is written via a CPU mapping, a GTT mapping, or by the GPU; the 
end result is that the object in memory is different from the version on 
backing store, and the object should be marked dirty if we want the 
changes to persist across eviction.

There may indeed be additional functions that modify GEM objects in some 
way without first setting them into the CPU domain, in which case they 
will also be responsible for marking the objects dirty. But all the 
(correct) callers of i915_gem_object_set_to_cpu_domain(obj, true) follow 
it by writing on the object, and only one marks the object as dirty, so 
all others are potential victims of having the shrinker discard their 
uncommitted changes.

Fixing it here (i.e when mapping whole objects for CPU write access) 
means that object-level operations then don't have to worry about 
page-level details. Functions that want to be smarter about tracking 
updates at the level of individual pages could still do so, but I don't 
want to impose that requirement on code that just wants to manipulate 
objects.

BTW, I note that the different implementations of put_pages() are not 
consistent about how they treat dirty pages within objects not marked as 
dirty. Does this matter?

.Dave.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-24 17:47             ` Dave Gordon
@ 2015-11-24 18:06               ` Daniel Vetter
  2015-11-24 23:01                 ` Chris Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Vetter @ 2015-11-24 18:06 UTC (permalink / raw)
  To: Dave Gordon; +Cc: intel-gfx

On Tue, Nov 24, 2015 at 05:47:08PM +0000, Dave Gordon wrote:
> On 24/11/15 16:04, Daniel Vetter wrote:
> >On Tue, Nov 24, 2015 at 03:47:02PM +0000, Dave Gordon wrote:
> >>On 10/11/15 23:27, yu.dai@intel.com wrote:
> >>>From: Alex Dai <yu.dai@intel.com>
> >>>
> >>>We keep a copy of GuC fw in a GEM obj. However its content is lost
> >>>if the GEM obj is swapped (igt/gem_evict_*). Therefore, the later
> >>>fw loading during GPU reset will fail. Mark the obj dirty after
> >>>copying data into the pages. So its content will be kept during
> >>>swapped out.
> >>>
> >>>Signed-off-by: Alex Dai <yu.dai@intel.com>
> >>>---
> >>>  drivers/gpu/drm/i915/i915_gem.c | 1 +
> >>>  1 file changed, 1 insertion(+)
> >>>
> >>>diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >>>index f1e3fde..3b15167 100644
> >>>--- a/drivers/gpu/drm/i915/i915_gem.c
> >>>+++ b/drivers/gpu/drm/i915/i915_gem.c
> >>>@@ -5199,6 +5199,7 @@ i915_gem_object_create_from_data(struct drm_device *dev,
> >>>  	i915_gem_object_pin_pages(obj);
> >>>  	sg = obj->pages;
> >>>  	bytes = sg_copy_from_buffer(sg->sgl, sg->nents, (void *)data, size);
> >>>+	obj->dirty = 1;
> >>>  	i915_gem_object_unpin_pages(obj);
> >>>
> >>>  	if (WARN_ON(bytes != size)) {
> >>
> >>Hi,
> >>
> >>although I liked this at first, on further checking I've found that there
> >>are other cases where the CPU writes on a GEM object without marking it
> >>dirty, so in theory all of those could also be subject to the same issue.
> >>
> >>Functions that may suffer from this problem include copy_batch(),
> >>relocate_entry_cpu(), and render_state_setup(); hence, the objects that
> >>might be affected (CPU updates discarded) would in general be batch buffers.
> >>
> >>Obviously also i915_gem_object_create_from_data() currently used only for
> >>firmware objects; and populate_lr_context() also modifies a GEM object using
> >>the CPU, but it already explicitly marks the object dirty and so avoid any
> >>problems.
> >>
> >>At present, setting an object into the GTT domain with intent-to-write
> >>causes the object to be marked as dirty, whereas setting it into the CPU
> >>domain with intent-to-write doesn't. Since the obj->dirty flag doesn't mean
> >>"CPU and GPU views differ" but rather "In-memory version is newer than
> >>backing store", setting it should depend only on the intent-to-write flag,
> >>not on which domain it's being put into.
> >>
> >>So my preferred solution would be to set the obj->dirty flag inside
> >>i915_gem_object_set_to_cpu_domain(obj, true). Then objects that have been
> >>written by the CPU would be swapped out to backing store rather than
> >>discarded if the shrinker decides to reclaim the memory.
> >>
> >>To make this work properly, we also need to audit all the callers of
> >>i915_gem_object_set_to_cpu_domain(), because I think in several places the
> >>callers pass the second argument as true where it should be false; this
> >>would cause objects to unnecessarily be marked dirty, increasing swap
> >>requirements and impacting performance.
> >>
> >>I'll follow up with an actual patch once people have had an opportunity to
> >>check that the above analysis is accurate.
> >
> >set_to_cpu_domain won't cover everyone, since some callers like
> >shmem_pwrite/pread do clever tricks with inline flushing.
> >
> >Also there's a fundamental difference between writing through GTT (where
> >we always pin all the pages, even though now we do have partial views),
> >and CPU writes, which in most places access the bo page-by-page.
> >
> >Given that I'm still slightly leaning towards a "cpu writes need to mark
> >pages dirty themselves" world-view. At least teaching this to
> >set_to_cpu_domain won't be a fully effective magic bullet.
> >-Daniel
> 
> shmem_pwrite doesn't call set_to_cpu_domain, and it already explicitly marks
> the GEM object as dirty; shmem_pread obviously doesn't need to as it's a
> read operation and doesn't change the object contents.
> 
> There's no difference (as far as backing store is concerned) whether the
> object is written via a CPU mapping, a GTT mapping, or by the GPU; the end
> result is that the object in memory is different from the version on backing
> store, and the object should be marked dirty if we want the changes to
> persist across eviction.
> 
> There may indeed be additional functions that modify GEM objects in some way
> without first setting them into the CPU domain, in which case they will also
> be responsible for marking the objects dirty. But all the (correct) callers
> of i915_gem_object_set_to_cpu_domain(obj, true) follow it by writing on the
> object, and only one marks the object as dirty, so all others are potential
> victims of having the shrinker discard their uncommitted changes.
> 
> Fixing it here (i.e when mapping whole objects for CPU write access) means
> that object-level operations then don't have to worry about page-level
> details. Functions that want to be smarter about tracking updates at the
> level of individual pages could still do so, but I don't want to impose that
> requirement on code that just wants to manipulate objects.

It's still not entirely equivalent: With GTT writes it's impossible to
write through the GTT with get_pages having been called, which means
someone eventually will do the put_pages. But with CPU writes you can do a
set_to_cpu_domain just for flushing, and then write through some
completely different path. Just setting obj->dirty only works if you also
have the pages.

Which means moving the obj->dirty = true into set_to_cpu_domain won't be a
solution that works everywhere, but it will work mostly. That's a bit
fragile. For create_from_data it will work perfectly, and if we put the
set_to_cpu_domain after get_pages it's even obvious that this is so. But
since it's fundamentally fragile I don't like it that much.

But it's also not awesome that set_to_gtt_domain does this for callers.

For lack of clear solutions I'd go with sprinkling obj->dirty or
page_set_dirty over callers. Aside: relocate_entry_cpu probably gets away
because of the unconditional obj->dirty we do later on, and that we redo
all relocs if a fault happens. Still would be good to fix it, just for
safety.

> BTW, I note that the different implementations of put_pages() are not
> consistent about how they treat dirty pages within objects not marked as
> dirty. Does this matter?

Just checked them. All the struct page backed ones handle it
(shmem+userptr) all the others (dma-buf+stolen) don't. Seems consistent.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-24 18:06               ` Daniel Vetter
@ 2015-11-24 23:01                 ` Chris Wilson
  2015-11-25  8:40                   ` Daniel Vetter
  0 siblings, 1 reply; 18+ messages in thread
From: Chris Wilson @ 2015-11-24 23:01 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Tue, Nov 24, 2015 at 07:06:21PM +0100, Daniel Vetter wrote:
> Just setting obj->dirty only works if you also have the pages.

Exactly. The CPU access has historically always been page-by-page. The
style here more or less to emulate the CPU mmap.
 
> But it's also not awesome that set_to_gtt_domain does this for callers.

Hmm, do you have an example where we want set-to-gtt(write), but not
actually write through the backing storage? Internal use of set-to-gtt
has never been ideal (e.g. context) but we haven't yet come up with a
better semantic.
 
> For lack of clear solutions I'd go with sprinkling obj->dirty or
> page_set_dirty over callers. Aside: relocate_entry_cpu probably gets away
> because of the unconditional obj->dirty we do later on, and that we redo
> all relocs if a fault happens. Still would be good to fix it, just for
> safety.

[copy_batch() isn't a bug as the contents are invalidated after use
anyway]

relocate_entry_cpu() is a bug we never caught. Indeed we've papered over
it to mask some over userspace issues, but just adding the set_page_dirty()
as required isn't going to be a big hardship.

We have tons of swapthrash tests to check persistency of GPU buffers,
but we never tried to thrash the batches themselves out to swap and then
reuse them.

I guess that it is because userspace doesn't reuse batches that we never
had report of the issue. Hibernating would be a good exercise of such.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-24 23:01                 ` Chris Wilson
@ 2015-11-25  8:40                   ` Daniel Vetter
  2015-11-25  9:02                     ` Chris Wilson
  0 siblings, 1 reply; 18+ messages in thread
From: Daniel Vetter @ 2015-11-25  8:40 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, Dave Gordon, intel-gfx

On Tue, Nov 24, 2015 at 11:01:25PM +0000, Chris Wilson wrote:
> On Tue, Nov 24, 2015 at 07:06:21PM +0100, Daniel Vetter wrote:
> > Just setting obj->dirty only works if you also have the pages.
> 
> Exactly. The CPU access has historically always been page-by-page. The
> style here more or less to emulate the CPU mmap.
>  
> > But it's also not awesome that set_to_gtt_domain does this for callers.
> 
> Hmm, do you have an example where we want set-to-gtt(write), but not
> actually write through the backing storage? Internal use of set-to-gtt
> has never been ideal (e.g. context) but we haven't yet come up with a
> better semantic.

Just the inconsistency that Dave pointed out is a bit worrisome. At most
we can fix this with docs (which atm we have), which gives us a rather low
score on API design (still a positive one still it's possible to get
right). I agree that I don't have better semantics either.

> > For lack of clear solutions I'd go with sprinkling obj->dirty or
> > page_set_dirty over callers. Aside: relocate_entry_cpu probably gets away
> > because of the unconditional obj->dirty we do later on, and that we redo
> > all relocs if a fault happens. Still would be good to fix it, just for
> > safety.
> 
> [copy_batch() isn't a bug as the contents are invalidated after use
> anyway]

Just for consistency adding the obj->dirty after get_pages won't hurt
though.

> relocate_entry_cpu() is a bug we never caught. Indeed we've papered over
> it to mask some over userspace issues, but just adding the set_page_dirty()
> as required isn't going to be a big hardship.

Yeah.

> We have tons of swapthrash tests to check persistency of GPU buffers,
> but we never tried to thrash the batches themselves out to swap and then
> reuse them.
> 
> I guess that it is because userspace doesn't reuse batches that we never
> had report of the issue. Hibernating would be a good exercise of such.

Hm it's not just batches but any object with relocs. Could this explain
the oddball libva/uxa hang? Stuff like "after playing $game for hours my
desktop looked funny", but not for tiling issues.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-25  8:40                   ` Daniel Vetter
@ 2015-11-25  9:02                     ` Chris Wilson
  2015-11-25  9:59                       ` Daniel Vetter
  0 siblings, 1 reply; 18+ messages in thread
From: Chris Wilson @ 2015-11-25  9:02 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Wed, Nov 25, 2015 at 09:40:45AM +0100, Daniel Vetter wrote:
> Hm it's not just batches but any object with relocs. Could this explain
> the oddball libva/uxa hang? Stuff like "after playing $game for hours my
> desktop looked funny", but not for tiling issues.

Possible, but with libva having its own issues with not marking GPU
writes, only time will tell. I say batches because in modern code, only
the batch has the reloc. In UXA and mesa, even the auxiliary buffers with
the relocs are reallocated with each batch.

There's only one swap related corruption issue on gen4, for which we
also know the machine had bad swizzle detection, and there is another
swap related bug on gen2, but neither are actually susceptible to this
bug. I don't have any strong candidates for an eureka moment.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v1] drm/i915/guc: Fix a fw content lost issue after it is evicted
  2015-11-25  9:02                     ` Chris Wilson
@ 2015-11-25  9:59                       ` Daniel Vetter
  0 siblings, 0 replies; 18+ messages in thread
From: Daniel Vetter @ 2015-11-25  9:59 UTC (permalink / raw)
  To: Chris Wilson, Daniel Vetter, Dave Gordon, intel-gfx

On Wed, Nov 25, 2015 at 09:02:20AM +0000, Chris Wilson wrote:
> On Wed, Nov 25, 2015 at 09:40:45AM +0100, Daniel Vetter wrote:
> > Hm it's not just batches but any object with relocs. Could this explain
> > the oddball libva/uxa hang? Stuff like "after playing $game for hours my
> > desktop looked funny", but not for tiling issues.
> 
> Possible, but with libva having its own issues with not marking GPU
> writes, only time will tell. I say batches because in modern code, only
> the batch has the reloc. In UXA and mesa, even the auxiliary buffers with
> the relocs are reallocated with each batch.

Ah, I didn't realize/forgot that UXA doesn't reuse the indirect state.

> There's only one swap related corruption issue on gen4, for which we
> also know the machine had bad swizzle detection, and there is another
> swap related bug on gen2, but neither are actually susceptible to this
> bug. I don't have any strong candidates for an eureka moment.

Was just a thought really.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2015-11-25  9:59 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-06 21:55 [PATCH] drm/i915/guc: Fix a fw content lost issue after it is evicted yu.dai
2015-11-06 22:07 ` Chris Wilson
2015-11-06 23:18   ` Yu Dai
2015-11-09 10:15     ` Chris Wilson
2015-11-10 23:27       ` [PATCH v1] " yu.dai
2015-11-11  9:07         ` Chris Wilson
2015-11-11 19:01           ` Yu Dai
2015-11-23  9:18           ` akash goel
2015-11-23  9:37             ` Chris Wilson
2015-11-24 15:47         ` Dave Gordon
2015-11-24 16:04           ` Daniel Vetter
2015-11-24 17:47             ` Dave Gordon
2015-11-24 18:06               ` Daniel Vetter
2015-11-24 23:01                 ` Chris Wilson
2015-11-25  8:40                   ` Daniel Vetter
2015-11-25  9:02                     ` Chris Wilson
2015-11-25  9:59                       ` Daniel Vetter
2015-11-10 23:29       ` [PATCH] " Yu Dai

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.