From: Chris Wilson <chris@chris-wilson.co.uk>
To: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>,
	intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH 03/22] drm/i915/gem: Implement legacy MI_STORE_DATA_IMM
Date: Mon, 04 May 2020 14:08:56 +0100	[thread overview]
Message-ID: <158859773631.11660.433515374088951309@build.alporthouse.com> (raw)
In-Reply-To: <f4536bc0-bc3c-0762-0334-69f03df7194a@linux.intel.com>

Quoting Tvrtko Ursulin (2020-05-04 13:58:46)
> 
> On 04/05/2020 05:48, Chris Wilson wrote:
> > +static bool can_store_dword(const struct intel_engine_cs *engine)
> > +{
> > +     return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> > +}
> 
> A bit confusing to a future reader how it differs from
> intel_engine_can_store_dword, but apart from adding a comment I don't
> have any better ideas.

can_use_engine_for_reloc_without_killing_the_machine()

if (!engine_can_reloc_gpu()) ?
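Plus a comment, say; an untested sketch (the name is just a placeholder):

	/*
	 * Unlike intel_engine_can_store_dword(), which only reports whether
	 * the command exists at all, here we must also avoid the snb video
	 * decode engine, where issuing MI_STORE_DATA_IMM from a user batch
	 * risks hanging the machine.
	 */
	static bool engine_can_reloc_gpu(const struct intel_engine_cs *engine)
	{
		return engine->class != VIDEO_DECODE_CLASS ||
		       !IS_GEN(engine->i915, 6);
	}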

> 
> > +
> >   static u32 *reloc_gpu(struct i915_execbuffer *eb,
> >                     struct i915_vma *vma,
> >                     unsigned int len)
> > @@ -1387,9 +1392,9 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> >       if (unlikely(!cache->rq)) {
> >               struct intel_engine_cs *engine = eb->engine;
> >   
> > -             if (!intel_engine_can_store_dword(engine)) {
> > +             if (!can_store_dword(engine)) {
> >                       engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> > -                     if (!engine || !intel_engine_can_store_dword(engine))
> > +                     if (!engine)
> >                               return ERR_PTR(-ENODEV);
> >               }
> >   
> > @@ -1435,91 +1440,138 @@ static inline bool use_reloc_gpu(struct i915_vma *vma)
> >       return !dma_resv_test_signaled_rcu(vma->resv, true);
> >   }
> >   
> > -static u64
> > -relocate_entry(struct i915_vma *vma,
> > -            const struct drm_i915_gem_relocation_entry *reloc,
> > -            struct i915_execbuffer *eb,
> > -            const struct i915_vma *target)
> > +static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> >   {
> > -     u64 offset = reloc->offset;
> > -     u64 target_offset = relocation_target(reloc, target);
> > -     bool wide = eb->reloc_cache.use_64bit_reloc;
> > -     void *vaddr;
> > +     struct page *page;
> > +     unsigned long addr;
> >   
> > -     if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) {
> > -             const unsigned int gen = eb->reloc_cache.gen;
> > -             unsigned int len;
> > -             u32 *batch;
> > -             u64 addr;
> > +     GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
> >   
> > -             if (wide)
> > -                     len = offset & 7 ? 8 : 5;
> > -             else if (gen >= 4)
> > -                     len = 4;
> > -             else
> > -                     len = 3;
> > +     page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
> > +     addr = PFN_PHYS(page_to_pfn(page));
> > +     GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
> >   
> > -             batch = reloc_gpu(eb, vma, len);
> > -             if (IS_ERR(batch))
> > -                     goto repeat;
> > +     return addr + offset_in_page(offset);
> > +}
> > +
> > +static bool __reloc_entry_gpu(struct i915_vma *vma,
> > +                           struct i915_execbuffer *eb,
> > +                           u64 offset,
> > +                           u64 target_addr)
> > +{
> > +     const unsigned int gen = eb->reloc_cache.gen;
> > +     unsigned int len;
> > +     u32 *batch;
> > +     u64 addr;
> > +
> > +     if (gen >= 8)
> > +             len = offset & 7 ? 8 : 5;
> 
> This used to be driven off eb->reloc_cache.use_64bit_reloc; is there
> any practical effect to the change? Doesn't seem to be... Should you
> still use it for consistency, though?

It used to be that way because we had already pulled it into the local
'wide'. I switched to gen here for consistency with the if-else ladder.
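
To illustrate, the alternative (equivalent per above, since
use_64bit_reloc matches gen >= 8, but mixing predicates within one
ladder; sketch only):

	if (eb->reloc_cache.use_64bit_reloc) /* same condition as gen >= 8 */
		len = offset & 7 ? 8 : 5;
	else if (gen >= 4)
		len = 4;
	else
		len = 3;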


> 
> > +     else if (gen >= 4)
> > +             len = 4;
> > +     else
> > +             len = 3;
> > +
> > +     batch = reloc_gpu(eb, vma, len);
> > +     if (IS_ERR(batch))
> > +             return false;
> > +
> > +     addr = gen8_canonical_addr(vma->node.start + offset);
> > +     if (gen >= 8) {
> > +             if (offset & 7) {
> > +                     *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > +                     *batch++ = lower_32_bits(addr);
> > +                     *batch++ = upper_32_bits(addr);
> > +                     *batch++ = lower_32_bits(target_addr);
> > +
> > +                     addr = gen8_canonical_addr(addr + 4);
> >   
> > -             addr = gen8_canonical_addr(vma->node.start + offset);
> > -             if (wide) {
> > -                     if (offset & 7) {
> > -                             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > -                             *batch++ = lower_32_bits(addr);
> > -                             *batch++ = upper_32_bits(addr);
> > -                             *batch++ = lower_32_bits(target_offset);
> > -
> > -                             addr = gen8_canonical_addr(addr + 4);
> > -
> > -                             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > -                             *batch++ = lower_32_bits(addr);
> > -                             *batch++ = upper_32_bits(addr);
> > -                             *batch++ = upper_32_bits(target_offset);
> > -                     } else {
> > -                             *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > -                             *batch++ = lower_32_bits(addr);
> > -                             *batch++ = upper_32_bits(addr);
> > -                             *batch++ = lower_32_bits(target_offset);
> > -                             *batch++ = upper_32_bits(target_offset);
> > -                     }
> > -             } else if (gen >= 6) {
> >                       *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > -                     *batch++ = 0;
> > -                     *batch++ = addr;
> > -                     *batch++ = target_offset;
> > -             } else if (gen >= 4) {
> > -                     *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > -                     *batch++ = 0;
> > -                     *batch++ = addr;
> > -                     *batch++ = target_offset;
> > +                     *batch++ = lower_32_bits(addr);
> > +                     *batch++ = upper_32_bits(addr);
> > +                     *batch++ = upper_32_bits(target_addr);
> >               } else {
> > -                     *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > -                     *batch++ = addr;
> > -                     *batch++ = target_offset;
> > +                     *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > +                     *batch++ = lower_32_bits(addr);
> > +                     *batch++ = upper_32_bits(addr);
> > +                     *batch++ = lower_32_bits(target_addr);
> > +                     *batch++ = upper_32_bits(target_addr);
> >               }
> > -
> > -             goto out;
> > +     } else if (gen >= 6) {
> > +             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > +             *batch++ = 0;
> > +             *batch++ = addr;
> > +             *batch++ = target_addr;
> > +     } else if (IS_I965G(eb->i915)) {
> > +             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > +             *batch++ = 0;
> > +             *batch++ = vma_phys_addr(vma, offset);
> > +             *batch++ = target_addr;
> > +     } else if (gen >= 4) {
> > +             *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > +             *batch++ = 0;
> > +             *batch++ = addr;
> > +             *batch++ = target_addr;
> > +     } else if (gen >= 3 &&
> > +                !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
> > +             *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > +             *batch++ = addr;
> > +             *batch++ = target_addr;
> > +     } else {
> > +             *batch++ = MI_STORE_DWORD_IMM;
> > +             *batch++ = vma_phys_addr(vma, offset);
> > +             *batch++ = target_addr;
> >       }
> >   
> > +     return true;
> > +}
> > +
> > +static bool reloc_entry_gpu(struct i915_vma *vma,
> > +                         struct i915_execbuffer *eb,
> > +                         u64 offset,
> > +                         u64 target_addr)
> > +{
> > +     if (eb->reloc_cache.vaddr)
> > +             return false;
> > +
> > +     if (!use_reloc_gpu(vma))
> > +             return false;
> > +
> > +     return __reloc_entry_gpu(vma, eb, offset, target_addr);
> > +}
> > +
> > +static u64
> > +relocate_entry(struct i915_vma *vma,
> > +            const struct drm_i915_gem_relocation_entry *reloc,
> > +            struct i915_execbuffer *eb,
> > +            const struct i915_vma *target)
> > +{
> > +     u64 target_addr = relocation_target(reloc, target);
> > +     u64 offset = reloc->offset;
> > +
> > +     if (!reloc_entry_gpu(vma, eb, offset, target_addr)) {
> > +             bool wide = eb->reloc_cache.use_64bit_reloc;
> > +             void *vaddr;
> > +
> >   repeat:
> > -     vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
> > -     if (IS_ERR(vaddr))
> > -             return PTR_ERR(vaddr);
> > +             vaddr = reloc_vaddr(vma->obj,
> > +                                 &eb->reloc_cache,
> > +                                 offset >> PAGE_SHIFT);
> > +             if (IS_ERR(vaddr))
> > +                     return PTR_ERR(vaddr);
> >   
> > -     clflush_write32(vaddr + offset_in_page(offset),
> > -                     lower_32_bits(target_offset),
> > -                     eb->reloc_cache.vaddr);
> > +             GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> > +             clflush_write32(vaddr + offset_in_page(offset),
> > +                             lower_32_bits(target_addr),
> > +                             eb->reloc_cache.vaddr);
> >   
> > -     if (wide) {
> > -             offset += sizeof(u32);
> > -             target_offset >>= 32;
> > -             wide = false;
> > -             goto repeat;
> > +             if (wide) {
> > +                     offset += sizeof(u32);
> > +                     target_addr >>= 32;
> > +                     wide = false;
> > +                     goto repeat;
> > +             }
> >       }
> >   
> > -out:
> >       return target->node.start | UPDATE;
> >   }
> >   
> > @@ -3022,3 +3074,7 @@ end:;
> >       kvfree(exec2_list);
> >       return err;
> >   }
> 
> The diff is a bit messy but looks okay and in CI we trust.
> 
> > +
> > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > +#include "selftests/i915_gem_execbuffer.c"
> > +#endif
> > diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > new file mode 100644
> > index 000000000000..985f9fbd0ba0
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > @@ -0,0 +1,206 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2020 Intel Corporation
> > + */
> > +
> > +#include "i915_selftest.h"
> > +
> > +#include "gt/intel_engine_pm.h"
> > +#include "selftests/igt_flush_test.h"
> > +
> > +static void hexdump(const void *buf, size_t len)
> 
> Uh-oh, this seems to be the third copy! ;)

* jedi handwave

Yeah, I'm close to pulling them into i915_selftests.c as pr_hexdump(). A
certain Tvrtko might one day win his argument to land the wrapper in lib/.
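
Something like (name and home hypothetical; a thin wrapper over
print_hex_dump(), untested):

	static void pr_hexdump(const void *buf, size_t len)
	{
		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET,
			       16, 4, buf, len, false);
	}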

> > +static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> > +                        struct drm_i915_gem_object *obj)
> > +{
> > +     enum {
> > +             X = 0,
> > +             Y = 3,
> > +             Z = 8
> > +     };
> > +     const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);
> 
> Is the mask there to remove the poison? Should this be use_64bit_reloc
> instead of gen, or is this more correct?

Might as well just use use_64bit_reloc. I suppose doing a manual check
against gen might help us notice a use_64bit_reloc slip?
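
i.e. (untested):

	const u64 mask =
		GENMASK_ULL(eb->reloc_cache.use_64bit_reloc ? 63 : 31, 0);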

> 
> > +     const u32 *map = page_mask_bits(obj->mm.mapping);
> > +     struct i915_request *rq;
> > +     struct i915_vma *vma;
> > +     int inc;
> > +     int err;
> > +
> > +     vma = i915_vma_instance(obj, eb->context->vm, NULL);
> > +     if (IS_ERR(vma))
> > +             return PTR_ERR(vma);
> > +
> > +     err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> > +     if (err)
> > +             return err;
> > +
> > +     /* 8-Byte aligned */
> > +     if (!__reloc_entry_gpu(vma, eb, X * sizeof(u32), X)) {
> > +             err = -EIO;
> > +             goto unpin_vma;
> > +     }
> > +
> > +     /* !8-Byte aligned */
> 
> What is the significance of the non-8-byte-aligned case?

That's the if (offset & 7) path in the gen8 code. If the offset is
8-byte aligned we can do a single QWord MI_STORE_DATA_IMM; if it is only
4-byte aligned, we need two MI_STORE_DATA_IMMs.
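
Schematically, matching the hunk above (lo/hi shorthand for
lower/upper_32_bits):

	/* offset & 7 == 0: one qword store, 5 dwords of batch */
	(MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1,
		lo(addr), hi(addr), lo(val), hi(val)

	/* offset & 7 != 0: two dword stores, 8 dwords of batch */
	MI_STORE_DWORD_IMM_GEN4, lo(addr), hi(addr), lo(val)
	MI_STORE_DWORD_IMM_GEN4, lo(addr + 4), hi(addr + 4), hi(val)

which is also where the len = offset & 7 ? 8 : 5 above comes from.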

> 
> > +     if (!__reloc_entry_gpu(vma, eb, Y * sizeof(u32), Y)) {
> > +             err = -EIO;
> > +             goto unpin_vma;
> > +     }
> > +
> > +     /* Skip to the end of the cmd page */
> > +     inc = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> > +     inc -= eb->reloc_cache.rq_size;
> > +     memset(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size,
> > +            0, inc * sizeof(u32));
> > +     eb->reloc_cache.rq_size += inc;
> > +
> > +     /* Force batch chaining */
> > +     if (!__reloc_entry_gpu(vma, eb, Z * sizeof(u32), Z)) {
> > +             err = -EIO;
> > +             goto unpin_vma;
> > +     }
> > +
> > +     GEM_BUG_ON(!eb->reloc_cache.rq);
> > +     rq = i915_request_get(eb->reloc_cache.rq);
> > +     err = reloc_gpu_flush(&eb->reloc_cache);
> > +     if (err)
> > +             goto put_rq;
> > +     GEM_BUG_ON(eb->reloc_cache.rq);
> > +
> > +     err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
> > +     if (err) {
> > +             intel_gt_set_wedged(eb->engine->gt);
> > +             goto put_rq;
> > +     }
> > +
> > +     if (!i915_request_completed(rq)) {
> > +             pr_err("%s: did not wait for relocations!\n", eb->engine->name);
> > +             err = -EINVAL;
> > +             goto put_rq;
> > +     }
> > +
> > +     if (read_reloc(map, X, mask) != X) {
> > +             pr_err("%s[X]: map[%d] %llx != %x\n",
> > +                    eb->engine->name, X, read_reloc(map, X, mask), X);
> > +             err = -EINVAL;
> > +     }
> > +
> > +     if (read_reloc(map, Y, mask) != Y) {
> > +             pr_err("%s[Y]: map[%d] %llx != %x\n",
> > +                    eb->engine->name, Y, read_reloc(map, Y, mask), Y);
> > +             err = -EINVAL;
> > +     }
> > +
> > +     if (read_reloc(map, Z, mask) != Z) {
> > +             pr_err("%s[Z]: map[%d] %llx != %x\n",
> > +                    eb->engine->name, Z, read_reloc(map, Z, mask), Z);
> > +             err = -EINVAL;
> > +     }
> 
> Why couldn't this just be an array of [0, 3, 8] instead of enums, so
> that all these tests could be a single loop? I can't figure out what
> the benefit of the enum is, in other words. Okay, in the test phase it
> can't be a simple loop since it needs the special case for the last
> iteration, but here it could be.

If you saw how far this came from the first version... I just liked X,
Y, Z as labels.
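
For the record, the read-back half could indeed fold into a loop,
something like (untested):

	static const int idx[] = { X, Y, Z };
	int i;

	for (i = 0; i < ARRAY_SIZE(idx); i++) {
		u64 v = read_reloc(map, idx[i], mask);

		if (v != idx[i]) {
			pr_err("%s: map[%d] %llx != %x\n",
			       eb->engine->name, idx[i], v, idx[i]);
			err = -EINVAL;
		}
	}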
-Chris
