From: Chris Wilson <chris@chris-wilson.co.uk>
To: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>,
	intel-gfx@lists.freedesktop.org
Subject: Re: [Intel-gfx] [PATCH 03/22] drm/i915/gem: Implement legacy MI_STORE_DATA_IMM
Date: Mon, 04 May 2020 14:08:56 +0100	[thread overview]
Message-ID: <158859773631.11660.433515374088951309@build.alporthouse.com> (raw)
In-Reply-To: <f4536bc0-bc3c-0762-0334-69f03df7194a@linux.intel.com>

Quoting Tvrtko Ursulin (2020-05-04 13:58:46)
> 
> On 04/05/2020 05:48, Chris Wilson wrote:
> > +static bool can_store_dword(const struct intel_engine_cs *engine)
> > +{
> > +     return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> > +}
> 
> A bit confusing to a future reader how it differs from
> intel_engine_can_store_dword, but apart from adding a comment I don't
> have any better ideas.

can_use_engine_for_reloc_without_killing_the_machine()

if (!engine_can_reloc_gpu()) ?
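Plus a comment, say; an untested sketch (the name is just a placeholder):

	/*
	 * Unlike intel_engine_can_store_dword(), which only reports whether
	 * the command exists at all, here we must also avoid the snb video
	 * decode engine, where issuing MI_STORE_DATA_IMM from a user batch
	 * risks hanging the machine.
	 */
	static bool engine_can_reloc_gpu(const struct intel_engine_cs *engine)
	{
		return engine->class != VIDEO_DECODE_CLASS ||
		       !IS_GEN(engine->i915, 6);
	}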

> 
> > +
> >   static u32 *reloc_gpu(struct i915_execbuffer *eb,
> >                     struct i915_vma *vma,
> >                     unsigned int len)
> > @@ -1387,9 +1392,9 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> >       if (unlikely(!cache->rq)) {
> >               struct intel_engine_cs *engine = eb->engine;
> >   
> > -             if (!intel_engine_can_store_dword(engine)) {
> > +             if (!can_store_dword(engine)) {
> >                       engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> > -                     if (!engine || !intel_engine_can_store_dword(engine))
> > +                     if (!engine)
> >                               return ERR_PTR(-ENODEV);
> >               }
> >   
> > @@ -1435,91 +1440,138 @@ static inline bool use_reloc_gpu(struct i915_vma *vma)
> >       return !dma_resv_test_signaled_rcu(vma->resv, true);
> >   }
> >   
> > -static u64
> > -relocate_entry(struct i915_vma *vma,
> > -            const struct drm_i915_gem_relocation_entry *reloc,
> > -            struct i915_execbuffer *eb,
> > -            const struct i915_vma *target)
> > +static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> >   {
> > -     u64 offset = reloc->offset;
> > -     u64 target_offset = relocation_target(reloc, target);
> > -     bool wide = eb->reloc_cache.use_64bit_reloc;
> > -     void *vaddr;
> > +     struct page *page;
> > +     unsigned long addr;
> >   
> > -     if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) {
> > -             const unsigned int gen = eb->reloc_cache.gen;
> > -             unsigned int len;
> > -             u32 *batch;
> > -             u64 addr;
> > +     GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
> >   
> > -             if (wide)
> > -                     len = offset & 7 ? 8 : 5;
> > -             else if (gen >= 4)
> > -                     len = 4;
> > -             else
> > -                     len = 3;
> > +     page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
> > +     addr = PFN_PHYS(page_to_pfn(page));
> > +     GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
> >   
> > -             batch = reloc_gpu(eb, vma, len);
> > -             if (IS_ERR(batch))
> > -                     goto repeat;
> > +     return addr + offset_in_page(offset);
> > +}
> > +
> > +static bool __reloc_entry_gpu(struct i915_vma *vma,
> > +                           struct i915_execbuffer *eb,
> > +                           u64 offset,
> > +                           u64 target_addr)
> > +{
> > +     const unsigned int gen = eb->reloc_cache.gen;
> > +     unsigned int len;
> > +     u32 *batch;
> > +     u64 addr;
> > +
> > +     if (gen >= 8)
> > +             len = offset & 7 ? 8 : 5;
> 
> This used to be driven off eb->reloc_cache.use_64bit_reloc; is there
> any practical effect to the change? Doesn't seem to be... Should you
> still use it for consistency, though?

It used to be that way because we had already pulled it into the local
'wide'. I switched to gen here for consistency with the if-else ladder.
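
To illustrate, the alternative (equivalent per above, since
use_64bit_reloc matches gen >= 8, but mixing predicates within one
ladder; sketch only):

	if (eb->reloc_cache.use_64bit_reloc) /* same condition as gen >= 8 */
		len = offset & 7 ? 8 : 5;
	else if (gen >= 4)
		len = 4;
	else
		len = 3;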


> 
> > +     else if (gen >= 4)
> > +             len = 4;
> > +     else
> > +             len = 3;
> > +
> > +     batch = reloc_gpu(eb, vma, len);
> > +     if (IS_ERR(batch))
> > +             return false;
> > +
> > +     addr = gen8_canonical_addr(vma->node.start + offset);
> > +     if (gen >= 8) {
> > +             if (offset & 7) {
> > +                     *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > +                     *batch++ = lower_32_bits(addr);
> > +                     *batch++ = upper_32_bits(addr);
> > +                     *batch++ = lower_32_bits(target_addr);
> > +
> > +                     addr = gen8_canonical_addr(addr + 4);
> >   
> > -             addr = gen8_canonical_addr(vma->node.start + offset);
> > -             if (wide) {
> > -                     if (offset & 7) {
> > -                             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > -                             *batch++ = lower_32_bits(addr);
> > -                             *batch++ = upper_32_bits(addr);
> > -                             *batch++ = lower_32_bits(target_offset);
> > -
> > -                             addr = gen8_canonical_addr(addr + 4);
> > -
> > -                             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > -                             *batch++ = lower_32_bits(addr);
> > -                             *batch++ = upper_32_bits(addr);
> > -                             *batch++ = upper_32_bits(target_offset);
> > -                     } else {
> > -                             *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > -                             *batch++ = lower_32_bits(addr);
> > -                             *batch++ = upper_32_bits(addr);
> > -                             *batch++ = lower_32_bits(target_offset);
> > -                             *batch++ = upper_32_bits(target_offset);
> > -                     }
> > -             } else if (gen >= 6) {
> >                       *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > -                     *batch++ = 0;
> > -                     *batch++ = addr;
> > -                     *batch++ = target_offset;
> > -             } else if (gen >= 4) {
> > -                     *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > -                     *batch++ = 0;
> > -                     *batch++ = addr;
> > -                     *batch++ = target_offset;
> > +                     *batch++ = lower_32_bits(addr);
> > +                     *batch++ = upper_32_bits(addr);
> > +                     *batch++ = upper_32_bits(target_addr);
> >               } else {
> > -                     *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > -                     *batch++ = addr;
> > -                     *batch++ = target_offset;
> > +                     *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > +                     *batch++ = lower_32_bits(addr);
> > +                     *batch++ = upper_32_bits(addr);
> > +                     *batch++ = lower_32_bits(target_addr);
> > +                     *batch++ = upper_32_bits(target_addr);
> >               }
> > -
> > -             goto out;
> > +     } else if (gen >= 6) {
> > +             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > +             *batch++ = 0;
> > +             *batch++ = addr;
> > +             *batch++ = target_addr;
> > +     } else if (IS_I965G(eb->i915)) {
> > +             *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > +             *batch++ = 0;
> > +             *batch++ = vma_phys_addr(vma, offset);
> > +             *batch++ = target_addr;
> > +     } else if (gen >= 4) {
> > +             *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > +             *batch++ = 0;
> > +             *batch++ = addr;
> > +             *batch++ = target_addr;
> > +     } else if (gen >= 3 &&
> > +                !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
> > +             *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > +             *batch++ = addr;
> > +             *batch++ = target_addr;
> > +     } else {
> > +             *batch++ = MI_STORE_DWORD_IMM;
> > +             *batch++ = vma_phys_addr(vma, offset);
> > +             *batch++ = target_addr;
> >       }
> >   
> > +     return true;
> > +}
> > +
> > +static bool reloc_entry_gpu(struct i915_vma *vma,
> > +                         struct i915_execbuffer *eb,
> > +                         u64 offset,
> > +                         u64 target_addr)
> > +{
> > +     if (eb->reloc_cache.vaddr)
> > +             return false;
> > +
> > +     if (!use_reloc_gpu(vma))
> > +             return false;
> > +
> > +     return __reloc_entry_gpu(vma, eb, offset, target_addr);
> > +}
> > +
> > +static u64
> > +relocate_entry(struct i915_vma *vma,
> > +            const struct drm_i915_gem_relocation_entry *reloc,
> > +            struct i915_execbuffer *eb,
> > +            const struct i915_vma *target)
> > +{
> > +     u64 target_addr = relocation_target(reloc, target);
> > +     u64 offset = reloc->offset;
> > +
> > +     if (!reloc_entry_gpu(vma, eb, offset, target_addr)) {
> > +             bool wide = eb->reloc_cache.use_64bit_reloc;
> > +             void *vaddr;
> > +
> >   repeat:
> > -     vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
> > -     if (IS_ERR(vaddr))
> > -             return PTR_ERR(vaddr);
> > +             vaddr = reloc_vaddr(vma->obj,
> > +                                 &eb->reloc_cache,
> > +                                 offset >> PAGE_SHIFT);
> > +             if (IS_ERR(vaddr))
> > +                     return PTR_ERR(vaddr);
> >   
> > -     clflush_write32(vaddr + offset_in_page(offset),
> > -                     lower_32_bits(target_offset),
> > -                     eb->reloc_cache.vaddr);
> > +             GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> > +             clflush_write32(vaddr + offset_in_page(offset),
> > +                             lower_32_bits(target_addr),
> > +                             eb->reloc_cache.vaddr);
> >   
> > -     if (wide) {
> > -             offset += sizeof(u32);
> > -             target_offset >>= 32;
> > -             wide = false;
> > -             goto repeat;
> > +             if (wide) {
> > +                     offset += sizeof(u32);
> > +                     target_addr >>= 32;
> > +                     wide = false;
> > +                     goto repeat;
> > +             }
> >       }
> >   
> > -out:
> >       return target->node.start | UPDATE;
> >   }
> >   
> > @@ -3022,3 +3074,7 @@ end:;
> >       kvfree(exec2_list);
> >       return err;
> >   }
> 
> The diff is a bit messy but looks okay and in CI we trust.
> 
> > +
> > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > +#include "selftests/i915_gem_execbuffer.c"
> > +#endif
> > diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > new file mode 100644
> > index 000000000000..985f9fbd0ba0
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > @@ -0,0 +1,206 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2020 Intel Corporation
> > + */
> > +
> > +#include "i915_selftest.h"
> > +
> > +#include "gt/intel_engine_pm.h"
> > +#include "selftests/igt_flush_test.h"
> > +
> > +static void hexdump(const void *buf, size_t len)
> 
> Uh-oh, this seems to be the third copy! ;)

* jedi handwave

Yeah, I'm close to pulling them into i915_selftests.c as pr_hexdump(). A
certain Tvrtko might one day win his argument to land the wrapper in lib/.
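
Something like (name and home hypothetical; a thin wrapper over
print_hex_dump(), untested):

	static void pr_hexdump(const void *buf, size_t len)
	{
		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET,
			       16, 4, buf, len, false);
	}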

> > +static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> > +                        struct drm_i915_gem_object *obj)
> > +{
> > +     enum {
> > +             X = 0,
> > +             Y = 3,
> > +             Z = 8
> > +     };
> > +     const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);
> 
> Is the mask there to remove the poison? Should this be use_64bit_reloc
> instead of gen, or is this more correct?

Might as well just use use_64bit_reloc. I suppose doing a manual check
against gen might help us notice a use_64bit_reloc slip?
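
i.e. (untested):

	const u64 mask =
		GENMASK_ULL(eb->reloc_cache.use_64bit_reloc ? 63 : 31, 0);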

> 
> > +     const u32 *map = page_mask_bits(obj->mm.mapping);
> > +     struct i915_request *rq;
> > +     struct i915_vma *vma;
> > +     int inc;
> > +     int err;
> > +
> > +     vma = i915_vma_instance(obj, eb->context->vm, NULL);
> > +     if (IS_ERR(vma))
> > +             return PTR_ERR(vma);
> > +
> > +     err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> > +     if (err)
> > +             return err;
> > +
> > +     /* 8-Byte aligned */
> > +     if (!__reloc_entry_gpu(vma, eb, X * sizeof(u32), X)) {
> > +             err = -EIO;
> > +             goto unpin_vma;
> > +     }
> > +
> > +     /* !8-Byte aligned */
> 
> What is the significance of the non-8-byte-aligned case?

That's the if (offset & 7) path in the gen8 code. If the offset is
8-byte aligned we can do a single QWord MI_STORE_DATA_IMM; if it is only
4-byte aligned, we need two MI_STORE_DATA_IMMs.
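
Schematically, matching the hunk above (lo/hi shorthand for
lower/upper_32_bits):

	/* offset & 7 == 0: one qword store, 5 dwords of batch */
	(MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1,
		lo(addr), hi(addr), lo(val), hi(val)

	/* offset & 7 != 0: two dword stores, 8 dwords of batch */
	MI_STORE_DWORD_IMM_GEN4, lo(addr), hi(addr), lo(val)
	MI_STORE_DWORD_IMM_GEN4, lo(addr + 4), hi(addr + 4), hi(val)

which is also where the len = offset & 7 ? 8 : 5 above comes from.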

> 
> > +     if (!__reloc_entry_gpu(vma, eb, Y * sizeof(u32), Y)) {
> > +             err = -EIO;
> > +             goto unpin_vma;
> > +     }
> > +
> > +     /* Skip to the end of the cmd page */
> > +     inc = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> > +     inc -= eb->reloc_cache.rq_size;
> > +     memset(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size,
> > +            0, inc * sizeof(u32));
> > +     eb->reloc_cache.rq_size += inc;
> > +
> > +     /* Force batch chaining */
> > +     if (!__reloc_entry_gpu(vma, eb, Z * sizeof(u32), Z)) {
> > +             err = -EIO;
> > +             goto unpin_vma;
> > +     }
> > +
> > +     GEM_BUG_ON(!eb->reloc_cache.rq);
> > +     rq = i915_request_get(eb->reloc_cache.rq);
> > +     err = reloc_gpu_flush(&eb->reloc_cache);
> > +     if (err)
> > +             goto put_rq;
> > +     GEM_BUG_ON(eb->reloc_cache.rq);
> > +
> > +     err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
> > +     if (err) {
> > +             intel_gt_set_wedged(eb->engine->gt);
> > +             goto put_rq;
> > +     }
> > +
> > +     if (!i915_request_completed(rq)) {
> > +             pr_err("%s: did not wait for relocations!\n", eb->engine->name);
> > +             err = -EINVAL;
> > +             goto put_rq;
> > +     }
> > +
> > +     if (read_reloc(map, X, mask) != X) {
> > +             pr_err("%s[X]: map[%d] %llx != %x\n",
> > +                    eb->engine->name, X, read_reloc(map, X, mask), X);
> > +             err = -EINVAL;
> > +     }
> > +
> > +     if (read_reloc(map, Y, mask) != Y) {
> > +             pr_err("%s[Y]: map[%d] %llx != %x\n",
> > +                    eb->engine->name, Y, read_reloc(map, Y, mask), Y);
> > +             err = -EINVAL;
> > +     }
> > +
> > +     if (read_reloc(map, Z, mask) != Z) {
> > +             pr_err("%s[Z]: map[%d] %llx != %x\n",
> > +                    eb->engine->name, Z, read_reloc(map, Z, mask), Z);
> > +             err = -EINVAL;
> > +     }
> 
> Why couldn't this just be an array of [0, 3, 8] instead of enums, so
> that all these tests could be a single loop? I can't figure out what
> the benefit of the enum is, in other words. Okay, in the test phase it
> can't be a simple loop since it needs the special case for the last
> iteration, but here it could be.

If you saw how far this came from the first version... I just liked X,
Y, Z as labels.
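
For the record, the read-back half could indeed fold into a loop,
something like (untested):

	static const int idx[] = { X, Y, Z };
	int i;

	for (i = 0; i < ARRAY_SIZE(idx); i++) {
		u64 v = read_reloc(map, idx[i], mask);

		if (v != idx[i]) {
			pr_err("%s: map[%d] %llx != %x\n",
			       eb->engine->name, idx[i], v, idx[i]);
			err = -EINVAL;
		}
	}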
-Chris
