Re: [PATCH v4 3/4] Introduce & use new lightweight SGL iterators

From: Chris Wilson <chris@chris-wilson.co.uk>
To: Dave Gordon <david.s.gordon@intel.com>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH v4 3/4] Introduce & use new lightweight SGL iterators
Date: Fri, 20 May 2016 10:56:31 +0100	[thread overview]
Message-ID: <20160520095631.GI3590@nuc-i3427.alporthouse.com> (raw)
In-Reply-To: <20160520081506.GF3590@nuc-i3427.alporthouse.com>

On Fri, May 20, 2016 at 09:15:06AM +0100, Chris Wilson wrote:
> On Fri, May 20, 2016 at 01:31:29AM +0100, Dave Gordon wrote:
> > The existing for_each_sg_page() iterator is somewhat heavyweight, and is
> > limiting i915 driver performance in a few benchmarks. So here we
> > introduce somewhat lighter weight iterators, primarily for use with GEM
> > objects or other case where we need only deal with whole aligned pages.
> > 
> > Unlike the old iterator, the new iterators use an internal state
> > structure which is not intended to be accessed by the caller; instead
> > each takes as a parameter an output variable which is set before each
> > iteration. This makes them particularly simple to use :)
> > 
> > One of the new iterators provides the caller with the DMA address of
> > each page in turn; the other provides the 'struct page' pointer required
> > by many memory management operations.
> > 
> > Various uses of for_each_sg_page() are then converted to the new macros.
> > 
> > Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h         | 62 +++++++++++++++++++++++++--
> >  drivers/gpu/drm/i915/i915_gem.c         | 20 ++++-----
> >  drivers/gpu/drm/i915/i915_gem_fence.c   | 14 +++---
> >  drivers/gpu/drm/i915/i915_gem_gtt.c     | 76 ++++++++++++++++-----------------
> >  drivers/gpu/drm/i915/i915_gem_userptr.c |  7 ++-
> >  5 files changed, 116 insertions(+), 63 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 6894d8e..01cde0f 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2120,6 +2120,10 @@ struct drm_i915_gem_object_ops {
> >  #define INTEL_FRONTBUFFER_ALL_MASK(pipe) \
> >  	(0xff << (INTEL_FRONTBUFFER_BITS_PER_PIPE * (pipe)))
> >  
> > +void i915_gem_track_fb(struct drm_i915_gem_object *old,
> > +		       struct drm_i915_gem_object *new,
> > +		       unsigned frontbuffer_bits);
> > +
> 
> That's not a much better spot, since this is now before the object it
> acts upon. One day we'll get our types separated from their actors.
> 
> >  struct drm_i915_gem_object {
> >  	struct drm_gem_object base;
> >  
> > @@ -2251,9 +2255,61 @@ struct drm_i915_gem_object {
> >  };
> >  #define to_intel_bo(x) container_of(x, struct drm_i915_gem_object, base)
> >  
> > -void i915_gem_track_fb(struct drm_i915_gem_object *old,
> > -		       struct drm_i915_gem_object *new,
> > -		       unsigned frontbuffer_bits);
> > +/*
> > + * New optimised SGL iterator for GEM objects
> > + */
> > +struct sgt_iter {
> > +	struct scatterlist *sgp;
> > +	union {
> > +		unsigned long pfn;
> > +		unsigned long dma;
> > +	} ix;
> 
> An anonymous union here would be slightly more pleasant. And we should
> be using the correct types.
> 
> > +	unsigned int curr;
> > +	unsigned int max;
> > +};
> > +
> > +/* Constructor for new iterator */
> > +static inline struct sgt_iter
> > +__sgt_iter(struct scatterlist *sgl, bool dma)
> > +{
> > +	struct sgt_iter s = { .sgp = sgl };
> > +
> > +	if (s.sgp) {
> 
> Not acceptable just to GPF out if passed in a NULL? It's a BUG for all
> our use cases. And would save a conditional every chain step.

Oh, it's how we stop after __sg_next().

Ok, played a bit and the only thing that sticks is

 /*
- * New optimised SGL iterator for GEM objects
+ * Optimised SGL iterator for GEM objects
  */
 struct sgt_iter {
        struct scatterlist *sgp;
        union {
                unsigned long pfn;
-               unsigned long dma;
-       } ix;
+               dma_addr_t dma;
+       };
        unsigned int curr;
        unsigned int max;
 };
 
-/* Constructor for new iterator */
-static inline struct sgt_iter
+static __always_inline struct sgt_iter

Nothing is ever new past the commit that introduces it!

Final diff (a few % in micro, consistently above the noise):

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 53ff4993f5b8..586f06646630 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2256,31 +2256,26 @@ struct drm_i915_gem_object {
 #define to_intel_bo(x) container_of(x, struct drm_i915_gem_object, base)
 
 /*
- * New optimised SGL iterator for GEM objects
+ * Optimised SGL iterator for GEM objects
  */
-struct sgt_iter {
+static __always_inline struct sgt_iter {
        struct scatterlist *sgp;
        union {
                unsigned long pfn;
-               unsigned long dma;
-       } ix;
+               dma_addr_t dma;
+       };
        unsigned int curr;
        unsigned int max;
-};
-
-/* Constructor for new iterator */
-static inline struct sgt_iter
-__sgt_iter(struct scatterlist *sgl, bool dma)
-{
+} __sgt_iter(struct scatterlist *sgl, bool dma) {
        struct sgt_iter s = { .sgp = sgl };
 
        if (s.sgp) {
                s.max = s.curr = s.sgp->offset;
                s.max += s.sgp->length;
                if (dma)
-                       s.ix.dma = sg_dma_address(s.sgp);
+                       s.dma = sg_dma_address(s.sgp);
                else
-                       s.ix.pfn = page_to_pfn(sg_page(s.sgp));
+                       s.pfn = page_to_pfn(sg_page(s.sgp));
        }
 
        return s;
@@ -2313,9 +2308,9 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
  */
 #define for_each_sgt_dma(__dmap, __iter, __sgt)                                \
        for ((__iter) = __sgt_iter((__sgt)->sgl, true);                 \
-            ((__dmap) = (__iter).ix.dma + (__iter).curr);              \
+            ((__dmap) = (__iter).dma + (__iter).curr);                 \
             (((__iter).curr += PAGE_SIZE) < (__iter).max) ||           \
-               ((__iter) = __sgt_iter(__sg_next((__iter).sgp), true), 0))
+            ((__iter) = __sgt_iter(__sg_next((__iter).sgp), true), 0))
 
 /**
  * for_each_sgt_page - iterate over the pages of the given sg_table
@@ -2325,10 +2320,10 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
  */
 #define for_each_sgt_page(__pp, __iter, __sgt)                         \
        for ((__iter) = __sgt_iter((__sgt)->sgl, false);                \
-            ((__pp) = (__iter).ix.pfn == 0 ? NULL :                    \
-                  pfn_to_page((__iter).ix.pfn + ((__iter).curr >> PAGE_SHIFT)));\
+            ((__pp) = (__iter).pfn == 0 ? NULL :                       \
+             pfn_to_page((__iter).pfn + ((__iter).curr >> PAGE_SHIFT))); \
             (((__iter).curr += PAGE_SIZE) < (__iter).max) ||           \
-               ((__iter) = __sgt_iter(__sg_next((__iter).sgp), false), 0))
+            ((__iter) = __sgt_iter(__sg_next((__iter).sgp), false), 0))


And with that,
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx