* [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl
@ 2015-12-23 11:19 Chris Wilson
  2015-12-23 11:32 ` Chris Wilson
  2015-12-23 13:35 ` [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Chris Wilson
  0 siblings, 2 replies; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 11:19 UTC (permalink / raw)
  To: intel-gfx

With a bit of care (and leniency) we can iterate over the object and
wait for previous rendering to complete with judicious use of atomic
reference counting. The ABI requires us to ensure that an active object
is eventually flushed (like the busy-ioctl) which is guaranteed by our
management of requests (i.e. everything that is submitted to hardware is
flushed in the same request). All we have to do is ensure that we can
detect when the requests are complete for reporting when the object is
idle (without triggering ETIME) - this is handled by
__i915_wait_request.

The biggest danger in the code is walking the object without holding any
locks. We iterate over the set of last requests and carefully grab a
reference to each. (If one is changing beneath us, that is the usual
userspace race and even with locking you get the same indeterminate
results.) If the request is unreferenced beneath us, it will be disposed
of into the request cache - so we have to carefully order the retrieval
of the request pointer with its removal.

The impact of this is actually quite small - the return to userspace
following the wait was already lockless. What we achieve here is
completing an already finished wait without hitting the struct_mutex;
our hold is quite short and so we are typically just a victim of
contention rather than a cause.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c         | 53 +++++++++++----------------------
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 drivers/gpu/drm/i915/i915_gem_request.h |  8 ++++-
 3 files changed, 26 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c169574758d5..33de69eff93a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2414,57 +2414,40 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct drm_i915_gem_wait *args = data;
 	struct drm_i915_gem_object *obj;
-	struct drm_i915_gem_request *req[I915_NUM_RINGS];
-	int i, n = 0;
-	int ret;
+	int i, ret = 0;
 
 	if (args->flags != 0)
 		return -EINVAL;
 
-	ret = i915_mutex_lock_interruptible(dev);
-	if (ret)
-		return ret;
-
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->bo_handle));
-	if (&obj->base == NULL) {
-		mutex_unlock(&dev->struct_mutex);
+	if (&obj->base == NULL)
 		return -ENOENT;
-	}
-
-	/* Need to make sure the object gets inactive eventually. */
-	ret = i915_gem_object_flush_active(obj);
-	if (ret)
-		goto out;
 
 	if (!obj->active)
 		goto out;
 
-	/* Do this after OLR check to make sure we make forward progress polling
-	 * on this IOCTL with a timeout == 0 (like busy ioctl)
-	 */
-	if (args->timeout_ns == 0) {
-		ret = -ETIME;
-		goto out;
-	}
-
 	for (i = 0; i < I915_NUM_RINGS; i++) {
-		if (obj->last_read[i].request == NULL)
+		struct drm_i915_gem_request *req;
+
+reload:
+		req = READ_ONCE(obj->last_read[i].request);
+		if (req == NULL)
 			continue;
 
-		req[n++] = i915_gem_request_get(obj->last_read[i].request);
+		req = i915_gem_request_get_rcu(req);
+		if (req == NULL)
+			goto reload;
+
+		ret = __i915_wait_request(req, true,
+					  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
+					  to_rps_client(file));
+		i915_gem_request_put(req);
+		if (ret)
+			goto out;
 	}
 
 out:
-	drm_gem_object_unreference(&obj->base);
-	mutex_unlock(&dev->struct_mutex);
-
-	for (i = 0; i < n; i++) {
-		if (ret == 0)
-			ret = __i915_wait_request(req[i], true,
-						  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
-						  to_rps_client(file));
-		i915_gem_request_put(req[i]);
-	}
+	drm_gem_object_unreference_unlocked(&obj->base);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 4e087143b1d2..a1efaf3063f2 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -335,7 +335,7 @@ static void __i915_gem_request_retire_active(struct drm_i915_gem_request *req)
 		 * and pass along the auxiliary information.
 		 */
 		INIT_LIST_HEAD(&active->link);
-		active->request = NULL;
+		smp_store_mb(active->request, NULL);
 
 		active->retire(active, req);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index fed2abaa906e..5ab91cd32042 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -139,6 +139,12 @@ i915_gem_request_get(struct drm_i915_gem_request *req)
 	return to_request(fence_get(&req->fence));
 }
 
+static inline struct drm_i915_gem_request *
+i915_gem_request_get_rcu(struct drm_i915_gem_request *req)
+{
+	return to_request(fence_get_rcu(&req->fence));
+}
+
 static inline void
 i915_gem_request_put(struct drm_i915_gem_request *req)
 {
@@ -242,8 +248,8 @@ static inline void
 i915_gem_request_mark_active(struct drm_i915_gem_request *request,
 			     struct i915_gem_active *active)
 {
-	active->request = request;
 	list_move(&active->link, &request->active_list);
+	smp_store_mb(active->request, request);
 }
 
 #endif /* I915_GEM_REQUEST_H */
-- 
2.6.4
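
For illustration, here is a minimal, self-contained sketch of the
lockless lookup pattern the commit message describes: read the pointer
once, take a reference that fails once the refcount has already hit
zero, and retry if the request was recycled beneath us. The names are
hypothetical rather than the driver's, and the sketch assumes the
request memory remains safe to touch during the race - exactly the
point probed in the follow-ups below.

	#include <linux/atomic.h>
	#include <linux/compiler.h>

	struct example_request {
		atomic_t refcount;	/* drops to 0 when retired */
	};

	static struct example_request *
	get_active_request(struct example_request **slot)
	{
		struct example_request *req;

		do {
			req = READ_ONCE(*slot);
			if (req == NULL)
				return NULL;	/* object already idle */
			/*
			 * atomic_inc_not_zero() fails once the request
			 * has started being freed; reload and retry.
			 */
		} while (!atomic_inc_not_zero(&req->refcount));

		return req;
	}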


* Re: [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl
  2015-12-23 11:19 [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl Chris Wilson
@ 2015-12-23 11:32 ` Chris Wilson
  2015-12-23 12:05   ` Chris Wilson
  2015-12-23 13:35 ` [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Chris Wilson
  1 sibling, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 11:32 UTC (permalink / raw)
  To: intel-gfx

On Wed, Dec 23, 2015 at 11:19:23AM +0000, Chris Wilson wrote:
> With a bit of care (and leniency) we can iterate over the object and
> wait for previous rendering to complete with judicious use of atomic
> reference counting. The ABI requires us to ensure that an active object
> is eventually flushed (like the busy-ioctl) which is guaranteed by our
> management of requests (i.e. everything that is submitted to hardware is
> flushed in the same request). All we have to do is ensure that we can
> detect when the requests are complete for reporting when the object is
> idle (without triggering ETIME) - this is handled by
> __i915_wait_request.
> 
> The biggest danger in the code is walking the object without holding any
> locks. We iterate over the set of last requests and carefully grab a
> reference to each. (If one is changing beneath us, that is the usual
> userspace race and even with locking you get the same indeterminate
> results.) If the request is unreferenced beneath us, it will be disposed
> of into the request cache - so we have to carefully order the retrieval
> of the request pointer with its removal.
> 
> The impact of this is actually quite small - the return to userspace
> following the wait was already lockless. What we achieve here is
> completing an already finished wait without hitting the struct_mutex;
> our hold is quite short and so we are typically just a victim of
> contention rather than a cause.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Some food for thought. I would especially like someone to poke holes in
the racy pointer lookup and check the store mb() versus the
rcu-reference.

The same technique would also apply to the busy-ioctl, and that's about
the limit of its applicability. Though isolating those from contention
and from having to do a flush-active should be beneficial system-wide.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl
  2015-12-23 11:32 ` Chris Wilson
@ 2015-12-23 12:05   ` Chris Wilson
  2015-12-23 12:13     ` Chris Wilson
  0 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 12:05 UTC (permalink / raw)
  To: intel-gfx

On Wed, Dec 23, 2015 at 11:32:23AM +0000, Chris Wilson wrote:
> On Wed, Dec 23, 2015 at 11:19:23AM +0000, Chris Wilson wrote:
> > With a bit of care (and leniency) we can iterate over the object and
> > wait for previous rendering to complete with judicious use of atomic
> > reference counting. The ABI requires us to ensure that an active object
> > is eventually flushed (like the busy-ioctl) which is guaranteed by our
> > management of requests (i.e. everything that is submitted to hardware is
> > flushed in the same request). All we have to do is ensure that we can
> > detect when the requests are complete for reporting when the object is
> > idle (without triggering ETIME) - this is handled by
> > __i915_wait_request.
> > 
> > The biggest danger in the code is walking the object without holding any
> > locks. We iterate over the set of last requests and carefully grab a
> > reference to each. (If one is changing beneath us, that is the usual
> > userspace race and even with locking you get the same indeterminate
> > results.) If the request is unreferenced beneath us, it will be disposed
> > of into the request cache - so we have to carefully order the retrieval
> > of the request pointer with its removal.
> > 
> > The impact of this is actually quite small - the return to userspace
> > following the wait was already lockless. What we achieve here is
> > completing an already finished wait without hitting the struct_mutex;
> > our hold is quite short and so we are typically just a victim of
> > contention rather than a cause.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Some food for thought. I would especially like someone to poke holes in
> the racy pointer lookup and check the store mb() versus the
> rcu-reference.

So what I think is the missing element here is then

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 871aaae1a9d5..4d4ab8e6423f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4193,7 +4193,8 @@ i915_gem_load(struct drm_device *dev)
        dev_priv->requests =
                kmem_cache_create("i915_gem_request",
                                  sizeof(struct drm_i915_gem_request), 0,
-                                 SLAB_HWCACHE_ALIGN,
+                                 SLAB_HWCACHE_ALIGN |
+                                 SLAB_DESTROY_BY_RCU,
                                  NULL);
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl
  2015-12-23 12:05   ` Chris Wilson
@ 2015-12-23 12:13     ` Chris Wilson
  2015-12-23 12:26       ` Chris Wilson
  0 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 12:13 UTC (permalink / raw)
  To: intel-gfx

On Wed, Dec 23, 2015 at 12:05:00PM +0000, Chris Wilson wrote:
> On Wed, Dec 23, 2015 at 11:32:23AM +0000, Chris Wilson wrote:
> > On Wed, Dec 23, 2015 at 11:19:23AM +0000, Chris Wilson wrote:
> > > With a bit of care (and leniency) we can iterate over the object and
> > > wait for previous rendering to complete with judicious use of atomic
> > > reference counting. The ABI requires us to ensure that an active object
> > > is eventually flushed (like the busy-ioctl) which is guaranteed by our
> > > management of requests (i.e. everything that is submitted to hardware is
> > > flushed in the same request). All we have to do is ensure that we can
> > > detect when the requests are complete for reporting when the object is
> > > idle (without triggering ETIME) - this is handled by
> > > __i915_wait_request.
> > > 
> > > The biggest danger in the code is walking the object without holding any
> > > locks. We iterate over the set of last requests and carefully grab a
> > > reference to each. (If one is changing beneath us, that is the usual
> > > userspace race and even with locking you get the same indeterminate
> > > results.) If the request is unreferenced beneath us, it will be disposed
> > > of into the request cache - so we have to carefully order the retrieval
> > > of the request pointer with its removal.
> > > 
> > > The impact of this is actually quite small - the return to userspace
> > > following the wait was already lockless. What we achieve here is
> > > completing an already finished wait without hitting the struct_mutex;
> > > our hold is quite short and so we are typically just a victim of
> > > contention rather than a cause.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > Some food for thought. I would especially like someone to poke holes in
> > the racy pointer lookup and check the store mb() versus the
> > rcu-reference.
> 
> So what I think is the missing element here is then
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 871aaae1a9d5..4d4ab8e6423f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4193,7 +4193,8 @@ i915_gem_load(struct drm_device *dev)
>         dev_priv->requests =
>                 kmem_cache_create("i915_gem_request",
>                                   sizeof(struct drm_i915_gem_request), 0,
> -                                 SLAB_HWCACHE_ALIGN,
> +                                 SLAB_HWCACHE_ALIGN |
> +                                 SLAB_DESTROY_BY_RCU,
>                                   NULL);

And we promptly run into memory exhaustion issues.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl
  2015-12-23 12:13     ` Chris Wilson
@ 2015-12-23 12:26       ` Chris Wilson
  0 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 12:26 UTC (permalink / raw)
  To: intel-gfx

On Wed, Dec 23, 2015 at 12:13:15PM +0000, Chris Wilson wrote:
> On Wed, Dec 23, 2015 at 12:05:00PM +0000, Chris Wilson wrote:
> > On Wed, Dec 23, 2015 at 11:32:23AM +0000, Chris Wilson wrote:
> > > On Wed, Dec 23, 2015 at 11:19:23AM +0000, Chris Wilson wrote:
> > > > With a bit of care (and leniency) we can iterate over the object and
> > > > wait for previous rendering to complete with judicious use of atomic
> > > > reference counting. The ABI requires us to ensure that an active object
> > > > is eventually flushed (like the busy-ioctl) which is guaranteed by our
> > > > management of requests (i.e. everything that is submitted to hardware is
> > > > flushed in the same request). All we have to do is ensure that we can
> > > > detect when the requests are complete for reporting when the object is
> > > > idle (without triggering ETIME) - this is handled by
> > > > __i915_wait_request.
> > > > 
> > > > The biggest danger in the code is walking the object without holding any
> > > > locks. We iterate over the set of last requests and carefully grab a
> > > > reference to each. (If one is changing beneath us, that is the usual
> > > > userspace race and even with locking you get the same indeterminate
> > > > results.) If the request is unreferenced beneath us, it will be disposed
> > > > of into the request cache - so we have to carefully order the retrieval
> > > > of the request pointer with its removal.
> > > > 
> > > > The impact of this is actually quite small - the return to userspace
> > > > following the wait was already lockless. What we achieve here is
> > > > completing an already finished wait without hitting the struct_mutex;
> > > > our hold is quite short and so we are typically just a victim of
> > > > contention rather than a cause.
> > > > 
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > Some food for thought. I would especially like someone to poke holes in
> > > the racy pointer lookup and check the store mb() versus the
> > > rcu-reference.
> > 
> > So what I think is the missing element here is then
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 871aaae1a9d5..4d4ab8e6423f 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -4193,7 +4193,8 @@ i915_gem_load(struct drm_device *dev)
> >         dev_priv->requests =
> >                 kmem_cache_create("i915_gem_request",
> >                                   sizeof(struct drm_i915_gem_request), 0,
> > -                                 SLAB_HWCACHE_ALIGN,
> > +                                 SLAB_HWCACHE_ALIGN |
> > +                                 SLAB_DESTROY_BY_RCU,
> >                                   NULL);
> 
> And we promptly run into memory exhaustion issues.

And with some carefully placed synchronize_rcu() we may be able to get
the best of both worlds...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2015-12-23 11:19 [PATCH] RFC drm/i915: Remove (struct_mutex) locking for wait-ioctl Chris Wilson
  2015-12-23 11:32 ` Chris Wilson
@ 2015-12-23 13:35 ` Chris Wilson
  2015-12-23 13:35   ` [PATCH v2 2/3] drm/i915: Remove (struct_mutex) locking for wait-ioctl Chris Wilson
                     ` (2 more replies)
  1 sibling, 3 replies; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 13:35 UTC (permalink / raw)
  To: intel-gfx

If we enable RCU for the requests (providing a grace period where we can
inspect a "dead" request before it is freed), we can allow callers to
carefully perform lockless lookup of an active request.

However, by enabling deferred freeing of requests, we can potentially
hog a lot of memory when dealing with tens of thousands of requests per
second - with a quick insertion of a synchronize_rcu() inside our
shrinker callback, that issue disappears.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c          |  3 ++-
 drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
 drivers/gpu/drm/i915/i915_gem_request.h  | 24 +++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_shrinker.c |  1 +
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c169574758d5..696ada3891ed 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4222,7 +4222,8 @@ i915_gem_load(struct drm_device *dev)
 	dev_priv->requests =
 		kmem_cache_create("i915_gem_request",
 				  sizeof(struct drm_i915_gem_request), 0,
-				  SLAB_HWCACHE_ALIGN,
+				  SLAB_HWCACHE_ALIGN |
+				  SLAB_DESTROY_BY_RCU,
 				  NULL);
 
 	INIT_LIST_HEAD(&dev_priv->context_list);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 4e087143b1d2..380a5963d957 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -335,7 +335,7 @@ static void __i915_gem_request_retire_active(struct drm_i915_gem_request *req)
 		 * and pass along the auxiliary information.
 		 */
 		INIT_LIST_HEAD(&active->link);
-		active->request = NULL;
+		rcu_assign_pointer(active->request, NULL);
 
 		active->retire(active, req);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 4eb15ed9205e..fae85b111a1e 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -139,6 +139,12 @@ i915_gem_request_get(struct drm_i915_gem_request *req)
 	return to_request(fence_get(&req->fence));
 }
 
+static inline struct drm_i915_gem_request *
+i915_gem_request_get_rcu(struct drm_i915_gem_request *req)
+{
+	return to_request(fence_get_rcu(&req->fence));
+}
+
 static inline void
 i915_gem_request_put(struct drm_i915_gem_request *req)
 {
@@ -243,7 +249,23 @@ i915_gem_request_mark_active(struct drm_i915_gem_request *request,
 			     struct i915_gem_active *active)
 {
 	list_move(&active->link, &request->active_list);
-	active->request = request;
+	rcu_assign_pointer(active->request, request);
+}
+
+static inline struct drm_i915_gem_request *
+i915_gem_active_get_request_rcu(struct i915_gem_active *active)
+{
+	do {
+		struct drm_i915_gem_request *request;
+
+		request = rcu_dereference(active->request);
+		if (request == NULL)
+			return NULL;
+
+		request = i915_gem_request_get_rcu(request);
+		if (request)
+			return request;
+	} while (1);
 }
 
 #endif /* I915_GEM_REQUEST_H */
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index c561ed2b8287..03a8bbb3e31e 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -142,6 +142,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
 	}
 
 	i915_gem_retire_requests(dev_priv->dev);
+	synchronize_rcu(); /* expedite the grace period to free the requests */
 
 	return count;
 }
-- 
2.6.4
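
A note on what SLAB_DESTROY_BY_RCU provides here: inside
rcu_read_lock() the memory stays type-stable, but the slab is free to
recycle it for a brand-new request at any moment. The canonical lookup
pattern for such caches therefore revalidates the pointer after taking
the reference. A hedged sketch using the helpers added by this patch
(the function name itself is illustrative, not part of the series):

	static struct drm_i915_gem_request *
	get_active_request_stable(struct i915_gem_active *active)
	{
		struct drm_i915_gem_request *req;

		rcu_read_lock();
		do {
			req = rcu_dereference(active->request);
			if (req == NULL)
				break;

			if (i915_gem_request_get_rcu(req) == NULL)
				continue;	/* refcount hit zero: reload */

			/*
			 * The slab may have recycled the memory for a
			 * new request; check it is still the one being
			 * tracked before trusting the reference.
			 */
			if (req == rcu_dereference(active->request))
				break;

			i915_gem_request_put(req);
		} while (1);
		rcu_read_unlock();

		return req;
	}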


* [PATCH v2 2/3] drm/i915: Remove (struct_mutex) locking for wait-ioctl
  2015-12-23 13:35 ` [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Chris Wilson
@ 2015-12-23 13:35   ` Chris Wilson
  2015-12-23 13:35   ` [PATCH v2 3/3] drm/i915: Remove (struct_mutex) locking for busy-ioctl Chris Wilson
  2016-01-05 14:59   ` [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Daniel Vetter
  2 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 13:35 UTC (permalink / raw)
  To: intel-gfx

With a bit of care (and leniency) we can iterate over the object and
wait for previous rendering to complete with judicious use of atomic
reference counting. The ABI requires us to ensure that an active object
is eventually flushed (like the busy-ioctl) which is guaranteed by our
management of requests (i.e. everything that is submitted to hardware is
flushed in the same request). All we have to do is ensure that we can
detect when the requests are complete for reporting when the object is
idle (without triggering ETIME) - this is handled by
__i915_wait_request.

The biggest danger in the code is walking the object without holding any
locks. We iterate over the set of last requests and carefully grab a
reference to each. (If one is changing beneath us, that is the usual
userspace race and even with locking you get the same indeterminate
results.) If the request is unreferenced beneath us, it will be disposed
of into the request cache - so we have to carefully order the retrieval
of the request pointer with its removal, and to do this we employ RCU on
the request cache and upon the last_request pointer tracking.

The impact of this is actually quite small - the return to userspace
following the wait was already lockless. What we achieve here is
completing an already finished wait without hitting the struct_mutex;
our hold is quite short and so we are typically just a victim of
contention rather than a cause.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c | 55 ++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 696ada3891ed..3e331f7e9d74 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2414,57 +2414,40 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct drm_i915_gem_wait *args = data;
 	struct drm_i915_gem_object *obj;
-	struct drm_i915_gem_request *req[I915_NUM_RINGS];
-	int i, n = 0;
-	int ret;
+	int i, ret = 0;
 
 	if (args->flags != 0)
 		return -EINVAL;
 
-	ret = i915_mutex_lock_interruptible(dev);
-	if (ret)
-		return ret;
-
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->bo_handle));
-	if (&obj->base == NULL) {
-		mutex_unlock(&dev->struct_mutex);
+	if (&obj->base == NULL)
 		return -ENOENT;
-	}
-
-	/* Need to make sure the object gets inactive eventually. */
-	ret = i915_gem_object_flush_active(obj);
-	if (ret)
-		goto out;
-
-	if (!obj->active)
-		goto out;
 
-	/* Do this after OLR check to make sure we make forward progress polling
-	 * on this IOCTL with a timeout == 0 (like busy ioctl)
-	 */
-	if (args->timeout_ns == 0) {
-		ret = -ETIME;
+	if (!obj->active) /* XXX READ_ONCE(obj->flags) */
 		goto out;
-	}
 
+	rcu_read_lock();
 	for (i = 0; i < I915_NUM_RINGS; i++) {
-		if (obj->last_read[i].request == NULL)
+		struct drm_i915_gem_request *req;
+
+		req = i915_gem_active_get_request_rcu(&obj->last_read[i]);
+		if (req == NULL)
 			continue;
 
-		req[n++] = i915_gem_request_get(obj->last_read[i].request);
+		rcu_read_unlock();
+		ret = __i915_wait_request(req, true,
+					  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
+					  to_rps_client(file));
+		i915_gem_request_put(req);
+		if (ret)
+			goto out;
+
+		rcu_read_lock();
 	}
+	rcu_read_unlock();
 
 out:
-	drm_gem_object_unreference(&obj->base);
-	mutex_unlock(&dev->struct_mutex);
-
-	for (i = 0; i < n; i++) {
-		if (ret == 0)
-			ret = __i915_wait_request(req[i], true,
-						  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
-						  to_rps_client(file));
-		i915_gem_request_put(req[i]);
-	}
+	drm_gem_object_unreference_unlocked(&obj->base);
 	return ret;
 }
 
-- 
2.6.4


* [PATCH v2 3/3] drm/i915: Remove (struct_mutex) locking for busy-ioctl
  2015-12-23 13:35 ` [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Chris Wilson
  2015-12-23 13:35   ` [PATCH v2 2/3] drm/i915: Remove (struct_mutex) locking for wait-ioctl Chris Wilson
@ 2015-12-23 13:35   ` Chris Wilson
  2016-01-05 14:59   ` [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Daniel Vetter
  2 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2015-12-23 13:35 UTC (permalink / raw)
  To: intel-gfx

By applying the same logic as for wait-ioctl, we can query whether a
request has completed without holding struct_mutex. The biggest impact
system-wide is removing the flush_active and the contention it causes.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c | 53 +++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3e331f7e9d74..80810d5b5caf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3591,37 +3591,38 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 {
 	struct drm_i915_gem_busy *args = data;
 	struct drm_i915_gem_object *obj;
-	int ret;
-
-	ret = i915_mutex_lock_interruptible(dev);
-	if (ret)
-		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
-	if (&obj->base == NULL) {
-		ret = -ENOENT;
-		goto unlock;
-	}
+	if (&obj->base == NULL)
+		return -ENOENT;
 
-	/* Count all active objects as busy, even if they are currently not used
-	 * by the gpu. Users of this interface expect objects to eventually
-	 * become non-busy without any further actions, therefore emit any
-	 * necessary flushes here.
-	 */
-	ret = i915_gem_object_flush_active(obj);
-	if (ret)
-		goto unref;
+	args->busy = 0;
+	if (obj->active) { /* XXX READ_ONCE(obj->flags) */
+		struct drm_i915_gem_request *req;
+		int i;
 
-	BUILD_BUG_ON(I915_NUM_RINGS > 16);
-	args->busy = obj->active << 16;
-	if (obj->last_write.request)
-		args->busy |= obj->last_write.request->engine->id;
+		BUILD_BUG_ON(I915_NUM_RINGS > 16);
+		rcu_read_lock();
+		for (i = 0; i < I915_NUM_RINGS; i++) {
+			req = i915_gem_active_get_request_rcu(&obj->last_read[i]);
+			if (req == NULL)
+				continue;
 
-unref:
-	drm_gem_object_unreference(&obj->base);
-unlock:
-	mutex_unlock(&dev->struct_mutex);
-	return ret;
+			if (i915_gem_request_completed(req))
+				args->busy |= 1 << (16 + i);
+			i915_gem_request_put(req);
+		}
+
+		req = i915_gem_active_get_request_rcu(&obj->last_write);
+		if (req) {
+			args->busy |= req->engine->id;
+			i915_gem_request_put(req);
+		}
+		rcu_read_unlock();
+	}
+
+	drm_gem_object_unreference_unlocked(&obj->base);
+	return 0;
 }
 
 int
-- 
2.6.4
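
For context, a sketch of how userspace might decode the value this
busy-ioctl now reports - the layout (per-ring read activity in the
high 16 bits, the last-write engine id in the low 16 bits) follows the
patch, but the helper names are made up:

	#include <stdbool.h>
	#include <stdint.h>

	/* one bit per ring above bit 16 for outstanding reads */
	static bool busy_reading_on_ring(uint32_t busy, unsigned int ring)
	{
		return busy & (1u << (16 + ring));
	}

	/* engine id of the last write (low 16 bits) */
	static unsigned int last_write_engine(uint32_t busy)
	{
		return busy & 0xffff;
	}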


* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2015-12-23 13:35 ` [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Chris Wilson
  2015-12-23 13:35   ` [PATCH v2 2/3] drm/i915: Remove (struct_mutex) locking for wait-ioctl Chris Wilson
  2015-12-23 13:35   ` [PATCH v2 3/3] drm/i915: Remove (struct_mutex) locking for busy-ioctl Chris Wilson
@ 2016-01-05 14:59   ` Daniel Vetter
  2016-01-05 15:02     ` Peter Zijlstra
  2 siblings, 1 reply; 17+ messages in thread
From: Daniel Vetter @ 2016-01-05 14:59 UTC (permalink / raw)
  To: Chris Wilson
  Cc: intel-gfx, Linux MM, Peter Zijlstra, Jens Axboe,
	Paul E. McKenney, Christoph Lameter, Hugh Dickins, Pekka Enberg

On Wed, Dec 23, 2015 at 01:35:54PM +0000, Chris Wilson wrote:
> If we enable RCU for the requests (providing a grace period where we can
> inspect a "dead" request before it is freed), we can allow callers to
> carefully perform lockless lookup of an active request.
> 
> However, by enabling deferred freeing of requests, we can potentially
> hog a lot of memory when dealing with tens of thousands of requests per
> second - with a quick insertion of a synchronize_rcu() inside our
> shrinker callback, that issue disappears.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem.c          |  3 ++-
>  drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
>  drivers/gpu/drm/i915/i915_gem_request.h  | 24 +++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gem_shrinker.c |  1 +
>  4 files changed, 27 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c169574758d5..696ada3891ed 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4222,7 +4222,8 @@ i915_gem_load(struct drm_device *dev)
>  	dev_priv->requests =
>  		kmem_cache_create("i915_gem_request",
>  				  sizeof(struct drm_i915_gem_request), 0,
> -				  SLAB_HWCACHE_ALIGN,
> +				  SLAB_HWCACHE_ALIGN |
> +				  SLAB_DESTROY_BY_RCU,
>  				  NULL);
>  
>  	INIT_LIST_HEAD(&dev_priv->context_list);

[snip i915 private changes, leave just slab/shrinker changes]

> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index c561ed2b8287..03a8bbb3e31e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -142,6 +142,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
>  	}
>  
>  	i915_gem_retire_requests(dev_priv->dev);
> +	synchronize_rcu(); /* expedite the grace period to free the requests */

Shouldn't the slab subsystem do this for us if we request it delays the
actual kfree? Seems like a core bug to me ... Adding more folks.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2016-01-05 14:59   ` [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU Daniel Vetter
@ 2016-01-05 15:02     ` Peter Zijlstra
  2016-01-05 15:06       ` Peter Zijlstra
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2016-01-05 15:02 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Linux MM, Jens Axboe, Paul E. McKenney,
	Christoph Lameter, Hugh Dickins, Pekka Enberg

On Tue, Jan 05, 2016 at 03:59:51PM +0100, Daniel Vetter wrote:
> On Wed, Dec 23, 2015 at 01:35:54PM +0000, Chris Wilson wrote:
> > If we enable RCU for the requests (providing a grace period where we can
> > inspect a "dead" request before it is freed), we can allow callers to
> > carefully perform lockless lookup of an active request.
> > 
> > However, by enabling deferred freeing of requests, we can potentially
> > hog a lot of memory when dealing with tens of thousands of requests per
> > second - with a quick insertion of a synchronize_rcu() inside our
> > shrinker callback, that issue disappears.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c          |  3 ++-
> >  drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
> >  drivers/gpu/drm/i915/i915_gem_request.h  | 24 +++++++++++++++++++++++-
> >  drivers/gpu/drm/i915/i915_gem_shrinker.c |  1 +
> >  4 files changed, 27 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index c169574758d5..696ada3891ed 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -4222,7 +4222,8 @@ i915_gem_load(struct drm_device *dev)
> >  	dev_priv->requests =
> >  		kmem_cache_create("i915_gem_request",
> >  				  sizeof(struct drm_i915_gem_request), 0,
> > -				  SLAB_HWCACHE_ALIGN,
> > +				  SLAB_HWCACHE_ALIGN |
> > +				  SLAB_DESTROY_BY_RCU,
> >  				  NULL);
> >  
> >  	INIT_LIST_HEAD(&dev_priv->context_list);
> 
> [snip i915 private changes, leave just slab/shrinker changes]
> 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > index c561ed2b8287..03a8bbb3e31e 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > @@ -142,6 +142,7 @@ i915_gem_shrink(struct drm_i915_private *dev_priv,
> >  	}
> >  
> >  	i915_gem_retire_requests(dev_priv->dev);
> > +	synchronize_rcu(); /* expedite the grace period to free the requests */
> 
> Shouldn't the slab subsystem do this for us if we request it delays the
> actual kfree? Seems like a core bug to me ... Adding more folks.

note that sync_rcu() can take a terribly long time.. but yes, I seem to
remember Paul talking about adding this to reclaim paths for just this
reason. Not sure that ever happened though.


* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2016-01-05 15:02     ` Peter Zijlstra
@ 2016-01-05 15:06       ` Peter Zijlstra
  2016-01-05 16:35         ` Paul E. McKenney
  0 siblings, 1 reply; 17+ messages in thread
From: Peter Zijlstra @ 2016-01-05 15:06 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Linux MM, Jens Axboe, Paul E. McKenney,
	Christoph Lameter, Hugh Dickins, Pekka Enberg

On Tue, Jan 05, 2016 at 04:02:13PM +0100, Peter Zijlstra wrote:
> > Shouldn't the slab subsystem do this for us if we request it delays the
> > actual kfree? Seems like a core bug to me ... Adding more folks.
> 
> note that sync_rcu() can take a terribly long time.. but yes, I seem to
> remember Paul talking about adding this to reclaim paths for just this
> reason. Not sure that ever happened though.

Also, you might be wanting rcu_barrier() instead, that not only waits
for a GP to complete, but also for all pending callbacks to be
processed.

Without the latter there might still not be anything to free after it.
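
(A tiny sketch of that distinction with made-up names - not code from
the series; the rcu_head is assumed to be the start of a kmalloc'ed
allocation:)

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void example_free_cb(struct rcu_head *head)
	{
		kfree(head);	/* the deferred free happens here */
	}

	static void example_drain_deferred_free(struct rcu_head *pending)
	{
		call_rcu(pending, example_free_cb);

		/*
		 * synchronize_rcu() would only wait out a grace period;
		 * the callback above could still be queued afterwards.
		 * rcu_barrier() also waits for it to have run, so the
		 * memory really is back in the allocator on return.
		 */
		rcu_barrier();
	}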


* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2016-01-05 15:06       ` Peter Zijlstra
@ 2016-01-05 16:35         ` Paul E. McKenney
  2016-01-06  8:06             ` Daniel Vetter
  0 siblings, 1 reply; 17+ messages in thread
From: Paul E. McKenney @ 2016-01-05 16:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Wilson, intel-gfx, Linux MM, Jens Axboe, Christoph Lameter,
	Hugh Dickins, Pekka Enberg

On Tue, Jan 05, 2016 at 04:06:48PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 05, 2016 at 04:02:13PM +0100, Peter Zijlstra wrote:
> > > Shouldn't the slab subsystem do this for us if we request it delays the
> > > actual kfree? Seems like a core bug to me ... Adding more folks.
> > 
> > note that sync_rcu() can take a terribly long time.. but yes, I seem to
> > remember Paul talking about adding this to reclaim paths for just this
> > reason. Not sure that ever happened though.

There is an RCU OOM notifier, but it just ensures that existing callbacks
get processed in a timely fashion.  It does not block, as that would
prevent other OOM notifiers from getting their memory freed quickly.

> Also, you might be wanting rcu_barrier() instead, that not only waits
> for a GP to complete, but also for all pending callbacks to be
> processed.

And in fact what the RCU OOM notifier does can be thought of as an
asynchronous open-coded rcu_barrier().  If you are interested, please
see rcu_register_oom_notifier() and friends.

> Without the latter there might still not be anything to free after it.

Another approach is synchronize_rcu() after some largish number of
requests.  The advantage of this approach is that it throttles the
production of callbacks at the source.  The corresponding disadvantage
is that it slows things up.

Another approach is to use call_rcu(), but if the previous call_rcu()
is still in flight, block waiting for it.  Yet another approach is
the get_state_synchronize_rcu() / cond_synchronize_rcu() pair.  The
idea is to do something like this:

	cond_synchronize_rcu(cookie);
	cookie = get_state_synchronize_rcu();

You would of course do an initial get_state_synchronize_rcu() to
get things going.  This would not block unless there was less than
one grace period's worth of time between invocations.  But this
assumes a busy system, where there is almost always a grace period
in flight.  But you can make that happen as follows:

	cond_synchronize_rcu(cookie);
	cookie = get_state_synchronize_rcu();
	call_rcu(&my_rcu_head, noop_function);

Note that you need additional code to make sure that the old callback
has completed before doing a new one.  Setting and clearing a flag
with appropriate memory ordering control suffices (e.g., smp_load_acquire()
and smp_store_release()).

And there are probably other approaches as well...
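
(Putting that last recipe together as a minimal sketch - all names
here are illustrative and a single caller is assumed:)

	#include <linux/rcupdate.h>

	static unsigned long gp_cookie;		/* grace-period cookie */
	static struct rcu_head gp_head;
	static unsigned long gp_head_busy;	/* guards reuse of gp_head */

	static void gp_noop(struct rcu_head *head)
	{
		/* callback complete: gp_head may be reused */
		smp_store_release(&gp_head_busy, 0);
	}

	/* call once per batch of requests to throttle at the source */
	static void example_throttle(void)
	{
		/*
		 * Blocks only if a full grace period has not already
		 * elapsed since the cookie was captured.
		 */
		cond_synchronize_rcu(gp_cookie);
		gp_cookie = get_state_synchronize_rcu();

		/*
		 * Keep a grace period in flight so the next call rarely
		 * blocks, without ever reusing a still-pending head.
		 */
		if (!smp_load_acquire(&gp_head_busy)) {
			gp_head_busy = 1;
			call_rcu(&gp_head, gp_noop);
		}
	}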

							Thanx, Paul


* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2016-01-05 16:35         ` Paul E. McKenney
@ 2016-01-06  8:06             ` Daniel Vetter
  0 siblings, 0 replies; 17+ messages in thread
From: Daniel Vetter @ 2016-01-06  8:06 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Chris Wilson, intel-gfx, Linux MM, Jens Axboe,
	Christoph Lameter, Hugh Dickins, Pekka Enberg

On Tue, Jan 05, 2016 at 08:35:37AM -0800, Paul E. McKenney wrote:
> On Tue, Jan 05, 2016 at 04:06:48PM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 05, 2016 at 04:02:13PM +0100, Peter Zijlstra wrote:
> > > > Shouldn't the slab subsystem do this for us if we request it delays the
> > > > actual kfree? Seems like a core bug to me ... Adding more folks.
> > > 
> > > note that sync_rcu() can take a terribly long time.. but yes, I seem to
> > > remember Paul talking about adding this to reclaim paths for just this
> > > reason. Not sure that ever happened though.
> 
> There is an RCU OOM notifier, but it just ensures that existing callbacks
> get processed in a timely fashion.  It does not block, as that would
> prevent other OOM notifiers from getting their memory freed quickly.
> 
> > Also, you might be wanting rcu_barrier() instead, that not only waits
> > for a GP to complete, but also for all pending callbacks to be
> > processed.
> 
> And in fact what the RCU OOM notifier does can be thought of as an
> asynchronous open-coded rcu_barrier().  If you are interested, please
> see rcu_register_oom_notifier() and friends.
> 
> > Without the latter there might still not be anything to free after it.
> 
> Another approach is synchronize_rcu() after some largish number of
> requests.  The advantage of this approach is that it throttles the
> production of callbacks at the source.  The corresponding disadvantage
> is that it slows things up.
> 
> Another approach is to use call_rcu(), but if the previous call_rcu()
> is still in flight, block waiting for it.  Yet another approach is
> the get_state_synchronize_rcu() / cond_synchronize_rcu() pair.  The
> idea is to do something like this:
> 
> 	cond_synchronize_rcu(cookie);
> 	cookie = get_state_synchronize_rcu();
> 
> You would of course do an initial get_state_synchronize_rcu() to
> get things going.  This would not block unless there was less than
> one grace period's worth of time between invocations.  But this
> assumes a busy system, where there is almost always a grace period
> in flight.  But you can make that happen as follows:
> 
> 	cond_synchronize_rcu(cookie);
> 	cookie = get_state_synchronize_rcu();
> 	call_rcu(&my_rcu_head, noop_function);
> 
> Note that you need additional code to make sure that the old callback
> has completed before doing a new one.  Setting and clearing a flag
> with appropriate memory ordering control suffices (e.g., smp_load_acquire()
> and smp_store_release()).

This pretty much went over my head ;-) What I naively hoped for is that
kfree() on an rcu-freeing slab could be taught to magically stall a bit
(or at least expedite the delayed freeing) if we're piling up too many
freed objects. Doing that only in OOM is probably too late since OOM
handling is a bit unreliable/unpredictable. And I thought we're not the
first ones running into this problem.

Do all the other users of rcu-freed slabs just open-code their own custom
approach? If that's the recommendation we can certainly follow that, too.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch



* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2016-01-06  8:06             ` Daniel Vetter
@ 2016-01-06  8:38               ` Peter Zijlstra
  -1 siblings, 0 replies; 17+ messages in thread
From: Peter Zijlstra @ 2016-01-06  8:38 UTC (permalink / raw)
  To: Paul E. McKenney, Chris Wilson, intel-gfx, Linux MM, Jens Axboe,
	Christoph Lameter, Hugh Dickins, Pekka Enberg

On Wed, Jan 06, 2016 at 09:06:58AM +0100, Daniel Vetter wrote:
> This pretty much went over my head ;-) What I naively hoped for is that
> kfree() on an rcu-freeing slab could be taught to magically stall a bit
> (or at least expedite the delayed freeing) if we're piling up too many
> freed objects.

Well, RCU does try harder when the callback list is getting 'big' (10k
IIRC).

> Doing that only in OOM is probably too late since OOM
> handling is a bit unreliable/unpredictable. And I thought we're not the
> first ones running into this problem.

The whole memory pressure thing is unreliable/unpredictable last time I
looked at it, but sure, I suppose we could try and poke RCU sooner, but
then you get into the problem of when: doing it too soon will be
detrimental to performance, doing it too late is, well, too late.

> Do all the other users of rcu-freed slabs just open-code their own custom
> approach? If that's the recommendation we can certainly follow that, too.

The ones I know of seem to simply ignore this problem..



* Re: [Intel-gfx] [PATCH v2 1/3] drm/i915: Enable lockless lookup of request tracking via RCU
  2016-01-06  8:38               ` Peter Zijlstra
@ 2016-01-06 15:56               ` Paul E. McKenney
  0 siblings, 0 replies; 17+ messages in thread
From: Paul E. McKenney @ 2016-01-06 15:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Chris Wilson, intel-gfx, Linux MM, Jens Axboe, Christoph Lameter,
	Hugh Dickins, Pekka Enberg

On Wed, Jan 06, 2016 at 09:38:30AM +0100, Peter Zijlstra wrote:
> On Wed, Jan 06, 2016 at 09:06:58AM +0100, Daniel Vetter wrote:
> > This pretty much went over my head ;-) What I naively hoped for is that
> > kfree() on an rcu-freeing slab could be tought to magically stall a bit
> > (or at least expedite the delayed freeing) if we're piling up too many
> > freed objects.
> 
> Well, RCU does try harder when the callback list is getting 'big' (10k
> IIRC).

You got it, 10k by default, can be adjusted with the rcutree.qhimark
kernel-boot/sysfs parameter.  When a given CPU's callback list exceeds
this limit, it more aggressively starts a grace period, and if a grace
period is already in progress, it does more aggressive quiescent-state
forcing.  It does nothing to push back on processes generating callbacks,
other than by soaking up extra CPU cycles.

So, Daniel, if you haven't tried hammering the system hard, give it a
shot and see if qhimark is helping enough.  And perhaps adjust its value
if need be.  (Though please let me know if this is necessary -- if it is,
we should try to automate its setting.)
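
(For example, on the kernel command line - the value here is purely
illustrative:)

	rcutree.qhimark=20000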

> > Doing that only in OOM is probably too late since OOM
> > handling is a bit unreliable/unpredictable. And I thought we're not the
> > first ones running into this problem.
> 
> The whole memory pressure thing is unreliable/unpredictable last time I
> looked at it, but sure, I suppose we could try and poke RCU sooner, but
> then you get into the problem of when, doing it too soon will be
> detrimental to performance, doing it too late is, well, too late.
> 
> > Do all the other users of rcu-freed slabs just open-code their own custom
> > approach? If that's the recommendation we can certainly follow that, too.
> 
> The ones I know of seem to simply ignore this problem..

I believe that there are a few that do the occasional synchronize_rcu()
to throttle themselves, but have not checked recently.

							Thanx, Paul

