* [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel
@ 2010-02-01  9:50 Luca Barbieri
From: Luca Barbieri @ 2010-02-01  9:50 UTC
  To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Luca Barbieri

nouveau_bo_wait will make the GPU channel wait for the fence if possible,
otherwise falling back to waiting with the CPU using ttm_bo_wait.

The nouveau_fence_sync function currently returns -ENOSYS, and is
the focus of the next patch.
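
For illustration, here is roughly what the function reduces to once the
locking and the re-check/retry logic (needed because bo->lock is dropped
around nouveau_fence_sync) are stripped away. This is a simplified
sketch, not code from the patch:

int
nouveau_bo_wait_sketch(struct ttm_buffer_object *bo,
		       struct nouveau_channel *chan)
{
	struct nouveau_fence *fence = bo->sync_obj;
	struct nouveau_fence *new_fence;

	if (!fence)
		return 0;

	if (!chan)	/* no channel given: wait with the CPU */
		return ttm_bo_wait(bo, false, false, false);

	if (nouveau_fence_channel(fence) == chan)
		return 0;	/* same channel: already ordered */

	new_fence = nouveau_fence_sync(fence, chan);
	if (IS_ERR(new_fence))	/* e.g. -ENOSYS: CPU fallback */
		return ttm_bo_wait(bo, false, false, false);

	/* the BO is now considered busy until the waiting channel's
	   fence passes, so that fence becomes the new sync object */
	nouveau_fence_unref((void **)&bo->sync_obj);
	bo->sync_obj = new_fence;
	return 0;
}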

Signed-off-by: Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c    |   68 ++++++++++++++++++++++++++++++-
 drivers/gpu/drm/nouveau/nouveau_drv.h   |    2 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |    6 +++
 drivers/gpu/drm/nouveau/nouveau_gem.c   |   20 +++++----
 4 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index db0ed4c..8afc17e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -35,6 +35,70 @@
 
 #include <linux/log2.h>
 
+int
+nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan)
+{
+	int ret = 0;
+
+	if (likely(!bo->sync_obj))
+		return 0;
+
+	spin_lock(&bo->lock);
+	if (chan) {
+		struct nouveau_fence *new_fence;
+		struct nouveau_channel *waited_chan;
+
+		do {
+			struct nouveau_fence *prev_fence;
+			prev_fence = bo->sync_obj;
+
+			waited_chan = nouveau_fence_channel(prev_fence);
+			if (likely(!waited_chan || waited_chan == chan))
+				break;
+
+			nouveau_fence_ref(prev_fence);
+
+			ret = ttm_bo_wait(bo, false, false, true);
+			if (!ret)
+				goto unref_break;
+
+			if (unlikely(prev_fence != bo->sync_obj))
+				goto unref_continue;
+
+			spin_unlock(&bo->lock);
+			new_fence = nouveau_fence_sync(prev_fence, chan);
+			spin_lock(&bo->lock);
+
+			if (likely(!IS_ERR(new_fence))) {
+				if (likely(bo->sync_obj)) {
+					if (unlikely(bo->sync_obj != prev_fence)) {
+						nouveau_fence_unref((void **)&new_fence);
+						continue;
+					}
+					nouveau_fence_unref((void **)&bo->sync_obj);
+				}
+				bo->sync_obj = new_fence;
+				ret = 0;
+unref_break:
+				nouveau_fence_unref((void **)&prev_fence);
+				break;
+			}
+
+			if (unlikely(prev_fence != bo->sync_obj)) {
+unref_continue:
+				nouveau_fence_unref((void **)&prev_fence);
+				continue;
+			}
+
+			nouveau_fence_unref((void **)&prev_fence);
+			ret = ttm_bo_wait(bo, false, false, false);
+		} while (0);
+	} else
+		ret = ttm_bo_wait(bo, false, false, false);
+	spin_unlock(&bo->lock);
+	return ret;
+}
+
 static void
 nouveau_bo_del_ttm(struct ttm_buffer_object *bo)
 {
@@ -469,8 +533,10 @@ nouveau_bo_move_accel_cleanup(struct nouveau_channel *chan,
 
 	ret = ttm_bo_move_accel_cleanup(&nvbo->bo, fence, NULL,
 					evict, no_wait, new_mem);
+
+	/* TODO: this should be redundant, since we do the check in validate */
 	if (nvbo->channel && nvbo->channel != chan)
-		ret = nouveau_fence_wait(fence, NULL, false, false);
+		ret = nouveau_bo_wait(&nvbo->bo, nvbo->channel);
 	nouveau_fence_unref((void *)&fence);
 	return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 5445cef..2b78ee6 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -1110,6 +1110,7 @@ extern int nv04_crtc_create(struct drm_device *, int index);
 
 /* nouveau_bo.c */
 extern struct ttm_bo_driver nouveau_bo_driver;
+extern int nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan);
 extern int nouveau_bo_new(struct drm_device *, struct nouveau_channel *,
 			  int size, int align, uint32_t flags,
 			  uint32_t tile_mode, uint32_t tile_flags,
@@ -1135,6 +1136,7 @@ extern int nouveau_fence_emit(struct nouveau_fence *);
 struct nouveau_channel *nouveau_fence_channel(struct nouveau_fence *);
 extern bool nouveau_fence_signalled(void *obj, void *arg);
 extern int nouveau_fence_wait(void *obj, void *arg, bool lazy, bool intr);
+extern struct nouveau_fence *nouveau_fence_sync(struct nouveau_fence *, struct nouveau_channel *);
 extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index faddf53..9b1c2c3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -56,6 +56,12 @@ nouveau_fence_del(struct kref *ref)
 	kfree(fence);
 }
 
+struct nouveau_fence*
+nouveau_fence_sync(struct nouveau_fence *waited_fence, struct nouveau_channel *chan)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
 void
 nouveau_fence_update(struct nouveau_channel *chan)
 {
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 70cc308..b8d61ff 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -356,15 +356,6 @@ validate_list(struct nouveau_channel *chan, struct list_head *list,
 
 	list_for_each_entry(nvbo, list, entry) {
 		struct drm_nouveau_gem_pushbuf_bo *b = &pbbo[nvbo->pbbo_index];
-		struct nouveau_fence *prev_fence = nvbo->bo.sync_obj;
-
-		if (prev_fence && nouveau_fence_channel(prev_fence) != chan) {
-			spin_lock(&nvbo->bo.lock);
-			ret = ttm_bo_wait(&nvbo->bo, false, false, false);
-			spin_unlock(&nvbo->bo.lock);
-			if (unlikely(ret))
-				return ret;
-		}
 
 		ret = nouveau_gem_set_domain(nvbo->gem, b->read_domains,
 					     b->write_domains,
@@ -379,6 +370,17 @@ validate_list(struct nouveau_channel *chan, struct list_head *list,
 		if (unlikely(ret))
 			return ret;
 
+		/* we must wait *after* validation, since we do the move
+		   with the kernel channel.
+
+		   Note that this may spin/sleep on a fence.
+		   TODO: is this a good idea, or should we bail and retry?
+		 */
+		ret = nouveau_bo_wait(&nvbo->bo, chan);
+		if (unlikely(ret))
+			return ret;
+
+
 		if (nvbo->bo.offset == b->presumed_offset &&
 		    ((nvbo->bo.mem.mem_type == TTM_PL_VRAM &&
 		      b->presumed_domain & NOUVEAU_GEM_DOMAIN_VRAM) ||
-- 
1.6.6.1.476.g01ddb


* [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01  9:50 Luca Barbieri
From: Luca Barbieri @ 2010-02-01  9:50 UTC
  To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Luca Barbieri

This patch adds code to allocate semaphores in a dynamic way using
an algorithm with a lockless fast path.

1. Semaphore BOs

Semaphore BOs are BOs containing semaphores. Each is 4KB large and
contains 1024 4-byte semaphores. They are pinned.

Semaphore BOs are allocated on-demand and freed at device takedown.
Those that are not fully allocated are kept on a free list.

Each is assigned a handle. DMA objects and references are created
on demand for each channel that needs to use a semaphore BO.
Those objects and references are automatically destroyed at channel
destruction time.

Typically only a single semaphore BO will be needed.

2. Semaphore allocation

Each semaphore BO contains a bitmask of free semaphores within the BO.
Allocation is done in a lockless fashion using a count of free
semaphores and the bitmask.

Semaphores are released once the fence on the waiting side has passed.
This is done by adding fields to nouveau_fence.

Semaphore values are zeroed when the semaphore BO is allocated, and
are afterwards only modified by the GPU.

This is done by storing a bitmask that allows alternating between
the values 0 and 1 for a given semaphore.
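
To illustrate the alternation (a sketch of the scheme just described,
not additional code): the values bitmap starts all-ones, the semaphore
memory starts zeroed, and test_and_change_bit() returns the previous
bit while flipping it, so consecutive users of a slot alternate:

/* 1st use of slot v: value = 1, ACQUIRE(1) blocks until RELEASE
 * writes 1, and the slot is left holding 1.
 * 2nd use of slot v: value = 0, ACQUIRE(0) blocks until RELEASE
 * writes 0, and the slot is left holding 0 again.
 * The CPU never needs to rewrite the semaphore memory. */
sem->value = test_and_change_bit(v, sem_bo->values);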

Signed-off-by: Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
---
 drivers/gpu/drm/nouveau/nouveau_dma.h   |    1 +
 drivers/gpu/drm/nouveau/nouveau_drv.h   |    7 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  243 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_state.c |    4 +
 4 files changed, 255 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h b/drivers/gpu/drm/nouveau/nouveau_dma.h
index dabfd65..0658979 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.h
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
@@ -69,6 +69,7 @@ enum {
 	NvGdiRect	= 0x8000000c,
 	NvImageBlit	= 0x8000000d,
 	NvSw		= 0x8000000e,
+	NvSem		= 0x81000000, /* range of 16M handles */
 
 	/* G80+ display objects */
 	NvEvoVRAM	= 0x01000000,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 2b78ee6..0a7abc7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -620,6 +620,11 @@ struct drm_nouveau_private {
 	struct {
 		struct dentry *channel_root;
 	} debugfs;
+
+	spinlock_t sem_bo_free_list_lock;
+	struct nouveau_sem_bo *sem_bo_free_list;
+	atomic_t sem_handles;
+	uint32_t sem_max_handles;
 };
 
 static inline struct drm_nouveau_private *
@@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
 extern void nouveau_fence_handler(struct drm_device *dev, int channel);
+extern void nouveau_fence_device_init(struct drm_device *dev);
+extern void nouveau_fence_device_takedown(struct drm_device *dev);
 
 /* nouveau_gem.c */
 extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9b1c2c3..01152f3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -32,6 +32,13 @@
 
 #define USE_REFCNT (dev_priv->card_type >= NV_10)
 
+#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
+
+/* reading fences can be very expensive
+ * use a threshold that would only use up half a single sem_bo
+ */
+#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / (NOUVEAU_MAX_CHANNEL_NR * 2))
+
 struct nouveau_fence {
 	struct nouveau_channel *channel;
 	struct kref refcount;
@@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
 	return (struct nouveau_fence *)sync_obj;
 }
 
+struct nouveau_sem_bo {
+	struct nouveau_sem_bo *next;
+	struct nouveau_bo *bo;
+	uint32_t handle;
+
+	/* >= 0: num_free + 1 slots are free, sem_bo is or is about to be on free_list
+	    -1: all allocated, sem_bo is NOT on free_list
+	*/
+	atomic_t num_free;
+
+	DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+	DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+	DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
+};
+
+struct nouveau_sem {
+	struct nouveau_sem_bo *sem_bo;
+	unsigned num;
+	uint32_t value;
+};
+
+struct nouveau_sem_bo*
+nouveau_sem_bo_alloc(struct drm_device *dev)
+{
+	struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+	struct nouveau_bo *bo;
+	int flags = TTM_PL_FLAG_VRAM;
+	int ret;
+	bool is_iomem;
+	void *mem;
+
+	sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+
+	if (!sem_bo)
+		return 0;
+
+	ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
+			0, 0x0000, true, true, &bo);
+	if (ret)
+		goto out_free;
+
+	sem_bo->bo = bo;
+
+	ret = nouveau_bo_pin(bo, flags);
+	if (ret)
+		goto out_bo;
+
+	ret = nouveau_bo_map(bo);
+	if (ret)
+		goto out_unpin;
+
+	mem = ttm_kmap_obj_virtual(&bo->kmap, &is_iomem);
+	if (is_iomem)
+		memset_io(mem, 0, NOUVEAU_SEM_BO_SIZE);
+	else
+		memset(mem, 0, NOUVEAU_SEM_BO_SIZE);
+
+	nouveau_bo_unmap(bo);
+
+	memset((void *)sem_bo->free_slots, 0xff, sizeof(sem_bo->free_slots));
+	memset((void *)sem_bo->values, 0xff, sizeof(sem_bo->values));
+	atomic_set(&sem_bo->num_free, sizeof(sem_bo->free_slots) * 8 - 1);
+
+	memset((void *)sem_bo->channels, 0, sizeof(sem_bo->channels));
+
+	return sem_bo;
+
+out_unpin:
+	nouveau_bo_unpin(sem_bo->bo);
+out_bo:
+	nouveau_bo_ref(0, &sem_bo->bo);
+out_free:
+	kfree(sem_bo);
+	return 0;
+}
+
+static void
+nouveau_sem_bo_channel_dtor(struct drm_device *dev,
+			     struct nouveau_gpuobj *gpuobj) {
+	if (gpuobj->priv) {
+		struct nouveau_sem_bo *sem_bo;
+		struct nouveau_channel *chan = gpuobj->im_channel;
+		sem_bo = (struct nouveau_sem_bo *)gpuobj->priv;
+
+		clear_bit(chan->id, sem_bo->channels);
+		smp_wmb();
+	}
+}
+
+int
+nouveau_sem_bo_channel_init(struct nouveau_sem_bo *sem_bo, struct nouveau_channel *chan)
+{
+	struct drm_device *dev = chan->dev;
+	struct nouveau_gpuobj *obj = NULL;
+	int ret;
+
+	if (test_bit(chan->id, sem_bo->channels))
+		return 0;
+
+	BUG_ON(sem_bo->bo->bo.mem.mem_type != TTM_PL_VRAM);
+
+	ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY,
+		sem_bo->bo->bo.mem.mm_node->start, NOUVEAU_SEM_BO_SIZE,
+		NV_DMA_ACCESS_RW, NV_DMA_TARGET_VIDMEM, &obj);
+	if (ret)
+		return ret;
+
+	obj->dtor = nouveau_sem_bo_channel_dtor;
+	obj->priv = sem_bo;
+
+	ret = nouveau_gpuobj_ref_add(dev, chan, sem_bo->handle, obj, NULL);
+	if (ret) {
+		nouveau_gpuobj_del(dev, &obj);
+		return ret;
+	}
+
+	set_bit(chan->id, sem_bo->channels);
+	smp_wmb();
+
+	return 0;
+}
+
+void
+nouveau_sem_bo_free(struct nouveau_sem_bo *sem_bo)
+{
+	nouveau_bo_unpin(sem_bo->bo);
+	nouveau_bo_ref(0, &sem_bo->bo);
+	kfree(sem_bo);
+}
+
+static inline void
+nouveau_sem_bo_enqueue(struct drm_device *dev, struct nouveau_sem_bo *sem_bo)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	spin_lock(&dev_priv->sem_bo_free_list_lock);
+	sem_bo->next = dev_priv->sem_bo_free_list;
+	dev_priv->sem_bo_free_list = sem_bo;
+	spin_unlock(&dev_priv->sem_bo_free_list_lock);
+}
+
+int
+nouveau_sem_alloc(struct drm_device *dev, struct nouveau_sem *sem)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	struct nouveau_sem_bo *sem_bo = 0;
+	int v;
+
+retry:
+	sem_bo = dev_priv->sem_bo_free_list;
+	if (!sem_bo) {
+		unsigned handle;
+
+		if (atomic_read(&dev_priv->sem_handles) >= dev_priv->sem_max_handles)
+			return -ENOMEM;
+
+		/* this works because sem_handles <= sem_max_handles + NR_CPUS */
+		handle = atomic_add_return(1, &dev_priv->sem_handles) - 1;
+
+		if (handle >= dev_priv->sem_max_handles)
+			return -ENOMEM;
+
+		sem_bo = nouveau_sem_bo_alloc(dev);
+		if (!sem_bo)
+			return -ENOMEM;
+
+		sem_bo->handle = NvSem + handle;
+
+		atomic_dec(&sem_bo->num_free);
+		nouveau_sem_bo_enqueue(dev, sem_bo);
+	} else {
+		if (unlikely(!atomic_add_unless(&sem_bo->num_free, -1, 0))) {
+			spin_lock(&dev_priv->sem_bo_free_list_lock);
+			if (unlikely(sem_bo != dev_priv->sem_bo_free_list)) {
+				spin_unlock(&dev_priv->sem_bo_free_list_lock);
+				goto retry;
+			}
+
+			dev_priv->sem_bo_free_list = sem_bo->next;
+			if (atomic_dec_return(&sem_bo->num_free) >= 0)
+				dev_priv->sem_bo_free_list = sem_bo;
+
+			spin_unlock(&dev_priv->sem_bo_free_list_lock);
+		}
+	}
+
+retry_bit:
+	v = find_first_bit(sem_bo->free_slots, sizeof(sem_bo->free_slots) * 8);
+
+	/* we reserved our bit by decrementing num_free
+	   however, the first available bit may have been taken */
+	BUG_ON(v >= sizeof(sem_bo->free_slots) * 8);
+
+	if (unlikely(!test_and_clear_bit(v, sem_bo->free_slots)))
+		goto retry_bit;
+
+	sem->sem_bo = sem_bo;
+	sem->value = test_and_change_bit(v, sem_bo->values);
+	sem->num = v;
+
+	return 0;
+}
+
+void
+nouveau_sem_release(struct drm_device *dev, struct nouveau_sem_bo *sem_bo, int i)
+{
+	set_bit(i, sem_bo->free_slots);
+
+	if (atomic_inc_and_test(&sem_bo->num_free))
+		nouveau_sem_bo_enqueue(dev, sem_bo);
+}
+
 static void
 nouveau_fence_del(struct kref *ref)
 {
@@ -266,3 +485,27 @@ nouveau_fence_fini(struct nouveau_channel *chan)
 	}
 }
 
+void
+nouveau_fence_device_init(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	spin_lock_init(&dev_priv->sem_bo_free_list_lock);
+	dev_priv->sem_bo_free_list = 0;
+	atomic_set(&dev_priv->sem_handles, 0);
+	/* these are each pinned and 4KB, providing 1024 semaphores each;
+	   we should need only one in normal circumstances */
+	dev_priv->sem_max_handles = 16;
+}
+
+void
+nouveau_fence_device_takedown(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	struct nouveau_sem_bo *sem_bo, *next;
+	/* all the sem_bos allocated must be in the free list since all channels
+	* and thus fences have already been terminated */
+	for (sem_bo = dev_priv->sem_bo_free_list; sem_bo; sem_bo = next) {
+		next = sem_bo->next;
+		nouveau_sem_bo_free(sem_bo);
+	}
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index fcd7610..7161981 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -412,6 +412,8 @@ nouveau_card_init(struct drm_device *dev)
 	if (ret)
 		goto out_mem;
 
+	nouveau_fence_device_init(dev);
+
 	/* PMC */
 	ret = engine->mc.init(dev);
 	if (ret)
@@ -532,6 +534,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
 		engine->timer.takedown(dev);
 		engine->mc.takedown(dev);
 
+		nouveau_fence_device_takedown(dev);
+
 		mutex_lock(&dev->struct_mutex);
 		ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM);
 		ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_TT);
-- 
1.6.6.1.476.g01ddb


* [PATCH 3/3] drm/nouveau: use semaphores for fully on-GPU interchannel synchronization
@ 2010-02-01  9:51 Luca Barbieri
From: Luca Barbieri @ 2010-02-01  9:51 UTC
  To: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Luca Barbieri

This patch implements the nouveau_fence_sync interface introduced
in the first patch using dynamically allocated semaphores,
introduced in the second patch.

This is tested on NV40, but should work on NV17-NV50 (previous cards
will just fall back to CPU waiting).
Make sure you are using the latest Nouveau git, as it contains critical
semaphore support.

Unlike a previously posted patch, this patch does not make any use of
software methods and is designed to do all the work on the GPU and to
be as fast as possible.

To perform inter-channel synchronization, commands are emitted on
both channels involved.

First, a semaphore is allocated, and a valid handle for it is inserted
in the channel hash table if necessary.

DMA_SEMAPHORE is set only if it differs from the last one used. It
usually does not differ, and thus SEMAPHORE interrupts typically
happen only once per channel.

After that, SEMAPHORE_OFFSET is set if changed and then either ACQUIRE
or RELEASE is used.

On the waiting channel, a fence is also emitted. Once that fence
expires, the semaphore is released and can be reused for any purpose.

This results in synchronization taking place fully on the GPU, with
no CPU waiting necessary at all.
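
Condensed, the commands emitted per sync are roughly the following
(a sketch built from the helpers in this patch; the DMA_SEMAPHORE and
SEMAPHORE_OFFSET methods inside nouveau_sem_emit are skipped when they
have not changed):

/* waiting channel: stall until the semaphore slot holds sem.value */
nouveau_sem_emit(chan, &sem, NV_SW_SEMAPHORE_ACQUIRE);
/* fence on the waiting channel; once it passes, the slot is free */
nouveau_fence_emit(sem_fence);
/* waited channel: write sem.value, unblocking the acquire */
nouveau_sem_emit(waited_chan, &sem, NV_SW_SEMAPHORE_RELEASE);
FIRE_RING(waited_chan);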

Signed-off-by: Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |    5 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  135 +++++++++++++++++++++++++++++--
 2 files changed, 133 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 0a7abc7..c4d6502 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -195,6 +195,8 @@ struct nouveau_channel {
 		uint32_t sequence;
 		uint32_t sequence_ack;
 		uint32_t last_sequence_irq;
+		atomic_t sem_count;
+		unsigned sem_threshold;
 	} fence;
 
 	/* DMA push buffer */
@@ -255,6 +257,9 @@ struct nouveau_channel {
 		char name[32];
 		struct drm_info_list info;
 	} debugfs;
+
+	unsigned sem_handle;
+	unsigned sem_num;
 };
 
 struct nouveau_instmem_engine {
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 01152f3..ec33bd3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -46,6 +46,9 @@ struct nouveau_fence {
 
 	uint32_t sequence;
 	bool signalled;
+
+	struct nouveau_sem_bo *sem_bo;
+	int sem_num;
 };
 
 static inline struct nouveau_fence *
@@ -275,10 +278,122 @@ nouveau_fence_del(struct kref *ref)
 	kfree(fence);
 }
 
+static inline void
+nouveau_sem_emit(struct nouveau_channel *chan, struct nouveau_sem *sem, unsigned op)
+{
+	uint32_t handle = sem->sem_bo->handle;
+	if (chan->sem_handle != handle) {
+		BEGIN_RING(chan, NvSubSw, NV_SW_DMA_SEMAPHORE, 1);
+		OUT_RING(chan, handle);
+		chan->sem_handle = handle;
+	}
+	if (chan->sem_num != sem->num) {
+		BEGIN_RING(chan, NvSubSw, NV_SW_SEMAPHORE_OFFSET, 1);
+		OUT_RING(chan, sem->num << 2);
+		chan->sem_num = sem->num;
+	}
+	BEGIN_RING(chan, NvSubSw, op, 1);
+	OUT_RING(chan, sem->value);
+}
+
+/* Currently this ignores waited_fence->sequence and syncs the last fence on waited_fence->channel.
+ * If a better GPU synchronization mechanism is discovered, then the actual fence may be used.
+ * Note that sem_fence is a fence on the *waiting* channel, used to free the semaphore.
+ */
 struct nouveau_fence*
 nouveau_fence_sync(struct nouveau_fence *waited_fence, struct nouveau_channel *chan)
 {
-	return ERR_PTR(-ENOSYS);
+	struct nouveau_channel *waited_chan;
+	struct drm_device *dev;
+	struct drm_nouveau_private *dev_priv;
+	struct nouveau_sem sem;
+	uint32_t handle;
+	int ret;
+	struct nouveau_fence *sem_fence;
+	unsigned long flags;
+
+	dev = chan->dev;
+	dev_priv = chan->dev->dev_private;
+
+	if (dev_priv->chipset < 0x17)
+		return ERR_PTR(-ENOSYS);
+
+	waited_chan = waited_fence->channel;
+
+	ret = RING_SPACE(chan, 6 + 2);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = RING_SPACE(waited_chan, 6);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* try to reclaim semaphores when we hit the threshold;
+	   this helps keep the number of active semaphores low.
+
+	   Note that in the DRI2 case this is never triggered,
+	   since we wait for fences on both channels.
+
+	   However, if the buffers were all different, this could
+	   be necessary.
+	*/
+	if (atomic_read(&chan->fence.sem_count) >= chan->fence.sem_threshold) {
+		spin_lock_irqsave(&chan->fence.lock, flags);
+		if (atomic_read(&chan->fence.sem_count) >= chan->fence.sem_threshold)
+			nouveau_fence_update(chan);
+		spin_unlock_irqrestore(&chan->fence.lock, flags);
+	}
+
+	ret = nouveau_fence_new(chan, &sem_fence, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = nouveau_sem_alloc(chan->dev, &sem);
+	if (ret) {
+		kfree(sem_fence);
+		return ERR_PTR(ret);
+	}
+
+	BUG_ON(!sem.sem_bo);
+
+	ret = nouveau_sem_bo_channel_init(sem.sem_bo, chan);
+	if (!ret)
+		ret = nouveau_sem_bo_channel_init(sem.sem_bo, waited_chan);
+	if (ret) {
+		nouveau_sem_release(dev, sem.sem_bo, sem.num);
+		kfree(sem_fence);
+		return ERR_PTR(ret);
+	}
+
+	handle = sem.sem_bo->handle;
+
+/*	NV_DEBUG(dev, "sync %i <- %i with %x:%i (sem %i/%i)\n", chan->id, waited_chan->id, sem.sem_bo->handle, sem.num, atomic_read(&chan->fence.sem_count), chan->fence.sem_threshold); */
+
+	sem_fence->sem_bo = sem.sem_bo;
+	sem_fence->sem_num = sem.num;
+
+	atomic_inc(&chan->fence.sem_count);
+
+/* TODO: this should take the channel locks when they are added */
+	nouveau_sem_emit(chan, &sem, NV_SW_SEMAPHORE_ACQUIRE);
+
+	nouveau_fence_emit(sem_fence);
+
+	nouveau_sem_emit(waited_chan, &sem, NV_SW_SEMAPHORE_RELEASE);
+	FIRE_RING(waited_chan);
+	return sem_fence;
+}
+
+void nouveau_fence_complete(struct nouveau_fence *fence)
+{
+	if (fence->sem_bo) {
+		nouveau_sem_release(fence->channel->dev, fence->sem_bo, fence->sem_num);
+		atomic_dec(&fence->channel->fence.sem_count);
+	}
+
+	fence->signalled = true;
+	list_del(&fence->entry);
+	kref_put(&fence->refcount, nouveau_fence_del);
 }
 
 void
@@ -288,6 +403,7 @@ nouveau_fence_update(struct nouveau_channel *chan)
 	struct list_head *entry, *tmp;
 	struct nouveau_fence *fence;
 	uint32_t sequence;
+	unsigned sem_threshold;
 
 	if (USE_REFCNT)
 		sequence = nvchan_rd32(chan, 0x48);
@@ -302,13 +418,16 @@ nouveau_fence_update(struct nouveau_channel *chan)
 		fence = list_entry(entry, struct nouveau_fence, entry);
 
 		sequence = fence->sequence;
-		fence->signalled = true;
-		list_del(&fence->entry);
-		kref_put(&fence->refcount, nouveau_fence_del);
+		nouveau_fence_complete(fence);
 
 		if (sequence == chan->fence.sequence_ack)
 			break;
 	}
+
+	sem_threshold = atomic_read(&chan->fence.sem_count) * 2;
+	if (sem_threshold < NOUVEAU_SEM_MIN_THRESHOLD)
+		sem_threshold = NOUVEAU_SEM_MIN_THRESHOLD;
+	chan->fence.sem_threshold = sem_threshold;
 }
 
 int
@@ -467,6 +586,10 @@ nouveau_fence_init(struct nouveau_channel *chan)
 {
 	INIT_LIST_HEAD(&chan->fence.pending);
 	spin_lock_init(&chan->fence.lock);
+	atomic_set(&chan->fence.sem_count, 0);
+	chan->fence.sem_threshold = NOUVEAU_SEM_MIN_THRESHOLD;
+	chan->sem_handle = 0;
+	chan->sem_num = ~0;
 	return 0;
 }
 
@@ -479,9 +602,7 @@ nouveau_fence_fini(struct nouveau_channel *chan)
 	list_for_each_safe(entry, tmp, &chan->fence.pending) {
 		fence = list_entry(entry, struct nouveau_fence, entry);
 
-		fence->signalled = true;
-		list_del(&fence->entry);
-		kref_put(&fence->refcount, nouveau_fence_del);
+		nouveau_fence_complete(fence);
 	}
 }
 
-- 
1.6.6.1.476.g01ddb


* Re: [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel
@ 2010-02-01 12:51 Francisco Jerez
From: Francisco Jerez @ 2010-02-01 12:51 UTC
  To: Luca Barbieri; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org> writes:

> nouveau_bo_wait will make the GPU channel wait for the fence if possible,
> otherwise falling back to waiting with the CPU using ttm_bo_wait.
>
> The nouveau_fence_sync function currently returns -ENOSYS, and is
> the focus of the next patch.
>
> Signed-off-by: Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>

Just a few comments inline.

> ---
>  drivers/gpu/drm/nouveau/nouveau_bo.c    |   68 ++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/nouveau/nouveau_drv.h   |    2 +
>  drivers/gpu/drm/nouveau/nouveau_fence.c |    6 +++
>  drivers/gpu/drm/nouveau/nouveau_gem.c   |   20 +++++----
>  4 files changed, 86 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index db0ed4c..8afc17e 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -35,6 +35,70 @@
>  
>  #include <linux/log2.h>
>  
> +int
> +nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan)
> +{
> +	int ret = 0;
> +
> +	if (likely(!bo->sync_obj))
> +		return 0;
> +
> +	spin_lock(&bo->lock);
> +	if (chan) {
> +		struct nouveau_fence *new_fence;
> +		struct nouveau_channel *waited_chan;
> +
> +		do {
> +			struct nouveau_fence *prev_fence;
> +			prev_fence = bo->sync_obj;
> +
> +			waited_chan = nouveau_fence_channel(prev_fence);
> +			if (likely(!waited_chan || waited_chan == chan))
> +				break;
> +
> +			nouveau_fence_ref(prev_fence);
> +
> +			ret = ttm_bo_wait(bo, false, false, true);
> +			if (!ret)
> +				goto unref_break;
> +
> +			if (unlikely(prev_fence != bo->sync_obj))
> +				goto unref_continue;
> +
> +			spin_unlock(&bo->lock);
> +			new_fence = nouveau_fence_sync(prev_fence, chan);
> +			spin_lock(&bo->lock);
> +
> +			if (likely(!IS_ERR(new_fence))) {
> +				if (likely(bo->sync_obj)) {
> +					if (unlikely(bo->sync_obj != prev_fence)) {
> +						nouveau_fence_unref((void **)&new_fence);
> +						continue;
> +					}
> +					nouveau_fence_unref((void **)&bo->sync_obj);
> +				}
> +				bo->sync_obj = new_fence;
> +				ret = 0;
> +unref_break:
> +				nouveau_fence_unref((void **)&prev_fence);
> +				break;
> +			}
> +
> +			if (unlikely(prev_fence != bo->sync_obj)) {
> +unref_continue:
> +				nouveau_fence_unref((void **)&prev_fence);
> +				continue;
> +			}
> +
> +			nouveau_fence_unref((void **)&prev_fence);
> +			ret = ttm_bo_wait(bo, false, false, false);
> +		} while (0);
> +	} else
> +		ret = ttm_bo_wait(bo, false, false, false);
> +	spin_unlock(&bo->lock);
> +	return ret;
> +}

I believe this looping will become unnecessary if the IRQ-fencing
patches get in, as every sync_obj will be usable as a SEMAPHORE_ACQUIRE
target.

> +
>  static void
>  nouveau_bo_del_ttm(struct ttm_buffer_object *bo)
>  {
> @@ -469,8 +533,10 @@ nouveau_bo_move_accel_cleanup(struct nouveau_channel *chan,
>  
>  	ret = ttm_bo_move_accel_cleanup(&nvbo->bo, fence, NULL,
>  					evict, no_wait, new_mem);
> +
> +	/* TODO: this should be redundant, since we do the check in validate */
>  	if (nvbo->channel && nvbo->channel != chan)
> -		ret = nouveau_fence_wait(fence, NULL, false, false);
> +		ret = nouveau_bo_wait(&nvbo->bo, nvbo->channel);
>  	nouveau_fence_unref((void *)&fence);
>  	return ret;
>  }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index 5445cef..2b78ee6 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -1110,6 +1110,7 @@ extern int nv04_crtc_create(struct drm_device *, int index);
>  
>  /* nouveau_bo.c */
>  extern struct ttm_bo_driver nouveau_bo_driver;
> +extern int nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan);
>  extern int nouveau_bo_new(struct drm_device *, struct nouveau_channel *,
>  			  int size, int align, uint32_t flags,
>  			  uint32_t tile_mode, uint32_t tile_flags,
> @@ -1135,6 +1136,7 @@ extern int nouveau_fence_emit(struct nouveau_fence *);
>  struct nouveau_channel *nouveau_fence_channel(struct nouveau_fence *);
>  extern bool nouveau_fence_signalled(void *obj, void *arg);
>  extern int nouveau_fence_wait(void *obj, void *arg, bool lazy, bool intr);
> +extern struct nouveau_fence *nouveau_fence_sync(struct nouveau_fence *, struct nouveau_channel *);
>  extern int nouveau_fence_flush(void *obj, void *arg);
>  extern void nouveau_fence_unref(void **obj);
>  extern void *nouveau_fence_ref(void *obj);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index faddf53..9b1c2c3 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -56,6 +56,12 @@ nouveau_fence_del(struct kref *ref)
>  	kfree(fence);
>  }
>  
> +struct nouveau_fence*
> +nouveau_fence_sync(struct nouveau_fence *waited_fence, struct nouveau_channel *chan)
> +{
> +	return ERR_PTR(-ENOSYS);
> +}
> +
>  void
>  nouveau_fence_update(struct nouveau_channel *chan)
>  {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 70cc308..b8d61ff 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -356,15 +356,6 @@ validate_list(struct nouveau_channel *chan, struct list_head *list,
>  
>  	list_for_each_entry(nvbo, list, entry) {
>  		struct drm_nouveau_gem_pushbuf_bo *b = &pbbo[nvbo->pbbo_index];
> -		struct nouveau_fence *prev_fence = nvbo->bo.sync_obj;
> -
> -		if (prev_fence && nouveau_fence_channel(prev_fence) != chan) {
> -			spin_lock(&nvbo->bo.lock);
> -			ret = ttm_bo_wait(&nvbo->bo, false, false, false);
> -			spin_unlock(&nvbo->bo.lock);
> -			if (unlikely(ret))
> -				return ret;
> -		}
>  
>  		ret = nouveau_gem_set_domain(nvbo->gem, b->read_domains,
>  					     b->write_domains,
> @@ -379,6 +370,17 @@ validate_list(struct nouveau_channel *chan, struct list_head *list,
>  		if (unlikely(ret))
>  			return ret;
>  
> +		/* we must wait *after* validation, since we do the move
> +		   with the kernel channel.
> +
> +		   Note that this may spin/sleep on a fence.
> +		   TODO: is this a good idea, or should we bail and retry?
> +		 */
> +		ret = nouveau_bo_wait(&nvbo->bo, chan);
> +		if (unlikely(ret))
> +			return ret;
> +

This is fine IMHO.

> +
>  		if (nvbo->bo.offset == b->presumed_offset &&
>  		    ((nvbo->bo.mem.mem_type == TTM_PL_VRAM &&
>  		      b->presumed_domain & NOUVEAU_GEM_DOMAIN_VRAM) ||



* Re: [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01 12:56 Francisco Jerez
From: Francisco Jerez @ 2010-02-01 12:56 UTC
  To: Luca Barbieri; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org> writes:

> This patch adds code to allocate semaphores in a dynamic way using
> an algorithm with a lockless fast path.
>
> 1. Semaphore BOs
>
> Semaphore BOs are BOs containing semaphores. Each is 4KB large and
> contains 1024 4-byte semaphores. They are pinned.
>
> Semaphore BOs are allocated on-demand and freed at device takedown.
> Those that are not fully allocated are kept on a free list.
>
> Each is assigned a handle. DMA objects and references are created
> on demand for each channel that needs to use a semaphore BO.
> Those objects and references are automatically destroyed at channel
> destruction time.
>
> Typically only a single semaphore BO will be needed.
>
> 2. Semaphore allocation
>
> Each semaphore BO contains a bitmask of free semaphores within the BO.
> Allocation is done in a lockless fashion using a count of free
> semaphores and the bitmask.
>
> Semaphores are released once the fence on the waiting side has passed.
> This is done by adding fields to nouveau_fence.
>
> Semaphore values are zeroed when the semaphore BO is allocated, and
> are afterwards only modified by the GPU.
>
> This is done by storing a bitmask that allows alternating between
> the values 0 and 1 for a given semaphore.
>
> Signed-off-by: Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
> ---
>  drivers/gpu/drm/nouveau/nouveau_dma.h   |    1 +
>  drivers/gpu/drm/nouveau/nouveau_drv.h   |    7 +
>  drivers/gpu/drm/nouveau/nouveau_fence.c |  243 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/nouveau/nouveau_state.c |    4 +
>  4 files changed, 255 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index dabfd65..0658979 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -69,6 +69,7 @@ enum {
>  	NvGdiRect	= 0x8000000c,
>  	NvImageBlit	= 0x8000000d,
>  	NvSw		= 0x8000000e,
> +	NvSem		= 0x81000000, /* range of 16M handles */
>  
>  	/* G80+ display objects */
>  	NvEvoVRAM	= 0x01000000,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index 2b78ee6..0a7abc7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -620,6 +620,11 @@ struct drm_nouveau_private {
>  	struct {
>  		struct dentry *channel_root;
>  	} debugfs;
> +
> +	spinlock_t sem_bo_free_list_lock;
> +	struct nouveau_sem_bo *sem_bo_free_list;
> +	atomic_t sem_handles;
> +	uint32_t sem_max_handles;
>  };
>  
>  static inline struct drm_nouveau_private *
> @@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
>  extern void nouveau_fence_unref(void **obj);
>  extern void *nouveau_fence_ref(void *obj);
>  extern void nouveau_fence_handler(struct drm_device *dev, int channel);
> +extern void nouveau_fence_device_init(struct drm_device *dev);
> +extern void nouveau_fence_device_takedown(struct drm_device *dev);
>  
>  /* nouveau_gem.c */
>  extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 9b1c2c3..01152f3 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -32,6 +32,13 @@
>  
>  #define USE_REFCNT (dev_priv->card_type >= NV_10)
>  
> +#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
> +
> +/* reading fences can be very expensive
> + * use a threshold that would only use up half a single sem_bo
> + */
> +#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / (NOUVEAU_MAX_CHANNEL_NR * 2))
> +
>  struct nouveau_fence {
>  	struct nouveau_channel *channel;
>  	struct kref refcount;
> @@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
>  	return (struct nouveau_fence *)sync_obj;
>  }
>  
> +struct nouveau_sem_bo {
> +	struct nouveau_sem_bo *next;
> +	struct nouveau_bo *bo;
> +	uint32_t handle;
> +
> +	/* >= 0: num_free + 1 slots are free, sem_bo is or is about to be on free_list
> +	    -1: all allocated, sem_bo is NOT on free_list
> +	*/
> +	atomic_t num_free;
> +
> +	DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
> +	DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
> +	DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
> +};
> +
> +struct nouveau_sem {
> +	struct nouveau_sem_bo *sem_bo;
> +	unsigned num;
> +	uint32_t value;
> +};
> +
> +struct nouveau_sem_bo*
> +nouveau_sem_bo_alloc(struct drm_device *dev)
> +{
> +	struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
> +	struct nouveau_bo *bo;
> +	int flags = TTM_PL_FLAG_VRAM;
> +	int ret;
> +	bool is_iomem;
> +	void *mem;
> +
> +	sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
> +
> +	if (!sem_bo)
> +		return 0;
> +
> +	ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
> +			0, 0x0000, true, true, &bo);
> +	if (ret)
> +		goto out_free;
> +
> +	sem_bo->bo = bo;
> +
> +	ret = nouveau_bo_pin(bo, flags);
> +	if (ret)
> +		goto out_bo;
> +
> +	ret = nouveau_bo_map(bo);
> +	if (ret)
> +		goto out_unpin;
> +
> +	mem = ttm_kmap_obj_virtual(&bo->kmap, &is_iomem);
> +	if (is_iomem)
> +		memset_io(mem, 0, NOUVEAU_SEM_BO_SIZE);
> +	else
> +		memset(mem, 0, NOUVEAU_SEM_BO_SIZE);
> +
> +	nouveau_bo_unmap(bo);
> +
> +	memset((void *)sem_bo->free_slots, 0xff, sizeof(sem_bo->free_slots));
> +	memset((void *)sem_bo->values, 0xff, sizeof(sem_bo->values));
> +	atomic_set(&sem_bo->num_free, sizeof(sem_bo->free_slots) * 8 - 1);
> +
> +	memset((void *)sem_bo->channels, 0, sizeof(sem_bo->channels));
> +
> +	return sem_bo;
> +
> +out_unpin:
> +	nouveau_bo_unpin(sem_bo->bo);
> +out_bo:
> +	nouveau_bo_ref(0, &sem_bo->bo);
> +out_free:
> +	kfree(sem_bo);
> +	return 0;
> +}
> +
> +static void
> +nouveau_sem_bo_channel_dtor(struct drm_device *dev,
> +			     struct nouveau_gpuobj *gpuobj) {
> +	if (gpuobj->priv) {
> +		struct nouveau_sem_bo *sem_bo;
> +		struct nouveau_channel *chan = gpuobj->im_channel;
> +		sem_bo = (struct nouveau_sem_bo *)gpuobj->priv;
> +
> +		clear_bit(chan->id, sem_bo->channels);
> +		smp_wmb();
> +	}
> +}
> +
> +int
> +nouveau_sem_bo_channel_init(struct nouveau_sem_bo *sem_bo, struct nouveau_channel *chan)
> +{
> +	struct drm_device *dev = chan->dev;
> +	struct nouveau_gpuobj *obj = NULL;
> +	int ret;
> +
> +	if (test_bit(chan->id, sem_bo->channels))
> +		return 0;
> +
> +	BUG_ON(sem_bo->bo->bo.mem.mem_type != TTM_PL_VRAM);
> +
> +	ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY,
> +		sem_bo->bo->bo.mem.mm_node->start, NOUVEAU_SEM_BO_SIZE,
> +		NV_DMA_ACCESS_RW, NV_DMA_TARGET_VIDMEM, &obj);
> +	if (ret)
> +		return ret;
> +
> +	obj->dtor = nouveau_sem_bo_channel_dtor;
> +	obj->priv = sem_bo;
> +
> +	ret = nouveau_gpuobj_ref_add(dev, chan, sem_bo->handle, obj, NULL);
> +	if (ret) {
> +		nouveau_gpuobj_del(dev, &obj);
> +		return ret;
> +	}
> +
> +	set_bit(chan->id, sem_bo->channels);
> +	smp_wmb();
> +
> +	return 0;
> +}
> +
> +void
> +nouveau_sem_bo_free(struct nouveau_sem_bo *sem_bo)
> +{
> +	nouveau_bo_unpin(sem_bo->bo);
> +	nouveau_bo_ref(0, &sem_bo->bo);
> +	kfree(sem_bo);
> +}
> +
> +static inline void
> +nouveau_sem_bo_enqueue(struct drm_device *dev, struct nouveau_sem_bo *sem_bo)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	spin_lock(&dev_priv->sem_bo_free_list_lock);
> +	sem_bo->next = dev_priv->sem_bo_free_list;
> +	dev_priv->sem_bo_free_list = sem_bo;
> +	spin_unlock(&dev_priv->sem_bo_free_list_lock);
> +}
> +
> +int
> +nouveau_sem_alloc(struct drm_device *dev, struct nouveau_sem *sem)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	struct nouveau_sem_bo *sem_bo = 0;
> +	int v;
> +
> +retry:
> +	sem_bo = dev_priv->sem_bo_free_list;
> +	if (!sem_bo) {
> +		unsigned handle;
> +
> +		if (atomic_read(&dev_priv->sem_handles) >= dev_priv->sem_max_handles)
> +			return -ENOMEM;
> +
> +		/* this works because sem_handles <= sem_max_handles + NR_CPUS */
> +		handle = atomic_add_return(1, &dev_priv->sem_handles) - 1;
> +
> +		if (handle >= dev_priv->sem_max_handles)
> +			return -ENOMEM;
> +
> +		sem_bo = nouveau_sem_bo_alloc(dev);
> +		if (!sem_bo)
> +			return -ENOMEM;
> +
> +		sem_bo->handle = NvSem + handle;
> +
> +		atomic_dec(&sem_bo->num_free);
> +		nouveau_sem_bo_enqueue(dev, sem_bo);
> +	} else {
> +		if (unlikely(!atomic_add_unless(&sem_bo->num_free, -1, 0))) {
> +			spin_lock(&dev_priv->sem_bo_free_list_lock);
> +			if (unlikely(sem_bo != dev_priv->sem_bo_free_list)) {
> +				spin_unlock(&dev_priv->sem_bo_free_list_lock);
> +				goto retry;
> +			}
> +
> +			dev_priv->sem_bo_free_list = sem_bo->next;
> +			if (atomic_dec_return(&sem_bo->num_free) >= 0)
> +				dev_priv->sem_bo_free_list = sem_bo;
> +
> +			spin_unlock(&dev_priv->sem_bo_free_list_lock);
> +		}
> +	}
> +
> +retry_bit:
> +	v = find_first_bit(sem_bo->free_slots, sizeof(sem_bo->free_slots) * 8);
> +
> +	/* we reserved our bit by decrementing num_free
> +	   however, the first available bit may have been taken */
> +	BUG_ON(v >= sizeof(sem_bo->free_slots) * 8);
> +
> +	if (unlikely(!test_and_clear_bit(v, sem_bo->free_slots)))
> +		goto retry_bit;
> +
> +	sem->sem_bo = sem_bo;
> +	sem->value = test_and_change_bit(v, sem_bo->values);
> +	sem->num = v;
> +
> +	return 0;
> +}
> +

It seems to me this code could be hugely simplified using a plain
spinlock. The locklessness of this patch is really cool but I have
doubts it will pay off...

How often do we expect cross-channel sync to kick in? Maybe 2-3 times
per frame? I suspect contention will be rare enough to make spinlocks
as fast as atomics for all real-life cases, and they don't have such a
high maintainability cost. What do you guys think?
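
Roughly what I have in mind (just a sketch; it assumes a hypothetical
sem_bo_lock and a plain-int free_count replacing num_free and the
atomics, and it omits the new-BO allocation path):

int
nouveau_sem_alloc_locked(struct drm_device *dev, struct nouveau_sem *sem)
{
	struct drm_nouveau_private *dev_priv = dev->dev_private;
	struct nouveau_sem_bo *sem_bo;
	int v;

	spin_lock(&dev_priv->sem_bo_lock);
	sem_bo = dev_priv->sem_bo_free_list;
	if (!sem_bo) {
		spin_unlock(&dev_priv->sem_bo_lock);
		return -ENOMEM;	/* new-BO path omitted */
	}

	v = find_first_bit(sem_bo->free_slots,
			   NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
	clear_bit(v, sem_bo->free_slots);
	if (--sem_bo->free_count == 0)	/* now full: drop off the list */
		dev_priv->sem_bo_free_list = sem_bo->next;
	spin_unlock(&dev_priv->sem_bo_lock);

	sem->sem_bo = sem_bo;
	sem->value = test_and_change_bit(v, sem_bo->values);
	sem->num = v;
	return 0;
}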

> +void
> +nouveau_sem_release(struct drm_device *dev, struct nouveau_sem_bo *sem_bo, int i)
> +{
> +	set_bit(i, sem_bo->free_slots);
> +
> +	if (atomic_inc_and_test(&sem_bo->num_free))
> +		nouveau_sem_bo_enqueue(dev, sem_bo);
> +}
> +
>  static void
>  nouveau_fence_del(struct kref *ref)
>  {
> @@ -266,3 +485,27 @@ nouveau_fence_fini(struct nouveau_channel *chan)
>  	}
>  }
>  
> +void
> +nouveau_fence_device_init(struct drm_device *dev)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	spin_lock_init(&dev_priv->sem_bo_free_list_lock);
> +	dev_priv->sem_bo_free_list = 0;
> +	atomic_set(&dev_priv->sem_handles, 0);
> +	/* these are each pinned and 4KB, providing 1024 semaphores each;
> +	   we should need only one in normal circumstances */
> +	dev_priv->sem_max_handles = 16;
> +}
> +
> +void
> +nouveau_fence_device_takedown(struct drm_device *dev)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	struct nouveau_sem_bo *sem_bo, *next;
> +	/* all the sem_bos allocated must be in the free list since all channels
> +	* and thus fences have already been terminated */
> +	for (sem_bo = dev_priv->sem_bo_free_list; sem_bo; sem_bo = next) {
> +		next = sem_bo->next;
> +		nouveau_sem_bo_free(sem_bo);
> +	}
> +}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
> index fcd7610..7161981 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_state.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_state.c
> @@ -412,6 +412,8 @@ nouveau_card_init(struct drm_device *dev)
>  	if (ret)
>  		goto out_mem;
>  
> +	nouveau_fence_device_init(dev);
> +
>  	/* PMC */
>  	ret = engine->mc.init(dev);
>  	if (ret)
> @@ -532,6 +534,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
>  		engine->timer.takedown(dev);
>  		engine->mc.takedown(dev);
>  
> +		nouveau_fence_device_takedown(dev);
> +
>  		mutex_lock(&dev->struct_mutex);
>  		ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM);
>  		ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_TT);



* Re: [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01 13:27 Luca Barbieri
From: Luca Barbieri @ 2010-02-01 13:27 UTC
  To: Francisco Jerez; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> How often do we expect cross-channel sync to kick in? Maybe 2-3 times
> per frame? I suspect contention will be rare enough to make spinlocks
> as fast as atomics for all real-life cases, and they don't have such a
> high maintainability cost. What do you guys think?

For the case of a single (or a few) GL applications, the requirements
are indeed modest.

I'm not sure that spinlocks or an otherwise reduced solution would be
much simpler.
You basically would just avoid the retrying code.

Also, if you have a multithreaded/multiprocess GPGPU application on a
large SMP machine things may change, as you may have a lot of commands
and semaphores in flight, as well as high contention for anything
global.

Of course, currently we hold both the BKL and struct_mutex around
things, which makes it all moot, but hopefully we'll switch to
per-channel mutexes soon (I'm looking into that).


* Re: [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01 14:30 Francisco Jerez
From: Francisco Jerez @ 2010-02-01 14:30 UTC
  To: Luca Barbieri; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org> writes:

>> How often do we expect cross-channel sync to kick in? Maybe 2-3 times
>> per frame? I suspect contention will be rare enough to make spinlocks
>> as fast as atomics for all real-life cases, and they don't have such a
>> high maintainability cost. What do you guys think?
>
> For the case of a single (or a few) GL applications, the requirements
> are indeed modest.
>
> I'm not sure that spinlocks or an otherwise reduced solution would be
> much simpler.
> You basically would just avoid the retrying code.
>
> Also, if you have a multithreaded/multiprocess GPGPU application on a
> large SMP machine things may change, as you may have a lot of commands
> and semaphores in flight, as well as high contention for anything
> global.
>
Sounds like premature optimization to me. I'm just stating my personal
view here, but I have a feeling a patch with 60% of the lines could do
just as well for most realistic cases.

Maarten, Ben, what do you think about this?

> Of course, currently we hold both the BKL and struct_mutex around
> things, which makes it all moot, but hopefully we'll switch to
> per-channel mutexes soon (I'm looking into that).

BTW, the kernel has some linked list helpers you might want to use for
sem_bo_free_list, and probably the best place for the sem stuff to live
is "dev_priv->fence" instead of the root of drm_nouveau_private.



* Re: [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01 14:50 Luca Barbieri
From: Luca Barbieri @ 2010-02-01 14:50 UTC
  To: Francisco Jerez; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> Sounds like premature optimization to me. I'm just stating my personal
> view here, but I have a feeling a patch with 60% of the lines could do
> just as well for most realistic cases.

Perhaps, but really, the only thing you would probably save by using
spinlocks in the fast path is retrying in nouveau_sem_alloc, which
should be at most 10 lines saved.

You could save much more by supporting only a single static semaphore
BO, and still retain almost all flexibility by making it large.
However, it's somewhat inelegant, and why not just keep the
functionality so we never need to revisit this again?

> BTW, the kernel has some linked list helpers you might want to use for
> sem_bo_free_list
It is a singly linked list, and slist.h never got merged.
I could possibly make it doubly linked, even though it's a bit useless.

> and probably the best place for the sem stuff to live
> is "dev_priv->fence" instead of the root of drm_nouveau_private.
There is no "fence" currently in drm_nouveau_private.
Adding a "sem" or "fence" substructure could make sense though.
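
E.g. something like this (sketch):

struct drm_nouveau_private {
	/* ... existing members ... */

	struct {
		spinlock_t bo_free_list_lock;
		struct nouveau_sem_bo *bo_free_list;
		atomic_t handles;
		uint32_t max_handles;
	} sem;
};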


* Re: [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01 15:25 Francisco Jerez
From: Francisco Jerez @ 2010-02-01 15:25 UTC
  To: Luca Barbieri; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1.1: Type: text/plain, Size: 1332 bytes --]

Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org> writes:

>> Sounds like premature optimization to me. I'm just stating my personal
>> view here, but I have a feeling a patch with 60% of the lines could do
>> just as well for most realistic cases.
>
> Perhaps, but really, the only thing you would probably save by using
> spinlocks in the fast path is retrying in nouveau_sem_alloc, which
> should be at most 10 lines saved.
>
You'd also save some bookkeeping and the double error checking.

> You could save much more by supporting only a single static semaphore
> BO, and still retain almost all flexibility by making it large.
> However, it's somewhat inelegant, and why not just keep the
> functionality so we never need to revisit this again?

Yup, I'm fine with having several semaphore BOs.

>
>> BTW, the kernel has some linked list helpers you might want to use for
>> sem_bo_free_list
> It is a singly linked list, and slist.h never got merged.
> I could possibly make it doubly linked, even though it's a bit useless.
>
>> and probably the best place for the sem stuff to live
>> is "dev_priv->fence" instead of the root of drm_nouveau_private.
> There is no "fence" currently in drm_nouveau_private.
> Adding a "sem" or "fence" substructure could make sense though.

Yes, that would make sense.



* Re: [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator
@ 2010-02-01 18:03 Marcin Slusarz
From: Marcin Slusarz @ 2010-02-01 18:03 UTC
  To: Luca Barbieri; +Cc: nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On Mon, Feb 01, 2010 at 10:50:09AM +0100, Luca Barbieri wrote:
> This patch adds code to allocate semaphores in a dynamic way using
> an algorithm with a lockless fast path.

some minor comments below

> 
> 1. Semaphore BOs
> 
> Semaphore BOs are BOs containing semaphores. Each is 4KB large and
> contains 1024 4-byte semaphores. They are pinned.
> 
> Semaphore BOs are allocated on-demand and freed at device takedown.
> Those that are not fully allocated are kept on a free list.
> 
> Each is assigned a handle. DMA objects and references are created
> on demand for each channel that needs to use a semaphore BO.
> Those objects and references are automatically destroyed at channel
> destruction time.
> 
> Typically only a single semaphore BO will be needed.
> 
> 2. Semaphore allocation
> 
> Each semaphore BO contains a bitmask of free semaphores within the BO.
> Allocation is done in a lockless fashion using a count of free
> semaphores and the bitmask.
> 
> Semaphores are released once the fence on the waiting side has passed.
> This is done by adding fields to nouveau_fence.
> 
> Semaphore values are zeroed when the semaphore BO is allocated, and
> are afterwards only modified by the GPU.
> 
> This is done by storing a bitmask that allows alternating between
> the values 0 and 1 for a given semaphore.
> 
> Signed-off-by: Luca Barbieri <luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
> ---
>  drivers/gpu/drm/nouveau/nouveau_dma.h   |    1 +
>  drivers/gpu/drm/nouveau/nouveau_drv.h   |    7 +
>  drivers/gpu/drm/nouveau/nouveau_fence.c |  243 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/nouveau/nouveau_state.c |    4 +
>  4 files changed, 255 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index dabfd65..0658979 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -69,6 +69,7 @@ enum {
>  	NvGdiRect	= 0x8000000c,
>  	NvImageBlit	= 0x8000000d,
>  	NvSw		= 0x8000000e,
> +	NvSem		= 0x81000000, /* range of 16M handles */
>  
>  	/* G80+ display objects */
>  	NvEvoVRAM	= 0x01000000,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index 2b78ee6..0a7abc7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -620,6 +620,11 @@ struct drm_nouveau_private {
>  	struct {
>  		struct dentry *channel_root;
>  	} debugfs;
> +
> +	spinlock_t sem_bo_free_list_lock;
> +	struct nouveau_sem_bo *sem_bo_free_list;
> +	atomic_t sem_handles;
> +	uint32_t sem_max_handles;
>  };
>  
>  static inline struct drm_nouveau_private *
> @@ -1141,6 +1146,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
>  extern void nouveau_fence_unref(void **obj);
>  extern void *nouveau_fence_ref(void *obj);
>  extern void nouveau_fence_handler(struct drm_device *dev, int channel);
> +extern void nouveau_fence_device_init(struct drm_device *dev);
> +extern void nouveau_fence_device_takedown(struct drm_device *dev);
>  
>  /* nouveau_gem.c */
>  extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 9b1c2c3..01152f3 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -32,6 +32,13 @@
>  
>  #define USE_REFCNT (dev_priv->card_type >= NV_10)
>  
> +#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
> +
> +/* reading fences can be very expensive
> + * use a threshold that would only use up half a single sem_bo
> + */
> +#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / (NOUVEAU_MAX_CHANNEL_NR * 2))
> +
>  struct nouveau_fence {
>  	struct nouveau_channel *channel;
>  	struct kref refcount;
> @@ -47,6 +54,218 @@ nouveau_fence(void *sync_obj)
>  	return (struct nouveau_fence *)sync_obj;
>  }
>  
> +struct nouveau_sem_bo {
> +	struct nouveau_sem_bo *next;
> +	struct nouveau_bo *bo;
> +	uint32_t handle;
> +
> +	/* >= 0: num_free + 1 slots are free, sem_bo is or is about to be on free_list
> +	    -1: all allocated, sem_bo is NOT on free_list
> +	*/
> +	atomic_t num_free;
> +
> +	DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
> +	DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
> +	DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
> +};
> +
> +struct nouveau_sem {
> +	struct nouveau_sem_bo *sem_bo;
> +	unsigned num;
> +	uint32_t value;
> +};
> +
> +struct nouveau_sem_bo*
> +nouveau_sem_bo_alloc(struct drm_device *dev)
> +{
> +	struct nouveau_sem_bo *sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
^^^^^^^^

> +	struct nouveau_bo *bo;
> +	int flags = TTM_PL_FLAG_VRAM;
> +	int ret;
> +	bool is_iomem;
> +	void *mem;
> +
> +	sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);

double allocation
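
The fix is just to drop one of the two calls, e.g.:

	struct nouveau_sem_bo *sem_bo;
	...
	sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
	if (!sem_bo)
		return NULL;

(returning NULL rather than 0 here also covers the sparse note below)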

> +
> +	if (!sem_bo)
> +		return 0;

sparse would probably complain about 0 instead of NULL

> +
> +	ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
> +			0, 0x0000, true, true, &bo);
> +	if (ret)
> +		goto out_free;
> +
> +	sem_bo->bo = bo;
> +
> +	ret = nouveau_bo_pin(bo, flags);
> +	if (ret)
> +		goto out_bo;
> +
> +	ret = nouveau_bo_map(bo);
> +	if (ret)
> +		goto out_unpin;
> +
> +	mem = ttm_kmap_obj_virtual(&bo->kmap, &is_iomem);
> +	if (is_iomem)
> +		memset_io(mem, 0, NOUVEAU_SEM_BO_SIZE);
> +	else
> +		memset(mem, 0, NOUVEAU_SEM_BO_SIZE);
> +
> +	nouveau_bo_unmap(bo);
> +
> +	memset((void *)sem_bo->free_slots, 0xff, sizeof(sem_bo->free_slots));
> +	memset((void *)sem_bo->values, 0xff, sizeof(sem_bo->values));
> +	atomic_set(&sem_bo->num_free, sizeof(sem_bo->free_slots) * 8 - 1);
> +
> +	memset((void *)sem_bo->channels, 0, sizeof(sem_bo->channels));
> +
> +	return sem_bo;
> +
> +out_unpin:
> +	nouveau_bo_unpin(sem_bo->bo);
> +out_bo:
> +	nouveau_bo_ref(0, &sem_bo->bo);
> +out_free:
> +	kfree(sem_bo);
> +	return 0;
> +}
> +
> +static void
> +nouveau_sem_bo_channel_dtor(struct drm_device *dev,
> +			     struct nouveau_gpuobj *gpuobj) {
> +	if (gpuobj->priv) {

if (!gpuobj->priv)
	return;

would save this code from unneeded indentation...
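
Folding in the cast note below as well, the whole dtor could read
(sketch):

	static void
	nouveau_sem_bo_channel_dtor(struct drm_device *dev,
				     struct nouveau_gpuobj *gpuobj)
	{
		struct nouveau_sem_bo *sem_bo = gpuobj->priv;

		if (!sem_bo)
			return;

		clear_bit(gpuobj->im_channel->id, sem_bo->channels);
		smp_wmb();
	}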

> +		struct nouveau_sem_bo *sem_bo;
> +		struct nouveau_channel *chan = gpuobj->im_channel;
> +		sem_bo = (struct nouveau_sem_bo *)gpuobj->priv;

priv is (void *) - no need to cast it

> +
> +		clear_bit(chan->id, sem_bo->channels);
> +		smp_wmb();
> +	}
> +}
> +
> +int
> +nouveau_sem_bo_channel_init(struct nouveau_sem_bo *sem_bo, struct nouveau_channel *chan)
> +{
> +	struct drm_device *dev = chan->dev;
> +	struct nouveau_gpuobj *obj = NULL;
> +	int ret;
> +
> +	if (test_bit(chan->id, sem_bo->channels))
> +		return 0;
> +
> +	BUG_ON(sem_bo->bo->bo.mem.mem_type != TTM_PL_VRAM);

it might be a bug, but this function has a failure path and killing the box
is not nice, so why not handle it properly? (WARN_ON + -EINVAL?)
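
Something like:

	if (WARN_ON(sem_bo->bo->bo.mem.mem_type != TTM_PL_VRAM))
		return -EINVAL;

(WARN_ON() returns the condition, so the check keeps its one-line shape.)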

> +
> +	ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY,
> +		sem_bo->bo->bo.mem.mm_node->start, NOUVEAU_SEM_BO_SIZE,
> +		NV_DMA_ACCESS_RW, NV_DMA_TARGET_VIDMEM, &obj);
> +	if (ret)
> +		return ret;
> +
> +	obj->dtor = nouveau_sem_bo_channel_dtor;
> +	obj->priv = sem_bo;
> +
> +	ret = nouveau_gpuobj_ref_add(dev, chan, sem_bo->handle, obj, NULL);
> +	if (ret) {
> +		nouveau_gpuobj_del(dev, &obj);
> +		return ret;
> +	}
> +
> +	set_bit(chan->id, sem_bo->channels);
> +	smp_wmb();
> +
> +	return 0;
> +}
> +
> +void
> +nouveau_sem_bo_free(struct nouveau_sem_bo *sem_bo)
> +{
> +	nouveau_bo_unpin(sem_bo->bo);
> +	nouveau_bo_ref(0, &sem_bo->bo);
> +	kfree(sem_bo);
> +}
> +
> +static inline void
> +nouveau_sem_bo_enqueue(struct drm_device *dev, struct nouveau_sem_bo *sem_bo)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	spin_lock(&dev_priv->sem_bo_free_list_lock);
> +	sem_bo->next = dev_priv->sem_bo_free_list;
> +	dev_priv->sem_bo_free_list = sem_bo;
> +	spin_unlock(&dev_priv->sem_bo_free_list_lock);
> +}
> +
> +int
> +nouveau_sem_alloc(struct drm_device *dev, struct nouveau_sem *sem)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	struct nouveau_sem_bo *sem_bo = 0;
> +	int v;
> +
> +retry:
> +	sem_bo = dev_priv->sem_bo_free_list;
> +	if (!sem_bo) {
> +		unsigned handle;
> +
> +		if (atomic_read(&dev_priv->sem_handles) >= dev_priv->sem_max_handles)
> +			return -ENOMEM;
> +
> +		/* this works because sem_handles <= sem_max_handles + NR_CPUS */
> +		handle = atomic_add_return(1, &dev_priv->sem_handles) - 1;
> +
> +		if (handle >= dev_priv->sem_max_handles)
> +			return -ENOMEM;
> +
> +		sem_bo = nouveau_sem_bo_alloc(dev);
> +		if (!sem_bo)
> +			return -ENOMEM;
> +
> +		sem_bo->handle = NvSem + handle;
> +
> +		atomic_dec(&sem_bo->num_free);
> +		nouveau_sem_bo_enqueue(dev, sem_bo);

you could hide "handle" and "next" initialization in nouveau_sem_bo_alloc
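
A sketch of this call site with that change (the two-argument
nouveau_sem_bo_alloc() is assumed, not in the patch):

	sem_bo = nouveau_sem_bo_alloc(dev, NvSem + handle);
	if (!sem_bo)
		return -ENOMEM;

	atomic_dec(&sem_bo->num_free);
	nouveau_sem_bo_enqueue(dev, sem_bo);

with the allocator storing the handle and initializing next itself.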

> +	} else {
> +		if (unlikely(!atomic_add_unless(&sem_bo->num_free, -1, 0))) {
> +			spin_lock(&dev_priv->sem_bo_free_list_lock);
> +			if (unlikely(sem_bo != dev_priv->sem_bo_free_list)) {
> +				spin_unlock(&dev_priv->sem_bo_free_list_lock);
> +				goto retry;
> +			}
> +
> +			dev_priv->sem_bo_free_list = sem_bo->next;
> +			if (atomic_dec_return(&sem_bo->num_free) >= 0)
> +				dev_priv->sem_bo_free_list = sem_bo;
> +
> +			spin_unlock(&dev_priv->sem_bo_free_list_lock);
> +		}
> +	}
> +
> +retry_bit:
> +	v = find_first_bit(sem_bo->free_slots, sizeof(sem_bo->free_slots) * 8);
> +
> +	/* we reserved our bit by decrementing num_free
> +	   however, the first available bit may have been taken */
> +	BUG_ON(v >= sizeof(sem_bo->free_slots) * 8);

another unneeded BUG
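
Same remark as above, though here the error path also has to hand back
the num_free reservation taken earlier. A sketch, mirroring
nouveau_sem_release() below minus the set_bit:

	if (WARN_ON(v >= sizeof(sem_bo->free_slots) * 8)) {
		/* give back the slot reserved via num_free */
		if (atomic_inc_and_test(&sem_bo->num_free))
			nouveau_sem_bo_enqueue(dev, sem_bo);
		return -EINVAL;
	}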

> +
> +	if (unlikely(!test_and_clear_bit(v, sem_bo->free_slots)))
> +		goto retry_bit;
> +
> +	sem->sem_bo = sem_bo;
> +	sem->value = test_and_change_bit(v, sem_bo->values);
> +	sem->num = v;
> +
> +	return 0;
> +}
> +
> +void
> +nouveau_sem_release(struct drm_device *dev, struct nouveau_sem_bo *sem_bo, int i)
> +{
> +	set_bit(i, sem_bo->free_slots);
> +
> +	if (atomic_inc_and_test(&sem_bo->num_free))
> +		nouveau_sem_bo_enqueue(dev, sem_bo);
> +}
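
A worked trace of the num_free protocol, since the encoding is easy to
misread (this is my reading of the struct comment's invariant: free
slots == num_free + 1 while the BO is on the free list):

	/* fresh BO:    1024 free bits, num_free = 1023; the creating path
	 *              decs it to pay for the bit it claims, then enqueues,
	 *              so free == num_free + 1 holds on the list
	 * fast path:   atomic_add_unless(&num_free, -1, 0) succeeds while
	 *              num_free > 0, each success paying for one claimed bit
	 * num_free==0: one slot left; the fast path refuses it, so the last
	 *              slot is only handed out under the list lock, which
	 *              also unlinks the BO (num_free -> -1)
	 * release:     set_bit + atomic_inc_and_test; only the -1 -> 0
	 *              transition puts the BO back on the free list
	 */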
> +
>  static void
>  nouveau_fence_del(struct kref *ref)
>  {
> @@ -266,3 +485,27 @@ nouveau_fence_fini(struct nouveau_channel *chan)
>  	}
>  }
>  
> +void
> +nouveau_fence_device_init(struct drm_device *dev)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	spin_lock_init(&dev_priv->sem_bo_free_list_lock);
> +	dev_priv->sem_bo_free_list = 0;
> +	atomic_set(&dev_priv->sem_handles, 0);
> +	/* these are each pinned and 4KB, providing 1024 semaphores each
> +	   we should need only one in normal circumstances */
> +	dev_priv->sem_max_handles = 16;
> +}
> +
> +void
> +nouveau_fence_device_takedown(struct drm_device *dev)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	struct nouveau_sem_bo *sem_bo, *next;
> +	/* all the sem_bos allocated must be in the free list since all channels
> +	* and thus fences have already been terminated */
> +	for (sem_bo = dev_priv->sem_bo_free_list; sem_bo; sem_bo = next) {
> +		next = sem_bo->next;
> +		nouveau_sem_bo_free(sem_bo);
> +	}
> +}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
> index fcd7610..7161981 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_state.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_state.c
> @@ -412,6 +412,8 @@ nouveau_card_init(struct drm_device *dev)
>  	if (ret)
>  		goto out_mem;
>  
> +	nouveau_fence_device_init(dev);
> +
>  	/* PMC */
>  	ret = engine->mc.init(dev);
>  	if (ret)
> @@ -532,6 +534,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
>  		engine->timer.takedown(dev);
>  		engine->mc.takedown(dev);
>  
> +		nouveau_fence_device_takedown(dev);
> +
>  		mutex_lock(&dev->struct_mutex);
>  		ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM);
>  		ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_TT);
> -- 
> 1.6.6.1.476.g01ddb
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-02-01 18:03 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-02-01  9:50 [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel Luca Barbieri
     [not found] ` <1265017810-13354-1-git-send-email-luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
2010-02-01  9:50   ` [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator Luca Barbieri
     [not found]     ` <1265017810-13354-2-git-send-email-luca-Ukmtq+NC3rhBHFWNQifrYwC/G2K4zDHf@public.gmane.org>
2010-02-01 12:56       ` Francisco Jerez
     [not found]         ` <87bpg9w1w8.fsf-sGOZH3hwPm2sTnJN9+BGXg@public.gmane.org>
2010-02-01 13:27           ` Luca Barbieri
     [not found]             ` <ff13bc9a1002010527n5812d30fj8555a940aeba179f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-02-01 14:30               ` Francisco Jerez
     [not found]                 ` <874om1vxj4.fsf-sGOZH3hwPm2sTnJN9+BGXg@public.gmane.org>
2010-02-01 14:50                   ` Luca Barbieri
     [not found]                     ` <ff13bc9a1002010650o5cab4d65q26f1d655f82f4f5a-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-02-01 15:25                       ` Francisco Jerez
2010-02-01 18:03       ` Marcin Slusarz
2010-02-01  9:51   ` [PATCH 3/3] drm/nouveau: use semaphores for fully on-GPU interchannel synchronization Luca Barbieri
2010-02-01 12:51   ` [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel Francisco Jerez
