From: Felix Kuehling <felix.kuehling@amd.com>
To: "Christian König" <christian.koenig@amd.com>,
	amd-gfx@lists.freedesktop.org
Cc: sean.keely@amd.com
Subject: Re: [PATCH 1/1] drm/amdkfd: Improve concurrency of event handling
Date: Thu, 3 Mar 2022 10:31:55 -0500	[thread overview]
Message-ID: <bc98ad9b-d9ab-a529-6089-70470df1d390@amd.com> (raw)
In-Reply-To: <a09270d0-54b5-364d-fd0a-53f4fd746f25@amd.com>


Am 2022-03-03 um 02:25 schrieb Christian König:
> Am 02.03.22 um 21:06 schrieb Felix Kuehling:
>> Use rcu_read_lock to read p->event_idr concurrently with other readers
>> and writers. Use p->event_mutex only for creating and destroying events
>> and in kfd_wait_on_events.
>
> That might not necessarily work as you expected. rcu_read_lock() does 
> not provide barriers for the CPU cache.
>
>> Protect the contents of the kfd_event structure with a per-event
>> spinlock that can be taken inside the rcu_read_lock critical section.
>
> That helps to prevent problems for the ev structure, but it doesn't 
> protect the idr itself.

idr_find can be used safely under rcu_read_lock according to this 
comment in idr.h:

  * idr_find() is able to be called locklessly, using RCU. The caller must
  * ensure calls to this function are made within rcu_read_lock() regions.
  * Other readers (lock-free or otherwise) and modifications may be running
  * concurrently.
  *
  * It is still required that the caller manage the synchronization and
  * lifetimes of the items. So if RCU lock-free lookups are used, typically
  * this would mean that the items have their own locks, or are amenable to
  * lock-free access; and that the items are freed by RCU (or only freed after
  * having been deleted from the idr tree *and* a synchronize_rcu() grace
  * period).

It was introduced by f9c46d6ea5ce ("idr: make idr_find rcu-safe") in 2008.
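
For illustration, here is a minimal sketch of that pattern with a hypothetical
"item" structure (not the kfd code): idr_find() runs locklessly under
rcu_read_lock(), a per-item spinlock protects the item's contents, and the
item is freed only after idr_remove() plus an RCU grace period, so concurrent
lookups never touch freed memory. That is the same scheme the patch below
uses for kfd_event.

  struct item {
          spinlock_t lock;
          unsigned long id;
          int state;
  };

  /* Reader: lockless idr lookup, per-item lock for the contents. */
  static void signal_item(struct idr *idr, unsigned long id)
  {
          struct item *it;

          rcu_read_lock();
          it = idr_find(idr, id);
          if (it) {
                  spin_lock(&it->lock);
                  it->state = 1;
                  spin_unlock(&it->lock);
          }
          rcu_read_unlock();
  }

  /* Writer: create/destroy still serialized by a mutex; free only after
   * removal from the idr and an RCU grace period have both completed.
   */
  static void destroy_item(struct idr *idr, struct mutex *m, struct item *it)
  {
          mutex_lock(m);
          idr_remove(idr, it->id);
          mutex_unlock(m);
          synchronize_rcu();
          kfree(it);
  }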


>
> BTW: Why are you using an idr in the first place? I don't see the 
> lookup used anywhere.

lookup_event_by_id is just a wrapper around idr_find; it's used in 
kfd_event_destroy, kfd_set_event, kfd_reset_event and kfd_wait_on_events. 
lookup_signaled_event_by_partial_id also uses idr_find and is used in 
kfd_signal_event_interrupt.
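
The wrapper itself is essentially a one-line idr_find() call on
p->event_idr, roughly:

  static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
  {
          return idr_find(&p->event_idr, id);
  }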

Regards,
   Felix


>
> Regards,
> Christian.
>
>>
>> This eliminates contention of p->event_mutex in set_event, which tends
>> to be on the critical path for dispatch latency even when busy waiting
>> is used. It also eliminates lock contention in event interrupt handlers.
>> Since the p->event_mutex is now used much less, the impact of requiring
>> it in kfd_wait_on_events should also be much smaller.
>>
>> This should improve event handling latency for processes using multiple
>> GPUs concurrently.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_events.c | 119 +++++++++++++++---------
>>   drivers/gpu/drm/amd/amdkfd/kfd_events.h |   1 +
>>   2 files changed, 78 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> index deecccebe5b6..1726306715b2 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> @@ -128,8 +128,8 @@ static int 
>> allocate_event_notification_slot(struct kfd_process *p,
>>   }
>>     /*
>> - * Assumes that p->event_mutex is held and of course that p is not 
>> going
>> - * away (current or locked).
>> + * Assumes that p->event_mutex or rcu_read_lock is held and of course 
>> that p is
>> + * not going away.
>>    */
>>   static struct kfd_event *lookup_event_by_id(struct kfd_process *p, 
>> uint32_t id)
>>   {
>> @@ -251,15 +251,18 @@ static void destroy_event(struct kfd_process 
>> *p, struct kfd_event *ev)
>>       struct kfd_event_waiter *waiter;
>>         /* Wake up pending waiters. They will return failure */
>> +    spin_lock(&ev->lock);
>>       list_for_each_entry(waiter, &ev->wq.head, wait.entry)
>> -        waiter->event = NULL;
>> +        WRITE_ONCE(waiter->event, NULL);
>>       wake_up_all(&ev->wq);
>> +    spin_unlock(&ev->lock);
>>         if (ev->type == KFD_EVENT_TYPE_SIGNAL ||
>>           ev->type == KFD_EVENT_TYPE_DEBUG)
>>           p->signal_event_count--;
>>         idr_remove(&p->event_idr, ev->event_id);
>> +    synchronize_rcu();
>>       kfree(ev);
>>   }
>>   @@ -392,6 +395,7 @@ int kfd_event_create(struct file *devkfd, 
>> struct kfd_process *p,
>>       ev->auto_reset = auto_reset;
>>       ev->signaled = false;
>>   +    spin_lock_init(&ev->lock);
>>       init_waitqueue_head(&ev->wq);
>>         *event_page_offset = 0;
>> @@ -466,6 +470,7 @@ int kfd_criu_restore_event(struct file *devkfd,
>>       ev->auto_reset = ev_priv->auto_reset;
>>       ev->signaled = ev_priv->signaled;
>>   +    spin_lock_init(&ev->lock);
>>       init_waitqueue_head(&ev->wq);
>>         mutex_lock(&p->event_mutex);
>> @@ -609,13 +614,13 @@ static void set_event(struct kfd_event *ev)
>>         /* Auto reset if the list is non-empty and we're waking
>>        * someone. waitqueue_active is safe here because we're
>> -     * protected by the p->event_mutex, which is also held when
>> +     * protected by the ev->lock, which is also held when
>>        * updating the wait queues in kfd_wait_on_events.
>>        */
>>       ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
>>         list_for_each_entry(waiter, &ev->wq.head, wait.entry)
>> -        waiter->activated = true;
>> +        WRITE_ONCE(waiter->activated, true);
>>         wake_up_all(&ev->wq);
>>   }
>> @@ -626,16 +631,18 @@ int kfd_set_event(struct kfd_process *p, 
>> uint32_t event_id)
>>       int ret = 0;
>>       struct kfd_event *ev;
>>   -    mutex_lock(&p->event_mutex);
>> +    rcu_read_lock();
>>         ev = lookup_event_by_id(p, event_id);
>> +    spin_lock(&ev->lock);
>>         if (ev && event_can_be_cpu_signaled(ev))
>>           set_event(ev);
>>       else
>>           ret = -EINVAL;
>>   -    mutex_unlock(&p->event_mutex);
>> +    spin_unlock(&ev->lock);
>> +    rcu_read_unlock();
>>       return ret;
>>   }
>>   @@ -650,23 +657,25 @@ int kfd_reset_event(struct kfd_process *p, 
>> uint32_t event_id)
>>       int ret = 0;
>>       struct kfd_event *ev;
>>   -    mutex_lock(&p->event_mutex);
>> +    rcu_read_lock();
>>         ev = lookup_event_by_id(p, event_id);
>> +    spin_lock(&ev->lock);
>>         if (ev && event_can_be_cpu_signaled(ev))
>>           reset_event(ev);
>>       else
>>           ret = -EINVAL;
>>   -    mutex_unlock(&p->event_mutex);
>> +    spin_unlock(&ev->lock);
>> +    rcu_read_unlock();
>>       return ret;
>>     }
>>     static void acknowledge_signal(struct kfd_process *p, struct 
>> kfd_event *ev)
>>   {
>> -    page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT;
>> +    WRITE_ONCE(page_slots(p->signal_page)[ev->event_id], 
>> UNSIGNALED_EVENT_SLOT);
>>   }
>>     static void set_event_from_interrupt(struct kfd_process *p,
>> @@ -674,7 +683,9 @@ static void set_event_from_interrupt(struct 
>> kfd_process *p,
>>   {
>>       if (ev && event_can_be_gpu_signaled(ev)) {
>>           acknowledge_signal(p, ev);
>> +        spin_lock(&ev->lock);
>>           set_event(ev);
>> +        spin_unlock(&ev->lock);
>>       }
>>   }
>>   @@ -693,7 +704,7 @@ void kfd_signal_event_interrupt(u32 pasid, 
>> uint32_t partial_id,
>>       if (!p)
>>           return; /* Presumably process exited. */
>>   -    mutex_lock(&p->event_mutex);
>> +    rcu_read_lock();
>>         if (valid_id_bits)
>>           ev = lookup_signaled_event_by_partial_id(p, partial_id,
>> @@ -721,7 +732,7 @@ void kfd_signal_event_interrupt(u32 pasid, 
>> uint32_t partial_id,
>>                   if (id >= KFD_SIGNAL_EVENT_LIMIT)
>>                       break;
>>   -                if (slots[id] != UNSIGNALED_EVENT_SLOT)
>> +                if (READ_ONCE(slots[id]) != UNSIGNALED_EVENT_SLOT)
>>                       set_event_from_interrupt(p, ev);
>>               }
>>           } else {
>> @@ -730,14 +741,14 @@ void kfd_signal_event_interrupt(u32 pasid, 
>> uint32_t partial_id,
>>                * only signaled events from the IDR.
>>                */
>>               for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++)
>> -                if (slots[id] != UNSIGNALED_EVENT_SLOT) {
>> +                if (READ_ONCE(slots[id]) != UNSIGNALED_EVENT_SLOT) {
>>                       ev = lookup_event_by_id(p, id);
>>                       set_event_from_interrupt(p, ev);
>>                   }
>>           }
>>       }
>>   -    mutex_unlock(&p->event_mutex);
>> +    rcu_read_unlock();
>>       kfd_unref_process(p);
>>   }
>>   @@ -767,9 +778,11 @@ static int init_event_waiter_get_status(struct 
>> kfd_process *p,
>>       if (!ev)
>>           return -EINVAL;
>>   +    spin_lock(&ev->lock);
>>       waiter->event = ev;
>>       waiter->activated = ev->signaled;
>>       ev->signaled = ev->signaled && !ev->auto_reset;
>> +    spin_unlock(&ev->lock);
>>         return 0;
>>   }
>> @@ -781,8 +794,11 @@ static void 
>> init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter)
>>       /* Only add to the wait list if we actually need to
>>        * wait on this event.
>>        */
>> -    if (!waiter->activated)
>> +    if (!waiter->activated) {
>> +        spin_lock(&ev->lock);
>>           add_wait_queue(&ev->wq, &waiter->wait);
>> +        spin_unlock(&ev->lock);
>> +    }
>>   }
>>     /* test_event_condition - Test condition of events being waited for
>> @@ -802,10 +818,10 @@ static uint32_t test_event_condition(bool all, 
>> uint32_t num_events,
>>       uint32_t activated_count = 0;
>>         for (i = 0; i < num_events; i++) {
>> -        if (!event_waiters[i].event)
>> +        if (!READ_ONCE(event_waiters[i].event))
>>               return KFD_IOC_WAIT_RESULT_FAIL;
>>   -        if (event_waiters[i].activated) {
>> +        if (READ_ONCE(event_waiters[i].activated)) {
>>               if (!all)
>>                   return KFD_IOC_WAIT_RESULT_COMPLETE;
>>   @@ -834,6 +850,8 @@ static int copy_signaled_event_data(uint32_t 
>> num_events,
>>       for (i = 0; i < num_events; i++) {
>>           waiter = &event_waiters[i];
>>           event = waiter->event;
>> +        if (!event)
>> +            return -EINVAL; /* event was destroyed */
>>           if (waiter->activated && event->type == 
>> KFD_EVENT_TYPE_MEMORY) {
>>               dst = &data[i].memory_exception_data;
>>               src = &event->memory_exception_data;
>> @@ -844,11 +862,8 @@ static int copy_signaled_event_data(uint32_t 
>> num_events,
>>       }
>>         return 0;
>> -
>>   }
>>   -
>> -
>>   static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
>>   {
>>       if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
>> @@ -872,9 +887,12 @@ static void free_waiters(uint32_t num_events, 
>> struct kfd_event_waiter *waiters)
>>       uint32_t i;
>>         for (i = 0; i < num_events; i++)
>> -        if (waiters[i].event)
>> +        if (waiters[i].event) {
>> +            spin_lock(&waiters[i].event->lock);
>>               remove_wait_queue(&waiters[i].event->wq,
>>                         &waiters[i].wait);
>> +            spin_unlock(&waiters[i].event->lock);
>> +        }
>>         kfree(waiters);
>>   }
>> @@ -898,6 +916,9 @@ int kfd_wait_on_events(struct kfd_process *p,
>>           goto out;
>>       }
>>   +    /* Use p->event_mutex here to protect against concurrent 
>> creation and
>> +     * destruction of events while we initialize event_waiters.
>> +     */
>>       mutex_lock(&p->event_mutex);
>>         for (i = 0; i < num_events; i++) {
>> @@ -976,14 +997,19 @@ int kfd_wait_on_events(struct kfd_process *p,
>>       }
>>       __set_current_state(TASK_RUNNING);
>>   +    mutex_lock(&p->event_mutex);
>>       /* copy_signaled_event_data may sleep. So this has to happen
>>        * after the task state is set back to RUNNING.
>> +     *
>> +     * The event may also have been destroyed after signaling. So
>> +     * copy_signaled_event_data also must confirm that the event
>> +     * still exists. Therefore this must be under the p->event_mutex
>> +     * which is also held when events are destroyed.
>>        */
>>       if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE)
>>           ret = copy_signaled_event_data(num_events,
>>                              event_waiters, events);
>>   -    mutex_lock(&p->event_mutex);
>>   out_unlock:
>>       free_waiters(num_events, event_waiters);
>>       mutex_unlock(&p->event_mutex);
>> @@ -1042,8 +1068,7 @@ int kfd_event_mmap(struct kfd_process *p, 
>> struct vm_area_struct *vma)
>>   }
>>     /*
>> - * Assumes that p->event_mutex is held and of course
>> - * that p is not going away (current or locked).
>> + * Assumes that p is not going away.
>>    */
>>   static void lookup_events_by_type_and_signal(struct kfd_process *p,
>>           int type, void *event_data)
>> @@ -1055,6 +1080,8 @@ static void 
>> lookup_events_by_type_and_signal(struct kfd_process *p,
>>         ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
>>   +    rcu_read_lock();
>> +
>>       id = KFD_FIRST_NONSIGNAL_EVENT_ID;
>>       idr_for_each_entry_continue(&p->event_idr, ev, id)
>>           if (ev->type == type) {
>> @@ -1062,9 +1089,11 @@ static void 
>> lookup_events_by_type_and_signal(struct kfd_process *p,
>>               dev_dbg(kfd_device,
>>                       "Event found: id %X type %d",
>>                       ev->event_id, ev->type);
>> +            spin_lock(&ev->lock);
>>               set_event(ev);
>>               if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
>>                   ev->memory_exception_data = *ev_data;
>> +            spin_unlock(&ev->lock);
>>           }
>>         if (type == KFD_EVENT_TYPE_MEMORY) {
>> @@ -1087,6 +1116,8 @@ static void 
>> lookup_events_by_type_and_signal(struct kfd_process *p,
>>                   p->lead_thread->pid, p->pasid);
>>           }
>>       }
>> +
>> +    rcu_read_unlock();
>>   }
>>     #ifdef KFD_SUPPORT_IOMMU_V2
>> @@ -1162,16 +1193,10 @@ void kfd_signal_iommu_event(struct kfd_dev 
>> *dev, u32 pasid,
>>         if (KFD_GC_VERSION(dev) != IP_VERSION(9, 1, 0) &&
>>           KFD_GC_VERSION(dev) != IP_VERSION(9, 2, 2) &&
>> -        KFD_GC_VERSION(dev) != IP_VERSION(9, 3, 0)) {
>> -        mutex_lock(&p->event_mutex);
>> -
>> -        /* Lookup events by type and signal them */
>> +        KFD_GC_VERSION(dev) != IP_VERSION(9, 3, 0))
>>           lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
>>                   &memory_exception_data);
>>   -        mutex_unlock(&p->event_mutex);
>> -    }
>> -
>>       kfd_unref_process(p);
>>   }
>>   #endif /* KFD_SUPPORT_IOMMU_V2 */
>> @@ -1188,12 +1213,7 @@ void kfd_signal_hw_exception_event(u32 pasid)
>>       if (!p)
>>           return; /* Presumably process exited. */
>>   -    mutex_lock(&p->event_mutex);
>> -
>> -    /* Lookup events by type and signal them */
>>       lookup_events_by_type_and_signal(p, 
>> KFD_EVENT_TYPE_HW_EXCEPTION, NULL);
>> -
>> -    mutex_unlock(&p->event_mutex);
>>       kfd_unref_process(p);
>>   }
>>   @@ -1229,16 +1249,19 @@ void kfd_signal_vm_fault_event(struct 
>> kfd_dev *dev, u32 pasid,
>>               info->prot_write ? 1 : 0;
>>           memory_exception_data.failure.imprecise = 0;
>>       }
>> -    mutex_lock(&p->event_mutex);
>> +
>> +    rcu_read_lock();
>>         id = KFD_FIRST_NONSIGNAL_EVENT_ID;
>>       idr_for_each_entry_continue(&p->event_idr, ev, id)
>>           if (ev->type == KFD_EVENT_TYPE_MEMORY) {
>> +            spin_lock(&ev->lock);
>>               ev->memory_exception_data = memory_exception_data;
>>               set_event(ev);
>> +            spin_unlock(&ev->lock);
>>           }
>>   -    mutex_unlock(&p->event_mutex);
>> +    rcu_read_unlock();
>>       kfd_unref_process(p);
>>   }
>>   @@ -1272,22 +1295,28 @@ void kfd_signal_reset_event(struct kfd_dev 
>> *dev)
>>               continue;
>>           }
>>   -        mutex_lock(&p->event_mutex);
>> +        rcu_read_lock();
>> +
>>           id = KFD_FIRST_NONSIGNAL_EVENT_ID;
>>           idr_for_each_entry_continue(&p->event_idr, ev, id) {
>>               if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
>> +                spin_lock(&ev->lock);
>>                   ev->hw_exception_data = hw_exception_data;
>>                   ev->hw_exception_data.gpu_id = user_gpu_id;
>>                   set_event(ev);
>> +                spin_unlock(&ev->lock);
>>               }
>>               if (ev->type == KFD_EVENT_TYPE_MEMORY &&
>>                   reset_cause == KFD_HW_EXCEPTION_ECC) {
>> +                spin_lock(&ev->lock);
>>                   ev->memory_exception_data = memory_exception_data;
>>                   ev->memory_exception_data.gpu_id = user_gpu_id;
>>                   set_event(ev);
>> +                spin_unlock(&ev->lock);
>>               }
>>           }
>> -        mutex_unlock(&p->event_mutex);
>> +
>> +        rcu_read_unlock();
>>       }
>>       srcu_read_unlock(&kfd_processes_srcu, idx);
>>   }
>> @@ -1320,19 +1349,25 @@ void kfd_signal_poison_consumed_event(struct 
>> kfd_dev *dev, u32 pasid)
>>       memory_exception_data.gpu_id = user_gpu_id;
>>       memory_exception_data.failure.imprecise = true;
>>   -    mutex_lock(&p->event_mutex);
>> +    rcu_read_lock();
>> +
>>       idr_for_each_entry_continue(&p->event_idr, ev, id) {
>>           if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
>> +            spin_lock(&ev->lock);
>>               ev->hw_exception_data = hw_exception_data;
>>               set_event(ev);
>> +            spin_unlock(&ev->lock);
>>           }
>>             if (ev->type == KFD_EVENT_TYPE_MEMORY) {
>> +            spin_lock(&ev->lock);
>>               ev->memory_exception_data = memory_exception_data;
>>               set_event(ev);
>> +            spin_unlock(&ev->lock);
>>           }
>>       }
>> -    mutex_unlock(&p->event_mutex);
>> +
>> +    rcu_read_unlock();
>>         /* user application will handle SIGBUS signal */
>>       send_sig(SIGBUS, p->lead_thread, 0);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
>> index 1238af11916e..55d376f56021 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
>> @@ -59,6 +59,7 @@ struct kfd_event {
>>         int type;
>>   +    spinlock_t lock;
>>       wait_queue_head_t wq; /* List of event waiters. */
>>         /* Only for signal events. */
>
