Re: [PATCH] virtio_ring: Fix the stale index in available ring

From: Gavin Shan <gshan@redhat.com>
To: "Michael S. Tsirkin" <mst@redhat.com>
Cc: virtualization@lists.linux.dev, linux-kernel@vger.kernel.org,
	jasowang@redhat.com, xuanzhuo@linux.alibaba.com,
	yihyu@redhat.com, shan.gavin@gmail.com,
	Will Deacon <will@kernel.org>,
	Catalin Marinas <catalin.marinas@arm.com>,
	linux-arm-kernel@lists.infradead.org, mochs@nvidia.com
Subject: Re: [PATCH] virtio_ring: Fix the stale index in available ring
Date: Mon, 18 Mar 2024 09:41:45 +1000	[thread overview]
Message-ID: <589d980f-2e4d-47b4-9dc7-8c64dbe271ce@redhat.com> (raw)
In-Reply-To: <20240317124214-mutt-send-email-mst@kernel.org>

On 3/18/24 02:50, Michael S. Tsirkin wrote:
> On Fri, Mar 15, 2024 at 09:24:36PM +1000, Gavin Shan wrote:
>>
>> On 3/15/24 21:05, Michael S. Tsirkin wrote:
>>> On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote:
>>>>>> Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I tried
>>>> to reproduce it with my own driver where one thread writes to the shared buffer
>>>> and another thread reads from the buffer. I don't hit the out-of-order issue so
>>>> far.
>>>
>>> Make sure the 2 areas you are accessing are in different cache lines.
>>>
>>
>> Yes, I already put those 2 areas to separate cache lines.
>>
>>>
>>>> My driver may be not correct somewhere and I will update if I can reproduce
>>>> the issue with my driver in the future.
>>>
>>> Then maybe your change is just making virtio slower and masks the bug
>>> that is actually elsewhere?
>>>
>>> You don't really need a driver. Here's a simple test: without barriers
>>> assertion will fail. With barriers it will not.
>>> (Warning: didn't bother testing too much, could be buggy.
>>>
>>> ---
>>>
>>> #include <pthread.h>
>>> #include <stdio.h>
>>> #include <stdlib.h>
>>> #include <assert.h>
>>>
>>> #define FIRST values[0]
>>> #define SECOND values[64]
>>>
>>> volatile int values[100] = {};
>>>
>>> void* writer_thread(void* arg) {
>>> 	while (1) {
>>> 	FIRST++;
>>> 	// NEED smp_wmb here
>>          __asm__ volatile("dmb ishst" : : : "memory");
>>> 	SECOND++;
>>> 	}
>>> }
>>>
>>> void* reader_thread(void* arg) {
>>>       while (1) {
>>> 	int first = FIRST;
>>> 	// NEED smp_rmb here
>>          __asm__ volatile("dmb ishld" : : : "memory");
>>> 	int second = SECOND;
>>> 	assert(first - second == 1 || first - second == 0);
>>>       }
>>> }
>>>
>>> int main() {
>>>       pthread_t writer, reader;
>>>
>>>       pthread_create(&writer, NULL, writer_thread, NULL);
>>>       pthread_create(&reader, NULL, reader_thread, NULL);
>>>
>>>       pthread_join(writer, NULL);
>>>       pthread_join(reader, NULL);
>>>
>>>       return 0;
>>> }
>>>
>>
>> Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit
>> the assert on both of them. After replacing 'dmb' with 'dsb', I can
>> hit assert on both of them too. I need to look at the code closely.
>>
>> [root@virt-mtcollins-02 test]# ./a
>> a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
>> Aborted (core dumped)
>>
>> [root@nvidia-grace-hopper-05 test]# ./a
>> a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
>> Aborted (core dumped)
>>
>> Thanks,
>> Gavin
> 
> 
> Actually this test is broken. No need for ordering it's a simple race.
> The following works on x86 though (x86 does not need barriers
> though).
> 
> 
> #include <pthread.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <assert.h>
> 
> #if 0
> #define x86_rmb()  asm volatile("lfence":::"memory")
> #define x86_mb()  asm volatile("mfence":::"memory")
> #define x86_smb()  asm volatile("sfence":::"memory")
> #else
> #define x86_rmb()  asm volatile("":::"memory")
> #define x86_mb()  asm volatile("":::"memory")
> #define x86_smb()  asm volatile("":::"memory")
> #endif
> 
> #define FIRST values[0]
> #define SECOND values[640]
> #define FLAG values[1280]
> 
> volatile unsigned values[2000] = {};
> 
> void* writer_thread(void* arg) {
> 	while (1) {
> 	/* Now synchronize with reader */
> 	while(FLAG);
> 	FIRST++;
> 	x86_smb();
> 	SECOND++;
> 	x86_smb();
> 	FLAG = 1;
> 	}
> }
> 
> void* reader_thread(void* arg) {
>      while (1) {
> 	/* Now synchronize with writer */
> 	while(!FLAG);
> 	x86_rmb();
> 	unsigned first = FIRST;
> 	x86_rmb();
> 	unsigned second = SECOND;
> 	assert(first - second == 1 || first - second == 0);
> 	FLAG = 0;
> 
> 	if (!(first %1000000))
> 		printf("%d\n", first);
>     }
> }
> 
> int main() {
>      pthread_t writer, reader;
> 
>      pthread_create(&writer, NULL, writer_thread, NULL);
>      pthread_create(&reader, NULL, reader_thread, NULL);
> 
>      pthread_join(writer, NULL);
>      pthread_join(reader, NULL);
> 
>      return 0;
> }
> 

I tried it on host and VM of NVidia's grace-hopper. Without the barriers, I
can hit assert. With the barriers, it's working fine without hitting the
assert.

I also had some code to mimic virtio vring last weekend, and it's just
working well. Back to our original issue, __smb_wmb() is issued by guest
while __smb_rmb() is executed on host. The VM and host are running at
different exception level: EL2 vs EL1. I'm not sure it's the cause. I
need to modify my code so that __smb_wmb() and __smb_rmb() can be executed
from guest and host.

[gshan@gshan code]$ cat test.h
#ifndef __TEST_H
#define __TEST_H

struct vring_desc {
	uint64_t	addr;
	uint32_t	len;
	uint16_t	flags;
	uint16_t	next;
} __attribute__((aligned(4)));

struct vring_avail {
	uint16_t	flags;
	uint16_t	idx;
	uint16_t	ring[];
} __attribute__((aligned(4)));

struct vring_used_elem {
	uint32_t	id;
	uint32_t	len;
} __attribute__((aligned(4)));

struct vring_used {
	uint16_t 		flags;
         uint16_t 		idx;
         struct vring_used_elem	ring[];
} __attribute__((aligned(4)));

struct vring {
	struct vring_desc	*desc;
	struct vring_avail	*avail;
	struct vring_used	*used;
	uint8_t			pad0[64];

	/* Writer */
	uint32_t		num;
	uint32_t		w_num_free;
	uint32_t		w_free_head;
	uint16_t		w_avail_idx;
	uint16_t		w_last_used_idx;
	uint16_t		*w_extra_data;
	uint16_t		*w_extra_next;
	uint8_t			pad1[64];

	/* Reader */
	uint16_t		r_avail_idx;
	uint16_t		r_last_avail_idx;
	uint16_t		r_last_used_idx;
	uint8_t			pad2[64];
};

static inline unsigned int vring_size(unsigned int num, unsigned long align)
{
	return ((sizeof(struct vring_desc) * num +
		 sizeof(uint16_t) * (3 + num) + (align - 1)) & ~(align - 1)) +
	       sizeof(uint16_t) * 3 + sizeof(struct vring_used_elem) * num;

}

static inline void __smp_rmb(void)
{
#ifdef WEAK_BARRIER
	__asm__ volatile("dmb ishld" : : : "memory");
#else
	__asm__ volatile("dsb sy"    : : : "memory");
#endif
}

static inline void __smp_wmb(void)
{
#ifdef WEAK_BARRIER
	__asm__ volatile("dmb ishst" : : : "memory");
#else
	__asm__ volatile("dsb sy"    : : : "memory");
#endif
}

#endif /* __TEST_H */

[gshan@gshan code]$ cat test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#include <assert.h>
#include <sched.h>
#include <pthread.h>

#include "test.h"

static struct vring *vring;

static int bind_cpu(int cpuid)
{
	cpu_set_t cpuset;

	CPU_ZERO(&cpuset);
	CPU_SET(cpuid, &cpuset);

	return pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
}

static void write_free_used_desc(void)
{
	uint16_t last_used;
	uint32_t idx;

	if ((uint16_t)(vring->used->idx - vring->w_last_used_idx) < 64)
		return;

	while (true) {
		if (vring->w_last_used_idx == vring->used->idx)
			return;

		__smp_rmb();

		/* Retrieve the head */
		last_used = vring->w_last_used_idx & (vring->num - 1);
		idx = vring->used->ring[last_used].id;
		assert(idx < vring->num);
		assert(vring->w_extra_data[idx]);

		/* Reclaim the descriptor */
		vring->w_extra_data[idx] = 0;
		vring->w_extra_next[idx] = vring->w_free_head;
		vring->w_free_head = idx;

		/* Update statistics */
		vring->w_num_free++;
		vring->w_last_used_idx++;
	}
}

static void write_push_desc(void)
{
	uint32_t head = vring->w_free_head;
	uint32_t avail_idx;

	if (vring->w_num_free < 1)
		return;

	/*
	 * The data in the descriptor doesn't matter. The idea here
	 * is to dirty the cache line.
	 */
	vring->desc[head].flags = 1;
	vring->desc[head].addr  = 0xffffffffffffffff;
	vring->desc[head].len   = 0xffffffff;
	vring->desc[head].next  = vring->w_extra_next[head];
	vring->desc[head].flags = 0;

	vring->w_num_free--;
	vring->w_free_head = vring->w_extra_next[head];
	vring->w_extra_data[head] = 1;

	avail_idx = vring->w_avail_idx & (vring->num - 1);
	vring->avail->ring[avail_idx] = head;

	__smp_wmb();

	vring->w_avail_idx++;
	vring->avail->idx = vring->w_avail_idx;
}

static void *write_worker(void *arg)
{
	assert(!bind_cpu(10));

	while (true) {
		write_free_used_desc();
		write_push_desc();
	}

	return NULL;
}

static void read_pull_desc(void)
{
	uint16_t avail_idx, last_avail_idx;
	uint32_t head;

	last_avail_idx = vring->r_last_avail_idx;
	if (vring->r_avail_idx == vring->r_last_avail_idx) {
		vring->r_avail_idx = vring->avail->idx;
		if (vring->r_avail_idx == last_avail_idx)
			return;

		__smp_rmb();
	}

	head = vring->avail->ring[last_avail_idx & (vring->num - 1)];
	assert(head < vring->num);
	vring->r_last_avail_idx++;

	vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].id  = head;
	vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].len = 0;
	vring->r_last_used_idx++;

	__smp_wmb();

	vring->used->idx = vring->r_last_used_idx;
}

static void *read_worker(void *arg)
{
	assert(!bind_cpu(60));

	while (true) {
		read_pull_desc();
	}

	return NULL;
}

static void init_vring(unsigned int num, unsigned long align)
{
	unsigned int size, i;

	/* vring */
	vring = malloc(sizeof(*vring));
	assert(vring);
	memset(vring, 0, sizeof(*vring));

	/* Descriptors */
	size = vring_size(num, align);
	vring->desc = (struct vring_desc *)malloc(size);
	assert(vring->desc);
	memset(vring->desc, 0, size);
	vring->avail = (struct vring_avail *)((void *)vring->desc +
					      num * sizeof(struct vring_desc));
	vring->used = (struct vring_used *)(((unsigned long)&vring->avail->ring[num] +
					    sizeof(uint16_t) + (align - 1)) & ~(align - 1));

	/* Writer's extra data */
	vring->w_extra_data = malloc(sizeof(uint16_t) * num);
	assert(vring->w_extra_data);
	memset(vring->w_extra_data, 0, sizeof(uint16_t) * num);
	vring->w_extra_next = malloc(sizeof(uint16_t) * num);
	assert(vring->w_extra_next);
	memset(vring->w_extra_next, 0, sizeof(uint16_t) * num);
	for (i = 0; i < num - 1; i++)
		vring->w_extra_next[i] = i + 1;

	/* Statistics */
	vring->num = num;
	vring->w_num_free = num;
	vring->w_free_head = 0;
	vring->w_avail_idx = 0;
	vring->w_last_used_idx = 0;
	vring->r_avail_idx = 0;
	vring->r_last_avail_idx = 0;
	vring->r_last_used_idx = 0;
}

int main(int argc, char **argv)
{
	pthread_t w_tid, r_tid;
	int ret;

	assert(!bind_cpu(0));

	init_vring(256, 64);

	ret = pthread_create(&w_tid, NULL, write_worker, NULL);
	assert(!ret);
	ret = pthread_create(&r_tid, NULL, read_worker, NULL);
	assert(!ret);
	ret = pthread_join(w_tid, NULL);
	assert(!ret);
	ret = pthread_join(r_tid, NULL);
	assert(!ret);

	while (true) {
		sleep(1);
	}

	return 0;
}

Thanks,
Gavin