```linux-kernel.vger.kernel.org archive mirror
help / color / mirror / Atom feed```
```* [PATCH] lib/find_bit: Add find_prev_*_bit functions.
@ 2020-12-02  1:10 Yun Levi
2020-12-02  9:47 ` Andy Shevchenko
From: Yun Levi @ 2020-12-02  1:10 UTC (permalink / raw)
To: dushistov, arnd, akpm, gustavo, vilhelm.gray, richard.weiyang,
andriy.shevchenko, joseph.qi, skalluru, yury.norov, jpoimboe
Cc: linux-kernel, linux-arch

Inspired find_next_*bit function series, add find_prev_*_bit series.
I'm not sure whether it'll be used right now But, I add these functions
for future usage.

Signed-off-by: Levi Yun <ppbuk5246@gmail.com>
---
fs/ufs/util.h                     |  24 +++---
include/asm-generic/bitops/find.h |  69 ++++++++++++++++
include/asm-generic/bitops/le.h   |  33 ++++++++
include/linux/bitops.h            |  34 +++++---
lib/find_bit.c                    | 126 +++++++++++++++++++++++++++++-
5 files changed, 260 insertions(+), 26 deletions(-)

diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 4931bec1a01c..7c87c77d10ca 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -2,7 +2,7 @@
/*
*  linux/fs/ufs/util.h
*
* Daniel Pirkl <daniel.pirkl@email.cz>
* Charles University, Faculty of Mathematics and Physics
*/
@@ -263,7 +263,7 @@ extern int ufs_prepare_chunk(struct page *page,
loff_t pos, unsigned len);
/*
* These functions manipulate ufs buffers
*/
ufs_sb_private_info *, struct super_block *, u64 , u64);
ufs_sb_private_info *, struct super_block *, u64, u64);
extern void ubh_brelse (struct ufs_buffer_head *);
@@ -296,7 +296,7 @@ static inline void *get_usb_offset(struct
ufs_sb_private_info *uspi,
unsigned int offset)
{
unsigned int index;
-
+
index = offset >> uspi->s_fshift;
return uspi->s_ubh.bh[index]->b_data + offset;
@@ -411,9 +411,9 @@ static inline unsigned _ubh_find_next_zero_bit_(
offset = 0;
}
return (base << uspi->s_bpfshift) + pos - begin;
-}
+}

-static inline unsigned find_last_zero_bit (unsigned char * bitmap,
+static inline unsigned __ubh_find_last_zero_bit(unsigned char * bitmap,
unsigned size, unsigned offset)
{
unsigned bit, i;
@@ -453,7 +453,7 @@ static inline unsigned _ubh_find_last_zero_bit_(
size + (uspi->s_bpf - start), uspi->s_bpf)
- (uspi->s_bpf - start);
size -= count;
-               pos = find_last_zero_bit (ubh->bh[base]->b_data,
+               pos = __ubh_find_last_zero_bit(ubh->bh[base]->b_data,
start, start - count);
if (pos > start - count || !size)
break;
@@ -461,7 +461,7 @@ static inline unsigned _ubh_find_last_zero_bit_(
start = uspi->s_bpf;
}
return (base << uspi->s_bpfshift) + pos - begin;
-}
+}

#define ubh_isblockclear(ubh,begin,block)
(!_ubh_isblockset_(uspi,ubh,begin,block))

@@ -483,7 +483,7 @@ static inline int _ubh_isblockset_(struct
ufs_sb_private_info * uspi,
mask = 0x01 << (block & 0x07);
return (*ubh_get_addr (ubh, begin + (block >> 3)) &
}
-       return 0;
+       return 0;
}

#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block)
@@ -492,8 +492,8 @@ static inline void _ubh_clrblock_(struct
ufs_sb_private_info * uspi,
{
switch (uspi->s_fpb) {
case 8:
-               *ubh_get_addr (ubh, begin + block) = 0x00;
-               return;
+               *ubh_get_addr (ubh, begin + block) = 0x00;
+               return;
case 4:
*ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f <<
((block & 0x01) << 2));
return;
@@ -531,9 +531,9 @@ static inline void ufs_fragacct (struct
super_block * sb, unsigned blockmap,
{
struct ufs_sb_private_info * uspi;
unsigned fragsize, pos;
-
+
uspi = UFS_SB(sb)->s_uspi;
-
+
fragsize = 0;
for (pos = 0; pos < uspi->s_fpb; pos++) {
if (blockmap & (1 << pos)) {
diff --git a/include/asm-generic/bitops/find.h
b/include/asm-generic/bitops/find.h
index 9fdf21302fdf..ca18b2ec954c 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -16,6 +16,20 @@ extern unsigned long find_next_bit(const unsigned
size, unsigned long offset);
#endif

+#ifndef find_prev_bit
+/**
+ * find_prev_bit - find the prev set bit in a memory region
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the prev set bit
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_prev_bit(const unsigned long *addr, unsigned long
+               size, unsigned long offset);
+#endif
+
#ifndef find_next_and_bit
/**
* find_next_and_bit - find the next set bit in both memory regions
@@ -32,6 +46,22 @@ extern unsigned long find_next_and_bit(const
unsigned long offset);
#endif

+#ifndef find_prev_and_bit
+/**
+ * find_prev_and_bit - find the prev set bit in both memory regions
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the prev set bit
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_prev_and_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long size,
+               unsigned long offset);
+#endif
+
#ifndef find_next_zero_bit
/**
* find_next_zero_bit - find the next cleared bit in a memory region
@@ -46,6 +76,20 @@ extern unsigned long find_next_zero_bit(const
long size, unsigned long offset);
#endif

+#ifndef find_prev_zero_bit
+/**
+ * find_prev_zero_bit - find the prev cleared bit in a memory region
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number of the prev zero bit
+ * If no bits are zero, returns @size.
+ */
+extern unsigned long find_prev_zero_bit(const unsigned long *addr, unsigned
+               long size, unsigned long offset);
+#endif
+
#ifdef CONFIG_GENERIC_FIND_FIRST_BIT

/**
@@ -80,6 +124,31 @@ extern unsigned long find_first_zero_bit(const

#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */

+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+extern unsigned long find_last_bit(const unsigned long *addr,
+                                  unsigned long size);
+#endif
+
+#ifndef find_last_zero_bit
+/**
+ * find_last_zero_bit - find the last cleared bit in a memory region
+ * @size: The maximum number of bits to search
+ *
+ * Returns the bit number of the first cleared bit.
+ * If no bits are zero, returns @size.
+ */
+extern unsigned long find_last_zero_bit(const unsigned long *addr,
+                                        unsigned long size);
+#endif
+
/**
* find_next_clump8 - find next 8-bit clump with set bits in a memory region
* @clump: location to store copy of found clump
diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h
index 188d3eba3ace..d0bd15bc34d9 100644
--- a/include/asm-generic/bitops/le.h
+++ b/include/asm-generic/bitops/le.h
@@ -27,6 +27,24 @@ static inline unsigned long
}

+static inline unsigned long find_prev_zero_bit_le(const void *addr,
+               unsigned long size, unsigned long offset)
+{
+}
+
+static inline unsigned long find_prev_bit_le(const void *addr,
+               unsigned long size, unsigned long offset)
+{
+}
+
+static inline unsigned long find_last_zero_bit_le(const void *addr,
+               unsigned long size)
+{
+}
+
#elif defined(__BIG_ENDIAN)

#define BITOP_LE_SWIZZLE       ((BITS_PER_LONG-1) & ~0x7)
@@ -41,11 +59,26 @@ extern unsigned long find_next_bit_le(const void *addr,
unsigned long size, unsigned long offset);
#endif

+#ifndef find_prev_zero_bit_le
+extern unsigned long find_prev_zero_bit_le(const void *addr,
+               unsigned long size, unsigned long offset);
+#endif
+
+#ifndef find_prev_bit_le
+extern unsigned long find_prev_bit_le(const void *addr,
+               unsigned long size, unsigned long offset);
+#endif
+
#ifndef find_first_zero_bit_le
#endif

+#ifndef find_last_zero_bit_le
+       find_prev_zero_bit_le((addr), (size), (size - 1))
+#endif
+
#else
#endif
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 5b74bdf159d6..124c68929861 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -50,6 +50,28 @@ extern unsigned long __sw_hweight64(__u64 w);
(bit) < (size);                                    \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))

+       for ((bit) = find_last_bit((addr), (size));             \
+            (bit) < (size);                                    \
+            (bit) = find_prev_bit((addr), (size), (bit)))
+
+/* same as for_each_set_bit_reverse() but use bit as value to start with */
+       for ((bit) = find_prev_bit((addr), (size), (bit));      \
+            (bit) < (size);                                    \
+            (bit) = find_prev_bit((addr), (size), (bit - 1)))
+
+       for ((bit) = find_last_zero_bit((addr), (size));        \
+            (bit) < (size);                                    \
+            (bit) = find_prev_zero_bit((addr), (size), (bit)))
+
+/* same as for_each_clear_bit_reverse() but use bit as value to start with */
+       for ((bit) = find_prev_zero_bit((addr), (size), (bit)); \
+            (bit) < (size);                                    \
+            (bit) = find_next_zero_bit((addr), (size), (bit - 1)))
+
/**
* for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
* @start: bit offset to start search and to store the current iteration offset
@@ -283,17 +305,5 @@ static __always_inline void __assign_bit(long nr,
})
#endif

-#ifndef find_last_bit
-/**
- * find_last_bit - find the last set bit in a memory region
- * @size: The number of bits to search
- *
- * Returns the bit number of the last set bit, or size.
- */
-extern unsigned long find_last_bit(const unsigned long *addr,
-                                  unsigned long size);
-#endif
-
#endif /* __KERNEL__ */
#endif
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 4a8751010d59..cbe06abd3d21 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -69,6 +69,58 @@ static unsigned long _find_next_bit(const unsigned
}
#endif

+#if !defined(find_prev_bit) || !defined(find_prev_zero_bit) ||
\
+       !defined(find_prev_bit_le) || !defined(find_prev_zero_bit_le)
||        \
+       !defined(find_prev_and_bit)
+/*
+ * This is a common helper function for find_prev_bit, find_prev_zero_bit, and
+ * find_prev_and_bit. The differences are:
+ *  - The "invert" argument, which is XORed with each fetched word before
+ *    searching it for one bits.
+ *  - The optional "addr2", which is anded with "addr1" if present.
+ */
+static unsigned long _find_prev_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long nbits,
+               unsigned long start, unsigned long invert, unsigned long le)
+{
+
+       if (unlikely(start >= nbits))
+               return nbits;
+
+       tmp = addr1[start / BITS_PER_LONG];
+               tmp &= addr2[start / BITS_PER_LONG];
+       tmp ^= invert;
+
+       /* Handle 1st word. */
+       if (le)
+
+
+       start = round_down(start, BITS_PER_LONG);
+
+       while (!tmp) {
+               start -= BITS_PER_LONG;
+               if (start >= nbits)
+                       return nbits;
+
+               tmp = addr1[start / BITS_PER_LONG];
+                       tmp &= addr2[start / BITS_PER_LONG];
+               tmp ^= invert;
+       }
+
+       if (le)
+               tmp = swab(tmp);
+
+       return start + __fls(tmp);
+}
+#endif
+
+
#ifndef find_next_bit
/*
* Find the next set bit in a memory region.
@@ -81,6 +133,18 @@ unsigned long find_next_bit(const unsigned long
EXPORT_SYMBOL(find_next_bit);
#endif

+#ifndef find_prev_bit
+/*
+ * Find the prev set bit in a memory region.
+ */
+unsigned long find_prev_bit(const unsigned long *addr, unsigned long size,
+                           unsigned long offset)
+{
+       return _find_prev_bit(addr, NULL, size, offset, 0UL, 0);
+}
+EXPORT_SYMBOL(find_prev_bit);
+#endif
+
#ifndef find_next_zero_bit
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
@@ -90,7 +154,16 @@ unsigned long find_next_zero_bit(const unsigned
EXPORT_SYMBOL(find_next_zero_bit);
#endif

-#if !defined(find_next_and_bit)
+#ifndef find_prev_zero_bit
+unsigned long find_prev_zero_bit(const unsigned long *addr, unsigned long size,
+                                unsigned long offset)
+{
+       return _find_prev_bit(addr, NULL, size, offset, ~0UL, 0);
+}
+EXPORT_SYMBOL(find_prev_zero_bit);
+#endif
+
+#ifndef find_next_and_bit
unsigned long find_next_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size,
unsigned long offset)
@@ -100,6 +173,16 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
EXPORT_SYMBOL(find_next_and_bit);
#endif

+#ifndef find_prev_and_bit
+unsigned long find_prev_and_bit(const unsigned long *addr1,
+               const unsigned long *addr2, unsigned long size,
+               unsigned long offset)
+{
+}
+EXPORT_SYMBOL(find_prev_and_bit);
+#endif
+
#ifndef find_first_bit
/*
* Find the first set bit in a memory region.
@@ -141,7 +224,7 @@ unsigned long find_last_bit(const unsigned long
{
if (size) {
-               unsigned long idx = (size-1) / BITS_PER_LONG;
+               unsigned long idx = (size - 1) / BITS_PER_LONG;

do {
@@ -156,6 +239,27 @@ unsigned long find_last_bit(const unsigned long
EXPORT_SYMBOL(find_last_bit);
#endif

+#ifndef find_last_zero_bit
+unsigned long find_last_zero_bit(const unsigned long *addr, unsigned long size)
+{
+       if (size) {
+               unsigned long val = BITMAP_LAST_WORD_MASK(size);
+               unsigned long idx = (size - 1) / BITS_PER_LONG;
+
+               do {
+                       if (val)
+                               return idx * BITS_PER_LONG + __fls(val);
+
+                       val = ~0ul;
+               } while (idx--);
+       }
+
+       return size;
+}
+EXPORT_SYMBOL(find_last_zero_bit);
+#endif
+
#ifdef __BIG_ENDIAN

#ifndef find_next_zero_bit_le
@@ -167,6 +271,15 @@ unsigned long find_next_zero_bit_le(const void
EXPORT_SYMBOL(find_next_zero_bit_le);
#endif

+#ifndef find_prev_zero_bit_le
+unsigned long find_prev_zero_bit_le(const void *addr, unsigned
+               long size, unsigned long offset)
+{
+       return _find_prev_bit(addr, NULL, size, offset, ~0UL, 1);
+}
+EXPORT_SYMBOL(find_prev_zero_bit_le);
+#endif
+
#ifndef find_next_bit_le
unsigned long find_next_bit_le(const void *addr, unsigned
long size, unsigned long offset)
@@ -176,6 +289,15 @@ unsigned long find_next_bit_le(const void *addr, unsigned
EXPORT_SYMBOL(find_next_bit_le);
#endif

+#ifdef find_prev_bit_le
+unsigned long find_prev_bit_le(const void *addr, unsigned
+               long size, unsigned long offset)
+{
+       return _find_prev_bit(addr, NULL, size, offset, 0UL, 1);
+}
+EXPORT_SYMBOL(find_prev_bit_le);
+#endif
+
#endif /* __BIG_ENDIAN */

unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
--
2.29.2

```* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02  1:10 [PATCH] lib/find_bit: Add find_prev_*_bit functions Yun Levi
@ 2020-12-02  9:47 ` Andy Shevchenko
2020-12-02 10:04   ` Rasmus Villemoes
From: Andy Shevchenko @ 2020-12-02  9:47 UTC (permalink / raw)
To: Yun Levi
Cc: dushistov, arnd, akpm, gustavo, vilhelm.gray, richard.weiyang,
joseph.qi, skalluru, yury.norov, jpoimboe, linux-kernel,
linux-arch

On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
> Inspired find_next_*bit function series, add find_prev_*_bit series.
> I'm not sure whether it'll be used right now But, I add these functions
> for future usage.

This patch has few issues:
- it has more things than described (should be several patches instead)
- new functionality can be split logically to couple or more pieces as well
- it proposes functionality w/o user (dead code)

--
With Best Regards,
Andy Shevchenko

```* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02  9:47 ` Andy Shevchenko
@ 2020-12-02 10:04   ` Rasmus Villemoes
2020-12-02 11:50     ` Yun Levi
From: Rasmus Villemoes @ 2020-12-02 10:04 UTC (permalink / raw)
To: Andy Shevchenko, Yun Levi
Cc: dushistov, arnd, akpm, gustavo, vilhelm.gray, richard.weiyang,
joseph.qi, skalluru, yury.norov, jpoimboe, linux-kernel,
linux-arch

On 02/12/2020 10.47, Andy Shevchenko wrote:
> On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
>> Inspired find_next_*bit function series, add find_prev_*_bit series.
>> I'm not sure whether it'll be used right now But, I add these functions
>> for future usage.
>
> This patch has few issues:
> - it has more things than described (should be several patches instead)
> - new functionality can be split logically to couple or more pieces as well
> - it proposes functionality w/o user (dead code)

Yeah, the last point means it can't be applied - please submit it again
if and when you have an actual use case. And I'll add

- it lacks extension of the bitmap test module to cover the new
functions (that also wants to be a separate patch).

Rasmus

```* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02 10:04   ` Rasmus Villemoes
@ 2020-12-02 11:50     ` Yun Levi
2020-12-02 12:06       ` Andy Shevchenko
0 siblings, 2 replies; 400+ messages in thread
From: Yun Levi @ 2020-12-02 11:50 UTC (permalink / raw)
To: Rasmus Villemoes
Cc: Andy Shevchenko, dushistov, arnd, akpm, gustavo, vilhelm.gray,
richard.weiyang, joseph.qi, skalluru, yury.norov, jpoimboe,
linux-kernel, linux-arch

Thanks for kind advice. But I'm so afraid to have questions below:

> - it proposes functionality w/o user (dead code)
Actually, I add these series functions to rewrite some of the
resource clean-up routine.
A typical case is ethtool_set_per_queue_coalesce 's rollback label.
Could this usage be an actual use case?

>- it lacks extension of the bitmap test module to cover the new
> functions (that also wants to be a separate patch).
I see, then Could I add some of testcase on lib/test_bitops.c for testing?

On Wed, Dec 2, 2020 at 7:04 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> On 02/12/2020 10.47, Andy Shevchenko wrote:
> > On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
> >> Inspired find_next_*bit function series, add find_prev_*_bit series.
> >> I'm not sure whether it'll be used right now But, I add these functions
> >> for future usage.
> >
> > This patch has few issues:
> > - it has more things than described (should be several patches instead)
> > - new functionality can be split logically to couple or more pieces as well
> > - it proposes functionality w/o user (dead code)
>
> Yeah, the last point means it can't be applied - please submit it again
> if and when you have an actual use case. And I'll add
>
> - it lacks extension of the bitmap test module to cover the new
> functions (that also wants to be a separate patch).
>
> Rasmus

```* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02 11:50     ` Yun Levi
@ 2020-12-02 12:06       ` Andy Shevchenko
1 sibling, 0 replies; 400+ messages in thread
From: Andy Shevchenko @ 2020-12-02 12:06 UTC (permalink / raw)
To: Yun Levi
Cc: Rasmus Villemoes, dushistov, arnd, akpm, gustavo, vilhelm.gray,
richard.weiyang, joseph.qi, skalluru, yury.norov, jpoimboe,
linux-kernel, linux-arch

On Wed, Dec 02, 2020 at 08:50:24PM +0900, Yun Levi wrote:
> Thanks for kind advice. But I'm so afraid to have questions below:
>
>  > - it proposes functionality w/o user (dead code)
>      Actually, I add these series functions to rewrite some of the
> resource clean-up routine.
>      A typical case is ethtool_set_per_queue_coalesce 's rollback label.
>      Could this usage be an actual use case?

Then create it as a patch in the series and in cover letter (0 message when you
supply --cover-letter to your `git format-patch ...` command line) mention
this.

>  >- it lacks extension of the bitmap test module to cover the new
>  > functions (that also wants to be a separate patch).
>      I see, then Could I add some of testcase on lib/test_bitops.c for testing?

Sounds good to me. Most important is to have test cases, then we will see which
test suite is the best fit, but as I said sounds like a good shot.

And please do not top post in replies!

> On Wed, Dec 2, 2020 at 7:04 PM Rasmus Villemoes
> <linux@rasmusvillemoes.dk> wrote:
> >
> > On 02/12/2020 10.47, Andy Shevchenko wrote:
> > > On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
> > >> Inspired find_next_*bit function series, add find_prev_*_bit series.
> > >> I'm not sure whether it'll be used right now But, I add these functions
> > >> for future usage.
> > >
> > > This patch has few issues:
> > > - it has more things than described (should be several patches instead)
> > > - new functionality can be split logically to couple or more pieces as well
> > > - it proposes functionality w/o user (dead code)
> >
> > Yeah, the last point means it can't be applied - please submit it again
> > if and when you have an actual use case. And I'll add
> >
> > - it lacks extension of the bitmap test module to cover the new
> > functions (that also wants to be a separate patch).

--
With Best Regards,
Andy Shevchenko

```* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
@ 2020-12-02 17:37         ` Andy Shevchenko
2020-12-02 18:27           ` Yun Levi
2020-12-02 18:22         ` Yun Levi
From: Andy Shevchenko @ 2020-12-02 17:37 UTC (permalink / raw)
To: Yury Norov
Cc: Yun Levi, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch

On Wed, Dec 02, 2020 at 09:26:05AM -0800, Yury Norov wrote:
> On Wed, Dec 2, 2020 at 3:50 AM Yun Levi <ppbuk5246@gmail.com> wrote:

...

>  I think this patch has some good catches. We definitely need to implement
> find_last_zero_bit(), as it is used by fs/ufs, and their local
> implementation is not optimal.

Side note: speaking of performance, any plans to fix for_each_*_bit*() for
cases when the nbits is known to be <= BITS_PER_LONG?

Now it makes an awful code generation (something like few hundred bytes of
code).

--
With Best Regards,
Andy Shevchenko

```* (no subject)
2020-12-02 17:37         ` Andy Shevchenko
@ 2020-12-02 18:22         ` Yun Levi
2020-12-02 21:26           ` Yury Norov
From: Yun Levi @ 2020-12-02 18:22 UTC (permalink / raw)
To: Yury Norov
Cc: Rasmus Villemoes, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

On Thu, Dec 3, 2020 at 2:26 AM Yury Norov <yury.norov@gmail.com> wrote:

> Also look at lib/find_bit_benchmark.c
Thanks. I'll see.

> We need find_next_*_bit() because find_first_*_bit() can start searching only at word-aligned
> bits. In the case of find_last_*_bit(), we can start at any bit. So, if my understanding is correct,
> for the purpose of reverse traversing we can go with already existing find_last_bit(),

Thank you. I haven't thought that way.
But I think if we implement reverse traversing using find_last_bit(),
we have a problem.
Suppose the last bit 0, 1, 2, is set.
If we start
find_last_bit(bitmap, 3) ==> return 2;
find_last_bit(bitmap, 2) ==> return 1;
find_last_bit(bitmap, 1) ==> return 0;
find_last_bit(bitmap, 0) ===> return 0? // here we couldn't
distinguish size 0 input or 0 is set

and the for_each traverse routine prevent above case by returning size
(nbits) using find_next_bit.
So, for compatibility and the same expected return value like next traversing,
I think we need to find_prev_*_bit routine. if my understanding is correct.

>  I think this patch has some good catches. We definitely need to implement
> find_last_zero_bit(), as it is used by fs/ufs, and their local implementation is not optimal.
>
> We also should consider adding reverse traversing macros based on find_last_*_bit(),
> if there are proposed users.

Not only this, I think 'steal_from_bitmap_to_front' can be improved
using ffind_prev_zero_bit
like

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af0013d3df63..9debb9707390 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2372,7 +2372,6 @@ static bool steal_from_bitmap_to_front(struct
btrfs_free_space_ctl *ctl,
u64 bitmap_offset;
unsigned long i;
unsigned long j;
- unsigned long prev_j;
u64 bytes;

bitmap_offset = offset_to_bitmap(ctl, info->offset);
@@ -2388,20 +2387,15 @@ static bool steal_from_bitmap_to_front(struct
btrfs_free_space_ctl *ctl,
return false;

i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
- j = 0;
- prev_j = (unsigned long)-1;
- for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
- if (j > i)
- break;
- prev_j = j;
- }
- if (prev_j == i)
+ j = find_prev_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
+
+ if (j == i)
return false;

- if (prev_j == (unsigned long)-1)
+ if (j == BITS_PER_BITMAP)
bytes = (i + 1) * ctl->unit;
else
- bytes = (i - prev_j) * ctl->unit;
+ bytes = (i - j) * ctl->unit;

info->offset -= bytes;
info->bytes += bytes;

Thanks.

HTH
Levi.

```* (no subject)
2020-12-02 17:37         ` Andy Shevchenko
@ 2020-12-02 18:27           ` Yun Levi
2020-12-02 18:51             ` your mail Andy Shevchenko
From: Yun Levi @ 2020-12-02 18:27 UTC (permalink / raw)
To: Andy Shevchenko
Cc: Yury Norov, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch

On Thu, Dec 3, 2020 at 2:36 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Wed, Dec 02, 2020 at 09:26:05AM -0800, Yury Norov wrote:
> > On Wed, Dec 2, 2020 at 3:50 AM Yun Levi <ppbuk5246@gmail.com> wrote:
>
> ...
>
> >  I think this patch has some good catches. We definitely need to implement
> > find_last_zero_bit(), as it is used by fs/ufs, and their local
> > implementation is not optimal.
>
> Side note: speaking of performance, any plans to fix for_each_*_bit*() for
> cases when the nbits is known to be <= BITS_PER_LONG?
>
> Now it makes an awful code generation (something like few hundred bytes of
> code).
>
> --
> With Best Regards,
> Andy Shevchenko
>
>

> Side note: speaking of performance, any plans to fix for_each_*_bit*() for
> cases when the nbits is known to be <= BITS_PER_LONG?

Frankly Speaking, I don't have an idea in now.....
Could you share your idea or wisdom?

Thanks.
Levi.

```* Re: your mail
2020-12-02 18:27           ` Yun Levi
@ 2020-12-02 18:51             ` Andy Shevchenko
2020-12-02 18:56               ` Andy Shevchenko
From: Andy Shevchenko @ 2020-12-02 18:51 UTC (permalink / raw)
To: Yun Levi
Cc: Yury Norov, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch

On Thu, Dec 03, 2020 at 03:27:33AM +0900, Yun Levi wrote:
> On Thu, Dec 3, 2020 at 2:36 AM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Wed, Dec 02, 2020 at 09:26:05AM -0800, Yury Norov wrote:

...

> > Side note: speaking of performance, any plans to fix for_each_*_bit*() for
> > cases when the nbits is known to be <= BITS_PER_LONG?
> >
> > Now it makes an awful code generation (something like few hundred bytes of
> > code).

> Frankly Speaking, I don't have an idea in now.....
> Could you share your idea or wisdom?

Something like (I may be mistaken by names, etc, I'm not a compiler expert,
and this is in pseudo language, I don't remember all API names by hart,
just to express the idea) as a rough first step

__builtin_constant(nbits, find_next_set_bit_long, find_next_set_bit)

find_next_set_bit_long()
{
unsigned long v = BIT_LAST_WORD(i);
return ffs_long(v);
}

Same for find_first_set_bit() -> map it to ffs_long().

And I believe it can be optimized more.

--
With Best Regards,
Andy Shevchenko

```* Re: your mail
2020-12-02 18:51             ` your mail Andy Shevchenko
@ 2020-12-02 18:56               ` Andy Shevchenko
2020-12-02 23:16                 ` Yun Levi
From: Andy Shevchenko @ 2020-12-02 18:56 UTC (permalink / raw)
To: Yun Levi
Cc: Yury Norov, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch

On Wed, Dec 02, 2020 at 08:51:27PM +0200, Andy Shevchenko wrote:
> On Thu, Dec 03, 2020 at 03:27:33AM +0900, Yun Levi wrote:
> > On Thu, Dec 3, 2020 at 2:36 AM Andy Shevchenko
> > <andriy.shevchenko@linux.intel.com> wrote:
> > > On Wed, Dec 02, 2020 at 09:26:05AM -0800, Yury Norov wrote:

...

> > > Side note: speaking of performance, any plans to fix for_each_*_bit*() for
> > > cases when the nbits is known to be <= BITS_PER_LONG?
> > >
> > > Now it makes an awful code generation (something like few hundred bytes of
> > > code).
>
> > Frankly Speaking, I don't have an idea in now.....
> > Could you share your idea or wisdom?
>
> Something like (I may be mistaken by names, etc, I'm not a compiler expert,
> and this is in pseudo language, I don't remember all API names by hart,
> just to express the idea) as a rough first step
>
> __builtin_constant(nbits, find_next_set_bit_long, find_next_set_bit)
>
> find_next_set_bit_long()
> {
> 	unsigned long v = BIT_LAST_WORD(i);
> 	return ffs_long(v);
> }
>
> Same for find_first_set_bit() -> map it to ffs_long().
>
> And I believe it can be optimized more.

Btw it will also require to reconsider test cases where such constant small
nbits values are passed (forcing compiler to avoid optimization somehow, one
way is to try random nbits for some test cases).

--
With Best Regards,
Andy Shevchenko

```* Re:
2020-12-02 18:22         ` Yun Levi
@ 2020-12-02 21:26           ` Yury Norov
2020-12-02 22:51             ` Yun Levi
From: Yury Norov @ 2020-12-02 21:26 UTC (permalink / raw)
To: Yun Levi
Cc: Rasmus Villemoes, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

On Wed, Dec 2, 2020 at 10:22 AM Yun Levi <ppbuk5246@gmail.com> wrote:
>
> On Thu, Dec 3, 2020 at 2:26 AM Yury Norov <yury.norov@gmail.com> wrote:
>
> > Also look at lib/find_bit_benchmark.c
> Thanks. I'll see.
>
> > We need find_next_*_bit() because find_first_*_bit() can start searching only at word-aligned
> > bits. In the case of find_last_*_bit(), we can start at any bit. So, if my understanding is correct,
> > for the purpose of reverse traversing we can go with already existing find_last_bit(),
>
> Thank you. I haven't thought that way.
> But I think if we implement reverse traversing using find_last_bit(),
> we have a problem.
> Suppose the last bit 0, 1, 2, is set.
> If we start
>     find_last_bit(bitmap, 3) ==> return 2;
>     find_last_bit(bitmap, 2) ==> return 1;
>     find_last_bit(bitmap, 1) ==> return 0;
>     find_last_bit(bitmap, 0) ===> return 0? // here we couldn't
> distinguish size 0 input or 0 is set

If you traverse backward and reach bit #0, you're done. No need to continue.

>
> and the for_each traverse routine prevent above case by returning size
> (nbits) using find_next_bit.
> So, for compatibility and the same expected return value like next traversing,
> I think we need to find_prev_*_bit routine. if my understanding is correct.
>
>
> >  I think this patch has some good catches. We definitely need to implement
> > find_last_zero_bit(), as it is used by fs/ufs, and their local implementation is not optimal.
> >
> > We also should consider adding reverse traversing macros based on find_last_*_bit(),
> > if there are proposed users.
>
> Not only this, I think 'steal_from_bitmap_to_front' can be improved
> using ffind_prev_zero_bit
> like
>
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index af0013d3df63..9debb9707390 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2372,7 +2372,6 @@ static bool steal_from_bitmap_to_front(struct
> btrfs_free_space_ctl *ctl,
>   u64 bitmap_offset;
>   unsigned long i;
>   unsigned long j;
> - unsigned long prev_j;
>   u64 bytes;
>
>   bitmap_offset = offset_to_bitmap(ctl, info->offset);
> @@ -2388,20 +2387,15 @@ static bool steal_from_bitmap_to_front(struct
> btrfs_free_space_ctl *ctl,
>   return false;
>
>   i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
> - j = 0;
> - prev_j = (unsigned long)-1;
> - for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
> - if (j > i)
> - break;
> - prev_j = j;
> - }
> - if (prev_j == i)
> + j = find_prev_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);

This one may be implemented with find_last_zero_bit() as well:

unsigned log j = find_last_zero_bit(bitmap, BITS_PER_BITMAP);
if (j <= i || j >= BITS_PER_BITMAP)
return false;

I believe the latter version is better because find_last_*_bit() is simpler in
implementation (and partially exists), has less parameters, and therefore
simpler for users, and doesn't introduce functionality duplication.

The only consideration I can imagine to advocate find_prev*() is the performance
advantage in the scenario when we know for sure that first N bits of
bitmap are all
set/clear, and we can bypass traversing that area. But again, in this
case we can pass the
bitmap address with the appropriate offset, and stay with find_last_*()

> +
> + if (j == i)
>   return false;
>
> - if (prev_j == (unsigned long)-1)
> + if (j == BITS_PER_BITMAP)
>   bytes = (i + 1) * ctl->unit;
>   else
> - bytes = (i - prev_j) * ctl->unit;
> + bytes = (i - j) * ctl->unit;
>
>   info->offset -= bytes;
>   info->bytes += bytes;
>
> Thanks.
>
> HTH
> Levi.

```* (no subject)
2020-12-02 21:26           ` Yury Norov
@ 2020-12-02 22:51             ` Yun Levi
2020-12-03  1:23               ` Yun Levi
From: Yun Levi @ 2020-12-02 22:51 UTC (permalink / raw)
To: Yury Norov
Cc: Rasmus Villemoes, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

On Thu, Dec 3, 2020 at 6:26 AM Yury Norov <yury.norov@gmail.com> wrote:
>
> On Wed, Dec 2, 2020 at 10:22 AM Yun Levi <ppbuk5246@gmail.com> wrote:
> >
> > On Thu, Dec 3, 2020 at 2:26 AM Yury Norov <yury.norov@gmail.com> wrote:
> >
> > > Also look at lib/find_bit_benchmark.c
> > Thanks. I'll see.
> >
> > > We need find_next_*_bit() because find_first_*_bit() can start searching only at word-aligned
> > > bits. In the case of find_last_*_bit(), we can start at any bit. So, if my understanding is correct,
> > > for the purpose of reverse traversing we can go with already existing find_last_bit(),
> >
> > Thank you. I haven't thought that way.
> > But I think if we implement reverse traversing using find_last_bit(),
> > we have a problem.
> > Suppose the last bit 0, 1, 2, is set.
> > If we start
> >     find_last_bit(bitmap, 3) ==> return 2;
> >     find_last_bit(bitmap, 2) ==> return 1;
> >     find_last_bit(bitmap, 1) ==> return 0;
> >     find_last_bit(bitmap, 0) ===> return 0? // here we couldn't
> > distinguish size 0 input or 0 is set
>
> If you traverse backward and reach bit #0, you're done. No need to continue.
I think the case when I consider the this macro like

(bit) < (size);

If we implement the above macro only with find_last_zero_bit,
I think there is no way without adding any additional variable to finish loop.
But I don't want to add additional variable to sustain same format
with for_each_clear_bit,
That's why i decide to implement find_prev_*_bit series.

I don't know it's correct thinking or biased. Am I wrong?

>
> >
> > and the for_each traverse routine prevent above case by returning size
> > (nbits) using find_next_bit.
> > So, for compatibility and the same expected return value like next traversing,
> > I think we need to find_prev_*_bit routine. if my understanding is correct.
> >
> >
> > >  I think this patch has some good catches. We definitely need to implement
> > > find_last_zero_bit(), as it is used by fs/ufs, and their local implementation is not optimal.
> > >
> > > We also should consider adding reverse traversing macros based on find_last_*_bit(),
> > > if there are proposed users.
> >
> > Not only this, I think 'steal_from_bitmap_to_front' can be improved
> > using ffind_prev_zero_bit
> > like
> >
> > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > index af0013d3df63..9debb9707390 100644
> > --- a/fs/btrfs/free-space-cache.c
> > +++ b/fs/btrfs/free-space-cache.c
> > @@ -2372,7 +2372,6 @@ static bool steal_from_bitmap_to_front(struct
> > btrfs_free_space_ctl *ctl,
> >   u64 bitmap_offset;
> >   unsigned long i;
> >   unsigned long j;
> > - unsigned long prev_j;
> >   u64 bytes;
> >
> >   bitmap_offset = offset_to_bitmap(ctl, info->offset);
> > @@ -2388,20 +2387,15 @@ static bool steal_from_bitmap_to_front(struct
> > btrfs_free_space_ctl *ctl,
> >   return false;
> >
> >   i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
> > - j = 0;
> > - prev_j = (unsigned long)-1;
> > - for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
> > - if (j > i)
> > - break;
> > - prev_j = j;
> > - }
> > - if (prev_j == i)
> > + j = find_prev_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
>
> This one may be implemented with find_last_zero_bit() as well:
>
> unsigned log j = find_last_zero_bit(bitmap, BITS_PER_BITMAP);
> if (j <= i || j >= BITS_PER_BITMAP)
>         return false;
>
Actually, in that code, we don't need to check the bit after i.
Originally, if my understanding is correct, former code tries to find
the last 0 bit before i.
and if all bits are fully set before i, it use next one as i + 1

that's why i think the if condition should be
if (j >= i)

But above condition couldn't the discern the case when all bits are
fully set before i.
Also, I think we don't need to check the bit after i and In this case,
find_prev_zero_bit which
specifies the start point is clear to show the meaning of the code.

> I believe the latter version is better because find_last_*_bit() is simpler in
> implementation (and partially exists), has less parameters, and therefore
> simpler for users, and doesn't introduce functionality duplication.
>
> The only consideration I can imagine to advocate find_prev*() is the performance
> advantage in the scenario when we know for sure that first N bits of
> bitmap are all
> set/clear, and we can bypass traversing that area. But again, in this
> case we can pass the
> bitmap address with the appropriate offset, and stay with find_last_*()
>
> > +
> > + if (j == i)
> >   return false;
> >
> > - if (prev_j == (unsigned long)-1)
> > + if (j == BITS_PER_BITMAP)
> >   bytes = (i + 1) * ctl->unit;
> >   else
> > - bytes = (i - prev_j) * ctl->unit;
> > + bytes = (i - j) * ctl->unit;
> >
> >   info->offset -= bytes;
> >   info->bytes += bytes;
> >
> > Thanks.
> >
> > HTH
> > Levi.

Thanks but

```* (no subject)
2020-12-02 18:56               ` Andy Shevchenko
@ 2020-12-02 23:16                 ` Yun Levi
0 siblings, 0 replies; 400+ messages in thread
From: Yun Levi @ 2020-12-02 23:16 UTC (permalink / raw)
To: Andy Shevchenko
Cc: Yury Norov, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch

On Thu, Dec 3, 2020 at 3:55 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Wed, Dec 02, 2020 at 08:51:27PM +0200, Andy Shevchenko wrote:
> > On Thu, Dec 03, 2020 at 03:27:33AM +0900, Yun Levi wrote:
> > > On Thu, Dec 3, 2020 at 2:36 AM Andy Shevchenko
> > > <andriy.shevchenko@linux.intel.com> wrote:
> > > > On Wed, Dec 02, 2020 at 09:26:05AM -0800, Yury Norov wrote:
>
> ...
>
> > > > Side note: speaking of performance, any plans to fix for_each_*_bit*() for
> > > > cases when the nbits is known to be <= BITS_PER_LONG?
> > > >
> > > > Now it makes an awful code generation (something like few hundred bytes of
> > > > code).
> >
> > > Frankly Speaking, I don't have an idea in now.....
> > > Could you share your idea or wisdom?
> >
> > Something like (I may be mistaken by names, etc, I'm not a compiler expert,
> > and this is in pseudo language, I don't remember all API names by hart,
> > just to express the idea) as a rough first step
> >
> > __builtin_constant(nbits, find_next_set_bit_long, find_next_set_bit)
> >
> > find_next_set_bit_long()
> > {
> >       unsigned long v = BIT_LAST_WORD(i);
> >       return ffs_long(v);
> > }
> >

I think this idea is hard to apply to find_next_set_bit.
because __builtin_constant should be not only to size but also to offset.
though we find size && offset is const under BITS_PER_LONG,
I'm not sure it could be implemented as const expression..

> > Same for find_first_set_bit() -> map it to ffs_long().
> >
> > And I believe it can be optimized more.

In case of the find_first_set_bit, I think it would be possible,
But I think it much better to separate as another patch set.
So I want to focus on adding find_prev_*_bit, find_last_zero_bit to
this patchset and mail-thread. Frankly speaking I need time to see
that suggestion and think so, in next patch v2,
it wouldn't be included.

>
> Btw it will also require to reconsider test cases where such constant small
> nbits values are passed (forcing compiler to avoid optimization somehow, one
> way is to try random nbits for some test cases).
>
> --
> With Best Regards,
> Andy Shevchenko
>
>

if my understanding and attitude are wrong, I really apologize for my
rudeness and stubbornness but please let me know what thing is wrong.

Sincerely
Levi.

```* (no subject)
2020-12-02 22:51             ` Yun Levi
@ 2020-12-03  1:23               ` Yun Levi
2020-12-03  8:33                 ` Rasmus Villemoes
From: Yun Levi @ 2020-12-03  1:23 UTC (permalink / raw)
To: Yury Norov
Cc: Rasmus Villemoes, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

On Thu, Dec 3, 2020 at 7:51 AM Yun Levi <ppbuk5246@gmail.com> wrote:
>
> On Thu, Dec 3, 2020 at 6:26 AM Yury Norov <yury.norov@gmail.com> wrote:
> >
> > On Wed, Dec 2, 2020 at 10:22 AM Yun Levi <ppbuk5246@gmail.com> wrote:
> > >
> > > On Thu, Dec 3, 2020 at 2:26 AM Yury Norov <yury.norov@gmail.com> wrote:
> > >
> > > > Also look at lib/find_bit_benchmark.c
> > > Thanks. I'll see.
> > >
> > > > We need find_next_*_bit() because find_first_*_bit() can start searching only at word-aligned
> > > > bits. In the case of find_last_*_bit(), we can start at any bit. So, if my understanding is correct,
> > > > for the purpose of reverse traversing we can go with already existing find_last_bit(),
> > >
> > > Thank you. I haven't thought that way.
> > > But I think if we implement reverse traversing using find_last_bit(),
> > > we have a problem.
> > > Suppose the last bit 0, 1, 2, is set.
> > > If we start
> > >     find_last_bit(bitmap, 3) ==> return 2;
> > >     find_last_bit(bitmap, 2) ==> return 1;
> > >     find_last_bit(bitmap, 1) ==> return 0;
> > >     find_last_bit(bitmap, 0) ===> return 0? // here we couldn't
> > > distinguish size 0 input or 0 is set
> >
> > If you traverse backward and reach bit #0, you're done. No need to continue.
> I think the case when I consider the this macro like
>
>     for ((bit) = find_last_zero_bit((addr), (size))
>           (bit) < (size);
>           (bit) = find_prev_zero_bit((addr), (size), (bit)))
>
> If we implement the above macro only with find_last_zero_bit,
> I think there is no way without adding any additional variable to finish loop.
> But I don't want to add additional variable to sustain same format
> with for_each_clear_bit,
> That's why i decide to implement find_prev_*_bit series.
>
> I don't know it's correct thinking or biased. Am I wrong?
>
> >
> > >
> > > and the for_each traverse routine prevent above case by returning size
> > > (nbits) using find_next_bit.
> > > So, for compatibility and the same expected return value like next traversing,
> > > I think we need to find_prev_*_bit routine. if my understanding is correct.
> > >
> > >
> > > >  I think this patch has some good catches. We definitely need to implement
> > > > find_last_zero_bit(), as it is used by fs/ufs, and their local implementation is not optimal.
> > > >
> > > > We also should consider adding reverse traversing macros based on find_last_*_bit(),
> > > > if there are proposed users.
> > >
> > > Not only this, I think 'steal_from_bitmap_to_front' can be improved
> > > using ffind_prev_zero_bit
> > > like
> > >
> > > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > > index af0013d3df63..9debb9707390 100644
> > > --- a/fs/btrfs/free-space-cache.c
> > > +++ b/fs/btrfs/free-space-cache.c
> > > @@ -2372,7 +2372,6 @@ static bool steal_from_bitmap_to_front(struct
> > > btrfs_free_space_ctl *ctl,
> > >   u64 bitmap_offset;
> > >   unsigned long i;
> > >   unsigned long j;
> > > - unsigned long prev_j;
> > >   u64 bytes;
> > >
> > >   bitmap_offset = offset_to_bitmap(ctl, info->offset);
> > > @@ -2388,20 +2387,15 @@ static bool steal_from_bitmap_to_front(struct
> > > btrfs_free_space_ctl *ctl,
> > >   return false;
> > >
> > >   i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
> > > - j = 0;
> > > - prev_j = (unsigned long)-1;
> > > - for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
> > > - if (j > i)
> > > - break;
> > > - prev_j = j;
> > > - }
> > > - if (prev_j == i)
> > > + j = find_prev_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
> >
> > This one may be implemented with find_last_zero_bit() as well:
> >
> > unsigned log j = find_last_zero_bit(bitmap, BITS_PER_BITMAP);
> > if (j <= i || j >= BITS_PER_BITMAP)
> >         return false;
> >
> Actually, in that code, we don't need to check the bit after i.
> Originally, if my understanding is correct, former code tries to find
> the last 0 bit before i.
> and if all bits are fully set before i, it use next one as i + 1
>
> that's why i think the if condition should be
>    if (j >= i)
>
> But above condition couldn't the discern the case when all bits are
> fully set before i.
> Also, I think we don't need to check the bit after i and In this case,
> find_prev_zero_bit which
> specifies the start point is clear to show the meaning of the code.
>
>
> > I believe the latter version is better because find_last_*_bit() is simpler in
> > implementation (and partially exists), has less parameters, and therefore
> > simpler for users, and doesn't introduce functionality duplication.

I think it's not duplication.
Actually, former you teach me find_first_*_bit should be start word-aligned bit,
But as find_first_*_bit declares it as "size of bitmap" not a start offset.
Though the bitmap size it's word-aligned, it doesn't matter to fine
first bit in the specified size of bitmap (it no, it will return just
size of bitmap)

Likewise, find_last_*_bit is also similar in context.
Fundamentally, it's not a start offset of bitmap but I think it just
size of bitmap.

That's the reason why we need to find_next_*_bit to start at the
specified offset.
In this matter, I think it's better to have find_prev_*_bit.

So, I think we can use both of these functions to be used to achieve a goal.
But, each function has different concept actually that's why I don't
think it's not duplication.

if my understanding is wrong.. Forgive me. and let me know..

Thanks.

> >
> > The only consideration I can imagine to advocate find_prev*() is the performance
> > advantage in the scenario when we know for sure that first N bits of
> > bitmap are all
> > set/clear, and we can bypass traversing that area. But again, in this
> > case we can pass the
> > bitmap address with the appropriate offset, and stay with find_last_*()
> >
> > > +
> > > + if (j == i)
> > >   return false;
> > >
> > > - if (prev_j == (unsigned long)-1)
> > > + if (j == BITS_PER_BITMAP)
> > >   bytes = (i + 1) * ctl->unit;
> > >   else
> > > - bytes = (i - prev_j) * ctl->unit;
> > > + bytes = (i - j) * ctl->unit;
> > >
> > >   info->offset -= bytes;
> > >   info->bytes += bytes;
> > >
> > > Thanks.
> > >
> > > HTH
> > > Levi.
>
> Thanks but

```* Re:
2020-12-03  1:23               ` Yun Levi
@ 2020-12-03  8:33                 ` Rasmus Villemoes
2020-12-03  9:47                   ` Re: Yun Levi
From: Rasmus Villemoes @ 2020-12-03  8:33 UTC (permalink / raw)
To: Yun Levi, Yury Norov
Cc: dushistov, Arnd Bergmann, Andrew Morton, Gustavo A. R. Silva,
William Breathitt Gray, richard.weiyang, joseph.qi, skalluru,
Josh Poimboeuf, Linux Kernel Mailing List, linux-arch,
Andy Shevchenko

On 03/12/2020 02.23, Yun Levi wrote:
> On Thu, Dec 3, 2020 at 7:51 AM Yun Levi <ppbuk5246@gmail.com> wrote:
>>
>> On Thu, Dec 3, 2020 at 6:26 AM Yury Norov <yury.norov@gmail.com> wrote:
>>>
>>> On Wed, Dec 2, 2020 at 10:22 AM Yun Levi <ppbuk5246@gmail.com> wrote:
>>>>
>>>> On Thu, Dec 3, 2020 at 2:26 AM Yury Norov <yury.norov@gmail.com> wrote:
>>>>
>>>>> Also look at lib/find_bit_benchmark.c
>>>> Thanks. I'll see.
>>>>
>>>>> We need find_next_*_bit() because find_first_*_bit() can start searching only at word-aligned
>>>>> bits. In the case of find_last_*_bit(), we can start at any bit. So, if my understanding is correct,
>>>>> for the purpose of reverse traversing we can go with already existing find_last_bit(),
>>>>
>>>> Thank you. I haven't thought that way.
>>>> But I think if we implement reverse traversing using find_last_bit(),
>>>> we have a problem.
>>>> Suppose the last bit 0, 1, 2, is set.
>>>> If we start
>>>>     find_last_bit(bitmap, 3) ==> return 2;
>>>>     find_last_bit(bitmap, 2) ==> return 1;
>>>>     find_last_bit(bitmap, 1) ==> return 0;
>>>>     find_last_bit(bitmap, 0) ===> return 0? // here we couldn't

Either just make the return type of all find_prev/find_last be signed
int and use -1 as the sentinel to indicate "no such position exists", so
the loop condition would be foo >= 0. Or, change the condition from
"stop if we get the size returned" to "only continue if we get something
strictly less than the size we passed in (i.e., something which can
possibly be a valid bit index). In the latter case, both (unsigned)-1
aka UINT_MAX and the actual size value passed work equally well as a
sentinel.

If one uses UINT_MAX, a for_each_bit_reverse() macro would just be
something like

for (i = find_last_bit(bitmap, size); i < size; i =
find_last_bit(bitmap, i))

if one wants to use the size argument as the sentinel, the caller would
have to supply a scratch variable to keep track of the last i value:

for (j = size, i = find_last_bit(bitmap, j); i < j; j = i, i =
find_last_bit(bitmap, j))

which is probably a little less ergonomic.

Rasmus

```* Re:
2020-12-03  8:33                 ` Rasmus Villemoes
@ 2020-12-03  9:47                   ` Yun Levi
2020-12-03 18:46                     ` Re: Yury Norov
From: Yun Levi @ 2020-12-03  9:47 UTC (permalink / raw)
To: Rasmus Villemoes
Cc: Yury Norov, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

> If one uses UINT_MAX, a for_each_bit_reverse() macro would just be
> something like
>
> for (i = find_last_bit(bitmap, size); i < size; i =
> find_last_bit(bitmap, i))
>
> if one wants to use the size argument as the sentinel, the caller would
> have to supply a scratch variable to keep track of the last i value:
>
> for (j = size, i = find_last_bit(bitmap, j); i < j; j = i, i =
> find_last_bit(bitmap, j))
>
> which is probably a little less ergonomic.

Actually Because I want to avoid the modification of return type of
find_last_*_bit for new sentinel,
the big difference between find_last_bit and find_prev_bit is
find_last_bit doesn't check the size bit and use sentinel with size.
but find_prev_bit check the offset bit and use sentinel with size
which passed by another argument.
So if we use find_prev_bit, we could have a clear iteration if
using find_prev_bit like.

for ((bit) = find_last_bit((addr), (size));    \
(bit) < (size);                                     \
(bit) = find_prev_bit((addr), (size), (bit - 1)))

for ((bit) = find_prev_bit((addr), (size), (bit)); \
(bit) < (size);                                           \
(bit) = find_prev_bit((addr), (size), (bit - 1)))

Though find_prev_*_bit / find_last_*_bit have the same functionality.
But they also have a small difference.
I think this small this small difference doesn't make some of
confusion to user but it help to solve problem
with a simple way (just like the iteration above).

So I think I need, find_prev_*_bit series.

Am I missing anything?

Thanks.

Levi.

On Thu, Dec 3, 2020 at 5:33 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> On 03/12/2020 02.23, Yun Levi wrote:
> > On Thu, Dec 3, 2020 at 7:51 AM Yun Levi <ppbuk5246@gmail.com> wrote:
> >>
> >> On Thu, Dec 3, 2020 at 6:26 AM Yury Norov <yury.norov@gmail.com> wrote:
> >>>
> >>> On Wed, Dec 2, 2020 at 10:22 AM Yun Levi <ppbuk5246@gmail.com> wrote:
> >>>>
> >>>> On Thu, Dec 3, 2020 at 2:26 AM Yury Norov <yury.norov@gmail.com> wrote:
> >>>>
> >>>>> Also look at lib/find_bit_benchmark.c
> >>>> Thanks. I'll see.
> >>>>
> >>>>> We need find_next_*_bit() because find_first_*_bit() can start searching only at word-aligned
> >>>>> bits. In the case of find_last_*_bit(), we can start at any bit. So, if my understanding is correct,
> >>>>> for the purpose of reverse traversing we can go with already existing find_last_bit(),
> >>>>
> >>>> Thank you. I haven't thought that way.
> >>>> But I think if we implement reverse traversing using find_last_bit(),
> >>>> we have a problem.
> >>>> Suppose the last bit 0, 1, 2, is set.
> >>>> If we start
> >>>>     find_last_bit(bitmap, 3) ==> return 2;
> >>>>     find_last_bit(bitmap, 2) ==> return 1;
> >>>>     find_last_bit(bitmap, 1) ==> return 0;
> >>>>     find_last_bit(bitmap, 0) ===> return 0? // here we couldn't
>
> Either just make the return type of all find_prev/find_last be signed
> int and use -1 as the sentinel to indicate "no such position exists", so
> the loop condition would be foo >= 0. Or, change the condition from
> "stop if we get the size returned" to "only continue if we get something
> strictly less than the size we passed in (i.e., something which can
> possibly be a valid bit index). In the latter case, both (unsigned)-1
> aka UINT_MAX and the actual size value passed work equally well as a
> sentinel.
>
> If one uses UINT_MAX, a for_each_bit_reverse() macro would just be
> something like
>
> for (i = find_last_bit(bitmap, size); i < size; i =
> find_last_bit(bitmap, i))
>
> if one wants to use the size argument as the sentinel, the caller would
> have to supply a scratch variable to keep track of the last i value:
>
> for (j = size, i = find_last_bit(bitmap, j); i < j; j = i, i =
> find_last_bit(bitmap, j))
>
> which is probably a little less ergonomic.
>
> Rasmus

```* Re:
2020-12-03  9:47                   ` Re: Yun Levi
@ 2020-12-03 18:46                     ` Yury Norov
2020-12-03 18:52                       ` Re: Willy Tarreau
2020-12-05 11:10                       ` Re: Rasmus Villemoes
0 siblings, 2 replies; 400+ messages in thread
From: Yury Norov @ 2020-12-03 18:46 UTC (permalink / raw)
To: Yun Levi
Cc: Rasmus Villemoes, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

Yun, could you please stop top-posting and excessive trimming in the thread?

On Thu, Dec 3, 2020 at 1:47 AM Yun Levi <ppbuk5246@gmail.com> wrote:
> > Either just make the return type of all find_prev/find_last be signed
> > int and use -1 as the sentinel to indicate "no such position exists", so
> > the loop condition would be foo >= 0. Or, change the condition from
> > "stop if we get the size returned" to "only continue if we get something
> > strictly less than the size we passed in (i.e., something which can
> > possibly be a valid bit index). In the latter case, both (unsigned)-1
> > aka UINT_MAX and the actual size value passed work equally well as a
> > sentinel.
> >
> > If one uses UINT_MAX, a for_each_bit_reverse() macro would just be
> > something like
> >
> > for (i = find_last_bit(bitmap, size); i < size; i =
> > find_last_bit(bitmap, i))
> >
> > if one wants to use the size argument as the sentinel, the caller would
> > have to supply a scratch variable to keep track of the last i value:
> >
> > for (j = size, i = find_last_bit(bitmap, j); i < j; j = i, i =
> > find_last_bit(bitmap, j))
> >
> > which is probably a little less ergonomic.
> >
> > Rasmus

I would prefer to avoid changing the find*bit() semantics. As for now,
if any of find_*_bit()
finds nothing, it returns the size of the bitmap it was passed.
Changing this for
a single function would break the consistency, and may cause problems
for those who
rely on existing behaviour.

Passing non-positive size to find_*_bit() should produce undefined
behaviour, because we cannot dereference a pointer to the bitmap in
this case; this is most probably a sign of a problem on a caller side
anyways.

Let's keep this logic unchanged?

> Actually Because I want to avoid the modification of return type of
> find_last_*_bit for new sentinel,
> the big difference between find_last_bit and find_prev_bit is
>    find_last_bit doesn't check the size bit and use sentinel with size.
>    but find_prev_bit check the offset bit and use sentinel with size
> which passed by another argument.
>    So if we use find_prev_bit, we could have a clear iteration if
> using find_prev_bit like.
>
>   #define for_each_set_bit_reverse(bit, addr, size) \
>       for ((bit) = find_last_bit((addr), (size));    \
>             (bit) < (size);                                     \
>             (bit) = find_prev_bit((addr), (size), (bit - 1)))
>
>   #define for_each_set_bit_from_reverse(bit, addr, size) \
>       for ((bit) = find_prev_bit((addr), (size), (bit)); \
>              (bit) < (size);                                           \
>              (bit) = find_prev_bit((addr), (size), (bit - 1)))
>
> Though find_prev_*_bit / find_last_*_bit have the same functionality.
> But they also have a small difference.
> I think this small this small difference doesn't make some of
> confusion to user but it help to solve problem
> with a simple way (just like the iteration above).
>
> So I think I need, find_prev_*_bit series.
>
> Am I missing anything?
>
> Thanks.
>
> Levi.

As you said, find_last_bit() and proposed find_prev_*_bit() have the
same functionality.
If you really want to have find_prev_*_bit(), could you please at
least write it using find_last_bit(), otherwise it would be just a
blottering.

Regarding reverse search, we can probably do like this (not tested,
just an idea):

for ((bit) = find_last_bit((addr), (size));    \
(bit) < (size);                                     \
(size) = (bit), (bit) = find_last_bit((addr), (bit)))

Thanks,
Yury

```* Re:
2020-12-03 18:46                     ` Re: Yury Norov
@ 2020-12-03 18:52                       ` Willy Tarreau
2020-12-04  1:36                         ` Re: Yun Levi
2020-12-05 11:10                       ` Re: Rasmus Villemoes
From: Willy Tarreau @ 2020-12-03 18:52 UTC (permalink / raw)
To: Yury Norov
Cc: Yun Levi, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch, Andy Shevchenko

On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> Yun, could you please stop top-posting and excessive trimming in the thread?

And re-configure the mail agent to make the "Subject" field appear and
fill it.

Willy

```* Re:
2020-12-03 18:52                       ` Re: Willy Tarreau
@ 2020-12-04  1:36                         ` Yun Levi
2020-12-04 18:14                           ` Re: Yury Norov
From: Yun Levi @ 2020-12-04  1:36 UTC (permalink / raw)
To: Willy Tarreau
Cc: Yury Norov, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch, Andy Shevchenko

>On Fri, Dec 4, 2020 at 3:53 AM Willy Tarreau <w@1wt.eu> wrote:
>
> On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > Yun, could you please stop top-posting and excessive trimming in the thread?
>
> And re-configure the mail agent to make the "Subject" field appear and
> fill it.

>On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> Yun, could you please stop top-posting and excessive trimming in the thread?
Sorry to make you uncomfortable... Thanks for advice.

>On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> As you said, find_last_bit() and proposed find_prev_*_bit() have the
> same functionality.
> If you really want to have find_prev_*_bit(), could you please at
> least write it using find_last_bit(), otherwise it would be just a
> blottering.

Actually find_prev_*_bit call _find_prev_bit which is a common helper function
like _find_next_bit.
As you know this function is required to support __BIGEDIAN's little
endian search.
find_prev_bit actually wrapper of _find_prev_bit which have a feature
the find_last_bit.

That makes the semantics difference between find_last_bit and find_prev_bit.
-- specify where you find from and
In loop, find_last_bit couldn't sustain original size as sentinel
return value
(we should change the size argument for next searching
But it means whenever we call, "NOT SET or NOT CLEAR"'s sentinel
return value is changed per call).

Because we should have _find_prev_bit,
I think it's the matter to choose which is better to usein
find_prev_bit (find_last_bit? or _find_prev_bit?)
sustaining find_prev_bit feature (give size as sentinel return, from
where I start).
if my understanding is correct.

In my view, I prefer to use _find_prev_bit like find_next_bit for
integrated format.

But In some of the benchmarking, find_last_bit is better than _find_prev_bit,
here what I tested (look similar but sometimes have some difference).

Start testing find_bit() with random-filled bitmap
[  +0.001850] find_next_bit:                  842792 ns, 163788 iterations
[  +0.000873] find_prev_bit:                  870914 ns, 163788 iterations
[  +0.000824] find_next_zero_bit:             821959 ns, 163894 iterations
[  +0.000677] find_prev_zero_bit:             676240 ns, 163894 iterations
[  +0.000777] find_last_bit:                  659103 ns, 163788 iterations
[  +0.001822] find_first_bit:                1708041 ns,  16250 iterations
[  +0.000539] find_next_and_bit:              492182 ns,  73871 iterations
[  +0.000001]
Start testing find_bit() with sparse bitmap
[  +0.000222] find_next_bit:                   13227 ns,    654 iterations
[  +0.000013] find_prev_bit:                   11652 ns,    654 iterations
[  +0.001845] find_next_zero_bit:            1723869 ns, 327028 iterations
[  +0.001538] find_prev_zero_bit:            1355808 ns, 327028 iterations
[  +0.000010] find_last_bit:                    8114 ns,    654 iterations
[  +0.000867] find_first_bit:                 710639 ns,    654 iterations
[  +0.000006] find_next_and_bit:                4273 ns,      1 iterations
[  +0.000004] find_next_and_bit:                3278 ns,      1 iterations

Start testing find_bit() with random-filled bitmap
[  +0.001784] find_next_bit:                  805553 ns, 164240 iterations
[  +0.000643] find_prev_bit:                  632474 ns, 164240 iterations
[  +0.000950] find_next_zero_bit:             877215 ns, 163442 iterations
[  +0.000664] find_prev_zero_bit:             662339 ns, 163442 iterations
[  +0.000680] find_last_bit:                  602204 ns, 164240 iterations
[  +0.001912] find_first_bit:                1758208 ns,  16408 iterations
[  +0.000760] find_next_and_bit:              531033 ns,  73798 iterations
[  +0.000002]
Start testing find_bit() with sparse bitmap
[  +0.000203] find_next_bit:                   12468 ns,    656 iterations
[  +0.000205] find_prev_bit:                   10948 ns,    656 iterations
[  +0.001759] find_next_zero_bit:            1579447 ns, 327026 iterations
[  +0.001935] find_prev_zero_bit:            1931961 ns, 327026 iterations
[  +0.000013] find_last_bit:                    9543 ns,    656 iterations
[  +0.000732] find_first_bit:                 562009 ns,    656 iterations
[  +0.000217] find_next_and_bit:                6804 ns,      1 iterations
[  +0.000007] find_next_and_bit:                4367 ns,      1 iterations

Is it better to write find_prev_bit using find_last_bit?
I question again.

HTH.
Levi.

```* Re:
2020-12-04  1:36                         ` Re: Yun Levi
@ 2020-12-04 18:14                           ` Yury Norov
2020-12-05  0:45                             ` Re: Yun Levi
From: Yury Norov @ 2020-12-04 18:14 UTC (permalink / raw)
To: Yun Levi
Cc: Willy Tarreau, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch, Andy Shevchenko

On Thu, Dec 3, 2020 at 5:36 PM Yun Levi <ppbuk5246@gmail.com> wrote:
>
> >On Fri, Dec 4, 2020 at 3:53 AM Willy Tarreau <w@1wt.eu> wrote:
> >
> > On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > > Yun, could you please stop top-posting and excessive trimming in the thread?
> >
> > And re-configure the mail agent to make the "Subject" field appear and
> > fill it.
>
> >On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > Yun, could you please stop top-posting and excessive trimming in the thread?
> Sorry to make you uncomfortable... Thanks for advice.
>
> >On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > As you said, find_last_bit() and proposed find_prev_*_bit() have the
> > same functionality.
> > If you really want to have find_prev_*_bit(), could you please at
> > least write it using find_last_bit(), otherwise it would be just a
> > blottering.
>
> Actually find_prev_*_bit call _find_prev_bit which is a common helper function
> like _find_next_bit.
> As you know this function is required to support __BIGEDIAN's little
> endian search.
> find_prev_bit actually wrapper of _find_prev_bit which have a feature
> the find_last_bit.
>
> That makes the semantics difference between find_last_bit and find_prev_bit.
> -- specify where you find from and
>    In loop, find_last_bit couldn't sustain original size as sentinel
> return value
>     (we should change the size argument for next searching
>      But it means whenever we call, "NOT SET or NOT CLEAR"'s sentinel
> return value is changed per call).
>
> Because we should have _find_prev_bit,
> I think it's the matter to choose which is better to usein
> find_prev_bit (find_last_bit? or _find_prev_bit?)
> sustaining find_prev_bit feature (give size as sentinel return, from
> where I start).
> if my understanding is correct.
>
> In my view, I prefer to use _find_prev_bit like find_next_bit for
> integrated format.
>
> But In some of the benchmarking, find_last_bit is better than _find_prev_bit,
> here what I tested (look similar but sometimes have some difference).
>
>               Start testing find_bit() with random-filled bitmap
> [  +0.001850] find_next_bit:                  842792 ns, 163788 iterations
> [  +0.000873] find_prev_bit:                  870914 ns, 163788 iterations
> [  +0.000824] find_next_zero_bit:             821959 ns, 163894 iterations
> [  +0.000677] find_prev_zero_bit:             676240 ns, 163894 iterations
> [  +0.000777] find_last_bit:                  659103 ns, 163788 iterations
> [  +0.001822] find_first_bit:                1708041 ns,  16250 iterations
> [  +0.000539] find_next_and_bit:              492182 ns,  73871 iterations
> [  +0.000001]
>               Start testing find_bit() with sparse bitmap
> [  +0.000222] find_next_bit:                   13227 ns,    654 iterations
> [  +0.000013] find_prev_bit:                   11652 ns,    654 iterations
> [  +0.001845] find_next_zero_bit:            1723869 ns, 327028 iterations
> [  +0.001538] find_prev_zero_bit:            1355808 ns, 327028 iterations
> [  +0.000010] find_last_bit:                    8114 ns,    654 iterations
> [  +0.000867] find_first_bit:                 710639 ns,    654 iterations
> [  +0.000006] find_next_and_bit:                4273 ns,      1 iterations
> [  +0.000004] find_next_and_bit:                3278 ns,      1 iterations
>
>               Start testing find_bit() with random-filled bitmap
> [  +0.001784] find_next_bit:                  805553 ns, 164240 iterations
> [  +0.000643] find_prev_bit:                  632474 ns, 164240 iterations
> [  +0.000950] find_next_zero_bit:             877215 ns, 163442 iterations
> [  +0.000664] find_prev_zero_bit:             662339 ns, 163442 iterations
> [  +0.000680] find_last_bit:                  602204 ns, 164240 iterations
> [  +0.001912] find_first_bit:                1758208 ns,  16408 iterations
> [  +0.000760] find_next_and_bit:              531033 ns,  73798 iterations
> [  +0.000002]
>               Start testing find_bit() with sparse bitmap
> [  +0.000203] find_next_bit:                   12468 ns,    656 iterations
> [  +0.000205] find_prev_bit:                   10948 ns,    656 iterations
> [  +0.001759] find_next_zero_bit:            1579447 ns, 327026 iterations
> [  +0.001935] find_prev_zero_bit:            1931961 ns, 327026 iterations
> [  +0.000013] find_last_bit:                    9543 ns,    656 iterations
> [  +0.000732] find_first_bit:                 562009 ns,    656 iterations
> [  +0.000217] find_next_and_bit:                6804 ns,      1 iterations
> [  +0.000007] find_next_and_bit:                4367 ns,      1 iterations
>
> Is it better to write find_prev_bit using find_last_bit?
> I question again.

I answer again. It's better not to write find_prev_bit at all and
learn how to use existing functionality.

Yury

>
> HTH.
> Levi.

```* Re:
2020-12-04 18:14                           ` Re: Yury Norov
@ 2020-12-05  0:45                             ` Yun Levi
0 siblings, 0 replies; 400+ messages in thread
From: Yun Levi @ 2020-12-05  0:45 UTC (permalink / raw)
To: Yury Norov
Cc: Willy Tarreau, Rasmus Villemoes, dushistov, Arnd Bergmann,
Andrew Morton, Gustavo A. R. Silva, William Breathitt Gray,
richard.weiyang, joseph.qi, skalluru, Josh Poimboeuf,
Linux Kernel Mailing List, linux-arch, Andy Shevchenko

> I answer again. It's better not to write find_prev_bit at all and
> learn how to use existing functionality.

Thanks for the answer I'll fix and send the patch again :)

On Sat, Dec 5, 2020 at 3:14 AM Yury Norov <yury.norov@gmail.com> wrote:
>
> On Thu, Dec 3, 2020 at 5:36 PM Yun Levi <ppbuk5246@gmail.com> wrote:
> >
> > >On Fri, Dec 4, 2020 at 3:53 AM Willy Tarreau <w@1wt.eu> wrote:
> > >
> > > On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > > > Yun, could you please stop top-posting and excessive trimming in the thread?
> > >
> > > And re-configure the mail agent to make the "Subject" field appear and
> > > fill it.
> >
> > >On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > > Yun, could you please stop top-posting and excessive trimming in the thread?
> > Sorry to make you uncomfortable... Thanks for advice.
> >
> > >On Thu, Dec 03, 2020 at 10:46:25AM -0800, Yury Norov wrote:
> > > As you said, find_last_bit() and proposed find_prev_*_bit() have the
> > > same functionality.
> > > If you really want to have find_prev_*_bit(), could you please at
> > > least write it using find_last_bit(), otherwise it would be just a
> > > blottering.
> >
> > Actually find_prev_*_bit call _find_prev_bit which is a common helper function
> > like _find_next_bit.
> > As you know this function is required to support __BIGEDIAN's little
> > endian search.
> > find_prev_bit actually wrapper of _find_prev_bit which have a feature
> > the find_last_bit.
> >
> > That makes the semantics difference between find_last_bit and find_prev_bit.
> > -- specify where you find from and
> >    In loop, find_last_bit couldn't sustain original size as sentinel
> > return value
> >     (we should change the size argument for next searching
> >      But it means whenever we call, "NOT SET or NOT CLEAR"'s sentinel
> > return value is changed per call).
> >
> > Because we should have _find_prev_bit,
> > I think it's the matter to choose which is better to usein
> > find_prev_bit (find_last_bit? or _find_prev_bit?)
> > sustaining find_prev_bit feature (give size as sentinel return, from
> > where I start).
> > if my understanding is correct.
> >
> > In my view, I prefer to use _find_prev_bit like find_next_bit for
> > integrated format.
> >
> > But In some of the benchmarking, find_last_bit is better than _find_prev_bit,
> > here what I tested (look similar but sometimes have some difference).
> >
> >               Start testing find_bit() with random-filled bitmap
> > [  +0.001850] find_next_bit:                  842792 ns, 163788 iterations
> > [  +0.000873] find_prev_bit:                  870914 ns, 163788 iterations
> > [  +0.000824] find_next_zero_bit:             821959 ns, 163894 iterations
> > [  +0.000677] find_prev_zero_bit:             676240 ns, 163894 iterations
> > [  +0.000777] find_last_bit:                  659103 ns, 163788 iterations
> > [  +0.001822] find_first_bit:                1708041 ns,  16250 iterations
> > [  +0.000539] find_next_and_bit:              492182 ns,  73871 iterations
> > [  +0.000001]
> >               Start testing find_bit() with sparse bitmap
> > [  +0.000222] find_next_bit:                   13227 ns,    654 iterations
> > [  +0.000013] find_prev_bit:                   11652 ns,    654 iterations
> > [  +0.001845] find_next_zero_bit:            1723869 ns, 327028 iterations
> > [  +0.001538] find_prev_zero_bit:            1355808 ns, 327028 iterations
> > [  +0.000010] find_last_bit:                    8114 ns,    654 iterations
> > [  +0.000867] find_first_bit:                 710639 ns,    654 iterations
> > [  +0.000006] find_next_and_bit:                4273 ns,      1 iterations
> > [  +0.000004] find_next_and_bit:                3278 ns,      1 iterations
> >
> >               Start testing find_bit() with random-filled bitmap
> > [  +0.001784] find_next_bit:                  805553 ns, 164240 iterations
> > [  +0.000643] find_prev_bit:                  632474 ns, 164240 iterations
> > [  +0.000950] find_next_zero_bit:             877215 ns, 163442 iterations
> > [  +0.000664] find_prev_zero_bit:             662339 ns, 163442 iterations
> > [  +0.000680] find_last_bit:                  602204 ns, 164240 iterations
> > [  +0.001912] find_first_bit:                1758208 ns,  16408 iterations
> > [  +0.000760] find_next_and_bit:              531033 ns,  73798 iterations
> > [  +0.000002]
> >               Start testing find_bit() with sparse bitmap
> > [  +0.000203] find_next_bit:                   12468 ns,    656 iterations
> > [  +0.000205] find_prev_bit:                   10948 ns,    656 iterations
> > [  +0.001759] find_next_zero_bit:            1579447 ns, 327026 iterations
> > [  +0.001935] find_prev_zero_bit:            1931961 ns, 327026 iterations
> > [  +0.000013] find_last_bit:                    9543 ns,    656 iterations
> > [  +0.000732] find_first_bit:                 562009 ns,    656 iterations
> > [  +0.000217] find_next_and_bit:                6804 ns,      1 iterations
> > [  +0.000007] find_next_and_bit:                4367 ns,      1 iterations
> >
> > Is it better to write find_prev_bit using find_last_bit?
> > I question again.
>
> I answer again. It's better not to write find_prev_bit at all and
> learn how to use existing functionality.
>
> Yury
>
> >
> > HTH.
> > Levi.

```* Re:
2020-12-03 18:46                     ` Re: Yury Norov
2020-12-03 18:52                       ` Re: Willy Tarreau
@ 2020-12-05 11:10                       ` Rasmus Villemoes
2020-12-05 18:20                         ` Re: Yury Norov
From: Rasmus Villemoes @ 2020-12-05 11:10 UTC (permalink / raw)
To: Yury Norov, Yun Levi
Cc: dushistov, Arnd Bergmann, Andrew Morton, Gustavo A. R. Silva,
William Breathitt Gray, richard.weiyang, joseph.qi, skalluru,
Josh Poimboeuf, Linux Kernel Mailing List, linux-arch,
Andy Shevchenko

On 03/12/2020 19.46, Yury Norov wrote:

> I would prefer to avoid changing the find*bit() semantics. As for now,
> if any of find_*_bit()
> finds nothing, it returns the size of the bitmap it was passed.

Yeah, we should actually try to fix that, it causes bad code generation.
It's hard, because callers of course do that "if ret == size" check. But
it's really silly that something like find_first_bit needs to do that
"min(i*BPL + __ffs(word), size)" - the caller does a comparison anyway,
that comparison might as well be "ret >= size" rather than "ret ==
size", and then we could get rid of that branch (which min() necessarily
becomes) at the end of find_next_bit.

I haven't dug very deep into this, but I could also imagine the
arch-specific parts of this might become a little easier to do if the
semantics were just "if no such bit, return an indeterminate value >=
the size".

> Changing this for
> a single function would break the consistency, and may cause problems
> for those who
> rely on existing behaviour.

True. But I think it should be possible - I suppose most users are via
the iterator macros, which could all be updated at once. Changing ret ==
size to ret >= size will still work even if the implementations have not
been switched over, so it should be doable.

>
> Passing non-positive size to find_*_bit() should produce undefined
> behaviour, because we cannot dereference a pointer to the bitmap in
> this case; this is most probably a sign of a problem on a caller side
> anyways.

No, the out-of-line bitmap functions should all handle the case of a
zero-size bitmap sensibly.

Is bitmap full? Yes (all the 0 bits are set).
Is bitmap empty? Yes, (none of the 0 bits are set).
Find the first bit set (returns 0, there's no such bit)

Etc. The static inlines for small_const_nbits do assume that the pointer
can be dereferenced, which is why small_const_nbits was updated to mean
1<=bits<=BITS_PER_LONG rather than just bits<=BITS_PER_LONG.

Rasmus

```* Re:
2020-12-05 11:10                       ` Re: Rasmus Villemoes
@ 2020-12-05 18:20                         ` Yury Norov
0 siblings, 0 replies; 400+ messages in thread
From: Yury Norov @ 2020-12-05 18:20 UTC (permalink / raw)
To: Rasmus Villemoes
Cc: Yun Levi, dushistov, Arnd Bergmann, Andrew Morton,
Gustavo A. R. Silva, William Breathitt Gray, richard.weiyang,
joseph.qi, skalluru, Josh Poimboeuf, Linux Kernel Mailing List,
linux-arch, Andy Shevchenko

On Sat, Dec 5, 2020 at 3:10 AM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> On 03/12/2020 19.46, Yury Norov wrote:
>
> > I would prefer to avoid changing the find*bit() semantics. As for now,
> > if any of find_*_bit()
> > finds nothing, it returns the size of the bitmap it was passed.
>
> Yeah, we should actually try to fix that, it causes bad code generation.
> It's hard, because callers of course do that "if ret == size" check. But
> it's really silly that something like find_first_bit needs to do that
> "min(i*BPL + __ffs(word), size)" - the caller does a comparison anyway,
> that comparison might as well be "ret >= size" rather than "ret ==
> size", and then we could get rid of that branch (which min() necessarily
> becomes) at the end of find_next_bit.

We didn't do that 5 years ago because it's too invasive and the improvement
is barely measurable, the difference is 2 instructions (on arm64).e.
Has something
changed since that?

20000000000000000 <find_first_bit_better>:
0:   aa0003e3        mov     x3, x0
4:   aa0103e0        mov     x0, x1
8:   b4000181        cbz     x1, 38 <find_first_bit_better+0x38>
c:   f9400064        ldr     x4, [x3]
10:   d2800802        mov     x2, #0x40                       // #64
14:   91002063        add     x3, x3, #0x8
18:   b40000c4        cbz     x4, 30 <find_first_bit_better+0x30>
1c:   14000008        b       3c <find_first_bit_better+0x3c>
20:   f8408464        ldr     x4, [x3], #8
24:   91010045        add     x5, x2, #0x40
28:   b50000c4        cbnz    x4, 40 <find_first_bit_better+0x40>
2c:   aa0503e2        mov     x2, x5
30:   eb00005f        cmp     x2, x0
34:   54ffff63        b.cc    20 <find_first_bit_better+0x20>  //
b.lo, b.ul, b.last
38:   d65f03c0        ret
3c:   d2800002        mov     x2, #0x0                        // #0
40:   dac00084        rbit    x4, x4
44:   dac01084        clz     x4, x4
48:   8b020080        add     x0, x4, x2
4c:   d65f03c0        ret

0000000000000050 <find_first_bit_worse>:
50:   aa0003e4        mov     x4, x0
54:   aa0103e0        mov     x0, x1
58:   b4000181        cbz     x1, 88 <find_first_bit_worse+0x38>
5c:   f9400083        ldr     x3, [x4]
60:   d2800802        mov     x2, #0x40                       // #64
64:   91002084        add     x4, x4, #0x8
68:   b40000c3        cbz     x3, 80 <find_first_bit_worse+0x30>
6c:   14000008        b       8c <find_first_bit_worse+0x3c>
70:   f8408483        ldr     x3, [x4], #8
74:   91010045        add     x5, x2, #0x40
78:   b50000c3        cbnz    x3, 90 <find_first_bit_worse+0x40>
7c:   aa0503e2        mov     x2, x5
80:   eb02001f        cmp     x0, x2
84:   54ffff68        b.hi    70 <find_first_bit_worse+0x20>  // b.pmore
88:   d65f03c0        ret
8c:   d2800002        mov     x2, #0x0                        // #0
90:   dac00063        rbit    x3, x3
94:   dac01063        clz     x3, x3
98:   8b020062        add     x2, x3, x2
9c:   eb02001f        cmp     x0, x2
a0:   9a829000        csel    x0, x0, x2, ls  // ls = plast
a4:   d65f03c0        ret

> I haven't dug very deep into this, but I could also imagine the
> arch-specific parts of this might become a little easier to do if the
> semantics were just "if no such bit, return an indeterminate value >=
> the size".
>
> > Changing this for
> > a single function would break the consistency, and may cause problems
> > for those who
> > rely on existing behaviour.
>
> True. But I think it should be possible - I suppose most users are via
> the iterator macros, which could all be updated at once. Changing ret ==
> size to ret >= size will still work even if the implementations have not
> been switched over, so it should be doable.

Since there's no assembler users for it, we can do just:
#define find_first_bit(bitmap, size)
min(better_find_first_bit((bitmap), (size)), (size))

... and deprecate find_first_bit.

> > Passing non-positive size to find_*_bit() should produce undefined
> > behaviour, because we cannot dereference a pointer to the bitmap in
> > this case; this is most probably a sign of a problem on a caller side
> > anyways.
>
> No, the out-of-line bitmap functions should all handle the case of a
> zero-size bitmap sensibly.

I could be more specific, the behaviour is defined: don't dereference
the address and return undefined value (which now is always 0).

> Is bitmap full? Yes (all the 0 bits are set).
> Is bitmap empty? Yes, (none of the 0 bits are set).
> Find the first bit set (returns 0, there's no such bit)

I can't answer because this object is not a map of bits - there's no room for
bits inside.

> Etc. The static inlines for small_const_nbits do assume that the pointer
> can be dereferenced, which is why small_const_nbits was updated to mean
> 1<=bits<=BITS_PER_LONG rather than just bits<=BITS_PER_LONG.

I don't want to do something like

if (size == 0)
return -1;

... because it legitimizes this kind of usage and hides problems on
callers' side.
Instead, I'd add WARN_ON(size == 0), but I don't think it's so
critical to bother with it.

Yury

> Rasmus

```* Re:
2022-04-22 15:53           ` Thomas Gleixner
@ 2022-04-23  2:29             ` Nicholas Piggin
0 siblings, 0 replies; 400+ messages in thread
From: Nicholas Piggin @ 2022-04-23  2:29 UTC (permalink / raw)
To: Michael Ellerman, paulmck, Thomas Gleixner, Zhouyi Zhou
Cc: Daniel
Lezcano, linux-kernel, linuxppc-dev, Miguel Ojeda, rcu,
Viresh
Kumar

Excerpts from Thomas Gleixner's message of April 23, 2022 1:53 am:
> On Wed, Apr 13 2022 at 15:11, Nicholas Piggin wrote:
>> So we traced the problem down to possibly a misunderstanding between
>> decrementer clock event device and core code.
>>
>> The decrementer is only oneshot*ish*. It actually needs to either be
>> reprogrammed or shut down otherwise it just continues to cause
>> interrupts.
>
> I always thought that PPC had sane timers. That's really disillusioning.

My comment was probably a bit misleading explanation of the whole
situation. This weirdness is actually in software in the powerpc
clock event driver due to a recent change I made assuming the clock
event goes to oneshot-stopped.

The hardware is relatively sane I think, global synchronized constant
rate high frequency clock distributed to the CPUs so reads don't
go off-core. And per-CPU "decrementer" event interrupt at the same
frequency as the clock -- program it to a +ve value and it decrements
until zero then creates basically a level triggered interrupt.

Before my change, the decrementer interrupt would always clear the
interrupt at entry. The event_handler usually programs another
timer in so I tried to avoid that first clear counting on the
oneshot_stopped callback to clear the interrupt if there was no
other timer.

>> Before commit 35de589cb879, it was sort of two-shot. The initial
>> interrupt at the programmed time would set its internal next_tb variable
>> to ~0 and call the ->event_handler(). If that did not set_next_event or
>> stop the timer, the interrupt will fire again immediately, notice
>> next_tb is ~0, and only then stop the decrementer interrupt.
>>
>> So that was already kind of ugly, this patch just turned it into a hang.
>>
>> The problem happens when the tick is stopped with an event still
>> pending, then tick_nohz_handler() is called, but it bails out because
>> tick_stopped == 1 so the device never gets programmed again, and so it
>> keeps firing.
>>
>> How to fix it? Before commit a7cba02deced, powerpc's decrementer was
>> really oneshot, but we would like to avoid doing that because it requires
>> additional programming of the hardware on each timer interrupt. We have
>> the ONESHOT_STOPPED state which seems to be just about what we want.
>>
>> Did the ONESHOT_STOPPED patch just miss this case, or is there a reason
>> we don't stop it here? This patch seems to fix the hang (not heavily
>> tested though).
>
> This was definitely overlooked, but it's arguable it is is not required
> for real oneshot clockevent devices. This should only handle the case
> where the interrupt was already pending.
>
> The ONESHOT_STOPPED state was introduced to handle the case where the
> last timer gets canceled, so the already programmed event does not fire.
>
> It was not necessarily meant to "fix" clockevent devices which are
> pretending to be ONESHOT, but keep firing over and over.
>
> That, said. I'm fine with the change along with a big fat comment why
> this is required.

Thanks for taking a look and confirming. I just sent a patch with a
comment and what looks like another missed case. Hopefully it's okay.

Thanks,
Nick

```* Re:
2022-04-13  5:11         ` Nicholas Piggin
@ 2022-04-22 15:53           ` Thomas Gleixner
2022-04-23  2:29             ` Re: Nicholas Piggin
From: Thomas Gleixner @ 2022-04-22 15:53 UTC (permalink / raw)
To: Nicholas Piggin, Michael Ellerman, paulmck, Zhouyi Zhou
Cc: linuxppc-dev, Miguel Ojeda, rcu, Daniel Lezcano, linux-kernel,
Viresh Kumar

On Wed, Apr 13 2022 at 15:11, Nicholas Piggin wrote:
> So we traced the problem down to possibly a misunderstanding between
> decrementer clock event device and core code.
>
> The decrementer is only oneshot*ish*. It actually needs to either be
> reprogrammed or shut down otherwise it just continues to cause
> interrupts.

I always thought that PPC had sane timers. That's really disillusioning.

> Before commit 35de589cb879, it was sort of two-shot. The initial
> interrupt at the programmed time would set its internal next_tb variable
> to ~0 and call the ->event_handler(). If that did not set_next_event or
> stop the timer, the interrupt will fire again immediately, notice
> next_tb is ~0, and only then stop the decrementer interrupt.
>
> So that was already kind of ugly, this patch just turned it into a hang.
>
> The problem happens when the tick is stopped with an event still
> pending, then tick_nohz_handler() is called, but it bails out because
> tick_stopped == 1 so the device never gets programmed again, and so it
> keeps firing.
>
> How to fix it? Before commit a7cba02deced, powerpc's decrementer was
> really oneshot, but we would like to avoid doing that because it requires
> additional programming of the hardware on each timer interrupt. We have
> the ONESHOT_STOPPED state which seems to be just about what we want.
>
> Did the ONESHOT_STOPPED patch just miss this case, or is there a reason
> we don't stop it here? This patch seems to fix the hang (not heavily
> tested though).

This was definitely overlooked, but it's arguable it is is not required
for real oneshot clockevent devices. This should only handle the case
where the interrupt was already pending.

The ONESHOT_STOPPED state was introduced to handle the case where the
last timer gets canceled, so the already programmed event does not fire.

It was not necessarily meant to "fix" clockevent devices which are
pretending to be ONESHOT, but keep firing over and over.

That, said. I'm fine with the change along with a big fat comment why
this is required.

Thanks,

tglx

```* Re:
2022-04-21 23:17   ` Re: Yury Norov
@ 2022-04-21 23:21     ` John Hubbard
0 siblings, 0 replies; 400+ messages in thread
From: John Hubbard @ 2022-04-21 23:21 UTC (permalink / raw)
To: Yury Norov; +Cc: Andrew Morton, Minchan Kim, linux-mm, linux-kernel

On 4/21/22 16:17, Yury Norov wrote:
>> Let's add a Cc: line for Michan as well:
>>
>> Cc: Minchan Kim <minchan@kernel.org>
>
> He's in CC already, I think...
>

Here, I am talking about attribution in the commit log, as opposed
to the email Cc. In other words, I'm suggesting that you literally
add this line to the commit description:

Cc: Minchan Kim <minchan@kernel.org>

thanks,
--
John Hubbard
NVIDIA

```* Re:
2022-04-21 23:04 ` John Hubbard
2022-04-21 23:09   ` Re: John Hubbard
@ 2022-04-21 23:17   ` Yury Norov
2022-04-21 23:21     ` Re: John Hubbard
From: Yury Norov @ 2022-04-21 23:17 UTC (permalink / raw)
To: John Hubbard; +Cc: Andrew Morton, Minchan Kim, linux-mm, linux-kernel

On Thu, Apr 21, 2022 at 04:04:44PM -0700, John Hubbard wrote:
> On 4/21/22 09:41, Yury Norov wrote:
> > Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
> >
>
> Hi Yuri,
>
> Thanks for picking this up. I have been distracted and didn't trust
> myself to focus on this properly, so it's good to have help!
>
> IT/admin point: somehow the first line of the commit description didn't
> make it into an actual email subject. The subject line was blank when it
> arrived in my inbox, and the subject is in the body here instead. Not
> sure how that happened.
>
> Maybe check your git-sendemail setup?

git-sendmail is OK. I just accidentally added empty line above Subject,
which broke format. My bad, sorry for this.

> > pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
> > API requires struct page **pages to be provided (not NULL). However,
> > the comment to pin_user_pages() says:
> >
> >      * @pages:      array that receives pointers to the pages pinned.
> >      *              Should be at least nr_pages long. Or NULL, if caller
> >      *              only intends to ensure the pages are faulted in.
> >
> > This patch fixes comments along the pin_user_pages code, and also adds
> > WARN_ON(!pages), so that API users will have better understanding
> > on how to use it.
>
> No need to quote the code in the commit log. Instead, just summarize.
> For example:
>
> pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
> API requires struct page **pages to be provided (not NULL). However, the
> comment to pin_user_pages() clearly allows for passing in a NULL @pages
> argument.
>
> enforce the API.
>
> >
> > It has been independently spotted by Minchan Kim and confirmed with
> > John Hubbard:
> >
>
> Let's add a Cc: line for Michan as well:
>
> Cc: Minchan Kim <minchan@kernel.org>

He's in CC already, I think...

> > Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
> > ---
> >   mm/gup.c | 26 ++++++++++++++++++++++----
> >   1 file changed, 22 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/gup.c b/mm/gup.c
> > index f598a037eb04..559626457585 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -2871,6 +2871,10 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
> >   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> >   		return -EINVAL;
> > +	/* FOLL_PIN requires pages != NULL */
>
> Please delete each and every one of these one-line comments, because
> they merely echo what the code says.

Sure.

> > +	if (WARN_ON_ONCE(!pages))
> > +		return -EINVAL;
> > +
> >   	gup_flags |= FOLL_PIN;
> >   	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
> >   }
> > @@ -2893,6 +2897,10 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
> >   	 */
> >   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> >   		return 0;
> > +
> > +	/* FOLL_PIN requires pages != NULL */
> > +	if (WARN_ON_ONCE(!pages))
> > +		return 0;
> >   	/*
> >   	 * FOLL_FAST_ONLY is required in order to match the API description of
> >   	 * this routine: no fall back to regular ("slow") GUP.
> > @@ -2920,8 +2928,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
> >    * @nr_pages:	number of pages from start to pin
> >    * @gup_flags:	flags modifying lookup behaviour
> >    * @pages:	array that receives pointers to the pages pinned.
> > - *		Should be at least nr_pages long. Or NULL, if caller
> > - *		only intends to ensure the pages are faulted in.
> > + *		Should be at least nr_pages long.
> >    * @vmas:	array of pointers to vmas corresponding to each page.
> >    *		Or NULL if the caller does not require them.
> >    * @locked:	pointer to lock flag indicating whether lock is held and
> > @@ -2944,6 +2951,10 @@ long pin_user_pages_remote(struct mm_struct *mm,
> >   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> >   		return -EINVAL;
> > +	/* FOLL_PIN requires pages != NULL */
> > +	if (WARN_ON_ONCE(!pages))
> > +		return -EINVAL;
> > +
> >   	gup_flags |= FOLL_PIN;
> >   	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
> >   				       pages, vmas, locked);
> > @@ -2957,8 +2968,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
> >    * @nr_pages:	number of pages from start to pin
> >    * @gup_flags:	flags modifying lookup behaviour
> >    * @pages:	array that receives pointers to the pages pinned.
> > - *		Should be at least nr_pages long. Or NULL, if caller
> > - *		only intends to ensure the pages are faulted in.
> > + *		Should be at least nr_pages long.
> >    * @vmas:	array of pointers to vmas corresponding to each page.
> >    *		Or NULL if the caller does not require them.
> >    *
> > @@ -2976,6 +2986,10 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
> >   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> >   		return -EINVAL;
> > +	/* FOLL_PIN requires pages != NULL */
> > +	if (WARN_ON_ONCE(!pages))
> > +		return -EINVAL;
> > +
> >   	gup_flags |= FOLL_PIN;
> >   	return __gup_longterm_locked(current->mm, start, nr_pages,
> >   				     pages, vmas, gup_flags);
> > @@ -2994,6 +3008,10 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> >   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> >   		return -EINVAL;
> > +	/* FOLL_PIN requires pages != NULL */
> > +	if (WARN_ON_ONCE(!pages))
> > +		return -EINVAL;
> > +
> >   	gup_flags |= FOLL_PIN;
> >   	return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
> >   }
>
> I hope we don't break any callers with the newly enforced !pages, but it's
> the right thing to do, in order to avoid misunderstandings.
>
> thanks,
> --
> John Hubbard
> NVIDIA

Let me test v2 and resend shortly.

Thanks,
Yury

```* Re:
2022-04-21 23:04 ` John Hubbard
@ 2022-04-21 23:09   ` John Hubbard
2022-04-21 23:17   ` Re: Yury Norov
1 sibling, 0 replies; 400+ messages in thread
From: John Hubbard @ 2022-04-21 23:09 UTC (permalink / raw)
To: Yury Norov, Andrew Morton, Minchan Kim, linux-mm, linux-kernel

On 4/21/22 16:04, John Hubbard wrote:
> On 4/21/22 09:41, Yury Norov wrote:
>> Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
>>
>
> Hi Yuri,

...and I see that I have typo'd both Yury's and Minchan's name (further
down), in the same email!

Really apologize for screwing that up. It's Yury-with-a-"y", I know. :)

thanks,
--
John Hubbard
NVIDIA

```* Re:
2022-04-21 16:41 Yury Norov
@ 2022-04-21 23:04 ` John Hubbard
2022-04-21 23:09   ` Re: John Hubbard
2022-04-21 23:17   ` Re: Yury Norov
0 siblings, 2 replies; 400+ messages in thread
From: John Hubbard @ 2022-04-21 23:04 UTC (permalink / raw)
To: Yury Norov, Andrew Morton, Minchan Kim, linux-mm, linux-kernel

On 4/21/22 09:41, Yury Norov wrote:
> Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
>

Hi Yuri,

Thanks for picking this up. I have been distracted and didn't trust
myself to focus on this properly, so it's good to have help!

IT/admin point: somehow the first line of the commit description didn't
make it into an actual email subject. The subject line was blank when it
arrived in my inbox, and the subject is in the body here instead. Not
sure how that happened.

> pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
> API requires struct page **pages to be provided (not NULL). However,
> the comment to pin_user_pages() says:
>
>      * @pages:      array that receives pointers to the pages pinned.
>      *              Should be at least nr_pages long. Or NULL, if caller
>      *              only intends to ensure the pages are faulted in.
>
> This patch fixes comments along the pin_user_pages code, and also adds
> WARN_ON(!pages), so that API users will have better understanding
> on how to use it.

No need to quote the code in the commit log. Instead, just summarize.
For example:

pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
API requires struct page **pages to be provided (not NULL). However, the
comment to pin_user_pages() clearly allows for passing in a NULL @pages
argument.

enforce the API.

>
> It has been independently spotted by Minchan Kim and confirmed with
> John Hubbard:
>

Let's add a Cc: line for Michan as well:

Cc: Minchan Kim <minchan@kernel.org>

>
> Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
> ---
>   mm/gup.c | 26 ++++++++++++++++++++++----
>   1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/mm/gup.c b/mm/gup.c
> index f598a037eb04..559626457585 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2871,6 +2871,10 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
>   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
>   		return -EINVAL;
>
> +	/* FOLL_PIN requires pages != NULL */

they merely echo what the code says.

> +	if (WARN_ON_ONCE(!pages))
> +		return -EINVAL;
> +
>   	gup_flags |= FOLL_PIN;
>   	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
>   }
> @@ -2893,6 +2897,10 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
>   	 */
>   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
>   		return 0;
> +
> +	/* FOLL_PIN requires pages != NULL */
> +	if (WARN_ON_ONCE(!pages))
> +		return 0;
>   	/*
>   	 * FOLL_FAST_ONLY is required in order to match the API description of
>   	 * this routine: no fall back to regular ("slow") GUP.
> @@ -2920,8 +2928,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
>    * @nr_pages:	number of pages from start to pin
>    * @gup_flags:	flags modifying lookup behaviour
>    * @pages:	array that receives pointers to the pages pinned.
> - *		Should be at least nr_pages long. Or NULL, if caller
> - *		only intends to ensure the pages are faulted in.
> + *		Should be at least nr_pages long.
>    * @vmas:	array of pointers to vmas corresponding to each page.
>    *		Or NULL if the caller does not require them.
>    * @locked:	pointer to lock flag indicating whether lock is held and
> @@ -2944,6 +2951,10 @@ long pin_user_pages_remote(struct mm_struct *mm,
>   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
>   		return -EINVAL;
>
> +	/* FOLL_PIN requires pages != NULL */
> +	if (WARN_ON_ONCE(!pages))
> +		return -EINVAL;
> +
>   	gup_flags |= FOLL_PIN;
>   	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
>   				       pages, vmas, locked);
> @@ -2957,8 +2968,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
>    * @nr_pages:	number of pages from start to pin
>    * @gup_flags:	flags modifying lookup behaviour
>    * @pages:	array that receives pointers to the pages pinned.
> - *		Should be at least nr_pages long. Or NULL, if caller
> - *		only intends to ensure the pages are faulted in.
> + *		Should be at least nr_pages long.
>    * @vmas:	array of pointers to vmas corresponding to each page.
>    *		Or NULL if the caller does not require them.
>    *
> @@ -2976,6 +2986,10 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
>   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
>   		return -EINVAL;
>
> +	/* FOLL_PIN requires pages != NULL */
> +	if (WARN_ON_ONCE(!pages))
> +		return -EINVAL;
> +
>   	gup_flags |= FOLL_PIN;
>   	return __gup_longterm_locked(current->mm, start, nr_pages,
>   				     pages, vmas, gup_flags);
> @@ -2994,6 +3008,10 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
>   	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
>   		return -EINVAL;
>
> +	/* FOLL_PIN requires pages != NULL */
> +	if (WARN_ON_ONCE(!pages))
> +		return -EINVAL;
> +
>   	gup_flags |= FOLL_PIN;
>   	return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
>   }

I hope we don't break any callers with the newly enforced !pages, but it's
the right thing to do, in order to avoid misunderstandings.

thanks,
--
John Hubbard
NVIDIA

```* Re:
2022-03-29  8:35                 ` Re: Thomas Gleixner
2022-03-29 14:37                   ` Re: Michael S. Tsirkin
@ 2022-04-12  6:55                   ` Michael S. Tsirkin
1 sibling, 0 replies; 400+ messages in thread
From: Michael S. Tsirkin @ 2022-04-12  6:55 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> On Mon, Mar 28 2022 at 06:40, Michael S. Tsirkin wrote:
> > On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> >> > > So I think we might talk different issues:
> >> > >
> >> > > 1) Whether request_irq() commits the previous setups, I think the
> >> > > answer is yes, since the spin_unlock of desc->lock (release) can
> >> > > guarantee this though there seems no documentation around
> >> > > request_irq() to say this.
> >> > >
> >> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> >> > > using smp_wmb() before the request_irq().
>
> That's a complete bogus example especially as there is not a single
> smp_rmb() which pairs with the smp_wmb().
>
> >> > > And even if write is ordered we still need read to be ordered to be
> >> > > paired with that.
> >
> > IMO it synchronizes with the CPU to which irq is
> > delivered. Otherwise basically all drivers would be broken,
> > wouldn't they be?
> > I don't know whether it's correct on all platforms, but if not
> > we need to fix request_irq.
>
> There is nothing to fix:
>
> request_irq()
>    raw_spin_lock_irq(desc->lock);       // ACQUIRE
>    ....
>    raw_spin_unlock_irq(desc->lock);     // RELEASE
>
> interrupt()
>    raw_spin_lock(desc->lock);           // ACQUIRE
>    set status to IN_PROGRESS
>    raw_spin_unlock(desc->lock);         // RELEASE
>    invoke handler()
>
> So anything which the driver set up _before_ request_irq() is visible to
> the interrupt handler. No?
>
> >> What happens if an interrupt is raised in the middle like:
> >>
> >> smp_store_release(dev->irq_soft_enabled, true)
> >> IRQ handler
> >> synchornize_irq()
>
> This is bogus. The obvious order of things is:
>
>     dev->ok = false;
>     request_irq();
>
>     moar_setup();
>     synchronize_irq();  // ACQUIRE + RELEASE
>     dev->ok = true;
>
> The reverse operation on teardown:
>
>     dev->ok = false;
>     synchronize_irq();  // ACQUIRE + RELEASE
>
>     teardown();
>
> So in both cases a simple check in the handler is sufficient:
>
> handler()
>     if (!dev->ok)
>     	return;

Does this need to be if (!READ_ONCE(dev->ok)) ?

> I'm not understanding what you folks are trying to "fix" here. If any
> driver does this in the wrong order, then the driver is broken.
>
> Sure, you can do the same with:
>
>     dev->ok = false;
>     request_irq();
>     moar_setup();
>     smp_wmb();
>     dev->ok = true;
>
> for the price of a smp_rmb() in the interrupt handler:
>
> handler()
>     if (!dev->ok)
>     	return;
>     smp_rmb();
>
> but that's only working for the setup case correctly and not for
> teardown.
>
> Thanks,
>
>         tglx

```* Re:
2022-03-30  5:14                       ` Re: Michael S. Tsirkin
@ 2022-03-30  5:53                         ` Jason Wang
0 siblings, 0 replies; 400+ messages in thread
From: Jason Wang @ 2022-03-30  5:53 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Wed, Mar 30, 2022 at 1:14 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Mar 30, 2022 at 10:40:59AM +0800, Jason Wang wrote:
> > On Tue, Mar 29, 2022 at 10:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > > > > broken,
> > > > > > > >
> > > > > > > > So I think we might talk different issues:
> > > > > > > >
> > > > > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > > > > guarantee this though there seems no documentation around
> > > > > > > > request_irq() to say this.
> > > > > > > >
> > > > > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > > > > using smp_wmb() before the request_irq().
> > > > > > > >
> > > > > > > > And even if write is ordered we still need read to be ordered to be
> > > > > > > > paired with that.
> > > > >
> > > > > IMO it synchronizes with the CPU to which irq is
> > > > > delivered. Otherwise basically all drivers would be broken,
> > > > > wouldn't they be?
> > > >
> > > > I guess it's because most of the drivers don't care much about the
> > > > buggy/malicious device.  And most of the devices may require an extra
> > > > step to enable device IRQ after request_irq(). Or it's the charge of
> > > > the driver to do the synchronization.
> > >
> > > It is true that the use-case of malicious devices is somewhat boutique.
> > > But I think most drivers do want to have their hotplug routines to be
> > > robust, yes.
> > >
> > > > > I don't know whether it's correct on all platforms, but if not
> > > > > we need to fix request_irq.
> > > > >
> > > > > > > >
> > > > > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > > > > virtio.
> > > > > > > >
> > > > > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > > > > virtio_device_ready():
> > > > > > > >
> > > > > > > > request_irq()
> > > > > > > > driver specific setups
> > > > > > > > virtio_device_ready()
> > > > > > > >
> > > > > > > > CPU 0 probe) request_irq()
> > > > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > > > CPU 0 probe) driver specific setups
> > > > > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > >
> > > > > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > > > > That will guarantee all setup is visible to the specific IRQ,
> > > > > >
> > > > > > Only the interrupt after synchronize_irq() returns.
> > > > >
> > > > > Anything else is a buggy device though.
> > > >
> > > > Yes, but the goal of this patch is to prevent the possible attack from
> > > > buggy(malicious) devices.
> > >
> > > Right. However if a driver of a *buggy* device somehow sees driver_ok =
> > > false even though it's actually initialized, that is not a deal breaker
> > > as that does not open us up to an attack.
> > >
> > > > >
> > > > > > >this
> > > > > > > is what it's point is.
> > > > > >
> > > > > > What happens if an interrupt is raised in the middle like:
> > > > > >
> > > > > > smp_store_release(dev->irq_soft_enabled, true)
> > > > > > IRQ handler
> > > > > > synchornize_irq()
> > > > > >
> > > > > > If we don't enforce a reading order, the IRQ handler may still see the
> > > > > > uninitialized variable.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > IMHO variables should be initialized before request_irq
> > > > > to a value meaning "not a valid interrupt".
> > > > > Specifically driver_ok = false.
> > > > > Handler in the scenario you describe will then see !driver_ok
> > > > > and exit immediately.
> > > >
> > > > So just to make sure we're on the same page.
> > > >
> > > > 1) virtio_reset_device() will set the driver_ok to false;
> > > > 2) virtio_device_ready() will set the driver_ok to true
> > > >
> > > > So for virtio drivers, it often did:
> > > >
> > > > 1) virtio_reset_device()
> > > > 2) find_vqs() which will call request_irq()
> > > > 3) other driver specific setups
> > > > 4) virtio_device_ready()
> > > >
> > > > In virtio_device_ready(), the patch perform the following currently:
> > > >
> > > > smp_store_release(driver_ok, true);
> > > > set_status(DRIVER_OK);
> > > >
> > > > smp_store_release() so we had
> > > >
> > > > smp_store_release(driver_ok, true);
> > > > synchornize_irq()
> > > > set_status(DRIVER_OK)
> > > >
> > > > Suppose there's a interrupt raised before the synchronize_irq(), if we do:
> > > >
> > > > if (READ_ONCE(driver_ok)) {
> > > >       vq->callback()
> > > > }
> > > >
> > > > It will see the driver_ok as true but how can we make sure
> > > > vq->callback sees the driver specific setups (3) above?
> > > >
> > > > And an example is virtio_scsi():
> > > >
> > > > virtio_reset_device()
> > > > virtscsi_probe()
> > > >     virtscsi_init()
> > > >         virtio_find_vqs()
> > > >         ...
> > > >         virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> > > >     ....
> > > >
> > > > In virtscsi_event_done():
> > > >
> > > > virtscsi_event_done():
> > > >     virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
> > > >
> > > > We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
> > > >
> > > > Thanks
> > >
> > >
> > > See response by Thomas. A simple if (!dev->driver_ok) should be enough,
> > > it's all under a lock.
> >
> > Ordered through ACQUIRE+RELEASE actually since the irq handler is not
> > running under the lock.
> >
> > Another question, for synchronize_irq() do you prefer
> >
> > 1) transport specific callbacks
> > or
> > 2) a simple synchornize_rcu()
> >
> > Thanks
>
>
> 1) I think, and I'd add a wrapper so we can switch to 2 if we really
> want to. But for now synchronizing the specific irq is obviously designed to
> make any changes to memory visible to this irq. that
> seems cleaner and easier to understand than memory ordering tricks
> and relying on side effects of synchornize_rcu, even though
> internally this all boils down to memory ordering since
> memory is what's used to implement locks :).
> Not to mention, synchronize_irq just scales much better from performance
> POV.

Ok. Let me try to do that in V2.

Thanks

>
>
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > We use smp_store_relase()
> > > > > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > > > > which surprises me.
> > > > > > > > > > > > >
> > > > > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > "
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > > > > > > > "
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > This is only for config interrupt.
> > > > > > > > > > > >
> > > > > > > > > > > > Ok.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > > > > expensive.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > > > >    * */
> > > > > > > > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > > > >   {
> > > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > > > >  * and NMI handlers.
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > You are right. So then
> > > > > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > > > >    READ_ONCE should do.
> > > > > > > > > > > >
> > > > > > > > > > > > See above.
> > > > > > > > > > > >
> > > > > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Ok.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Do you mean:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > > > > interrupt handlers
> > > > > > > > > > > > >
> > > > > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > > > > >
> > > > > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > > > > spinlock or others.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > > > > for old hypervisors
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > > > > >
> > > > > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > > > > codes. They look all fine since day0.
> > > > > > > > > > > >
> > > > > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > > > > > > > in the debug build).
> > > > > > > > > > > >
> > > > > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > > > > >
> > > > > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > > > > read back for validation in many cases.
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > > > >   {
> > > > > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > > > > > > > + }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > > > >           bool failed;
> > > > > > > > > > > > > > > >           bool config_enabled;
> > > > > > > > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > > > >           struct device dev;
> > > > > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > > +/*
> > > > > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > > > > +         return true;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > > > > > > > paired
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Will fix.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Thanks
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >   /**
> > > > > > > > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > > > >    * @vdev: the device
> > > > > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > 2.25.1
> > > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>

```* Re:
2022-03-30  5:09                           ` Re: Michael S. Tsirkin
@ 2022-03-30  5:53                             ` Jason Wang
0 siblings, 0 replies; 400+ messages in thread
From: Jason Wang @ 2022-03-30  5:53 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Thomas Gleixner, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Wed, Mar 30, 2022 at 1:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Mar 30, 2022 at 10:38:06AM +0800, Jason Wang wrote:
> > On Wed, Mar 30, 2022 at 6:04 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> > > > On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > > > > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > > > > We are trying to fix the driver since at the moment it does not
> > > > > have the dev->ok flag at all.
> > > > >
> > > > > And I suspect virtio is not alone in that.
> > > > > So it would have been nice if there was a standard flag
> > > > > replacing the driver-specific dev->ok above, and ideally
> > > > > would also handle the case of an interrupt triggering
> > > > > too early by deferring the interrupt until the flag is set.
> > > > >
> > > > > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > > > > enable_irq instead of dev->ok = true, except
> > > > > - it doesn't work with affinity managed IRQs
> > > > > - it does not work with shared IRQs
> > > > >
> > > > > So using dev->ok as you propose above seems better at this point.
> > > >
> > > > Unless there is a big enough amount of drivers which could make use of a
> > > > generic mechanism for that.
> > > >
> > > > >> If any driver does this in the wrong order, then the driver is
> > > > >> broken.
> > > > >
> > > > > I agree, however:
> > > > > \$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > > > > 113
> > > > > \$ git grep -l request_irq drivers/net/|wc -l
> > > > > 397
> > > > >
> > > > > I suspect there are more drivers which in theory need the
> > > > > synchronize_irq dance but in practice do not execute it.
> > > >
> > > > That really depends on when the driver requests the interrupt, when
> > > > it actually enables the interrupt in the device itself
> > >
> > > This last point does not matter since we are talking about protecting
> > > against buggy/malicious devices. They can inject the interrupt anyway
> > > even if driver did not configure it.
> > >
> > > > and how the
> > > > interrupt service routine works.
> > > >
> > > > So just doing that grep dance does not tell much. You really have to do
> > > > a case by case analysis.
> > > >
> > > > Thanks,
> > > >
> > > >         tglx
> > >
> > >
> > > I agree. In fact, at least for network the standard approach is to
> > > request interrupts in the open call, virtio net is unusual
> > > in doing it in probe. We should consider changing that.
> > > Jason?
> >
> > This probably works only for virtio-net and it looks like not trivial
> > since we don't have a specific core API to request interrupts.
> >
> > Thanks
>
> We'll need a new API, for sure. E.g.  find vqs with no
> callback on probe, and then virtio_request_vq_callbacks separately.
>
> The existing API that specifies callbacks during find vqs
> can be used by other drivers.

Ok, I will do it.

Thanks

>
> > >
> > > --
> > > MST
> > >
>

```* Re:
2022-03-30  2:40                     ` Re: Jason Wang
@ 2022-03-30  5:14                       ` Michael S. Tsirkin
2022-03-30  5:53                         ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-30  5:14 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Wed, Mar 30, 2022 at 10:40:59AM +0800, Jason Wang wrote:
> On Tue, Mar 29, 2022 at 10:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > > > broken,
> > > > > > >
> > > > > > > So I think we might talk different issues:
> > > > > > >
> > > > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > > > guarantee this though there seems no documentation around
> > > > > > > request_irq() to say this.
> > > > > > >
> > > > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > > > using smp_wmb() before the request_irq().
> > > > > > >
> > > > > > > And even if write is ordered we still need read to be ordered to be
> > > > > > > paired with that.
> > > >
> > > > IMO it synchronizes with the CPU to which irq is
> > > > delivered. Otherwise basically all drivers would be broken,
> > > > wouldn't they be?
> > >
> > > I guess it's because most of the drivers don't care much about the
> > > buggy/malicious device.  And most of the devices may require an extra
> > > step to enable device IRQ after request_irq(). Or it's the charge of
> > > the driver to do the synchronization.
> >
> > It is true that the use-case of malicious devices is somewhat boutique.
> > But I think most drivers do want to have their hotplug routines to be
> > robust, yes.
> >
> > > > I don't know whether it's correct on all platforms, but if not
> > > > we need to fix request_irq.
> > > >
> > > > > > >
> > > > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > > > virtio.
> > > > > > >
> > > > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > > > virtio_device_ready():
> > > > > > >
> > > > > > > request_irq()
> > > > > > > driver specific setups
> > > > > > > virtio_device_ready()
> > > > > > >
> > > > > > > CPU 0 probe) request_irq()
> > > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > > CPU 0 probe) driver specific setups
> > > > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > >
> > > > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > > > That will guarantee all setup is visible to the specific IRQ,
> > > > >
> > > > > Only the interrupt after synchronize_irq() returns.
> > > >
> > > > Anything else is a buggy device though.
> > >
> > > Yes, but the goal of this patch is to prevent the possible attack from
> > > buggy(malicious) devices.
> >
> > Right. However if a driver of a *buggy* device somehow sees driver_ok =
> > false even though it's actually initialized, that is not a deal breaker
> > as that does not open us up to an attack.
> >
> > > >
> > > > > >this
> > > > > > is what it's point is.
> > > > >
> > > > > What happens if an interrupt is raised in the middle like:
> > > > >
> > > > > smp_store_release(dev->irq_soft_enabled, true)
> > > > > IRQ handler
> > > > > synchornize_irq()
> > > > >
> > > > > If we don't enforce a reading order, the IRQ handler may still see the
> > > > > uninitialized variable.
> > > > >
> > > > > Thanks
> > > >
> > > > IMHO variables should be initialized before request_irq
> > > > to a value meaning "not a valid interrupt".
> > > > Specifically driver_ok = false.
> > > > Handler in the scenario you describe will then see !driver_ok
> > > > and exit immediately.
> > >
> > > So just to make sure we're on the same page.
> > >
> > > 1) virtio_reset_device() will set the driver_ok to false;
> > > 2) virtio_device_ready() will set the driver_ok to true
> > >
> > > So for virtio drivers, it often did:
> > >
> > > 1) virtio_reset_device()
> > > 2) find_vqs() which will call request_irq()
> > > 3) other driver specific setups
> > >
> > > In virtio_device_ready(), the patch perform the following currently:
> > >
> > > smp_store_release(driver_ok, true);
> > > set_status(DRIVER_OK);
> > >
> > > smp_store_release() so we had
> > >
> > > smp_store_release(driver_ok, true);
> > > synchornize_irq()
> > > set_status(DRIVER_OK)
> > >
> > > Suppose there's a interrupt raised before the synchronize_irq(), if we do:
> > >
> > > if (READ_ONCE(driver_ok)) {
> > >       vq->callback()
> > > }
> > >
> > > It will see the driver_ok as true but how can we make sure
> > > vq->callback sees the driver specific setups (3) above?
> > >
> > > And an example is virtio_scsi():
> > >
> > > virtio_reset_device()
> > > virtscsi_probe()
> > >     virtscsi_init()
> > >         virtio_find_vqs()
> > >         ...
> > >         virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> > >     ....
> > >
> > > In virtscsi_event_done():
> > >
> > > virtscsi_event_done():
> > >     virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
> > >
> > > We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
> > >
> > > Thanks
> >
> >
> > See response by Thomas. A simple if (!dev->driver_ok) should be enough,
> > it's all under a lock.
>
> Ordered through ACQUIRE+RELEASE actually since the irq handler is not
> running under the lock.
>
> Another question, for synchronize_irq() do you prefer
>
> 1) transport specific callbacks
> or
> 2) a simple synchornize_rcu()
>
> Thanks

1) I think, and I'd add a wrapper so we can switch to 2 if we really
want to. But for now synchronizing the specific irq is obviously designed to
make any changes to memory visible to this irq. that
seems cleaner and easier to understand than memory ordering tricks
and relying on side effects of synchornize_rcu, even though
internally this all boils down to memory ordering since
memory is what's used to implement locks :).
Not to mention, synchronize_irq just scales much better from performance
POV.

> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > We use smp_store_relase()
> > > > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > > > which surprises me.
> > > > > > > > > > > >
> > > > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > > > >
> > > > > > > > > > > > > "
> > > > > > > > > > > > >
> > > > > > > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > > > > > > "
> > > > > > > > > > > > >
> > > > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > This is only for config interrupt.
> > > > > > > > > > >
> > > > > > > > > > > Ok.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > > > expensive.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > > >    * */
> > > > > > > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > > >   {
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > > > >
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > > >  * and NMI handlers.
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > > > >
> > > > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > > > >
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > You are right. So then
> > > > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > > >    READ_ONCE should do.
> > > > > > > > > > >
> > > > > > > > > > > See above.
> > > > > > > > > > >
> > > > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Ok.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Do you mean:
> > > > > > > > > > > > >
> > > > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > > > interrupt handlers
> > > > > > > > > > > >
> > > > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > > > >
> > > > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > > > spinlock or others.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > > > for old hypervisors
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > > > >
> > > > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > > > codes. They look all fine since day0.
> > > > > > > > > > >
> > > > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > > > > > > in the debug build).
> > > > > > > > > > >
> > > > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > > > >
> > > > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > > > read back for validation in many cases.
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > > >   {
> > > > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > > > > > > + }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > > >           bool failed;
> > > > > > > > > > > > > > >           bool config_enabled;
> > > > > > > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > > >           struct device dev;
> > > > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > +/*
> > > > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > > > +         return true;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > > > > > > paired
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Will fix.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thanks
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >   /**
> > > > > > > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > > >    * @vdev: the device
> > > > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > 2.25.1
> > > > > > > > > > > >
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >

```* Re:
2022-03-30  2:38                         ` Re: Jason Wang
@ 2022-03-30  5:09                           ` Michael S. Tsirkin
2022-03-30  5:53                             ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-30  5:09 UTC (permalink / raw)
To: Jason Wang
Cc: Thomas Gleixner, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Wed, Mar 30, 2022 at 10:38:06AM +0800, Jason Wang wrote:
> On Wed, Mar 30, 2022 at 6:04 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> > > On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > > > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > > > We are trying to fix the driver since at the moment it does not
> > > > have the dev->ok flag at all.
> > > >
> > > > And I suspect virtio is not alone in that.
> > > > So it would have been nice if there was a standard flag
> > > > replacing the driver-specific dev->ok above, and ideally
> > > > would also handle the case of an interrupt triggering
> > > > too early by deferring the interrupt until the flag is set.
> > > >
> > > > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > > > enable_irq instead of dev->ok = true, except
> > > > - it doesn't work with affinity managed IRQs
> > > > - it does not work with shared IRQs
> > > >
> > > > So using dev->ok as you propose above seems better at this point.
> > >
> > > Unless there is a big enough amount of drivers which could make use of a
> > > generic mechanism for that.
> > >
> > > >> If any driver does this in the wrong order, then the driver is
> > > >> broken.
> > > >
> > > > I agree, however:
> > > > \$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > > > 113
> > > > \$ git grep -l request_irq drivers/net/|wc -l
> > > > 397
> > > >
> > > > I suspect there are more drivers which in theory need the
> > > > synchronize_irq dance but in practice do not execute it.
> > >
> > > That really depends on when the driver requests the interrupt, when
> > > it actually enables the interrupt in the device itself
> >
> > This last point does not matter since we are talking about protecting
> > against buggy/malicious devices. They can inject the interrupt anyway
> > even if driver did not configure it.
> >
> > > and how the
> > > interrupt service routine works.
> > >
> > > So just doing that grep dance does not tell much. You really have to do
> > > a case by case analysis.
> > >
> > > Thanks,
> > >
> > >         tglx
> >
> >
> > I agree. In fact, at least for network the standard approach is to
> > request interrupts in the open call, virtio net is unusual
> > in doing it in probe. We should consider changing that.
> > Jason?
>
> This probably works only for virtio-net and it looks like not trivial
> since we don't have a specific core API to request interrupts.
>
> Thanks

We'll need a new API, for sure. E.g.  find vqs with no
callback on probe, and then virtio_request_vq_callbacks separately.

The existing API that specifies callbacks during find vqs
can be used by other drivers.

> >
> > --
> > MST
> >

```* Re:
2022-03-29 14:08                   ` Re: Michael S. Tsirkin
@ 2022-03-30  2:40                     ` Jason Wang
2022-03-30  5:14                       ` Re: Michael S. Tsirkin
From: Jason Wang @ 2022-03-30  2:40 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Tue, Mar 29, 2022 at 10:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > > broken,
> > > > > >
> > > > > > So I think we might talk different issues:
> > > > > >
> > > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > > guarantee this though there seems no documentation around
> > > > > > request_irq() to say this.
> > > > > >
> > > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > > using smp_wmb() before the request_irq().
> > > > > >
> > > > > > And even if write is ordered we still need read to be ordered to be
> > > > > > paired with that.
> > >
> > > IMO it synchronizes with the CPU to which irq is
> > > delivered. Otherwise basically all drivers would be broken,
> > > wouldn't they be?
> >
> > I guess it's because most of the drivers don't care much about the
> > buggy/malicious device.  And most of the devices may require an extra
> > step to enable device IRQ after request_irq(). Or it's the charge of
> > the driver to do the synchronization.
>
> It is true that the use-case of malicious devices is somewhat boutique.
> But I think most drivers do want to have their hotplug routines to be
> robust, yes.
>
> > > I don't know whether it's correct on all platforms, but if not
> > > we need to fix request_irq.
> > >
> > > > > >
> > > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > > virtio.
> > > > > >
> > > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > > virtio_device_ready():
> > > > > >
> > > > > > request_irq()
> > > > > > driver specific setups
> > > > > > virtio_device_ready()
> > > > > >
> > > > > > CPU 0 probe) request_irq()
> > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > CPU 0 probe) driver specific setups
> > > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > >
> > > > > > Thanks
> > > > >
> > > > >
> > > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > > That will guarantee all setup is visible to the specific IRQ,
> > > >
> > > > Only the interrupt after synchronize_irq() returns.
> > >
> > > Anything else is a buggy device though.
> >
> > Yes, but the goal of this patch is to prevent the possible attack from
> > buggy(malicious) devices.
>
> Right. However if a driver of a *buggy* device somehow sees driver_ok =
> false even though it's actually initialized, that is not a deal breaker
> as that does not open us up to an attack.
>
> > >
> > > > >this
> > > > > is what it's point is.
> > > >
> > > > What happens if an interrupt is raised in the middle like:
> > > >
> > > > smp_store_release(dev->irq_soft_enabled, true)
> > > > IRQ handler
> > > > synchornize_irq()
> > > >
> > > > If we don't enforce a reading order, the IRQ handler may still see the
> > > > uninitialized variable.
> > > >
> > > > Thanks
> > >
> > > IMHO variables should be initialized before request_irq
> > > to a value meaning "not a valid interrupt".
> > > Specifically driver_ok = false.
> > > Handler in the scenario you describe will then see !driver_ok
> > > and exit immediately.
> >
> > So just to make sure we're on the same page.
> >
> > 1) virtio_reset_device() will set the driver_ok to false;
> > 2) virtio_device_ready() will set the driver_ok to true
> >
> > So for virtio drivers, it often did:
> >
> > 1) virtio_reset_device()
> > 2) find_vqs() which will call request_irq()
> > 3) other driver specific setups
> >
> > In virtio_device_ready(), the patch perform the following currently:
> >
> > smp_store_release(driver_ok, true);
> > set_status(DRIVER_OK);
> >
> > smp_store_release() so we had
> >
> > smp_store_release(driver_ok, true);
> > synchornize_irq()
> > set_status(DRIVER_OK)
> >
> > Suppose there's a interrupt raised before the synchronize_irq(), if we do:
> >
> >       vq->callback()
> > }
> >
> > It will see the driver_ok as true but how can we make sure
> > vq->callback sees the driver specific setups (3) above?
> >
> > And an example is virtio_scsi():
> >
> > virtio_reset_device()
> > virtscsi_probe()
> >     virtscsi_init()
> >         virtio_find_vqs()
> >         ...
> >         virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> >     ....
> >
> > In virtscsi_event_done():
> >
> > virtscsi_event_done():
> >     virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
> >
> > We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
> >
> > Thanks
>
>
> See response by Thomas. A simple if (!dev->driver_ok) should be enough,
> it's all under a lock.

Ordered through ACQUIRE+RELEASE actually since the irq handler is not
running under the lock.

Another question, for synchronize_irq() do you prefer

1) transport specific callbacks
or
2) a simple synchornize_rcu()

Thanks

>
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > We use smp_store_relase()
> > > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > > which surprises me.
> > > > > > > > > > >
> > > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > > >
> > > > > > > > > > > > "
> > > > > > > > > > > >
> > > > > > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > > > > > "
> > > > > > > > > > > >
> > > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > This is only for config interrupt.
> > > > > > > > > >
> > > > > > > > > > Ok.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > > expensive.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > >    * */
> > > > > > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > >   {
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > > >
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > >  * and NMI handlers.
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > > >
> > > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > > >
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > You are right. So then
> > > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > >    READ_ONCE should do.
> > > > > > > > > >
> > > > > > > > > > See above.
> > > > > > > > > >
> > > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > > > > > >   }
> > > > > > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Ok.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Do you mean:
> > > > > > > > > > > >
> > > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > > interrupt handlers
> > > > > > > > > > >
> > > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > > >
> > > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > > spinlock or others.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > > for old hypervisors
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > > >
> > > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > > codes. They look all fine since day0.
> > > > > > > > > >
> > > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > > > > > in the debug build).
> > > > > > > > > >
> > > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > > >
> > > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > > read back for validation in many cases.
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > >   {
> > > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > > > > > + }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > >           bool failed;
> > > > > > > > > > > > > >           bool config_enabled;
> > > > > > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > >           struct device dev;
> > > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > +/*
> > > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > > +         return true;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > > > > > paired
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Will fix.
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >   /**
> > > > > > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > >    * @vdev: the device
> > > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > > > > > +  */
> > > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > >   }
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > 2.25.1
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>

```* Re:
2022-03-29 22:04                       ` Re: Michael S. Tsirkin
@ 2022-03-30  2:38                         ` Jason Wang
2022-03-30  5:09                           ` Re: Michael S. Tsirkin
From: Jason Wang @ 2022-03-30  2:38 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Thomas Gleixner, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Wed, Mar 30, 2022 at 6:04 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> > On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > > We are trying to fix the driver since at the moment it does not
> > > have the dev->ok flag at all.
> > >
> > > And I suspect virtio is not alone in that.
> > > So it would have been nice if there was a standard flag
> > > replacing the driver-specific dev->ok above, and ideally
> > > would also handle the case of an interrupt triggering
> > > too early by deferring the interrupt until the flag is set.
> > >
> > > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > > enable_irq instead of dev->ok = true, except
> > > - it doesn't work with affinity managed IRQs
> > > - it does not work with shared IRQs
> > >
> > > So using dev->ok as you propose above seems better at this point.
> >
> > Unless there is a big enough amount of drivers which could make use of a
> > generic mechanism for that.
> >
> > >> If any driver does this in the wrong order, then the driver is
> > >> broken.
> > >
> > > I agree, however:
> > > \$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > > 113
> > > \$ git grep -l request_irq drivers/net/|wc -l
> > > 397
> > >
> > > I suspect there are more drivers which in theory need the
> > > synchronize_irq dance but in practice do not execute it.
> >
> > That really depends on when the driver requests the interrupt, when
> > it actually enables the interrupt in the device itself
>
> This last point does not matter since we are talking about protecting
> against buggy/malicious devices. They can inject the interrupt anyway
> even if driver did not configure it.
>
> > and how the
> > interrupt service routine works.
> >
> > So just doing that grep dance does not tell much. You really have to do
> > a case by case analysis.
> >
> > Thanks,
> >
> >         tglx
>
>
> I agree. In fact, at least for network the standard approach is to
> request interrupts in the open call, virtio net is unusual
> in doing it in probe. We should consider changing that.
> Jason?

This probably works only for virtio-net and it looks like not trivial
since we don't have a specific core API to request interrupts.

Thanks

>
> --
> MST
>

```* Re:
2022-03-29 18:13                     ` Re: Thomas Gleixner
@ 2022-03-29 22:04                       ` Michael S. Tsirkin
2022-03-30  2:38                         ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-29 22:04 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > We are trying to fix the driver since at the moment it does not
> > have the dev->ok flag at all.
> >
> > And I suspect virtio is not alone in that.
> > So it would have been nice if there was a standard flag
> > replacing the driver-specific dev->ok above, and ideally
> > would also handle the case of an interrupt triggering
> > too early by deferring the interrupt until the flag is set.
> >
> > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > enable_irq instead of dev->ok = true, except
> > - it doesn't work with affinity managed IRQs
> > - it does not work with shared IRQs
> >
> > So using dev->ok as you propose above seems better at this point.
>
> Unless there is a big enough amount of drivers which could make use of a
> generic mechanism for that.
>
> >> If any driver does this in the wrong order, then the driver is
> >> broken.
> >
> > I agree, however:
> > \$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > 113
> > \$ git grep -l request_irq drivers/net/|wc -l
> > 397
> >
> > I suspect there are more drivers which in theory need the
> > synchronize_irq dance but in practice do not execute it.
>
> That really depends on when the driver requests the interrupt, when
> it actually enables the interrupt in the device itself

This last point does not matter since we are talking about protecting
against buggy/malicious devices. They can inject the interrupt anyway
even if driver did not configure it.

> and how the
> interrupt service routine works.
>
> So just doing that grep dance does not tell much. You really have to do
> a case by case analysis.
>
> Thanks,
>
>         tglx

I agree. In fact, at least for network the standard approach is to
request interrupts in the open call, virtio net is unusual
in doing it in probe. We should consider changing that.
Jason?

--
MST

```* Re:
2022-03-29 14:37                   ` Re: Michael S. Tsirkin
@ 2022-03-29 18:13                     ` Thomas Gleixner
2022-03-29 22:04                       ` Re: Michael S. Tsirkin
From: Thomas Gleixner @ 2022-03-29 18:13 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> We are trying to fix the driver since at the moment it does not
> have the dev->ok flag at all.
>
> And I suspect virtio is not alone in that.
> So it would have been nice if there was a standard flag
> replacing the driver-specific dev->ok above, and ideally
> would also handle the case of an interrupt triggering
> too early by deferring the interrupt until the flag is set.
>
> And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> enable_irq instead of dev->ok = true, except
> - it doesn't work with affinity managed IRQs
> - it does not work with shared IRQs
>
> So using dev->ok as you propose above seems better at this point.

Unless there is a big enough amount of drivers which could make use of a
generic mechanism for that.

>> If any driver does this in the wrong order, then the driver is
>> broken.
>
> I agree, however:
> \$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> 113
> \$ git grep -l request_irq drivers/net/|wc -l
> 397
>
> I suspect there are more drivers which in theory need the
> synchronize_irq dance but in practice do not execute it.

That really depends on when the driver requests the interrupt, when
it actually enables the interrupt in the device itself and how the
interrupt service routine works.

So just doing that grep dance does not tell much. You really have to do
a case by case analysis.

Thanks,

tglx

```* Re:
2022-03-29  8:35                 ` Re: Thomas Gleixner
@ 2022-03-29 14:37                   ` Michael S. Tsirkin
2022-03-29 18:13                     ` Re: Thomas Gleixner
2022-04-12  6:55                   ` Re: Michael S. Tsirkin
From: Michael S. Tsirkin @ 2022-03-29 14:37 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> On Mon, Mar 28 2022 at 06:40, Michael S. Tsirkin wrote:
> > On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> >> > > So I think we might talk different issues:
> >> > >
> >> > > 1) Whether request_irq() commits the previous setups, I think the
> >> > > answer is yes, since the spin_unlock of desc->lock (release) can
> >> > > guarantee this though there seems no documentation around
> >> > > request_irq() to say this.
> >> > >
> >> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> >> > > using smp_wmb() before the request_irq().
>
> That's a complete bogus example especially as there is not a single
> smp_rmb() which pairs with the smp_wmb().
>
> >> > > And even if write is ordered we still need read to be ordered to be
> >> > > paired with that.
> >
> > IMO it synchronizes with the CPU to which irq is
> > delivered. Otherwise basically all drivers would be broken,
> > wouldn't they be?
> > I don't know whether it's correct on all platforms, but if not
> > we need to fix request_irq.
>
> There is nothing to fix:
>
> request_irq()
>    raw_spin_lock_irq(desc->lock);       // ACQUIRE
>    ....
>    raw_spin_unlock_irq(desc->lock);     // RELEASE
>
> interrupt()
>    raw_spin_lock(desc->lock);           // ACQUIRE
>    set status to IN_PROGRESS
>    raw_spin_unlock(desc->lock);         // RELEASE
>    invoke handler()
>
> So anything which the driver set up _before_ request_irq() is visible to
> the interrupt handler. No?
> >> What happens if an interrupt is raised in the middle like:
> >>
> >> smp_store_release(dev->irq_soft_enabled, true)
> >> IRQ handler
> >> synchornize_irq()
>
> This is bogus. The obvious order of things is:
>
>     dev->ok = false;
>     request_irq();
>
>     moar_setup();
>     synchronize_irq();  // ACQUIRE + RELEASE
>     dev->ok = true;
>
> The reverse operation on teardown:
>
>     dev->ok = false;
>     synchronize_irq();  // ACQUIRE + RELEASE
>
>     teardown();
>
> So in both cases a simple check in the handler is sufficient:
>
> handler()
>     if (!dev->ok)
>     	return;

Thanks a lot for the analysis Thomas. This is more or less what I was
thinking.

>
> I'm not understanding what you folks are trying to "fix" here.

We are trying to fix the driver since at the moment it does not
have the dev->ok flag at all.

And I suspect virtio is not alone in that.
So it would have been nice if there was a standard flag
replacing the driver-specific dev->ok above, and ideally
would also handle the case of an interrupt triggering
too early by deferring the interrupt until the flag is set.

And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
enable_irq instead of dev->ok = true, except
- it doesn't work with affinity managed IRQs
- it does not work with shared IRQs

So using dev->ok as you propose above seems better at this point.

> If any
> driver does this in the wrong order, then the driver is broken.

I agree, however:
\$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
113
\$ git grep -l request_irq drivers/net/|wc -l
397

I suspect there are more drivers which in theory need the
synchronize_irq dance but in practice do not execute it.

> Sure, you can do the same with:
>
>     dev->ok = false;
>     request_irq();
>     moar_setup();
>     smp_wmb();
>     dev->ok = true;
>
> for the price of a smp_rmb() in the interrupt handler:
>
> handler()
>     if (!dev->ok)
>     	return;
>     smp_rmb();
>
> but that's only working for the setup case correctly and not for
> teardown.
>
> Thanks,
>
>         tglx

```* Re:
2022-03-29  7:12                 ` Re: Jason Wang
@ 2022-03-29 14:08                   ` Michael S. Tsirkin
2022-03-30  2:40                     ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-29 14:08 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > broken,
> > > > >
> > > > > So I think we might talk different issues:
> > > > >
> > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > guarantee this though there seems no documentation around
> > > > > request_irq() to say this.
> > > > >
> > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > using smp_wmb() before the request_irq().
> > > > >
> > > > > And even if write is ordered we still need read to be ordered to be
> > > > > paired with that.
> >
> > IMO it synchronizes with the CPU to which irq is
> > delivered. Otherwise basically all drivers would be broken,
> > wouldn't they be?
>
> I guess it's because most of the drivers don't care much about the
> buggy/malicious device.  And most of the devices may require an extra
> step to enable device IRQ after request_irq(). Or it's the charge of
> the driver to do the synchronization.

It is true that the use-case of malicious devices is somewhat boutique.
But I think most drivers do want to have their hotplug routines to be
robust, yes.

> > I don't know whether it's correct on all platforms, but if not
> > we need to fix request_irq.
> >
> > > > >
> > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > virtio.
> > > > >
> > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > virtio_device_ready():
> > > > >
> > > > > request_irq()
> > > > > driver specific setups
> > > > > virtio_device_ready()
> > > > >
> > > > > CPU 0 probe) request_irq()
> > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > CPU 0 probe) driver specific setups
> > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > >
> > > > > Thanks
> > > >
> > > >
> > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > That will guarantee all setup is visible to the specific IRQ,
> > >
> > > Only the interrupt after synchronize_irq() returns.
> >
> > Anything else is a buggy device though.
>
> Yes, but the goal of this patch is to prevent the possible attack from
> buggy(malicious) devices.

Right. However if a driver of a *buggy* device somehow sees driver_ok =
false even though it's actually initialized, that is not a deal breaker
as that does not open us up to an attack.

> >
> > > >this
> > > > is what it's point is.
> > >
> > > What happens if an interrupt is raised in the middle like:
> > >
> > > smp_store_release(dev->irq_soft_enabled, true)
> > > IRQ handler
> > > synchornize_irq()
> > >
> > > If we don't enforce a reading order, the IRQ handler may still see the
> > > uninitialized variable.
> > >
> > > Thanks
> >
> > IMHO variables should be initialized before request_irq
> > to a value meaning "not a valid interrupt".
> > Specifically driver_ok = false.
> > Handler in the scenario you describe will then see !driver_ok
> > and exit immediately.
>
> So just to make sure we're on the same page.
>
> 1) virtio_reset_device() will set the driver_ok to false;
> 2) virtio_device_ready() will set the driver_ok to true
>
> So for virtio drivers, it often did:
>
> 1) virtio_reset_device()
> 2) find_vqs() which will call request_irq()
> 3) other driver specific setups
>
> In virtio_device_ready(), the patch perform the following currently:
>
> smp_store_release(driver_ok, true);
> set_status(DRIVER_OK);
>
>
> smp_store_release(driver_ok, true);
> synchornize_irq()
> set_status(DRIVER_OK)
>
> Suppose there's a interrupt raised before the synchronize_irq(), if we do:
>
>       vq->callback()
> }
>
> It will see the driver_ok as true but how can we make sure
> vq->callback sees the driver specific setups (3) above?
>
> And an example is virtio_scsi():
>
> virtio_reset_device()
> virtscsi_probe()
>     virtscsi_init()
>         virtio_find_vqs()
>         ...
>         virtscsi_init_vq(&vscsi->event_vq, vqs[1])
>     ....
>
> In virtscsi_event_done():
>
> virtscsi_event_done():
>     virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
>
> We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
>
> Thanks

See response by Thomas. A simple if (!dev->driver_ok) should be enough,
it's all under a lock.

> >
> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > We use smp_store_relase()
> > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > which surprises me.
> > > > > > > > > >
> > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > >
> > > > > > > > > > > "
> > > > > > > > > > >
> > > > > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > > > > "
> > > > > > > > > > >
> > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > This is only for config interrupt.
> > > > > > > > >
> > > > > > > > > Ok.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > expensive.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > >
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > +
> > > > > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > >    * */
> > > > > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > >   {
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > >
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > >  * and NMI handlers.
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > >
> > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > >
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > You are right. So then
> > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > >    READ_ONCE should do.
> > > > > > > > >
> > > > > > > > > See above.
> > > > > > > > >
> > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +  */
> > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > +
> > > > > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > > > > >   }
> > > > > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Ok.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Do you mean:
> > > > > > > > > > >
> > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > interrupt handlers
> > > > > > > > > >
> > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > >
> > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > spinlock or others.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > for old hypervisors
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > >
> > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > codes. They look all fine since day0.
> > > > > > > > >
> > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > > > > in the debug build).
> > > > > > > > >
> > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > >
> > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > read back for validation in many cases.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > >   }
> > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > >   {
> > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > > > > + }
> > > > > > > > > > > > > +
> > > > > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > >           bool failed;
> > > > > > > > > > > > >           bool config_enabled;
> > > > > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > >           struct device dev;
> > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > >   }
> > > > > > > > > > > > > +/*
> > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > +         return true;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > > > > paired
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Will fix.
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > > > > +  */
> > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > >   /**
> > > > > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > >    * @vdev: the device
> > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > > > > +  */
> > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > +
> > > > > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > >   }
> > > > > > > > > > > > > --
> > > > > > > > > > > > > 2.25.1
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >

```* Re:
2022-03-28 10:40               ` Re: Michael S. Tsirkin
2022-03-29  7:12                 ` Re: Jason Wang
@ 2022-03-29  8:35                 ` Thomas Gleixner
2022-03-29 14:37                   ` Re: Michael S. Tsirkin
2022-04-12  6:55                   ` Re: Michael S. Tsirkin
1 sibling, 2 replies; 400+ messages in thread
From: Thomas Gleixner @ 2022-03-29  8:35 UTC (permalink / raw)
To: Michael S. Tsirkin, Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Peter Zijlstra,
Stefano Garzarella, Keir Fraser, Paul E. McKenney

On Mon, Mar 28 2022 at 06:40, Michael S. Tsirkin wrote:
> On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
>> > > So I think we might talk different issues:
>> > >
>> > > 1) Whether request_irq() commits the previous setups, I think the
>> > > answer is yes, since the spin_unlock of desc->lock (release) can
>> > > guarantee this though there seems no documentation around
>> > > request_irq() to say this.
>> > >
>> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
>> > > using smp_wmb() before the request_irq().

That's a complete bogus example especially as there is not a single
smp_rmb() which pairs with the smp_wmb().

>> > > And even if write is ordered we still need read to be ordered to be
>> > > paired with that.
>
> IMO it synchronizes with the CPU to which irq is
> delivered. Otherwise basically all drivers would be broken,
> wouldn't they be?
> I don't know whether it's correct on all platforms, but if not
> we need to fix request_irq.

There is nothing to fix:

request_irq()
raw_spin_lock_irq(desc->lock);       // ACQUIRE
....
raw_spin_unlock_irq(desc->lock);     // RELEASE

interrupt()
raw_spin_lock(desc->lock);           // ACQUIRE
set status to IN_PROGRESS
raw_spin_unlock(desc->lock);         // RELEASE
invoke handler()

So anything which the driver set up _before_ request_irq() is visible to
the interrupt handler. No?

>> What happens if an interrupt is raised in the middle like:
>>
>> smp_store_release(dev->irq_soft_enabled, true)
>> IRQ handler
>> synchornize_irq()

This is bogus. The obvious order of things is:

dev->ok = false;
request_irq();

moar_setup();
synchronize_irq();  // ACQUIRE + RELEASE
dev->ok = true;

The reverse operation on teardown:

dev->ok = false;
synchronize_irq();  // ACQUIRE + RELEASE

teardown();

So in both cases a simple check in the handler is sufficient:

handler()
if (!dev->ok)
return;

I'm not understanding what you folks are trying to "fix" here. If any
driver does this in the wrong order, then the driver is broken.

Sure, you can do the same with:

dev->ok = false;
request_irq();
moar_setup();
smp_wmb();
dev->ok = true;

for the price of a smp_rmb() in the interrupt handler:

handler()
if (!dev->ok)
return;
smp_rmb();

but that's only working for the setup case correctly and not for
teardown.

Thanks,

tglx

```* Re:
2022-03-28 10:40               ` Re: Michael S. Tsirkin
@ 2022-03-29  7:12                 ` Jason Wang
2022-03-29 14:08                   ` Re: Michael S. Tsirkin
2022-03-29  8:35                 ` Re: Thomas Gleixner
From: Jason Wang @ 2022-03-29  7:12 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Mon, Mar 28, 2022 at 6:41 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> > On Mon, Mar 28, 2022 at 1:59 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> > > > On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > > > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > Bcc:
> > > > > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > > > > Reply-To:
> > > > > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > > > > >
> > > > > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > > > > >
> > > > > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > > > > >
> > > > > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > > > >     that is used by some device such as virtio-blk
> > > > > > > > > > > > 2) done only for PCI transport
> > > > > > > > > > > >
> > > > > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > > > > but the cost should be acceptable.
> > > > > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > > > > >
> > > > > > > > > > We do something like the following previously:
> > > > > > > > > >
> > > > > > > > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > > > >                 return IRQ_NONE;
> > > > > > > > > >
> > > > > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > > > > the driver.
> > > > > > > > >
> > > > > > > > > I don't think so - if you sync after setting the value then
> > > > > > > > > you are guaranteed that any handler running afterwards
> > > > > > > > > will see the new value.
> > > > > > > >
> > > > > > > > The problem is not disabled but the enable.
> > > > > > >
> > > > > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > > > > imo.
> > > > > >
> > > > > > It's the interrupt raised before setting irq_soft_enabled to true:
> > > > > >
> > > > > > CPU 0 probe) driver specific setup (not commited)
> > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > CPU 0 probe) set irq_soft_enabled to true
> > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Yea, it hurts if you do it.  So do not do it then ;).
> > > > >
> > > > > irq_soft_enabled (I think driver_ok or status is a better name)
> > > >
> > > > I can change it to driver_ok.
> > > >
> > > > > should be initialized to false *before* irq is requested.
> > > > >
> > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > broken,
> > > >
> > > > So I think we might talk different issues:
> > > >
> > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > guarantee this though there seems no documentation around
> > > > request_irq() to say this.
> > > >
> > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > using smp_wmb() before the request_irq().
> > > >
> > > > And even if write is ordered we still need read to be ordered to be
> > > > paired with that.
>
> IMO it synchronizes with the CPU to which irq is
> delivered. Otherwise basically all drivers would be broken,
> wouldn't they be?

I guess it's because most of the drivers don't care much about the
buggy/malicious device.  And most of the devices may require an extra
step to enable device IRQ after request_irq(). Or it's the charge of
the driver to do the synchronization.

> I don't know whether it's correct on all platforms, but if not
> we need to fix request_irq.
>
> > > >
> > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > virtio.
> > > >
> > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > >
> > > > request_irq()
> > > > driver specific setups
> > > >
> > > > CPU 0 probe) request_irq()
> > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > CPU 0 probe) driver specific setups
> > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > CPU 1 IRQ handler) use the uninitialized variable
> > > >
> > > > Thanks
> > >
> > >
> > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > That will guarantee all setup is visible to the specific IRQ,
> >
> > Only the interrupt after synchronize_irq() returns.
>
> Anything else is a buggy device though.

Yes, but the goal of this patch is to prevent the possible attack from
buggy(malicious) devices.

>
> > >this
> > > is what it's point is.
> >
> > What happens if an interrupt is raised in the middle like:
> >
> > smp_store_release(dev->irq_soft_enabled, true)
> > IRQ handler
> > synchornize_irq()
> >
> > If we don't enforce a reading order, the IRQ handler may still see the
> > uninitialized variable.
> >
> > Thanks
>
> IMHO variables should be initialized before request_irq
> to a value meaning "not a valid interrupt".
> Specifically driver_ok = false.
> Handler in the scenario you describe will then see !driver_ok
> and exit immediately.

So just to make sure we're on the same page.

1) virtio_reset_device() will set the driver_ok to false;
2) virtio_device_ready() will set the driver_ok to true

So for virtio drivers, it often did:

1) virtio_reset_device()
2) find_vqs() which will call request_irq()
3) other driver specific setups

In virtio_device_ready(), the patch perform the following currently:

smp_store_release(driver_ok, true);
set_status(DRIVER_OK);

smp_store_release(driver_ok, true);
synchornize_irq()
set_status(DRIVER_OK)

Suppose there's a interrupt raised before the synchronize_irq(), if we do:

vq->callback()
}

It will see the driver_ok as true but how can we make sure
vq->callback sees the driver specific setups (3) above?

And an example is virtio_scsi():

virtio_reset_device()
virtscsi_probe()
virtscsi_init()
virtio_find_vqs()
...
virtscsi_init_vq(&vscsi->event_vq, vqs[1])
....

In virtscsi_event_done():

virtscsi_event_done():
virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);

We need to make sure the even_done reads driver_ok before read vscsi->event_vq.

Thanks

>
>
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > > We use smp_store_relase()
> > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > which surprises me.
> > > > > > > > >
> > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > >
> > > > > > > > > > "
> > > > > > > > > >
> > > > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > > > "
> > > > > > > > > >
> > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > This is only for config interrupt.
> > > > > > > >
> > > > > > > > Ok.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > expensive.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > +
> > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > +
> > > > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > >    * */
> > > > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > >   {
> > > > > > > > > > > > + /*
> > > > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > though it's most likely is ...
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > >
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > >  * and NMI handlers.
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > >
> > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > >
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > You are right. So then
> > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > >    READ_ONCE should do.
> > > > > > > >
> > > > > > > > See above.
> > > > > > > >
> > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > >
> > > > > > > >
> > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +  */
> > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > +
> > > > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > > > >   }
> > > > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Ok.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > +
> > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Do you mean:
> > > > > > > > > >
> > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > interrupt handlers
> > > > > > > > >
> > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > users at the moment, even less than probe.
> > > > > > > >
> > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > spinlock or others.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > for old hypervisors
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > >
> > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > codes. They look all fine since day0.
> > > > > > > >
> > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > > > in the debug build).
> > > > > > > >
> > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > speeding boot up a tiny bit.
> > > > > > > >
> > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > read back for validation in many cases.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > >   }
> > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > >   {
> > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > > > + }
> > > > > > > > > > > > +
> > > > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > >           bool failed;
> > > > > > > > > > > >           bool config_enabled;
> > > > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > >           struct device dev;
> > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > >   }
> > > > > > > > > > > > +/*
> > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > + */
> > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > +{
> > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > +         return true;
> > > > > > > > > > > > +
> > > > > > > > > > > > + /*
> > > > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > > > paired
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Will fix.
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > > > +  */
> > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > >   /**
> > > > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > >    * @vdev: the device
> > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > > > + /*
> > > > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > > > +  */
> > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > +
> > > > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > >   }
> > > > > > > > > > > > --
> > > > > > > > > > > > 2.25.1
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>

```* Re:
2022-03-28  6:18             ` Re: Jason Wang
@ 2022-03-28 10:40               ` Michael S. Tsirkin
2022-03-29  7:12                 ` Re: Jason Wang
2022-03-29  8:35                 ` Re: Thomas Gleixner
0 siblings, 2 replies; 400+ messages in thread
From: Michael S. Tsirkin @ 2022-03-28 10:40 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> On Mon, Mar 28, 2022 at 1:59 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> > > On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > Bcc:
> > > > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > > > Reply-To:
> > > > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > > > >
> > > > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > > > >
> > > > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > > > >
> > > > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > > >     that is used by some device such as virtio-blk
> > > > > > > > > > > 2) done only for PCI transport
> > > > > > > > > > >
> > > > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > > > but the cost should be acceptable.
> > > > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > > > >
> > > > > > > > > We do something like the following previously:
> > > > > > > > >
> > > > > > > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > > >                 return IRQ_NONE;
> > > > > > > > >
> > > > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > > > the driver.
> > > > > > > >
> > > > > > > > I don't think so - if you sync after setting the value then
> > > > > > > > you are guaranteed that any handler running afterwards
> > > > > > > > will see the new value.
> > > > > > >
> > > > > > > The problem is not disabled but the enable.
> > > > > >
> > > > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > > > imo.
> > > > >
> > > > > It's the interrupt raised before setting irq_soft_enabled to true:
> > > > >
> > > > > CPU 0 probe) driver specific setup (not commited)
> > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > CPU 0 probe) set irq_soft_enabled to true
> > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > >
> > > > > Thanks
> > > >
> > > > Yea, it hurts if you do it.  So do not do it then ;).
> > > >
> > > > irq_soft_enabled (I think driver_ok or status is a better name)
> > >
> > > I can change it to driver_ok.
> > >
> > > > should be initialized to false *before* irq is requested.
> > > >
> > > > And requesting irq commits all memory otherwise all drivers would be
> > > > broken,
> > >
> > > So I think we might talk different issues:
> > >
> > > 1) Whether request_irq() commits the previous setups, I think the
> > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > guarantee this though there seems no documentation around
> > > request_irq() to say this.
> > >
> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > using smp_wmb() before the request_irq().
> > >
> > > And even if write is ordered we still need read to be ordered to be
> > > paired with that.

IMO it synchronizes with the CPU to which irq is
delivered. Otherwise basically all drivers would be broken,
wouldn't they be?
I don't know whether it's correct on all platforms, but if not
we need to fix request_irq.

> > >
> > > > if it doesn't it just needs to be fixed, not worked around in
> > > > virtio.
> > >
> > > 2) virtio drivers might do a lot of setups between request_irq() and
> > >
> > > request_irq()
> > > driver specific setups
> > >
> > > CPU 0 probe) request_irq()
> > > CPU 1 IRQ handler) read the uninitialized variable
> > > CPU 0 probe) driver specific setups
> > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > CPU 1 IRQ handler) use the uninitialized variable
> > >
> > > Thanks
> >
> >
> > As I said, virtio_device_ready needs to do synchronize_irq.
> > That will guarantee all setup is visible to the specific IRQ,
>
> Only the interrupt after synchronize_irq() returns.

Anything else is a buggy device though.

> >this
> > is what it's point is.
>
> What happens if an interrupt is raised in the middle like:
>
> smp_store_release(dev->irq_soft_enabled, true)
> IRQ handler
> synchornize_irq()
>
> If we don't enforce a reading order, the IRQ handler may still see the
> uninitialized variable.
>
> Thanks

IMHO variables should be initialized before request_irq
to a value meaning "not a valid interrupt".
Specifically driver_ok = false.
Handler in the scenario you describe will then see !driver_ok
and exit immediately.

> >
> >
> > > >
> > > >
> > > > > >
> > > > > > > We use smp_store_relase()
> > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > >
> > > > > > > >
> > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > which surprises me.
> > > > > > > >
> > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > are any buffers in any queues?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > >
> > > > > > > > > "
> > > > > > > > >
> > > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > > "
> > > > > > > > >
> > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > >
> > > > > > > >
> > > > > > > > This is only for config interrupt.
> > > > > > >
> > > > > > > Ok.
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > expensive.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > +
> > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > +
> > > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > >    * */
> > > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > >   {
> > > > > > > > > > > + /*
> > > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > though it's most likely is ...
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > >
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > >  * and NMI handlers.
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > >
> > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > >
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > irq_soft_enabled as false.
> > > > > > > > >
> > > > > > > >
> > > > > > > > You are right. So then
> > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > >    READ_ONCE should do.
> > > > > > >
> > > > > > > See above.
> > > > > > >
> > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > >
> > > > > > >
> > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +  */
> > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > +
> > > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > > >   }
> > > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Ok.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > +
> > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Do you mean:
> > > > > > > > >
> > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > interrupt handlers
> > > > > > > >
> > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > users at the moment, even less than probe.
> > > > > > >
> > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > spinlock or others.
> > > > > > >
> > > > > > > >
> > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > for old hypervisors
> > > > > > > >
> > > > > > > >
> > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > >
> > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > codes. They look all fine since day0.
> > > > > > >
> > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > > in the debug build).
> > > > > > >
> > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > device. I think it can be done but in a separate series.
> > > > > > >
> > > > > > > >
> > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > speeding boot up a tiny bit.
> > > > > > >
> > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > read back for validation in many cases.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > >   }
> > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > >   {
> > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > > + }
> > > > > > > > > > > +
> > > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > >           bool failed;
> > > > > > > > > > >           bool config_enabled;
> > > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > >           struct device dev;
> > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > >   }
> > > > > > > > > > > +/*
> > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > + */
> > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > +{
> > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > +         return true;
> > > > > > > > > > > +
> > > > > > > > > > > + /*
> > > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > > paired
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Will fix.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > > +  */
> > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >   /**
> > > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > >    * @vdev: the device
> > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > > + /*
> > > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > > +  */
> > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > +
> > > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > >   }
> > > > > > > > > > > --
> > > > > > > > > > > 2.25.1
> > > > > > > >
> > > > > >
> > > >
> >

```* Re:
2022-03-28  5:59           ` Re: Michael S. Tsirkin
@ 2022-03-28  6:18             ` Jason Wang
2022-03-28 10:40               ` Re: Michael S. Tsirkin
From: Jason Wang @ 2022-03-28  6:18 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Mon, Mar 28, 2022 at 1:59 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> > On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > Bcc:
> > > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > > Reply-To:
> > > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > > >
> > > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > > >
> > > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > > >
> > > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > >     that is used by some device such as virtio-blk
> > > > > > > > > > 2) done only for PCI transport
> > > > > > > > > >
> > > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > > but the cost should be acceptable.
> > > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > > >
> > > > > > > >
> > > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > > >
> > > > > > > > We do something like the following previously:
> > > > > > > >
> > > > > > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > >                 return IRQ_NONE;
> > > > > > > >
> > > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > > the driver.
> > > > > > >
> > > > > > > I don't think so - if you sync after setting the value then
> > > > > > > you are guaranteed that any handler running afterwards
> > > > > > > will see the new value.
> > > > > >
> > > > > > The problem is not disabled but the enable.
> > > > >
> > > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > > imo.
> > > >
> > > > It's the interrupt raised before setting irq_soft_enabled to true:
> > > >
> > > > CPU 0 probe) driver specific setup (not commited)
> > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > CPU 0 probe) set irq_soft_enabled to true
> > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > CPU 1 IRQ handler) use the uninitialized variable
> > > >
> > > > Thanks
> > >
> > > Yea, it hurts if you do it.  So do not do it then ;).
> > >
> > > irq_soft_enabled (I think driver_ok or status is a better name)
> >
> > I can change it to driver_ok.
> >
> > > should be initialized to false *before* irq is requested.
> > >
> > > And requesting irq commits all memory otherwise all drivers would be
> > > broken,
> >
> > So I think we might talk different issues:
> >
> > 1) Whether request_irq() commits the previous setups, I think the
> > answer is yes, since the spin_unlock of desc->lock (release) can
> > guarantee this though there seems no documentation around
> > request_irq() to say this.
> >
> > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > using smp_wmb() before the request_irq().
> >
> > And even if write is ordered we still need read to be ordered to be
> > paired with that.
> >
> > > if it doesn't it just needs to be fixed, not worked around in
> > > virtio.
> >
> > 2) virtio drivers might do a lot of setups between request_irq() and
> >
> > request_irq()
> > driver specific setups
> >
> > CPU 0 probe) request_irq()
> > CPU 1 IRQ handler) read the uninitialized variable
> > CPU 0 probe) driver specific setups
> > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > CPU 1 IRQ handler) read irq_soft_enable as true
> > CPU 1 IRQ handler) use the uninitialized variable
> >
> > Thanks
>
>
> As I said, virtio_device_ready needs to do synchronize_irq.
> That will guarantee all setup is visible to the specific IRQ,

Only the interrupt after synchronize_irq() returns.

>this
> is what it's point is.

What happens if an interrupt is raised in the middle like:

smp_store_release(dev->irq_soft_enabled, true)
IRQ handler
synchornize_irq()

If we don't enforce a reading order, the IRQ handler may still see the
uninitialized variable.

Thanks

>
>
> > >
> > >
> > > > >
> > > > > > We use smp_store_relase()
> > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > >
> > > > > > >
> > > > > > > which surprises me.
> > > > > > >
> > > > > > > CC Paul to help make sure I'm right.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > hardening is disabled by default.
> > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > are any buffers in any queues?
> > > > > > > >
> > > > > > > >
> > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > >
> > > > > > > > "
> > > > > > > >
> > > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > > "
> > > > > > > >
> > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > >
> > > > > > >
> > > > > > > This is only for config interrupt.
> > > > > >
> > > > > > Ok.
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > expensive.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > >   #include <linux/of.h>
> > > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > +
> > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > +
> > > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > >    * */
> > > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > >   {
> > > > > > > > > > + /*
> > > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > though it's most likely is ...
> > > > > > > >
> > > > > > > >
> > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > >
> > > > > > > > """
> > > > > > > >
> > > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > > >  * and NMI handlers.
> > > > > > > > """
> > > > > > > >
> > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > >
> > > > > > > > And it has the comment for explain the barrier:
> > > > > > > >
> > > > > > > > """
> > > > > > > >
> > > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > > """
> > > > > > > >
> > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > irq_soft_enabled as false.
> > > > > > > >
> > > > > > >
> > > > > > > You are right. So then
> > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > >    READ_ONCE should do.
> > > > > >
> > > > > > See above.
> > > > > >
> > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > >
> > > > > >
> > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > +  */
> > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > +
> > > > > > > > > >           dev->config->reset(dev);
> > > > > > > > > >   }
> > > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > >
> > > > > > > >
> > > > > > > > Ok.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > > >           dev->config_enabled = false;
> > > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > +
> > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > >
> > > > > > > >
> > > > > > > > Do you mean:
> > > > > > > >
> > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > interrupt handlers
> > > > > > >
> > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > users at the moment, even less than probe.
> > > > > >
> > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > spinlock or others.
> > > > > >
> > > > > > >
> > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > for old hypervisors
> > > > > > >
> > > > > > >
> > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > >
> > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > codes. They look all fine since day0.
> > > > > >
> > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > > in the debug build).
> > > > > >
> > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > device. I think it can be done but in a separate series.
> > > > > >
> > > > > > >
> > > > > > > And going down from there, how about we cache status in the
> > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > speeding boot up a tiny bit.
> > > > > >
> > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > read back for validation in many cases.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > >   }
> > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > >   {
> > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > > + }
> > > > > > > > > > +
> > > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > > >    * @dev: underlying device.
> > > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > >           bool failed;
> > > > > > > > > >           bool config_enabled;
> > > > > > > > > >           bool config_change_pending;
> > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > >           spinlock_t config_lock;
> > > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > >           struct device dev;
> > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > > >   }
> > > > > > > > > > +/*
> > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > + * @vdev: the device
> > > > > > > > > > + */
> > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > +{
> > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > +         return true;
> > > > > > > > > > +
> > > > > > > > > > + /*
> > > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > > paired
> > > > > > > >
> > > > > > > >
> > > > > > > > Will fix.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > > +  */
> > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >   /**
> > > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > >    * @vdev: the device
> > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > > + /*
> > > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > > +  */
> > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > +
> > > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > >   }
> > > > > > > > > > --
> > > > > > > > > > 2.25.1
> > > > > > >
> > > > >
> > >
>

```* Re:
2022-03-28  4:56         ` Re: Jason Wang
@ 2022-03-28  5:59           ` Michael S. Tsirkin
2022-03-28  6:18             ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-28  5:59 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > Bcc:
> > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > Reply-To:
> > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > >
> > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > >
> > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > >
> > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > >     that is used by some device such as virtio-blk
> > > > > > > > > 2) done only for PCI transport
> > > > > > > > >
> > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > but the cost should be acceptable.
> > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > >
> > > > > > >
> > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > >
> > > > > > > We do something like the following previously:
> > > > > > >
> > > > > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > >                 return IRQ_NONE;
> > > > > > >
> > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > the driver.
> > > > > >
> > > > > > I don't think so - if you sync after setting the value then
> > > > > > you are guaranteed that any handler running afterwards
> > > > > > will see the new value.
> > > > >
> > > > > The problem is not disabled but the enable.
> > > >
> > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > imo.
> > >
> > > It's the interrupt raised before setting irq_soft_enabled to true:
> > >
> > > CPU 0 probe) driver specific setup (not commited)
> > > CPU 1 IRQ handler) read the uninitialized variable
> > > CPU 0 probe) set irq_soft_enabled to true
> > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > CPU 1 IRQ handler) use the uninitialized variable
> > >
> > > Thanks
> >
> > Yea, it hurts if you do it.  So do not do it then ;).
> >
> > irq_soft_enabled (I think driver_ok or status is a better name)
>
> I can change it to driver_ok.
>
> > should be initialized to false *before* irq is requested.
> >
> > And requesting irq commits all memory otherwise all drivers would be
> > broken,
>
> So I think we might talk different issues:
>
> 1) Whether request_irq() commits the previous setups, I think the
> answer is yes, since the spin_unlock of desc->lock (release) can
> guarantee this though there seems no documentation around
> request_irq() to say this.
>
> And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> using smp_wmb() before the request_irq().
>
> And even if write is ordered we still need read to be ordered to be
> paired with that.
>
> > if it doesn't it just needs to be fixed, not worked around in
> > virtio.
>
> 2) virtio drivers might do a lot of setups between request_irq() and
>
> request_irq()
> driver specific setups
>
> CPU 0 probe) request_irq()
> CPU 1 IRQ handler) read the uninitialized variable
> CPU 0 probe) driver specific setups
> CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> CPU 1 IRQ handler) read irq_soft_enable as true
> CPU 1 IRQ handler) use the uninitialized variable
>
> Thanks

As I said, virtio_device_ready needs to do synchronize_irq.
That will guarantee all setup is visible to the specific IRQ, this
is what it's point is.

> >
> >
> > > >
> > > > > We use smp_store_relase()
> > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > >
> > > > > >
> > > > > > which surprises me.
> > > > > >
> > > > > > CC Paul to help make sure I'm right.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > hardening is disabled by default.
> > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > are any buffers in any queues?
> > > > > > >
> > > > > > >
> > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > >
> > > > > > > "
> > > > > > >
> > > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > > >     the callback could race with driver-specific initialization.
> > > > > > > "
> > > > > > >
> > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > >
> > > > > >
> > > > > > This is only for config interrupt.
> > > > >
> > > > > Ok.
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > expensive.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > >   #include <linux/of.h>
> > > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > +
> > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > +
> > > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > >    * */
> > > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > >   {
> > > > > > > > > + /*
> > > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > though it's most likely is ...
> > > > > > >
> > > > > > >
> > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > >
> > > > > > > """
> > > > > > >
> > > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > > >  * and NMI handlers.
> > > > > > > """
> > > > > > >
> > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > >
> > > > > > > And it has the comment for explain the barrier:
> > > > > > >
> > > > > > > """
> > > > > > >
> > > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > > """
> > > > > > >
> > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > irq_soft_enabled as false.
> > > > > > >
> > > > > >
> > > > > > You are right. So then
> > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > >    READ_ONCE should do.
> > > > >
> > > > > See above.
> > > > >
> > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > >
> > > > >
> > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > +  */
> > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > + synchronize_rcu();
> > > > > > > > > +
> > > > > > > > >           dev->config->reset(dev);
> > > > > > > > >   }
> > > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > >
> > > > > > >
> > > > > > > Ok.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > > >           dev->config_enabled = false;
> > > > > > > > >           dev->config_change_pending = false;
> > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > +
> > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > devices. this flag defeats the purpose.
> > > > > > >
> > > > > > >
> > > > > > > Do you mean:
> > > > > > >
> > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > interrupt handlers
> > > > > >
> > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > users at the moment, even less than probe.
> > > > >
> > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > spinlock or others.
> > > > >
> > > > > >
> > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > for old hypervisors
> > > > > >
> > > > > >
> > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > >
> > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > codes. They look all fine since day0.
> > > > >
> > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > > in the debug build).
> > > > >
> > > > > This looks like a hardening of the driver in the core instead of the
> > > > > device. I think it can be done but in a separate series.
> > > > >
> > > > > >
> > > > > > And going down from there, how about we cache status in the
> > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > speeding boot up a tiny bit.
> > > > >
> > > > > I don't fully understand here, actually spec requires status to be
> > > > > read back for validation in many cases.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > >   }
> > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > >   {
> > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > +         return IRQ_NONE;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > >           if (!more_used(vq)) {
> > > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > >                   return IRQ_NONE;
> > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > > >    * @dev: underlying device.
> > > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > >           bool failed;
> > > > > > > > >           bool config_enabled;
> > > > > > > > >           bool config_change_pending;
> > > > > > > > > + bool irq_soft_check;
> > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > >           spinlock_t config_lock;
> > > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > >           struct device dev;
> > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > > >   }
> > > > > > > > > +/*
> > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > + * @vdev: the device
> > > > > > > > > + */
> > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > +{
> > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > +         return true;
> > > > > > > > > +
> > > > > > > > > + /*
> > > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > > paired
> > > > > > >
> > > > > > >
> > > > > > > Will fix.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > +  * virtio_reset_device().
> > > > > > > > > +  */
> > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >   /**
> > > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > >    * @vdev: the device
> > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > > + /*
> > > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > > +  */
> > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > +
> > > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > >   }
> > > > > > > > > --
> > > > > > > > > 2.25.1
> > > > > >
> > > >
> >

```* Re:
2022-03-25 10:09       ` Re: Michael S. Tsirkin
@ 2022-03-28  4:56         ` Jason Wang
2022-03-28  5:59           ` Re: Michael S. Tsirkin
From: Jason Wang @ 2022-03-28  4:56 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > Bcc:
> > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > Reply-To:
> > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > >
> > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > >
> > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > >
> > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > >     that is used by some device such as virtio-blk
> > > > > > > > 2) done only for PCI transport
> > > > > > > >
> > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > but the cost should be acceptable.
> > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > >
> > > > > >
> > > > > > Even if we allow the transport driver to synchornize through
> > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > >
> > > > > > We do something like the following previously:
> > > > > >
> > > > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > >                 return IRQ_NONE;
> > > > > >
> > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > the driver.
> > > > >
> > > > > I don't think so - if you sync after setting the value then
> > > > > you are guaranteed that any handler running afterwards
> > > > > will see the new value.
> > > >
> > > > The problem is not disabled but the enable.
> > >
> > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > imo.
> >
> > It's the interrupt raised before setting irq_soft_enabled to true:
> >
> > CPU 0 probe) driver specific setup (not commited)
> > CPU 1 IRQ handler) read the uninitialized variable
> > CPU 0 probe) set irq_soft_enabled to true
> > CPU 1 IRQ handler) read irq_soft_enable as true
> > CPU 1 IRQ handler) use the uninitialized variable
> >
> > Thanks
>
> Yea, it hurts if you do it.  So do not do it then ;).
>
> irq_soft_enabled (I think driver_ok or status is a better name)

I can change it to driver_ok.

> should be initialized to false *before* irq is requested.
>
> And requesting irq commits all memory otherwise all drivers would be
> broken,

So I think we might talk different issues:

1) Whether request_irq() commits the previous setups, I think the
answer is yes, since the spin_unlock of desc->lock (release) can
guarantee this though there seems no documentation around
request_irq() to say this.

And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
using smp_wmb() before the request_irq().

And even if write is ordered we still need read to be ordered to be
paired with that.

> if it doesn't it just needs to be fixed, not worked around in
> virtio.

2) virtio drivers might do a lot of setups between request_irq() and

request_irq()
driver specific setups

CPU 0 probe) request_irq()
CPU 1 IRQ handler) read the uninitialized variable
CPU 0 probe) driver specific setups
CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
CPU 1 IRQ handler) read irq_soft_enable as true
CPU 1 IRQ handler) use the uninitialized variable

Thanks

>
>
> > >
> > > > We use smp_store_relase()
> > > > to make sure the driver commits the setup before enabling the irq. It
> > > > means the read needs to be ordered as well in vring_interrupt().
> > > >
> > > > >
> > > > > which surprises me.
> > > > >
> > > > > CC Paul to help make sure I'm right.
> > > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > hardening is disabled by default.
> > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > are any buffers in any queues?
> > > > > >
> > > > > >
> > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > >
> > > > > > "
> > > > > >
> > > > > >     This change will also benefit old hypervisors (before 2009)
> > > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > > >     the callback could race with driver-specific initialization.
> > > > > > "
> > > > > >
> > > > > > If this is only for config interrupt, I can remove the above log.
> > > > >
> > > > >
> > > > > This is only for config interrupt.
> > > >
> > > > Ok.
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > expensive.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > >
> > > > > > > > ---
> > > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > >   #include <linux/of.h>
> > > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > > +static bool irq_hardening = false;
> > > > > > > > +
> > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > > +
> > > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > >    * */
> > > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > > >   {
> > > > > > > > + /*
> > > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > > +  * interrupt for this line arriving after
> > > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > +  * irq_soft_enabled == false.
> > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > though it's most likely is ...
> > > > > >
> > > > > >
> > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > >
> > > > > > """
> > > > > >
> > > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > > >  * and NMI handlers.
> > > > > > """
> > > > > >
> > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > >
> > > > > > And it has the comment for explain the barrier:
> > > > > >
> > > > > > """
> > > > > >
> > > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > > """
> > > > > >
> > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > irq_soft_enabled as false.
> > > > > >
> > > > >
> > > > > You are right. So then
> > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > >    READ_ONCE should do.
> > > >
> > > > See above.
> > > >
> > > > > 2. isn't synchronize_irq also doing the same thing?
> > > >
> > > >
> > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > > +  */
> > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > + synchronize_rcu();
> > > > > > > > +
> > > > > > > >           dev->config->reset(dev);
> > > > > > > >   }
> > > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > let's not add useless overhead to the boot sequence.
> > > > > >
> > > > > >
> > > > > > Ok.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > > >           dev->config_enabled = false;
> > > > > > > >           dev->config_change_pending = false;
> > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > +
> > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > devices. this flag defeats the purpose.
> > > > > >
> > > > > >
> > > > > > Do you mean:
> > > > > >
> > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > interrupt handlers
> > > > >
> > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > users at the moment, even less than probe.
> > > >
> > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > spinlock or others.
> > > >
> > > > >
> > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > for old hypervisors
> > > > >
> > > > >
> > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > >
> > > > Probably not, we have devices that accept random inputs from outside,
> > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > codes. They look all fine since day0.
> > > >
> > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > > in the debug build).
> > > >
> > > > This looks like a hardening of the driver in the core instead of the
> > > > device. I think it can be done but in a separate series.
> > > >
> > > > >
> > > > > And going down from there, how about we cache status in the
> > > > > device? Then we don't need to keep re-reading it every time,
> > > > > speeding boot up a tiny bit.
> > > >
> > > > I don't fully understand here, actually spec requires status to be
> > > > read back for validation in many cases.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > >   }
> > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > >   {
> > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > +         return IRQ_NONE;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > >           if (!more_used(vq)) {
> > > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > >                   return IRQ_NONE;
> > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > > >    * @dev: underlying device.
> > > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > >           bool failed;
> > > > > > > >           bool config_enabled;
> > > > > > > >           bool config_change_pending;
> > > > > > > > + bool irq_soft_check;
> > > > > > > > + bool irq_soft_enabled;
> > > > > > > >           spinlock_t config_lock;
> > > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > >           struct device dev;
> > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > > >   }
> > > > > > > > +/*
> > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > + * @vdev: the device
> > > > > > > > + */
> > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > +{
> > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > +         return true;
> > > > > > > > +
> > > > > > > > + /*
> > > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > > paired
> > > > > >
> > > > > >
> > > > > > Will fix.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > +  * virtio_reset_device().
> > > > > > > > +  */
> > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >   /**
> > > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > >    * @vdev: the device
> > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > >           if (dev->config->enable_cbs)
> > > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > > + /*
> > > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > > +  */
> > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > +
> > > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > >   }
> > > > > > > > --
> > > > > > > > 2.25.1
> > > > >
> > >
>

```* Re:
2022-03-25  9:20     ` Re: Jason Wang
@ 2022-03-25 10:09       ` Michael S. Tsirkin
2022-03-28  4:56         ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-25 10:09 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > Bcc:
> > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > >
> > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > >
> > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > >
> > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > >     that is used by some device such as virtio-blk
> > > > > > > 2) done only for PCI transport
> > > > > > >
> > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > but the cost should be acceptable.
> > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > >
> > > > >
> > > > > Even if we allow the transport driver to synchornize through
> > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > >
> > > > > We do something like the following previously:
> > > > >
> > > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > >                 return IRQ_NONE;
> > > > >
> > > > > But it looks like a bug since speculative read can be done before the check
> > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > the driver.
> > > >
> > > > I don't think so - if you sync after setting the value then
> > > > you are guaranteed that any handler running afterwards
> > > > will see the new value.
> > >
> > > The problem is not disabled but the enable.
> >
> > So a misbehaving device can lose interrupts? That's not a problem at all
> > imo.
>
> It's the interrupt raised before setting irq_soft_enabled to true:
>
> CPU 0 probe) driver specific setup (not commited)
> CPU 1 IRQ handler) read the uninitialized variable
> CPU 0 probe) set irq_soft_enabled to true
> CPU 1 IRQ handler) read irq_soft_enable as true
> CPU 1 IRQ handler) use the uninitialized variable
>
> Thanks

Yea, it hurts if you do it.  So do not do it then ;).

irq_soft_enabled (I think driver_ok or status is a better name)
should be initialized to false *before* irq is requested.

And requesting irq commits all memory otherwise all drivers would be
broken, if it doesn't it just needs to be fixed, not worked around in
virtio.

> >
> > > We use smp_store_relase()
> > > to make sure the driver commits the setup before enabling the irq. It
> > > means the read needs to be ordered as well in vring_interrupt().
> > >
> > > >
> > > > which surprises me.
> > > >
> > > > CC Paul to help make sure I'm right.
> > > >
> > > >
> > > > >
> > > > > >
> > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > hardening is disabled by default.
> > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > are any buffers in any queues?
> > > > >
> > > > >
> > > > > I copied this from the commit log for 22b7050a024d7
> > > > >
> > > > > "
> > > > >
> > > > >     This change will also benefit old hypervisors (before 2009)
> > > > >     that send interrupts without checking DRIVER_OK: previously,
> > > > >     the callback could race with driver-specific initialization.
> > > > > "
> > > > >
> > > > > If this is only for config interrupt, I can remove the above log.
> > > >
> > > >
> > > > This is only for config interrupt.
> > >
> > > Ok.
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > expensive.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > >
> > > > > > > ---
> > > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > @@ -7,6 +7,12 @@
> > > > > > >   #include <linux/of.h>
> > > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > > +static bool irq_hardening = false;
> > > > > > > +
> > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > > +
> > > > > > >   /* Unique numbering for virtio devices. */
> > > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > >    * */
> > > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > > >   {
> > > > > > > + /*
> > > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > > +  * interrupt for this line arriving after
> > > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > > +  * irq_soft_enabled == false.
> > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > though it's most likely is ...
> > > > >
> > > > >
> > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > >
> > > > > """
> > > > >
> > > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > > >  * and NMI handlers.
> > > > > """
> > > > >
> > > > > So interrupt handlers are treated as read-side critical sections.
> > > > >
> > > > > And it has the comment for explain the barrier:
> > > > >
> > > > > """
> > > > >
> > > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > > >  * the end of its last RCU read-side critical section whose beginning
> > > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > > """
> > > > >
> > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > irq_soft_enabled as false.
> > > > >
> > > >
> > > > You are right. So then
> > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > >    READ_ONCE should do.
> > >
> > > See above.
> > >
> > > > 2. isn't synchronize_irq also doing the same thing?
> > >
> > >
> > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > >
> > > >
> > > >
> > > > > >
> > > > > > > +  */
> > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > + synchronize_rcu();
> > > > > > > +
> > > > > > >           dev->config->reset(dev);
> > > > > > >   }
> > > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > let's not add useless overhead to the boot sequence.
> > > > >
> > > > >
> > > > > Ok.
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > >           spin_lock_init(&dev->config_lock);
> > > > > > >           dev->config_enabled = false;
> > > > > > >           dev->config_change_pending = false;
> > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > +
> > > > > > > + if (dev->irq_soft_check)
> > > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > >           /* We always start by resetting the device, in case a previous
> > > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > devices. this flag defeats the purpose.
> > > > >
> > > > >
> > > > > Do you mean:
> > > > >
> > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > interrupt handlers
> > > >
> > > > But synchronize is only on tear-down path. That is not critical for any
> > > > users at the moment, even less than probe.
> > >
> > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > spinlock or others.
> > >
> > > >
> > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > for old hypervisors
> > > >
> > > >
> > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > >
> > > Probably not, we have devices that accept random inputs from outside,
> > > net, console, input etc. I've done a round of audits of the Qemu
> > > codes. They look all fine since day0.
> > >
> > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > > in the debug build).
> > >
> > > This looks like a hardening of the driver in the core instead of the
> > > device. I think it can be done but in a separate series.
> > >
> > > >
> > > > And going down from there, how about we cache status in the
> > > > device? Then we don't need to keep re-reading it every time,
> > > > speeding boot up a tiny bit.
> > >
> > > I don't fully understand here, actually spec requires status to be
> > > read back for validation in many cases.
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > >   }
> > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > >   {
> > > > > > > + struct virtqueue *_vq = v;
> > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > +         return IRQ_NONE;
> > > > > > > + }
> > > > > > > +
> > > > > > >           if (!more_used(vq)) {
> > > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > >                   return IRQ_NONE;
> > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > --- a/include/linux/virtio.h
> > > > > > > +++ b/include/linux/virtio.h
> > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > >    * @config_lock: protects configuration change reporting
> > > > > > >    * @dev: underlying device.
> > > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > >           bool failed;
> > > > > > >           bool config_enabled;
> > > > > > >           bool config_change_pending;
> > > > > > > + bool irq_soft_check;
> > > > > > > + bool irq_soft_enabled;
> > > > > > >           spinlock_t config_lock;
> > > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > >           struct device dev;
> > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > > >   }
> > > > > > > +/*
> > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > + * @vdev: the device
> > > > > > > + */
> > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > +{
> > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > +         return true;
> > > > > > > +
> > > > > > > + /*
> > > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > > +  * data. Paried with smp_store_relase() in
> > > > > > paired
> > > > >
> > > > >
> > > > > Will fix.
> > > > >
> > > > > Thanks
> > > > >
> > > > >
> > > > > >
> > > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > +  * virtio_reset_device().
> > > > > > > +  */
> > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > +}
> > > > > > > +
> > > > > > >   /**
> > > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > >    * @vdev: the device
> > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > >           if (dev->config->enable_cbs)
> > > > > > >                     dev->config->enable_cbs(dev);
> > > > > > > + /*
> > > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > > +  * virtio_irq_soft_enabled()
> > > > > > > +  */
> > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > +
> > > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > >   }
> > > > > > > --
> > > > > > > 2.25.1
> > > >
> >

```* Re:
2022-03-25  9:10   ` Re: Michael S. Tsirkin
@ 2022-03-25  9:20     ` Jason Wang
2022-03-25 10:09       ` Re: Michael S. Tsirkin
From: Jason Wang @ 2022-03-25  9:20 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > Bcc:
> > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > >
> > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > >
> > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > >
> > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > >     that is used by some device such as virtio-blk
> > > > > > 2) done only for PCI transport
> > > > > >
> > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > virtio_device. Then we can to toggle it during
> > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > but the cost should be acceptable.
> > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > >
> > > >
> > > > Even if we allow the transport driver to synchornize through
> > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > >
> > > > We do something like the following previously:
> > > >
> > > >         if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > >                 return IRQ_NONE;
> > > >
> > > > But it looks like a bug since speculative read can be done before the check
> > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > the driver.
> > >
> > > I don't think so - if you sync after setting the value then
> > > you are guaranteed that any handler running afterwards
> > > will see the new value.
> >
> > The problem is not disabled but the enable.
>
> So a misbehaving device can lose interrupts? That's not a problem at all
> imo.

It's the interrupt raised before setting irq_soft_enabled to true:

CPU 0 probe) driver specific setup (not commited)
CPU 1 IRQ handler) read the uninitialized variable
CPU 0 probe) set irq_soft_enabled to true
CPU 1 IRQ handler) read irq_soft_enable as true
CPU 1 IRQ handler) use the uninitialized variable

Thanks

>
> > We use smp_store_relase()
> > to make sure the driver commits the setup before enabling the irq. It
> > means the read needs to be ordered as well in vring_interrupt().
> >
> > >
> > > which surprises me.
> > >
> > > CC Paul to help make sure I'm right.
> > >
> > >
> > > >
> > > > >
> > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > module parameter is introduced to enable the hardening so function
> > > > > > hardening is disabled by default.
> > > > > Which devices are these? How come they send an interrupt before there
> > > > > are any buffers in any queues?
> > > >
> > > >
> > > > I copied this from the commit log for 22b7050a024d7
> > > >
> > > > "
> > > >
> > > >     This change will also benefit old hypervisors (before 2009)
> > > >     that send interrupts without checking DRIVER_OK: previously,
> > > >     the callback could race with driver-specific initialization.
> > > > "
> > > >
> > > > If this is only for config interrupt, I can remove the above log.
> > >
> > >
> > > This is only for config interrupt.
> >
> > Ok.
> >
> > >
> > > >
> > > > >
> > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > expensive.
> > > > > >
> > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > >
> > > > > > ---
> > > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > > >   include/linux/virtio.h        |  4 ++++
> > > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > --- a/drivers/virtio/virtio.c
> > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > @@ -7,6 +7,12 @@
> > > > > >   #include <linux/of.h>
> > > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > > +static bool irq_hardening = false;
> > > > > > +
> > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > > +
> > > > > >   /* Unique numbering for virtio devices. */
> > > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > >    * */
> > > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > > >   {
> > > > > > + /*
> > > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > > +  * interrupt for this line arriving after
> > > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > > +  * irq_soft_enabled == false.
> > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > though it's most likely is ...
> > > >
> > > >
> > > > According to the comment above tree RCU version of synchronize_rcu():
> > > >
> > > > """
> > > >
> > > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > > >  * or softirqs have been disabled also serve as RCU read-side critical
> > > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > > >  * and NMI handlers.
> > > > """
> > > >
> > > > So interrupt handlers are treated as read-side critical sections.
> > > >
> > > > And it has the comment for explain the barrier:
> > > >
> > > > """
> > > >
> > > >  * Note that this guarantee implies further memory-ordering guarantees.
> > > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > > >  * each CPU is guaranteed to have executed a full memory barrier since
> > > >  * the end of its last RCU read-side critical section whose beginning
> > > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > > """
> > > >
> > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > irq_soft_enabled as false.
> > > >
> > >
> > > You are right. So then
> > > 1. I do not think we need load_acquire - why is it needed? Just
> > >    READ_ONCE should do.
> >
> > See above.
> >
> > > 2. isn't synchronize_irq also doing the same thing?
> >
> >
> > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> >
> > >
> > >
> > > > >
> > > > > > +  */
> > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > + synchronize_rcu();
> > > > > > +
> > > > > >           dev->config->reset(dev);
> > > > > >   }
> > > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > let's not add useless overhead to the boot sequence.
> > > >
> > > >
> > > > Ok.
> > > >
> > > >
> > > > >
> > > > >
> > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > >           spin_lock_init(&dev->config_lock);
> > > > > >           dev->config_enabled = false;
> > > > > >           dev->config_change_pending = false;
> > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > +
> > > > > > + if (dev->irq_soft_check)
> > > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > >           /* We always start by resetting the device, in case a previous
> > > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > > one of the points of hardening is it's also helpful for buggy
> > > > > devices. this flag defeats the purpose.
> > > >
> > > >
> > > > Do you mean:
> > > >
> > > > 1) we need something like config_enable? This seems not easy to be
> > > > implemented without obvious overhead, mainly the synchronize with the
> > > > interrupt handlers
> > >
> > > But synchronize is only on tear-down path. That is not critical for any
> > > users at the moment, even less than probe.
> >
> > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > in the virtio_device_ready() and synchronize the IRQ handlers with
> > spinlock or others.
> >
> > >
> > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > for old hypervisors
> > >
> > >
> > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> >
> > Probably not, we have devices that accept random inputs from outside,
> > net, console, input etc. I've done a round of audits of the Qemu
> > codes. They look all fine since day0.
> >
> > > So with this approach, how about we rename the flag "driver_ok"?
> > > And then add_buf can actually test it and BUG_ON if not there  (at least
> > > in the debug build).
> >
> > This looks like a hardening of the driver in the core instead of the
> > device. I think it can be done but in a separate series.
> >
> > >
> > > And going down from there, how about we cache status in the
> > > device? Then we don't need to keep re-reading it every time,
> > > speeding boot up a tiny bit.
> >
> > I don't fully understand here, actually spec requires status to be
> > read back for validation in many cases.
> >
> > Thanks
> >
> > >
> > > >
> > > > >
> > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > >   }
> > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > >   {
> > > > > > + struct virtqueue *_vq = v;
> > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > +         return IRQ_NONE;
> > > > > > + }
> > > > > > +
> > > > > >           if (!more_used(vq)) {
> > > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > >                   return IRQ_NONE;
> > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > --- a/include/linux/virtio.h
> > > > > > +++ b/include/linux/virtio.h
> > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > >    * @config_enabled: configuration change reporting enabled
> > > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > >    * @config_lock: protects configuration change reporting
> > > > > >    * @dev: underlying device.
> > > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > >           bool failed;
> > > > > >           bool config_enabled;
> > > > > >           bool config_change_pending;
> > > > > > + bool irq_soft_check;
> > > > > > + bool irq_soft_enabled;
> > > > > >           spinlock_t config_lock;
> > > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > >           struct device dev;
> > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > --- a/include/linux/virtio_config.h
> > > > > > +++ b/include/linux/virtio_config.h
> > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > >           return __virtio_test_bit(vdev, fbit);
> > > > > >   }
> > > > > > +/*
> > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > + * @vdev: the device
> > > > > > + */
> > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > +{
> > > > > > + if (!vdev->irq_soft_check)
> > > > > > +         return true;
> > > > > > +
> > > > > > + /*
> > > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > > +  * data. Paried with smp_store_relase() in
> > > > > paired
> > > >
> > > >
> > > > Will fix.
> > > >
> > > > Thanks
> > > >
> > > >
> > > > >
> > > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > +  * virtio_reset_device().
> > > > > > +  */
> > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > +}
> > > > > > +
> > > > > >   /**
> > > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > >    * @vdev: the device
> > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > >           if (dev->config->enable_cbs)
> > > > > >                     dev->config->enable_cbs(dev);
> > > > > > + /*
> > > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > > +  * virtio_irq_soft_enabled()
> > > > > > +  */
> > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > +
> > > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > >   }
> > > > > > --
> > > > > > 2.25.1
> > >
>

```* Re:
2022-03-25  7:52 ` Jason Wang
@ 2022-03-25  9:10   ` Michael S. Tsirkin
2022-03-25  9:20     ` Re: Jason Wang
From: Michael S. Tsirkin @ 2022-03-25  9:10 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > Bcc:
> > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> >
> > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > >
> > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > >
> > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > >     that is used by some device such as virtio-blk
> > > > > 2) done only for PCI transport
> > > > >
> > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > by introducing a global irq_soft_enabled variable for each
> > > > > virtio_device. Then we can to toggle it during
> > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > but the cost should be acceptable.
> > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > >
> > >
> > > Even if we allow the transport driver to synchornize through
> > > synchronize_irq() we still need a check in the vring_interrupt().
> > >
> > > We do something like the following previously:
> > >
> > >                 return IRQ_NONE;
> > >
> > > But it looks like a bug since speculative read can be done before the check
> > > where the interrupt handler can't see the uncommitted setup which is done by
> > > the driver.
> >
> > I don't think so - if you sync after setting the value then
> > you are guaranteed that any handler running afterwards
> > will see the new value.
>
> The problem is not disabled but the enable.

So a misbehaving device can lose interrupts? That's not a problem at all
imo.

> We use smp_store_relase()
> to make sure the driver commits the setup before enabling the irq. It
> means the read needs to be ordered as well in vring_interrupt().
>
> >
> > which surprises me.
> >
> > CC Paul to help make sure I'm right.
> >
> >
> > >
> > > >
> > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > module parameter is introduced to enable the hardening so function
> > > > > hardening is disabled by default.
> > > > Which devices are these? How come they send an interrupt before there
> > > > are any buffers in any queues?
> > >
> > >
> > > I copied this from the commit log for 22b7050a024d7
> > >
> > > "
> > >
> > >     This change will also benefit old hypervisors (before 2009)
> > >     that send interrupts without checking DRIVER_OK: previously,
> > >     the callback could race with driver-specific initialization.
> > > "
> > >
> > > If this is only for config interrupt, I can remove the above log.
> >
> >
> > This is only for config interrupt.
>
> Ok.
>
> >
> > >
> > > >
> > > > > Note that the hardening is only done for vring interrupt since the
> > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > handler because it uses spinlock to do the synchronization which is
> > > > > expensive.
> > > > >
> > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >
> > > > > ---
> > > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > > >   include/linux/virtio.h        |  4 ++++
> > > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > --- a/drivers/virtio/virtio.c
> > > > > +++ b/drivers/virtio/virtio.c
> > > > > @@ -7,6 +7,12 @@
> > > > >   #include <linux/of.h>
> > > > >   #include <uapi/linux/virtio_ids.h>
> > > > > +static bool irq_hardening = false;
> > > > > +
> > > > > +module_param(irq_hardening, bool, 0444);
> > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > > +
> > > > >   /* Unique numbering for virtio devices. */
> > > > >   static DEFINE_IDA(virtio_index_ida);
> > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > >    * */
> > > > >   void virtio_reset_device(struct virtio_device *dev)
> > > > >   {
> > > > > + /*
> > > > > +  * The below synchronize_rcu() guarantees that any
> > > > > +  * interrupt for this line arriving after
> > > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > > +  * irq_soft_enabled == false.
> > > > News to me I did not know synchronize_rcu has anything to do
> > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > though it's most likely is ...
> > >
> > >
> > > According to the comment above tree RCU version of synchronize_rcu():
> > >
> > > """
> > >
> > >  * RCU read-side critical sections are delimited by rcu_read_lock()
> > >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> > >  * v5.0 and later, regions of code across which interrupts, preemption,
> > >  * or softirqs have been disabled also serve as RCU read-side critical
> > >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> > >  * and NMI handlers.
> > > """
> > >
> > > So interrupt handlers are treated as read-side critical sections.
> > >
> > > And it has the comment for explain the barrier:
> > >
> > > """
> > >
> > >  * Note that this guarantee implies further memory-ordering guarantees.
> > >  * On systems with more than one CPU, when synchronize_rcu() returns,
> > >  * each CPU is guaranteed to have executed a full memory barrier since
> > >  * the end of its last RCU read-side critical section whose beginning
> > >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > > """
> > >
> > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > irq_soft_enabled as false.
> > >
> >
> > You are right. So then
> > 1. I do not think we need load_acquire - why is it needed? Just
>
> See above.
>
> > 2. isn't synchronize_irq also doing the same thing?
>
>
> Yes, but it requires a config ops since the IRQ knowledge is transport specific.
>
> >
> >
> > > >
> > > > > +  */
> > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > + synchronize_rcu();
> > > > > +
> > > > >           dev->config->reset(dev);
> > > > >   }
> > > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > Also, we *really* don't need to synch if it was already disabled,
> > > > let's not add useless overhead to the boot sequence.
> > >
> > >
> > > Ok.
> > >
> > >
> > > >
> > > >
> > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > >           spin_lock_init(&dev->config_lock);
> > > > >           dev->config_enabled = false;
> > > > >           dev->config_change_pending = false;
> > > > > + dev->irq_soft_check = irq_hardening;
> > > > > +
> > > > > + if (dev->irq_soft_check)
> > > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > >           /* We always start by resetting the device, in case a previous
> > > > >            * driver messed it up.  This also tests that code path a little. */
> > > > one of the points of hardening is it's also helpful for buggy
> > > > devices. this flag defeats the purpose.
> > >
> > >
> > > Do you mean:
> > >
> > > 1) we need something like config_enable? This seems not easy to be
> > > implemented without obvious overhead, mainly the synchronize with the
> > > interrupt handlers
> >
> > But synchronize is only on tear-down path. That is not critical for any
> > users at the moment, even less than probe.
>
> I meant if we have vq->irq_pending, we need to call vring_interrupt()
> in the virtio_device_ready() and synchronize the IRQ handlers with
> spinlock or others.
>
> >
> > > 2) enable this by default, so I don't object, but this may have some risk
> > > for old hypervisors
> >
> >
> > The risk if there's a driver adding buffers without setting DRIVER_OK.
>
> Probably not, we have devices that accept random inputs from outside,
> net, console, input etc. I've done a round of audits of the Qemu
> codes. They look all fine since day0.
>
> > So with this approach, how about we rename the flag "driver_ok"?
> > And then add_buf can actually test it and BUG_ON if not there  (at least
> > in the debug build).
>
> This looks like a hardening of the driver in the core instead of the
> device. I think it can be done but in a separate series.
>
> >
> > And going down from there, how about we cache status in the
> > device? Then we don't need to keep re-reading it every time,
> > speeding boot up a tiny bit.
>
> I don't fully understand here, actually spec requires status to be
> read back for validation in many cases.
>
> Thanks
>
> >
> > >
> > > >
> > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > >   }
> > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > >   {
> > > > > + struct virtqueue *_vq = v;
> > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > +         return IRQ_NONE;
> > > > > + }
> > > > > +
> > > > >           if (!more_used(vq)) {
> > > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > >                   return IRQ_NONE;
> > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > --- a/include/linux/virtio.h
> > > > > +++ b/include/linux/virtio.h
> > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > >    * @config_enabled: configuration change reporting enabled
> > > > >    * @config_change_pending: configuration change reported while disabled
> > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > + * @irq_soft_enabled: callbacks enabled
> > > > >    * @config_lock: protects configuration change reporting
> > > > >    * @dev: underlying device.
> > > > >    * @id: the device type identification (used to match it with a driver).
> > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > >           bool failed;
> > > > >           bool config_enabled;
> > > > >           bool config_change_pending;
> > > > > + bool irq_soft_check;
> > > > > + bool irq_soft_enabled;
> > > > >           spinlock_t config_lock;
> > > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > >           struct device dev;
> > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > --- a/include/linux/virtio_config.h
> > > > > +++ b/include/linux/virtio_config.h
> > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > >           return __virtio_test_bit(vdev, fbit);
> > > > >   }
> > > > > +/*
> > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > + * @vdev: the device
> > > > > + */
> > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > +{
> > > > > + if (!vdev->irq_soft_check)
> > > > > +         return true;
> > > > > +
> > > > > + /*
> > > > > +  * Read irq_soft_enabled before reading other device specific
> > > > > +  * data. Paried with smp_store_relase() in
> > > > paired
> > >
> > >
> > > Will fix.
> > >
> > > Thanks
> > >
> > >
> > > >
> > > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > +  * virtio_reset_device().
> > > > > +  */
> > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > +}
> > > > > +
> > > > >   /**
> > > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > >    * @vdev: the device
> > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > >           if (dev->config->enable_cbs)
> > > > >                     dev->config->enable_cbs(dev);
> > > > > + /*
> > > > > +  * Commit the driver setup before enabling the virtqueue
> > > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > > +  * virtio_irq_soft_enabled()
> > > > > +  */
> > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > +
> > > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > >   }
> > > > > --
> > > > > 2.25.1
> >

```* Re:
2022-03-25  6:30 Michael S. Tsirkin
@ 2022-03-25  7:52 ` Jason Wang
2022-03-25  9:10   ` Re: Michael S. Tsirkin
From: Jason Wang @ 2022-03-25  7:52 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney

On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> Bcc:
> Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
>
> On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> >
> > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > This is a rework on the previous IRQ hardening that is done for
> > > > virtio-pci where several drawbacks were found and were reverted:
> > > >
> > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > >     that is used by some device such as virtio-blk
> > > > 2) done only for PCI transport
> > > >
> > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > by introducing a global irq_soft_enabled variable for each
> > > > virtio_device. Then we can to toggle it during
> > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > the future, we may provide config_ops for the transport that doesn't
> > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > but the cost should be acceptable.
> > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> >
> >
> > Even if we allow the transport driver to synchornize through
> > synchronize_irq() we still need a check in the vring_interrupt().
> >
> > We do something like the following previously:
> >
> >                 return IRQ_NONE;
> >
> > But it looks like a bug since speculative read can be done before the check
> > where the interrupt handler can't see the uncommitted setup which is done by
> > the driver.
>
> I don't think so - if you sync after setting the value then
> you are guaranteed that any handler running afterwards
> will see the new value.

The problem is not disabled but the enable. We use smp_store_relase()
to make sure the driver commits the setup before enabling the irq. It
means the read needs to be ordered as well in vring_interrupt().

>
> which surprises me.
>
> CC Paul to help make sure I'm right.
>
>
> >
> > >
> > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > module parameter is introduced to enable the hardening so function
> > > > hardening is disabled by default.
> > > Which devices are these? How come they send an interrupt before there
> > > are any buffers in any queues?
> >
> >
> > I copied this from the commit log for 22b7050a024d7
> >
> > "
> >
> >     This change will also benefit old hypervisors (before 2009)
> >     that send interrupts without checking DRIVER_OK: previously,
> >     the callback could race with driver-specific initialization.
> > "
> >
> > If this is only for config interrupt, I can remove the above log.
>
>
> This is only for config interrupt.

Ok.

>
> >
> > >
> > > > Note that the hardening is only done for vring interrupt since the
> > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > ("virtio: defer config changed notifications"). But the method that is
> > > > used by config interrupt can't be reused by the vring interrupt
> > > > handler because it uses spinlock to do the synchronization which is
> > > > expensive.
> > > >
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >
> > > > ---
> > > >   drivers/virtio/virtio.c       | 19 +++++++++++++++++++
> > > >   drivers/virtio/virtio_ring.c  |  9 ++++++++-
> > > >   include/linux/virtio.h        |  4 ++++
> > > >   include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > >   4 files changed, 56 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > --- a/drivers/virtio/virtio.c
> > > > +++ b/drivers/virtio/virtio.c
> > > > @@ -7,6 +7,12 @@
> > > >   #include <linux/of.h>
> > > >   #include <uapi/linux/virtio_ids.h>
> > > > +static bool irq_hardening = false;
> > > > +
> > > > +module_param(irq_hardening, bool, 0444);
> > > > +MODULE_PARM_DESC(irq_hardening,
> > > > +          "Disalbe IRQ software processing when it is not expected");
> > > > +
> > > >   /* Unique numbering for virtio devices. */
> > > >   static DEFINE_IDA(virtio_index_ida);
> > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > >    * */
> > > >   void virtio_reset_device(struct virtio_device *dev)
> > > >   {
> > > > + /*
> > > > +  * The below synchronize_rcu() guarantees that any
> > > > +  * interrupt for this line arriving after
> > > > +  * synchronize_rcu() has completed is guaranteed to see
> > > > +  * irq_soft_enabled == false.
> > > News to me I did not know synchronize_rcu has anything to do
> > > with interrupts. Did not you intend to use synchronize_irq?
> > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > though it's most likely is ...
> >
> >
> > According to the comment above tree RCU version of synchronize_rcu():
> >
> > """
> >
> >  * and rcu_read_unlock(), and may be nested.  In addition, but only in
> >  * v5.0 and later, regions of code across which interrupts, preemption,
> >  * or softirqs have been disabled also serve as RCU read-side critical
> >  * sections.  This includes hardware interrupt handlers, softirq handlers,
> >  * and NMI handlers.
> > """
> >
> > So interrupt handlers are treated as read-side critical sections.
> >
> > And it has the comment for explain the barrier:
> >
> > """
> >
> >  * Note that this guarantee implies further memory-ordering guarantees.
> >  * On systems with more than one CPU, when synchronize_rcu() returns,
> >  * each CPU is guaranteed to have executed a full memory barrier since
> >  * the end of its last RCU read-side critical section whose beginning
> >  * preceded the call to synchronize_rcu().  In addition, each CPU having
> > """
> >
> > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > irq_soft_enabled as false.
> >
>
> You are right. So then
> 1. I do not think we need load_acquire - why is it needed? Just

See above.

> 2. isn't synchronize_irq also doing the same thing?

Yes, but it requires a config ops since the IRQ knowledge is transport specific.

>
>
> > >
> > > > +  */
> > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > + synchronize_rcu();
> > > > +
> > > >           dev->config->reset(dev);
> > > >   }
> > > >   EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > Also, we *really* don't need to synch if it was already disabled,
> > > let's not add useless overhead to the boot sequence.
> >
> >
> > Ok.
> >
> >
> > >
> > >
> > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > >           spin_lock_init(&dev->config_lock);
> > > >           dev->config_enabled = false;
> > > >           dev->config_change_pending = false;
> > > > + dev->irq_soft_check = irq_hardening;
> > > > +
> > > > + if (dev->irq_soft_check)
> > > > +         dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > >           /* We always start by resetting the device, in case a previous
> > > >            * driver messed it up.  This also tests that code path a little. */
> > > one of the points of hardening is it's also helpful for buggy
> > > devices. this flag defeats the purpose.
> >
> >
> > Do you mean:
> >
> > 1) we need something like config_enable? This seems not easy to be
> > implemented without obvious overhead, mainly the synchronize with the
> > interrupt handlers
>
> But synchronize is only on tear-down path. That is not critical for any
> users at the moment, even less than probe.

I meant if we have vq->irq_pending, we need to call vring_interrupt()
in the virtio_device_ready() and synchronize the IRQ handlers with
spinlock or others.

>
> > 2) enable this by default, so I don't object, but this may have some risk
> > for old hypervisors
>
>
> The risk if there's a driver adding buffers without setting DRIVER_OK.

Probably not, we have devices that accept random inputs from outside,
net, console, input etc. I've done a round of audits of the Qemu
codes. They look all fine since day0.

> So with this approach, how about we rename the flag "driver_ok"?
> And then add_buf can actually test it and BUG_ON if not there  (at least
> in the debug build).

This looks like a hardening of the driver in the core instead of the
device. I think it can be done but in a separate series.

>
> And going down from there, how about we cache status in the
> device? Then we don't need to keep re-reading it every time,
> speeding boot up a tiny bit.

I don't fully understand here, actually spec requires status to be
read back for validation in many cases.

Thanks

>
> >
> > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > >           return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > >   }
> > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > >   {
> > > > + struct virtqueue *_vq = v;
> > > > + struct virtio_device *vdev = _vq->vdev;
> > > >           struct vring_virtqueue *vq = to_vvq(_vq);
> > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > +         dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > +         return IRQ_NONE;
> > > > + }
> > > > +
> > > >           if (!more_used(vq)) {
> > > >                   pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > >                   return IRQ_NONE;
> > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > index 5464f398912a..957d6ad604ac 100644
> > > > --- a/include/linux/virtio.h
> > > > +++ b/include/linux/virtio.h
> > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > >    * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > >    * @config_enabled: configuration change reporting enabled
> > > >    * @config_change_pending: configuration change reported while disabled
> > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > + * @irq_soft_enabled: callbacks enabled
> > > >    * @config_lock: protects configuration change reporting
> > > >    * @dev: underlying device.
> > > >    * @id: the device type identification (used to match it with a driver).
> > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > >           bool failed;
> > > >           bool config_enabled;
> > > >           bool config_change_pending;
> > > > + bool irq_soft_check;
> > > > + bool irq_soft_enabled;
> > > >           spinlock_t config_lock;
> > > >           spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > >           struct device dev;
> > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > --- a/include/linux/virtio_config.h
> > > > +++ b/include/linux/virtio_config.h
> > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > >           return __virtio_test_bit(vdev, fbit);
> > > >   }
> > > > +/*
> > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > + * @vdev: the device
> > > > + */
> > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > +{
> > > > + if (!vdev->irq_soft_check)
> > > > +         return true;
> > > > +
> > > > + /*
> > > > +  * Read irq_soft_enabled before reading other device specific
> > > > +  * data. Paried with smp_store_relase() in
> > > paired
> >
> >
> > Will fix.
> >
> > Thanks
> >
> >
> > >
> > > > +  * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > +  * virtio_reset_device().
> > > > +  */
> > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > +}
> > > > +
> > > >   /**
> > > >    * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > >    * @vdev: the device
> > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > >           if (dev->config->enable_cbs)
> > > >                     dev->config->enable_cbs(dev);
> > > > + /*
> > > > +  * Commit the driver setup before enabling the virtqueue
> > > > +  * callbacks. Paried with smp_load_acuqire() in
> > > > +  * virtio_irq_soft_enabled()
> > > > +  */
> > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > +
> > > >           BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > >           dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > >   }
> > > > --
> > > > 2.25.1
>

```* Re:
2022-01-20 15:37 ` Vitaly Wool
2022-01-20 23:29   ` Re: Damien Le Moal
@ 2022-02-04 21:45   ` Palmer Dabbelt
1 sibling, 0 replies; 400+ messages in thread
From: Palmer Dabbelt @ 2022-02-04 21:45 UTC (permalink / raw)
To: vitaly.wool; +Cc: gatecat, linux-riscv, Paul Walmsley, linux-kernel

On Thu, 20 Jan 2022 07:37:00 PST (-0800), vitaly.wool@konsulko.com wrote:
> Hey,
>
> On Thu, Jan 20, 2022 at 4:30 PM Myrtle Shah <gatecat@ds0.me> wrote:
>>
>> These are some initial patches to bugs I found attempting to
>> get a XIP kernel working on hardware:
>>  - 32-bit VexRiscv processor
>>  - kernel in SPI flash, at 0x00200000
>>  - 16MB of RAM at 0x10000000
>>  - MMU enabled
>>
>> I still have some more debugging to do, but these at least
>> get the kernel as far as initialising the MMU, and I would
>> appreciate feedback if anyone else is working on RISC-V XIP.
>
> I'll try to support you as much as I can, unfortunately I don't have
> any 32-bit RISC-V around so I was rather thinking of extending the
> RISC-V XIP support to 64-bit non-MMU targets.
> For now just please keep in mind that there might be some inherent
> assumptions that a target is 64 bit.

I don't test any of the XIP configs, but if you guys have something that's sane
to run in QEMU I'm happy to do so.  Given that there's now some folks finding
boot bugs it's probably worth getting what does boot into a regression test so
it's less likely to break moving forwards.

These are on fixes, with the second one split up so it's got a better chance of
landing in the stable trees.

Thanks!

>
> Best regards,
> Vitaly
>
>>
>> _______________________________________________
>> linux-riscv mailing list

```* Re:
2022-01-20 15:37 ` Vitaly Wool
@ 2022-01-20 23:29   ` Damien Le Moal
2022-02-04 21:45   ` Re: Palmer Dabbelt
1 sibling, 0 replies; 400+ messages in thread
From: Damien Le Moal @ 2022-01-20 23:29 UTC (permalink / raw)
To: Vitaly Wool, Myrtle Shah; +Cc: linux-riscv, Paul Walmsley, Palmer Dabbelt, LKML

On 2022/01/21 0:37, Vitaly Wool wrote:
> Hey,
>
> On Thu, Jan 20, 2022 at 4:30 PM Myrtle Shah <gatecat@ds0.me> wrote:
>>
>> These are some initial patches to bugs I found attempting to
>> get a XIP kernel working on hardware:
>>  - 32-bit VexRiscv processor
>>  - kernel in SPI flash, at 0x00200000
>>  - 16MB of RAM at 0x10000000
>>  - MMU enabled
>>
>> I still have some more debugging to do, but these at least
>> get the kernel as far as initialising the MMU, and I would
>> appreciate feedback if anyone else is working on RISC-V XIP.
>
> I'll try to support you as much as I can, unfortunately I don't have
> any 32-bit RISC-V around so I was rather thinking of extending the
> RISC-V XIP support to 64-bit non-MMU targets.

That would be great ! I am completing the buildroot patches for the K210. Got
u-boot almost working for SD card boot too (fighting a problem with rootfs
kernel mount on boot when using u-boot though).

> For now just please keep in mind that there might be some inherent
> assumptions that a target is 64 bit.
>
> Best regards,
> Vitaly
>
>>
>> _______________________________________________
>> linux-riscv mailing list
>
> _______________________________________________
> linux-riscv mailing list

--
Damien Le Moal
Western Digital Research

```* Re:
2022-01-20 15:28 Myrtle Shah
@ 2022-01-20 15:37 ` Vitaly Wool
2022-01-20 23:29   ` Re: Damien Le Moal
2022-02-04 21:45   ` Re: Palmer Dabbelt
0 siblings, 2 replies; 400+ messages in thread
From: Vitaly Wool @ 2022-01-20 15:37 UTC (permalink / raw)
To: Myrtle Shah; +Cc: linux-riscv, Paul Walmsley, Palmer Dabbelt, LKML

Hey,

On Thu, Jan 20, 2022 at 4:30 PM Myrtle Shah <gatecat@ds0.me> wrote:
>
> These are some initial patches to bugs I found attempting to
> get a XIP kernel working on hardware:
>  - 32-bit VexRiscv processor
>  - kernel in SPI flash, at 0x00200000
>  - 16MB of RAM at 0x10000000
>  - MMU enabled
>
> I still have some more debugging to do, but these at least
> get the kernel as far as initialising the MMU, and I would
> appreciate feedback if anyone else is working on RISC-V XIP.

I'll try to support you as much as I can, unfortunately I don't have
any 32-bit RISC-V around so I was rather thinking of extending the
RISC-V XIP support to 64-bit non-MMU targets.
For now just please keep in mind that there might be some inherent
assumptions that a target is 64 bit.

Best regards,
Vitaly

>
> _______________________________________________
> linux-riscv mailing list

```* Re:
@ 2021-11-29 21:59 ` sean.wang
0 siblings, 0 replies; 400+ messages in thread
From: sean.wang @ 2021-11-29 21:59 UTC (permalink / raw)
To: lb
Cc: marcel, johan.hedberg, luiz.dentz, upstream, linux-bluetooth,
linux-mediatek, linux-kernel, Sean Wang

From: Sean Wang <sean.wang@mediatek.com>

>Enable msft opcode for btmtksdio driver.
>
>Signed-off-by: Łukasz Bartosik <lb@semihalf.com>
>---
> drivers/bluetooth/btmtksdio.c | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index d9cf0c492e29..2a7a615663b9 100644
>--- a/drivers/bluetooth/btmtksdio.c
>+++ b/drivers/bluetooth/btmtksdio.c
>@@ -887,6 +887,7 @@ static int btmtksdio_setup(struct hci_dev *hdev)
>	if (enable_autosuspend)
>		pm_runtime_allow(bdev->dev);
>
>+	hci_set_msft_opcode(hdev, 0xFD30);

Hi Łukasz,

msft feature is supposed only supported on mt7921. Could you help rework the patch to enalbe msft opocde only for mt7921?

Sean

>	bt_dev_info(hdev, "Device setup in %llu usecs", duration);
>
>	return 0;
>

```* Re:
@ 2021-11-25 15:01     ` Hans de Goede
0 siblings, 0 replies; 400+ messages in thread
From: Hans de Goede @ 2021-11-25 15:01 UTC (permalink / raw)
To: Stephen Boyd, Andy Shevchenko, Daniel Scally, Laurent Pinchart,
Liam Girdwood, Mark Brown, Mark Gross, Mauro Carvalho Chehab,
Michael Turquette, Mika Westerberg, Rafael J.Wysocki,
Wolfram Sang
Cc: Len Brown, linux-acpi, platform-driver-x86, linux-kernel,
linux-i2c, Sakari Ailus, Kate Hsuan, linux-media, linux-clk

Hi,

On 11/2/21 22:16, Stephen Boyd wrote:
> Quoting Hans de Goede (2021-11-02 02:49:01)
>> diff --git a/drivers/clk/clk-tps68470.c b/drivers/clk/clk-tps68470.c
>> new file mode 100644
>> --- /dev/null
>> +++ b/drivers/clk/clk-tps68470.c
>> @@ -0,0 +1,257 @@
>> +/*
>> + * Clock driver for TPS68470 PMIC
>> + *
>> + * Copyright (c) 2021 Red Hat Inc.
>> + * Copyright (C) 2018 Intel Corporation
>> + *
>> + * Authors:
>> + *     Hans de Goede <hdegoede@redhat.com>
>> + *     Zaikuo Wang <zaikuo.wang@intel.com>
>> + *     Tianshu Qiu <tian.shu.qiu@intel.com>
>> + *     Jian Xu Zheng <jian.xu.zheng@intel.com>
>> + *     Yuning Pu <yuning.pu@intel.com>
>> + *     Antti Laakso <antti.laakso@intel.com>
>> + */
>> +
>> +#include <linux/clk-provider.h>
>> +#include <linux/clkdev.h>
>> +#include <linux/kernel.h>
>> +#include <linux/mfd/tps68470.h>
>> +#include <linux/module.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/platform_data/tps68470.h>
>> +#include <linux/regmap.h>
>> +
>> +#define TPS68470_CLK_NAME "tps68470-clk"
>> +
>> +#define to_tps68470_clkdata(clkd) \
>> +       container_of(clkd, struct tps68470_clkdata, clkout_hw)
>> +
> [...]
>> +
>> +static int tps68470_clk_set_rate(struct clk_hw *hw, unsigned long rate,
>> +                                unsigned long parent_rate)
>> +{
>> +       struct tps68470_clkdata *clkdata = to_tps68470_clkdata(hw);
>> +       unsigned int idx = tps68470_clk_cfg_lookup(rate);
>> +
>> +       if (rate != clk_freqs[idx].freq)
>> +               return -EINVAL;
>> +
>> +       clkdata->clk_cfg_idx = idx;
>
> It deserves a comment that set_rate can only be called when the clk is
> gated. We have CLK_SET_RATE_GATE flag as well that should be set if the
> clk can't support changing rate while enabled. With that flag set, this
> function should be able to actually change hardware with the assumption
> that the framework won't call down into this clk_op when the clk is
> enabled.

Ok for v6 I've added the CLK_SET_RATE_GATE flag + a comment why
it used and moved the divider programming to tps68470_clk_set_rate()m
while keeping the PLL_EN + output-enable writes in tps68470_clk_prepare()

>
>> +
>> +       return 0;
>> +}
>> +
>> +static const struct clk_ops tps68470_clk_ops = {
>> +       .is_prepared = tps68470_clk_is_prepared,
>> +       .prepare = tps68470_clk_prepare,
>> +       .unprepare = tps68470_clk_unprepare,
>> +       .recalc_rate = tps68470_clk_recalc_rate,
>> +       .round_rate = tps68470_clk_round_rate,
>> +       .set_rate = tps68470_clk_set_rate,
>> +};
>> +
>> +static const struct clk_init_data tps68470_clk_initdata = {
>
> Is there a reason to make this a static global? It's probably better to
> throw it on the stack so that a structure isn't sitting around after
> driver probe being unused.

Fixed for v6.

Thanks & Regards,

Hans

>
>> +       .name = TPS68470_CLK_NAME,
>> +       .ops = &tps68470_clk_ops,
>> +};
>

```* Re:
@ 2021-09-14 15:37 ` Nick Desaulniers
0 siblings, 0 replies; 400+ messages in thread
From: Nick Desaulniers @ 2021-09-14 15:37 UTC (permalink / raw)
To: zhao xc
Cc: tglx, mingo, bp, x86, hpa, nathan, tony.luck, linux, mpe,
dan.j.williams, linux-kernel, clang-built-linux

On Sun, Sep 12, 2021 at 10:42 PM zhao xc <xinchao.zhao.kernelz@gmail.com> wrote:
>
> Hi maintainer:
>         This is a patch fix the unused macro definition

Hi Zhao,
Thanks for the patch.  Would you mind following the standard procedure
for submitting patches to the list for review.  I wrote up
https://nickdesaulniers.github.io/blog/2017/05/16/submitting-your-first-patch-to-the-linux-kernel-and-responding-to-feedback/
a while ago, but I think it's still helpful.

--
Thanks,
~Nick Desaulniers

```* Re:
2021-08-12 20:19   ` Re: Andrew Morton
@ 2021-08-13  8:14     ` SeongJae Park
0 siblings, 0 replies; 400+ messages in thread
From: SeongJae Park @ 2021-08-13  8:14 UTC (permalink / raw)
To: Andrew Morton
Cc: SeongJae Park,  Valdis Klētnieks ,
SeongJae Park, linux-mm, linux-kernel

From: SeongJae Park <sjpark@amazon.de>

On Thu, 12 Aug 2021 13:19:21 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:

> On Thu, 12 Aug 2021 09:42:40 +0000 SeongJae Park <sj38.park@gmail.com> wrote:
>
> > > +config PAGE_IDLE_FLAG
> > > +       bool "Add PG_idle and PG_young flags"
> > > +       help
> > > +         This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
> > > +         Accessed bit writers can set the state of the bit in the flags to let
> > > +         other PTE Accessed bit readers don't disturbed.
> > >
> > > This needs to be converted to proper, or at least comprehensible, English....
> >
> > Thank you for the comment.
> >
> >
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
> >         bool "Add PG_idle and PG_young flags"
> >         select PAGE_EXTENSION if !64BIT
> >         help
> > -         This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
> > -         Accessed bit writers can set the state of the bit in the flags to let
> > -         other PTE Accessed bit readers don't disturbed.
> > +         This feature adds 'PG_idle' and 'PG_young' flags in 'struct page'.
> > +         PTE Accessed bit writers can save the state of the bit in the flags
> > +         to let other PTE Accessed bit readers don't get disturbed.
>
>
> --- a/mm/Kconfig~mm-idle_page_tracking-make-pg_idle-reusable-fix-fix
> +++ a/mm/Kconfig
> @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
>  	bool "Add PG_idle and PG_young flags"
>  	select PAGE_EXTENSION if !64BIT
>  	help
> -	  This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
> -	  Accessed bit writers can set the state of the bit in the flags to let
> -	  other PTE Accessed bit readers don't disturbed.
> +	  This adds PG_idle and PG_young flags to 'struct page'.  PTE Accessed
> +	  bit writers can set the state of the bit in the flags so that PTE
> +	  Accessed bit readers may avoid disturbance.
>
>  config IDLE_PAGE_TRACKING
>  	bool "Enable idle page tracking"

So good, thank you!

>
> Also, is there any way in which we can avoid presenting this option to
> the user?  Because most users will have real trouble understanding what
> this thing is for.  Can we simply select it when needed, as dictated by
> other, higher-level config options?

I believe this is the right way to go!  I sent a patch for removing the prompt
of this option:
https://lore.kernel.org/linux-mm/20210813081238.34705-1-sj38.park@gmail.com/

Thanks,
SeongJae Park

```* Re:
2021-08-12  9:42 ` SeongJae Park
@ 2021-08-12 20:19   ` Andrew Morton
2021-08-13  8:14     ` Re: SeongJae Park
From: Andrew Morton @ 2021-08-12 20:19 UTC (permalink / raw)
To: SeongJae Park
Cc:  Valdis Klētnieks , SeongJae Park, linux-mm, linux-kernel

On Thu, 12 Aug 2021 09:42:40 +0000 SeongJae Park <sj38.park@gmail.com> wrote:

> > +config PAGE_IDLE_FLAG
> > +       bool "Add PG_idle and PG_young flags"
> > +       help
> > +         This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
> > +         Accessed bit writers can set the state of the bit in the flags to let
> > +         other PTE Accessed bit readers don't disturbed.
> >
> > This needs to be converted to proper, or at least comprehensible, English....
>
> Thank you for the comment.
>
>
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
>         bool "Add PG_idle and PG_young flags"
>         select PAGE_EXTENSION if !64BIT
>         help
> -         This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
> -         Accessed bit writers can set the state of the bit in the flags to let
> -         other PTE Accessed bit readers don't disturbed.
> +         This feature adds 'PG_idle' and 'PG_young' flags in 'struct page'.
> +         PTE Accessed bit writers can save the state of the bit in the flags
> +         to let other PTE Accessed bit readers don't get disturbed.

--- a/mm/Kconfig~mm-idle_page_tracking-make-pg_idle-reusable-fix-fix
+++ a/mm/Kconfig
@@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
bool "Add PG_idle and PG_young flags"
select PAGE_EXTENSION if !64BIT
help
-	  This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
-	  Accessed bit writers can set the state of the bit in the flags to let
-	  other PTE Accessed bit readers don't disturbed.
+	  This adds PG_idle and PG_young flags to 'struct page'.  PTE Accessed
+	  bit writers can set the state of the bit in the flags so that PTE
+	  Accessed bit readers may avoid disturbance.

config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"

Also, is there any way in which we can avoid presenting this option to
the user?  Because most users will have real trouble understanding what
this thing is for.  Can we simply select it when needed, as dictated by
other, higher-level config options?

```* Re:
2021-08-12  9:21 Valdis Klētnieks
@ 2021-08-12  9:42 ` SeongJae Park
2021-08-12 20:19   ` Re: Andrew Morton
From: SeongJae Park @ 2021-08-12  9:42 UTC (permalink / raw)
To: Valdis Klētnieks
Cc: SeongJae Park, Andrew Morton, linux-mm, linux-kernel

From: SeongJae Park <sjpark@amazon.de>

Hello Valdis,

On Thu, 12 Aug 2021 05:21:57 -0400 "Valdis =?utf-8?Q?Kl=c4=93tnieks?=" <valdis.kletnieks@vt.edu> wrote:

> In this commit:
>
> commit fedc37448fb1be5d03e420ca7791d4286893d5ec
> Author: SeongJae Park <sjpark@amazon.de>
> Date:   Tue Aug 10 16:55:51 2021 +1000
>
>     mm/idle_page_tracking: make PG_idle reusable
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 504336de9a1e..d0b85dc12429 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -739,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT
>           initialisation.
>
> +config PAGE_IDLE_FLAG
> +       bool "Add PG_idle and PG_young flags"
> +       help
> +         This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
> +         Accessed bit writers can set the state of the bit in the flags to let
> +         other PTE Accessed bit readers don't disturbed.
>
> This needs to be converted to proper, or at least comprehensible, English....

Thank you for the comment.

--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
bool "Add PG_idle and PG_young flags"
select PAGE_EXTENSION if !64BIT
help
-         This feature adds PG_idle and PG_young flags in 'struct page'.  PTE
-         Accessed bit writers can set the state of the bit in the flags to let
-         other PTE Accessed bit readers don't disturbed.
+         This feature adds 'PG_idle' and 'PG_young' flags in 'struct page'.
+         PTE Accessed bit writers can save the state of the bit in the flags
+         to let other PTE Accessed bit readers don't get disturbed.

Thanks,
SeongJae Park

```* Re:
2021-07-27 15:10 ` Darrick J. Wong
2021-07-27 15:23   ` Andreas Grünbacher
@ 2021-07-27 15:30   ` Gao Xiang
1 sibling, 0 replies; 400+ messages in thread
From: Gao Xiang @ 2021-07-27 15:30 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-erofs, linux-fsdevel, LKML, Huang Jianan, Joseph Qi,
Christoph Hellwig, Matthew Wilcox, Andreas Gruenbacher

On Tue, Jul 27, 2021 at 08:10:51AM -0700, Darrick J. Wong wrote:
> I'll change the subject to:
>
> iomap: support reading inline data from non-zero pos

I'm fine with this too. Many thanks for updating!

Thanks,
Gao Xiang

```* Re:
2021-07-27 15:10 ` Darrick J. Wong
@ 2021-07-27 15:23   ` Andreas Grünbacher
2021-07-27 15:30   ` Re: Gao Xiang
1 sibling, 0 replies; 400+ messages in thread
From: Andreas Grünbacher @ 2021-07-27 15:23 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Gao Xiang, linux-erofs, Linux FS-devel Mailing List, LKML,
Huang Jianan, Joseph Qi, Christoph Hellwig, Matthew Wilcox,
Andreas Gruenbacher

Am Di., 27. Juli 2021 um 17:11 Uhr schrieb Darrick J. Wong <djwong@kernel.org>:
> I'll change the subject to:
>
> iomap: support reading inline data from non-zero pos

That surely works for me.

Thanks,
Andreas

```* Re:
2021-06-06 19:19 Davidlohr Bueso
@ 2021-06-07 16:02 ` André Almeida
0 siblings, 0 replies; 400+ messages in thread
From: André Almeida @ 2021-06-07 16:02 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
linux-kernel, Steven Rostedt, Sebastian Andrzej Siewior, kernel,
krisman, pgriffais, z.figura12, joel, malteskarupke, linux-api,
fweimer, libc-alpha, linux-kselftest, shuah, acme, corbet,
Peter Oskolkov, Andrey Semashev, mtk.manpages

Às 16:19 de 06/06/21, Davidlohr Bueso escreveu:
> Bcc:
> Subject: Re: [PATCH v4 07/15] docs: locking: futex2: Add documentation
>
> On Thu, 03 Jun 2021, Andrï¿½ Almeida wrote:
>
>> Add a new documentation file specifying both userspace API and internal
>> implementation details of futex2 syscalls.
>
> I think equally important would be to provide a manpage for each new
> syscall you are introducing, and keep mkt in the loop as in the past he
> extensively documented and improved futex manpages, and overall has a
> lot of experience with dealing with kernel interfaces.

Right, I'll add the man pages in a future version and make sure to have
mkt in the loop, thanks for the tip.

>
> Thanks,
> Davidlohr
>
>>
>> Signed-off-by: André Almeida <andrealmeid@collabora.com>
>> ---
>> Documentation/locking/futex2.rst | 198 +++++++++++++++++++++++++++++++
>> Documentation/locking/index.rst  |   1 +
>> 2 files changed, 199 insertions(+)
>> create mode 100644 Documentation/locking/futex2.rst
>>
>> diff --git a/Documentation/locking/futex2.rst
>> b/Documentation/locking/futex2.rst
>> new file mode 100644
>> index 000000000000..2f74d7c97a55
>> --- /dev/null
>> +++ b/Documentation/locking/futex2.rst
>> @@ -0,0 +1,198 @@
>> +
>> +======
>> +futex2
>> +======
>> +
>> +:Author: André Almeida <andrealmeid@collabora.com>
>> +
>> +futex, or fast user mutex, is a set of syscalls to allow userspace to
>> create
>> +performant synchronization mechanisms, such as mutexes, semaphores and
>> +conditional variables in userspace. C standard libraries, like glibc,
>> uses it
>> +as a means to implement more high level interfaces like pthreads.
>> +
>> +The interface
>> +=============
>> +
>> +uAPI functions
>> +--------------
>> +
>> +.. kernel-doc:: kernel/futex2.c
>> +   :identifiers: sys_futex_wait sys_futex_wake sys_futex_waitv
>> sys_futex_requeue
>> +
>> +uAPI structures
>> +---------------
>> +
>> +.. kernel-doc:: include/uapi/linux/futex.h
>> +
>> +The ``flag`` argument
>> +---------------------
>> +
>> +The flag is used to specify the size of the futex word
>> +(FUTEX_[8, 16, 32, 64]). It's mandatory to define one, since there's no
>> +default size.
>> +
>> +By default, the timeout uses a monotonic clock, but can be used as a
>> realtime
>> +one by using the FUTEX_REALTIME_CLOCK flag.
>> +
>> +By default, futexes are of the private type, that means that this
>> +will be accessed by threads that share the same memory region. This
>> allows for
>> +some internal optimizations, so they are faster. However, if the
>> +to be shared with different processes (like using ``mmap()`` or
>> ``shm()``), they
>> +need to be defined as shared and the flag FUTEX_SHARED_FLAG is used
>> to set that.
>> +
>> +By default, the operation has no NUMA-awareness, meaning that the
>> user can't
>> +choose the memory node where the kernel side futex data will be
>> stored. The
>> +user can choose the node where it wants to operate by setting the
>> +FUTEX_NUMA_FLAG and using the following structure (where X can be 8,
>> 16, 32 or
>> +64)::
>> +
>> + struct futexX_numa {
>> +         __uX value;
>> +         __sX hint;
>> + };
>> +
>> +This structure should be passed at the ``void *uaddr`` of futex
>> functions. The
>> +address of the structure will be used to be waited on/waken on, and the
>> +``value`` will be compared to ``val`` as usual. The ``hint`` member
>> is used to
>> +define which node the futex will use. When waiting, the futex will be
>> +registered on a kernel-side table stored on that node; when waking,
>> the futex
>> +will be searched for on that given table. That means that there's no
>> redundancy
>> +between tables, and the wrong ``hint`` value will lead to undesired
>> behavior.
>> +Userspace is responsible for dealing with node migrations issues that
>> may
>> +occur. ``hint`` can range from [0, MAX_NUMA_NODES), for specifying a
>> node, or
>> +-1, to use the same node the current process is using.
>> +
>> +When not using FUTEX_NUMA_FLAG on a NUMA system, the futex will be
>> stored on a
>> +global table on allocated on the first node.
>> +
>> +The ``timo`` argument
>> +---------------------
>> +
>> +As per the Y2038 work done in the kernel, new interfaces shouldn't
>> +options known to be buggy. Given that, ``timo`` should be a 64-bit
>> timeout at
>> +all platforms, using an absolute timeout value.
>> +
>> +Implementation
>> +==============
>> +
>> +The internal implementation follows a similar design to the original
>> futex.
>> +Given that we want to replicate the same external behavior of current
>> futex,
>> +this should be somewhat expected.
>> +
>> +Waiting
>> +-------
>> +
>> +For the wait operations, they are all treated as if you want to wait
>> on N
>> +futexes, so the path for futex_wait and futex_waitv is the basically
>> the same.
>> +For both syscalls, the first step is to prepare an internal list for
>> the list
>> +of futexes to wait for (using struct futexv_head). For futex_wait()
>> calls, this
>> +list will have a single object.
>> +
>> +We have a hash table, where waiters register themselves before
>> sleeping. Then
>> +the wake function checks this table looking for waiters at uaddr.
>> The hash
>> +bucket to be used is determined by a struct futex_key, that stores
>> information
>> +to uniquely identify an address from a given process. Given the huge
>> +space, there'll be hash collisions, so we store information to be
>> later used on
>> +collision treatment.
>> +
>> +First, for every futex we want to wait on, we check if (``*uaddr ==
>> val``).
>> +This check is done holding the bucket lock, so we are correctly
>> serialized with
>> +any futex_wake() calls. If any waiter fails the check above, we
>> dequeue all
>> +futexes. The check (``*uaddr == val``) can fail for two reasons:
>> +
>> +- The values are different, and we return -EAGAIN. However, if while
>> +  dequeueing we found that some futexes were awakened, we prioritize
>> this
>> +  and return success.
>> +
>> +- When trying to access the user address, we do so with page faults
>> +  disabled because we are holding a bucket's spin lock (and can't sleep
>> +  while holding a spin lock). If there's an error, it might be a page
>> +  fault, or an invalid address. We release the lock, dequeue everyone
>> +  (because it's illegal to sleep while there are futexes enqueued, we
>> +  could lose wakeups) and try again with page fault enabled. If we
>> +  succeed, this means that the address is valid, but we need to do
>> +  all the work again. For serialization reasons, we need to have the
>> +  spin lock when getting the user value. Additionally, for shared
>> +  futexes, we also need to recalculate the hash, since the underlying
>> +  mapping mechanisms could have changed when dealing with page fault.
>> +  If, even with page fault enabled, we can't access the address, it
>> +  means it's an invalid user address, and we return -EFAULT. For this
>> +  case, we prioritize the error, even if some futexes were awaken.
>> +
>> +If the check is OK, they are enqueued on a linked list in our bucket,
>> and
>> +proceed to the next one. If all waiters succeed, we put the thread to
>> sleep
>> +until a futex_wake() call, timeout expires or we get a signal. After
>> waking up,
>> +we dequeue everyone, and check if some futex was awakened. This
>> dequeue is done
>> +by iteratively walking at each element of struct futex_head list.
>> +
>> +All enqueuing/dequeuing operations requires to hold the bucket lock,
>> to avoid
>> +racing while modifying the list.
>> +
>> +Waking
>> +------
>> +
>> +We get the bucket that's storing the waiters at uaddr, and wake the
>> required
>> +number of waiters, checking for hash collision.
>> +
>> +There's an optimization that makes futex_wake() not take the bucket
>> lock if
>> +there's no one to be woken on that bucket. It checks an atomic
>> counter that each
>> +bucket has, if it says 0, then the syscall exits. In order for this
>> to work, the
>> +waiter thread increases it before taking the lock, so the wake thread
>> will
>> +correctly see that there's someone waiting and will continue the path
>> to take
>> +the bucket lock. To get the correct serialization, the waiter issues
>> a memory
>> +barrier after increasing the bucket counter and the waker issues a
>> memory
>> +barrier before checking it.
>> +
>> +Requeuing
>> +---------
>> +
>> +The requeue path first checks for each struct futex_requeue and their
>> flags.
>> +Then, it will compare the expected value with the one at uaddr1::uaddr.
>> +Following the same serialization explained at Waking_, we increase
>> the atomic
>> +counter for the bucket of uaddr2 before taking the lock. We need to
>> have both
>> +buckets locks at same time so we don't race with other futex
>> operation. To
>> +ensure the locks are taken in the same order for all threads (and
>> thus avoiding
>> +deadlocks), every requeue operation takes the "smaller" bucket first,
>> when
>> +
>> +If the compare with user value succeeds, we proceed by waking
>> ``nr_wake``
>> +futexes, and then requeuing ``nr_requeue`` from bucket of uaddr1 to
>> +This consists in a simple list deletion/addition and replacing the
>> old futex key
>> +with the new one.
>> +
>> +Futex keys
>> +----------
>> +
>> +There are two types of futexes: private and shared ones. The private
>> are futexes
>> +meant to be used by threads that share the same memory space, are
>> easier to be
>> +uniquely identified and thus can have some performance optimization. The
>> +elements for identifying one are: the start address of the page where
>> the
>> +address is, the address offset within the page and the current->mm
>> pointer.
>> +
>> +Now, for uniquely identifying a shared futex:
>> +
>> +- If the page containing the user address is an anonymous page, we can
>> +  just use the same data used for private futexes (the start address of
>> +  the page, the address offset within the page and the current->mm
>> +  pointer); that will be enough for uniquely identifying such futex. We
>> +  also set one bit at the key to differentiate if a private futex is
>> +  used on the same address (mixing shared and private calls does not
>> +  work).
>> +
>> +- If the page is file-backed, current->mm maybe isn't the same one for
>> +  every user of this futex, so we need to use other data: the
>> +  page->index, a UUID for the struct inode and the offset within the
>> +  page.
>> +
>> +Note that members of futex_key don't have any particular meaning
>> after they
>> +are part of the struct - they are just bytes to identify a futex.
>> Given that,
>> +we don't need to use a particular name or type that matches the
>> original data,
>> +we only need to care about the bitsize of each component and make
>> both private
>> +and shared fit in the same memory space.
>> +
>> +Source code documentation
>> +=========================
>> +
>> +.. kernel-doc:: kernel/futex2.c
>> +   :no-identifiers: sys_futex_wait sys_futex_wake sys_futex_waitv
>> sys_futex_requeue
>> diff --git a/Documentation/locking/index.rst
>> b/Documentation/locking/index.rst
>> index 7003bd5aeff4..9bf03c7fa1ec 100644
>> --- a/Documentation/locking/index.rst
>> +++ b/Documentation/locking/index.rst
>> @@ -24,6 +24,7 @@ locking
>>     percpu-rw-semaphore
>>     robust-futexes
>>     robust-futex-ABI
>> +    futex2
>>
>> .. only::  subproject and html
>>
>> --
>> 2.31.1
>>

```* Re:
2021-04-05  0:01 Mitali Borkar
@ 2021-04-06  7:03 ` Arnd Bergmann
0 siblings, 0 replies; 400+ messages in thread
From: Arnd Bergmann @ 2021-04-06  7:03 UTC (permalink / raw)
To: Mitali Borkar
Cc: manish, GR-Linux-NIC-Dev, gregkh, linux-staging,
Linux Kernel Mailing List

On Mon, Apr 5, 2021 at 2:03 AM Mitali Borkar <mitaliborkar810@gmail.com> wrote:
>
> Bcc:
> Subject: [PATCH] staging: qlge:remove else after break
>
> Fixed Warning:- else is not needed after break
> break terminates the loop if encountered. else is unnecessary and
> increases indenatation
>
> Signed-off-by: Mitali Borkar <mitaliborkar810@gmail.com>
> ---
>  drivers/staging/qlge/qlge_mpi.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/staging/qlge/qlge_mpi.c b/drivers/staging/qlge/qlge_mpi.c
> index 2630ebf50341..3a49f187203b 100644
> --- a/drivers/staging/qlge/qlge_mpi.c
> +++ b/drivers/staging/qlge/qlge_mpi.c
> @@ -935,13 +935,11 @@ static int qlge_idc_wait(struct qlge_adapter *qdev)
>                         netif_err(qdev, drv, qdev->ndev, "IDC Success.\n");
>                         status = 0;
>                         break;
> -               } else {
> -                       netif_err(qdev, drv, qdev->ndev,
> +               }       netif_err(qdev, drv, qdev->ndev,
>                                   "IDC: Invalid State 0x%.04x.\n",
>                                   mbcp->mbox_out[0]);
>                         status = -EIO;
>                         break;
> -               }
>         }

It looks like you got this one wrong in multiple ways:

- This is not an equivalent transformation, since the errror is now
printed in the first part of the 'if()' block as well.

- The indentation is wrong now, with the netif_err() starting in the
same line as the '}'.

- The description mentions a change in indentation, but you did not
actually change it.

- The changelog text appears mangled.

Arnd

```* Re:
@ 2021-03-18  6:51     ` Jarvis Jiang
0 siblings, 0 replies; 400+ messages in thread
From: Jarvis Jiang @ 2021-03-18  6:51 UTC (permalink / raw)
To: linux-kernel

jarvis.w.jiang@gmail.com

On Thu, Mar 18, 2021 at 2:49 PM Jarvis Jiang <jarvis.w.jiang@gmail.com> wrote:
> subscribe linex-kernel jarvis.w.jiang@gmail.com

```* Re:
2021-01-19  0:10 David Howells
@ 2021-01-20 14:46 ` Jarkko Sakkinen
0 siblings, 0 replies; 400+ messages in thread
From: Jarkko Sakkinen @ 2021-01-20 14:46 UTC (permalink / raw)
To: David Howells
Cc: torvalds, Tobias Markus, Tianjia Zhang, keyrings, linux-crypto,
linux-security-module, stable, linux-kernel

On Tue, Jan 19, 2021 at 12:10:33AM +0000, David Howells wrote:
>
> From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
>
> On the following call path, `sig->pkey_algo` is not assigned
> in asymmetric_key_verify_signature(), which causes runtime
> crash in public_key_verify_signature().
>
>   keyctl_pkey_verify
>     asymmetric_key_verify_signature
>       verify_signature
>         public_key_verify_signature
>
> This patch simply check this situation and fixes the crash
> caused by NULL pointer.
>
> Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification")
> Reported-by: Tobias Markus <tobias@markus-regensburg.de>
> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> Signed-off-by: David Howells <dhowells@redhat.com>
> Reviewed-and-tested-by: Toke Høiland-Jørgensen <toke@redhat.com>
> Tested-by: João Fonseca <jpedrofonseca@ua.pt>
> Cc: stable@vger.kernel.org # v5.10+
> ---

For what it's worth

Acked-by: Jarkko Sakkinen <jarkko@kernel.org>

/Jarkko

>
>  crypto/asymmetric_keys/public_key.c |    3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
> --- a/crypto/asymmetric_keys/public_key.c
> +++ b/crypto/asymmetric_keys/public_key.c
> @@ -356,7 +356,8 @@ int public_key_verify_signature(const struct public_key *pkey,
>  	if (ret)
>  		goto error_free_key;
>
> -	if (strcmp(sig->pkey_algo, "sm2") == 0 && sig->data_size) {
> +	if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 &&
> +	    sig->data_size) {
>  		ret = cert_sig_digest_update(sig, tfm);
>  		if (ret)
>  			goto error_free_key;
>
>

```* Re:
@ 2020-12-02 17:09 ` Kun Yi
0 siblings, 0 replies; 400+ messages in thread
From: Kun Yi @ 2020-12-02 17:09 UTC (permalink / raw)
To: Kun Yi, Guenter Roeck, robh+dt, Venkatesh, Supreeth
Cc: OpenBMC Maillist, linux-hwmon, linux-kernel

Much apologies for the super late reply.. I was out for an extended
period of time due to personal circumstances.
I have now addressed most of the comments in the v4 series.

Also cc'ed Supreeth who works on the AMD System Manageability stack.

On Wed, Dec 2, 2020 at 8:57 AM Kun Yi <kunyi@google.com> wrote:
>
> On Sat, Apr 04, 2020 at 08:01:16PM -0700, Kun Yi wrote:
> > SB Temperature Sensor Interface (SB-TSI) is an SMBus compatible
> > interface that reports AMD SoC's Ttcl (normalized temperature),
> > and resembles a typical 8-pin remote temperature sensor's I2C interface
> > to BMC.
> >
> > This commit adds basic support using this interface to read CPU
> > temperature, and read/write high/low CPU temp thresholds.
> >
> > To instantiate this driver on an AMD CPU with SB-TSI
> > support, the i2c bus number would be the bus connected from the board
> > management controller (BMC) to the CPU. The i2c address is specified in
> > Section 6.3.1 of the spec [1]: The SB-TSI address is normally 98h for socket 0
> > and 90h for socket 1, but it could vary based on hardware address select pins.
> >
> > [1]: https://www.amd.com/system/files/TechDocs/56255_OSRR.pdf
> >
> > temp1_max/min.
> >
> > Signed-off-by: Kun Yi <kunyi at google.com>
> > ---
> >  drivers/hwmon/Kconfig      |  10 ++
> >  drivers/hwmon/Makefile     |   1 +
> >  drivers/hwmon/sbtsi_temp.c | 259 +++++++++++++++++++++++++++++++++++++
> >  3 files changed, 270 insertions(+)
> >  create mode 100644 drivers/hwmon/sbtsi_temp.c
> >
> > diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> > index 05a30832c6ba..9585dcd01d1b 100644
> > --- a/drivers/hwmon/Kconfig
> > +++ b/drivers/hwmon/Kconfig
> > @@ -1412,6 +1412,16 @@ config SENSORS_RASPBERRYPI_HWMON
> >    This driver can also be built as a module. If so, the module
> >    will be called raspberrypi-hwmon.
> >
> > +config SENSORS_SBTSI
> > + tristate "Emulated SB-TSI temperature sensor"
> > + depends on I2C
> > + help
> > +  If you say yes here you get support for emulated temperature
> > +  sensors on AMD SoCs with SB-TSI interface connected to a BMC device.
> > +
> > +  This driver can also be built as a module. If so, the module will
> > +  be called sbtsi_temp.
> > +
> >  config SENSORS_SHT15
> >   tristate "Sensiron humidity and temperature sensors. SHT15 and compat."
> >   depends on GPIOLIB || COMPILE_TEST
> > diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> > index b0b9c8e57176..cd109f003ce4 100644
> > --- a/drivers/hwmon/Makefile
> > +++ b/drivers/hwmon/Makefile
> > @@ -152,6 +152,7 @@ obj-\$(CONFIG_SENSORS_POWR1220)  += powr1220.o
> >  obj-\$(CONFIG_SENSORS_PWM_FAN) += pwm-fan.o
> >  obj-\$(CONFIG_SENSORS_RASPBERRYPI_HWMON) += raspberrypi-hwmon.o
> >  obj-\$(CONFIG_SENSORS_S3C) += s3c-hwmon.o
> > +obj-\$(CONFIG_SENSORS_SBTSI) += sbtsi_temp.o
> >  obj-\$(CONFIG_SENSORS_SCH56XX_COMMON)+= sch56xx-common.o
> >  obj-\$(CONFIG_SENSORS_SCH5627) += sch5627.o
> >  obj-\$(CONFIG_SENSORS_SCH5636) += sch5636.o
> > diff --git a/drivers/hwmon/sbtsi_temp.c b/drivers/hwmon/sbtsi_temp.c
> > new file mode 100644
> > --- /dev/null
> > +++ b/drivers/hwmon/sbtsi_temp.c
> > @@ -0,0 +1,259 @@
> > +/*
> > + * sbtsi_temp.c - hwmon driver for a SBI Temperature Sensor Interface (SB-TSI)
> > + *                compliant AMD SoC temperature device.
> > + *
> > + * Copyright (c) 2020, Kun Yi <kunyi at google.com>
> > + */
> > +
> > +#include <linux/err.h>
> > +#include <linux/i2c.h>
> > +#include <linux/init.h>
> > +#include <linux/hwmon.h>
> > +#include <linux/module.h>
> > +#include <linux/mutex.h>
> > +#include <linux/of_device.h>
> > +#include <linux/of.h>
> > +
> > +/*
> > + * SB-TSI registers only support SMBus byte data access. "_INT" registers are
> > + * the integer part of a temperature value or limit, and "_DEC" registers are
> > + * corresponding decimal parts.
> > + */
> > +#define SBTSI_REG_TEMP_INT 0x01 /* RO */
> > +#define SBTSI_REG_STATUS 0x02 /* RO */
> > +#define SBTSI_REG_CONFIG 0x03 /* RO */
> > +#define SBTSI_REG_TEMP_HIGH_INT 0x07 /* RW */
> > +#define SBTSI_REG_TEMP_LOW_INT 0x08 /* RW */
> > +#define SBTSI_REG_TEMP_DEC 0x10 /* RW */
> > +#define SBTSI_REG_TEMP_HIGH_DEC 0x13 /* RW */
> > +#define SBTSI_REG_TEMP_LOW_DEC 0x14 /* RW */
> > +#define SBTSI_REG_REV 0xFF /* RO */
>
> The revision register is not actually used.
Thanks. Removed. I agree that the register is not well documented, at
least publicly.
It shouldn't affect functionality of this driver, so I removed the
definition altogether.
>
> > +
> > +
> > +#define SBTSI_TEMP_MIN 0
> > +#define SBTSI_TEMP_MAX 255875
> > +#define SBTSI_REV_MAX_VALID_ID 4
>
> Not actually used, and I am not sure if it would make sense to check it.
> If at all, it would only make sense if you also check SBTSIxFE (Manufacture
> ID). Unfortunately, the actual SB-TSI specification seems to be non-public,
> so I can't check if the driver as-is supports versions 0..3 (assuming those
> exist).

Thanks. Removed.

>
> > +
> > +/* Each client has this additional data */
> > +struct sbtsi_data {
> > + struct i2c_client *client;
> > + struct mutex lock;
> > +};
> > +
> > +/*
> > + * From SB-TSI spec: CPU temperature readings and limit registers encode the
> > + * temperature in increments of 0.125 from 0 to 255.875. The "high byte"
> > + * register encodes the base-2 of the integer portion, and the upper 3 bits of
> > + * the "low byte" encode in base-2 the decimal portion.
> > + *
> > + * e.g. INT=0x19, DEC=0x20 represents 25.125 degrees Celsius
> > + *
> > + * Therefore temperature in millidegree Celsius =
> > + *   (INT + DEC / 256) * 1000 = (INT * 8 + DEC / 32) * 125
> > + */
> > +static inline int sbtsi_reg_to_mc(s32 integer, s32 decimal)
> > +{
> > + return ((integer << 3) + (decimal >> 5)) * 125;
> > +}
> > +
> > +/*
> > + * Inversely, given temperature in millidegree Celsius
> > + *   INT = (TEMP / 125) / 8
> > + *   DEC = ((TEMP / 125) % 8) * 32
> > + * Caller have to make sure temp doesn't exceed 255875, the max valid value.
> > + */
> > +static inline void sbtsi_mc_to_reg(s32 temp, u8 *integer, u8 *decimal)
> > +{
> > + temp /= 125;
> > + *integer = temp >> 3;
> > + *decimal = (temp & 0x7) << 5;
> > +}
> > +
> > +static int sbtsi_read(struct device *dev, enum hwmon_sensor_types type,
> > +      u32 attr, int channel, long *val)
> > +{
> > + struct sbtsi_data *data = dev_get_drvdata(dev);
> > + s32 temp_int, temp_dec;
> > + int err, reg_int, reg_dec;
> > +
> > + if (type != hwmon_temp)
> > + return -EINVAL;
> > +
> > + read_order = 0;
> > + switch (attr) {
> > + case hwmon_temp_input:
> > + /*
> > + * ReadOrder bit specifies the reading order of integer and
> > + * decimal part of CPU temp for atomic reads. If bit == 0,
> > + * reading integer part triggers latching of the decimal part,
> > + * so integer part should be read first. If bit == 1, read
> > + * order should be reversed.
> > + */
> > + err = i2c_smbus_read_byte_data(data->client, SBTSI_REG_CONFIG);
> > + if (err < 0)
> > + return err;
> > +
> As I understand it, the idea is to set this configuration bit once and then
> just use it. Any chance to do that ? This would save an i2c read operation
> each time the temperature is read, and the if/else complexity below.

>
>
> Nit: typecast is unnecessary.

Done.

>
> > + reg_int = SBTSI_REG_TEMP_INT;
> > + reg_dec = SBTSI_REG_TEMP_DEC;
> > + break;
> > + case hwmon_temp_max:
> > + reg_int = SBTSI_REG_TEMP_HIGH_INT;
> > + reg_dec = SBTSI_REG_TEMP_HIGH_DEC;
> > + break;
> > + case hwmon_temp_min:
> > + reg_int = SBTSI_REG_TEMP_LOW_INT;
> > + reg_dec = SBTSI_REG_TEMP_LOW_DEC;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > +
> > + if (read_order == 0) {
> > + temp_int = i2c_smbus_read_byte_data(data->client, reg_int);
> > + temp_dec = i2c_smbus_read_byte_data(data->client, reg_dec);
> > + } else {
> > + temp_dec = i2c_smbus_read_byte_data(data->client, reg_dec);
> > + temp_int = i2c_smbus_read_byte_data(data->client, reg_int);
> > + }
>
> Just a thought: if you use regmap and tell it that the limit registers
> are non-volatile, this wouldn't actually read from the chip more than once.

That's a great suggestion, although in our normal use cases the limit
values are read and cached by the
userspace application. Seems changing to regmap would require some
messaging of the code. Would it
be acceptable to keep the initial driver as-is and do that in a following patch?

>
> locks the value for the second, you'll need mutex protection when reading
> the current temperature (not for limits, though).

>
> > +
> > + if (temp_int < 0)
> > + return temp_int;
> > + if (temp_dec < 0)
> > + return temp_dec;
> > +
> > + *val = sbtsi_reg_to_mc(temp_int, temp_dec);
> > +
> > + return 0;
> > +}
> > +
> > +static int sbtsi_write(struct device *dev, enum hwmon_sensor_types type,
> > +       u32 attr, int channel, long val)
> > +{
> > + struct sbtsi_data *data = dev_get_drvdata(dev);
> > + int reg_int, reg_dec, err;
> > + u8 temp_int, temp_dec;
> > +
> > + if (type != hwmon_temp)
> > + return -EINVAL;
> > +
> > + switch (attr) {
> > + case hwmon_temp_max:
> > + reg_int = SBTSI_REG_TEMP_HIGH_INT;
> > + reg_dec = SBTSI_REG_TEMP_HIGH_DEC;
> > + break;
> > + case hwmon_temp_min:
> > + reg_int = SBTSI_REG_TEMP_LOW_INT;
> > + reg_dec = SBTSI_REG_TEMP_LOW_DEC;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > +
> > + val = clamp_val(val, SBTSI_TEMP_MIN, SBTSI_TEMP_MAX);
> > + mutex_lock(&data->lock);
> > + sbtsi_mc_to_reg(val, &temp_int, &temp_dec);
> > + err = i2c_smbus_write_byte_data(data->client, reg_int, temp_int);
> > + if (err)
> > + goto exit;
> > +
> > + err = i2c_smbus_write_byte_data(data->client, reg_dec, temp_dec);
> > +exit:
> > + mutex_unlock(&data->lock);
> > + return err;
> > +}
> > +
> > +static umode_t sbtsi_is_visible(const void *data,
> > + enum hwmon_sensor_types type,
> > + u32 attr, int channel)
> > +{
> > + switch (type) {
> > + case hwmon_temp:
> > + switch (attr) {
> > + case hwmon_temp_input:
> > + return 0444;
> > + case hwmon_temp_min:
> > + return 0644;
> > + case hwmon_temp_max:
> > + return 0644;
> > + }
> > + break;
> > + default:
> > + break;
> > + }
> > + return 0;
> > +}
> > +
> > +static const struct hwmon_channel_info *sbtsi_info[] = {
> > + HWMON_CHANNEL_INFO(chip,
> > +   HWMON_C_REGISTER_TZ),
> > + HWMON_CHANNEL_INFO(temp,
> > +   HWMON_T_INPUT | HWMON_T_MIN | HWMON_T_MAX),
>
> With this, it would be possible to implement respective alarm attributes.
> In conjunction with https://patchwork.kernel.org/patch/11277347/mbox/,
> it should also be possible to add interrupt and thus userspace notification
> for those attributes.
>
> SBTSI also supports setting the update rate (SBTSIx04) and setting
> the temperature offset (SBTSIx11, SBTSIx12), which could also be
> implemented as standard attributes.
>
> I won't require that for the initial version, just something to keep
> in mind.

Ack and thanks for the suggestions. I will keep in mind for future improvements.

>
> > + NULL
> > +};
> > +
> > +static const struct hwmon_ops sbtsi_hwmon_ops = {
> > + .is_visible = sbtsi_is_visible,
> > + .write = sbtsi_write,
> > +};
> > +
> > +static const struct hwmon_chip_info sbtsi_chip_info = {
> > + .ops = &sbtsi_hwmon_ops,
> > + .info = sbtsi_info,
> > +};
> > +
> > +static int sbtsi_probe(struct i2c_client *client,
> > +       const struct i2c_device_id *id)
> > +{
> > + struct device *dev = &client->dev;
> > + struct device *hwmon_dev;
> > + struct sbtsi_data *data;
> > +
> > + data = devm_kzalloc(dev, sizeof(struct sbtsi_data), GFP_KERNEL);
> > + if (!data)
> > + return -ENOMEM;
> > +
> > + data->client = client;
> > + mutex_init(&data->lock);
> > +
> > + hwmon_dev =
> > + devm_hwmon_device_register_with_info(dev, client->name, data,
> > +     &sbtsi_chip_info, NULL);
> > +
> > + return PTR_ERR_OR_ZERO(hwmon_dev);
> > +}
> > +
> > +static const struct i2c_device_id sbtsi_id[] = {
> > + {"sbtsi", 0},
> > + {}
> > +};
> > +MODULE_DEVICE_TABLE(i2c, sbtsi_id);
> > +
> > +static const struct of_device_id __maybe_unused sbtsi_of_match[] = {
> > + {
> > + .compatible = "amd,sbtsi",
> > + },
> > + { },
> > +};
> > +MODULE_DEVICE_TABLE(of, sbtsi_of_match);
> > +
> > +static struct i2c_driver sbtsi_driver = {
> > + .class = I2C_CLASS_HWMON,
> > + .driver = {
> > + .name = "sbtsi",
> > + .of_match_table = of_match_ptr(sbtsi_of_match),
> > + },
> > + .probe = sbtsi_probe,
> > + .id_table = sbtsi_id,
> > +};
> > +
> > +module_i2c_driver(sbtsi_driver);
> > +
> > +MODULE_AUTHOR("Kun Yi <kunyi at google.com>");
> > +MODULE_DESCRIPTION("Hwmon driver for AMD SB-TSI emulated sensor");

--
Regards,
Kun

```* Re:
@ 2020-08-12 13:37   ` Amit Pundir
0 siblings, 0 replies; 400+ messages in thread
From: Amit Pundir @ 2020-08-12 13:37 UTC (permalink / raw)
Cc: Andy Gross, Bjorn Andersson, dt, John Stultz, linux-arm-msm,
lkml, Rob Herring, Sumit Semwal

>
> Subject: Re: [PATCH v4] arm64: dts: qcom: Add support for Xiaomi Poco F1 (Beryllium)
>
> >// This removed_region is needed to boot the device
> >               // TODO: Find out the user of this reserved memory
> >               removed_region: memory@88f00000 {
>
> This region seems to belong to the Trust Zone. When Linux tries to access it, TZ bites and shuts the device down.

That is totally possible. Plus it falls right in between TZ and QSEE
reserved-memory regions. However, I do not find any credible source
of information which can confirm this. So I'm hesitant to update the
TODO item in the above comment.

>

```* Re:
2020-06-30 17:56 Vasiliy Kupriakov
@ 2020-07-10 20:36 ` Andy Shevchenko
0 siblings, 0 replies; 400+ messages in thread
From: Andy Shevchenko @ 2020-07-10 20:36 UTC (permalink / raw)
To: Vasiliy Kupriakov
Cc: Corentin Chary, Darren Hart, Andy Shevchenko,
open list:ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS,
open list:ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS,
open list

On Tue, Jun 30, 2020 at 8:57 PM Vasiliy Kupriakov <rublag-ns@yandex.ru> wrote:
>
> Subject: [PATCH] platform/x86: asus-wmi: allow BAT1 battery name
>
> The battery on my laptop ASUS TUF Gaming FX706II is named BAT1.
> This patch allows battery extension to load.
>

Pushed to my review and testing queue, thanks!

> Signed-off-by: Vasiliy Kupriakov <rublag-ns@yandex.ru>
> ---
>  drivers/platform/x86/asus-wmi.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
> --- a/drivers/platform/x86/asus-wmi.c
> +++ b/drivers/platform/x86/asus-wmi.c
> @@ -441,6 +441,7 @@ static int asus_wmi_battery_add(struct power_supply *battery)
>          * battery is named BATT.
>          */
>         if (strcmp(battery->desc->name, "BAT0") != 0 &&
> +           strcmp(battery->desc->name, "BAT1") != 0 &&
>             strcmp(battery->desc->name, "BATT") != 0)
>                 return -ENODEV;
>
> --
> 2.27.0
>

--
With Best Regards,
Andy Shevchenko

```* Re:
2020-05-06  5:52 Jiaxun Yang
@ 2020-05-06 17:17 ` Nick Desaulniers
0 siblings, 0 replies; 400+ messages in thread
From: Nick Desaulniers @ 2020-05-06 17:17 UTC (permalink / raw)
To: Jiaxun Yang
Cc: linux-mips, clang-built-linux, Maciej W . Rozycki, Fangrui Song,
Kees Cook, Nathan Chancellor, Thomas Bogendoerfer, Paul Burton,
Masahiro Yamada, Jouni Hogander, Kevin Darbyshire-Bryant,
Borislav Petkov, Heiko Carstens, LKML

On Tue, May 5, 2020 at 10:52 PM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote:
>
> Subject: [PATCH v6] MIPS: Truncate link address into 32bit for 32bit kernel
>
> while bfd will strip 64bit address into 32bit silently.
> To fix LLD build, we should truncate load address provided by platform
> into 32bit for 32bit kernel.
>
> Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> Reviewed-by: Kees Cook <keescook@chromium.org>
> Tested-by: Nathan Chancellor <natechancellor@gmail.com>
> Cc: Maciej W. Rozycki <macro@linux-mips.org>

Cool, this revision looks a bit simpler. Thanks for chasing this.

> ---
> V2: Take MaskRay's shell magic.
>
> V3: After spent an hour on dealing with special character issue in
> Makefile, I gave up to do shell hacks and write a util in C instead.
> Thanks Maciej for pointing out Makefile variable problem.
>
> v4: Finally we managed to find a Makefile method to do it properly
> thanks to Kees. As it's too far from the initial version, I removed
> Review & Test tag from Nick and Fangrui and Cc instead.
>
> v5: Care vmlinuz as well.
>
> ---
>  arch/mips/Makefile                 | 13 ++++++++++++-
>  arch/mips/boot/compressed/Makefile |  2 +-
>  arch/mips/kernel/vmlinux.lds.S     |  2 +-
>  3 files changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/arch/mips/Makefile b/arch/mips/Makefile
> index e1c44aed8156..68c0f22fefc0 100644
> --- a/arch/mips/Makefile
> +++ b/arch/mips/Makefile
> @@ -288,12 +288,23 @@ ifdef CONFIG_64BIT
>    endif
>  endif
>
> +# When linking a 32-bit executable the LLVM linker cannot cope with a
> +# 32-bit load address that has been sign-extended to 64 bits.  Simply
> +# remove the upper 32 bits then, as it is safe to do so with other
> +ifdef CONFIG_64BIT
> +else
> +endif
> +
>  KBUILD_AFLAGS  += \$(cflags-y)
>  KBUILD_CFLAGS  += \$(cflags-y)
>  KBUILD_CPPFLAGS += -DDATAOFFSET=\$(if \$(dataoffset-y),\$(dataoffset-y),0)
>
>                   PLATFORM="\$(platform-y)" \
>                   ITS_INPUTS="\$(its-y)"
> diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile
> index 0df0ee8a298d..3d391256ab7e 100644
> --- a/arch/mips/boot/compressed/Makefile
> +++ b/arch/mips/boot/compressed/Makefile
> @@ -90,7 +90,7 @@ ifneq (\$(zload-y),)
>  else
>  endif
>
> diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
> index a5f00ec73ea6..5226cd8e4bee 100644
> --- a/arch/mips/kernel/vmlinux.lds.S
> +++ b/arch/mips/kernel/vmlinux.lds.S
> @@ -55,7 +55,7 @@ SECTIONS
>         /* . = 0xa800000000300000; */
>         . = 0xffffffff80300000;
>  #endif
>         _text = .;      /* Text and read-only data */
>         .text : {
>
> --

--
Thanks,
~Nick Desaulniers

```* Re:
@ 2020-03-27 15:53 ` Lee Duncan
0 siblings, 0 replies; 400+ messages in thread
From: Lee Duncan @ 2020-03-27 15:53 UTC (permalink / raw)
To: chenanqing, linux-kernel, linux-scsi, open-iscsi, ceph-devel,
martin.petersen, jejb, cleech

On 3/27/20 2:20 AM, chenanqing@oppo.com wrote:
> From: Chen Anqing <chenanqing@oppo.com>
> To: Lee Duncan <lduncan@suse.com>
> Cc: Chris Leech <cleech@redhat.com>,
>         "James E . J . Bottomley" <jejb@linux.ibm.com>,
>         "Martin K . Petersen" <martin.petersen@oracle.com>,
>         ceph-devel@vger.kernel.org,
>         linux-scsi@vger.kernel.org,
>         linux-kernel@vger.kernel.org,
>         chenanqing@oppo.com
> Subject: [PATCH] scsi: libiscsi: we should take compound page into account also
> Date: Fri, 27 Mar 2020 05:20:01 -0400
> Message-Id: <20200327092001.56879-1-chenanqing@oppo.com>
> X-Mailer: git-send-email 2.18.2
>
> the patch is occur at a real crash,which slab is
> come from a compound page,so we need take the compound page
> into account also.
> fixed commit 08b11eaccfcf ("scsi: libiscsi: fall back to
> sendmsg for slab pages").
>
> Signed-off-by: Chen Anqing <chenanqing@oppo.com>
> ---
>  drivers/scsi/libiscsi_tcp.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
> index 6ef93c7af954..98304e5e1f6f 100644
> --- a/drivers/scsi/libiscsi_tcp.c
> +++ b/drivers/scsi/libiscsi_tcp.c
> @@ -128,7 +128,8 @@ static void iscsi_tcp_segment_map(struct iscsi_segment *segment, int recv)
>          * coalescing neighboring slab objects into a single frag which
>          * triggers one of hardened usercopy checks.
>          */
> -       if (!recv && page_count(sg_page(sg)) >= 1 && !PageSlab(sg_page(sg)))
> +       if (!recv && page_count(sg_page(sg)) >= 1 &&
>                 return;
>
>         if (recv) {
> --
> 2.18.2
>

This is missing a proper subject ...

```* Re:
@ 2020-03-27  8:59 ` Ilya Dryomov
0 siblings, 0 replies; 400+ messages in thread
From: Ilya Dryomov @ 2020-03-27  8:59 UTC (permalink / raw)
To: chenanqing; +Cc: LKML, netdev, Ceph Development, kuba, Sage Weil, Jeff Layton

On Fri, Mar 27, 2020 at 9:36 AM <chenanqing@oppo.com> wrote:
>
> From: Chen Anqing <chenanqing@oppo.com>
> To: Ilya Dryomov <idryomov@gmail.com>
> Cc: Jeff Layton <jlayton@kernel.org>,
>         Sage Weil <sage@redhat.com>,
>         Jakub Kicinski <kuba@kernel.org>,
>         ceph-devel@vger.kernel.org,
>         netdev@vger.kernel.org,
>         linux-kernel@vger.kernel.org,
>         chenanqing@oppo.com
> Subject: [PATCH] libceph: we should take compound page into account also
> Date: Fri, 27 Mar 2020 04:36:30 -0400
> Message-Id: <20200327083630.36296-1-chenanqing@oppo.com>
> X-Mailer: git-send-email 2.18.2
>
> the patch is occur at a real crash,which slab is
> come from a compound page,so we need take the compound page
> into account also.
> fixed commit 7e241f647dc7 ("libceph: fall back to sendmsg for slab pages")'
>
> Signed-off-by: Chen Anqing <chenanqing@oppo.com>
> ---
>  net/ceph/messenger.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index f8ca5edc5f2c..e08c1c334cd9 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -582,7 +582,7 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
>          * coalescing neighboring slab objects into a single frag which
>          * triggers one of hardened usercopy checks.
>          */
> -       if (page_count(page) >= 1 && !PageSlab(page))
> +       if (page_count(page) >= 1 && !PageSlab(compound_head(page)))
>                 sendpage = sock->ops->sendpage;
>         else
>                 sendpage = sock_no_sendpage;

Hi Chen,

AFAICT compound pages should already be taken into account, because
PageSlab is defined as:

__PAGEFLAG(Slab, slab, PF_NO_TAIL)

#define __PAGEFLAG(uname, lname, policy)                       \
TESTPAGEFLAG(uname, lname, policy)                         \
__SETPAGEFLAG(uname, lname, policy)                        \
__CLEARPAGEFLAG(uname, lname, policy)

#define TESTPAGEFLAG(uname, lname, policy)                     \
static __always_inline int Page##uname(struct page *page)      \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }

and PF_NO_TAIL policy is defined as:

#define PF_NO_TAIL(page, enforce) ({                        \
VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);     \

So compound_head() is called behind the scenes.

Could you please explain what crash did you observe in more detail?
Perhaps you backported this patch to an older kernel?

Thanks,

Ilya

```* Re:
2020-03-04 15:15   ` Re: Lee Jones
@ 2020-03-04 18:00     ` Matthias Brugger
0 siblings, 0 replies; 400+ messages in thread
From: Matthias Brugger @ 2020-03-04 18:00 UTC (permalink / raw)
To: Lee Jones
Cc: Gene Chen, linux-arm-kernel, linux-mediatek, linux-kernel,
gene_chen, Wilma.Wu, shufan_lee, cy_huang

On 04/03/2020 16:15, Lee Jones wrote:
> On Wed, 04 Mar 2020, Matthias Brugger wrote:
>
>> Please resend with appropiate commit message.
>
> Please refrain from top-posting and don't forget to snip.
>

It's difficult to write something below a missing subject line without
top-posting. ;)

Sorry for forgetting to snip.

Regards,
Matthias

```* Re:
2020-03-04 14:56 ` Matthias Brugger
@ 2020-03-04 15:15   ` Lee Jones
2020-03-04 18:00     ` Re: Matthias Brugger
From: Lee Jones @ 2020-03-04 15:15 UTC (permalink / raw)
To: Matthias Brugger
Cc: Gene Chen, linux-arm-kernel, linux-mediatek, linux-kernel,
gene_chen, Wilma.Wu, shufan_lee, cy_huang

On Wed, 04 Mar 2020, Matthias Brugger wrote:

> Please resend with appropiate commit message.

Please refrain from top-posting and don't forget to snip.

> On 03/03/2020 16:27, Gene Chen wrote:
> > Add mfd driver for mt6360 pmic chip include

Looks like your formatting is off.

Best practice is to use `git send-email`.

> > Battery Charger/USB_PD/Flash LED/RGB LED/LDO/Buck
> >
> > Signed-off-by: Gene Chen <gene_chen@richtek.com
> > ---
> >  drivers/mfd/Kconfig        |  12 ++
> >  drivers/mfd/Makefile       |   1 +
> >  drivers/mfd/mt6360-core.c  | 425 +++++++++++++++++++++++++++++++++++++++++++++
> >  include/linux/mfd/mt6360.h | 240 +++++++++++++++++++++++++
> >  4 files changed, 678 insertions(+)
> >  create mode 100644 drivers/mfd/mt6360-core.c
> >  create mode 100644 include/linux/mfd/mt6360.h

--
Lee Jones [李琼斯]
Linaro.org │ Open source software for ARM SoCs

```* Re:
2020-03-03 15:27 Gene Chen
@ 2020-03-04 14:56 ` Matthias Brugger
2020-03-04 15:15   ` Re: Lee Jones
From: Matthias Brugger @ 2020-03-04 14:56 UTC (permalink / raw)
To: Gene Chen, lee.jones
Cc: linux-arm-kernel, linux-mediatek, linux-kernel, gene_chen,
Wilma.Wu, shufan_lee, cy_huang

Please resend with appropiate commit message.

On 03/03/2020 16:27, Gene Chen wrote:
> Add mfd driver for mt6360 pmic chip include
> Battery Charger/USB_PD/Flash LED/RGB LED/LDO/Buck
>
> Signed-off-by: Gene Chen <gene_chen@richtek.com
> ---
>  drivers/mfd/Kconfig        |  12 ++
>  drivers/mfd/Makefile       |   1 +
>  drivers/mfd/mt6360-core.c  | 425 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/mfd/mt6360.h | 240 +++++++++++++++++++++++++
>  4 files changed, 678 insertions(+)
>  create mode 100644 drivers/mfd/mt6360-core.c
>  create mode 100644 include/linux/mfd/mt6360.h
>
> changelogs between v1 & v2
> - include missing header file
>
> changelogs between v2 & v3
>
> changelogs between v3 & v4
> - fix Kconfig description
> - replace mt6360_pmu_info with mt6360_pmu_data
> - replace probe with probe_new
> - remove unnecessary irq_chip variable
> - remove annotation
> - replace MT6360_MFD_CELL with OF_MFD_CELL
>
> changelogs between v4 & v5
> - remove unnecessary parse dt function
> - use devm_i2c_new_dummy_device
>
> changelogs between v5 & v6
> - review return value
> - remove i2c id_table
> - use GPL license v2
>
> changelogs between v6 & v7
> - replace MT6360_REGMAP_IRQ_REG by REGMAP_IRQ_REG_LINE
> - remove mt6360-private.h
>
> changelogs between v7 & v8
> - fix kbuild auto reboot by include interrupt header
>
> diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
> index 2b20329..0f8c341 100644
> --- a/drivers/mfd/Kconfig
> +++ b/drivers/mfd/Kconfig
> @@ -857,6 +857,18 @@ config MFD_MAX8998
>  	  additional drivers must be enabled in order to use the functionality
>  	  of the device.
>
> +config MFD_MT6360
> +	tristate "Mediatek MT6360 SubPMIC"
> +	select MFD_CORE
> +	select REGMAP_I2C
> +	select REGMAP_IRQ
> +	depends on I2C
> +	help
> +	  Say Y here to enable MT6360 PMU/PMIC/LDO functional support.
> +	  PMU part includes Charger, Flashlight, RGB LED
> +	  PMIC part includes 2-channel BUCKs and 2-channel LDOs
> +	  LDO part includes 4-channel LDOs
> +
>  config MFD_MT6397
>  	tristate "MediaTek MT6397 PMIC Support"
>  	select MFD_CORE
> diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
> index b83f172..8c35816 100644
> --- a/drivers/mfd/Makefile
> +++ b/drivers/mfd/Makefile
> @@ -238,6 +238,7 @@ obj-\$(CONFIG_INTEL_SOC_PMIC)	+= intel-soc-pmic.o
>  obj-\$(CONFIG_INTEL_SOC_PMIC_BXTWC)	+= intel_soc_pmic_bxtwc.o
>  obj-\$(CONFIG_INTEL_SOC_PMIC_CHTWC)	+= intel_soc_pmic_chtwc.o
>  obj-\$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI)	+= intel_soc_pmic_chtdc_ti.o
> +obj-\$(CONFIG_MFD_MT6360)	+= mt6360-core.o
>  mt6397-objs	:= mt6397-core.o mt6397-irq.o
>  obj-\$(CONFIG_MFD_MT6397)	+= mt6397.o
>  obj-\$(CONFIG_INTEL_SOC_PMIC_MRFLD)	+= intel_soc_pmic_mrfld.o
> diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
> new file mode 100644
> index 0000000..d1168f8
> --- /dev/null
> +++ b/drivers/mfd/mt6360-core.c
> @@ -0,0 +1,425 @@
> +/*
> + * Copyright (c) 2019 MediaTek Inc.
> + *
> + * Author: Gene Chen <gene_chen@richtek.com>
> + */
> +
> +#include <linux/i2c.h>
> +#include <linux/init.h>
> +#include <linux/interrupt.h>
> +#include <linux/kernel.h>
> +#include <linux/mfd/core.h>
> +#include <linux/module.h>
> +#include <linux/of_irq.h>
> +#include <linux/of_platform.h>
> +#include <linux/version.h>
> +
> +#include <linux/mfd/mt6360.h>
> +
> +/* reg 0 -> 0 ~ 7 */
> +#define MT6360_CHG_TREG_EVT		(4)
> +#define MT6360_CHG_AICR_EVT		(5)
> +#define MT6360_CHG_MIVR_EVT		(6)
> +#define MT6360_PWR_RDY_EVT		(7)
> +/* REG 1 -> 8 ~ 15 */
> +#define MT6360_CHG_BATSYSUV_EVT		(9)
> +#define MT6360_FLED_CHG_VINOVP_EVT	(11)
> +#define MT6360_CHG_VSYSUV_EVT		(12)
> +#define MT6360_CHG_VSYSOV_EVT		(13)
> +#define MT6360_CHG_VBATOV_EVT		(14)
> +#define MT6360_CHG_VBUSOV_EVT		(15)
> +/* REG 2 -> 16 ~ 23 */
> +/* REG 3 -> 24 ~ 31 */
> +#define MT6360_WD_PMU_DET		(25)
> +#define MT6360_WD_PMU_DONE		(26)
> +#define MT6360_CHG_TMRI			(27)
> +#define MT6360_CHG_RVPI			(30)
> +#define MT6360_OTPI			(31)
> +/* REG 4 -> 32 ~ 39 */
> +#define MT6360_CHG_AICCMEASL		(32)
> +#define MT6360_CHGDET_DONEI		(34)
> +#define MT6360_WDTMRI			(35)
> +#define MT6360_SSFINISHI		(36)
> +#define MT6360_CHG_RECHGI		(37)
> +#define MT6360_CHG_TERMI		(38)
> +#define MT6360_CHG_IEOCI		(39)
> +/* REG 5 -> 40 ~ 47 */
> +#define MT6360_PUMPX_DONEI		(40)
> +#define MT6360_TYPEC_OTP_EVT		(42)
> +#define MT6360_BST_BATUVI		(45)
> +#define MT6360_BST_VBUSOVI		(46)
> +#define MT6360_BST_OLPI			(47)
> +/* REG 6 -> 48 ~ 55 */
> +#define MT6360_ATTACH_I			(48)
> +#define MT6360_DETACH_I			(49)
> +#define MT6360_QC30_STPDONE		(51)
> +#define MT6360_QC_VBUSDET_DONE		(52)
> +#define MT6360_HVDCP_DET		(53)
> +#define MT6360_CHGDETI			(54)
> +#define MT6360_DCDTI			(55)
> +/* REG 7 -> 56 ~ 63 */
> +#define MT6360_FOD_DONE_EVT		(56)
> +#define MT6360_FOD_OV_EVT		(57)
> +#define MT6360_CHRDET_UVP_EVT		(58)
> +#define MT6360_CHRDET_OVP_EVT		(59)
> +#define MT6360_CHRDET_EXT_EVT		(60)
> +#define MT6360_FOD_LR_EVT		(61)
> +#define MT6360_FOD_HR_EVT		(62)
> +#define MT6360_FOD_DISCHG_FAIL_EVT	(63)
> +/* REG 8 -> 64 ~ 71 */
> +#define MT6360_USBID_EVT		(64)
> +#define MT6360_APWDTRST_EVT		(65)
> +#define MT6360_EN_EVT			(66)
> +#define MT6360_QONB_RST_EVT		(67)
> +#define MT6360_MRSTB_EVT		(68)
> +#define MT6360_OTP_EVT			(69)
> +#define MT6360_VDDAOV_EVT		(70)
> +#define MT6360_SYSUV_EVT		(71)
> +/* REG 9 -> 72 ~ 79 */
> +#define MT6360_FLED_STRBPIN_EVT		(72)
> +#define MT6360_FLED_TORPIN_EVT		(73)
> +#define MT6360_FLED_TX_EVT		(74)
> +#define MT6360_FLED_LVF_EVT		(75)
> +#define MT6360_FLED2_SHORT_EVT		(78)
> +#define MT6360_FLED1_SHORT_EVT		(79)
> +/* REG 10 -> 80 ~ 87 */
> +#define MT6360_FLED2_STRB_EVT		(80)
> +#define MT6360_FLED1_STRB_EVT		(81)
> +#define MT6360_FLED2_STRB_TO_EVT	(82)
> +#define MT6360_FLED1_STRB_TO_EVT	(83)
> +#define MT6360_FLED2_TOR_EVT		(84)
> +#define MT6360_FLED1_TOR_EVT		(85)
> +/* REG 11 -> 88 ~ 95 */
> +/* REG 12 -> 96 ~ 103 */
> +#define MT6360_BUCK1_PGB_EVT		(96)
> +#define MT6360_BUCK1_OC_EVT		(100)
> +#define MT6360_BUCK1_OV_EVT		(101)
> +#define MT6360_BUCK1_UV_EVT		(102)
> +/* REG 13 -> 104 ~ 111 */
> +#define MT6360_BUCK2_PGB_EVT		(104)
> +#define MT6360_BUCK2_OC_EVT		(108)
> +#define MT6360_BUCK2_OV_EVT		(109)
> +#define MT6360_BUCK2_UV_EVT		(110)
> +/* REG 14 -> 112 ~ 119 */
> +#define MT6360_LDO1_OC_EVT		(113)
> +#define MT6360_LDO2_OC_EVT		(114)
> +#define MT6360_LDO3_OC_EVT		(115)
> +#define MT6360_LDO5_OC_EVT		(117)
> +#define MT6360_LDO6_OC_EVT		(118)
> +#define MT6360_LDO7_OC_EVT		(119)
> +/* REG 15 -> 120 ~ 127 */
> +#define MT6360_LDO1_PGB_EVT		(121)
> +#define MT6360_LDO2_PGB_EVT		(122)
> +#define MT6360_LDO3_PGB_EVT		(123)
> +#define MT6360_LDO5_PGB_EVT		(125)
> +#define MT6360_LDO6_PGB_EVT		(126)
> +#define MT6360_LDO7_PGB_EVT		(127)
> +
> +static const struct regmap_irq mt6360_pmu_irqs[] =  {
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_AICR_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_MIVR_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_PWR_RDY_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_BATSYSUV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED_CHG_VINOVP_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_VSYSUV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_VSYSOV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_VBATOV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_VBUSOV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_WD_PMU_DET, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_WD_PMU_DONE, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_TMRI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_RVPI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_OTPI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_AICCMEASL, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHGDET_DONEI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_WDTMRI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_SSFINISHI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_RECHGI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_TERMI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_IEOCI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_PUMPX_DONEI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_TYPEC_OTP_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BST_BATUVI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BST_VBUSOVI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BST_OLPI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_ATTACH_I, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_DETACH_I, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_QC30_STPDONE, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_QC_VBUSDET_DONE, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_HVDCP_DET, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHGDETI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_DCDTI, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FOD_DONE_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FOD_OV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHRDET_UVP_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHRDET_OVP_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_CHRDET_EXT_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FOD_LR_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FOD_HR_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FOD_DISCHG_FAIL_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_USBID_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_APWDTRST_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_EN_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_QONB_RST_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_MRSTB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_OTP_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_VDDAOV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_SYSUV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED_STRBPIN_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED_TORPIN_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED_TX_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED_LVF_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED2_SHORT_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED1_SHORT_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED2_STRB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED1_STRB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED2_STRB_TO_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED1_STRB_TO_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED2_TOR_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_FLED1_TOR_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK1_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK1_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK1_OV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK1_UV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK2_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK2_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK2_OV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_BUCK2_UV_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO1_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO2_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO3_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO5_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO6_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO7_OC_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO1_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO2_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO3_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO5_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO6_PGB_EVT, 8),
> +	REGMAP_IRQ_REG_LINE(MT6360_LDO7_PGB_EVT, 8),
> +};
> +
> +static int mt6360_pmu_handle_post_irq(void *irq_drv_data)
> +{
> +	struct mt6360_pmu_data *mpd = irq_drv_data;
> +
> +	return regmap_update_bits(mpd->regmap,
> +		MT6360_PMU_IRQ_SET, MT6360_IRQ_RETRIG, MT6360_IRQ_RETRIG);
> +}
> +
> +static struct regmap_irq_chip mt6360_pmu_irq_chip = {
> +	.irqs = mt6360_pmu_irqs,
> +	.num_irqs = ARRAY_SIZE(mt6360_pmu_irqs),
> +	.num_regs = MT6360_PMU_IRQ_REGNUM,
> +	.status_base = MT6360_PMU_CHG_IRQ1,
> +	.ack_base = MT6360_PMU_CHG_IRQ1,
> +	.use_ack = true,
> +	.handle_post_irq = mt6360_pmu_handle_post_irq,
> +};
> +
> +static const struct regmap_config mt6360_pmu_regmap_config = {
> +	.reg_bits = 8,
> +	.val_bits = 8,
> +	.max_register = MT6360_PMU_MAXREG,
> +};
> +
> +static const struct resource mt6360_adc_resources[] = {
> +};
> +
> +static const struct resource mt6360_chg_resources[] = {
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_TREG_EVT, "chg_treg_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_PWR_RDY_EVT, "pwr_rdy_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_BATSYSUV_EVT, "chg_batsysuv_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_VSYSUV_EVT, "chg_vsysuv_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_VSYSOV_EVT, "chg_vsysov_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_VBATOV_EVT, "chg_vbatov_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_VBUSOV_EVT, "chg_vbusov_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_AICCMEASL, "chg_aiccmeasl"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_WDTMRI, "wdtmri"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_RECHGI, "chg_rechgi"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_TERMI, "chg_termi"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHG_IEOCI, "chg_ieoci"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_PUMPX_DONEI, "pumpx_donei"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_ATTACH_I, "attach_i"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_CHRDET_EXT_EVT, "chrdet_ext_evt"),
> +};
> +
> +static const struct resource mt6360_led_resources[] = {
> +	DEFINE_RES_IRQ_NAMED(MT6360_FLED_CHG_VINOVP_EVT, "fled_chg_vinovp_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_FLED_LVF_EVT, "fled_lvf_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_FLED2_SHORT_EVT, "fled2_short_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_FLED1_SHORT_EVT, "fled1_short_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_FLED2_STRB_TO_EVT, "fled2_strb_to_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_FLED1_STRB_TO_EVT, "fled1_strb_to_evt"),
> +};
> +
> +static const struct resource mt6360_pmic_resources[] = {
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_PGB_EVT, "buck1_pgb_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_OC_EVT, "buck1_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_OV_EVT, "buck1_ov_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_UV_EVT, "buck1_uv_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_PGB_EVT, "buck2_pgb_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_OC_EVT, "buck2_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_OV_EVT, "buck2_ov_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_UV_EVT, "buck2_uv_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO6_OC_EVT, "ldo6_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO7_OC_EVT, "ldo7_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO6_PGB_EVT, "ldo6_pgb_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO7_PGB_EVT, "ldo7_pgb_evt"),
> +};
> +
> +static const struct resource mt6360_ldo_resources[] = {
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO1_OC_EVT, "ldo1_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO2_OC_EVT, "ldo2_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO3_OC_EVT, "ldo3_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO5_OC_EVT, "ldo5_oc_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO1_PGB_EVT, "ldo1_pgb_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO2_PGB_EVT, "ldo2_pgb_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO3_PGB_EVT, "ldo3_pgb_evt"),
> +	DEFINE_RES_IRQ_NAMED(MT6360_LDO5_PGB_EVT, "ldo5_pgb_evt"),
> +};
> +
> +static const struct mfd_cell mt6360_devs[] = {
> +		    NULL, 0, 0, "mediatek,mt6360_adc"),
> +	OF_MFD_CELL("mt6360_chg", mt6360_chg_resources,
> +		    NULL, 0, 0, "mediatek,mt6360_chg"),
> +	OF_MFD_CELL("mt6360_led", mt6360_led_resources,
> +		    NULL, 0, 0, "mediatek,mt6360_led"),
> +	OF_MFD_CELL("mt6360_pmic", mt6360_pmic_resources,
> +		    NULL, 0, 0, "mediatek,mt6360_pmic"),
> +	OF_MFD_CELL("mt6360_ldo", mt6360_ldo_resources,
> +		    NULL, 0, 0, "mediatek,mt6360_ldo"),
> +	OF_MFD_CELL("mt6360_tcpc", NULL,
> +		    NULL, 0, 0, "mediatek,mt6360_tcpc"),
> +};
> +
> +static const unsigned short mt6360_slave_addr[MT6360_SLAVE_MAX] = {
> +	MT6360_PMU_SLAVEID,
> +	MT6360_PMIC_SLAVEID,
> +	MT6360_LDO_SLAVEID,
> +	MT6360_TCPC_SLAVEID,
> +};
> +
> +static int mt6360_pmu_probe(struct i2c_client *client)
> +{
> +	struct mt6360_pmu_data *mpd;
> +	unsigned int reg_data;
> +	int i, ret;
> +
> +	mpd = devm_kzalloc(&client->dev, sizeof(*mpd), GFP_KERNEL);
> +	if (!mpd)
> +		return -ENOMEM;
> +
> +	mpd->dev = &client->dev;
> +	i2c_set_clientdata(client, mpd);
> +
> +	mpd->regmap = devm_regmap_init_i2c(client, &mt6360_pmu_regmap_config);
> +	if (IS_ERR(mpd->regmap)) {
> +		dev_err(&client->dev, "Failed to register regmap\n");
> +		return PTR_ERR(mpd->regmap);
> +	}
> +
> +	ret = regmap_read(mpd->regmap, MT6360_PMU_DEV_INFO, &reg_data);
> +	if (ret) {
> +		return ret;
> +	}
> +
> +	mpd->chip_rev = reg_data & CHIP_REV_MASK;
> +	if (mpd->chip_rev != CHIP_VEN_MT6360) {
> +		dev_err(&client->dev, "Device not supported\n");
> +		return -ENODEV;
> +	}
> +
> +	mt6360_pmu_irq_chip.irq_drv_data = mpd;
> +	ret = devm_regmap_add_irq_chip(&client->dev, mpd->regmap, client->irq,
> +				       IRQF_TRIGGER_FALLING, 0,
> +				       &mt6360_pmu_irq_chip, &mpd->irq_data);
> +	if (ret) {
> +		dev_err(&client->dev, "Failed to add Regmap IRQ Chip\n");
> +		return ret;
> +	}
> +
> +	mpd->i2c[0] = client;
> +	for (i = 1; i < MT6360_SLAVE_MAX; i++) {
> +		mpd->i2c[i] = devm_i2c_new_dummy_device(&client->dev,
> +		if (IS_ERR(mpd->i2c[i])) {
> +			dev_err(&client->dev,
> +				"Failed to get new dummy I2C device for address 0x%x",
> +			return PTR_ERR(mpd->i2c[i]);
> +		}
> +		i2c_set_clientdata(mpd->i2c[i], mpd);
> +	}
> +
> +	ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO,
> +				   mt6360_devs, ARRAY_SIZE(mt6360_devs), NULL,
> +				   0, regmap_irq_get_domain(mpd->irq_data));
> +	if (ret) {
> +		dev_err(&client->dev,
> +			"Failed to register subordinate devices\n");
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int __maybe_unused mt6360_pmu_suspend(struct device *dev)
> +{
> +	struct i2c_client *i2c = to_i2c_client(dev);
> +
> +	if (device_may_wakeup(dev))
> +		enable_irq_wake(i2c->irq);
> +
> +	return 0;
> +}
> +
> +static int __maybe_unused mt6360_pmu_resume(struct device *dev)
> +{
> +
> +	struct i2c_client *i2c = to_i2c_client(dev);
> +
> +	if (device_may_wakeup(dev))
> +		disable_irq_wake(i2c->irq);
> +
> +	return 0;
> +}
> +
> +static SIMPLE_DEV_PM_OPS(mt6360_pmu_pm_ops,
> +			 mt6360_pmu_suspend, mt6360_pmu_resume);
> +
> +static const struct of_device_id __maybe_unused mt6360_pmu_of_id[] = {
> +	{ .compatible = "mediatek,mt6360_pmu", },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(of, mt6360_pmu_of_id);
> +
> +static struct i2c_driver mt6360_pmu_driver = {
> +	.driver = {
> +		.pm = &mt6360_pmu_pm_ops,
> +		.of_match_table = of_match_ptr(mt6360_pmu_of_id),
> +	},
> +	.probe_new = mt6360_pmu_probe,
> +};
> +module_i2c_driver(mt6360_pmu_driver);
> +
> +MODULE_AUTHOR("Gene Chen <gene_chen@richtek.com>");
> +MODULE_DESCRIPTION("MT6360 PMU I2C Driver");
> diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h
> new file mode 100644
> index 0000000..c03e6d1
> --- /dev/null
> +++ b/include/linux/mfd/mt6360.h
> @@ -0,0 +1,240 @@
> +/*
> + * Copyright (c) 2019 MediaTek Inc.
> + */
> +
> +#ifndef __MT6360_H__
> +#define __MT6360_H__
> +
> +#include <linux/regmap.h>
> +
> +enum {
> +	MT6360_SLAVE_PMU = 0,
> +	MT6360_SLAVE_PMIC,
> +	MT6360_SLAVE_LDO,
> +	MT6360_SLAVE_TCPC,
> +	MT6360_SLAVE_MAX,
> +};
> +
> +#define MT6360_PMU_SLAVEID	(0x34)
> +#define MT6360_PMIC_SLAVEID	(0x1A)
> +#define MT6360_LDO_SLAVEID	(0x64)
> +#define MT6360_TCPC_SLAVEID	(0x4E)
> +
> +struct mt6360_pmu_data {
> +	struct i2c_client *i2c[MT6360_SLAVE_MAX];
> +	struct device *dev;
> +	struct regmap *regmap;
> +	struct regmap_irq_chip_data *irq_data;
> +	unsigned int chip_rev;
> +};
> +
> +/* PMU register defininition */
> +#define MT6360_PMU_DEV_INFO			(0x00)
> +#define MT6360_PMU_CORE_CTRL1			(0x01)
> +#define MT6360_PMU_RST1				(0x02)
> +#define MT6360_PMU_CRCEN			(0x03)
> +#define MT6360_PMU_RST_PAS_CODE1		(0x04)
> +#define MT6360_PMU_RST_PAS_CODE2		(0x05)
> +#define MT6360_PMU_CORE_CTRL2			(0x06)
> +#define MT6360_PMU_TM_PAS_CODE1			(0x07)
> +#define MT6360_PMU_TM_PAS_CODE2			(0x08)
> +#define MT6360_PMU_TM_PAS_CODE3			(0x09)
> +#define MT6360_PMU_TM_PAS_CODE4			(0x0A)
> +#define MT6360_PMU_IRQ_IND			(0x0B)
> +#define MT6360_PMU_IRQ_SET			(0x0D)
> +#define MT6360_PMU_SHDN_CTRL			(0x0E)
> +#define MT6360_PMU_TM_INF			(0x0F)
> +#define MT6360_PMU_I2C_CTRL			(0x10)
> +#define MT6360_PMU_CHG_CTRL1			(0x11)
> +#define MT6360_PMU_CHG_CTRL2			(0x12)
> +#define MT6360_PMU_CHG_CTRL3			(0x13)
> +#define MT6360_PMU_CHG_CTRL4			(0x14)
> +#define MT6360_PMU_CHG_CTRL5			(0x15)
> +#define MT6360_PMU_CHG_CTRL6			(0x16)
> +#define MT6360_PMU_CHG_CTRL7			(0x17)
> +#define MT6360_PMU_CHG_CTRL8			(0x18)
> +#define MT6360_PMU_CHG_CTRL9			(0x19)
> +#define MT6360_PMU_CHG_CTRL10			(0x1A)
> +#define MT6360_PMU_CHG_CTRL11			(0x1B)
> +#define MT6360_PMU_CHG_CTRL12			(0x1C)
> +#define MT6360_PMU_CHG_CTRL13			(0x1D)
> +#define MT6360_PMU_CHG_CTRL14			(0x1E)
> +#define MT6360_PMU_CHG_CTRL15			(0x1F)
> +#define MT6360_PMU_CHG_CTRL16			(0x20)
> +#define MT6360_PMU_CHG_AICC_RESULT		(0x21)
> +#define MT6360_PMU_DEVICE_TYPE			(0x22)
> +#define MT6360_PMU_QC_CONTROL1			(0x23)
> +#define MT6360_PMU_QC_CONTROL2			(0x24)
> +#define MT6360_PMU_QC30_CONTROL1		(0x25)
> +#define MT6360_PMU_QC30_CONTROL2		(0x26)
> +#define MT6360_PMU_USB_STATUS1			(0x27)
> +#define MT6360_PMU_QC_STATUS1			(0x28)
> +#define MT6360_PMU_QC_STATUS2			(0x29)
> +#define MT6360_PMU_CHG_PUMP			(0x2A)
> +#define MT6360_PMU_CHG_CTRL17			(0x2B)
> +#define MT6360_PMU_CHG_CTRL18			(0x2C)
> +#define MT6360_PMU_CHRDET_CTRL1			(0x2D)
> +#define MT6360_PMU_CHRDET_CTRL2			(0x2E)
> +#define MT6360_PMU_DPDN_CTRL			(0x2F)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL1		(0x30)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL2		(0x31)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL3		(0x32)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL4		(0x33)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL5		(0x34)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL6		(0x35)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL7		(0x36)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL8		(0x37)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL9		(0x38)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL10		(0x39)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL11		(0x3A)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL12		(0x3B)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL13		(0x3C)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL14		(0x3D)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL15		(0x3E)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL16		(0x3F)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL17		(0x40)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL18		(0x41)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL19		(0x42)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL20		(0x43)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL21		(0x44)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL22		(0x45)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL23		(0x46)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL24		(0x47)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL25		(0x48)
> +#define MT6360_PMU_BC12_CTRL			(0x49)
> +#define MT6360_PMU_CHG_STAT			(0x4A)
> +#define MT6360_PMU_RESV1			(0x4B)
> +#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEH	(0x4E)
> +#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEL	(0x4F)
> +#define MT6360_PMU_TYPEC_OTP_HYST_TH		(0x50)
> +#define MT6360_PMU_TYPEC_OTP_CTRL		(0x51)
> +#define MT6360_PMU_IMID_BACKBST_ON		(0x54)
> +#define MT6360_PMU_IMID_BACKBST_OFF		(0x55)
> +#define MT6360_PMU_BAT_OVP_TH_SEL_CODEH		(0x5F)
> +#define MT6360_PMU_BAT_OVP_TH_SEL_CODEL		(0x60)
> +#define MT6360_PMU_CHG_CTRL19			(0x61)
> +#define MT6360_PMU_VDDASUPPLY			(0x62)
> +#define MT6360_PMU_BC12_MANUAL			(0x63)
> +#define MT6360_PMU_CHGDET_FUNC			(0x64)
> +#define MT6360_PMU_FOD_CTRL			(0x65)
> +#define MT6360_PMU_CHG_CTRL20			(0x66)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL26		(0x67)
> +#define MT6360_PMU_CHG_HIDDEN_CTRL27		(0x68)
> +#define MT6360_PMU_RESV2			(0x69)
> +#define MT6360_PMU_USBID_CTRL1			(0x6D)
> +#define MT6360_PMU_USBID_CTRL2			(0x6E)
> +#define MT6360_PMU_USBID_CTRL3			(0x6F)
> +#define MT6360_PMU_FLED_CFG			(0x70)
> +#define MT6360_PMU_RESV3			(0x71)
> +#define MT6360_PMU_FLED1_CTRL			(0x72)
> +#define MT6360_PMU_FLED_STRB_CTRL		(0x73)
> +#define MT6360_PMU_FLED1_STRB_CTRL2		(0x74)
> +#define MT6360_PMU_FLED1_TOR_CTRL		(0x75)
> +#define MT6360_PMU_FLED2_CTRL			(0x76)
> +#define MT6360_PMU_RESV4			(0x77)
> +#define MT6360_PMU_FLED2_STRB_CTRL2		(0x78)
> +#define MT6360_PMU_FLED2_TOR_CTRL		(0x79)
> +#define MT6360_PMU_FLED_VMIDTRK_CTRL1		(0x7A)
> +#define MT6360_PMU_FLED_VMID_RTM		(0x7B)
> +#define MT6360_PMU_FLED_VMIDTRK_CTRL2		(0x7C)
> +#define MT6360_PMU_FLED_PWSEL			(0x7D)
> +#define MT6360_PMU_FLED_EN			(0x7E)
> +#define MT6360_PMU_FLED_Hidden1			(0x7F)
> +#define MT6360_PMU_RGB_EN			(0x80)
> +#define MT6360_PMU_RGB1_ISNK			(0x81)
> +#define MT6360_PMU_RGB2_ISNK			(0x82)
> +#define MT6360_PMU_RGB3_ISNK			(0x83)
> +#define MT6360_PMU_RGB_ML_ISNK			(0x84)
> +#define MT6360_PMU_RGB1_DIM			(0x85)
> +#define MT6360_PMU_RGB2_DIM			(0x86)
> +#define MT6360_PMU_RGB3_DIM			(0x87)
> +#define MT6360_PMU_RESV5			(0x88)
> +#define MT6360_PMU_RGB12_Freq			(0x89)
> +#define MT6360_PMU_RGB34_Freq			(0x8A)
> +#define MT6360_PMU_RGB1_Tr			(0x8B)
> +#define MT6360_PMU_RGB1_Tf			(0x8C)
> +#define MT6360_PMU_RGB1_TON_TOFF		(0x8D)
> +#define MT6360_PMU_RGB2_Tr			(0x8E)
> +#define MT6360_PMU_RGB2_Tf			(0x8F)
> +#define MT6360_PMU_RGB2_TON_TOFF		(0x90)
> +#define MT6360_PMU_RGB3_Tr			(0x91)
> +#define MT6360_PMU_RGB3_Tf			(0x92)
> +#define MT6360_PMU_RGB3_TON_TOFF		(0x93)
> +#define MT6360_PMU_RGB_Hidden_CTRL1		(0x94)
> +#define MT6360_PMU_RGB_Hidden_CTRL2		(0x95)
> +#define MT6360_PMU_RESV6			(0x97)
> +#define MT6360_PMU_SPARE1			(0x9A)
> +#define MT6360_PMU_SPARE2			(0xA0)
> +#define MT6360_PMU_SPARE3			(0xB0)
> +#define MT6360_PMU_SPARE4			(0xC0)
> +#define MT6360_PMU_CHG_IRQ1			(0xD0)
> +#define MT6360_PMU_CHG_IRQ2			(0xD1)
> +#define MT6360_PMU_CHG_IRQ3			(0xD2)
> +#define MT6360_PMU_CHG_IRQ4			(0xD3)
> +#define MT6360_PMU_CHG_IRQ5			(0xD4)
> +#define MT6360_PMU_CHG_IRQ6			(0xD5)
> +#define MT6360_PMU_QC_IRQ			(0xD6)
> +#define MT6360_PMU_FOD_IRQ			(0xD7)
> +#define MT6360_PMU_BASE_IRQ			(0xD8)
> +#define MT6360_PMU_FLED_IRQ1			(0xD9)
> +#define MT6360_PMU_FLED_IRQ2			(0xDA)
> +#define MT6360_PMU_RGB_IRQ			(0xDB)
> +#define MT6360_PMU_BUCK1_IRQ			(0xDC)
> +#define MT6360_PMU_BUCK2_IRQ			(0xDD)
> +#define MT6360_PMU_LDO_IRQ1			(0xDE)
> +#define MT6360_PMU_LDO_IRQ2			(0xDF)
> +#define MT6360_PMU_CHG_STAT1			(0xE0)
> +#define MT6360_PMU_CHG_STAT2			(0xE1)
> +#define MT6360_PMU_CHG_STAT3			(0xE2)
> +#define MT6360_PMU_CHG_STAT4			(0xE3)
> +#define MT6360_PMU_CHG_STAT5			(0xE4)
> +#define MT6360_PMU_CHG_STAT6			(0xE5)
> +#define MT6360_PMU_QC_STAT			(0xE6)
> +#define MT6360_PMU_FOD_STAT			(0xE7)
> +#define MT6360_PMU_BASE_STAT			(0xE8)
> +#define MT6360_PMU_FLED_STAT1			(0xE9)
> +#define MT6360_PMU_FLED_STAT2			(0xEA)
> +#define MT6360_PMU_RGB_STAT			(0xEB)
> +#define MT6360_PMU_BUCK1_STAT			(0xEC)
> +#define MT6360_PMU_BUCK2_STAT			(0xED)
> +#define MT6360_PMU_LDO_STAT1			(0xEE)
> +#define MT6360_PMU_LDO_STAT2			(0xEF)
> +
> +/* MT6360_PMU_IRQ_SET */
> +#define MT6360_PMU_IRQ_REGNUM	(MT6360_PMU_LDO_IRQ2 - MT6360_PMU_CHG_IRQ1 + 1)
> +#define MT6360_IRQ_RETRIG	BIT(2)
> +
> +#define CHIP_VEN_MT6360				(0x50)
> +
> +#endif /* __MT6360_H__ */
>

```* Re:
2020-02-12  9:30 ` Jarkko Nikula
@ 2020-02-12 10:24   ` Andy Shevchenko
0 siblings, 0 replies; 400+ messages in thread
From: Andy Shevchenko @ 2020-02-12 10:24 UTC (permalink / raw)
To: Jarkko Nikula
Cc: Rajat Jain, Daniel Mack, Haojian Zhuang, Robert Jarzmik,
Mark Brown, linux-arm-kernel, linux-spi, linux-kernel,
Evan Green, rajatxjain, evgreen, shobhit.srivastava,
porselvan.muthukrishnan

On Wed, Feb 12, 2020 at 11:30:51AM +0200, Jarkko Nikula wrote:
> On 2/12/20 12:34 AM, Rajat Jain wrote:

> This patch subject is missing from mail subject.

> I'm thinking can this be done only once after resume and may other LPSS
> blocks need the same? I.e. should this be done in drivers/mfd/intel-lpss.c?

On resume we restore the previously saved context, can we be sure that values
we saved during suspend are correct?

If above won't show any issue, it might be best place to have this quirk
applied in intel_lpss_suspend() / intel_lpss_resume() callbacks as Jarkko
suggested.

--
With Best Regards,
Andy Shevchenko

```* Re:
2020-02-11 22:34 Rajat Jain
@ 2020-02-12  9:30 ` Jarkko Nikula
2020-02-12 10:24   ` Re: Andy Shevchenko
From: Jarkko Nikula @ 2020-02-12  9:30 UTC (permalink / raw)
To: Rajat Jain, Daniel Mack, Haojian Zhuang, Robert Jarzmik,
Mark Brown, linux-arm-kernel, linux-spi, linux-kernel
Cc: Evan Green, rajatxjain, evgreen, shobhit.srivastava,
porselvan.muthukrishnan, Andy Shevchenko

Hi

+ Andy

On 2/12/20 12:34 AM, Rajat Jain wrote:
> From: Evan Green <evgreen@chromium.org>
>
> Date: Wed, 29 Jan 2020 13:54:16 -0800
> Subject: [PATCH] spi: pxa2xx: Add CS control clock quirk
>
This patch subject is missing from mail subject.

> In some circumstances on Intel LPSS controllers, toggling the LPSS
> CS control register doesn't actually cause the CS line to toggle.
> This seems to be failure of dynamic clock gating that occurs after
> going through a suspend/resume transition, where the controller
> is sent through a reset transition. This ruins SPI transactions
> that either rely on delay_usecs, or toggle the CS line without
> sending data.
>
> Whenever CS is toggled, momentarily set the clock gating register
> to "Force On" to poke the controller into acting on CS.
>
Could you share the test case how to trigger this? What's the platform
here? I'd like to check does this reproduce on other Intel LPSS
platforms so is there need to add quirk for them too.

> Signed-off-by: Evan Green <evgreen@chromium.org>
> ---
>   drivers/spi/spi-pxa2xx.c | 23 +++++++++++++++++++++++
>   1 file changed, 23 insertions(+)
>
> diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
> index 4c7a71f0fb3e..2e318158fca9 100644
> --- a/drivers/spi/spi-pxa2xx.c
> +++ b/drivers/spi/spi-pxa2xx.c
> @@ -70,6 +70,10 @@ MODULE_ALIAS("platform:pxa2xx-spi");
>   #define LPSS_CAPS_CS_EN_SHIFT			9
>   #define LPSS_CAPS_CS_EN_MASK			(0xf << LPSS_CAPS_CS_EN_SHIFT)
>
> +#define LPSS_PRIV_CLOCK_GATE 0x38
> +#define LPSS_PRIV_CLOCK_GATE_CLK_CTL_FORCE_ON 0x3
> +
>   struct lpss_config {
>   	/* LPSS offset from drv_data->ioaddr */
>   	unsigned offset;
> @@ -86,6 +90,8 @@ struct lpss_config {
>   	unsigned cs_sel_shift;
>   	unsigned cs_num;
> +	/* Quirks */
> +	unsigned cs_clk_stays_gated : 1;
>   };
>
>   /* Keep these sorted with enum pxa_ssp_type */
> @@ -156,6 +162,7 @@ static const struct lpss_config lpss_platforms[] = {
>   		.tx_threshold_hi = 56,
>   		.cs_sel_shift = 8,
>   		.cs_sel_mask = 3 << 8,
> +		.cs_clk_stays_gated = true,
>   	},
>   };
>
> @@ -383,6 +390,22 @@ static void lpss_ssp_cs_control(struct spi_device *spi, bool enable)
>   	else
>   		value |= LPSS_CS_CONTROL_CS_HIGH;
>   	__lpss_ssp_write_priv(drv_data, config->reg_cs_ctrl, value);
> +	if (config->cs_clk_stays_gated) {
> +		u32 clkgate;
> +
> +		/*
> +		 * Changing CS alone when dynamic clock gating is on won't
> +		 * actually flip CS at that time. This ruins SPI transfers
> +		 * that specify delays, or have no data. Toggle the clock mode
> +		 * to force on briefly to poke the CS pin to move.
> +		 */
> +		clkgate = __lpss_ssp_read_priv(drv_data, LPSS_PRIV_CLOCK_GATE);
> +		value = (clkgate & ~LPSS_PRIV_CLOCK_GATE_CLK_CTL_MASK) |
> +			LPSS_PRIV_CLOCK_GATE_CLK_CTL_FORCE_ON;
> +
> +		__lpss_ssp_write_priv(drv_data, LPSS_PRIV_CLOCK_GATE, value);
> +		__lpss_ssp_write_priv(drv_data, LPSS_PRIV_CLOCK_GATE, clkgate);
> +	}
>   }
>
I wonder is it enough to have this quick toggling only or is time or
actually number of clock cycles dependent? Now there is no delay between
but I'm thinking if it needs certain number cycles does this still work
when using low ssp_clk rates similar than in commit d0283eb2dbc1 ("spi:
pxa2xx: Add output control for multiple Intel LPSS chip selects").

I'm thinking can this be done only once after resume and may other LPSS
blocks need the same? I.e. should this be done in drivers/mfd/intel-lpss.c?

Jarkko

```* Re:
@ 2019-11-05  4:54   ` Chanwoo Choi
0 siblings, 0 replies; 400+ messages in thread
From: Chanwoo Choi @ 2019-11-05  4:54 UTC (permalink / raw)
To: Greg KH
Cc: linux-kernel, Chanwoo Choi (chanwoo@kernel.org),
함명주

Dear Greg,

I'm sorry for this pull request. I've missed the patch title.
I'll resend the pull-request.

Best Regards,
Chanwoo Choi

On 19. 11. 5. 오후 1:52, Chanwoo Choi wrote:
> Dear Greg,
>
> This is extcon-next pull request for v5.5. I add detailed description of
> this pull request on below. Please pull extcon with following updates.
>
> Detailed description for this pull request:
> 1. Clean up the and fix the minor issue of extcon provider driver
> - extcon-intel-cht-wc don't reset the USB data connection at probe time
>   in order to prevent the removing all devices from bus.
> - extcon-sm5502 reset the registers at proble time in order to
>   prevent the some stuck state. And remove the redundant variable
>   initialization.
>
> Best Regards,
> Chanwoo Choi
>
> The following changes since commit 54ecb8f7028c5eb3d740bb82b0f1d90f2df63c5c:
>
>   Linux 5.4-rc1 (2019-09-30 10:35:40 -0700)
>
> are available in the Git repository at:
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon.git tags/extcon-next-for-5.5
>
> for you to fetch changes up to ddd1bbbae486ff5913c8fc72c853dcea60713236:
>
>   extcon: sm5502: remove redundant assignment to variable cable_type (2019-10-31 13:47:42 +0900)
>
> ----------------------------------------------------------------
> Colin Ian King (1):
>       extcon: sm5502: remove redundant assignment to variable cable_type
>
> Stephan Gerhold (1):
>       extcon: sm5502: Reset registers during initialization
>
> Yauhen Kharuzhy (1):
>       extcon-intel-cht-wc: Don't reset USB data connection at probe
>
>  drivers/extcon/extcon-intel-cht-wc.c | 16 ++++++++++++++--
>  drivers/extcon/extcon-sm5502.c       |  6 +++++-
>  drivers/extcon/extcon-sm5502.h       |  2 ++
>  3 files changed, 21 insertions(+), 3 deletions(-)
>

--
Best Regards,
Chanwoo Choi
Samsung Electronics

```* Re:
@ 2019-10-27 21:36 Margaret Kwan Wing Han
0 siblings, 0 replies; 400+ messages in thread
From: Margaret Kwan Wing Han @ 2019-10-27 21:36 UTC (permalink / raw)
To: linux-kernel

I need a partner for a legal deal worth \$30,500,000 if interested reply me for
more details.

Regards,
Margaret Kwan Wing

```* RE:
@ 2019-09-24 19:49 Venkat Subbiah
0 siblings, 0 replies; 400+ messages in thread
From: Venkat Subbiah @ 2019-09-24 19:49 UTC (permalink / raw)
To: Lakshman, lauren, laurenb, Lavanya, Quang, Ida Maupin, Morgan,
linux kernel

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=us-ascii, Size: 962 bytes --]

Hello

http://omniummjc.com/freeze.php?dpeh=knot10401
Venkat

alpajqmka wmsyduztf Update: Back in stock. This won’t last long! qgaegip fuwzmnt isqhzstcc wqkejlrnt So the waters were healed unto this day, according to the saying of Elisha which he spake. hypyvykicy rthie oaqwwrrc mduzva wlsfg

hvpaxzcxyb itkzynwabe ktekwxu xhxgvai eaumbu ofnltk chdfggsb ufpefiuq

qzlfa swozrpya dasogbkjo srgeqy icdmg Then wrote Rehum the chancellor, and Shimshai the scribe, and the rest of their companions; the Dinaites, the Apharsathchites, the Tarpelites, the Apharsites, the Archevites, the Babylonians, the Susanchites, the Dehavites, and the Elamites, njhthme pbiyywck qbnonwfsv tefdeug shqdn

zenjr These deals are only for our readers, but we aren’t sure how long they’ll be available, so go ahead and sign up while you can. ofizsykm lpjnznu zffglnqq hmtrq

```* Re:
@ 2019-08-30 20:32   ` Arnd Bergmann
0 siblings, 0 replies; 400+ messages in thread
From: Arnd Bergmann @ 2019-08-30 20:32 UTC (permalink / raw)
To: Michal Suchanek
Cc: Linux FS-devel Mailing List, Benjamin Herrenschmidt,
Paul Mackerras, Michael Ellerman, Alexander Viro,
Greg Kroah-Hartman, Christian Brauner, Allison Randal,
Heiko Carstens, Thomas Gleixner, Firoz Khan, linuxppc-dev,
Linux Kernel Mailing List

On Fri, Aug 30, 2019 at 10:30 PM Michal Suchanek <msuchanek@suse.de> wrote:
>
> Subject: [PATCH] powerpc: Add back __ARCH_WANT_SYS_LLSEEK macro
>
> This partially reverts commit caf6f9c8a326 ("asm-generic: Remove
> unneeded __ARCH_WANT_SYS_LLSEEK macro")
>
> When CONFIG_COMPAT is disabled on ppc64 the kernel does not build.
>
> There is resistance to both removing the llseek syscall from the 64bit
> syscall tables and building the llseek interface unconditionally.
>
>
> Signed-off-by: Michal Suchanek <msuchanek@suse.de>

Reviewed-by: Arnd Bergmann <arnd@arndb.de>

```* Re:
@ 2019-05-29 19:54 ` Alex Williamson
0 siblings, 0 replies; 400+ messages in thread
From: Alex Williamson @ 2019-05-29 19:54 UTC (permalink / raw)
To: Thomas Meyer; +Cc: kvm, linux-kernel

On Sun, 26 May 2019 13:44:04 +0200
"Thomas Meyer" <thomas@m3y3r.de> wrote:

> From thomas@m3y3r.de Sun May 26 00:13:26 2019
>  computation
> To: alex.williamson@redhat.com, kvm@vger.kernel.org, linux-kernel@vger.kernel.org
> Content-Type: text/plain; charset="UTF-8"
> Mime-Version: 1.0
> Content-Transfer-Encoding: 8bit
> X-Patch: Cocci
> X-Mailer: DiffSplit
> Message-ID: <1558822461341-1674464153-1-diffsplit-thomas@m3y3r.de>
> References: <1558822461331-726613767-0-diffsplit-thomas@m3y3r.de>
> X-Serial-No: 1
>
> Use vma_pages function on vma object instead of explicit computation.
>
> Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
> ---
>
> @@ -161,7 +161,7 @@ static int vfio_pci_nvgpu_mmap(struct vf
>
>  	atomic_inc(&data->mm->mm_count);
>  	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
> -			(vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
> +			vma_pages(vma),
>  			data->gpu_hpa, &data->mem);
>

Besides the formatting of this patch, there's already a pending patch
with this same change:

https://lkml.org/lkml/2019/5/16/658

I think the original must have bounced from lkml due the encoding, but
I'll use that one since it came first, is slightly cleaner in wrapping
the line following the change, and already has Alexey's R-b.  Thanks,

Alex

```* Re: Re:
2019-05-24 19:00           ` Re: prakhar srivastava
@ 2019-05-24 19:15             ` Mimi Zohar
0 siblings, 0 replies; 400+ messages in thread
From: Mimi Zohar @ 2019-05-24 19:15 UTC (permalink / raw)
To: prakhar srivastava
Cc: Roberto Sassu, linux-integrity, linux-security-module,
linux-kernel, Matthew Garrett, vgoyal

On Fri, 2019-05-24 at 12:00 -0700, prakhar srivastava wrote:
> On Fri, May 24, 2019 at 11:09 AM Mimi Zohar <zohar@linux.ibm.com> wrote:
> >
> > > >> As mentioned, the first patch description should include a shell
> > > >> command for verifying the digest in the kexec boot command line
> > > >> measurement list record against /proc/cmdline.  This patch description
> > > >> should include a shell command showing how to verify the digest based
> > > >> on the new field.  Should the new field in the ascii measurement list
> > > >> be displayed as a string, not hex?
> > > >
> > > > We should define a new type. If the type is DATA_FMT_STRING, spaces are
> > > > replaced with '_'.
> > >
> > > Or better. Leave it as hex, otherwise there would be a parsing problem
> > > if there are spaces in the data for a field.
> >
> > After making a few changes, the measurement list contains the
> > following kexec-cmdline data:
> >
> > 10 edc32d1e3a5ba7272280a395b6fb56a5ef7c78c3 ima-buf
> > sha256:4f43b7db850e
> > 88c49dfeffd4b1eb4f021d78033dfb05b07e45eec8d0b45275
> > kexec-cmdline
> > 726f6f
> > 743d2f6465762f7364613420726f2072642e6c756b732e757569643d6c756b73
> > 2d6637
> > 3633643737632d653236622d343431642d613734652d62363633636334643832
> > 656120
> > 696d615f706f6c6963793d7463627c61707072616973655f746362
> >
> > There's probably a better shell command, but the following works to
> > verify the digest locally against the /proc/cmdline:
> >
> > \$ echo -n -e `cat /proc/cmdline | sed 's/^.*root=/root=/'` | sha256sum
> > 4f43b7db850e88c49dfeffd4b1eb4f021d78033dfb05b07e45eec8d0b4527f65  -
> >
> > If we leave the "buf" field as ascii-hex, what would the shell command
> > look like when verifying the digest based on the "buf" field?
> >
> > Mimi
> >
> To quickly test the sha256 i used the my /proc/cmdline
> ro quiet splash vt.handoff=1 ima_policy=tcb ima_appraise=fix
> ima_template_fmt=n-ng|d-ng|sig|buf ima_hash=sha256
>
> export \$VAL=
> 726f2071756965742073706c6173682076742e68616e646f66663d3120
> 696d615f706f6c6963793d74636220696d615f61707072616973653d666
> 97820696d615f74656d706c6174655f666d743d6e2d6e677c642d6e677c
> 7369677c62756620696d615f686173683d736861323536
>
> echo -n -e \$VAL | xxd -r -p | sha256sum
> 0d0b891bb730120d9593799cba1a7b3febf68f2bb81fb1304b0c963f95f6bc58  -
>
> I will run it through the code as well, but the shell command should work.

Yes, that works.

sudo cat /sys/kernel/security/integrity/ima/ascii_runtime_measurements
| grep  kexec-cmdline | cut -d' ' -f 6 | xxd -r -p | sha256sum

Mimi

```* Re: Re:
2019-05-24 18:09         ` Re: Mimi Zohar
@ 2019-05-24 19:00           ` prakhar srivastava
2019-05-24 19:15             ` Re: Mimi Zohar
From: prakhar srivastava @ 2019-05-24 19:00 UTC (permalink / raw)
To: Mimi Zohar
Cc: Roberto Sassu, linux-integrity, linux-security-module,
linux-kernel, Matthew Garrett, vgoyal

On Fri, May 24, 2019 at 11:09 AM Mimi Zohar <zohar@linux.ibm.com> wrote:
>
> > >> As mentioned, the first patch description should include a shell
> > >> command for verifying the digest in the kexec boot command line
> > >> measurement list record against /proc/cmdline.  This patch description
> > >> should include a shell command showing how to verify the digest based
> > >> on the new field.  Should the new field in the ascii measurement list
> > >> be displayed as a string, not hex?
> > >
> > > We should define a new type. If the type is DATA_FMT_STRING, spaces are
> > > replaced with '_'.
> >
> > Or better. Leave it as hex, otherwise there would be a parsing problem
> > if there are spaces in the data for a field.
>
> After making a few changes, the measurement list contains the
> following kexec-cmdline data:
>
> 10 edc32d1e3a5ba7272280a395b6fb56a5ef7c78c3 ima-buf
> sha256:4f43b7db850e
> 88c49dfeffd4b1eb4f021d78033dfb05b07e45eec8d0b45275
> kexec-cmdline
> 726f6f
> 743d2f6465762f7364613420726f2072642e6c756b732e757569643d6c756b73
> 2d6637
> 3633643737632d653236622d343431642d613734652d62363633636334643832
> 656120
> 696d615f706f6c6963793d7463627c61707072616973655f746362
>
> There's probably a better shell command, but the following works to
> verify the digest locally against the /proc/cmdline:
>
> \$ echo -n -e `cat /proc/cmdline | sed 's/^.*root=/root=/'` | sha256sum
> 4f43b7db850e88c49dfeffd4b1eb4f021d78033dfb05b07e45eec8d0b4527f65  -
>
> If we leave the "buf" field as ascii-hex, what would the shell command
> look like when verifying the digest based on the "buf" field?
>
> Mimi
>
To quickly test the sha256 i used the my /proc/cmdline
ro quiet splash vt.handoff=1 ima_policy=tcb ima_appraise=fix
ima_template_fmt=n-ng|d-ng|sig|buf ima_hash=sha256

export \$VAL=
726f2071756965742073706c6173682076742e68616e646f66663d3120
696d615f706f6c6963793d74636220696d615f61707072616973653d666
97820696d615f74656d706c6174655f666d743d6e2d6e677c642d6e677c
7369677c62756620696d615f686173683d736861323536

echo -n -e \$VAL | xxd -r -p | sha256sum
0d0b891bb730120d9593799cba1a7b3febf68f2bb81fb1304b0c963f95f6bc58  -

I will run it through the code as well, but the shell command should work.

Thanks,
Prakhar Srivastava

```* Re: Re:
2019-05-24 15:47       ` Re: Roberto Sassu
@ 2019-05-24 18:09         ` Mimi Zohar
2019-05-24 19:00           ` Re: prakhar srivastava
From: Mimi Zohar @ 2019-05-24 18:09 UTC (permalink / raw)
To: Roberto Sassu, Prakhar Srivastava, linux-integrity,
linux-security-module, linux-kernel
Cc: mjg59, vgoyal

> >> As mentioned, the first patch description should include a shell
> >> command for verifying the digest in the kexec boot command line
> >> measurement list record against /proc/cmdline.  This patch description
> >> should include a shell command showing how to verify the digest based
> >> on the new field.  Should the new field in the ascii measurement list
> >> be displayed as a string, not hex?
> >
> > We should define a new type. If the type is DATA_FMT_STRING, spaces are
> > replaced with '_'.
>
> Or better. Leave it as hex, otherwise there would be a parsing problem
> if there are spaces in the data for a field.

After making a few changes, the measurement list contains the
following kexec-cmdline data:

10 edc32d1e3a5ba7272280a395b6fb56a5ef7c78c3 ima-buf
sha256:4f43b7db850e
88c49dfeffd4b1eb4f021d78033dfb05b07e45eec8d0b45275
kexec-cmdline
726f6f
743d2f6465762f7364613420726f2072642e6c756b732e757569643d6c756b73
2d6637
3633643737632d653236622d343431642d613734652d62363633636334643832
656120
696d615f706f6c6963793d7463627c61707072616973655f746362

There's probably a better shell command, but the following works to
verify the digest locally against the /proc/cmdline:

\$ echo -n -e `cat /proc/cmdline | sed 's/^.*root=/root=/'` | sha256sum
4f43b7db850e88c49dfeffd4b1eb4f021d78033dfb05b07e45eec8d0b4527f65  -

If we leave the "buf" field as ascii-hex, what would the shell command
look like when verifying the digest based on the "buf" field?

Mimi

```* Re:
2019-05-24 15:42     ` Roberto Sassu
@ 2019-05-24 15:47       ` Roberto Sassu
2019-05-24 18:09         ` Re: Mimi Zohar
From: Roberto Sassu @ 2019-05-24 15:47 UTC (permalink / raw)
To: Mimi Zohar, Prakhar Srivastava, linux-integrity,
linux-security-module, linux-kernel
Cc: mjg59, vgoyal

On 5/24/2019 5:42 PM, Roberto Sassu wrote:
> On 5/24/2019 5:12 PM, Mimi Zohar wrote:
>> On Mon, 2019-05-20 at 17:06 -0700, Prakhar Srivastava wrote:
>>> A buffer(cmdline args) measured into ima cannot be appraised
>>> without already being aware of the buffer contents.Since we
>>> don't know what cmdline args will be passed (or need to validate
>>> what was passed) it is not possible to appraise it.
>>>
>>> Since hashs are non reversible the raw buffer is needed to
>>> recompute the hash.
>>> To regenrate the hash of the buffer and appraise the same
>>> the contents of the buffer need to be available.
>>>
>>> A new template field buf is added to the existing ima template
>>> fields, which can be used to store/read the buffer itself.
>>> Two new fields are added to the ima_event_data to carry the
>>> buf and buf_len whenever necessary.
>>>
>>> Updated the process_buffer_measurement call to add the buf
>>> to the ima_event_data.
>>> ima_kexec_cmdline to measure cmdline args"
>>>
>>> - Add a new template field 'buf' to be used to store/read
>>> the buffer data.
>>> - Added two new fields to ima_event_data to hold the buf and
>>> buf_len [Suggested by Roberto]
>>> -Updated process_buffer_meaurement to add the buffer to
>>> ima_event_data
>>
>> This patch description can be written more concisely.
>>
>> Patch 1/3 in this series introduces measuring the kexec boot command
>> line.  This patch defines a new template field for storing the kexec
>> boot command line in the measurement list in order for a remote
>> attestation server to verify.
>>
>> As mentioned, the first patch description should include a shell
>> command for verifying the digest in the kexec boot command line
>> measurement list record against /proc/cmdline.  This patch description
>> should include a shell command showing how to verify the digest based
>> on the new field.  Should the new field in the ascii measurement list
>> be displayed as a string, not hex?
>
> We should define a new type. If the type is DATA_FMT_STRING, spaces are
> replaced with '_'.

Or better. Leave it as hex, otherwise there would be a parsing problem
if there are spaces in the data for a field.

Roberto

--
HUAWEI TECHNOLOGIES Duesseldorf GmbH, HRB 56063
Managing Director: Bo PENG, Jian LI, Yanli SHI

```* Re:
2019-05-24 15:12   ` Mimi Zohar
@ 2019-05-24 15:42     ` Roberto Sassu
2019-05-24 15:47       ` Re: Roberto Sassu
From: Roberto Sassu @ 2019-05-24 15:42 UTC (permalink / raw)
To: Mimi Zohar, Prakhar Srivastava, linux-integrity,
linux-security-module, linux-kernel
Cc: mjg59, vgoyal

On 5/24/2019 5:12 PM, Mimi Zohar wrote:
> On Mon, 2019-05-20 at 17:06 -0700, Prakhar Srivastava wrote:
>> A buffer(cmdline args) measured into ima cannot be appraised
>> without already being aware of the buffer contents.Since we
>> don't know what cmdline args will be passed (or need to validate
>> what was passed) it is not possible to appraise it.
>>
>> Since hashs are non reversible the raw buffer is needed to
>> recompute the hash.
>> To regenrate the hash of the buffer and appraise the same
>> the contents of the buffer need to be available.
>>
>> A new template field buf is added to the existing ima template
>> fields, which can be used to store/read the buffer itself.
>> Two new fields are added to the ima_event_data to carry the
>> buf and buf_len whenever necessary.
>>
>> Updated the process_buffer_measurement call to add the buf
>> to the ima_event_data.
>> ima_kexec_cmdline to measure cmdline args"
>>
>> - Add a new template field 'buf' to be used to store/read
>> the buffer data.
>> - Added two new fields to ima_event_data to hold the buf and
>> buf_len [Suggested by Roberto]
>> -Updated process_buffer_meaurement to add the buffer to
>> ima_event_data
>
> This patch description can be written more concisely.
>
> Patch 1/3 in this series introduces measuring the kexec boot command
> line.  This patch defines a new template field for storing the kexec
> boot command line in the measurement list in order for a remote
> attestation server to verify.
>
> As mentioned, the first patch description should include a shell
> command for verifying the digest in the kexec boot command line
> measurement list record against /proc/cmdline.  This patch description
> should include a shell command showing how to verify the digest based
> on the new field.  Should the new field in the ascii measurement list
> be displayed as a string, not hex?

We should define a new type. If the type is DATA_FMT_STRING, spaces are
replaced with '_'.

Roberto

--
HUAWEI TECHNOLOGIES Duesseldorf GmbH, HRB 56063
Managing Director: Bo PENG, Jian LI, Yanli SHI

```* RE,
@ 2019-04-12 23:06 Sharifah Ahmad Mustahfa
0 siblings, 0 replies; 400+ messages in thread
To: linux-kernel

As-Salamu Alaykum, I need your help to transfer out the some amount of money, accumulated as undeclared excess profit made by me from the INVESTMENTS CAPITAL under my management in my bank. You will get 40% of the funds for your participation. Reply for more details thanks

```* Re:
2019-03-22 15:30               ` Re: Keith Busch
@ 2019-03-25 15:44                 ` Felipe Franciosi
0 siblings, 0 replies; 400+ messages in thread
From: Felipe Franciosi @ 2019-03-25 15:44 UTC (permalink / raw)
To: Keith Busch
Cc: Maxim Levitsky, Stefan Hajnoczi, Fam Zheng, kvm, Wolfram Sang,
linux-nvme, linux-kernel, Keith Busch, Kirti Wankhede,
Mauro Carvalho Chehab, Paul E . McKenney, Christoph Hellwig,
Sagi Grimberg, Harris, James R, Liang Cunming, Jens Axboe,
Alex Williamson, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

Hi Keith,

> On Mar 22, 2019, at 3:30 PM, Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, Mar 22, 2019 at 07:54:50AM +0000, Felipe Franciosi wrote:
>>>
>>> Note though that SPDK doesn't support sharing the device between host and the
>>> guests, it takes over the nvme device, thus it makes the kernel nvme driver
>>> unbind from it.
>>
>> That is absolutely true. However, I find it not to be a problem in practice.
>>
>> Hypervisor products, specially those caring about performance, efficiency and fairness, will dedicate NVMe devices for a particular purpose (eg. vDisk storage, cache, metadata) and will not share these devices for other use cases. That's because these products want to deterministically control the performance aspects of the device, which you just cannot do if you are sharing the device with a subsystem you do not control.
>
> I don't know, it sounds like you've traded kernel syscalls for IPC,
> and I don't think one performs better than the other.

Sorry, I'm not sure I understand. My point is that if you are packaging a distro to be a hypervisor and you want to use a storage device for VM data, you _most likely_ won't be using that device for anything else. To that end, driving the device directly from your application definitely gives you more deterministic control.

>
>> For scenarios where the device must be shared and such fine grained control is not required, it looks like using the kernel driver with io_uring offers very good performance with flexibility.
>
> NVMe's IO Determinism features provide fine grained control for shared
> devices. It's still uncommon to find hardware supporting that, though.

Sure, but then your hypervisor needs to certify devices that support that. This will limit your HCL. Moreover, unless the feature is solid, well-established and works reliably on all devices you support, it's arguably preferable to have an architecture which gives you that control in software.

Cheers,
Felipe

```* Re:
2019-03-22  7:54             ` Re: Felipe Franciosi
2019-03-22 10:32               ` Re: Maxim Levitsky
@ 2019-03-22 15:30               ` Keith Busch
2019-03-25 15:44                 ` Re: Felipe Franciosi
From: Keith Busch @ 2019-03-22 15:30 UTC (permalink / raw)
To: Felipe Franciosi
Cc: Maxim Levitsky, Stefan Hajnoczi, Fam Zheng, kvm, Wolfram Sang,
linux-nvme, linux-kernel, Keith Busch, Kirti Wankhede,
Mauro Carvalho Chehab, Paul E . McKenney, Christoph Hellwig,
Sagi Grimberg, Harris, James R, Liang Cunming, Jens Axboe,
Alex Williamson, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

On Fri, Mar 22, 2019 at 07:54:50AM +0000, Felipe Franciosi wrote:
> >
> > Note though that SPDK doesn't support sharing the device between host and the
> > guests, it takes over the nvme device, thus it makes the kernel nvme driver
> > unbind from it.
>
> That is absolutely true. However, I find it not to be a problem in practice.
>
> Hypervisor products, specially those caring about performance, efficiency and fairness, will dedicate NVMe devices for a particular purpose (eg. vDisk storage, cache, metadata) and will not share these devices for other use cases. That's because these products want to deterministically control the performance aspects of the device, which you just cannot do if you are sharing the device with a subsystem you do not control.

I don't know, it sounds like you've traded kernel syscalls for IPC,
and I don't think one performs better than the other.

> For scenarios where the device must be shared and such fine grained control is not required, it looks like using the kernel driver with io_uring offers very good performance with flexibility.

NVMe's IO Determinism features provide fine grained control for shared
devices. It's still uncommon to find hardware supporting that, though.

```* Re:
2019-03-22  7:54             ` Re: Felipe Franciosi
@ 2019-03-22 10:32               ` Maxim Levitsky
2019-03-22 15:30               ` Re: Keith Busch
1 sibling, 0 replies; 400+ messages in thread
From: Maxim Levitsky @ 2019-03-22 10:32 UTC (permalink / raw)
To: Felipe Franciosi
Cc: Keith Busch, Stefan Hajnoczi, Fam Zheng, kvm, Wolfram Sang,
linux-nvme, linux-kernel, Keith Busch, Kirti Wankhede,
Mauro Carvalho Chehab, Paul E . McKenney, Christoph Hellwig,
Sagi Grimberg, Harris, James R, Liang Cunming, Jens Axboe,
Alex Williamson, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

On Fri, 2019-03-22 at 07:54 +0000, Felipe Franciosi wrote:
> > On Mar 21, 2019, at 5:04 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> >
> > On Thu, 2019-03-21 at 16:41 +0000, Felipe Franciosi wrote:
> > > > On Mar 21, 2019, at 4:21 PM, Keith Busch <kbusch@kernel.org> wrote:
> > > >
> > > > On Thu, Mar 21, 2019 at 04:12:39PM +0000, Stefan Hajnoczi wrote:
> > > > > mdev-nvme seems like a duplication of SPDK.  The performance is not
> > > > > better and the features are more limited, so why focus on this
> > > > > approach?
> > > > >
> > > > > One argument might be that the kernel NVMe subsystem wants to offer
> > > > > this
> > > > > functionality and loading the kernel module is more convenient than
> > > > > managing SPDK to some users.
> > > > >
> > > > > Thoughts?
> > > >
> > > > Doesn't SPDK bind a controller to a single process? mdev binds to
> > > > namespaces (or their partitions), so you could have many mdev's assigned
> > > > to many VMs accessing a single controller.
> > >
> > > Yes, it binds to a single process which can drive the datapath of multiple
> > > virtual controllers for multiple VMs (similar to what you described for
> > > mdev).
> > > You can therefore efficiently poll multiple VM submission queues (and
> > > multiple
> > > device completion queues) from a single physical CPU.
> > >
> > > The same could be done in the kernel, but the code gets complicated as you
> > > more functionality to it. As this is a direct interface with an untrusted
> > > front-end (the guest), it's also arguably safer to do in userspace.
> > >
> > > Worth noting: you can eventually have a single physical core polling all
> > > sorts
> > > of virtual devices (eg. virtual storage or network controllers) very
> > > efficiently. And this is quite configurable, too. In the interest of
> > > fairness,
> > > performance or efficiency, you can choose to dynamically add or remove
> > > queues
> > > to the poll thread or spawn more threads and redistribute the work.
> > >
> > > F.
> >
> > Note though that SPDK doesn't support sharing the device between host and
> > the
> > guests, it takes over the nvme device, thus it makes the kernel nvme driver
> > unbind from it.
>
> That is absolutely true. However, I find it not to be a problem in practice.
>
> Hypervisor products, specially those caring about performance, efficiency and
> fairness, will dedicate NVMe devices for a particular purpose (eg. vDisk
> storage, cache, metadata) and will not share these devices for other use
> cases. That's because these products want to deterministically control the
> performance aspects of the device, which you just cannot do if you are sharing
> the device with a subsystem you do not control.
>
> For scenarios where the device must be shared and such fine grained control is
> not required, it looks like using the kernel driver with io_uring offers very
> good performance with flexibility

I see the host/guest parition in the following way:
The guest assigned partitions are for guests that need lowest possible latency,
and in between these guests it is possible to guarantee good enough level of
fairness in my driver.
For example, in the current implementation of my driver, each guest gets its own
host submission queue.

On the other hand, the host assigned partitions are for significantly higher
latency IO, with no guarantees, and/or for guests that need all the more
advanced features of full IO virtualization, for instance snapshots, thin
provisioning, replication/backup over network, etc.
io_uring can be used here to speed things up but it won't reach the nvme-mdev
levels of latency.

Furthermore on NVME drives that support WRRU, its possible to let queues of
guest's assigned partitions to belong to the high priority class and let the
host queues use the regular medium/low priority class.
For drives that don't support WRRU, the IO throttling can be done in software on
the host queues.

Host assigned partitions also don't need polling, thus allowing polling to be
used only for guests that actually need low latency IO.
This reduces the number of cores that would be otherwise lost to polling,
because the less work the polling core does, the less latency it contributes to
overall latency, thus with less users, you could use less cores to achieve the
same levels of latency.

For Stefan's argument, we can look at it in a slightly different way too:
While the nvme-mdev can be seen as a duplication of SPDK, the SPDK can also be
seen as duplication of an existing kernel functionality which nvme-mdev can

Best regards,
Maxim Levitsky

```* Re:
2019-03-21 17:04           ` Re: Maxim Levitsky
@ 2019-03-22  7:54             ` Felipe Franciosi
2019-03-22 10:32               ` Re: Maxim Levitsky
2019-03-22 15:30               ` Re: Keith Busch
0 siblings, 2 replies; 400+ messages in thread
From: Felipe Franciosi @ 2019-03-22  7:54 UTC (permalink / raw)
To: Maxim Levitsky
Cc: Keith Busch, Stefan Hajnoczi, Fam Zheng, kvm, Wolfram Sang,
linux-nvme, linux-kernel, Keith Busch, Kirti Wankhede,
Mauro Carvalho Chehab, Paul E . McKenney, Christoph Hellwig,
Sagi Grimberg, Harris, James R, Liang Cunming, Jens Axboe,
Alex Williamson, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

> On Mar 21, 2019, at 5:04 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> On Thu, 2019-03-21 at 16:41 +0000, Felipe Franciosi wrote:
>>> On Mar 21, 2019, at 4:21 PM, Keith Busch <kbusch@kernel.org> wrote:
>>>
>>> On Thu, Mar 21, 2019 at 04:12:39PM +0000, Stefan Hajnoczi wrote:
>>>> mdev-nvme seems like a duplication of SPDK.  The performance is not
>>>> better and the features are more limited, so why focus on this approach?
>>>>
>>>> One argument might be that the kernel NVMe subsystem wants to offer this
>>>> managing SPDK to some users.
>>>>
>>>> Thoughts?
>>>
>>> Doesn't SPDK bind a controller to a single process? mdev binds to
>>> namespaces (or their partitions), so you could have many mdev's assigned
>>> to many VMs accessing a single controller.
>>
>> Yes, it binds to a single process which can drive the datapath of multiple
>> virtual controllers for multiple VMs (similar to what you described for mdev).
>> You can therefore efficiently poll multiple VM submission queues (and multiple
>> device completion queues) from a single physical CPU.
>>
>> The same could be done in the kernel, but the code gets complicated as you add
>> more functionality to it. As this is a direct interface with an untrusted
>> front-end (the guest), it's also arguably safer to do in userspace.
>>
>> Worth noting: you can eventually have a single physical core polling all sorts
>> of virtual devices (eg. virtual storage or network controllers) very
>> efficiently. And this is quite configurable, too. In the interest of fairness,
>> performance or efficiency, you can choose to dynamically add or remove queues
>> to the poll thread or spawn more threads and redistribute the work.
>>
>> F.
>
> Note though that SPDK doesn't support sharing the device between host and the
> guests, it takes over the nvme device, thus it makes the kernel nvme driver
> unbind from it.

That is absolutely true. However, I find it not to be a problem in practice.

Hypervisor products, specially those caring about performance, efficiency and fairness, will dedicate NVMe devices for a particular purpose (eg. vDisk storage, cache, metadata) and will not share these devices for other use cases. That's because these products want to deterministically control the performance aspects of the device, which you just cannot do if you are sharing the device with a subsystem you do not control.

For scenarios where the device must be shared and such fine grained control is not required, it looks like using the kernel driver with io_uring offers very good performance with flexibility.

Cheers,
Felipe

```* Re:
2019-03-21 16:41         ` Re: Felipe Franciosi
@ 2019-03-21 17:04           ` Maxim Levitsky
2019-03-22  7:54             ` Re: Felipe Franciosi
From: Maxim Levitsky @ 2019-03-21 17:04 UTC (permalink / raw)
To: Felipe Franciosi, Keith Busch
Cc: Stefan Hajnoczi, Fam Zheng, kvm, Wolfram Sang, linux-nvme,
linux-kernel, Keith Busch, Kirti Wankhede, Mauro Carvalho Chehab,
Paul E . McKenney, Christoph Hellwig, Sagi Grimberg, Harris,
James R, Liang Cunming, Jens Axboe, Alex Williamson,
Thanos Makatos, John Ferlan, Liu Changpeng, Greg Kroah-Hartman,
Nicolas Ferre, Paolo Bonzini, Amnon Ilan, David S . Miller

On Thu, 2019-03-21 at 16:41 +0000, Felipe Franciosi wrote:
> > On Mar 21, 2019, at 4:21 PM, Keith Busch <kbusch@kernel.org> wrote:
> >
> > On Thu, Mar 21, 2019 at 04:12:39PM +0000, Stefan Hajnoczi wrote:
> > > mdev-nvme seems like a duplication of SPDK.  The performance is not
> > > better and the features are more limited, so why focus on this approach?
> > >
> > > One argument might be that the kernel NVMe subsystem wants to offer this
> > > functionality and loading the kernel module is more convenient than
> > > managing SPDK to some users.
> > >
> > > Thoughts?
> >
> > Doesn't SPDK bind a controller to a single process? mdev binds to
> > namespaces (or their partitions), so you could have many mdev's assigned
> > to many VMs accessing a single controller.
>
> Yes, it binds to a single process which can drive the datapath of multiple
> virtual controllers for multiple VMs (similar to what you described for mdev).
> You can therefore efficiently poll multiple VM submission queues (and multiple
> device completion queues) from a single physical CPU.
>
> The same could be done in the kernel, but the code gets complicated as you add
> more functionality to it. As this is a direct interface with an untrusted
> front-end (the guest), it's also arguably safer to do in userspace.
>
> Worth noting: you can eventually have a single physical core polling all sorts
> of virtual devices (eg. virtual storage or network controllers) very
> efficiently. And this is quite configurable, too. In the interest of fairness,
> performance or efficiency, you can choose to dynamically add or remove queues
> to the poll thread or spawn more threads and redistribute the work.
>
> F.

Note though that SPDK doesn't support sharing the device between host and the
guests, it takes over the nvme device, thus it makes the kernel nvme driver
unbind from it.

My driver creates a polling thread per guest, but its trivial to add option to
use the same polling thread for many guests if there need for that.

Best regards,
Maxim Levitsky

```* Re:
2019-03-21 16:21       ` Re: Keith Busch
@ 2019-03-21 16:41         ` Felipe Franciosi
2019-03-21 17:04           ` Re: Maxim Levitsky
From: Felipe Franciosi @ 2019-03-21 16:41 UTC (permalink / raw)
To: Keith Busch
Cc: Stefan Hajnoczi, Maxim Levitsky, Fam Zheng, kvm, Wolfram Sang,
linux-nvme, linux-kernel, Keith Busch, Kirti Wankhede,
Mauro Carvalho Chehab, Paul E . McKenney, Christoph Hellwig,
Sagi Grimberg, Harris, James R, Liang Cunming, Jens Axboe,
Alex Williamson, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

> On Mar 21, 2019, at 4:21 PM, Keith Busch <kbusch@kernel.org> wrote:
>
> On Thu, Mar 21, 2019 at 04:12:39PM +0000, Stefan Hajnoczi wrote:
>> mdev-nvme seems like a duplication of SPDK.  The performance is not
>> better and the features are more limited, so why focus on this approach?
>>
>> One argument might be that the kernel NVMe subsystem wants to offer this
>> managing SPDK to some users.
>>
>> Thoughts?
>
> Doesn't SPDK bind a controller to a single process? mdev binds to
> namespaces (or their partitions), so you could have many mdev's assigned
> to many VMs accessing a single controller.

Yes, it binds to a single process which can drive the datapath of multiple virtual controllers for multiple VMs (similar to what you described for mdev). You can therefore efficiently poll multiple VM submission queues (and multiple device completion queues) from a single physical CPU.

The same could be done in the kernel, but the code gets complicated as you add more functionality to it. As this is a direct interface with an untrusted front-end (the guest), it's also arguably safer to do in userspace.

Worth noting: you can eventually have a single physical core polling all sorts of virtual devices (eg. virtual storage or network controllers) very efficiently. And this is quite configurable, too. In the interest of fairness, performance or efficiency, you can choose to dynamically add or remove queues to the poll thread or spawn more threads and redistribute the work.

F.

```* Re:
2019-03-21 16:12     ` Re: Stefan Hajnoczi
@ 2019-03-21 16:21       ` Keith Busch
2019-03-21 16:41         ` Re: Felipe Franciosi
From: Keith Busch @ 2019-03-21 16:21 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Maxim Levitsky, Fam Zheng, kvm, Wolfram Sang, linux-nvme,
linux-kernel, Keith Busch, Kirti Wankhede, Mauro Carvalho Chehab,
Paul E . McKenney, Christoph Hellwig, Sagi Grimberg, Harris,
James R, Felipe Franciosi, Liang Cunming, Jens Axboe,
Alex Williamson, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

On Thu, Mar 21, 2019 at 04:12:39PM +0000, Stefan Hajnoczi wrote:
> mdev-nvme seems like a duplication of SPDK.  The performance is not
> better and the features are more limited, so why focus on this approach?
>
> One argument might be that the kernel NVMe subsystem wants to offer this
> managing SPDK to some users.
>
> Thoughts?

Doesn't SPDK bind a controller to a single process? mdev binds to
namespaces (or their partitions), so you could have many mdev's assigned
to many VMs accessing a single controller.

```* Re:
2019-03-20 19:08   ` Re: Maxim Levitsky
@ 2019-03-21 16:12     ` Stefan Hajnoczi
2019-03-21 16:21       ` Re: Keith Busch
From: Stefan Hajnoczi @ 2019-03-21 16:12 UTC (permalink / raw)
To: Maxim Levitsky
Cc: Felipe Franciosi, Fam Zheng, kvm, Wolfram Sang, linux-nvme,
linux-kernel, Keith Busch, Kirti Wankhede, Mauro Carvalho Chehab,
Paul E . McKenney, Christoph Hellwig, Sagi Grimberg, Harris,
James R, Liang Cunming, Jens Axboe, Alex Williamson,
Thanos Makatos, John Ferlan, Liu Changpeng, Greg Kroah-Hartman,
Nicolas Ferre, Paolo Bonzini, Amnon Ilan, David S . Miller

[-- Attachment #1: Type: text/plain, Size: 4404 bytes --]

On Wed, Mar 20, 2019 at 09:08:37PM +0200, Maxim Levitsky wrote:
> On Wed, 2019-03-20 at 11:03 +0000, Felipe Franciosi wrote:
> > > On Mar 19, 2019, at 2:41 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> > >
> > > Date: Tue, 19 Mar 2019 14:45:45 +0200
> > > Subject: [PATCH 0/9] RFC: NVME VFIO mediated device
> > >
> > > Hi everyone!
> > >
> > > In this patch series, I would like to introduce my take on the problem of
> > > doing
> > > as fast as possible virtualization of storage with emphasis on low latency.
> > >
> > > In this patch series I implemented a kernel vfio based, mediated device
> > > that
> > > allows the user to pass through a partition and/or whole namespace to a
> > > guest.
> >
> > Hey Maxim!
> >
> > I'm really excited to see this series, as it aligns to some extent with what
> > we discussed in last year's KVM Forum VFIO BoF.
> >
> > There's no arguing that we need a better story to efficiently virtualise NVMe
> > devices. So far, for Qemu-based VMs, Changpeng's vhost-user-nvme is the best
> > attempt at that. However, I seem to recall there was some pushback from qemu-
> > devel in the sense that they would rather see investment in virtio-blk. I'm
> > not sure what's the latest on that work and what are the next steps.
> I agree with that. All my benchmarks were agains his vhost-user-nvme driver, and
> I am able to get pretty much the same througput and latency.
>
> The ssd I tested on died just recently (Murphy law), not due to bug in my driver
> but some internal fault (even though most of my tests were reads, plus
> occassional 'nvme format's.
> We are in process of buying an replacement.
>
> >
> > The pushback drove the discussion towards pursuing an mdev approach, which is
> > why I'm excited to see your patches.
> >
> > What I'm thinking is that passing through namespaces or partitions is very
> > restrictive. It leaves no room to implement more elaborate virtualisation
> > stacks like replicating data across multiple devices (local or remote),
> > storage migration, software-managed thin provisioning, encryption,
> > deduplication, compression, etc. In summary, anything that requires software
> > intervention in the datapath. (Worth noting: vhost-user-nvme allows all of
> > that to be easily done in SPDK's bdev layer.)
>
> Hi Felipe!
>
> I guess that my driver is not geared toward more complicated use cases like you
> mentioned, but instead it is focused to get as fast as possible performance for
> the common case.
>
> One thing that I can do which would solve several of the above problems is to
> accept an map betwent virtual and real logical blocks, pretty much in exactly
> the same way as EPT does it.
> Then userspace can map any portions of the device anywhere, while still keeping
> the dataplane in the kernel, and having minimal overhead.
>
> On top of that, note that the direction of IO virtualization is to do dataplane
> in hardware, which will probably give you even worse partition granuality /
> features but will be the fastest option aviable,
> like for instance SR-IOV which alrady exists and just allows to split by
> namespaces without any more fine grained control.
>
> Think of nvme-mdev as a very low level driver, which currntly uses polling, but
> eventually will use PASID based IOMMU to provide the guest with raw PCI device.
> The userspace / qemu can build on top of that with varios software layers.
>
> On top of that I am thinking to solve the problem of migration in Qemu, by
> creating a 'vfio-nvme' driver which would bind vfio to bind to device exposed by
> the kernel, and would pass through all the doorbells and queues to the guest,
> while intercepting the admin queue. Such driver I think can be made to support
> migration while beeing able to run on top both SR-IOV device, my vfio-nvme abit
> with double admin queue emulation (its a bit ugly but won't affect performance
> at all) and on top of even regular NVME device vfio assigned to guest.

mdev-nvme seems like a duplication of SPDK.  The performance is not
better and the features are more limited, so why focus on this approach?

One argument might be that the kernel NVMe subsystem wants to offer this
managing SPDK to some users.

Thoughts?

Stefan

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

```* Re:
2019-03-20 11:03 ` Felipe Franciosi
@ 2019-03-20 19:08   ` Maxim Levitsky
2019-03-21 16:12     ` Re: Stefan Hajnoczi
From: Maxim Levitsky @ 2019-03-20 19:08 UTC (permalink / raw)
To: Felipe Franciosi
Cc: Fam Zheng, kvm, Wolfram Sang, linux-nvme, linux-kernel,
Keith Busch, Kirti Wankhede, Mauro Carvalho Chehab,
Paul E . McKenney, Christoph Hellwig, Sagi Grimberg, Harris,
James R, Liang Cunming, Jens Axboe, Alex Williamson,
Stefan Hajnoczi, Thanos Makatos, John Ferlan, Liu Changpeng,
Greg Kroah-Hartman, Nicolas Ferre, Paolo Bonzini, Amnon Ilan,
David S . Miller

On Wed, 2019-03-20 at 11:03 +0000, Felipe Franciosi wrote:
> > On Mar 19, 2019, at 2:41 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
> >
> > Date: Tue, 19 Mar 2019 14:45:45 +0200
> > Subject: [PATCH 0/9] RFC: NVME VFIO mediated device
> >
> > Hi everyone!
> >
> > In this patch series, I would like to introduce my take on the problem of
> > doing
> > as fast as possible virtualization of storage with emphasis on low latency.
> >
> > In this patch series I implemented a kernel vfio based, mediated device
> > that
> > allows the user to pass through a partition and/or whole namespace to a
> > guest.
>
> Hey Maxim!
>
> I'm really excited to see this series, as it aligns to some extent with what
> we discussed in last year's KVM Forum VFIO BoF.
>
> There's no arguing that we need a better story to efficiently virtualise NVMe
> devices. So far, for Qemu-based VMs, Changpeng's vhost-user-nvme is the best
> attempt at that. However, I seem to recall there was some pushback from qemu-
> devel in the sense that they would rather see investment in virtio-blk. I'm
> not sure what's the latest on that work and what are the next steps.
I agree with that. All my benchmarks were agains his vhost-user-nvme driver, and
I am able to get pretty much the same througput and latency.

The ssd I tested on died just recently (Murphy law), not due to bug in my driver
but some internal fault (even though most of my tests were reads, plus
occassional 'nvme format's.
We are in process of buying an replacement.

>
> The pushback drove the discussion towards pursuing an mdev approach, which is
> why I'm excited to see your patches.
>
> What I'm thinking is that passing through namespaces or partitions is very
> restrictive. It leaves no room to implement more elaborate virtualisation
> stacks like replicating data across multiple devices (local or remote),
> storage migration, software-managed thin provisioning, encryption,
> deduplication, compression, etc. In summary, anything that requires software
> intervention in the datapath. (Worth noting: vhost-user-nvme allows all of
> that to be easily done in SPDK's bdev layer.)

Hi Felipe!

I guess that my driver is not geared toward more complicated use cases like you
mentioned, but instead it is focused to get as fast as possible performance for
the common case.

One thing that I can do which would solve several of the above problems is to
accept an map betwent virtual and real logical blocks, pretty much in exactly
the same way as EPT does it.
Then userspace can map any portions of the device anywhere, while still keeping
the dataplane in the kernel, and having minimal overhead.

On top of that, note that the direction of IO virtualization is to do dataplane
in hardware, which will probably give you even worse partition granuality /
features but will be the fastest option aviable,
like for instance SR-IOV which alrady exists and just allows to split by
namespaces without any more fine grained control.

Think of nvme-mdev as a very low level driver, which currntly uses polling, but
eventually will use PASID based IOMMU to provide the guest with raw PCI device.
The userspace / qemu can build on top of that with varios software layers.

On top of that I am thinking to solve the problem of migration in Qemu, by
creating a 'vfio-nvme' driver which would bind vfio to bind to device exposed by
the kernel, and would pass through all the doorbells and queues to the guest,
while intercepting the admin queue. Such driver I think can be made to support
migration while beeing able to run on top both SR-IOV device, my vfio-nvme abit
with double admin queue emulation (its a bit ugly but won't affect performance
at all) and on top of even regular NVME device vfio assigned to guest.

Best regards,
Maxim Levitsky

>
> These complicated stacks should probably not be implemented in the kernel,
> though. So I'm wondering whether we could talk about mechanisms to allow
> efficient and performant userspace datapath intervention  in your approach or
> pursue a mechanism to completely offload the device emulation to userspace
> (and align with what SPDK has to offer).
>
> Thoughts welcome!
> Felipe
> _______________________________________________
> Linux-nvme mailing list

```* Re:
@ 2019-03-20 11:03 ` Felipe Franciosi
2019-03-20 19:08   ` Re: Maxim Levitsky
From: Felipe Franciosi @ 2019-03-20 11:03 UTC (permalink / raw)
To: Maxim Levitsky
Cc: linux-nvme, linux-kernel, kvm, Jens Axboe, Alex Williamson,
Keith Busch, Christoph Hellwig, Sagi Grimberg, Kirti Wankhede,
David S . Miller, Mauro Carvalho Chehab, Greg Kroah-Hartman,
Wolfram Sang, Nicolas Ferre, Paul E . McKenney, Paolo Bonzini,
Liang Cunming, Liu Changpeng, Fam Zheng, Amnon Ilan, John Ferlan,
Stefan Hajnoczi, Harris, James R, Thanos Makatos

> On Mar 19, 2019, at 2:41 PM, Maxim Levitsky <mlevitsk@redhat.com> wrote:
>
> Date: Tue, 19 Mar 2019 14:45:45 +0200
> Subject: [PATCH 0/9] RFC: NVME VFIO mediated device
>
> Hi everyone!
>
> In this patch series, I would like to introduce my take on the problem of doing
> as fast as possible virtualization of storage with emphasis on low latency.
>
> In this patch series I implemented a kernel vfio based, mediated device that
> allows the user to pass through a partition and/or whole namespace to a guest.

Hey Maxim!

I'm really excited to see this series, as it aligns to some extent with what we discussed in last year's KVM Forum VFIO BoF.

There's no arguing that we need a better story to efficiently virtualise NVMe devices. So far, for Qemu-based VMs, Changpeng's vhost-user-nvme is the best attempt at that. However, I seem to recall there was some pushback from qemu-devel in the sense that they would rather see investment in virtio-blk. I'm not sure what's the latest on that work and what are the next steps.

The pushback drove the discussion towards pursuing an mdev approach, which is why I'm excited to see your patches.

What I'm thinking is that passing through namespaces or partitions is very restrictive. It leaves no room to implement more elaborate virtualisation stacks like replicating data across multiple devices (local or remote), storage migration, software-managed thin provisioning, encryption, deduplication, compression, etc. In summary, anything that requires software intervention in the datapath. (Worth noting: vhost-user-nvme allows all of that to be easily done in SPDK's bdev layer.)

These complicated stacks should probably not be implemented in the kernel, though. So I'm wondering whether we could talk about mechanisms to allow efficient and performant userspace datapath intervention  in your approach or pursue a mechanism to completely offload the device emulation to userspace (and align with what SPDK has to offer).

Thoughts welcome!
Felipe

```* RE,
@ 2019-03-13 23:49 LUIS EDUARDO CEPEDA CABRERA
0 siblings, 0 replies; 400+ messages in thread
From: LUIS EDUARDO CEPEDA CABRERA @ 2019-03-13 23:49 UTC (permalink / raw)
To: linux-kernel

Hello,

i have a deal for you, can we work together ?

```* Re:
2019-02-01 15:15               ` Re: Russell King - ARM Linux admin
@ 2019-02-01 15:22                 ` Russell King - ARM Linux admin
0 siblings, 0 replies; 400+ messages in thread
From: Russell King - ARM Linux admin @ 2019-02-01 15:22 UTC (permalink / raw)
To: Souptick Joarder
Cc: Vladimir Murzin, Sabyasachi Gupta, Michal Hocko, linux-kernel,
Mike Rapoport, rppt, Brajeswar Ghosh, Andrew Morton,
linux-arm-kernel

On Fri, Feb 01, 2019 at 03:15:11PM +0000, Russell King - ARM Linux admin wrote:
> On Fri, Feb 01, 2019 at 06:11:21PM +0530, Souptick Joarder wrote:
> > On Fri, Feb 1, 2019 at 6:06 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
> > >
> > > On 2/1/19 12:32 PM, Souptick Joarder wrote:
> > > > On Thu, Jan 31, 2019 at 6:28 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
> > > >>
> > > >> Hi Souptick,
> > > >>
> > > >> On 1/31/19 5:54 AM, Souptick Joarder wrote:
> > > >>> On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
> > > >>>>
> > > >>>> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
> > > >>>>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
> > > >>>>>>
> > > >>>>>> Remove duplicate headers which are included twice.
> > > >>>>>>
> > > >>>>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
> > > >>>>
> > > >>>> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
> > > >>>>
> > > >>>>> Any comment on this patch ?
> > > >>>
> > > >>> If no further comment, can we get this patch in queue for 5.1 ?
> > > >>
> > > >> I'd be nice to use proper tags in subject
> > > >> line. I'd suggest
> > > >>
> > > >> [PATCH] ARM: mm: Remove duplicate header
> > > >>
> > > >> but you can get some inspiration form
> > > >>
> > > >> git log --oneline --no-merges arch/arm/mm/
> > > >>
> > > >> In case you want to route it via ARM tree you need to drop it into
> > > >> Russell's patch system [1].
> > > >
> > > > How to drop it to Russell's patch system other than posting it to
> > > > mailing list ? I don't know.
> > >
> > > https://www.armlinux.org.uk/developer/patches/info.php
> >
> > This link is not reachable.
>
> In what way?  The site is certainly getting hits over ipv4 and ipv6.

Ah, I see - the site is accessible over IPv6 using port 80 only, but
port 443 is blocked.  Problem is, I can't test IPv6 from "outside",
so I rely on people *reporting* when things stop working.

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up

```* Re:
2019-02-01 12:41             ` Re: Souptick Joarder
2019-02-01 13:02               ` Re: Vladimir Murzin
@ 2019-02-01 15:15               ` Russell King - ARM Linux admin
2019-02-01 15:22                 ` Re: Russell King - ARM Linux admin
From: Russell King - ARM Linux admin @ 2019-02-01 15:15 UTC (permalink / raw)
To: Souptick Joarder
Cc: Vladimir Murzin, Mike Rapoport, Michal Hocko, Sabyasachi Gupta,
linux-kernel, rppt, Brajeswar Ghosh, Andrew Morton,
linux-arm-kernel

On Fri, Feb 01, 2019 at 06:11:21PM +0530, Souptick Joarder wrote:
> On Fri, Feb 1, 2019 at 6:06 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
> >
> > On 2/1/19 12:32 PM, Souptick Joarder wrote:
> > > On Thu, Jan 31, 2019 at 6:28 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
> > >>
> > >> Hi Souptick,
> > >>
> > >> On 1/31/19 5:54 AM, Souptick Joarder wrote:
> > >>> On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
> > >>>>
> > >>>> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
> > >>>>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
> > >>>>>>
> > >>>>>> Remove duplicate headers which are included twice.
> > >>>>>>
> > >>>>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
> > >>>>
> > >>>> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
> > >>>>
> > >>>>> Any comment on this patch ?
> > >>>
> > >>> If no further comment, can we get this patch in queue for 5.1 ?
> > >>
> > >> I'd be nice to use proper tags in subject
> > >> line. I'd suggest
> > >>
> > >> [PATCH] ARM: mm: Remove duplicate header
> > >>
> > >> but you can get some inspiration form
> > >>
> > >> git log --oneline --no-merges arch/arm/mm/
> > >>
> > >> In case you want to route it via ARM tree you need to drop it into
> > >> Russell's patch system [1].
> > >
> > > How to drop it to Russell's patch system other than posting it to
> > > mailing list ? I don't know.
> >
> > https://www.armlinux.org.uk/developer/patches/info.php
>
> This link is not reachable.

In what way?  The site is certainly getting hits over ipv4 and ipv6.

--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
According to speedtest.net: 11.9Mbps down 500kbps up

```* Re:
2019-02-01 12:41             ` Re: Souptick Joarder
@ 2019-02-01 13:02               ` Vladimir Murzin
2019-02-01 15:15               ` Re: Russell King - ARM Linux admin
1 sibling, 0 replies; 400+ messages in thread
To: Souptick Joarder
Cc: Mike Rapoport, Michal Hocko, Sabyasachi Gupta,
Russell King - ARM Linux, linux-kernel, rppt, Brajeswar Ghosh,
Andrew Morton, linux-arm-kernel

On 2/1/19 12:41 PM, Souptick Joarder wrote:
> On Fri, Feb 1, 2019 at 6:06 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
>>
>> On 2/1/19 12:32 PM, Souptick Joarder wrote:
>>> On Thu, Jan 31, 2019 at 6:28 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
>>>>
>>>> Hi Souptick,
>>>>
>>>> On 1/31/19 5:54 AM, Souptick Joarder wrote:
>>>>> On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
>>>>>>
>>>>>> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
>>>>>>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
>>>>>>>>
>>>>>>>> Remove duplicate headers which are included twice.
>>>>>>>>
>>>>>>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
>>>>>>
>>>>>> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
>>>>>>
>>>>>>> Any comment on this patch ?
>>>>>
>>>>> If no further comment, can we get this patch in queue for 5.1 ?
>>>>
>>>> I'd be nice to use proper tags in subject
>>>> line. I'd suggest
>>>>
>>>> [PATCH] ARM: mm: Remove duplicate header
>>>>
>>>> but you can get some inspiration form
>>>>
>>>> git log --oneline --no-merges arch/arm/mm/
>>>>
>>>> In case you want to route it via ARM tree you need to drop it into
>>>> Russell's patch system [1].
>>>
>>> How to drop it to Russell's patch system other than posting it to
>>> mailing list ? I don't know.
>>
>> https://www.armlinux.org.uk/developer/patches/info.php
>
> This link is not reachable.
>

>>
>>
>>>>
>>>> [1] https://www.armlinux.org.uk/developer/patches/
>>>>
>>>> Cheers
>>>
>>
>

```* Re:
2019-02-01 12:36           ` Re: Vladimir Murzin
@ 2019-02-01 12:41             ` Souptick Joarder
2019-02-01 13:02               ` Re: Vladimir Murzin
2019-02-01 15:15               ` Re: Russell King - ARM Linux admin
0 siblings, 2 replies; 400+ messages in thread
From: Souptick Joarder @ 2019-02-01 12:41 UTC (permalink / raw)
Cc: Mike Rapoport, Michal Hocko, Sabyasachi Gupta,
Russell King - ARM Linux, linux-kernel, rppt, Brajeswar Ghosh,
Andrew Morton, linux-arm-kernel

On Fri, Feb 1, 2019 at 6:06 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
>
> On 2/1/19 12:32 PM, Souptick Joarder wrote:
> > On Thu, Jan 31, 2019 at 6:28 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
> >>
> >> Hi Souptick,
> >>
> >> On 1/31/19 5:54 AM, Souptick Joarder wrote:
> >>> On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
> >>>>
> >>>> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
> >>>>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
> >>>>>>
> >>>>>> Remove duplicate headers which are included twice.
> >>>>>>
> >>>>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
> >>>>
> >>>> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
> >>>>
> >>>>> Any comment on this patch ?
> >>>
> >>> If no further comment, can we get this patch in queue for 5.1 ?
> >>
> >> I'd be nice to use proper tags in subject
> >> line. I'd suggest
> >>
> >> [PATCH] ARM: mm: Remove duplicate header
> >>
> >> but you can get some inspiration form
> >>
> >> git log --oneline --no-merges arch/arm/mm/
> >>
> >> In case you want to route it via ARM tree you need to drop it into
> >> Russell's patch system [1].
> >
> > How to drop it to Russell's patch system other than posting it to
> > mailing list ? I don't know.
>
> https://www.armlinux.org.uk/developer/patches/info.php

>
>
> >>
> >> [1] https://www.armlinux.org.uk/developer/patches/
> >>
> >> Cheers
> >
>

```* Re:
2019-02-01 12:32         ` Re: Souptick Joarder
@ 2019-02-01 12:36           ` Vladimir Murzin
2019-02-01 12:41             ` Re: Souptick Joarder
To: Souptick Joarder
Cc: Mike Rapoport, Michal Hocko, Sabyasachi Gupta,
Russell King - ARM Linux, linux-kernel, rppt, Brajeswar Ghosh,
Andrew Morton, linux-arm-kernel

On 2/1/19 12:32 PM, Souptick Joarder wrote:
> On Thu, Jan 31, 2019 at 6:28 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
>>
>> Hi Souptick,
>>
>> On 1/31/19 5:54 AM, Souptick Joarder wrote:
>>> On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
>>>>
>>>> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
>>>>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
>>>>>>
>>>>>> Remove duplicate headers which are included twice.
>>>>>>
>>>>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
>>>>
>>>> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
>>>>
>>>>> Any comment on this patch ?
>>>
>>> If no further comment, can we get this patch in queue for 5.1 ?
>>
>> I'd be nice to use proper tags in subject
>> line. I'd suggest
>>
>> [PATCH] ARM: mm: Remove duplicate header
>>
>> but you can get some inspiration form
>>
>> git log --oneline --no-merges arch/arm/mm/
>>
>> In case you want to route it via ARM tree you need to drop it into
>> Russell's patch system [1].
>
> How to drop it to Russell's patch system other than posting it to
> mailing list ? I don't know.

https://www.armlinux.org.uk/developer/patches/info.php

>>
>> [1] https://www.armlinux.org.uk/developer/patches/
>>
>> Cheers
>

```* Re:
@ 2019-02-01 12:32         ` Souptick Joarder
2019-02-01 12:36           ` Re: Vladimir Murzin
From: Souptick Joarder @ 2019-02-01 12:32 UTC (permalink / raw)
Cc: Mike Rapoport, Michal Hocko, Sabyasachi Gupta,
Russell King - ARM Linux, linux-kernel, rppt, Brajeswar Ghosh,
Andrew Morton, linux-arm-kernel

On Thu, Jan 31, 2019 at 6:28 PM Vladimir Murzin <vladimir.murzin@arm.com> wrote:
>
> Hi Souptick,
>
> On 1/31/19 5:54 AM, Souptick Joarder wrote:
> > On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
> >>
> >> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
> >>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
> >>>>
> >>>> Remove duplicate headers which are included twice.
> >>>>
> >>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
> >>
> >> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
> >>
> >>> Any comment on this patch ?
> >
> > If no further comment, can we get this patch in queue for 5.1 ?
>
> I'd be nice to use proper tags in subject
> line. I'd suggest
>
> [PATCH] ARM: mm: Remove duplicate header
>
> but you can get some inspiration form
>
> git log --oneline --no-merges arch/arm/mm/
>
> In case you want to route it via ARM tree you need to drop it into
> Russell's patch system [1].

How to drop it to Russell's patch system other than posting it to
mailing list ? I don't know.
>
> [1] https://www.armlinux.org.uk/developer/patches/
>
> Cheers

```* Re:
2019-01-31  5:54     ` Souptick Joarder
@ 2019-01-31 12:58       ` Vladimir Murzin
2019-02-01 12:32         ` Re: Souptick Joarder
To: Souptick Joarder, Mike Rapoport
Cc: Michal Hocko, Sabyasachi Gupta, Russell King - ARM Linux,
linux-kernel, rppt, Brajeswar Ghosh, Andrew Morton,
linux-arm-kernel

Hi Souptick,

On 1/31/19 5:54 AM, Souptick Joarder wrote:
> On Thu, Jan 17, 2019 at 4:58 PM Mike Rapoport <rppt@linux.ibm.com> wrote:
>>
>> On Thu, Jan 17, 2019 at 04:53:44PM +0530, Souptick Joarder wrote:
>>> On Mon, Jan 7, 2019 at 10:54 PM Souptick Joarder <jrdr.linux@gmail.com> wrote:
>>>>
>>>> Remove duplicate headers which are included twice.
>>>>
>>>> Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
>>
>> Acked-by: Mike Rapoport <rppt@linux.ibm.com>
>>
>>> Any comment on this patch ?
>
> If no further comment, can we get this patch in queue for 5.1 ?

I'd be nice to use proper tags in subject
line. I'd suggest

[PATCH] ARM: mm: Remove duplicate header

but you can get some inspiration form

git log --oneline --no-merges arch/arm/mm/

In case you want to route it via ARM tree you need to drop it into
Russell's patch system [1].

[1] https://www.armlinux.org.uk/developer/patches/

Cheers

```* Re:
2018-10-23  1:47       ` Re: Michael Tirado
@ 2018-10-23  6:23         ` Dave Airlie
0 siblings, 0 replies; 400+ messages in thread
From: Dave Airlie @ 2018-10-23  6:23 UTC (permalink / raw)

>
> DRM_FILE_PAGE_OFFSET thing.  Check out drivers/gpu/drm/drm_gem.c
> right above drm_gem_init.
>
> ---
>
> /*
>  * We make up offsets for buffer objects so we can recognize them at
>  * mmap time.
>  */
>
> /* pgoff in mmap is an unsigned long, so we need to make sure that
>  * the faked up offset will fit
>  */
>
> #if BITS_PER_LONG == 64
> #define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFFUL >> PAGE_SHIFT) + 1)
> #define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFFUL >> PAGE_SHIFT) * 16)
> #else
> #define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFUL >> PAGE_SHIFT) + 1)
> #define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFUL >> PAGE_SHIFT) * 16)
> #endif
>
>
> ---
>
> Why is having a 64-bit file offsets critical, causing -EINVAL on mmap?
> What problems might be associated with using (0x10000000UL >>
> PAGE_SHIFT) ?

a) it finds people not using the correct userspace defines. mostly
libdrm should handle this,
and possibly mesa.

b) there used to be legacy maps below that address on older drivers,
so we decided to never put stuff in the first 32-bit range that they
could clash with.

Dave.

```* Re:
2018-10-22  1:50     ` Re: Dave Airlie
2018-10-21 22:20       ` Re: Michael Tirado
@ 2018-10-23  1:47       ` Michael Tirado
2018-10-23  6:23         ` Re: Dave Airlie
To: Dave Airlie, LKML, dri-devel

DRM_FILE_PAGE_OFFSET thing.  Check out drivers/gpu/drm/drm_gem.c
right above drm_gem_init.

---

/*
* We make up offsets for buffer objects so we can recognize them at
* mmap time.
*/

/* pgoff in mmap is an unsigned long, so we need to make sure that
* the faked up offset will fit
*/

#if BITS_PER_LONG == 64
#define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFFUL >> PAGE_SHIFT) + 1)
#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFFUL >> PAGE_SHIFT) * 16)
#else
#define DRM_FILE_PAGE_OFFSET_START ((0xFFFFFFFUL >> PAGE_SHIFT) + 1)
#define DRM_FILE_PAGE_OFFSET_SIZE ((0xFFFFFFFUL >> PAGE_SHIFT) * 16)
#endif

---

Why is having a 64-bit file offsets critical, causing -EINVAL on mmap?
What problems might be associated with using (0x10000000UL >>
PAGE_SHIFT) ?
On Mon, Oct 22, 2018 at 1:50 AM Dave Airlie <airlied@gmail.com> wrote:
>
> On Mon, 22 Oct 2018 at 10:49, Michael Tirado <mtirado418@gmail.com> wrote:
> >
> > On Mon, Oct 22, 2018 at 12:26 AM Dave Airlie <airlied@gmail.com> wrote:
> > >
> > > This shouldn't be necessary, did someone misbackport the mmap changes without:
> > >
> > > drm: set FMODE_UNSIGNED_OFFSET for drm files
> > >
> > > Dave.
> >
> > The latest kernel I have had to patch was a 4.18-rc6.  I'll try with a
> > newer 4.19 and let you know if it decides to work.  If not I'll
> > prepare a test case for demonstration on qemu-system-i386.
>
> If you have custom userspace software, make sure it's using
> AC_SYS_LARGEFILE or whatever the equivalant is in your build system.
>
> 64-bit file offsets are important.
>
> Dave.

```* Re:
2018-10-21 20:23   ` Re: Michael Tirado
@ 2018-10-22  1:50     ` Dave Airlie
2018-10-21 22:20       ` Re: Michael Tirado
2018-10-23  1:47       ` Re: Michael Tirado
0 siblings, 2 replies; 400+ messages in thread
From: Dave Airlie @ 2018-10-22  1:50 UTC (permalink / raw)
Cc: Dave Airlie, dri-devel, LKML, Gerd Hoffmann, Deucher, Alexander,
Koenig, Christian, zhoucm1, Hongbo.He, Sean Paul,

>
> On Mon, Oct 22, 2018 at 12:26 AM Dave Airlie <airlied@gmail.com> wrote:
> >
> > This shouldn't be necessary, did someone misbackport the mmap changes without:
> >
> > drm: set FMODE_UNSIGNED_OFFSET for drm files
> >
> > Dave.
>
> The latest kernel I have had to patch was a 4.18-rc6.  I'll try with a
> newer 4.19 and let you know if it decides to work.  If not I'll
> prepare a test case for demonstration on qemu-system-i386.

If you have custom userspace software, make sure it's using
AC_SYS_LARGEFILE or whatever the equivalant is in your build system.

64-bit file offsets are important.

Dave.

```* Re:
@ 2018-10-22  0:26 ` Dave Airlie
2018-10-21 20:23   ` Re: Michael Tirado
From: Dave Airlie @ 2018-10-22  0:26 UTC (permalink / raw)
Cc: Dave Airlie, dri-devel, LKML, Gerd Hoffmann, Deucher, Alexander,
Koenig, Christian, zhoucm1, Hongbo.He, Sean Paul,

>
> Mapping a drm "dumb" buffer fails on 32-bit system (i686) from what
> appears to be a truncated memory address that has been copied
> throughout several files. The bug manifests as an -EINVAL when calling
> mmap with the offset gathered from DRM_IOCTL_MODE_MAP_DUMB <--
> DRM_IOCTL_MODE_ADDFB <-- DRM_IOCTL_MODE_CREATE_DUMB.  I can provide
> test code if needed.
>
> The following patch will apply to 4.18 though I've only been able to
> test through qemu bochs driver and nouveau. Intel driver worked
> without any issues.  I'm not sure if everyone is going to want to
> share a constant, and the whitespace is screwed up from gmail's awful
> javascript client, so let me know if I should resend this with any
> specific changes.  I have also attached the file with preserved
> whitespace.
>

This shouldn't be necessary, did someone misbackport the mmap changes without:

drm: set FMODE_UNSIGNED_OFFSET for drm files

Dave.

```* Re:
2018-10-22  1:50     ` Re: Dave Airlie
@ 2018-10-21 22:20       ` Michael Tirado
2018-10-23  1:47       ` Re: Michael Tirado
1 sibling, 0 replies; 400+ messages in thread
To: Dave Airlie
Cc: Airlied, dri-devel, LKML, kraxel, alexander.deucher,
christian.koenig, David1.zhou, Hongbo.He, Sean Paul, Gustavo,
maarten.lankhorst

[-- Attachment #1: Type: text/plain, Size: 837 bytes --]

On Mon, Oct 22, 2018 at 1:50 AM Dave Airlie <airlied@gmail.com> wrote:
>
> On Mon, 22 Oct 2018 at 10:49, Michael Tirado <mtirado418@gmail.com> wrote:
> >
> > On Mon, Oct 22, 2018 at 12:26 AM Dave Airlie <airlied@gmail.com> wrote:
> > >
> > > This shouldn't be necessary, did someone misbackport the mmap changes without:
> If you have custom userspace software, make sure it's using
> AC_SYS_LARGEFILE or whatever the equivalant is in your build system.
>
> 64-bit file offsets are important.
>

That fixed it! -D_FILE_OFFSET_BITS=64 is the pre-processor define
needed. It's a bit more than unintuitive but I'm glad I don't need
this stupid patch anymore, Thanks.

In case anyone is further interested I have attached test program
since I spent the last hour or so chopping it up anyway :S   [ gcc -o
kms -D_FILE_OFFSET_BITS=64 main.c ]

[-- Attachment #2: main.c --]
[-- Type: application/octet-stream, Size: 17153 bytes --]

*
* This program is libre software: you can redistribute it and/or modify
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details. You should have
* received a copy of the GNU General Public License version 3
* along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <malloc.h>
#include <signal.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <drm/drm.h>
#include <drm/drm_mode.h>

#define STRERR strerror(errno)

#ifndef PTRBITCOUNT
#define PTRBITCOUNT 32
#endif
/* kernel structs use __u64 for pointer types */
#if (PTRBITCOUNT == 32)
#define ptr_from_krn(ptr) ((void *)(uint32_t)(ptr))
#define ptr_to_krn(ptr)   ((uint32_t)(ptr))
#elif (PTRBITCOUNT == 64)
#define ptr_from_krn(ptr) ((void *)(uint64_t)(ptr))
#define ptr_to_krn(ptr)   ((uint64_t)(ptr))
#else
#error "PTRBITCOUNT is undefined"
#endif
#ifndef MAX_FBS
#define MAX_FBS   12
#endif
#ifndef MAX_CRTCS
#define MAX_CRTCS 12
#endif
#ifndef MAX_CONNECTORS
#define MAX_CONNECTORS 12
#endif
#ifndef MAX_ENCODERS
#define MAX_ENCODERS 12
#endif
#ifndef MAX_PROPS
#define MAX_PROPS 256
#endif
#ifndef MAX_MODES
#define MAX_MODES 256
#endif
#if (PTRBITCOUNT == 32)
#define drm_to_ptr(ptr)   ((void *)(uint32_t)(ptr))
#define drm_from_ptr(ptr) ((uint32_t)(ptr))
#elif (PTRBITCOUNT == 64)
#define drm_to_ptr(ptr)   ((void *)(uint64_t)(ptr))
#define drm_from_ptr(ptr) ((uint64_t)(ptr))
#else
#error "PTRBITCOUNT is undefined"
#endif
#define drm_alloc(size) (drm_from_ptr(calloc(1,size)))

struct drm_buffer
{
uint32_t drm_id;
uint32_t fb_id;
uint32_t pitch;
uint32_t width;
uint32_t height;
uint32_t depth;
uint32_t bpp;
size_t   size;
};
struct drm_display
{
struct drm_mode_get_encoder encoder;
struct drm_mode_crtc crtc;
struct drm_mode_get_connector *conn; /* do we need array for multi-screen? */
struct drm_mode_modeinfo *modes; /* these both point to conn's mode array */
struct drm_mode_modeinfo *cur_mode;
uint32_t cur_mode_idx;
uint32_t mode_count;
uint32_t conn_id;
};
struct drm_kms
{
struct drm_display display;
struct drm_buffer *sfb;
struct drm_mode_card_res *res;
int card_fd;
};

/* get id out of drm_id_ptr */
static uint32_t drm_get_id(uint64_t addr, uint32_t idx)
{
}

static int free_mode_card_res(struct drm_mode_card_res *res)
{
if (!res)
return -1;
if (res->fb_id_ptr)
free(drm_to_ptr(res->fb_id_ptr));
if (res->crtc_id_ptr)
free(drm_to_ptr(res->crtc_id_ptr));
if (res->encoder_id_ptr)
free(drm_to_ptr(res->encoder_id_ptr));
if (res->connector_id_ptr)
free(drm_to_ptr(res->connector_id_ptr));
free(res);
return 0;
}

static struct drm_mode_card_res *alloc_mode_card_res(int fd)
{
struct drm_mode_card_res res;
struct drm_mode_card_res *ret;
uint32_t count_fbs, count_crtcs, count_connectors, count_encoders;

memset(&res, 0, sizeof(struct drm_mode_card_res));
if (ioctl(fd, DRM_IOCTL_MODE_GETRESOURCES, &res)) {
printf("ioctl(DRM_IOCTL_MODE_GETRESOURCES, &res): %s\n", STRERR);
return NULL;
}
if (res.count_fbs > MAX_FBS
|| res.count_crtcs > MAX_CRTCS
|| res.count_encoders > MAX_ENCODERS
|| res.count_connectors > MAX_CONNECTORS) {
printf("resource limit reached, see defines.h\n");
return NULL;
}
if (res.count_fbs) {
res.fb_id_ptr = drm_alloc(sizeof(uint32_t)*res.count_fbs);
if (!res.fb_id_ptr)
goto alloc_err;
}
if (res.count_crtcs) {
res.crtc_id_ptr = drm_alloc(sizeof(uint32_t)*res.count_crtcs);
if (!res.crtc_id_ptr)
goto alloc_err;
}
if (res.count_encoders) {
res.encoder_id_ptr = drm_alloc(sizeof(uint32_t)*res.count_encoders);
if (!res.encoder_id_ptr)
goto alloc_err;
}
if (res.count_connectors) {
res.connector_id_ptr = drm_alloc(sizeof(uint32_t)*res.count_connectors);
if (!res.connector_id_ptr)
goto alloc_err;
}
count_fbs = res.count_fbs;
count_crtcs = res.count_crtcs;
count_encoders = res.count_encoders;
count_connectors = res.count_connectors;

if (ioctl(fd, DRM_IOCTL_MODE_GETRESOURCES, &res) == -1) {
printf("ioctl(DRM_IOCTL_MODE_GETRESOURCES, &res): %s\n", STRERR);
goto free_err;
}

if (count_fbs != res.count_fbs
|| count_crtcs != res.count_crtcs
|| count_encoders != res.count_encoders
|| count_connectors != res.count_connectors) {
errno = EAGAIN;
goto free_err;
}

ret = calloc(1, sizeof(struct drm_mode_card_res));
if (!ret)
goto alloc_err;

memcpy(ret, &res, sizeof(struct drm_mode_card_res));
return ret;

alloc_err:
errno = ENOMEM;
free_err:
free(drm_to_ptr(res.fb_id_ptr));
free(drm_to_ptr(res.crtc_id_ptr));
free(drm_to_ptr(res.connector_id_ptr));
free(drm_to_ptr(res.encoder_id_ptr));
return NULL;
}

static struct drm_mode_get_connector *alloc_connector(int fd, uint32_t conn_id)
{
struct drm_mode_get_connector conn;
struct drm_mode_get_connector *ret;
uint32_t count_modes, count_props, count_encoders;

memset(&conn, 0, sizeof(struct drm_mode_get_connector));
conn.connector_id = conn_id;

if (ioctl(fd, DRM_IOCTL_MODE_GETCONNECTOR, &conn) == -1) {
printf("ioctl(DRM_IOCTL_MODE_GETCONNECTOR, &conn): %s\n", STRERR);
return NULL;
}
if (conn.count_modes > MAX_MODES
|| conn.count_props > MAX_PROPS
|| conn.count_encoders > MAX_ENCODERS) {
printf("resource limit reached, see defines.h\n");
return NULL;
}
if (conn.count_modes) {
conn.modes_ptr = drm_alloc(sizeof(struct drm_mode_modeinfo)
* conn.count_modes);
if (!conn.modes_ptr)
goto alloc_err;
}
if (conn.count_props) {
conn.props_ptr = drm_alloc(sizeof(uint32_t)*conn.count_props);
if (!conn.props_ptr)
goto alloc_err;
conn.prop_values_ptr = drm_alloc(sizeof(uint64_t)*conn.count_props);
if (!conn.prop_values_ptr)
goto alloc_err;
}
if (conn.count_encoders) {
conn.encoders_ptr = drm_alloc(sizeof(uint32_t)*conn.count_encoders);
if (!conn.encoders_ptr)
goto alloc_err;
}
count_modes = conn.count_modes;
count_props = conn.count_props;
count_encoders = conn.count_encoders;

if (ioctl(fd, DRM_IOCTL_MODE_GETCONNECTOR, &conn) == -1) {
printf("ioctl(DRM_IOCTL_MODE_GETCONNECTOR, &conn): %s\n", STRERR);
goto free_err;
}

if (count_modes != conn.count_modes
|| count_props != conn.count_props
|| count_encoders != conn.count_encoders) {
errno = EAGAIN;
goto free_err;
}

ret = calloc(1, sizeof(struct drm_mode_get_connector));
if (!ret)
goto alloc_err;

memcpy(ret, &conn, sizeof(struct drm_mode_get_connector));
return ret;

alloc_err:
errno = ENOMEM;
free_err:
free(drm_to_ptr(conn.modes_ptr));
free(drm_to_ptr(conn.props_ptr));
free(drm_to_ptr(conn.encoders_ptr));
free(drm_to_ptr(conn.prop_values_ptr));
return NULL;
}

static struct drm_mode_modeinfo *get_connector_modeinfo(struct drm_mode_get_connector *conn,  uint32_t *count)
{
if (!conn || !count)
return NULL;
*count = conn->count_modes;
return drm_to_ptr(conn->modes_ptr);

}

static int free_connector(struct drm_mode_get_connector *conn)
{
if (!conn)
return -1;
if (conn->modes_ptr)
free(drm_to_ptr(conn->modes_ptr));
if (conn->props_ptr)
free(drm_to_ptr(conn->props_ptr));
if (conn->encoders_ptr)
free(drm_to_ptr(conn->encoders_ptr));
if (conn->prop_values_ptr)
free(drm_to_ptr(conn->prop_values_ptr));
free(conn);
return 0;
}

static int drm_kms_connect_sfb(struct drm_kms *self)
{
struct drm_display *display = &self->display;
struct drm_mode_get_connector *conn;
struct drm_mode_modeinfo *cur_mode;
struct drm_mode_get_encoder *encoder;
struct drm_mode_crtc *crtc;
if (!display || !display->conn || !display->cur_mode || !self->sfb)
return -1;

conn = display->conn;
cur_mode = display->cur_mode;
encoder = &self->display.encoder;
crtc = &self->display.crtc;
memset(crtc, 0, sizeof(struct drm_mode_crtc));
memset(encoder,  0, sizeof(struct drm_mode_get_encoder));

/* XXX: there can be multiple encoders, have not investigated this much */
if (conn->encoder_id == 0) {
printf("conn->encoder_id was 0, defaulting to encoder[0]\n");
conn->encoder_id = ((uint32_t *)drm_to_ptr(conn->encoders_ptr))[0];
}
encoder->encoder_id = conn->encoder_id;

if (ioctl(self->card_fd, DRM_IOCTL_MODE_GETENCODER, encoder) == -1) {
printf("ioctl(DRM_IOCTL_MODE_GETENCODER): %s\n", STRERR);
return -1;
}

if (encoder->crtc_id == 0) {
printf("encoder->crtc_id was 0, defaulting to crtc[0]\n");
encoder->crtc_id = ((uint32_t *)drm_to_ptr(self->res->crtc_id_ptr))[0];
}
crtc->crtc_id = encoder->crtc_id;

if (ioctl(self->card_fd, DRM_IOCTL_MODE_GETCRTC, crtc) == -1) {
printf("ioctl(DRM_IOCTL_MODE_GETCRTC): %s\n", STRERR);
return -1;
}

/* set crtc mode */
crtc->fb_id = self->sfb->fb_id;
crtc->set_connectors_ptr = drm_from_ptr((void *)&conn->connector_id);
crtc->count_connectors = 1;
crtc->mode = *cur_mode;
/*printf("\nsetting mode:\n\n");
print_mode_modeinfo(cur_mode);*/
crtc->mode_valid = 1;
if (ioctl(self->card_fd, DRM_IOCTL_MODE_SETCRTC, crtc) == -1) {
printf("ioctl(DRM_IOCTL_MODE_SETCRTC): %s\n", STRERR);
return -1;
}
return 0;
}

/* stupid frame buffer */
static struct drm_buffer *alloc_sfb(int card_fd,
uint32_t width,
uint32_t height,
uint32_t depth,
uint32_t bpp)
{
struct drm_mode_create_dumb cdumb;
struct drm_mode_map_dumb    moff;
struct drm_mode_fb_cmd      cmd;
struct drm_buffer *ret;
void  *fbmap;

memset(&cdumb, 0, sizeof(cdumb));
memset(&moff,  0, sizeof(moff));
memset(&cmd,   0, sizeof(cmd));

/* create dumb buffer */
cdumb.width  = width;
cdumb.height = height;
cdumb.bpp    = bpp;
cdumb.flags  = 0;
cdumb.pitch  = 0;
cdumb.size   = 0;
cdumb.handle = 0;
if (ioctl(card_fd, DRM_IOCTL_MODE_CREATE_DUMB, &cdumb) == -1) {
printf("ioctl(DRM_IOCTL_MODE_CREATE_DUMB): %s\n", STRERR);
return NULL;
}
cmd.width  = cdumb.width;
cmd.height = cdumb.height;
cmd.bpp    = cdumb.bpp;
cmd.pitch  = cdumb.pitch;
cmd.depth  = depth;
cmd.handle = cdumb.handle;
if (ioctl(card_fd, DRM_IOCTL_MODE_ADDFB, &cmd) == -1) {
ioctl(card_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &cdumb.handle);
return NULL;
}
/* get mmap offset */
moff.handle = cdumb.handle;
if (ioctl(card_fd, DRM_IOCTL_MODE_MAP_DUMB, &moff) == -1) {
printf("ioctl(DRM_IOCTL_MODE_MAP_DUMB): %s\n", STRERR);
ioctl(card_fd, DRM_IOCTL_MODE_RMFB, &cmd.fb_id);
ioctl(card_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &cdumb.handle);
return NULL;
}
/* XXX this is probably better off as MAP_PRIVATE, we can't prime
* the main framebuffer if it's "dumb", AFAIK */
MAP_SHARED, card_fd, (off_t)moff.offset);
if (fbmap == MAP_FAILED) {
printf("framebuffer mmap failed: %s\n", STRERR);
ioctl(card_fd, DRM_IOCTL_MODE_RMFB, &cmd.fb_id);
ioctl(card_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &cdumb.handle);
return NULL;
}

ret = calloc(1, sizeof(struct drm_buffer));
if (!ret) {
printf("-ENOMEM\n");
munmap(fbmap, cdumb.size);
ioctl(card_fd, DRM_IOCTL_MODE_RMFB, &cmd.fb_id);
ioctl(card_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &cdumb.handle);
return NULL;
}
ret->size     = cdumb.size;
ret->pitch    = cdumb.pitch;
ret->width    = cdumb.width;
ret->height   = cdumb.height;
ret->bpp      = cdumb.bpp;
ret->depth    = cmd.depth;
ret->fb_id    = cmd.fb_id;
ret->drm_id   = cdumb.handle;
memset(fbmap, 0x27, cdumb.size);
return ret;
}

static int destroy_sfb(int card_fd, struct drm_buffer *sfb)
{
if (!sfb)
return -1;

printf("munmap: %s\n", STRERR);
if (ioctl(card_fd, DRM_IOCTL_MODE_RMFB, &sfb->fb_id))
printf("ioctl(DRM_IOCTL_MODE_RMFB): %s\n", STRERR);
if (ioctl(card_fd, DRM_IOCTL_MODE_DESTROY_DUMB, &sfb->drm_id))
printf("ioctl(DRM_IOCTL_MODE_DESTROY_DUMB): %s\n", STRERR);
free(sfb);
return 0;
}
static int card_set_master(int card_fd)
{
if (ioctl(card_fd, DRM_IOCTL_SET_MASTER, 0)) {
printf("ioctl(DRM_IOCTL_SET_MASTER, 0): %s\n", STRERR);
return -1;
}
return 0;
}
static int card_drop_master(int card_fd)
{
if (ioctl(card_fd, DRM_IOCTL_DROP_MASTER, 0)) {
printf("ioctl(DRM_IOCTL_DROP_MASTER, 0): %s\n", STRERR);
return -1;
}
return 0;
}
static int drm_display_destroy(struct drm_display *display)
{
if (display->conn)
free_connector(display->conn);
memset(display, 0, sizeof(struct drm_display));
return 0;
}
int drm_kms_destroy(struct drm_kms *self)
{
if (self->sfb)
destroy_sfb(self->card_fd, self->sfb);
if (self->res)
free_mode_card_res(self->res);
drm_display_destroy(&self->display);

close(self->card_fd);
memset(self, 0, sizeof(struct drm_kms));
free(self);
return 0;
}
static int get_mode_idx(struct drm_mode_modeinfo *modes,
uint16_t count,
uint16_t width,
uint16_t height,
uint16_t refresh)
{
int i;
int pick = -1;
if (width == 0)
width = 0xffff;
if (height == 0)
height = 0xffff;
for (i = 0; i < count; ++i)
{
if (modes[i].hdisplay > width || modes[i].vdisplay > height)
continue;
/* pretend these radical modes don't exist for now */
if (modes[i].hdisplay % 16 == 0) {
if (pick < 0) {
pick = i;
continue;
}
if (modes[i].hdisplay > modes[pick].hdisplay)
pick = i;
else if (modes[i].vdisplay > modes[pick].vdisplay)
pick = i;
else if (modes[i].hdisplay == modes[pick].hdisplay
&& modes[i].vdisplay == modes[pick].vdisplay) {
if (abs(refresh - modes[i].vrefresh)
< abs(refresh - modes[pick].vrefresh)) {
pick = i;
}
}
}
}
if (pick < 0) {
printf("could not find any usable modes for (%dx%d@%dhz)\n",
width, height, refresh);
return -1;
}
return pick;
}
/* TODO handle hotplugging */
uint16_t req_width,
uint16_t req_height,
uint16_t req_refresh,
struct drm_display *out)
{
uint32_t conn_id;
int idx = -1;

/* FIXME uses primary connector? "0" */
conn_id = drm_get_id(self->res->connector_id_ptr, 0);
out->conn = alloc_connector(self->card_fd, conn_id);
if (!out->conn) {
printf("unable to create drm connector structure\n");
return -1;
}

out->conn_id = conn_id;
out->modes = get_connector_modeinfo(out->conn, &out->mode_count);
idx = get_mode_idx(out->modes, out->mode_count,
req_width, req_height, req_refresh);
if (idx < 0)
goto free_err;

out->cur_mode_idx = (uint32_t)idx;
out->cur_mode = &out->modes[out->cur_mode_idx];
return 0;
free_err:
drm_display_destroy(out);
return -1;
}
struct drm_kms *drm_mode_create(char *devname,
int no_connect,
uint16_t req_width,
uint16_t req_height,
uint16_t req_refresh)
{
char devpath[128];
struct drm_kms *self;
struct drm_mode_modeinfo *cur_mode;
int card_fd;

snprintf(devpath, sizeof(devpath), "/dev/dri/%s", devname);
card_fd = open(devpath, O_RDWR|O_CLOEXEC);
if (card_fd == -1) {
printf("open(%s): %s\n", devpath, STRERR);
return NULL;
}
if (card_set_master(card_fd)) {
printf("card_set_master failed\n");
return NULL;
}

self = calloc(1, sizeof(struct drm_kms));
if (!self)
return NULL;

self->card_fd = card_fd;
self->res = alloc_mode_card_res(card_fd);
if (!self->res) {
printf("unable to create drm structure\n");
goto free_err;
}

if (drm_display_load(self, req_width, req_height, req_refresh, &self->display)) {
goto free_err;
}
cur_mode = self->display.cur_mode;
printf("connector(%d) using mode[%d] (%dx%d@%dhz)\n",
self->display.conn_id,
self->display.cur_mode_idx,
cur_mode->hdisplay,
cur_mode->vdisplay,
cur_mode->vrefresh);

/* buffer pitch must divide evenly by 16,
* TODO check against bpp here when that is variable instead of 32 */
self->sfb = alloc_sfb(card_fd, cur_mode->hdisplay, cur_mode->vdisplay, 24, 32);
if (!self->sfb) {
printf("alloc_sfb failed\n");
goto free_err;
}

if (!no_connect && drm_kms_connect_sfb(self)) {
printf("drm_kms_connect_sfb failed\n");
goto free_err;
}
return self;

free_err:
drm_kms_destroy(self);
return NULL;
}

int main(int argc, char *argv[])
{
int ret = -1;
struct drm_kms *card0;
/*card0 = drm_mode_create("card0", g_srv_opts.inactive_vt,
g_srv_opts.request_width,
g_srv_opts.request_height,
g_srv_opts.request_refresh);*/
/* do not connect to vt */
card0 = drm_mode_create("card0", 1, 640, 480, 60);
if (card0 == NULL) {
printf("drm_mode_create failed\n");
return -1;
}

drm_kms_destroy(card0);

printf("looks ok, returning 0\n");
return 0;
}

```* Re:
2018-10-22  0:26 ` Re: Dave Airlie
@ 2018-10-21 20:23   ` Michael Tirado
2018-10-22  1:50     ` Re: Dave Airlie
To: airlied
Cc: Airlied, dri-devel, LKML, kraxel, alexander.deucher,
christian.koenig, David1.zhou, Hongbo.He, Sean Paul, Gustavo,
maarten.lankhorst

On Mon, Oct 22, 2018 at 12:26 AM Dave Airlie <airlied@gmail.com> wrote:
>
> This shouldn't be necessary, did someone misbackport the mmap changes without:
>
> drm: set FMODE_UNSIGNED_OFFSET for drm files
>
> Dave.

The latest kernel I have had to patch was a 4.18-rc6.  I'll try with a
newer 4.19 and let you know if it decides to work.  If not I'll
prepare a test case for demonstration on qemu-system-i386.

```* Re:
@ 2018-07-06 21:18   ` Santosh Shilimkar
0 siblings, 0 replies; 400+ messages in thread
From: Santosh Shilimkar @ 2018-07-06 21:18 UTC (permalink / raw)
To: arm, linux-arm-kernel; +Cc: khilman, arnd, olof, linux-kernel

Ignore this.. Will send again with subjects fixed

On 7/6/2018 2:16 PM, Santosh Shilimkar wrote:
> Subject: [GIT PULL 3/3] SOC: Driver updates for v4.19
>
> The following changes since commit ce397d215ccd07b8ae3f71db689aedb85d56ab40:
>
>    Linux 4.18-rc1 (2018-06-17 08:04:49 +0900)
>
> are available in the git repository at:
>
>    git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux-keystone.git tags/soc_drivers_for_4.19
>
> for you to fetch changes up to 990c10091db318c7eb7e8935c86b6f7c01585015:
>
>    soc: ti: wkup_m3_ipc: mark PM functions as __maybe_unused (2018-07-06 09:47:51 -0700)
>
> ----------------------------------------------------------------
> Keystone SOC driver update for 4.19
>
>   -  Add suspend/resume functionality to TI EMIF SRAM driver
>   -  Add wakeup M3 RTC self refresh support
>   -  Fix for the PM runtime ifdefs
>
> ----------------------------------------------------------------
> Arnd Bergmann (1):
>        soc: ti: wkup_m3_ipc: mark PM functions as __maybe_unused
>
> Dave Gerlach (2):
>        memory: ti-emif-sram: Add resume function to recopy sram code
>        soc: ti: wkup_m3_ipc: Add wkup_m3_request_wake_src
>
> Keerthy (1):
>        soc: ti: wkup_m3_ipc: Add rtc_only with ddr in self refresh mode support
>
>   drivers/memory/ti-emif-pm.c  | 33 +++++++++++++++++++
>   drivers/soc/ti/wkup_m3_ipc.c | 76 ++++++++++++++++++++++++++++++++++++++++++++
>   include/linux/wkup_m3_ipc.h  |  9 ++++++
>   3 files changed, 118 insertions(+)
>

```* Re:
2018-01-11 17:16 Fabian Huegel
@ 2018-01-11 17:25 ` Ben Evans
0 siblings, 0 replies; 400+ messages in thread
From: Ben Evans @ 2018-01-11 17:25 UTC (permalink / raw)
To: Fabian Huegel, Oleg Drokin, Andreas Dilger, James Simmons,
Lai Siyao, John L . Hammond, Greg Kroah-Hartman, devel,
Ben Evans, NeilBrown
Cc: lustre-devel, linux-kernel, linux-kernel

I've been working off and on with this.  Since you're getting into the
counters in a couple of the patches, part of the reason for all the
#defines here are because MDC, MDT and OST counters are all shoved into
the same array dynamically, sometimes.  It would be a much cleaner
approach to have a separate array for the MDC stats, then print them
conditionally.

This would reduce all of the calls to these macros to counter increments.

-Ben Evans

On 1/11/18, 12:16 PM, "Fabian Huegel" <fabian_huegel@web.de> wrote:

>We cleaned up a lot of checkpatch errors and warnings in obd_class.h,
>but there are still some CHECKs and two warnings about flow control
>inside macros left.
>
>Changing those macros to inline functions would probably
>be a good idea, unfortunatly it's not straightforward since they use
>'#op' to print the name of the operation.
>
>We also did some aligning to make the code more readable and removed
>an unnecessary macro.
>
>We only tested, that the kernel still compiles and the lustre kernel
>module loads successfully, but given the harmless nature of these
>changes we don't expect any problems.
>
>The patches are based on the staging-testing branch of the staging tree.
>

```* Re:
@ 2017-11-13 14:55 Amos Kalonzo
0 siblings, 0 replies; 400+ messages in thread
From: Amos Kalonzo @ 2017-11-13 14:55 UTC (permalink / raw)

Attn:

I am wondering why You haven't respond to my email for some days now.
reference to my client's contract balance payment of (11.7M,USD)
Kindly get back to me for more details.

Best Regards

Amos Kalonzo

```* Re:
@ 2017-09-07  8:50 Quick Loan
0 siblings, 0 replies; 400+ messages in thread
From: Quick Loan @ 2017-09-07  8:50 UTC (permalink / raw)

Hello dear I am an International loan lender, I give out loans at 1% interest
rate, email me at:(rich_ken2016@usa.com)

```* Re:
@ 2017-08-18 19:47 Jessy
0 siblings, 0 replies; 400+ messages in thread
From: Jessy @ 2017-08-18 19:47 UTC (permalink / raw)
To: Recipients

Hello,

I wish to seek for your assistance in a deal that will be of mutual benefit for the both of us from Camp Stanley in Uijeongbu. Please contact me for details, God bless you.

```* RE:
@ 2017-07-31 23:46 TD CREDIT
0 siblings, 0 replies; 400+ messages in thread
From: TD CREDIT @ 2017-07-31 23:46 UTC (permalink / raw)
To: Recipients

DO YOU NEED ANY KIND OF LOAN CREDIT ASSISTANCE? IF YES,EMAIL US FOR MORE INFO.

---
This email is free from viruses and malware because avast! Antivirus protection is active.
http://www.avast.com

```* Re:
@ 2017-07-15  3:29 Saif Al-Islam
0 siblings, 0 replies; 400+ messages in thread
From: Saif Al-Islam @ 2017-07-15  3:29 UTC (permalink / raw)

I need your assistance in a transaction that will benefit you details

Regards,
Saif.

```* RE:
@ 2017-07-07 17:04 Mrs Alice Walton
0 siblings, 0 replies; 400+ messages in thread
From: Mrs Alice Walton @ 2017-07-07 17:04 UTC (permalink / raw)

--
my name is Mrs. Alice Walton, a business woman an America Citizen and
the heiress to the fortune of Walmart stores, born October 7, 1949. I
have a mission for you worth \$100,000,000.00(Hundred Million United
State Dollars) which I intend using for CHARITY PROJECT to help the
less privilege and orphanage

```* RE:
@ 2017-05-28 13:39 Lasek László
0 siblings, 0 replies; 400+ messages in thread
From: Lasek László @ 2017-05-28 13:39 UTC (permalink / raw)
To: kdj

Hello Friend

I Have a Proposal for your kindly contact me via:  emzong@outlook.com

Thank You.

________

ü Mielőtt kinyomtatja ezt az e-mailt, gondoljon a környezetre. P Please consider the environment before printing this email.
*******

Ezt az emailt a Websense ESG ellenőrizte a BKV Zrt. biztonsági szabályzata alapján. Nem található benne vírus.

```* Re:
@ 2017-05-03  6:23 H.A
0 siblings, 0 replies; 400+ messages in thread
From: H.A @ 2017-05-03  6:23 UTC (permalink / raw)
To: Recipients

With profound love in my heart, I Kindly Oblige your interest to very important proposal.. It is Truly Divine and require your utmost attention..........

S hlubokou láskou v mém srdci, Laskave jsem prinutit svuj zájem k návrhu .. Je velmi duležité, skutecne Divine a vyžadují vaši nejvyšší pozornost.

Kontaktujte me prímo pres: helenaroberts99@gmail.com pro úplné podrobnosti.complete.

HELINA .A ROBERTS

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

```* Re:
2017-04-28  8:43 ` Linus Walleij
@ 2017-04-28  9:26   ` Anatolij Gustschin
0 siblings, 0 replies; 400+ messages in thread
From: Anatolij Gustschin @ 2017-04-28  9:26 UTC (permalink / raw)
To: Linus Walleij
Cc: Alexandre Courbot, Andy Shevchenko, linux-gpio, linux-kernel

On Fri, 28 Apr 2017 10:43:19 +0200
Linus Walleij linus.walleij@linaro.org wrote:
...
>> --- a/include/linux/gpio/consumer.h
>> +++ b/include/linux/gpio/consumer.h
>
>So why should the stubs be in <linux/gpio/consumer.h>
>and not in <linux/gpio/machine.h>?

good question. I'll move them to machine.h.

Thanks,
Anatolij

```* Re:
2017-04-28  8:20 Anatolij Gustschin
@ 2017-04-28  8:43 ` Linus Walleij
2017-04-28  9:26   ` Re: Anatolij Gustschin
From: Linus Walleij @ 2017-04-28  8:43 UTC (permalink / raw)
To: Anatolij Gustschin
Cc: Alexandre Courbot, Andy Shevchenko, linux-gpio, linux-kernel

On Fri, Apr 28, 2017 at 10:20 AM, Anatolij Gustschin <agust@denx.de> wrote:

> Subject: [PATCH v3] gpiolib: Add stubs for gpiod lookup table interface
>
> for the !GPIOLIB case to prevent build errors. Also add prototypes.
>
> Signed-off-by: Anatolij Gustschin <agust@denx.de>
> ---
> Changes in v3:
>  - add stubs for !GPIOLIB case. Drop prototypes, these are

Yeah...

> --- a/include/linux/gpio/consumer.h
> +++ b/include/linux/gpio/consumer.h

So why should the stubs be in <linux/gpio/consumer.h>
and not in <linux/gpio/machine.h>?

Yours,
Linus Walleij

```* RE:
@ 2017-02-23 15:09 Qin's Yanjun
0 siblings, 0 replies; 400+ messages in thread
From: Qin's Yanjun @ 2017-02-23 15:09 UTC (permalink / raw)

How are you today and your family? I require your attention and honest
co-operation about some issues which i will really want to discuss with you
which.  Looking forward to read from you soon.

Qin's

______________________________

Sky Silk, http://aknet.kz

```* Re:
@ 2017-01-07 14:47 Information
0 siblings, 0 replies; 400+ messages in thread
From: Information @ 2017-01-07 14:47 UTC (permalink / raw)

Do you need loan? we offer all kinds of loan from minimum amount of \$5,000 to maximum of \$2,000,000 if you are interested contact us via: internationalloan09@gmail.com    with the information below:

Full Name:
Country:
Loan Amount:
Loan Duration:
Mobile phone number:
Sex:

Thanks,
Dr Scott.

```* re:
@ 2016-11-15  4:40 Apply
0 siblings, 0 replies; 400+ messages in thread
From: Apply @ 2016-11-15  4:40 UTC (permalink / raw)
To: Recipients

Do you need loan?we offer all kinds of loan from minimum amount of \$5,000 to maximum of \$2,000,000 if you are interested contact us via:internationalloanplc1@gmail.com  with the information below:
Full Name:
Country:
Loan Amount:
Loan Duration:
Mobile phone number:
Sex:
Thanks,
Dr Scott.

```* Re:
@ 2016-11-08 13:46 vaserman
0 siblings, 0 replies; 400+ messages in thread
From: vaserman @ 2016-11-08 13:46 UTC (permalink / raw)
To: info

--

```* RE:
@ 2016-11-02  2:36 U
0 siblings, 0 replies; 400+ messages in thread
From: U @ 2016-11-02  2:36 UTC (permalink / raw)
To: info

[-- Attachment #1: Type: text/plain, Size: 24 bytes --]

--
UK NATIONAL LOTTERY

[-- Attachment #2: UK.jpg --]
[-- Type: , Size: 185835 bytes --]

```* Re:
@ 2016-07-21 21:50 Amit Jain
0 siblings, 0 replies; 400+ messages in thread
From: Amit Jain @ 2016-07-21 21:50 UTC (permalink / raw)

Dear Esteemed Friend,

My name is Mr. Amit Jain, I double as the Group Chief Operations Officer Emaar Properties and Chief Executive Officer Emaar Dubai. Let me know the possibilities of setting up a private investment in your country as I am interested in your region. I have interest in Real Estate, Industry, Energy and Agriculture.

I expect to get a response from you to enable us discuss more.

Regards,
Mr. Amit Jain.
P.O. Box 120360,
Group Chief Operations Officer
Emaar Properties PJSC

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

```* Re:
@ 2016-07-04 15:47 Mr. Bun Sam
0 siblings, 0 replies; 400+ messages in thread
From: Mr. Bun Sam @ 2016-07-04 15:47 UTC (permalink / raw)
To: linux-kernel

Hi,

I work with one of the major banks in Cambodia as the director of audit. I have a proposal for you, a very urgent and quick business that will be completed in 12 working days. I have just discovered documents relating to funds belonging to a deceased client of our bank,

I went through all the related documents to the funds and I discovered no listed next of kin to inherit the funds which has been in our bank for more than 7 years now. I need your cooperation in getting the funds, I have the power to list you as the beneficiary of the funds and have the funds transferred to you.

If you are interested, do get back to me so I can provide you with the full details.

Regards
Bun Sam.

```* Re:
@ 2016-07-02 11:30 Mr. Bun Sam
0 siblings, 0 replies; 400+ messages in thread
From: Mr. Bun Sam @ 2016-07-02 11:30 UTC (permalink / raw)
To: linux-kernel

Hi,

I work with one of the major banks in Cambodia as the director of audit. I have a proposal for you, a very urgent and quick business that will be completed in 12 working days. I have just discovered documents relating to funds belonging to a deceased client of our bank,

I went through all the related documents to the funds and I discovered no listed next of kin to inherit the funds which has been in our bank for more than 7 years now. I need your cooperation in getting the funds, I have the power to list you as the beneficiary of the funds and have the funds transferred to you.

If you are interested, do get back to me so I can provide you with the full details.

Regards
Bun Sam.

```* Re:
@ 2016-06-27  8:24 Fidelity Loans
0 siblings, 0 replies; 400+ messages in thread
From: Fidelity Loans @ 2016-06-27  8:24 UTC (permalink / raw)
To: Recipients

```* Re:
@ 2016-02-10 14:44 ` Steven Rostedt
0 siblings, 0 replies; 400+ messages in thread
From: Steven Rostedt @ 2016-02-10 14:44 UTC (permalink / raw)
Cc: Denys Vlasenko, linux-kernel, srostedt, Tejun Heo, Peter Hurley,
Jan Kara, Sergey Senozhatsky, Andrew Morton, Kyle McMartin,
KY Srinivasan, Dave Jones, Calvin Owens

On Wed, 10 Feb 2016 15:36:49 +0100

> Bcc:
> Subject: Re: [PATCH] printk: avoid livelock if another CPU printks
>  continuously
>

> > +	if (cnt == 0) {
> > +		/*
> > +		 * Other CPU(s) printk like crazy, filling log_buf[].
> > +		 * Try to get rid of the "honor" of servicing their data:
> > +		 * give _them_ time to grab console_sem and start working.
> > +		 */
> > +		cnt = 9999;
> > +		while (--cnt != 0) {
> > +			cpu_relax();
> > +			if (console_seq == log_next_seq) {
>
> This condition is true when all available messages are printed to
> the console. It means that there is nothing to do at all. It is
> quite late. A much better solution would be to store console_seq
> to a local variable and check it is being modified by an other CPU.
>

Yep, I recommended the same thing.

>
> > +				/* Good, other CPU entered "for(;;)" loop */
> > +				goto out;
> > +			}
> > +		}
> > +		/* No one seems to be willing to take it... */
> > +		if (console_trylock())
> > +			goto again; /* we took it */
> > +		/* Nope, someone else holds console_sem! Good */
>
> The cycle gives a big chance other CPUs to enter console_unlock().
> It means that more CPUs might end up in the above busy cycle.
>
> It gives a chance to move the printing to another CPU. It likely
> slows down the flood of messages because the producer end up
> here as well.
>
> So, it probably works but the performance is far from optimal.
> Many CPUs might end up doing nothing. I am afraid that this is
> not the right way to go.

Note, it's not that performance critical, and the loop only happens if
someone else is adding to the console, which hopefully, should be rare.

-- Steve

```* re:
@ 2016-02-08  3:11 Qatar Foundation
0 siblings, 0 replies; 400+ messages in thread
From: Qatar Foundation @ 2016-02-08  3:11 UTC (permalink / raw)
To: Recipients

Dear Beneficiary,
You have been selected to receive €950,000.00 EURO as charity donations
aid of the Qatar Foundation. Reply back for
information and claims.
Yours sincerely,
Mr. Rashid Al-Naimi.
The Chief Executive Officer of
Qatar Foundation Endowment.

```* Re:
@ 2016-01-26 20:52 Ms Nadia Mohammed
0 siblings, 0 replies; 400+ messages in thread
To: Recipients

I hope this letter find you in good health , I'm Ms Nadia Mohammed , I have a project i want to bring to you and i want you to reply me and help me discuss this proposal. please reply to my personal email at :   nadiamohammed099@gmail.com

```* Re:
@ 2016-01-15  2:39 Trust Guarantee
0 siblings, 0 replies; 400+ messages in thread
From: Trust Guarantee @ 2016-01-15  2:39 UTC (permalink / raw)

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain;charset=iso-88591-1, Size: 496 bytes --]

Sie benötigen dringend Darlehen? Wir geben Darlehen an interessierte
Einzelpersonen, die versuchen, Darlehen mit Treu und Glauben. Sind Sie
ernsthaft brauchen dringend Darlehen? dann sind Sie an der richtigen
Stelle. Wir geben Business-Darlehen, persönliche Darlehen, kontaktieren
Sie uns für Ihr Darlehen beantragen, um Ihre Nachfrage zu befriedigen und
per E-Mail aus Finanz problem.contact uns heute fest:
trustguarantee1@gmail.com
Dank wie wir erwarte Ihre Antwort
Vertrauensgarantie Loan

```* Re:
@ 2015-12-18 11:50
0 siblings, 0 replies; 400+ messages in thread
From:  @ 2015-12-18 11:50 UTC (permalink / raw)
To: Recipients

I thought you would have responded by now?

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

```* Re:
@ 2015-12-11  9:30 Матвеева Руслана
0 siblings, 0 replies; 400+ messages in thread
From: Матвеева Руслана @ 2015-12-11  9:30 UTC (permalink / raw)
To: linux-kernel

Приветствую Вас.

Заказывали ли Вы, рекламу по почтовым майл адресам?

vip.tacanuur48@mail.ru

```* RE:
@ 2015-11-24 13:21 ` Amis, Ryann
0 siblings, 0 replies; 400+ messages in thread
From: Amis, Ryann @ 2015-11-24 13:21 UTC (permalink / raw)
To: MGCCC Helpdesk

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1309 bytes --]

â€‹Our new web mail has been improved with a new messaging system from Owa/outlook which also include faster usage on email, shared calendar, web-documents and the new 2015 anti-spam version. Please use the link below to complete your update for our new Owa/outlook improved web mail. CLICK HERE<https://formcrafts.com/a/15851> to update or Copy and pest the Link to your Browser: http://bit.ly/1Xo5Vd4
Thanks,
-----------------------------------------
The information contained in this e-mail message is intended only for the personal and confidential use of the recipient(s) named above. This message may be an attorney-client communication and/or work product and as such is privileged and confidential. If the reader of this message is not the intended recipient or an agent responsible for delivering it to the intended recipient, you are hereby notified that you have received this document in error and that any review, dissemination, distribution, or copying of this message is strictly prohibited. If you have received this communication in error, please notify us immediately by e-mail, and delete the original message.
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éÝ¶\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dÊ‡Ú™ë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

```* Re:
@ 2015-11-07 17:33 ` bbmbbm1
0 siblings, 0 replies; 400+ messages in thread
From: bbmbbm1 @ 2015-11-07 17:33 UTC (permalink / raw)
To: BMW Automobile

----- Original Message -----
From:
Sent: Sat, 07 Nov 2015 12:20:21 -0500 (EST)
Subject:

Your email has won \$ 1,200,000.00 in the 2015 BMW lottery in United
> States of America. You are advice to send us the following
>

```* Re:
@ 2015-11-07 16:48 Mohammed
0 siblings, 0 replies; 400+ messages in thread
From: Mohammed @ 2015-11-07 16:48 UTC (permalink / raw)
To: Recipients

Message from Saudi Arabia Prince Alwaleed bin Talal for his charity donation and You have been selected as recipient/benefactor for \$5Million Dollars from Alwaleed Philanthropic Foundation Grant.for more information contact Via email  ally00256@yandex.com

Thanks

Ally Mohammed

```* RE:
@ 2015-10-29  2:40
0 siblings, 0 replies; 400+ messages in thread
From:  @ 2015-10-29  2:40 UTC (permalink / raw)
To: Recipients

Hello,

I am Major. Alan Edward, in the military unit here in Afghanistan and i need an urgent assistance with someone i can trust,It's risk free and legal.

---
This email has been checked for viruses by Avast antivirus software.
http://www.avast.com

```* RE:
@ 2015-10-23 14:46 MajorAlan
0 siblings, 0 replies; 400+ messages in thread
From: MajorAlan @ 2015-10-23 14:46 UTC (permalink / raw)
To: Recipients

I am in the military unit here in Afghanistan,we have some amount of funds that we want to move out of the country.My partners and I need a good partner someone we can trust. It is risk free and legal. Reply to this email   majoralan.edward@gmx.com

Regards,
Major. Alan Edward

```* Re:
@ 2015-10-21  2:26 Mohammed
0 siblings, 0 replies; 400+ messages in thread
From: Mohammed @ 2015-10-21  2:26 UTC (permalink / raw)
To: Recipients

Message from Saudi Arabia Prince Alwaleed bin Talal for his charity donation and You have been selected as recipient/benefactor for \$5Million Dollars from Alwaleed Philanthropic Foundation Grant.for more information contact Via email  ally00256@yandex.com

Thanks

Ally Mohammed

```* Re ...
@ 2015-10-08  8:30 BRGF
0 siblings, 0 replies; 400+ messages in thread
From: BRGF @ 2015-10-08  8:30 UTC (permalink / raw)
To: Recipients

Full Name:
Country:
Loan Amount:
Duration:
Purpose of Loan:
Phone Number:

We hope to hear from you soonest.

Hamon Hassan
operations manager.

```* Re:
@ 2015-09-01 16:06 Zariya
0 siblings, 0 replies; 400+ messages in thread
From: Zariya @ 2015-09-01 16:06 UTC (permalink / raw)
To: Recipients

Help me and my 2 kids here in Syria We will share the 6,600,000 USD
I have here with you for your help, sorry to mention it
we want to leave Syria, put the kids in school and buy a new home
You will give us guidance when we arrive Their father died in the chemical weapon airstrike
I will send you our family pictures and more details as I read from you

Yours

ZariyaHelp me and my 2 kids here in Syria We will share the 6,600,000 USD
I have here with you for your help, sorry to mention it
we want to leave Syria, put the kids in school and buy a new home
You will give us guidance when we arrive Their father died in the chemical weapon airstrike
I will send you our family pictures and more details as I read from you

Yours

Zariya

```* Re:
2015-09-01 14:14 Mika Penttilä
@ 2015-09-01 15:22 ` Fabio Estevam
0 siblings, 0 replies; 400+ messages in thread
From: Fabio Estevam @ 2015-09-01 15:22 UTC (permalink / raw)
To: Mika Penttilä; +Cc: linux-kernel, linux-arm-kernel, Eduardo Valentin

On Tue, Sep 1, 2015 at 11:14 AM, Mika Penttilä
<mika.j.penttila@gmail.com> wrote:
> This one causes imx6q with debug uart connected to "schedule while
> atomic" endlessly :

Yes, I have sent a revert patch for it:
http://www.spinics.net/lists/arm-kernel/msg439995.html

```* Re:
@ 2015-09-01 12:01 Zariya
0 siblings, 0 replies; 400+ messages in thread
From: Zariya @ 2015-09-01 12:01 UTC (permalink / raw)
To: Recipients

Help me and my 2 kids here in Syria We will share the 6,600,000 USD
I have here with you for your help, sorry to mention it
we want to leave Syria, put the kids in school and buy a new home
You will give us guidance when we arrive Their father died in the chemical weapon airstrike
I will send you our family pictures and more details as I read from you

Yours

ZariyaHelp me and my 2 kids here in Syria We will share the 6,600,000 USD
I have here with you for your help, sorry to mention it
we want to leave Syria, put the kids in school and buy a new home
You will give us guidance when we arrive Their father died in the chemical weapon airstrike
I will send you our family pictures and more details as I read from you

Yours

Zariya

```* Re:
@ 2015-08-19 13:01 christain147
0 siblings, 0 replies; 400+ messages in thread
From: christain147 @ 2015-08-19 13:01 UTC (permalink / raw)
To: Recipients

Good day,hoping you read this email and respond to me in good time.I do not intend to solicit for funds but  your time and energy in using my own resources to assist the less privileged.I am medically confined at the moment hence I request your indulgence.
I will give you a comprehensive brief once I hear from you.

gudworks104@yahoo.com

Robert Grondahl

```* Re:
@ 2015-07-24 10:34 Mrs Nadia  Mohammed
0 siblings, 0 replies; 400+ messages in thread

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=utf-8, Size: 453 bytes --]

Ø§Ù„Ø³Ù„Ø§Ù… Ø¹Ù„ÙŠÙƒÙ… Ø§Ù†Ø§ Ù…Ø¯Ø§Ù… Ù†Ø§Ø¯ÙŠØ© Ù…Ø­Ù…Ø¯ ØºØ§Ù†Ù… Ù…Ù† ÙÙ„Ø³Ø·ÙŠÙ† Ùˆ Ø§Ø±ÙŠØ¯ Ù…Ù†Ùƒ Ø§Ù† ØªØ³Ø§Ø¹Ø¯Ù†Ù‰ Ù„Ø§Ù†Ù†Ù‰ Ù„Ø¯Ù‰ Ù…Ø´Ø±ÙˆØ¹ Ø§Ø±ÙŠØ¯ Ø§Ù† Ø§Ø¹Ø±Ø¶Ù‡ Ø§Ù„ÙŠÙƒ  Ù„Ø°Ø§ Ø§Ø±Ø¬Ùˆ Ù…Ù†Ùƒ Ø§Ù„ØªÙˆØ§ØµÙ„ Ù…Ø¹Ù‰ Ø¹Ù„Ù‰ Ù‡Ø°Ø§ Ø§Ù„Ø§ÙŠÙ…ÙŠÙ„

```* Re:
@ 2015-07-11 18:37 ` Mustapha Abiola
0 siblings, 0 replies; 400+ messages in thread
From: Mustapha Abiola @ 2015-07-11 18:37 UTC (permalink / raw)
To: eparis, paul, linux-kernel, linux-audit, mingo

[-- Attachment #1: Type: text/plain, Size: 1 bytes --]

[-- Attachment #2: 0001-Fix-redundant-check-against-unsigned-int-in-broken-a.patch --]
[-- Type: application/octet-stream, Size: 930 bytes --]

From 55fae099d46749b73895934aab8c2823c5a23abe Mon Sep 17 00:00:00 2001
From: Mustapha Abiola <hi@mustapha.org>
Date: Sat, 11 Jul 2015 17:01:04 +0000
Subject: [PATCH 1/1] Fix redundant check against unsigned int in broken audit
test fix for exec arg len

Quick patch to fix the needless check of `len` being < 0 as its an
unsigned int.

Signed-off-by: Mustapha Abiola <hi@mustapha.org>
---
kernel/auditsc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd..0012476 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1021,7 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* for strings that are too long, we should not have created
* any.
*/
-	if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
+	if (WARN_ON_ONCE(len > MAX_ARG_STRLEN - 1)) {
send_sig(SIGKILL, current, 0);
return -1;
}
--
1.9.1

```* Re:
@ 2015-07-05 16:38 ` t0021
0 siblings, 0 replies; 400+ messages in thread
From: t0021 @ 2015-07-05 16:38 UTC (permalink / raw)
To: info

----- Original Message -----

=========================

```* Re:
2015-05-31 15:37 ` Re: Roman Volkov
@ 2015-05-31 15:53   ` Hans de Goede
0 siblings, 0 replies; 400+ messages in thread
From: Hans de Goede @ 2015-05-31 15:53 UTC (permalink / raw)
To: Roman Volkov, Dmitry Torokhov
Cc: Mark Rutland, Rob Herring, Pawel Moll, Ian Campbell, Kumar Gala,
grant.likely, Jiri Kosina, Wolfram Sang, linux-input,
linux-kernel, devicetree, Tony Prisk

Hi Roman,

On 31-05-15 17:37, Roman Volkov wrote:
> В Sat, 14 Mar 2015 20:20:38 -0700
> Dmitry Torokhov <dmitry.torokhov@gmail.com> wrote:
>
>>
>> Hi Roman,
>>
>> On Mon, Feb 16, 2015 at 12:11:43AM +0300, Roman Volkov wrote:
>>> Documentation for 'intel,8042' DT compatible node.
>>>
>>> Signed-off-by: Tony Prisk <linux@prisktech.co.nz>
>>> Signed-off-by: Roman Volkov <v1ron@v1ros.org>
>>> ---
>>>   .../devicetree/bindings/input/intel-8042.txt       | 26
>>> ++++++++++++++++++++++ 1 file changed, 26 insertions(+)
>>>   create mode 100644
>>> Documentation/devicetree/bindings/input/intel-8042.txt
>>>
>>> diff --git a/Documentation/devicetree/bindings/input/intel-8042.txt
>>> b/Documentation/devicetree/bindings/input/intel-8042.txt new file
>>> mode 100644 index 0000000..ab8a3e0
>>> --- /dev/null
>>> +++ b/Documentation/devicetree/bindings/input/intel-8042.txt
>>> @@ -0,0 +1,26 @@
>>> +Intel 8042 Keyboard Controller
>>> +
>>> +Required properties:
>>> +- compatible: should be "intel,8042"
>>> +- regs: memory for keyboard controller
>>> +- interrupts: usually, two interrupts should be specified
>>> (keyboard and aux).
>>> +	However, only one interrupt is also allowed in case of
>>> absence of the
>>> +	physical port in the controller. The i8042 driver must be
>>> +	nokbd/noaux option in this case.
>>> +- interrupt-names: interrupt names corresponding to numbers in the
>>> list.
>>> +	"kbd" is the keyboard interrupt and "aux" is the auxiliary
>>> (mouse)
>>> +	interrupt.
>>> +- command-reg: offset in memory for command register
>>> +- status-reg: offset in memory for status register
>>> +- data-reg: offset in memory for data register
>>> +
>>> +Example:
>>> +	i8042@d8008800 {
>>> +		compatible = "intel,8042";
>>> +		regs = <0xd8008800 0x100>;
>>> +		interrupts = <23>, <4>;
>>> +		interrupt-names = "kbd", "aux";
>>> +		command-reg = <0x04>;
>>> +		status-reg = <0x04>;
>>> +		data-reg = <0x00>;
>>> +	};
>>
>> No, we already have existing OF bindings for i8042 on sparc and
>> powerpc, I do not think we need to invent a brand new one.
>>
>> Thanks.
>>
>
> Hi Dmitry,
>
> I see some OF code in i8042-sparcio.h file. There are node definitions
> like "kb_ps2", "keyboard", "kdmouse", "mouse". Are these documented
> somewhere?
>
> Great if vt8500 is not unique with OF bindings for i8042. The code from
> sparc even looks compatible, only register offsets are hardcoded for
> specific machine. Is it possible to read offsets from Device Tree using
> these existing bindings without dealing with the kernel configuration?

Have you looked at the existing bindings for ps/2 controllers
under Documentation/devicetree/bindings/serio ?

Regards,

Hans

```* Re:
@ 2015-05-31 15:37 ` Roman Volkov
2015-05-31 15:53   ` Re: Hans de Goede
From: Roman Volkov @ 2015-05-31 15:37 UTC (permalink / raw)
To: Dmitry Torokhov
Cc: Mark Rutland, Rob Herring, Pawel Moll, Ian Campbell, Kumar Gala,
grant.likely, Hans de Goede, Jiri Kosina, Wolfram Sang,
linux-input, linux-kernel, devicetree, Tony Prisk

В Sat, 14 Mar 2015 20:20:38 -0700
Dmitry Torokhov <dmitry.torokhov@gmail.com> wrote:

>
> Hi Roman,
>
> On Mon, Feb 16, 2015 at 12:11:43AM +0300, Roman Volkov wrote:
> > Documentation for 'intel,8042' DT compatible node.
> >
> > Signed-off-by: Tony Prisk <linux@prisktech.co.nz>
> > Signed-off-by: Roman Volkov <v1ron@v1ros.org>
> > ---
> >  .../devicetree/bindings/input/intel-8042.txt       | 26
> > ++++++++++++++++++++++ 1 file changed, 26 insertions(+)
> >  create mode 100644
> > Documentation/devicetree/bindings/input/intel-8042.txt
> >
> > diff --git a/Documentation/devicetree/bindings/input/intel-8042.txt
> > b/Documentation/devicetree/bindings/input/intel-8042.txt new file
> > mode 100644 index 0000000..ab8a3e0
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/input/intel-8042.txt
> > @@ -0,0 +1,26 @@
> > +Intel 8042 Keyboard Controller
> > +
> > +Required properties:
> > +- compatible: should be "intel,8042"
> > +- regs: memory for keyboard controller
> > +- interrupts: usually, two interrupts should be specified
> > (keyboard and aux).
> > +	However, only one interrupt is also allowed in case of
> > absence of the
> > +	physical port in the controller. The i8042 driver must be
> > +	nokbd/noaux option in this case.
> > +- interrupt-names: interrupt names corresponding to numbers in the
> > list.
> > +	"kbd" is the keyboard interrupt and "aux" is the auxiliary
> > (mouse)
> > +	interrupt.
> > +- command-reg: offset in memory for command register
> > +- status-reg: offset in memory for status register
> > +- data-reg: offset in memory for data register
> > +
> > +Example:
> > +	i8042@d8008800 {
> > +		compatible = "intel,8042";
> > +		regs = <0xd8008800 0x100>;
> > +		interrupts = <23>, <4>;
> > +		interrupt-names = "kbd", "aux";
> > +		command-reg = <0x04>;
> > +		status-reg = <0x04>;
> > +		data-reg = <0x00>;
> > +	};
>
> No, we already have existing OF bindings for i8042 on sparc and
> powerpc, I do not think we need to invent a brand new one.
>
> Thanks.
>

Hi Dmitry,

I see some OF code in i8042-sparcio.h file. There are node definitions
like "kb_ps2", "keyboard", "kdmouse", "mouse". Are these documented
somewhere?

Great if vt8500 is not unique with OF bindings for i8042. The code from
sparc even looks compatible, only register offsets are hardcoded for
specific machine. Is it possible to read offsets from Device Tree using
these existing bindings without dealing with the kernel configuration?

Regards,
Roman

```* Re:
@ 2015-05-22  0:17 kontakt
0 siblings, 0 replies; 400+ messages in thread
From: kontakt @ 2015-05-22  0:17 UTC (permalink / raw)
To: Recipients

Teraz mozesz uzyskac kredyt w wysokosci 2% za uniewaznic i dostac do 40 lat lub wiecej, aby splacic. Nie naleza do kredytów krótkoterminowych, które sprawiaja, ze zwróci sie w kilka tygodni lub miesiecy. Nasza oferta obejmuje; * Refinansowanie * Home Improvement * Kredyty samochodowe * Konsolidacja zadluzenia * Linia kredytowa * Druga hipoteczny * Biznes Pozyczki * Pozyczki Personal

Zdobadz pieniadze potrzebne dzis z duza iloscia czasu, aby dokonac platnosci powrotem. Aby zastosowac, aby wyslac wszystkie pytania lub zaproszenia fwfshelpdesk@gmail.com: + 1- 435-241-5945

************************************************************
Now you can get a loan at 2% per annul and get up to 40 years or more to pay it back. Don't fall for the short term loans that make you pay back in weeks or months. Our offer include; *Refinance *Home Improvement *Auto Loans *Debt Consolidation*Line of Credit *Second Mortgage *Business Loans*Personal Loans

Get the money you need today with plenty of time to make the payments back. To apply, send all inquiries to fwfshelpdesk@gmail.com or call : + 1- 435-241-5945

---
This email is free from viruses and malware because avast! Antivirus protection is active.
http://www.avast.com

```* RE:
@ 2015-05-21 10:49 ` Ratnakumar Sagana (KING'S COLLEGE HOSPITAL NHS FOUNDATION TRUST)
0 siblings, 0 replies; 400+ messages in thread
From: Ratnakumar Sagana (KING'S COLLEGE HOSPITAL NHS FOUNDATION TRUST) @ 2015-05-21 10:49 UTC (permalink / raw)
To: Ratnakumar Sagana (KING'S COLLEGE HOSPITAL NHS FOUNDATION TRUST)

You have won contact Allen On allemwilliam10001@gmail.com for info.
Allen Williams
+27612909541

********************************************************************************************************************

This message may contain confidential information. If you are not the intended recipient please inform the
sender that you have received the message in error before deleting it.
Please do not disclose, copy or distribute information in this e-mail or take any action in reliance on its contents:
to do so is strictly prohibited and may be unlawful.

NHSmail is the secure email and directory service available for all NHS staff in England and Scotland
NHSmail is approved for exchanging patient data and other sensitive information with NHSmail and GSi recipients
NHSmail provides an email address for your career in the NHS and can be accessed anywhere

********************************************************************************************************************

```* RE:
@ 2015-05-10 13:03 ` Singer, W.P.
0 siblings, 0 replies; 400+ messages in thread
From: Singer, W.P. @ 2015-05-10 13:03 UTC (permalink / raw)
To: Singer, W.P.

________________________________
Van: Singer, W.P.
Verzonden: zondag 10 mei 2015 14:50
Onderwerp:

i have given you my money contact: mikerose234@outlook.com<mailto:mikerose234@outlook.com>

```* Re:
@ 2015-04-21  7:43 Galaxy Darlehen Firma
0 siblings, 0 replies; 400+ messages in thread
From: Galaxy Darlehen Firma @ 2015-04-21  7:43 UTC (permalink / raw)

--

Benötigen Sie dringend Darlehen? Wir geben Darlehen an interessierten
Einzelpersonen, die Darlehen mit gutem Glauben suchen. Sind Sie
ernsthaft brauchen eine dringende Darlehen? dann sind Sie bei uns
richtig. Wir geben Business-Darlehen, Privatkredit, kontaktieren Sie
uns für Ihren Darlehensantrag auf Ihre Nachfrage und durch finanzielle
problem.contact uns heute per e-Mail: galaxyfunds1@gmail.com

Vielen Dank wie wir Ihre Antwort erwarten
Galaxy Darlehen Firma

```* Re:
@ 2015-04-07 18:47 ` Wilson Aggard
0 siblings, 0 replies; 400+ messages in thread
From: Wilson Aggard @ 2015-04-07 18:47 UTC (permalink / raw)
To: United Nations Compensation Commission

----- Original Message -----

Did you get my Email

```* Re:
@ 2015-04-01 21:56 Globale Trust Company
0 siblings, 0 replies; 400+ messages in thread
From: Globale Trust Company @ 2015-04-01 21:56 UTC (permalink / raw)

--
Sind Sie ein Unternehmer Mann oder eine Frau? Sind Sie in finanziellen
Schwierigkeiten, oder haben Sie finanzielle Mittel benötigen, um Ihr
eigenes Unternehmen gründen? Brauchen Sie noch einen Autokredit? Sie
haben einen niedrigen Kredit-Score und Sie finden es schwierig, ein
Darlehen von lokalen Banken und anderen Finanzinstituten zu bekommen?
Sie benötigen ein Darlehen aus irgendeinem Grund, Gründe, wie .:
a) persönliche Darlehen, Ausbau des Geschäfts.
b) Unternehmertum und Bildung.
(c) Schuldenkonsolidierung).
Wir bieten Darlehen zu einem sehr niedrigen Zinssatz und ohne
Bonitätsprüfung, bieten wir persönliche Darlehen,
Schuldenkonsolidierung, Venture Capital, Business Education Kredit,
Kreditwohnungsbaudarlehen oder "Kredit aus anderen Gründen !."
Hinweis: Jeder interessierte Bewerber sollte uns auf diese E-Mail für
mehr kontaktieren
Informationen zu unserem Kredit-Programm.

Name: Globale Trust Company
E-Mail: globaltrust732@gmail.com

Wir danken Ihnen für Ihr Verständnis, die darauf warten, von Ihnen zu hören.

```* Re:
2015-03-04 10:29 Quentin Lambert
@ 2015-03-04 10:32 ` Quentin Lambert
0 siblings, 0 replies; 400+ messages in thread
From: Quentin Lambert @ 2015-03-04 10:32 UTC (permalink / raw)
To: Greg Kroah-Hartman; +Cc: kernel-janitors, devel, linux-kernel

Ignore this I made a mistake.

My apologies,

Quentin

On 04/03/2015 11:29, Quentin Lambert wrote:
> Bcc:
> Subject: [PATCH 1/2] staging: rts5208: Convert non-returned local variable to
>   boolean when relevant
>
> This patch was produced using Coccinelle. A simplified version of the
> semantic patch is:
>
> @r exists@
> identifier f;
> local idexpression u8 x;
> identifier xname;
> @@
>
> f(...) {
> ...when any
> (
>    x@xname = 1;
> |
>    x@xname = 0;
> )
> ...when any
> }
>
> identifier r.f;
> local idexpression u8 r.x
> expression e1 != {0, 1}, e2;
> @@
>
> f(...) {
> ...when any
> (
>    x = e1;
> |
>    x + e2
> )
> ...when any
> }
>
> identifier r.f;
> local idexpression u8 r.x;
> identifier r.xname;
> @@
>
> f(...) {
> ...
> ++ bool xname;
> - int xname;
> <...
> (
>    x =
> - 1
> + true
> |
>    x =
> - -1
> + false
> )
> ...>
>
> }
>
> Signed-off-by: Quentin Lambert <lambert.quentin@gmail.com>
> ---
>   drivers/staging/rts5208/ms.c        | 14 +++---
>   drivers/staging/rts5208/rtsx_chip.c | 56 ++++++++++++-----------
>   drivers/staging/rts5208/rtsx_scsi.c | 38 +++++++++-------
>   drivers/staging/rts5208/sd.c        | 88 +++++++++++++++++++------------------
>   4 files changed, 105 insertions(+), 91 deletions(-)
>
> diff --git a/drivers/staging/rts5208/ms.c b/drivers/staging/rts5208/ms.c
> index a47a191..050bc47 100644
> --- a/drivers/staging/rts5208/ms.c
> +++ b/drivers/staging/rts5208/ms.c
> @@ -1560,7 +1560,8 @@ static int ms_copy_page(struct rtsx_chip *chip, u16 old_blk, u16 new_blk,
>   		u16 log_blk, u8 start_page, u8 end_page)
>   {
>   	struct ms_info *ms_card = &(chip->ms_card);
> -	int retval, rty_cnt, uncorrect_flag = 0;
> +	bool uncorrect_flag = false;
> +	int retval, rty_cnt;
>   	u8 extra[MS_EXTRA_SIZE], val, i, j, data[16];
>
>   	dev_dbg(rtsx_dev(chip), "Copy page from 0x%x to 0x%x, logical block is 0x%x\n",
> @@ -1642,10 +1643,10 @@ static int ms_copy_page(struct rtsx_chip *chip, u16 old_blk, u16 new_blk,
>   			if (val & INT_REG_ERR) {
>   				if (retval != STATUS_SUCCESS) {
> -					uncorrect_flag = 1;
> +					uncorrect_flag = true;
>   					dev_dbg(rtsx_dev(chip), "Uncorrectable error\n");
>   				} else {
> -					uncorrect_flag = 0;
> +					uncorrect_flag = false;
>   				}
>
>   				retval = ms_transfer_tpc(chip,
> @@ -2187,7 +2188,8 @@ static int ms_build_l2p_tbl(struct rtsx_chip *chip, int seg_no)
>   {
>   	struct ms_info *ms_card = &(chip->ms_card);
>   	struct zone_entry *segment;
> -	int retval, table_size, disable_cnt, defect_flag, i;
> +	bool defect_flag;
> +	int retval, table_size, disable_cnt, i;
>   	u16 start, end, phy_blk, log_blk, tmp_blk;
>   	u8 extra[MS_EXTRA_SIZE], us1, us2;
>
> @@ -2236,10 +2238,10 @@ static int ms_build_l2p_tbl(struct rtsx_chip *chip, int seg_no)
>
>   	for (phy_blk = start; phy_blk < end; phy_blk++) {
>   		if (disable_cnt) {
> -			defect_flag = 0;
> +			defect_flag = false;
>   			for (i = 0; i < segment->disable_count; i++) {
>   				if (phy_blk == segment->defect_list[i]) {
> -					defect_flag = 1;
> +					defect_flag = true;
>   					break;
>   				}
>   			}
> diff --git a/drivers/staging/rts5208/rtsx_chip.c b/drivers/staging/rts5208/rtsx_chip.c
> index 9593d81..35fa19d 100644
> --- a/drivers/staging/rts5208/rtsx_chip.c
> +++ b/drivers/staging/rts5208/rtsx_chip.c
> @@ -153,22 +153,22 @@ static int rtsx_pre_handle_sdio_old(struct rtsx_chip *chip)
>   static int rtsx_pre_handle_sdio_new(struct rtsx_chip *chip)
>   {
>   	u8 tmp;
> -	int sw_bypass_sd = 0;
> +	bool sw_bypass_sd = false;
>   	int retval;
>
>   		if (CHECK_PID(chip, 0x5288)) {
>   			if (tmp & 0x08)
> -				sw_bypass_sd = 1;
> +				sw_bypass_sd = true;
>   		} else if (CHECK_PID(chip, 0x5208)) {
>   			if (tmp & 0x80)
> -				sw_bypass_sd = 1;
> +				sw_bypass_sd = true;
>   		}
>   	} else {
>   		if (chip->sdio_in_charge)
> -			sw_bypass_sd = 1;
> +			sw_bypass_sd = true;
>   	}
>   	dev_dbg(rtsx_dev(chip), "chip->sdio_in_charge = %d\n",
>   		chip->sdio_in_charge);
> @@ -501,13 +501,14 @@ nextcard:
>
>   static inline int check_sd_speed_prior(u32 sd_speed_prior)
>   {
> -	int i, fake_para = 0;
> +	bool fake_para = false;
> +	int i;
>
>   	for (i = 0; i < 4; i++) {
>   		u8 tmp = (u8)(sd_speed_prior >> (i*8));
>
>   		if ((tmp < 0x01) || (tmp > 0x04)) {
> -			fake_para = 1;
> +			fake_para = true;
>   			break;
>   		}
>   	}
> @@ -517,13 +518,14 @@ static inline int check_sd_speed_prior(u32 sd_speed_prior)
>
>   static inline int check_sd_current_prior(u32 sd_current_prior)
>   {
> -	int i, fake_para = 0;
> +	bool fake_para = false;
> +	int i;
>
>   	for (i = 0; i < 4; i++) {
>   		u8 tmp = (u8)(sd_current_prior >> (i*8));
>
>   		if (tmp > 0x03) {
> -			fake_para = 1;
> +			fake_para = true;
>   			break;
>   		}
>   	}
> @@ -784,31 +786,31 @@ static inline void rtsx_blink_led(struct rtsx_chip *chip)
>
>   static void rtsx_monitor_aspm_config(struct rtsx_chip *chip)
>   {
> -	int maybe_support_aspm, reg_changed;
> +	bool reg_changed, maybe_support_aspm;
>   	u32 tmp = 0;
>   	u8 reg0 = 0, reg1 = 0;
>
> -	maybe_support_aspm = 0;
> -	reg_changed = 0;
> +	maybe_support_aspm = false;
> +	reg_changed = false;
>   	if (chip->aspm_level[0] != reg0) {
> -		reg_changed = 1;
> +		reg_changed = true;
>   		chip->aspm_level[0] = reg0;
>   	}
>   	if (CHK_SDIO_EXIST(chip) && !CHK_SDIO_IGNORED(chip)) {
>   		reg1 = (u8)tmp;
>   		if (chip->aspm_level[1] != reg1) {
> -			reg_changed = 1;
> +			reg_changed = true;
>   			chip->aspm_level[1] = reg1;
>   		}
>
>   		if ((reg0 & 0x03) && (reg1 & 0x03))
> -			maybe_support_aspm = 1;
> +			maybe_support_aspm = true;
>
>   	} else {
>   		if (reg0 & 0x03)
> -			maybe_support_aspm = 1;
> +			maybe_support_aspm = true;
>   	}
>
>   	if (reg_changed) {
> @@ -835,7 +837,7 @@ void rtsx_polling_func(struct rtsx_chip *chip)
>   #ifdef SUPPORT_SD_LOCK
>   	struct sd_info *sd_card = &chip->sd_card;
>   #endif
> -	int ss_allowed;
> +	bool ss_allowed;
>
>   	if (rtsx_chk_stat(chip, RTSX_STAT_SUSPEND))
>   		return;
> @@ -887,21 +889,21 @@ void rtsx_polling_func(struct rtsx_chip *chip)
>   	rtsx_init_cards(chip);
>
>   	if (chip->ss_en) {
> -		ss_allowed = 1;
> +		ss_allowed = true;
>
>   		if (CHECK_PID(chip, 0x5288)) {
> -			ss_allowed = 0;
> +			ss_allowed = false;
>   		} else {
>   			if (CHK_SDIO_EXIST(chip) && !CHK_SDIO_IGNORED(chip)) {
>   				u32 val;
>
>   				if (val & 0x07)
> -					ss_allowed = 0;
> +					ss_allowed = false;
>   			}
>   		}
>   	} else {
> -		ss_allowed = 0;
> +		ss_allowed = false;
>   	}
>
>   	if (ss_allowed && !chip->sd_io) {
> @@ -1358,7 +1360,8 @@ int rtsx_read_cfg_seq(struct rtsx_chip *chip, u8 func, u16 addr, u8 *buf,
>
>   int rtsx_write_phy_register(struct rtsx_chip *chip, u8 addr, u16 val)
>   {
> -	int i, finished = 0;
> +	bool finished = false;
> +	int i;
>   	u8 tmp;
>
>   	RTSX_WRITE_REG(chip, PHYDATA0, 0xFF, (u8)val);
> @@ -1369,7 +1372,7 @@ int rtsx_write_phy_register(struct rtsx_chip *chip, u8 addr, u16 val)
>   	for (i = 0; i < 100000; i++) {
>   		if (!(tmp & 0x80)) {
> -			finished = 1;
> +			finished = true;
>   			break;
>   		}
>   	}
> @@ -1382,7 +1385,8 @@ int rtsx_write_phy_register(struct rtsx_chip *chip, u8 addr, u16 val)
>
>   {
> -	int i, finished = 0;
> +	bool finished = false;
> +	int i;
>   	u16 data = 0;
>   	u8 tmp;
>
> @@ -1392,7 +1396,7 @@ int rtsx_read_phy_register(struct rtsx_chip *chip, u8 addr, u16 *val)
>   	for (i = 0; i < 100000; i++) {
>   		if (!(tmp & 0x80)) {
> -			finished = 1;
> +			finished = true;
>   			break;
>   		}
>   	}
> @@ -1615,7 +1619,7 @@ void rtsx_exit_ss(struct rtsx_chip *chip)
>   int rtsx_pre_handle_interrupt(struct rtsx_chip *chip)
>   {
>   	u32 status, int_enable;
> -	int exit_ss = 0;
> +	bool exit_ss = false;
>   #ifdef SUPPORT_OCP
>   	u32 ocp_int = 0;
>
> @@ -1625,7 +1629,7 @@ int rtsx_pre_handle_interrupt(struct rtsx_chip *chip)
>   	if (chip->ss_en) {
>   		chip->ss_counter = 0;
>   		if (rtsx_get_stat(chip) == RTSX_STAT_SS) {
> -			exit_ss = 1;
> +			exit_ss = true;
>   			rtsx_exit_L1(chip);
>   			rtsx_set_stat(chip, RTSX_STAT_RUN);
>   		}
> diff --git a/drivers/staging/rts5208/rtsx_scsi.c b/drivers/staging/rts5208/rtsx_scsi.c
> index 42645834..a00ba21 100644
> --- a/drivers/staging/rts5208/rtsx_scsi.c
> +++ b/drivers/staging/rts5208/rtsx_scsi.c
> @@ -39,7 +39,8 @@ void scsi_show_command(struct rtsx_chip *chip)
>   {
>   	struct scsi_cmnd *srb = chip->srb;
>   	char *what = NULL;
> -	int unknown_cmd = 0, len;
> +	bool unknown_cmd = false;
> +	int len;
>
>   	switch (srb->cmnd[0]) {
> @@ -310,7 +311,8 @@ void scsi_show_command(struct rtsx_chip *chip)
>   		what = "Realtek's vendor command";
>   		break;
>   	default:
> -		what = "(unknown command)"; unknown_cmd = 1;
> +		what = "(unknown command)";
> +		unknown_cmd = true;
>   		break;
>   	}
>
> @@ -485,7 +487,7 @@ static int inquiry(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	unsigned char sendbytes;
>   	unsigned char *buf;
>   	u8 card = get_lun_card(chip, lun);
> -	int pro_formatter_flag = 0;
> +	bool pro_formatter_flag = false;
>   	unsigned char inquiry_buf[] = {
>   		QULIFIRE|DRCT_ACCESS_DEV,
>   		RMB_DISC|0x0D,
> @@ -520,7 +522,7 @@ static int inquiry(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	if (chip->mspro_formatter_enable)
>   #endif
>   		if (!card || (card == MS_CARD))
> -			pro_formatter_flag = 1;
> +			pro_formatter_flag = true;
>
>   	if (pro_formatter_flag) {
>   		if (scsi_bufflen(srb) < 56)
> @@ -663,7 +665,7 @@ static void ms_mode_sense(struct rtsx_chip *chip, u8 cmd,
>   	struct ms_info *ms_card = &(chip->ms_card);
>   	int sys_info_offset;
>   	int data_size = buf_len;
> -	int support_format = 0;
> +	bool support_format = false;
>   	int i = 0;
>
>   	if (cmd == MODE_SENSE) {
> @@ -684,10 +686,10 @@ static void ms_mode_sense(struct rtsx_chip *chip, u8 cmd,
>   	/* Medium Type Code */
>   		if (CHK_MSXC(ms_card)) {
> -			support_format = 1;
> +			support_format = true;
>   			buf[i++] = 0x40;
>   		} else if (CHK_MSPRO(ms_card)) {
> -			support_format = 1;
> +			support_format = true;
>   			buf[i++] = 0x20;
>   		} else {
>   			buf[i++] = 0x10;
> @@ -755,7 +757,7 @@ static int mode_sense(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	unsigned int lun = SCSI_LUN(srb);
>   	unsigned int dataSize;
>   	int status;
> -	int pro_formatter_flag;
> +	bool pro_formatter_flag;
>   	unsigned char pageCode, *buf;
>   	u8 card = get_lun_card(chip, lun);
>
> @@ -767,20 +769,20 @@ static int mode_sense(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	}
>   #endif
>
> -	pro_formatter_flag = 0;
> +	pro_formatter_flag = false;
>   	dataSize = 8;
>   #ifdef SUPPORT_MAGIC_GATE
>   	if ((chip->lun2card[lun] & MS_CARD)) {
>   		if (!card || (card == MS_CARD)) {
>   			dataSize = 108;
>   			if (chip->mspro_formatter_enable)
> -				pro_formatter_flag = 1;
> +				pro_formatter_flag = true;
>   		}
>   	}
>   #else
>   	if (card == MS_CARD) {
>   		if (chip->mspro_formatter_enable) {
> -			pro_formatter_flag = 1;
> +			pro_formatter_flag = true;
>   			dataSize = 108;
>   		}
>   	}
> @@ -2295,7 +2297,8 @@ Exit:
>   static int read_cfg_byte(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   {
>   	int retval;
> -	u8 func, func_max;
> +	bool func_max;
> +	u8 func;
>   	u8 *buf;
>
> @@ -2315,9 +2318,9 @@ static int read_cfg_byte(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>
>   	if (CHK_SDIO_EXIST(chip) && !CHK_SDIO_IGNORED(chip))
> -		func_max = 1;
> +		func_max = true;
>   	else
> -		func_max = 0;
> +		func_max = false;
>
>   	if (func > func_max) {
>   		set_sense_type(chip, SCSI_LUN(srb),
> @@ -2349,7 +2352,8 @@ static int read_cfg_byte(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   static int write_cfg_byte(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   {
>   	int retval;
> -	u8 func, func_max;
> +	bool func_max;
> +	u8 func;
>   	u8 *buf;
>
> @@ -2369,9 +2373,9 @@ static int write_cfg_byte(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>
>   	if (CHK_SDIO_EXIST(chip) && !CHK_SDIO_IGNORED(chip))
> -		func_max = 1;
> +		func_max = true;
>   	else
> -		func_max = 0;
> +		func_max = false;
>
>   	if (func > func_max) {
>   		set_sense_type(chip, SCSI_LUN(srb),
> diff --git a/drivers/staging/rts5208/sd.c b/drivers/staging/rts5208/sd.c
> index c28a927..62bf570 100644
> --- a/drivers/staging/rts5208/sd.c
> +++ b/drivers/staging/rts5208/sd.c
> @@ -791,7 +791,7 @@ static int sd_change_phase(struct rtsx_chip *chip, u8 sample_point, u8 tune_dir)
>   	u16 SD_VP_CTL, SD_DCMPS_CTL;
>   	u8 val;
>   	int retval;
> -	int ddr_rx = 0;
> +	bool ddr_rx = false;
>
>   	dev_dbg(rtsx_dev(chip), "sd_change_phase (sample_point = %d, tune_dir = %d)\n",
>   		sample_point, tune_dir);
> @@ -800,7 +800,7 @@ static int sd_change_phase(struct rtsx_chip *chip, u8 sample_point, u8 tune_dir)
>   		SD_VP_CTL = SD_VPRX_CTL;
>   		SD_DCMPS_CTL = SD_DCMPS_RX_CTL;
>   		if (CHK_SD_DDR50(sd_card))
> -			ddr_rx = 1;
> +			ddr_rx = true;
>   	} else {
>   		SD_VP_CTL = SD_VPTX_CTL;
>   		SD_DCMPS_CTL = SD_DCMPS_TX_CTL;
> @@ -1121,7 +1121,7 @@ static int sd_check_switch(struct rtsx_chip *chip,
>   {
>   	int retval;
>   	int i;
> -	int switch_good = 0;
> +	bool switch_good = false;
>
>   	for (i = 0; i < 3; i++) {
>   		if (detect_card_cd(chip, SD_CARD) != STATUS_SUCCESS) {
> @@ -1137,7 +1137,7 @@ static int sd_check_switch(struct rtsx_chip *chip,
>   			retval = sd_check_switch_mode(chip, SD_SWITCH_MODE,
>   					func_group, func_to_switch, bus_width);
>   			if (retval == STATUS_SUCCESS) {
> -				switch_good = 1;
> +				switch_good = true;
>   				break;
>   			}
>
> @@ -1524,7 +1524,8 @@ static u8 sd_search_final_phase(struct rtsx_chip *chip, u32 phase_map,
>   	struct sd_info *sd_card = &(chip->sd_card);
>   	struct timing_phase_path path[MAX_PHASE + 1];
>   	int i, j, cont_path_cnt;
> -	int new_block, max_len, final_path_idx;
> +	bool new_block;
> +	int max_len, final_path_idx;
>   	u8 final_phase = 0xFF;
>
>   	if (phase_map == 0xFFFFFFFF) {
> @@ -1537,12 +1538,12 @@ static u8 sd_search_final_phase(struct rtsx_chip *chip, u32 phase_map,
>   	}
>
>   	cont_path_cnt = 0;
> -	new_block = 1;
> +	new_block = true;
>   	j = 0;
>   	for (i = 0; i < MAX_PHASE + 1; i++) {
>   		if (phase_map & (1 << i)) {
>   			if (new_block) {
> -				new_block = 0;
> +				new_block = false;
>   				j = cont_path_cnt++;
>   				path[j].start = i;
>   				path[j].end = i;
> @@ -1550,7 +1551,7 @@ static u8 sd_search_final_phase(struct rtsx_chip *chip, u32 phase_map,
>   				path[j].end = i;
>   			}
>   		} else {
> -			new_block = 1;
> +			new_block = true;
>   			if (cont_path_cnt) {
>   				int idx = cont_path_cnt - 1;
>
> @@ -2141,14 +2142,15 @@ static int sd_check_wp_state(struct rtsx_chip *chip)
>   static int reset_sd(struct rtsx_chip *chip)
>   {
>   	struct sd_info *sd_card = &(chip->sd_card);
> -	int retval, i = 0, j = 0, k = 0, hi_cap_flow = 0;
> -	int sd_dont_switch = 0;
> -	int support_1v8 = 0;
> -	int try_sdio = 1;
> +	bool hi_cap_flow = false;
> +	int retval, i = 0, j = 0, k = 0;
> +	bool sd_dont_switch = false;
> +	bool support_1v8 = false;
> +	bool try_sdio = true;
>   	u8 rsp[16];
>   	u8 switch_bus_width;
>   	u32 voltage = 0;
> -	int sd20_mode = 0;
> +	bool sd20_mode = false;
>
>   	SET_SD(sd_card);
>
> @@ -2157,7 +2159,7 @@ Switch_Fail:
>   	i = 0;
>   	j = 0;
>   	k = 0;
> -	hi_cap_flow = 0;
> +	hi_cap_flow = false;
>
>   #ifdef SUPPORT_SD_LOCK
>   	if (sd_card->sd_lock_status & SD_UNLOCK_POW_ON)
> @@ -2217,7 +2219,7 @@ RTY_SD_RST:
>   				SD_RSP_TYPE_R7, rsp, 5);
>   	if (retval == STATUS_SUCCESS) {
>   		if ((rsp[4] == 0xAA) && ((rsp[3] & 0x0f) == 0x01)) {
> -			hi_cap_flow = 1;
> +			hi_cap_flow = true;
>   			voltage = SUPPORT_VOLTAGE | 0x40000000;
>   		}
>   	}
> @@ -2272,10 +2274,10 @@ RTY_SD_RST:
>   		else
>   			CLR_SD_HCXC(sd_card);
>
> -		support_1v8 = 0;
> +		support_1v8 = false;
>   	} else {
>   		CLR_SD_HCXC(sd_card);
> -		support_1v8 = 0;
> +		support_1v8 = false;
>   	}
>   	dev_dbg(rtsx_dev(chip), "support_1v8 = %d\n", support_1v8);
>
> @@ -2361,7 +2363,7 @@ SD_UNLOCK_ENTRY:
>   		TRACE_RET(chip, STATUS_FAIL);
>
>   	if (!(sd_card->raw_csd[4] & 0x40))
> -		sd_dont_switch = 1;
> +		sd_dont_switch = true;
>
>   	if (!sd_dont_switch) {
>   		if (sd20_mode) {
> @@ -2378,16 +2380,16 @@ SD_UNLOCK_ENTRY:
>   			retval = sd_switch_function(chip, switch_bus_width);
>   			if (retval != STATUS_SUCCESS) {
>   				sd_init_power(chip);
> -				sd_dont_switch = 1;
> -				try_sdio = 0;
> +				sd_dont_switch = true;
> +				try_sdio = false;
>
>   				goto Switch_Fail;
>   			}
>   		} else {
>   			if (support_1v8) {
>   				sd_init_power(chip);
> -				sd_dont_switch = 1;
> -				try_sdio = 0;
> +				sd_dont_switch = true;
> +				try_sdio = false;
>
>   				goto Switch_Fail;
>   			}
> @@ -2433,8 +2435,8 @@ SD_UNLOCK_ENTRY:
>   				if (retval != STATUS_SUCCESS)
>   					TRACE_RET(chip, STATUS_FAIL);
>
> -				try_sdio = 0;
> -				sd20_mode = 1;
> +				try_sdio = false;
> +				sd20_mode = true;
>   				goto Switch_Fail;
>   			}
>   		}
> @@ -2458,8 +2460,8 @@ SD_UNLOCK_ENTRY:
>   					if (retval != STATUS_SUCCESS)
>   						TRACE_RET(chip, STATUS_FAIL);
>
> -					try_sdio = 0;
> -					sd20_mode = 1;
> +					try_sdio = false;
> +					sd20_mode = true;
>   					goto Switch_Fail;
>   				}
>   			}
> @@ -3702,7 +3704,7 @@ int sd_execute_no_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	unsigned int lun = SCSI_LUN(srb);
>   	int retval, rsp_len;
>   	u8 cmd_idx, rsp_type;
> -	u8 standby = 0, acmd = 0;
> +	bool standby = false, acmd = false;
>   	u32 arg;
>
>   	if (!sd_card->sd_pass_thru_en) {
> @@ -3722,10 +3724,10 @@ int sd_execute_no_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>
>   	cmd_idx = srb->cmnd[2] & 0x3F;
>   	if (srb->cmnd[1] & 0x02)
> -		standby = 1;
> +		standby = true;
>
>   	if (srb->cmnd[1] & 0x01)
> -		acmd = 1;
> +		acmd = true;
>
>   	arg = ((u32)srb->cmnd[3] << 24) | ((u32)srb->cmnd[4] << 16) |
>   		((u32)srb->cmnd[5] << 8) | srb->cmnd[6];
> @@ -3812,9 +3814,10 @@ int sd_execute_read_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	struct sd_info *sd_card = &(chip->sd_card);
>   	unsigned int lun = SCSI_LUN(srb);
>   	int retval, rsp_len, i;
> -	int cmd13_checkbit = 0, read_err = 0;
> +	int cmd13_checkbit = 0;
> +	bool read_err = false;
>   	u8 cmd_idx, rsp_type, bus_width;
> -	u8 send_cmd12 = 0, standby = 0, acmd = 0;
> +	bool standby = false, send_cmd12 = false, acmd = false;
>   	u32 data_len;
>
>   	if (!sd_card->sd_pass_thru_en) {
> @@ -3834,13 +3837,13 @@ int sd_execute_read_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>
>   	cmd_idx = srb->cmnd[2] & 0x3F;
>   	if (srb->cmnd[1] & 0x04)
> -		send_cmd12 = 1;
> +		send_cmd12 = true;
>
>   	if (srb->cmnd[1] & 0x02)
> -		standby = 1;
> +		standby = true;
>
>   	if (srb->cmnd[1] & 0x01)
> -		acmd = 1;
> +		acmd = true;
>
>   	data_len = ((u32)srb->cmnd[7] << 16) | ((u32)srb->cmnd[8]
>   						<< 8) | srb->cmnd[9];
> @@ -3915,7 +3918,7 @@ int sd_execute_read_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   				       blk_cnt, bus_width, buf, data_len, 2000);
>   		if (retval != STATUS_SUCCESS) {
>   			kfree(buf);
>   			rtsx_clear_sd_error(chip);
> @@ -3964,7 +3967,7 @@ int sd_execute_read_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   					scsi_bufflen(srb), scsi_sg_count(srb),
>   					DMA_FROM_DEVICE, 10000);
>   		if (retval < 0) {
>   			rtsx_clear_sd_error(chip);
>   		}
> @@ -4041,9 +4044,10 @@ int sd_execute_write_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	struct sd_info *sd_card = &(chip->sd_card);
>   	unsigned int lun = SCSI_LUN(srb);
>   	int retval, rsp_len, i;
> -	int cmd13_checkbit = 0, write_err = 0;
> +	int cmd13_checkbit = 0;
> +	bool write_err = false;
>   	u8 cmd_idx, rsp_type;
> -	u8 send_cmd12 = 0, standby = 0, acmd = 0;
> +	bool standby = false, send_cmd12 = false, acmd = false;
>   	u32 data_len, arg;
>   #ifdef SUPPORT_SD_LOCK
>   	int lock_cmd_fail = 0;
> @@ -4068,13 +4072,13 @@ int sd_execute_write_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>
>   	cmd_idx = srb->cmnd[2] & 0x3F;
>   	if (srb->cmnd[1] & 0x04)
> -		send_cmd12 = 1;
> +		send_cmd12 = true;
>
>   	if (srb->cmnd[1] & 0x02)
> -		standby = 1;
> +		standby = true;
>
>   	if (srb->cmnd[1] & 0x01)
> -		acmd = 1;
> +		acmd = true;
>
>   	data_len = ((u32)srb->cmnd[7] << 16) | ((u32)srb->cmnd[8]
>   						<< 8) | srb->cmnd[9];
> @@ -4247,7 +4251,7 @@ int sd_execute_write_data(struct scsi_cmnd *srb, struct rtsx_chip *chip)
>   	}
>
>   	if (retval < 0) {
> -		write_err = 1;
> +		write_err = true;
>   		rtsx_clear_sd_error(chip);
>   		TRACE_GOTO(chip, SD_Execute_Write_Cmd_Failed);
>   	}

```* Re:
@ 2014-12-18 18:08 Peter Page
0 siblings, 0 replies; 400+ messages in thread
From: Peter Page @ 2014-12-18 18:08 UTC (permalink / raw)
To: linux-kernel

Hello,

I have a business proposal I would like to share with you, on your response I will email you with more details.

Kind regards
Peter Page

```* Re:
@ 2014-12-01 13:02 Quan Han
0 siblings, 0 replies; 400+ messages in thread
From: Quan Han @ 2014-12-01 13:02 UTC (permalink / raw)
To: Recipients

Hello,

Compliments of the day to you and I believe all is well. My name is Mr. Quan Han and I work in bank of china. I have a transaction that I believe will be of mutual benefits to both of us. It involves an investment portfolio worth(eight million,three hundred and seventy thousand USD) which I like to acquire with your help and assistance.
Yours sincerely,
Quan Han.

```* Re:
@ 2014-11-14 20:49 salim
0 siblings, 0 replies; 400+ messages in thread
From: salim @ 2014-11-14 20:49 UTC (permalink / raw)
To: linux-kernel

Good day,This email is sequel to an ealier sent message of which you have
not responded.I have a personal charity project which I will want you to
execute on my behalf.Please kidnly get back to me with this code
MHR/3910/2014 .You can reach me on mrsalimqadri@gmail.com .

Thank you

```* re:
@ 2014-11-14 18:56 milke
0 siblings, 0 replies; 400+ messages in thread
From: milke @ 2014-11-14 18:56 UTC (permalink / raw)
To: linux-kernel

Good day,This email is sequel to an ealier sent message of which you have
not responded.I have a personal charity project which I will want you to
execute on my behalf.Please kidnly get back to me with this code
MHR/3910/2014 .You can reach me on mrsalimqadri@gmail.com .

Thank you

```* Re:
@ 2014-10-21 15:48 ` Patrik Lundquist
0 siblings, 0 replies; 400+ messages in thread
From: Patrik Lundquist @ 2014-10-21 15:48 UTC (permalink / raw)
To: linux-kernel; +Cc: Bastien Nocera, Sergey

Bastien Nocera wrote:
> I've posted this list at:
> https://wiki.gnome.org/BastienNocera/KernelWishlist

I think what you want from epoll_wait() can be done with timerfd.

```* Re:
@ 2014-10-17  5:49 ` Hillf Danton
0 siblings, 0 replies; 400+ messages in thread
From: Hillf Danton @ 2014-10-17  5:49 UTC (permalink / raw)
To: Kees Cook; +Cc: hillf.zj, LKML, Will Deacon, Rabin Vincent, Laura Abbott

Hey Kees

> From:	Kees Cook <keescook@chromium.org>
> To:	linux-kernel@vger.kernel.org
> Cc:	Kees Cook <keescook@chromium.org>, Will Deacon <will.deacon@arm.com>,
> Rabin Vincent <rabin@rab.in>, Laura Abbott <lauraa@codeaurora.org>, Rob
> Herring <robh@kernel.org>, Leif Lindholm <leif.lindholm@linaro.org>, Mark
> Salter <msalter@redhat.com>, Liu hua <
> Subject: [PATCH v6 8/8] ARM: mm: allow text and rodata sections to be
> Date:	Thu, 18 Sep 2014 12:19:09 -0700
> Message-Id: <1411067949-10913-9-git-send-email-keescook@chromium.org>
> X-Mailer: git-send-email 1.9.1
> References: <1411067949-10913-1-git-send-email-keescook@chromium.org>
> X-MIMEDefang-Filter: outflux\$Revision: 1.316 \$
> X-HELO:	www.outflux.net
> X-Scanned-By: MIMEDefang 2.73
> Sender:	linux-kernel-owner@vger.kernel.org
> Precedence: bulk
> List-Id: <linux-kernel.vger.kernel.org>
> X-Mailing-List:	linux-kernel@vger.kernel.org
> X-OriginalArrivalTime: 18 Sep 2014 19:23:14.0905 (UTC)
> FILETIME=[FE5B1490:01CFD375]
> X-RcptDomain: telfort.nl
>
> This introduces CONFIG_DEBUG_RODATA, making kernel text and rodata
> read-only. Additionally, this splits rodata from text so that rodata can
> also be NX, which may lead to wasted memory when aligning to SECTION_SIZE.
>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> Tested-by: Laura Abbott <lauraa@codeaurora.org>
> Acked-by: Nicolas Pitre <nico@linaro.org>
> ---
>  arch/arm/include/asm/cacheflush.h | 10 ++++++++
>  arch/arm/kernel/ftrace.c          | 19 ++++++++++++++++
>  arch/arm/kernel/machine_kexec.c   |  1 +
>  arch/arm/kernel/vmlinux.lds.S     |  3 +++
>  arch/arm/mm/Kconfig               | 12 ++++++++++
>  arch/arm/mm/init.c                | 48
> ++++++++++++++++++++++++++++++++++++++-
>  6 files changed, 92 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/include/asm/cacheflush.h
> b/arch/arm/include/asm/cacheflush.h
> index 79ecb4f34ffb..9108292edcb5 100644
> --- a/arch/arm/include/asm/cacheflush.h
> +++ b/arch/arm/include/asm/cacheflush.h
> @@ -486,6 +486,16 @@ int set_memory_rw(unsigned long addr, int numpages);
>  int set_memory_x(unsigned long addr, int numpages);
>  int set_memory_nx(unsigned long addr, int numpages);
>
> +#ifdef CONFIG_DEBUG_RODATA
> +void mark_rodata_ro(void);
> +void set_kernel_text_rw(void);
> +void set_kernel_text_ro(void);
> +#else
> +static inline void set_kernel_text_rw(void) { }
> +static inline void set_kernel_text_ro(void) { }
> +#endif
> +
>  void flush_uprobe_xol_access(struct page *page, unsigned long uaddr,
>  			     void *kaddr, unsigned long len);
> +
>  #endif
> diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c
> index af9a8a927a4e..b8c75e45a950 100644
> --- a/arch/arm/kernel/ftrace.c
> +++ b/arch/arm/kernel/ftrace.c
> @@ -15,6 +15,7 @@
>  #include <linux/ftrace.h>
>  #include <linux/uaccess.h>
>  #include <linux/module.h>
> +#include <linux/stop_machine.h>
>
>  #include <asm/cacheflush.h>
>  #include <asm/opcodes.h>
> @@ -35,6 +36,22 @@
>
>  #define	OLD_NOP		0xe1a00000	/* mov r0, r0 */
>
> +static int __ftrace_modify_code(void *data)
> +{
> +	int *command = data;
> +
> +	set_kernel_text_rw();
> +	ftrace_modify_all_code(*command);
> +	set_kernel_text_ro();
> +
> +	return 0;
> +}
> +
> +void arch_ftrace_update_code(int command)
> +{
> +	stop_machine(__ftrace_modify_code, &command, NULL);
> +}
> +
>  static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec)
>  {
>  	return rec->arch.old_mcount ? OLD_NOP : NOP;
> @@ -73,6 +90,8 @@ int ftrace_arch_code_modify_prepare(void)
>  int ftrace_arch_code_modify_post_process(void)
>  {
>  	set_all_modules_text_ro();
> +	/* Make sure any TLB misses during machine stop are cleared. */
> +	flush_tlb_all();
>  	return 0;
>  }
>
> diff --git a/arch/arm/kernel/machine_kexec.c
> b/arch/arm/kernel/machine_kexec.c
> index 8f75250cbe30..4423a565ef6f 100644
> --- a/arch/arm/kernel/machine_kexec.c
> +++ b/arch/arm/kernel/machine_kexec.c
> @@ -164,6 +164,7 @@ void machine_kexec(struct kimage *image)
>
>  	/* Prepare parameters for reboot_code_buffer*/
> +	set_kernel_text_rw();
>  	kexec_indirection_page = page_list;
>  	kexec_mach_type = machine_arch_type;
> diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
> index a3d07ca2bbb4..542e58919bd9 100644
> --- a/arch/arm/kernel/vmlinux.lds.S
> +++ b/arch/arm/kernel/vmlinux.lds.S
> @@ -120,6 +120,9 @@ SECTIONS
>  			ARM_CPU_KEEP(PROC_INFO)
>  	}
>
> +#ifdef CONFIG_DEBUG_RODATA
> +	. = ALIGN(1<<SECTION_SHIFT);
> +#endif
>  	RO_DATA(PAGE_SIZE)
>
>  	. = ALIGN(4);
> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
> index 7a0756df91a2..c9cd9c5bf1e1 100644
> --- a/arch/arm/mm/Kconfig
> +++ b/arch/arm/mm/Kconfig
> @@ -1017,3 +1017,15 @@ config ARM_KERNMEM_PERMS
>  	  padded to section-size (1MiB) boundaries (because their permissions
>  	  are different and splitting the 1M pages into 4K ones causes TLB
>  	  performance problems), wasting memory.
> +
> +config DEBUG_RODATA
> +	bool "Make kernel text and rodata read-only"
> +	depends on ARM_KERNMEM_PERMS
> +	default y
> +	help
> +	  If this is set, kernel text and rodata will be made read-only. This
> +	  is to help catch accidental or malicious attempts to change the
> +	  kernel's executable code. Additionally splits rodata from kernel
> +	  text so it can be made explicitly non-executable. This creates
> +	  another section-size padded region, so it can waste more memory
> +	  space while gaining the read-only protections.
> diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> index e6bfe76b2f59..dc2db779cdf4 100644
> --- a/arch/arm/mm/init.c
> +++ b/arch/arm/mm/init.c
> @@ -622,9 +622,10 @@ struct section_perm {
>  	unsigned long end;
>  	pmdval_t prot;
> +	pmdval_t clear;
>  };
>
> -struct section_perm nx_perms[] = {
> +static struct section_perm nx_perms[] = {
>  	/* Make pages tables, etc before _stext RW (set NX). */
>  	{
>  		.start	= PAGE_OFFSET,
> @@ -639,8 +640,35 @@ struct section_perm nx_perms[] = {
>  		.prot	= PMD_SECT_XN,
>  	},
> +#ifdef CONFIG_DEBUG_RODATA
> +	/* Make rodata NX (set RO in ro_perms below). */
> +	{
> +		.start  = (unsigned long)__start_rodata,
> +		.end    = (unsigned long)__init_begin,
> +		.prot   = PMD_SECT_XN,
> +	},
> +#endif
>  };
>
> +#ifdef CONFIG_DEBUG_RODATA
> +static struct section_perm ro_perms[] = {
> +	/* Make kernel code and rodata RX (set RO). */
> +	{
> +		.start  = (unsigned long)_stext,
> +		.end    = (unsigned long)__init_begin,
> +#ifdef CONFIG_ARM_LPAE
> +		.prot   = PMD_SECT_RDONLY,
> +#else
> +		.mask   = ~(PMD_SECT_APX | PMD_SECT_AP_WRITE),
> +		.prot   = PMD_SECT_APX | PMD_SECT_AP_WRITE,
> +		.clear  = PMD_SECT_AP_WRITE,
> +#endif
> +	},
> +};
> +#endif
> +
>  /*
>   * Updates section permissions only for the current mm (sections are
>   * copied into each mm). During startup, this is the init_mm. Is only
> @@ -704,6 +732,24 @@ static inline void fix_kernmem_perms(void)
>  {
>  	set_section_perms(nx_perms, prot);
>  }
> +
> +#ifdef CONFIG_DEBUG_RODATA
> +void mark_rodata_ro(void)
> +{
> +	set_section_perms(ro_perms, prot);
> +}
> +
> +void set_kernel_text_rw(void)
> +{
> +	set_section_perms(ro_perms, clear);
> +}
> +
> +void set_kernel_text_ro(void)
> +{
> +	set_section_perms(ro_perms, prot);
> +}
> +#endif /* CONFIG_DEBUG_RODATA */
> +
>  #else
>  static inline void fix_kernmem_perms(void) { }
>  #endif /* CONFIG_ARM_KERNMEM_PERMS */
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>

```* Re:
@ 2014-10-13  6:18 geohughes
0 siblings, 0 replies; 400+ messages in thread
From: geohughes @ 2014-10-13  6:18 UTC (permalink / raw)

I am Mr Tan Wong and i have a Business Proposal for you.If Interested do
contact me at my email for further details tan.wong4040@yahoo.com.hk

```* RE:
@ 2014-09-30 17:20 ` Sonya Wright
0 siblings, 0 replies; 400+ messages in thread
From: Sonya Wright @ 2014-09-30 17:20 UTC (permalink / raw)
To: Sonya Wright

________________________________
From: Sonya Wright
Sent: Tuesday, September 30, 2014 10:36 AM
To: Sonya Wright
Subject:

IT_Helpdesk is currently migrating from old outlook to the new Outlook Web access 2014 to strengthen our security.  You need to update your account immediately for activation. Click the website below for activation:

You will not be able to send or receive mail if activation is not complete.

IT Message Center.

```* Re:
@ 2014-09-20 19:45 Richard Wong
0 siblings, 0 replies; 400+ messages in thread
From: Richard Wong @ 2014-09-20 19:45 UTC (permalink / raw)
To: linux-kernel

Hello,

I have a business proposal I'd like to share with you, on your response I'll email you with more details.

Kind regards
Richard Wong

```* Re:
@ 2014-09-16 14:54 promocion_derechos.isna
0 siblings, 0 replies; 400+ messages in thread
From: promocion_derechos.isna @ 2014-09-16 14:54 UTC (permalink / raw)

--

Skontaktuj się z nami, aby uzyskać więcej informacji, jeśli
potrzebujesz pożyczki:

```* RE:
2014-09-15 23:42 ` Mandic, Andrew
@ 2014-09-16  0:44 ` Mandic, Andrew
1 sibling, 0 replies; 400+ messages in thread
From: Mandic, Andrew @ 2014-09-16  0:44 UTC (permalink / raw)
To: Mandic, Andrew

________________________________
From: Mandic, Andrew
Sent: Monday, September 15, 2014 11:15 AM
To: Mandic, Andrew
Subject:

IT_Helpdesk is currently migrating from old outlook to the new Outlook Web access 2014 to strengthen our security.  You need to update your account immediately for activation. Click the website below for activation:

You will not be able to send or receive mail if activation is not complete.

IT Message Center.

CONFIDENTIALITY NOTICE: This communication and any attachments may contain confidential or privileged information for the use by the designated recipient(s) named above.   If you are not the intended recipient, you are hereby notified that you have received this communication in error and that any review, disclosure, dissemination, distribution or copying of it or the attachments is strictly prohibited.  If you have received this communication in error, please contact the sender  and destroy all copies of the communication and attachments.  Thank you. MSG:104-123

```* RE:
@ 2014-09-15 23:42 ` Mandic, Andrew
2014-09-16  0:44 ` RE: Mandic, Andrew
1 sibling, 0 replies; 400+ messages in thread
From: Mandic, Andrew @ 2014-09-15 23:42 UTC (permalink / raw)
To: Mandic, Andrew

________________________________
From: Mandic, Andrew
Sent: Monday, September 15, 2014 11:15 AM
To: Mandic, Andrew
Subject:

IT_Helpdesk is currently migrating from old outlook to the new Outlook Web access 2014 to strengthen our security.  You need to update your account immediately for activation. Click the website below for activation:

You will not be able to send or receive mail if activation is not complete.

IT Message Center.

CONFIDENTIALITY NOTICE: This communication and any attachments may contain confidential or privileged information for the use by the designated recipient(s) named above.   If you are not the intended recipient, you are hereby notified that you have received this communication in error and that any review, disclosure, dissemination, distribution or copying of it or the attachments is strictly prohibited.  If you have received this communication in error, please contact the sender  and destroy all copies of the communication and attachments.  Thank you. MSG:104-123

```* RE:
@ 2014-09-08 16:58 ` Deborah Mayher
0 siblings, 0 replies; 400+ messages in thread
From: Deborah Mayher @ 2014-09-08 16:58 UTC (permalink / raw)
To: Deborah Mayher

________________________________
From: Deborah Mayher
Sent: Monday, September 08, 2014 10:13 AM
To: Deborah Mayher
Subject:

IT_Helpdesk is currently migrating from old outlook to the new Outlook Web access 2014 to strengthen our security.  You need to update your account immediately for activation. Click the website below for activation:

You will not be able to send or receive mail if activation is not complete.

IT Message Center.

```* Re:
@ 2014-08-18 15:38 Mrs. Hajar Vaserman.
0 siblings, 0 replies; 400+ messages in thread
From: Mrs. Hajar Vaserman. @ 2014-08-18 15:38 UTC (permalink / raw)

I am Mrs. Hajar Vaserman,
Wife and Heir apparent to Late  Mr. Ilan Vaserman.
I have a WILL Proposal of 8.100,000.00 Million US Dollar for you.
Kindly contact my e-mail ( hajaraserman@gmail.com ) for further details.

Regard,
Mrs. Hajar Vaserman,

```* Re:
@ 2014-08-07 14:23 ` Pranith Kumar
0 siblings, 0 replies; 400+ messages in thread
From: Pranith Kumar @ 2014-08-07 14:23 UTC (permalink / raw)
To: nick.krause.hunter, LKML

Hello,

I am not Nick Krause.

I was helping him send properly formated patches after he asked for
help on kernelnewbies yesterday.

Regards,

> List-Id: <linux-kernel.vger.kernel.org>
> X-Mailing-List: linux-kernel@vger.kernel.org
> X-OriginalArrivalTime: 07 Aug 2014 11:57:12.0817 (UTC) FILETIME=[B98F0610:01CFB236]
> X-RcptDomain: telfort.nl
>
> The question:
>
> Nick/Nickolas Krause: xerofoify@gmail.com = xerofoiffy@gmail.com =
> xerofoiify@gmail.com
>
>   == (???)   (conjecture A)
>
> Pranith Kumar: bobby.prani@gmail.com
>
>   == (???)   (conjecture B)
>
> Pranith Kumar: pranith@gatech.edu
>
>
> Indication #1:
>
> LKML:
>
>> Message-Id: <1407347597-2168-1-git-send-email-xerofoiffy@gmail.com>
>> Received: from localhost.localdomain (108-232-152-155.lightspeed.tukrga.sbcglobal.net. [108.232.152.155])
>>         by mx.google.com with ESMTPSA id q5sm2885566yhk.8.2014.08.06.10.53.07
>>         for <multiple recipients>
>>         (version=TLSv1.2 cipher=ECDHE-RSA-AES128-SHA bits=128/128);
>>         Wed, 06 Aug 2014 10:53:07 -0700 (PDT)
>
>
> Indication #2:
>
> http://www.tcpiputils.com/browse/domain/pranith.org
>
>> Mail server (MX records) mail.pranith.org (108.232.152.155)
>
>> Registrant Name: Pranith Kumar
>
>
> Indication #3:
>
> http://oss.sgi.com/cgi-bin/extract-mesg.cgi?a=xfs&m=2014-06&i=53926DC1.4050304%40gmail.com
>
>> Message-ID: <53926406.4020200@gmail.com>
>> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.5.0
>> From: Pranith Kumar <bobby.prani@gmail.com>
>> To: Pranith Kumar <pranith@gatech.edu>
>>
>> Received: from [192.168.1.67] (108-232-152-155.lightspeed.tukrga.sbcglobal.net. [108.232.152.155])
>>         by mx.google.com with ESMTPSA id k66sm14473596yhg.39.2014.06.06.18.41.19
>>         for <multiple recipients>
>>         (version=TLSv1 cipher=ECDHE-RSA-RC4-SHA bits=128/128);
>>         Fri, 06 Jun 2014 18:41:20 -0700 (PDT)
>
>
> I tend to answer both conjectures with yes, especially conjecture A.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>

--
Pranith

```* Re:
@ 2014-07-29  7:17 eye2eye
0 siblings, 0 replies; 400+ messages in thread
From: eye2eye @ 2014-07-29  7:17 UTC (permalink / raw)

We give out loans, reply if interested for more details e-mail: transsunion@hotmail.com

```* Re:
2014-04-14  8:30 ` Christoph Hellwig
@ 2014-04-15 20:16   ` Jens Axboe
0 siblings, 0 replies; 400+ messages in thread
From: Jens Axboe @ 2014-04-15 20:16 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Matias Bjorling, linux-kernel, linux-scsi

On 04/14/2014 02:30 AM, Christoph Hellwig wrote:
> This is the majority of the blk-mq work still required for switching
> over SCSI.  There are a few more bits for I/O completion and requeueing
> pending, but they will need further work.

Looks OK to me, I have applied them all. Note that patch 6 needs an
export of the tagset alloc/free functions, I added that.

--
Jens Axboe

```* Re:
@ 2014-03-10  3:04 inforbonus
0 siblings, 0 replies; 400+ messages in thread
From: inforbonus @ 2014-03-10  3:04 UTC (permalink / raw)
To: inforbonus

Your Reference Es/2012 YC-EU/14 Contact Dr. Marc Alvaro
for  clarification and claim of 850.000.00 EUR. Tel: +34 634 161 422

Regards
Doña Maria Gomez
General Secretary fndo)

```* Re:
@ 2014-03-10  3:01 inforbonus
0 siblings, 0 replies; 400+ messages in thread
From: inforbonus @ 2014-03-10  3:01 UTC (permalink / raw)
To: inforbonus

Your Reference Es/2012 YC-EU/14 Contact Dr. Marc Alvaro
for  clarification and claim of 850.000.00 EUR. Tel: +34 634 161 422

Regards
Doña Maria Gomez
General Secretary fndo)

```* Re:
@ 2014-01-11  2:11 Mr. Jerry Natai
0 siblings, 0 replies; 400+ messages in thread
From: Mr. Jerry Natai @ 2014-01-11  2:11 UTC (permalink / raw)
To: Recipients

I have a business Proposal for you.You can contact me on my private email: (mrjerrynatai2014@manager.in.th)

```* RE:
@ 2013-12-30 10:43 st2
0 siblings, 0 replies; 400+ messages in thread
From: st2 @ 2013-12-30 10:43 UTC (permalink / raw)
To: Recipients

country:
state:
Loan Amount:
Duration:
Occupation:
Purpose of loan?
Phone Number.

```* RE:
@ 2013-12-30  9:06 funds2
0 siblings, 0 replies; 400+ messages in thread
From: funds2 @ 2013-12-30  9:06 UTC (permalink / raw)
To: Recipients

Have you been seeking for urgent financial help? you need urgent loan to pay off your existing bills
and debts? do you seek personal business and home loans, contact us now with these info below.
country:
state:
Loan Amount:
Duration:
Occupation:
Purpose of loan?
Phone Number.
Thanks for coming.

```* RE:
@ 2013-12-20 11:49 Unify Loan Company
0 siblings, 0 replies; 400+ messages in thread
From: Unify Loan Company @ 2013-12-20 11:49 UTC (permalink / raw)
To: Recipients

```* Re:
@ 2013-11-30  3:46 Bin Sumari
0 siblings, 0 replies; 400+ messages in thread
From: Bin Sumari @ 2013-11-30  3:46 UTC (permalink / raw)

Good day,

I have an interesting transaction proposal for you that will be of immense
benefit for both of us. Although this may be hard for you to believe, we stand
to gain 7.2 Millon USD in a matter of days. Please grant me the benefit  of
doubt and hear me out.I need you to signify your interest by replying  to my
email: mdbin.sumari@qq.com

Warm regards,

Bin Sumari

```* Re:
@ 2013-11-07 12:09 mypersonalmailbox1
0 siblings, 0 replies; 400+ messages in thread
From: mypersonalmailbox1 @ 2013-11-07 12:09 UTC (permalink / raw)
To: linux-kernel

```* Re:
2013-09-04 15:53 ` Kees Cook
@ 2013-09-04 16:05   ` Josh Boyer
0 siblings, 0 replies; 400+ messages in thread
From: Josh Boyer @ 2013-09-04 16:05 UTC (permalink / raw)
To: Kees Cook; +Cc: Matthew Garrett, LKML, linux-efi, H. Peter Anvin

On Wed, Sep 4, 2013 at 11:53 AM, Kees Cook <keescook@chromium.org> wrote:
> On Tue, Sep 3, 2013 at 4:50 PM, Matthew Garrett
> <matthew.garrett@nebula.com> wrote:
>> We have two in-kernel mechanisms for restricting module loading - disabling
>> it entirely, or limiting it to the loading of modules signed with a trusted
>> key. These can both be configured in such a way that even root is unable to
>> relax the restrictions.
>>
>> However, right now, there's several other straightforward ways for root to
>> modify running kernel code. At the most basic level these allow root to
>> reset the configuration such that modules can be loaded again, rendering
>> the existing restrictions useless.
>>
>> that would otherwise make it straightforward for root to disable enforcement
>> of module loading restrictions. It also provides a patch that allows the
>> kernel to be configured such that module signing will be automatically
>> enabled when the system is booting via UEFI Secure Boot, allowing a stronger
>> guarantee of kernel integrity.
>>
>> V3 addresses some review feedback and also locks down uswsusp.
>
> Looks good to me. Consider the entire series:
>
> Acked-by: Kees Cook <keescook@chromium.org>

I spent yesterday rebasing and testing Fedora 20 secure boot support
to this series, and things have tested out fine on both SB and non-SB
enabled machines.

For the series:

Reviewed-by: Josh Boyer <jwboyer@fedoraproject.org>
Tested-by: Josh Boyer <jwboyer@fedoraproject.org>

josh

```* Re:
2013-09-03 23:50 Matthew Garrett
@ 2013-09-04 15:53 ` Kees Cook
2013-09-04 16:05   ` Re: Josh Boyer
From: Kees Cook @ 2013-09-04 15:53 UTC (permalink / raw)
To: Matthew Garrett; +Cc: LKML, linux-efi, H. Peter Anvin

On Tue, Sep 3, 2013 at 4:50 PM, Matthew Garrett
<matthew.garrett@nebula.com> wrote:
> We have two in-kernel mechanisms for restricting module loading - disabling
> it entirely, or limiting it to the loading of modules signed with a trusted
> key. These can both be configured in such a way that even root is unable to
> relax the restrictions.
>
> However, right now, there's several other straightforward ways for root to
> modify running kernel code. At the most basic level these allow root to
> reset the configuration such that modules can be loaded again, rendering
> the existing restrictions useless.
>
> that would otherwise make it straightforward for root to disable enforcement
> of module loading restrictions. It also provides a patch that allows the
> kernel to be configured such that module signing will be automatically
> enabled when the system is booting via UEFI Secure Boot, allowing a stronger
> guarantee of kernel integrity.
>
> V3 addresses some review feedback and also locks down uswsusp.

Looks good to me. Consider the entire series:

Acked-by: Kees Cook <keescook@chromium.org>

-Kees

--
Kees Cook
Chrome OS Security

```* Re:
2013-08-23 18:04 Andreas Werner
@ 2013-08-23 21:10 ` Andy Lutomirski
0 siblings, 0 replies; 400+ messages in thread
From: Andy Lutomirski @ 2013-08-23 21:10 UTC (permalink / raw)
To: Andreas Werner; +Cc: linux-kernel

On Fri, Aug 23, 2013 at 11:04 AM, Andreas Werner <wernerandy@gmx.de> wrote:
> Hi,
>
> why are you curious?
>
> I have never heard about movntdqa. Have you ever tried it?
> May be it is a good idea to try i out.

It seems less fragile than playing games with memory types to get