* [PATCH v9] iomap: Support file tail packing
@ 2021-07-27 2:59 Gao Xiang
2021-07-27 15:10 ` Darrick J. Wong
2021-08-01 12:03 ` [PATCH v9] iomap: Support file tail packing Andreas Gruenbacher
0 siblings, 2 replies; 417+ messages in thread
From: Gao Xiang @ 2021-07-27 2:59 UTC (permalink / raw)
To: linux-erofs, linux-fsdevel
Cc: LKML, Huang Jianan, Joseph Qi, Gao Xiang, Darrick J . Wong,
Christoph Hellwig, Matthew Wilcox, Andreas Gruenbacher
The existing inline data support only works for cases where the entire
file is stored as inline data. For larger files, EROFS stores the
initial blocks separately and then can pack a small tail adjacent to the
inode. Generalise inline data to allow for tail packing. Tails may not
cross a page boundary in memory.
We currently have no filesystems that support tail packing and writing,
so that case is currently disabled (see iomap_write_begin_inline).
Cc: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
v8: https://lore.kernel.org/r/20210726145734.214295-1-hsiangkao@linux.alibaba.com
changes since v8:
- update the subject to 'iomap: Support file tail packing' as there
are clearly a number of ways to make the inline data support more
flexible (Matthew);
- add one extra safety check (Darrick):
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
fs/iomap/buffered-io.c | 42 ++++++++++++++++++++++++++++++------------
fs/iomap/direct-io.c | 10 ++++++----
include/linux/iomap.h | 18 ++++++++++++++++++
3 files changed, 54 insertions(+), 16 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 87ccb3438bec..f429b9d87dbe 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -205,25 +205,32 @@ struct iomap_readpage_ctx {
struct readahead_control *rac;
};
-static void
-iomap_read_inline_data(struct inode *inode, struct page *page,
+static int iomap_read_inline_data(struct inode *inode, struct page *page,
struct iomap *iomap)
{
- size_t size = i_size_read(inode);
+ size_t size = i_size_read(inode) - iomap->offset;
void *addr;
if (PageUptodate(page))
- return;
+ return 0;
- BUG_ON(page_has_private(page));
- BUG_ON(page->index);
- BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ /* inline data must start page aligned in the file */
+ if (WARN_ON_ONCE(offset_in_page(iomap->offset)))
+ return -EIO;
+ if (WARN_ON_ONCE(size > PAGE_SIZE -
+ offset_in_page(iomap->inline_data)))
+ return -EIO;
+ if (WARN_ON_ONCE(size > iomap->length))
+ return -EIO;
+ if (WARN_ON_ONCE(page_has_private(page)))
+ return -EIO;
addr = kmap_atomic(page);
memcpy(addr, iomap->inline_data, size);
memset(addr + size, 0, PAGE_SIZE - size);
kunmap_atomic(addr);
SetPageUptodate(page);
+ return 0;
}
static inline bool iomap_block_needs_zeroing(struct inode *inode,
@@ -247,8 +254,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
sector_t sector;
if (iomap->type == IOMAP_INLINE) {
- WARN_ON_ONCE(pos);
- iomap_read_inline_data(inode, page, iomap);
+ int ret = iomap_read_inline_data(inode, page, iomap);
+
+ if (ret)
+ return ret;
return PAGE_SIZE;
}
@@ -589,6 +598,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
return 0;
}
+static int iomap_write_begin_inline(struct inode *inode,
+ struct page *page, struct iomap *srcmap)
+{
+ /* needs more work for the tailpacking case, disable for now */
+ if (WARN_ON_ONCE(srcmap->offset != 0))
+ return -EIO;
+ return iomap_read_inline_data(inode, page, srcmap);
+}
+
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
@@ -618,7 +636,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
}
if (srcmap->type == IOMAP_INLINE)
- iomap_read_inline_data(inode, page, srcmap);
+ status = iomap_write_begin_inline(inode, page, srcmap);
else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(page, pos, len, NULL, srcmap);
else
@@ -671,11 +689,11 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
void *addr;
WARN_ON_ONCE(!PageUptodate(page));
- BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ BUG_ON(!iomap_inline_data_valid(iomap));
flush_dcache_page(page);
addr = kmap_atomic(page);
- memcpy(iomap->inline_data + pos, addr + pos, copied);
+ memcpy(iomap_inline_data(iomap, pos), addr + pos, copied);
kunmap_atomic(addr);
mark_inode_dirty(inode);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 9398b8c31323..41ccbfc9dc82 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
struct iomap_dio *dio, struct iomap *iomap)
{
struct iov_iter *iter = dio->submit.iter;
+ void *inline_data = iomap_inline_data(iomap, pos);
size_t copied;
- BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
+ return -EIO;
if (dio->flags & IOMAP_DIO_WRITE) {
loff_t size = inode->i_size;
if (pos > size)
- memset(iomap->inline_data + size, 0, pos - size);
- copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+ memset(iomap_inline_data(iomap, size), 0, pos - size);
+ copied = copy_from_iter(inline_data, length, iter);
if (copied) {
if (pos + copied > size)
i_size_write(inode, pos + copied);
mark_inode_dirty(inode);
}
} else {
- copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+ copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
return copied;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 479c1da3e221..b8ec145b2975 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos)
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
+/*
+ * Returns the inline data pointer for logical offset @pos.
+ */
+static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos)
+{
+ return iomap->inline_data + pos - iomap->offset;
+}
+
+/*
+ * Check if the mapping's length is within the valid range for inline data.
+ * This is used to guard against accessing data beyond the page inline_data
+ * points at.
+ */
+static inline bool iomap_inline_data_valid(struct iomap *iomap)
+{
+ return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
+}
+
/*
* When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
* and page_done will be called for each page written to. This only applies to
--
2.24.4
^ permalink raw reply related [flat|nested] 417+ messages in thread
* (no subject)
2021-07-27 2:59 [PATCH v9] iomap: Support file tail packing Gao Xiang
@ 2021-07-27 15:10 ` Darrick J. Wong
2021-07-27 15:23 ` Andreas Grünbacher
2021-07-27 15:30 ` Re: Gao Xiang
2021-08-01 12:03 ` [PATCH v9] iomap: Support file tail packing Andreas Gruenbacher
1 sibling, 2 replies; 417+ messages in thread
From: Darrick J. Wong @ 2021-07-27 15:10 UTC (permalink / raw)
To: Gao Xiang
Cc: linux-erofs, linux-fsdevel, LKML, Huang Jianan, Joseph Qi,
Christoph Hellwig, Matthew Wilcox, Andreas Gruenbacher
I'll change the subject to:
iomap: support reading inline data from non-zero pos
The existing inline data support only works for cases where the entire
file is stored as inline data. For larger files, EROFS stores the
initial blocks separately and the remainder of the file ("file tail")
adjacent to the inode. Generalise inline data to allow reading the
inline file tail. Tails may not cross a page boundary in memory.
We currently have no filesystems that support tails and writing,
so that case is currently disabled (see iomap_write_begin_inline).
If that's ok with everyone,
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
--D
On Tue, Jul 27, 2021 at 10:59:56AM +0800, Gao Xiang wrote:
> The existing inline data support only works for cases where the entire
> file is stored as inline data. For larger files, EROFS stores the
> initial blocks separately and then can pack a small tail adjacent to the
> inode. Generalise inline data to allow for tail packing. Tails may not
> cross a page boundary in memory.
>
> We currently have no filesystems that support tail packing and writing,
> so that case is currently disabled (see iomap_write_begin_inline).
>
> Cc: Darrick J. Wong <djwong@kernel.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> v8: https://lore.kernel.org/r/20210726145734.214295-1-hsiangkao@linux.alibaba.com
> changes since v8:
> - update the subject to 'iomap: Support file tail packing' as there
> are clearly a number of ways to make the inline data support more
> flexible (Matthew);
>
> - add one extra safety check (Darrick):
> if (WARN_ON_ONCE(size > iomap->length))
> return -EIO;
>
> fs/iomap/buffered-io.c | 42 ++++++++++++++++++++++++++++++------------
> fs/iomap/direct-io.c | 10 ++++++----
> include/linux/iomap.h | 18 ++++++++++++++++++
> 3 files changed, 54 insertions(+), 16 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 87ccb3438bec..f429b9d87dbe 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -205,25 +205,32 @@ struct iomap_readpage_ctx {
> struct readahead_control *rac;
> };
>
> -static void
> -iomap_read_inline_data(struct inode *inode, struct page *page,
> +static int iomap_read_inline_data(struct inode *inode, struct page *page,
> struct iomap *iomap)
> {
> - size_t size = i_size_read(inode);
> + size_t size = i_size_read(inode) - iomap->offset;
> void *addr;
>
> if (PageUptodate(page))
> - return;
> + return 0;
>
> - BUG_ON(page_has_private(page));
> - BUG_ON(page->index);
> - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> + /* inline data must start page aligned in the file */
> + if (WARN_ON_ONCE(offset_in_page(iomap->offset)))
> + return -EIO;
> + if (WARN_ON_ONCE(size > PAGE_SIZE -
> + offset_in_page(iomap->inline_data)))
> + return -EIO;
> + if (WARN_ON_ONCE(size > iomap->length))
> + return -EIO;
> + if (WARN_ON_ONCE(page_has_private(page)))
> + return -EIO;
>
> addr = kmap_atomic(page);
> memcpy(addr, iomap->inline_data, size);
> memset(addr + size, 0, PAGE_SIZE - size);
> kunmap_atomic(addr);
> SetPageUptodate(page);
> + return 0;
> }
>
> static inline bool iomap_block_needs_zeroing(struct inode *inode,
> @@ -247,8 +254,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> sector_t sector;
>
> if (iomap->type == IOMAP_INLINE) {
> - WARN_ON_ONCE(pos);
> - iomap_read_inline_data(inode, page, iomap);
> + int ret = iomap_read_inline_data(inode, page, iomap);
> +
> + if (ret)
> + return ret;
> return PAGE_SIZE;
> }
>
> @@ -589,6 +598,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
> return 0;
> }
>
> +static int iomap_write_begin_inline(struct inode *inode,
> + struct page *page, struct iomap *srcmap)
> +{
> + /* needs more work for the tailpacking case, disable for now */
> + if (WARN_ON_ONCE(srcmap->offset != 0))
> + return -EIO;
> + return iomap_read_inline_data(inode, page, srcmap);
> +}
> +
> static int
> iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> @@ -618,7 +636,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> }
>
> if (srcmap->type == IOMAP_INLINE)
> - iomap_read_inline_data(inode, page, srcmap);
> + status = iomap_write_begin_inline(inode, page, srcmap);
> else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
> status = __block_write_begin_int(page, pos, len, NULL, srcmap);
> else
> @@ -671,11 +689,11 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
> void *addr;
>
> WARN_ON_ONCE(!PageUptodate(page));
> - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
> + BUG_ON(!iomap_inline_data_valid(iomap));
>
> flush_dcache_page(page);
> addr = kmap_atomic(page);
> - memcpy(iomap->inline_data + pos, addr + pos, copied);
> + memcpy(iomap_inline_data(iomap, pos), addr + pos, copied);
> kunmap_atomic(addr);
>
> mark_inode_dirty(inode);
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 9398b8c31323..41ccbfc9dc82 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
> struct iomap_dio *dio, struct iomap *iomap)
> {
> struct iov_iter *iter = dio->submit.iter;
> + void *inline_data = iomap_inline_data(iomap, pos);
> size_t copied;
>
> - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
> + return -EIO;
>
> if (dio->flags & IOMAP_DIO_WRITE) {
> loff_t size = inode->i_size;
>
> if (pos > size)
> - memset(iomap->inline_data + size, 0, pos - size);
> - copied = copy_from_iter(iomap->inline_data + pos, length, iter);
> + memset(iomap_inline_data(iomap, size), 0, pos - size);
> + copied = copy_from_iter(inline_data, length, iter);
> if (copied) {
> if (pos + copied > size)
> i_size_write(inode, pos + copied);
> mark_inode_dirty(inode);
> }
> } else {
> - copied = copy_to_iter(iomap->inline_data + pos, length, iter);
> + copied = copy_to_iter(inline_data, length, iter);
> }
> dio->size += copied;
> return copied;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 479c1da3e221..b8ec145b2975 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos)
> return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
> }
>
> +/*
> + * Returns the inline data pointer for logical offset @pos.
> + */
> +static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos)
> +{
> + return iomap->inline_data + pos - iomap->offset;
> +}
> +
> +/*
> + * Check if the mapping's length is within the valid range for inline data.
> + * This is used to guard against accessing data beyond the page inline_data
> + * points at.
> + */
> +static inline bool iomap_inline_data_valid(struct iomap *iomap)
> +{
> + return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
> +}
> +
> /*
> * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
> * and page_done will be called for each page written to. This only applies to
> --
> 2.24.4
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2021-07-27 15:10 ` Darrick J. Wong
@ 2021-07-27 15:23 ` Andreas Grünbacher
2021-07-27 15:30 ` Re: Gao Xiang
1 sibling, 0 replies; 417+ messages in thread
From: Andreas Grünbacher @ 2021-07-27 15:23 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Gao Xiang, linux-erofs, Linux FS-devel Mailing List, LKML,
Huang Jianan, Joseph Qi, Christoph Hellwig, Matthew Wilcox,
Andreas Gruenbacher
Am Di., 27. Juli 2021 um 17:11 Uhr schrieb Darrick J. Wong <djwong@kernel.org>:
> I'll change the subject to:
>
> iomap: support reading inline data from non-zero pos
That surely works for me.
Thanks,
Andreas
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2021-07-27 15:10 ` Darrick J. Wong
2021-07-27 15:23 ` Andreas Grünbacher
@ 2021-07-27 15:30 ` Gao Xiang
1 sibling, 0 replies; 417+ messages in thread
From: Gao Xiang @ 2021-07-27 15:30 UTC (permalink / raw)
To: Darrick J. Wong
Cc: linux-erofs, linux-fsdevel, LKML, Huang Jianan, Joseph Qi,
Christoph Hellwig, Matthew Wilcox, Andreas Gruenbacher
On Tue, Jul 27, 2021 at 08:10:51AM -0700, Darrick J. Wong wrote:
> I'll change the subject to:
>
> iomap: support reading inline data from non-zero pos
I'm fine with this too. Many thanks for updating!
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH v9] iomap: Support file tail packing
2021-07-27 2:59 [PATCH v9] iomap: Support file tail packing Gao Xiang
2021-07-27 15:10 ` Darrick J. Wong
@ 2021-08-01 12:03 ` Andreas Gruenbacher
2021-08-02 12:15 ` Gao Xiang
1 sibling, 1 reply; 417+ messages in thread
From: Andreas Gruenbacher @ 2021-08-01 12:03 UTC (permalink / raw)
To: Gao Xiang
Cc: linux-erofs, linux-fsdevel, LKML, Huang Jianan, Joseph Qi,
Darrick J . Wong, Christoph Hellwig, Matthew Wilcox
On Tue, Jul 27, 2021 at 5:00 AM Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
> The existing inline data support only works for cases where the entire
> file is stored as inline data. For larger files, EROFS stores the
> initial blocks separately and then can pack a small tail adjacent to the
> inode. Generalise inline data to allow for tail packing. Tails may not
> cross a page boundary in memory.
>
> We currently have no filesystems that support tail packing and writing,
> so that case is currently disabled (see iomap_write_begin_inline).
>
> Cc: Darrick J. Wong <djwong@kernel.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
> Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> ---
> v8: https://lore.kernel.org/r/20210726145734.214295-1-hsiangkao@linux.alibaba.com
> changes since v8:
> - update the subject to 'iomap: Support file tail packing' as there
> are clearly a number of ways to make the inline data support more
> flexible (Matthew);
>
> - add one extra safety check (Darrick):
> if (WARN_ON_ONCE(size > iomap->length))
> return -EIO;
>
> fs/iomap/buffered-io.c | 42 ++++++++++++++++++++++++++++++------------
> fs/iomap/direct-io.c | 10 ++++++----
> include/linux/iomap.h | 18 ++++++++++++++++++
> 3 files changed, 54 insertions(+), 16 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 87ccb3438bec..f429b9d87dbe 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -205,25 +205,32 @@ struct iomap_readpage_ctx {
> struct readahead_control *rac;
> };
>
> -static void
> -iomap_read_inline_data(struct inode *inode, struct page *page,
> +static int iomap_read_inline_data(struct inode *inode, struct page *page,
> struct iomap *iomap)
> {
> - size_t size = i_size_read(inode);
> + size_t size = i_size_read(inode) - iomap->offset;
> void *addr;
>
> if (PageUptodate(page))
> - return;
> + return 0;
>
> - BUG_ON(page_has_private(page));
> - BUG_ON(page->index);
> - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
> + /* inline data must start page aligned in the file */
> + if (WARN_ON_ONCE(offset_in_page(iomap->offset)))
> + return -EIO;
> + if (WARN_ON_ONCE(size > PAGE_SIZE -
> + offset_in_page(iomap->inline_data)))
> + return -EIO;
> + if (WARN_ON_ONCE(size > iomap->length))
> + return -EIO;
> + if (WARN_ON_ONCE(page_has_private(page)))
> + return -EIO;
>
> addr = kmap_atomic(page);
> memcpy(addr, iomap->inline_data, size);
> memset(addr + size, 0, PAGE_SIZE - size);
> kunmap_atomic(addr);
> SetPageUptodate(page);
> + return 0;
> }
>
> static inline bool iomap_block_needs_zeroing(struct inode *inode,
> @@ -247,8 +254,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> sector_t sector;
>
> if (iomap->type == IOMAP_INLINE) {
> - WARN_ON_ONCE(pos);
> - iomap_read_inline_data(inode, page, iomap);
> + int ret = iomap_read_inline_data(inode, page, iomap);
> +
> + if (ret)
> + return ret;
> return PAGE_SIZE;
> }
>
> @@ -589,6 +598,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
> return 0;
> }
>
> +static int iomap_write_begin_inline(struct inode *inode,
> + struct page *page, struct iomap *srcmap)
> +{
> + /* needs more work for the tailpacking case, disable for now */
Nit: the comma should be a semicolon or period here.
> + if (WARN_ON_ONCE(srcmap->offset != 0))
> + return -EIO;
> + return iomap_read_inline_data(inode, page, srcmap);
> +}
> +
> static int
> iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
> @@ -618,7 +636,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
> }
>
> if (srcmap->type == IOMAP_INLINE)
> - iomap_read_inline_data(inode, page, srcmap);
> + status = iomap_write_begin_inline(inode, page, srcmap);
> else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
> status = __block_write_begin_int(page, pos, len, NULL, srcmap);
> else
> @@ -671,11 +689,11 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
> void *addr;
>
> WARN_ON_ONCE(!PageUptodate(page));
> - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
> + BUG_ON(!iomap_inline_data_valid(iomap));
>
> flush_dcache_page(page);
> addr = kmap_atomic(page);
> - memcpy(iomap->inline_data + pos, addr + pos, copied);
> + memcpy(iomap_inline_data(iomap, pos), addr + pos, copied);
> kunmap_atomic(addr);
>
> mark_inode_dirty(inode);
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 9398b8c31323..41ccbfc9dc82 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
> struct iomap_dio *dio, struct iomap *iomap)
> {
> struct iov_iter *iter = dio->submit.iter;
> + void *inline_data = iomap_inline_data(iomap, pos);
> size_t copied;
>
> - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
> + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
> + return -EIO;
>
> if (dio->flags & IOMAP_DIO_WRITE) {
> loff_t size = inode->i_size;
>
> if (pos > size)
> - memset(iomap->inline_data + size, 0, pos - size);
> - copied = copy_from_iter(iomap->inline_data + pos, length, iter);
> + memset(iomap_inline_data(iomap, size), 0, pos - size);
> + copied = copy_from_iter(inline_data, length, iter);
> if (copied) {
> if (pos + copied > size)
> i_size_write(inode, pos + copied);
> mark_inode_dirty(inode);
> }
> } else {
> - copied = copy_to_iter(iomap->inline_data + pos, length, iter);
> + copied = copy_to_iter(inline_data, length, iter);
> }
> dio->size += copied;
> return copied;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 479c1da3e221..b8ec145b2975 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos)
> return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
> }
>
> +/*
> + * Returns the inline data pointer for logical offset @pos.
> + */
> +static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos)
> +{
> + return iomap->inline_data + pos - iomap->offset;
> +}
> +
> +/*
> + * Check if the mapping's length is within the valid range for inline data.
> + * This is used to guard against accessing data beyond the page inline_data
> + * points at.
> + */
> +static inline bool iomap_inline_data_valid(struct iomap *iomap)
> +{
> + return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
> +}
> +
> /*
> * When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
> * and page_done will be called for each page written to. This only applies to
> --
> 2.24.4
>
Andreas
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH v9] iomap: Support file tail packing
2021-08-01 12:03 ` [PATCH v9] iomap: Support file tail packing Andreas Gruenbacher
@ 2021-08-02 12:15 ` Gao Xiang
2021-08-02 21:58 ` Darrick J. Wong
2021-08-03 0:17 ` [PATCH v10] iomap: support reading inline data from non-zero pos Gao Xiang
0 siblings, 2 replies; 417+ messages in thread
From: Gao Xiang @ 2021-08-02 12:15 UTC (permalink / raw)
To: Andreas Gruenbacher, Darrick J . Wong
Cc: linux-erofs, linux-fsdevel, LKML, Huang Jianan, Joseph Qi,
Christoph Hellwig, Matthew Wilcox
Hi Andreas,
On Sun, Aug 01, 2021 at 02:03:33PM +0200, Andreas Gruenbacher wrote:
> On Tue, Jul 27, 2021 at 5:00 AM Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
...
> > +static int iomap_write_begin_inline(struct inode *inode,
> > + struct page *page, struct iomap *srcmap)
> > +{
> > + /* needs more work for the tailpacking case, disable for now */
>
> Nit: the comma should be a semicolon or period here.
Sorry for some delay (busy in other things...)
Yeah, that's fine to me, in English contexts it'd be better like that
(there is some punctuation rule difference between languages.)
Hi Darrick,
Should I resend v10 for this punctuation change or could you kindly
help revise this?
(btw, would you mind set up a for-next iomap branch so I could rebase
other EROFS patches on iomap for-next, thank you very much!)
Thanks,
Gao Xiang
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH v9] iomap: Support file tail packing
2021-08-02 12:15 ` Gao Xiang
@ 2021-08-02 21:58 ` Darrick J. Wong
2021-08-03 0:17 ` [PATCH v10] iomap: support reading inline data from non-zero pos Gao Xiang
1 sibling, 0 replies; 417+ messages in thread
From: Darrick J. Wong @ 2021-08-02 21:58 UTC (permalink / raw)
To: Andreas Gruenbacher, linux-erofs, linux-fsdevel, LKML,
Huang Jianan, Joseph Qi, Christoph Hellwig, Matthew Wilcox
On Mon, Aug 02, 2021 at 08:15:41PM +0800, Gao Xiang wrote:
> Hi Andreas,
>
> On Sun, Aug 01, 2021 at 02:03:33PM +0200, Andreas Gruenbacher wrote:
> > On Tue, Jul 27, 2021 at 5:00 AM Gao Xiang <hsiangkao@linux.alibaba.com> wrote:
>
> ...
>
> > > +static int iomap_write_begin_inline(struct inode *inode,
> > > + struct page *page, struct iomap *srcmap)
> > > +{
> > > + /* needs more work for the tailpacking case, disable for now */
> >
> > Nit: the comma should be a semicolon or period here.
>
> Sorry for some delay (busy in other things...)
>
> Yeah, that's fine to me, in English contexts it'd be better like that
> (there is some punctuation rule difference between languages.)
>
>
> Hi Darrick,
> Should I resend v10 for this punctuation change or could you kindly
> help revise this?
>
> (btw, would you mind set up a for-next iomap branch so I could rebase
> other EROFS patches on iomap for-next, thank you very much!)
Er... I'll send a group email shortly. Assembling the branch hasn't
been as straightforward as it usually is...
--D
> Thanks,
> Gao Xiang
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH v10] iomap: support reading inline data from non-zero pos
2021-08-02 12:15 ` Gao Xiang
2021-08-02 21:58 ` Darrick J. Wong
@ 2021-08-03 0:17 ` Gao Xiang
1 sibling, 0 replies; 417+ messages in thread
From: Gao Xiang @ 2021-08-03 0:17 UTC (permalink / raw)
To: linux-erofs, linux-fsdevel
Cc: LKML, Huang Jianan, Joseph Qi, Gao Xiang, Darrick J . Wong,
Christoph Hellwig, Matthew Wilcox, Andreas Gruenbacher
The existing inline data support only works for cases where the entire
file is stored as inline data. For larger files, EROFS stores the
initial blocks separately and the remainder of the file ("file tail")
adjacent to the inode. Generalise inline data to allow reading the
inline file tail. Tails may not cross a page boundary in memory.
We currently have no filesystems that support tails and writing,
so that case is currently disabled (see iomap_write_begin_inline).
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
changes since v9:
- update commit message suggested by Darrick and
collect his RVB;
- update a semicolon suggested by Andreas.
fs/iomap/buffered-io.c | 42 ++++++++++++++++++++++++++++++------------
fs/iomap/direct-io.c | 10 ++++++----
include/linux/iomap.h | 18 ++++++++++++++++++
3 files changed, 54 insertions(+), 16 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 87ccb3438bec..dd1e2cbec5a0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -205,25 +205,32 @@ struct iomap_readpage_ctx {
struct readahead_control *rac;
};
-static void
-iomap_read_inline_data(struct inode *inode, struct page *page,
+static int iomap_read_inline_data(struct inode *inode, struct page *page,
struct iomap *iomap)
{
- size_t size = i_size_read(inode);
+ size_t size = i_size_read(inode) - iomap->offset;
void *addr;
if (PageUptodate(page))
- return;
+ return 0;
- BUG_ON(page_has_private(page));
- BUG_ON(page->index);
- BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ /* inline data must start page aligned in the file */
+ if (WARN_ON_ONCE(offset_in_page(iomap->offset)))
+ return -EIO;
+ if (WARN_ON_ONCE(size > PAGE_SIZE -
+ offset_in_page(iomap->inline_data)))
+ return -EIO;
+ if (WARN_ON_ONCE(size > iomap->length))
+ return -EIO;
+ if (WARN_ON_ONCE(page_has_private(page)))
+ return -EIO;
addr = kmap_atomic(page);
memcpy(addr, iomap->inline_data, size);
memset(addr + size, 0, PAGE_SIZE - size);
kunmap_atomic(addr);
SetPageUptodate(page);
+ return 0;
}
static inline bool iomap_block_needs_zeroing(struct inode *inode,
@@ -247,8 +254,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
sector_t sector;
if (iomap->type == IOMAP_INLINE) {
- WARN_ON_ONCE(pos);
- iomap_read_inline_data(inode, page, iomap);
+ int ret = iomap_read_inline_data(inode, page, iomap);
+
+ if (ret)
+ return ret;
return PAGE_SIZE;
}
@@ -589,6 +598,15 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
return 0;
}
+static int iomap_write_begin_inline(struct inode *inode,
+ struct page *page, struct iomap *srcmap)
+{
+ /* needs more work for the tailpacking case; disable for now */
+ if (WARN_ON_ONCE(srcmap->offset != 0))
+ return -EIO;
+ return iomap_read_inline_data(inode, page, srcmap);
+}
+
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
@@ -618,7 +636,7 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
}
if (srcmap->type == IOMAP_INLINE)
- iomap_read_inline_data(inode, page, srcmap);
+ status = iomap_write_begin_inline(inode, page, srcmap);
else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(page, pos, len, NULL, srcmap);
else
@@ -671,11 +689,11 @@ static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
void *addr;
WARN_ON_ONCE(!PageUptodate(page));
- BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ BUG_ON(!iomap_inline_data_valid(iomap));
flush_dcache_page(page);
addr = kmap_atomic(page);
- memcpy(iomap->inline_data + pos, addr + pos, copied);
+ memcpy(iomap_inline_data(iomap, pos), addr + pos, copied);
kunmap_atomic(addr);
mark_inode_dirty(inode);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 9398b8c31323..41ccbfc9dc82 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -378,23 +378,25 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
struct iomap_dio *dio, struct iomap *iomap)
{
struct iov_iter *iter = dio->submit.iter;
+ void *inline_data = iomap_inline_data(iomap, pos);
size_t copied;
- BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data));
+ if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
+ return -EIO;
if (dio->flags & IOMAP_DIO_WRITE) {
loff_t size = inode->i_size;
if (pos > size)
- memset(iomap->inline_data + size, 0, pos - size);
- copied = copy_from_iter(iomap->inline_data + pos, length, iter);
+ memset(iomap_inline_data(iomap, size), 0, pos - size);
+ copied = copy_from_iter(inline_data, length, iter);
if (copied) {
if (pos + copied > size)
i_size_write(inode, pos + copied);
mark_inode_dirty(inode);
}
} else {
- copied = copy_to_iter(iomap->inline_data + pos, length, iter);
+ copied = copy_to_iter(inline_data, length, iter);
}
dio->size += copied;
return copied;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 479c1da3e221..b8ec145b2975 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -97,6 +97,24 @@ iomap_sector(struct iomap *iomap, loff_t pos)
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
+/*
+ * Returns the inline data pointer for logical offset @pos.
+ */
+static inline void *iomap_inline_data(struct iomap *iomap, loff_t pos)
+{
+ return iomap->inline_data + pos - iomap->offset;
+}
+
+/*
+ * Check if the mapping's length is within the valid range for inline data.
+ * This is used to guard against accessing data beyond the page inline_data
+ * points at.
+ */
+static inline bool iomap_inline_data_valid(struct iomap *iomap)
+{
+ return iomap->length <= PAGE_SIZE - offset_in_page(iomap->inline_data);
+}
+
/*
* When a filesystem sets page_ops in an iomap mapping it returns, page_prepare
* and page_done will be called for each page written to. This only applies to
--
2.24.4
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re: [PATCH 2/3] net: microchip_t1s: add support for LAN867x Rev.C1
@ 2023-11-27 13:37 Andrew Lunn
[not found] ` <20231205102039.2917039-1-felix@piedallu.me>
0 siblings, 1 reply; 417+ messages in thread
From: Andrew Lunn @ 2023-11-27 13:37 UTC (permalink / raw)
To: Ramón N.Rodriguez
Cc: Heiner Kallweit, Russell King, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, netdev, linux-kernel
> #define PHY_ID_LAN867X_REVB1 0x0007C162
> +#define PHY_ID_LAN867X_REVC1 0x0007C164
So there is a gap in the revisions. Maybe a B2 exists?
> +static int lan867x_revc1_read_fixup_value(struct phy_device *phydev, u16 addr)
> +{
> + int regval;
> + /* The AN pretty much just states 'trust us' regarding these magic vals */
> + const u16 magic_or = 0xE0;
> + const u16 magic_reg_mask = 0x1F;
> + const u16 magic_check_mask = 0x10;
Reverse christmass tree please. Longest first, shorted last.
> + regval = lan865x_revb0_indirect_read(phydev, addr);
> + if (regval < 0)
> + return regval;
> +
> + regval &= magic_reg_mask;
> +
> + return (regval & magic_check_mask) ? regval | magic_or : regval;
> +}
> +
> +static int lan867x_revc1_config_init(struct phy_device *phydev)
> +{
> + int err;
> + int regval;
> + u16 override0;
> + u16 override1;
> + const u16 override_addr0 = 0x4;
> + const u16 override_addr1 = 0x8;
> + const u8 index_to_override0 = 2;
> + const u8 index_to_override1 = 3;
Same here.
> +
> + err = lan867x_wait_for_reset_complete(phydev);
> + if (err)
> + return err;
> +
> + /* The application note specifies a super convenient process
> + * where 2 of the fixup regs needs a write with a value that is
> + * a modified result of another reg read.
> + * Enjoy the magic show.
> + */
I really do hope that by revision D1 they get the firmware sorted out
so none of this undocumented magic is needed.
Andrew
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2023-11-11 4:21 Andrew Worsley
2023-11-11 8:22 ` Javier Martinez Canillas
0 siblings, 1 reply; 417+ messages in thread
From: Andrew Worsley @ 2023-11-11 4:21 UTC (permalink / raw)
To: Thomas Zimmermann, Javier Martinez Canillas, Maarten Lankhorst,
Maxime Ripard, David Airlie, Daniel Vetter,
open list:DRM DRIVER FOR FIRMWARE FRAMEBUFFERS, open list
This patch fix's the failure of the frame buffer driver on my Asahi kernel
which prevented X11 from starting on my Apple M1 laptop. It seems like a straight
forward failure to follow the procedure described in drivers/video/aperture.c
to remove the ealier driver. This patch is very simple and minimal. Very likely
there may be better ways to fix this and very like there may be other drivers
which have the same problem but I submit this so at least there is
an interim fix for my problem.
Thanks
Andrew Worsley
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-11-11 4:21 Andrew Worsley
@ 2023-11-11 8:22 ` Javier Martinez Canillas
0 siblings, 0 replies; 417+ messages in thread
From: Javier Martinez Canillas @ 2023-11-11 8:22 UTC (permalink / raw)
To: Andrew Worsley, Thomas Zimmermann, Maarten Lankhorst,
Maxime Ripard, David Airlie, Daniel Vetter,
open list:DRM DRIVER FOR FIRMWARE FRAMEBUFFERS, open list
Andrew Worsley <amworsley@gmail.com> writes:
Hello Andrew,
> This patch fix's the failure of the frame buffer driver on my Asahi kernel
> which prevented X11 from starting on my Apple M1 laptop. It seems like a straight
> forward failure to follow the procedure described in drivers/video/aperture.c
> to remove the ealier driver. This patch is very simple and minimal. Very likely
> there may be better ways to fix this and very like there may be other drivers
> which have the same problem but I submit this so at least there is
> an interim fix for my problem.
>
Which partch? I think you forgot to include in your email?
> Thanks
>
> Andrew Worsley
>
--
Best regards,
Javier Martinez Canillas
Core Platforms
Red Hat
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <DB3PR10MB6835AF75D60D9A96465F35C2E8AAA@DB3PR10MB6835.EURPRD10.PROD.OUTLOOK.COM>]
* PIC probing code from e179f6914152 failing
@ 2023-10-18 18:50 Mario Limonciello
2023-10-18 22:50 ` Thomas Gleixner
0 siblings, 1 reply; 417+ messages in thread
From: Mario Limonciello @ 2023-10-18 18:50 UTC (permalink / raw)
To: Hans de Goede, kys, tglx, hpa; +Cc: x86, LKML, Petkov, Borislav
Hi,
Recently an issue was reported to Bugzilla [1] that the Keyboard (IRQ 1)
and GPIO controller (IRQ 7) weren't working properly on two separate
Lenovo machines. The issues are unique to Linux.
In digging through them, they happen because Lenovo didn't set up the
PIC in the BIOS.
Specifically the PIC probing code introduced by e179f6914152 ("x86, irq,
pic: Probe for legacy PIC and set legacy_pic appropriately") expects
that the BIOS sets up the PIC and uses that assertion to let Linux set
it up.
One of the reporters confirmed that reverting e179f6914152 fixes the
issue. Keyboard and GPIO controller both work properly.
I wanted to ask if we can please revert that and come up with a
different solution for kexec with HyperV.
Can guests instead perhaps detect in early boot code they're running in
an EFI based hypervisor and explicitly set 'legacy_pic = &null_legacy_pic;'?
[1] https://bugzilla.kernel.org/show_bug.cgi?id=218003
Thanks,
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-18 18:50 PIC probing code from e179f6914152 failing Mario Limonciello
@ 2023-10-18 22:50 ` Thomas Gleixner
2023-10-19 21:20 ` Mario Limonciello
0 siblings, 1 reply; 417+ messages in thread
From: Thomas Gleixner @ 2023-10-18 22:50 UTC (permalink / raw)
To: Mario Limonciello, Hans de Goede, kys, hpa; +Cc: x86, LKML, Borislav Petkov
On Wed, Oct 18 2023 at 13:50, Mario Limonciello wrote:
> Recently an issue was reported to Bugzilla [1] that the Keyboard (IRQ 1)
> and GPIO controller (IRQ 7) weren't working properly on two separate
> Lenovo machines. The issues are unique to Linux.
>
> In digging through them, they happen because Lenovo didn't set up the
> PIC in the BIOS.
> Specifically the PIC probing code introduced by e179f6914152 ("x86, irq,
> pic: Probe for legacy PIC and set legacy_pic appropriately") expects
> that the BIOS sets up the PIC and uses that assertion to let Linux set
> it up.
>
> One of the reporters confirmed that reverting e179f6914152 fixes the
> issue. Keyboard and GPIO controller both work properly.
>
> I wanted to ask if we can please revert that and come up with a
> different solution for kexec with HyperV.
> Can guests instead perhaps detect in early boot code they're running in
> an EFI based hypervisor and explicitly set 'legacy_pic = &null_legacy_pic;'?
No. This detection mechanism prevents PIC usage also in other
scenarios.
It's perfectly valid code and the assumption that you can read back what
you wrote to the master IMR is entirely correct. If that's not the case
then there is no PIC, the BIOS has disabled some parts of the legacy
block or did not initialize it.
Letting the kernel blindly assume that there is always a PIC is just
wrong. Worse, it puts the burden on everyone else to sprinkle
"legacy_pic = null_pic;" all over the place with dubious
conditions. That's exactly what the commit in question avoided.
So no, we are not going back there.
We could obviously change the probe() function to issue a full PIC
initialization sequence before reading a known written value
back. That's basically what the revert causes to happen via
legacy_pic->init().
But I fundamentally hate to do that because forcing the init sequence
just to make presumably broken code which has some dubious dependencies
on the PIC or on nr_legacy_irqs > 0 happy is not really a solution when
the PIC is actually not needed at all. For anything modern where all
legacy interrupts are routed to the IO/APIC the PIC does not make any
sense whatsoever.
We rather go and fix the real underlying problem.
The kernel can handle the legacy interrupts perfectly fine through the
IOAPIC. There is no hard requirement for the PIC at all except for
really old systems which had the timer interrupt wired to the PIC and
therefore required to route the timer interrupt through the PIC instead
of the IO/APIC or did not provide routing entries for the IO/APIC. See
the horrible hackery in check_timer() and the grossly misnomed
init_IO_APIC_traps().
I just took a random machine, forced the NULL PIC and added
'apic=verbose' to the kernel command line and magically all the legacy
interrupts are set up via IO/APIC despite the NULL PIC and therefore 0
preallocated legacy interrupts.
apic 0 pin 0 not connected
IOAPIC[0]: Preconfigured routing entry (0-1 -> IRQ 1 Level:0 ActiveLow:0)
IOAPIC[0]: Preconfigured routing entry (0-2 -> IRQ 2 Level:0 ActiveLow:0)
IOAPIC[0]: Preconfigured routing entry (0-3 -> IRQ 3 Level:0 ActiveLow:0)
...
which is identical to the output with PIC enabled. That debug message is
emitted from mp_irqdomain_alloc() which is invoked via the PNP resource
management code.
Now /proc/interrupts:
CPU0 CPU1 CPU2 CPU3
1: 0 56 0 0 IO-APIC 1-edge i8042
4: 442 0 0 0 IO-APIC 4-edge ttyS0
8: 0 0 0 0 IO-APIC 8-edge rtc0
9: 0 0 0 0 IO-APIC 9-fasteoi acpi
12: 0 0 122 0 IO-APIC 12-edge i8042
Keyboard and serial are working, see?
The only interrupt which does not work is interrupt 0 because nothing
allocates interrupt 0 due to nr_legacy_irqs == 0, but that's a trivially
solvable problem.
That machine does not even need the timer interrupt because it has a
usable APIC and TSC deadline timer, so no APIC timer calibration
required. The same is true for CPUs which do not have a TSC deadline
timer, but enumerate the APIC frequency via CPUID or MSRs.
But that brings up an interesting question. How are those affected
machines even reaching a state where the user notices that just the
keyboard and the GPIO are not working? Why?
The CPUID/MSR APIC frequency enumeration is Intel specific and
everything else depends on a working timer interrupt to calibrate the
APIC timer frequency. So AMD CPUs require the timer interrupt to
work. The only explanation why this "works" in that null PIC case is
that the PIT/HPET interrupt is actually wired to pin 0, but that's
something to be determined...
Can I please get the following information from an affected machine:
1) dmesg with 'apic=verbose' on the command line
2) /proc/interrupts
3) /sys/kernel/debug/irq/irqs/{0..15}
#3 requires CONFIG_GENERIC_IRQ_DEBUGFS to be set.
Two versions of that please:
1) Unpatched kernel
2) Patched kernel
Thanks,
tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-18 22:50 ` Thomas Gleixner
@ 2023-10-19 21:20 ` Mario Limonciello
2023-10-23 15:59 ` Thomas Gleixner
0 siblings, 1 reply; 417+ messages in thread
From: Mario Limonciello @ 2023-10-19 21:20 UTC (permalink / raw)
To: Thomas Gleixner, Hans de Goede, kys, hpa; +Cc: x86, LKML, Borislav Petkov
On 10/18/2023 17:50, Thomas Gleixner wrote:
> On Wed, Oct 18 2023 at 13:50, Mario Limonciello wrote:
>> Recently an issue was reported to Bugzilla [1] that the Keyboard (IRQ 1)
>> and GPIO controller (IRQ 7) weren't working properly on two separate
>> Lenovo machines. The issues are unique to Linux.
>>
>> In digging through them, they happen because Lenovo didn't set up the
>> PIC in the BIOS.
>> Specifically the PIC probing code introduced by e179f6914152 ("x86, irq,
>> pic: Probe for legacy PIC and set legacy_pic appropriately") expects
>> that the BIOS sets up the PIC and uses that assertion to let Linux set
>> it up.
>>
>> One of the reporters confirmed that reverting e179f6914152 fixes the
>> issue. Keyboard and GPIO controller both work properly.
>>
>> I wanted to ask if we can please revert that and come up with a
>> different solution for kexec with HyperV.
>> Can guests instead perhaps detect in early boot code they're running in
>> an EFI based hypervisor and explicitly set 'legacy_pic = &null_legacy_pic;'?
>
> No. This detection mechanism prevents PIC usage also in other
> scenarios.
>
> It's perfectly valid code and the assumption that you can read back what
> you wrote to the master IMR is entirely correct. If that's not the case
> then there is no PIC, the BIOS has disabled some parts of the legacy
> block or did not initialize it.
>
> Letting the kernel blindly assume that there is always a PIC is just
> wrong. Worse, it puts the burden on everyone else to sprinkle
> "legacy_pic = null_pic;" all over the place with dubious
> conditions. That's exactly what the commit in question avoided.
>
> So no, we are not going back there.
>
> We could obviously change the probe() function to issue a full PIC
> initialization sequence before reading a known written value
> back. That's basically what the revert causes to happen via
> legacy_pic->init().
>
> But I fundamentally hate to do that because forcing the init sequence
> just to make presumably broken code which has some dubious dependencies
> on the PIC or on nr_legacy_irqs > 0 happy is not really a solution when
> the PIC is actually not needed at all. For anything modern where all
> legacy interrupts are routed to the IO/APIC the PIC does not make any
> sense whatsoever.
>
> We rather go and fix the real underlying problem.
Looking at the logs from David and also trying to mock up the failure on
my side I have a few findings I'll share, please agree or disagree with
them.
>
> The kernel can handle the legacy interrupts perfectly fine through the
> IOAPIC. There is no hard requirement for the PIC at all except for
> really old systems which had the timer interrupt wired to the PIC and
> therefore required to route the timer interrupt through the PIC instead
> of the IO/APIC or did not provide routing entries for the IO/APIC. See
> the horrible hackery in check_timer() and the grossly misnomed
> init_IO_APIC_traps().
>
> I just took a random machine, forced the NULL PIC and added
> 'apic=verbose' to the kernel command line and magically all the legacy
> interrupts are set up via IO/APIC despite the NULL PIC and therefore 0
> preallocated legacy interrupts.
>
> apic 0 pin 0 not connected
> IOAPIC[0]: Preconfigured routing entry (0-1 -> IRQ 1 Level:0 ActiveLow:0)
> IOAPIC[0]: Preconfigured routing entry (0-2 -> IRQ 2 Level:0 ActiveLow:0)
> IOAPIC[0]: Preconfigured routing entry (0-3 -> IRQ 3 Level:0 ActiveLow:0)
> ...
>
> which is identical to the output with PIC enabled. That debug message is
> emitted from mp_irqdomain_alloc() which is invoked via the PNP resource
> management code.
>
> Now /proc/interrupts:
>
> CPU0 CPU1 CPU2 CPU3
> 1: 0 56 0 0 IO-APIC 1-edge i8042
> 4: 442 0 0 0 IO-APIC 4-edge ttyS0
> 8: 0 0 0 0 IO-APIC 8-edge rtc0
> 9: 0 0 0 0 IO-APIC 9-fasteoi acpi
> 12: 0 0 122 0 IO-APIC 12-edge i8042
>
> Keyboard and serial are working, see?
>
> The only interrupt which does not work is interrupt 0 because nothing
> allocates interrupt 0 due to nr_legacy_irqs == 0, but that's a trivially
> solvable problem.
From David's logs I can see that the timer interrupt gets wired up to
IRQ2 instead of IRQ0.
IOAPIC[0]: Preconfigured routing entry (33-2 -> IRQ 2 Level:0 ActiveLow:0)
In my hacked up forcing NULL pic case this fixes that:
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 43c1c24e934b..885687e64e4e 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -424,7 +424,7 @@ static int legacy_pic_probe(void)
}
struct legacy_pic null_legacy_pic = {
- .nr_legacy_irqs = 0,
+ .nr_legacy_irqs = 1,
.chip = &dummy_irq_chip,
.mask = legacy_pic_uint_noop,
.unmask = legacy_pic_uint_noop,
I think it's cleaner than changing all the places that use
nr_legacy_irqs(). On my side this makes:
IOAPIC[0]: Preconfigured routing entry (33-2 -> IRQ 0 Level:0 ActiveLow:0)
>
> That machine does not even need the timer interrupt because it has a
> usable APIC and TSC deadline timer, so no APIC timer calibration
> required. The same is true for CPUs which do not have a TSC deadline
> timer, but enumerate the APIC frequency via CPUID or MSRs.
Don't you need it for things like rtcwake to be able to work?
>
> But that brings up an interesting question. How are those affected
> machines even reaching a state where the user notices that just the
> keyboard and the GPIO are not working? Why?
So the GPIO controller driver (pinctrl-amd) uses platform_get_irq() to
try to discover the IRQ to use.
This calls acpi_irq_get() which isn't implemented on x86 (hardcodes
-EINVAL).
I can "work around it" by:
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 76bfcba25003..2b4b436c65d8 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -187,7 +187,8 @@ int platform_get_irq_optional(struct platform_device
*dev, unsigned int num)
}
r = platform_get_resource(dev, IORESOURCE_IRQ, num);
- if (has_acpi_companion(&dev->dev)) {
+ if (IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) &&
+ has_acpi_companion(&dev->dev)) {
if (r && r->flags & IORESOURCE_DISABLED) {
ret = acpi_irq_get(ACPI_HANDLE(&dev->dev), num, r);
if (ret)
but the resource that is returned from the next hunk has the resource
flags set wrong in the NULL pic case:
NULL case:
r: AMDI0030:00 flags: 0x30000418
PIC case:
r: AMDI0030:00 flags: 0x418
IOW NULL pic case has IORESOURCE_DISABLED / IORESOURCE_UNSET
This then later the GPIO controller interrupts are not actually working.
For example the attn pin for my I2C touchpad doesn't work.
Checking the debugfs with my "work around" in place I can see a few
things set up differently:
NULL case:
handler: handle_edge_irq
dstate: 0x3740c208
IRQ_TYPE_LEVEL_LOW
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_MOVE_PCNTXT
IRQD_AFFINITY_ON_ACTIVATE
IRQD_CAN_RESERVE
IRQD_WAKEUP_STATE
IRQD_DEFAULT_TRIGGER_SET
IRQD_HANDLE_ENFORCE_IRQCTX
PIC case:
handler: handle_fasteoi_irq
dstate: 0x3740e208
IRQ_TYPE_LEVEL_LOW
IRQD_LEVEL
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_MOVE_PCNTXT
IRQD_AFFINITY_ON_ACTIVATE
IRQD_CAN_RESERVE
IRQD_WAKEUP_STATE
IRQD_DEFAULT_TRIGGER_SET
IRQD_HANDLE_ENFORCE_IRQCTX
I guess something related to the callpath for mp_register_handler().
Maybe this is the same reason for the keyboard not working right.
>
> The CPUID/MSR APIC frequency enumeration is Intel specific and
> everything else depends on a working timer interrupt to calibrate the
> APIC timer frequency. So AMD CPUs require the timer interrupt to
> work. The only explanation why this "works" in that null PIC case is
> that the PIT/HPET interrupt is actually wired to pin 0, but that's
> something to be determined...
>
> Can I please get the following information from an affected machine:
>
> 1) dmesg with 'apic=verbose' on the command line
> 2) /proc/interrupts
> 3) /sys/kernel/debug/irq/irqs/{0..15}
>
> #3 requires CONFIG_GENERIC_IRQ_DEBUGFS to be set.
>
> Two versions of that please:
>
> 1) Unpatched kernel
> 2) Patched kernel
>
> Thanks,
>
> tglx
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-19 21:20 ` Mario Limonciello
@ 2023-10-23 15:59 ` Thomas Gleixner
2023-10-25 9:23 ` Thomas Gleixner
0 siblings, 1 reply; 417+ messages in thread
From: Thomas Gleixner @ 2023-10-23 15:59 UTC (permalink / raw)
To: Mario Limonciello, Hans de Goede, kys, hpa; +Cc: x86, LKML, Borislav Petkov
On Thu, Oct 19 2023 at 16:20, Mario Limonciello wrote:
> On 10/18/2023 17:50, Thomas Gleixner wrote:
>> The only interrupt which does not work is interrupt 0 because nothing
>> allocates interrupt 0 due to nr_legacy_irqs == 0, but that's a trivially
>> solvable problem.
>
> From David's logs I can see that the timer interrupt gets wired up to
> IRQ2 instead of IRQ0.
Sure, but that's not really a problem. Nothing needs the timer
interrupt in principle.
> IOAPIC[0]: Preconfigured routing entry (33-2 -> IRQ 2 Level:0 ActiveLow:0)
>
> In my hacked up forcing NULL pic case this fixes that:
>
> diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
> index 43c1c24e934b..885687e64e4e 100644
> --- a/arch/x86/kernel/i8259.c
> +++ b/arch/x86/kernel/i8259.c
> @@ -424,7 +424,7 @@ static int legacy_pic_probe(void)
> }
>
> struct legacy_pic null_legacy_pic = {
> - .nr_legacy_irqs = 0,
> + .nr_legacy_irqs = 1,
> .chip = &dummy_irq_chip,
> .mask = legacy_pic_uint_noop,
> .unmask = legacy_pic_uint_noop,
>
> I think it's cleaner than changing all the places that use
> nr_legacy_irqs().
No. It's not cleaner. It's a hack and you still need to audit all places
which depend on nr_legacy_irqs(). Also why '1'? You could as well use
'16', no?
> On my side this makes:
>
> IOAPIC[0]: Preconfigured routing entry (33-2 -> IRQ 0 Level:0
> ActiveLow:0)
Sure, but that can be achieved by other means in a clean way as
well. Can we please focus on analyzing the underlying problems instead
of trying random hacks? The timer part is well understood already.
>> That machine does not even need the timer interrupt because it has a
>> usable APIC and TSC deadline timer, so no APIC timer calibration
>> required. The same is true for CPUs which do not have a TSC deadline
>> timer, but enumerate the APIC frequency via CPUID or MSRs.
>
> Don't you need it for things like rtcwake to be able to work?
Timer != RTC.
The RTC interrupt is separate (IRQ 8), but in the case of this system it
is using the HPET-RTC emulation which fails to initialize because
interrupt 0 is not available.
>> But that brings up an interesting question. How are those affected
>> machines even reaching a state where the user notices that just the
>> keyboard and the GPIO are not working? Why?
>
> So the GPIO controller driver (pinctrl-amd) uses platform_get_irq() to
> try to discover the IRQ to use.
>
> This calls acpi_irq_get() which isn't implemented on x86 (hardcodes
> -EINVAL).
>
> I can "work around it" by:
>
> diff --git a/drivers/base/platform.c b/drivers/base/platform.c
> index 76bfcba25003..2b4b436c65d8 100644
> --- a/drivers/base/platform.c
> +++ b/drivers/base/platform.c
> @@ -187,7 +187,8 @@ int platform_get_irq_optional(struct platform_device
> *dev, unsigned int num)
> }
>
> r = platform_get_resource(dev, IORESOURCE_IRQ, num);
> - if (has_acpi_companion(&dev->dev)) {
> + if (IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) &&
> + has_acpi_companion(&dev->dev)) {
> if (r && r->flags & IORESOURCE_DISABLED) {
> ret = acpi_irq_get(ACPI_HANDLE(&dev->dev), num, r);
> if (ret)
So why is acpi_irq_get() reached when the PIC is disabled, but not when
the PIC is enabled? Because of the below:
> but the resource that is returned from the next hunk ?
next hunk? The resource is returned by platform_get_resource() above, no?
> has the resource flags set wrong in the NULL pic case:
>
> NULL case:
> r: AMDI0030:00 flags: 0x30000418
> PIC case:
> r: AMDI0030:00 flags: 0x418
>
> IOW NULL pic case has IORESOURCE_DISABLED / IORESOURCE_UNSET
So the real question is WHY are the DISABLED/UNSET flags not set in the
PIC case?
> NULL case:
> handler: handle_edge_irq
> dstate: 0x3740c208
> IRQ_TYPE_LEVEL_LOW
>
> PIC case:
> handler: handle_fasteoi_irq
> dstate: 0x3740e208
> IRQ_TYPE_LEVEL_LOW
> IRQD_LEVEL
>
> I guess something related to the callpath for mp_register_handler().
Guessing is not helpful.
There is a difference in how the allocation info is set up when legacy
PIC is enabled, but that does not explain the above resource flag
difference.
As there is no override for IRQ7:
[ 0.011415] ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 dfl dfl)
[ 0.011417] Int: type 0, pol 0, trig 0, bus 00, IRQ 00, APIC ID 20, APIC INT 02
[ 0.011418] ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
[ 0.011419] Int: type 0, pol 3, trig 3, bus 00, IRQ 09, APIC ID 20, APIC INT 09
...
[ 0.011425] Int: type 0, pol 0, trig 0, bus 00, IRQ 07, APIC ID 20, APIC INT 07
the initial setup of the IOAPIC interrupt is edge, while the initial
setup of the legacy PIC is level. But that gets changed later to edge
when the IOAPIC is initialized.
I'm not seeing the magic which make the above different yet, though I'm
100% sure by now that this "works" definitely not by design. It just
works by pure luck.
Now when platform_get_irq_optional() sets the trigger type via
irqd_set_trigger_type() it just sets LEVEL_LOW, but does not change the
handler and does not set IRQD_LEVEL. It does neither change the IO/APIC
pin setup. This happens because the IOAPIC interrupt chip does not
implement an irq_set_type() callback.
IOW the whole machinery depends on magic setup ordering vs. PIC and pure
luck. Adding the callback is not rocket science, but while it should
make the interrupt work it still does not explain the magic "working"
when the legacy PIC is enabled.
Let me sit down and add a pile of debug printks to all the relevant
places as we really need to understand the underlying magic effects of
legacy PIC first.
Thanks,
tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-23 15:59 ` Thomas Gleixner
@ 2023-10-25 9:23 ` Thomas Gleixner
2023-10-25 14:41 ` Mario Limonciello
0 siblings, 1 reply; 417+ messages in thread
From: Thomas Gleixner @ 2023-10-25 9:23 UTC (permalink / raw)
To: Mario Limonciello, Hans de Goede, kys, hpa
Cc: x86, LKML, Borislav Petkov, Rafael J. Wysocki
On Mon, Oct 23 2023 at 17:59, Thomas Gleixner wrote:
> On Thu, Oct 19 2023 at 16:20, Mario Limonciello wrote:
>> struct legacy_pic null_legacy_pic = {
>> - .nr_legacy_irqs = 0,
>> + .nr_legacy_irqs = 1,
>> .chip = &dummy_irq_chip,
>> .mask = legacy_pic_uint_noop,
>> .unmask = legacy_pic_uint_noop,
>>
>> I think it's cleaner than changing all the places that use
>> nr_legacy_irqs().
>
> No. It's not cleaner. It's a hack and you still need to audit all places
> which depend on nr_legacy_irqs(). Also why '1'? You could as well use
> '16', no?
So I sat down and did a thorough analysis of legacy PIC dependencies.
Unfortunately this is an unholy mess and sprinkled all over the place,
so there is no trivial way to resolve this quickly. This needs a proper
overhaul to decouple the actual PIC driver selection from the fact that
the kernel runs on a i8259 equipped hardware and therefore needs to
honour the legacy PNP overrides etc.
The probing itself is to stay in order to avoid sprinkling weird
conditions and NULL PIC selections all over the place.
It could be argued that the probe function should try to initialize the
PIC, but that's overkill for scenarios where the PIC does not exist.
Though it turns out that ACPI/MADT is helpful here because the MADT
header has a flags field which denotes in bit 0, whether the system has
a 8259 setup or not.
This allows to override the probe for now until we actually resolved the
dependency problems in a clean way.
Untested patch below.
Thanks,
tglx
---
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -69,6 +69,8 @@ struct legacy_pic {
void (*make_irq)(unsigned int irq);
};
+void legacy_pic_pcat_compat(void);
+
extern struct legacy_pic *legacy_pic;
extern struct legacy_pic null_legacy_pic;
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct
pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+ legacy_pic_pcat_compat();
+
/* ACPI 6.3 and newer support the online capable bit. */
if (acpi_gbl_FADT.header.revision > 6 ||
(acpi_gbl_FADT.header.revision == 6 &&
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,6 +32,7 @@
*/
static void init_8259A(int auto_eoi);
+static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
@@ -299,15 +300,32 @@ static void unmask_8259A(void)
static int probe_8259A(void)
{
+ unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
unsigned long flags;
- unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
- unsigned char new_val;
+
+ /*
+ * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+ * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+ * fails.
+ *
+ * Right now this causes problems as quite some code depends on
+ * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+ * when the system has an IO/APIC because then PIC is not required
+ * at all, except for really old machines where the timer interrupt
+ * must be routed through the PIC. So just pretend that the PIC is
+ * there and let legacy_pic->init() initialize it for nothing.
+ *
+ * Alternatively this could just try to initialize the PIC and
+ * repeat the probe, but for cases where there is no PIC that's
+ * just pointless.
+ */
+ if (pcat_compat)
+ return nr_legacy_irqs();
+
/*
- * Check to see if we have a PIC.
- * Mask all except the cascade and read
- * back the value we just wrote. If we don't
- * have a PIC, we will read 0xff as opposed to the
- * value we wrote.
+ * Check to see if we have a PIC. Mask all except the cascade and
+ * read back the value we just wrote. If we don't have a PIC, we
+ * will read 0xff as opposed to the value we wrote.
*/
raw_spin_lock_irqsave(&i8259A_lock, flags);
@@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void)
return 0;
}
-
device_initcall(i8259A_init_ops);
+
+void __init legacy_pic_pcat_compat(void)
+{
+ pcat_compat = true;
+}
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-25 9:23 ` Thomas Gleixner
@ 2023-10-25 14:41 ` Mario Limonciello
2023-10-25 15:25 ` David Lazar
0 siblings, 1 reply; 417+ messages in thread
From: Mario Limonciello @ 2023-10-25 14:41 UTC (permalink / raw)
To: Thomas Gleixner, Hans de Goede, kys, hpa, dlazar
Cc: x86, LKML, Borislav Petkov, Rafael J. Wysocki
On 10/25/2023 04:23, Thomas Gleixner wrote:
> On Mon, Oct 23 2023 at 17:59, Thomas Gleixner wrote:
>> On Thu, Oct 19 2023 at 16:20, Mario Limonciello wrote:
>>> struct legacy_pic null_legacy_pic = {
>>> - .nr_legacy_irqs = 0,
>>> + .nr_legacy_irqs = 1,
>>> .chip = &dummy_irq_chip,
>>> .mask = legacy_pic_uint_noop,
>>> .unmask = legacy_pic_uint_noop,
>>>
>>> I think it's cleaner than changing all the places that use
>>> nr_legacy_irqs().
>>
>> No. It's not cleaner. It's a hack and you still need to audit all places
>> which depend on nr_legacy_irqs(). Also why '1'? You could as well use
>> '16', no?
>
> So I sat down and did a thorough analysis of legacy PIC dependencies.
>
> Unfortunately this is an unholy mess and sprinkled all over the place,
> so there is no trivial way to resolve this quickly. This needs a proper
> overhaul to decouple the actual PIC driver selection from the fact that
> the kernel runs on a i8259 equipped hardware and therefore needs to
> honour the legacy PNP overrides etc.
>
> The probing itself is to stay in order to avoid sprinkling weird
> conditions and NULL PIC selections all over the place.
>
> It could be argued that the probe function should try to initialize the
> PIC, but that's overkill for scenarios where the PIC does not exist.
>
> Though it turns out that ACPI/MADT is helpful here because the MADT
> header has a flags field which denotes in bit 0, whether the system has
> a 8259 setup or not.
>
> This allows to override the probe for now until we actually resolved the
> dependency problems in a clean way.
>
> Untested patch below.
+David from the bugzilla.
I checked his acpidump and I do think this will work for him.
[024h 0036 4] Local Apic Address : FEE00000
[028h 0040 4] Flags (decoded below) : 00000001
PC-AT Compatibility : 1
David - can you see if the below helps your hardware?
>
> Thanks,
>
> tglx
> ---
> --- a/arch/x86/include/asm/i8259.h
> +++ b/arch/x86/include/asm/i8259.h
> @@ -69,6 +69,8 @@ struct legacy_pic {
> void (*make_irq)(unsigned int irq);
> };
>
> +void legacy_pic_pcat_compat(void);
> +
> extern struct legacy_pic *legacy_pic;
> extern struct legacy_pic null_legacy_pic;
>
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct
> pr_debug("Local APIC address 0x%08x\n", madt->address);
> }
>
> + if (madt->flags & ACPI_MADT_PCAT_COMPAT)
> + legacy_pic_pcat_compat();
> +
> /* ACPI 6.3 and newer support the online capable bit. */
> if (acpi_gbl_FADT.header.revision > 6 ||
> (acpi_gbl_FADT.header.revision == 6 &&
> --- a/arch/x86/kernel/i8259.c
> +++ b/arch/x86/kernel/i8259.c
> @@ -32,6 +32,7 @@
> */
> static void init_8259A(int auto_eoi);
>
> +static bool pcat_compat __ro_after_init;
> static int i8259A_auto_eoi;
> DEFINE_RAW_SPINLOCK(i8259A_lock);
>
> @@ -299,15 +300,32 @@ static void unmask_8259A(void)
>
> static int probe_8259A(void)
> {
> + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
> unsigned long flags;
> - unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
> - unsigned char new_val;
> +
> + /*
> + * If MADT has the PCAT_COMPAT flag set, then do not bother probing
> + * for the PIC. Some BIOSes leave the PIC uninitialized and probing
> + * fails.
> + *
> + * Right now this causes problems as quite some code depends on
> + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
> + * when the system has an IO/APIC because then PIC is not required
> + * at all, except for really old machines where the timer interrupt
> + * must be routed through the PIC. So just pretend that the PIC is
> + * there and let legacy_pic->init() initialize it for nothing.
> + *
> + * Alternatively this could just try to initialize the PIC and
> + * repeat the probe, but for cases where there is no PIC that's
> + * just pointless.
> + */
> + if (pcat_compat)
> + return nr_legacy_irqs();
> +
> /*
> - * Check to see if we have a PIC.
> - * Mask all except the cascade and read
> - * back the value we just wrote. If we don't
> - * have a PIC, we will read 0xff as opposed to the
> - * value we wrote.
> + * Check to see if we have a PIC. Mask all except the cascade and
> + * read back the value we just wrote. If we don't have a PIC, we
> + * will read 0xff as opposed to the value we wrote.
> */
> raw_spin_lock_irqsave(&i8259A_lock, flags);
>
> @@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void)
>
> return 0;
> }
> -
> device_initcall(i8259A_init_ops);
> +
> +void __init legacy_pic_pcat_compat(void)
> +{
> + pcat_compat = true;
> +}
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-25 14:41 ` Mario Limonciello
@ 2023-10-25 15:25 ` David Lazar
2023-10-25 17:31 ` Thomas Gleixner
0 siblings, 1 reply; 417+ messages in thread
From: David Lazar @ 2023-10-25 15:25 UTC (permalink / raw)
To: Mario Limonciello, Thomas Gleixner
Cc: Hans de Goede, kys, hpa, x86, LKML, Borislav Petkov, Rafael J. Wysocki
--- On Wed, 25 Oct 2023, Mario Limonciello wrote:
> David - can you see if the below helps your hardware?
The keyboard and mouse work fine with Thomas' patch.
I've uploaded the debug info to the bug:
https://bugzilla.kernel.org/attachment.cgi?id=305291&action=edit
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: PIC probing code from e179f6914152 failing
2023-10-25 15:25 ` David Lazar
@ 2023-10-25 17:31 ` Thomas Gleixner
2023-10-25 21:04 ` [PATCH] x86/i8259: Skip probing when ACPI/MADT advertises PCAT compatibility, Thomas Gleixner
0 siblings, 1 reply; 417+ messages in thread
From: Thomas Gleixner @ 2023-10-25 17:31 UTC (permalink / raw)
To: David Lazar, Mario Limonciello
Cc: Hans de Goede, kys, hpa, x86, LKML, Borislav Petkov, Rafael J. Wysocki
On Wed, Oct 25 2023 at 17:25, David Lazar wrote:
> --- On Wed, 25 Oct 2023, Mario Limonciello wrote:
>> David - can you see if the below helps your hardware?
>
> The keyboard and mouse work fine with Thomas' patch.
>
> I've uploaded the debug info to the bug:
>
> https://bugzilla.kernel.org/attachment.cgi?id=305291&action=edit
Let me write a changelog then. Unless Rafael has any objections to that
approach.
Thanks,
tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH] x86/i8259: Skip probing when ACPI/MADT advertises PCAT compatibility,
2023-10-25 17:31 ` Thomas Gleixner
@ 2023-10-25 21:04 ` Thomas Gleixner
2023-10-25 22:11 ` Mario Limonciello
0 siblings, 1 reply; 417+ messages in thread
From: Thomas Gleixner @ 2023-10-25 21:04 UTC (permalink / raw)
To: David Lazar, Mario Limonciello
Cc: Hans de Goede, kys, hpa, x86, LKML, Borislav Petkov, Rafael J. Wysocki
David and a few others reported that on certain newer systems some legacy
interrupts fail to work correctly.
Debugging revealed that the BIOS of these systems leaves the legacy PIC in
uninitialized state which makes the PIC detection fail and the kernel
switches to a dummy implementation.
Unfortunately this fallback causes quite some code to fail as it depends on
checks for the number of legacy PIC interrupts or the availability of the
real PIC.
In theory there is no reason to use the PIC on any modern system when
IO/APIC is available, but the dependencies on the related checks cannot be
resolved trivially and on short notice. This needs lots of analysis and
rework.
The PIC detection has been added to avoid quirky checks and force selection
of the dummy implementation all over the place, especially in VM guest
scenarios. So it's not an option to revert the relevant commit as that
would break a lot of other scenarios.
One solution would be to try to initialize the PIC on detection fail and
retry the detection, but that puts the burden on everything which does not
have a PIC.
Fortunately the ACPI/MADT table header has a flag field, which advertises
in bit 0 that the system is PCAT compatible, which means it has a legacy
8259 PIC.
Evaluate that bit and if set avoid the detection routine and keep the real
PIC installed, which then gets initialized (for nothing) and makes the rest
of the code with all the dependencies work again.
Fixes: e179f6914152 ("x86, irq, pic: Probe for legacy PIC and set legacy_pic appropriately")
Reported-by: David Lazar <dlazar@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: David Lazar <dlazar@gmail.com>
Cc: stable@vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218003
---
---
arch/x86/include/asm/i8259.h | 2 ++
arch/x86/kernel/acpi/boot.c | 3 +++
arch/x86/kernel/i8259.c | 38 ++++++++++++++++++++++++++++++--------
3 files changed, 35 insertions(+), 8 deletions(-)
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -69,6 +69,8 @@ struct legacy_pic {
void (*make_irq)(unsigned int irq);
};
+void legacy_pic_pcat_compat(void);
+
extern struct legacy_pic *legacy_pic;
extern struct legacy_pic null_legacy_pic;
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct
pr_debug("Local APIC address 0x%08x\n", madt->address);
}
+ if (madt->flags & ACPI_MADT_PCAT_COMPAT)
+ legacy_pic_pcat_compat();
+
/* ACPI 6.3 and newer support the online capable bit. */
if (acpi_gbl_FADT.header.revision > 6 ||
(acpi_gbl_FADT.header.revision == 6 &&
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -32,6 +32,7 @@
*/
static void init_8259A(int auto_eoi);
+static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);
@@ -299,15 +300,32 @@ static void unmask_8259A(void)
static int probe_8259A(void)
{
+ unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
unsigned long flags;
- unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
- unsigned char new_val;
+
+ /*
+ * If MADT has the PCAT_COMPAT flag set, then do not bother probing
+ * for the PIC. Some BIOSes leave the PIC uninitialized and probing
+ * fails.
+ *
+ * Right now this causes problems as quite some code depends on
+ * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
+ * when the system has an IO/APIC because then PIC is not required
+ * at all, except for really old machines where the timer interrupt
+ * must be routed through the PIC. So just pretend that the PIC is
+ * there and let legacy_pic->init() initialize it for nothing.
+ *
+ * Alternatively this could just try to initialize the PIC and
+ * repeat the probe, but for cases where there is no PIC that's
+ * just pointless.
+ */
+ if (pcat_compat)
+ return nr_legacy_irqs();
+
/*
- * Check to see if we have a PIC.
- * Mask all except the cascade and read
- * back the value we just wrote. If we don't
- * have a PIC, we will read 0xff as opposed to the
- * value we wrote.
+ * Check to see if we have a PIC. Mask all except the cascade and
+ * read back the value we just wrote. If we don't have a PIC, we
+ * will read 0xff as opposed to the value we wrote.
*/
raw_spin_lock_irqsave(&i8259A_lock, flags);
@@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void)
return 0;
}
-
device_initcall(i8259A_init_ops);
+
+void __init legacy_pic_pcat_compat(void)
+{
+ pcat_compat = true;
+}
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-10-25 21:04 ` [PATCH] x86/i8259: Skip probing when ACPI/MADT advertises PCAT compatibility, Thomas Gleixner
@ 2023-10-25 22:11 ` Mario Limonciello
2023-10-26 9:27 ` Re: Thomas Gleixner
0 siblings, 1 reply; 417+ messages in thread
From: Mario Limonciello @ 2023-10-25 22:11 UTC (permalink / raw)
To: Thomas Gleixner, David Lazar
Cc: Hans de Goede, kys, hpa, x86, LKML, Borislav Petkov,
Rafael J. Wysocki, Linux kernel regressions list
On 10/25/2023 16:04, Thomas Gleixner wrote:
> David and a few others reported that on certain newer systems some legacy
> interrupts fail to work correctly.
>
> Debugging revealed that the BIOS of these systems leaves the legacy PIC in
> uninitialized state which makes the PIC detection fail and the kernel
> switches to a dummy implementation.
>
> Unfortunately this fallback causes quite some code to fail as it depends on
> checks for the number of legacy PIC interrupts or the availability of the
> real PIC.
>
> In theory there is no reason to use the PIC on any modern system when
> IO/APIC is available, but the dependencies on the related checks cannot be
> resolved trivially and on short notice. This needs lots of analysis and
> rework.
>
> The PIC detection has been added to avoid quirky checks and force selection
> of the dummy implementation all over the place, especially in VM guest
> scenarios. So it's not an option to revert the relevant commit as that
> would break a lot of other scenarios.
>
> One solution would be to try to initialize the PIC on detection fail and
> retry the detection, but that puts the burden on everything which does not
> have a PIC.
>
> Fortunately the ACPI/MADT table header has a flag field, which advertises
> in bit 0 that the system is PCAT compatible, which means it has a legacy
> 8259 PIC.
>
> Evaluate that bit and if set avoid the detection routine and keep the real
> PIC installed, which then gets initialized (for nothing) and makes the rest
> of the code with all the dependencies work again.
>
> Fixes: e179f6914152 ("x86, irq, pic: Probe for legacy PIC and set legacy_pic appropriately")
> Reported-by: David Lazar <dlazar@gmail.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Tested-by: David Lazar <dlazar@gmail.com>
> Cc: stable@vger.kernel.org
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218003
s/Link/Closes/
Presumably you will add a proper subject when this is committed?
With adding title and fixing that tag:
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
> ---
> ---
> arch/x86/include/asm/i8259.h | 2 ++
> arch/x86/kernel/acpi/boot.c | 3 +++
> arch/x86/kernel/i8259.c | 38 ++++++++++++++++++++++++++++++--------
> 3 files changed, 35 insertions(+), 8 deletions(-)
>
> --- a/arch/x86/include/asm/i8259.h
> +++ b/arch/x86/include/asm/i8259.h
> @@ -69,6 +69,8 @@ struct legacy_pic {
> void (*make_irq)(unsigned int irq);
> };
>
> +void legacy_pic_pcat_compat(void);
> +
> extern struct legacy_pic *legacy_pic;
> extern struct legacy_pic null_legacy_pic;
>
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -148,6 +148,9 @@ static int __init acpi_parse_madt(struct
> pr_debug("Local APIC address 0x%08x\n", madt->address);
> }
>
> + if (madt->flags & ACPI_MADT_PCAT_COMPAT)
> + legacy_pic_pcat_compat();
> +
> /* ACPI 6.3 and newer support the online capable bit. */
> if (acpi_gbl_FADT.header.revision > 6 ||
> (acpi_gbl_FADT.header.revision == 6 &&
> --- a/arch/x86/kernel/i8259.c
> +++ b/arch/x86/kernel/i8259.c
> @@ -32,6 +32,7 @@
> */
> static void init_8259A(int auto_eoi);
>
> +static bool pcat_compat __ro_after_init;
> static int i8259A_auto_eoi;
> DEFINE_RAW_SPINLOCK(i8259A_lock);
>
> @@ -299,15 +300,32 @@ static void unmask_8259A(void)
>
> static int probe_8259A(void)
> {
> + unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR);
> unsigned long flags;
> - unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
> - unsigned char new_val;
> +
> + /*
> + * If MADT has the PCAT_COMPAT flag set, then do not bother probing
> + * for the PIC. Some BIOSes leave the PIC uninitialized and probing
> + * fails.
> + *
> + * Right now this causes problems as quite some code depends on
> + * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly
> + * when the system has an IO/APIC because then PIC is not required
> + * at all, except for really old machines where the timer interrupt
> + * must be routed through the PIC. So just pretend that the PIC is
> + * there and let legacy_pic->init() initialize it for nothing.
> + *
> + * Alternatively this could just try to initialize the PIC and
> + * repeat the probe, but for cases where there is no PIC that's
> + * just pointless.
> + */
> + if (pcat_compat)
> + return nr_legacy_irqs();
> +
> /*
> - * Check to see if we have a PIC.
> - * Mask all except the cascade and read
> - * back the value we just wrote. If we don't
> - * have a PIC, we will read 0xff as opposed to the
> - * value we wrote.
> + * Check to see if we have a PIC. Mask all except the cascade and
> + * read back the value we just wrote. If we don't have a PIC, we
> + * will read 0xff as opposed to the value we wrote.
> */
> raw_spin_lock_irqsave(&i8259A_lock, flags);
>
> @@ -429,5 +447,9 @@ static int __init i8259A_init_ops(void)
>
> return 0;
> }
> -
> device_initcall(i8259A_init_ops);
> +
> +void __init legacy_pic_pcat_compat(void)
> +{
> + pcat_compat = true;
> +}
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-10-25 22:11 ` Mario Limonciello
@ 2023-10-26 9:27 ` Thomas Gleixner
0 siblings, 0 replies; 417+ messages in thread
From: Thomas Gleixner @ 2023-10-26 9:27 UTC (permalink / raw)
To: Mario Limonciello, David Lazar
Cc: Hans de Goede, kys, hpa, x86, LKML, Borislav Petkov,
Rafael J. Wysocki, Linux kernel regressions list
On Wed, Oct 25 2023 at 17:11, Mario Limonciello wrote:
> On 10/25/2023 16:04, Thomas Gleixner wrote:
>> Cc: stable@vger.kernel.org
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218003
>
> s/Link/Closes/
Sure.
> Presumably you will add a proper subject when this is committed?
Bah, yes. I stopped replacing the subject line right after clearing it :(
> With adding title and fixing that tag:
>
> Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <64b09dbb.630a0220.e80b9.e2ed@mx.google.com>]
* Re:
[not found] <64b09dbb.630a0220.e80b9.e2ed@mx.google.com>
@ 2023-07-14 8:05 ` Andy Shevchenko
0 siblings, 0 replies; 417+ messages in thread
From: Andy Shevchenko @ 2023-07-14 8:05 UTC (permalink / raw)
To: luoruihong
Cc: ilpo.jarvinen, gregkh, jirislaby, linux-kernel, linux-serial,
luoruihong, weipengliang, wengjinfei
On Fri, Jul 14, 2023 at 08:58:29AM +0800, luoruihong wrote:
> On Thu, Jul 13, 2023 at 07:51:14PM +0300, Andy Shevchenko wrote:
> > On Thu, Jul 13, 2023 at 08:42:36AM +0800, Ruihong Luo wrote:
> > > Preserve the original value of the Divisor Latch Fraction (DLF) register.
> > > When the DLF register is modified without preservation, it can disrupt
> > > the baudrate settings established by firmware or bootloader, leading to
> > > data corruption and the generation of unreadable or distorted characters.
> >
> > You forgot to add my tag. Why? Do you think the name of variable warrants this?
> > Whatever,
> > Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> >
> > Next time if you don't pick up somebody's tag, care to explain in the changelog
> > why.
> >
> > > Fixes: 701c5e73b296 ("serial: 8250_dw: add fractional divisor support")
> > > Signed-off-by: Ruihong Luo <colorsu1922@gmail.com>
>
> I'm sorry, I didn't know about this rule. Thank you for helping me add
> the missing tags back and for all your previous kind assistance.
For now no need to do anything, just wait for Ilpo's and/or Greg's answer(s),
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2023-05-11 12:58 Ryan Roberts
2023-05-11 13:13 ` Ryan Roberts
0 siblings, 1 reply; 417+ messages in thread
From: Ryan Roberts @ 2023-05-11 12:58 UTC (permalink / raw)
To: Andrew Morton, Matthew Wilcox (Oracle),
Kirill A. Shutemov, SeongJae Park
Cc: Ryan Roberts, linux-kernel, linux-mm, damon
Date: Thu, 11 May 2023 11:38:28 +0100
Subject: [PATCH v1 0/5] Encapsulate PTE contents from non-arch code
Hi All,
This series improves the encapsulation of pte entries by disallowing non-arch
code from directly dereferencing pte_t pointers. Instead code must use a new
helper, `pte_t ptep_deref(pte_t *ptep)`. By default, this helper does a direct
dereference of the pointer, so generated code should be exactly the same. But
it's presence sets us up for arch code being able to override the default to
"virtualize" the ptes without needing to maintain a shadow table.
I intend to take advantage of this for arm64 to enable use of its "contiguous
bit" to coalesce multiple ptes into a single tlb entry, reducing pressure and
improving performance. I have an RFC for the first part of this work at [1]. The
cover letter there also explains the second part, which this series is enabling.
I intend to post an RFC for the contpte changes in due course, but it would be
good to get the ball rolling on this enabler.
There are 2 reasons that I need the encapsulation:
- Prevent leaking the arch-private PTE_CONT bit to the core code. If the core
code reads a pte that contains this bit, it could end up calling
set_pte_at() with the bit set which would confuse the implementation. So we
can always clear PTE_CONT in ptep_deref() (and ptep_get()) to avoid a leaky
abstraction.
- Contiguous ptes have a single access and dirty bit for the contiguous range.
So we need to "mix-in" those bits when the core is dereferencing a pte that
lies in the contig range. There is code that dereferences the pte then takes
different actions based on access/dirty (see e.g. write_protect_page()).
While ptep_get() and ptep_get_lockless() already exist, both of them are
implemented using READ_ONCE() by default. While we could use ptep_get() instead
of the new ptep_deref(), I didn't want to risk performance regression.
Alternatively, all call sites that currently use ptep_get() that need the
lockless behaviour could be upgraded to ptep_get_lockless() and ptep_get() could
be downgraded to a simple dereference. That would be cleanest, but is a much
bigger (and likely error prone) change because all the arch code would need to
be updated for the new definitions of ptep_get().
The series is split up as follows:
patchs 1-2: Fix bugs where code was _setting_ ptes directly, rather than using
set_pte_at() and friends.
patch 3: Fix highmem unmapping issue I spotted while doing the work.
patch 4: Introduce the new ptep_deref() helper with default implementation.
patch 5: Convert all direct dereferences to use ptep_deref().
[1] https://lore.kernel.org/linux-mm/20230414130303.2345383-1-ryan.roberts@arm.com/
Thanks,
Ryan
Ryan Roberts (5):
mm: vmalloc must set pte via arch code
mm: damon must atomically clear young on ptes and pmds
mm: Fix failure to unmap pte on highmem systems
mm: Add new ptep_deref() helper to fully encapsulate pte_t
mm: ptep_deref() conversion
.../drm/i915/gem/selftests/i915_gem_mman.c | 8 +-
drivers/misc/sgi-gru/grufault.c | 2 +-
drivers/vfio/vfio_iommu_type1.c | 7 +-
drivers/xen/privcmd.c | 2 +-
fs/proc/task_mmu.c | 33 +++---
fs/userfaultfd.c | 6 +-
include/linux/hugetlb.h | 2 +-
include/linux/mm_inline.h | 2 +-
include/linux/pgtable.h | 13 ++-
kernel/events/uprobes.c | 2 +-
mm/damon/ops-common.c | 18 ++-
mm/damon/ops-common.h | 4 +-
mm/damon/paddr.c | 6 +-
mm/damon/vaddr.c | 14 ++-
mm/filemap.c | 2 +-
mm/gup.c | 21 ++--
mm/highmem.c | 12 +-
mm/hmm.c | 2 +-
mm/huge_memory.c | 4 +-
mm/hugetlb.c | 2 +-
mm/hugetlb_vmemmap.c | 6 +-
mm/kasan/init.c | 9 +-
mm/kasan/shadow.c | 10 +-
mm/khugepaged.c | 24 ++--
mm/ksm.c | 22 ++--
mm/madvise.c | 6 +-
mm/mapping_dirty_helpers.c | 4 +-
mm/memcontrol.c | 4 +-
mm/memory-failure.c | 6 +-
mm/memory.c | 103 +++++++++---------
mm/mempolicy.c | 6 +-
mm/migrate.c | 14 ++-
mm/migrate_device.c | 14 ++-
mm/mincore.c | 2 +-
mm/mlock.c | 6 +-
mm/mprotect.c | 8 +-
mm/mremap.c | 2 +-
mm/page_table_check.c | 4 +-
mm/page_vma_mapped.c | 26 +++--
mm/pgtable-generic.c | 2 +-
mm/rmap.c | 32 +++---
mm/sparse-vmemmap.c | 8 +-
mm/swap_state.c | 4 +-
mm/swapfile.c | 16 +--
mm/userfaultfd.c | 4 +-
mm/vmalloc.c | 11 +-
mm/vmscan.c | 14 ++-
virt/kvm/kvm_main.c | 9 +-
48 files changed, 302 insertions(+), 236 deletions(-)
--
2.25.1
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-11 12:58 Ryan Roberts
@ 2023-05-11 13:13 ` Ryan Roberts
0 siblings, 0 replies; 417+ messages in thread
From: Ryan Roberts @ 2023-05-11 13:13 UTC (permalink / raw)
To: Andrew Morton, Matthew Wilcox (Oracle),
Kirill A. Shutemov, SeongJae Park
Cc: linux-kernel, linux-mm, damon
My appologies for the noise: A blank line between Cc and Subject has broken the
subject and grouping in lore.
Please Ignore this, I will resend.
On 11/05/2023 13:58, Ryan Roberts wrote:
> Date: Thu, 11 May 2023 11:38:28 +0100
> Subject: [PATCH v1 0/5] Encapsulate PTE contents from non-arch code
>
> Hi All,
>
> This series improves the encapsulation of pte entries by disallowing non-arch
> code from directly dereferencing pte_t pointers. Instead code must use a new
> helper, `pte_t ptep_deref(pte_t *ptep)`. By default, this helper does a direct
> dereference of the pointer, so generated code should be exactly the same. But
> it's presence sets us up for arch code being able to override the default to
> "virtualize" the ptes without needing to maintain a shadow table.
>
> I intend to take advantage of this for arm64 to enable use of its "contiguous
> bit" to coalesce multiple ptes into a single tlb entry, reducing pressure and
> improving performance. I have an RFC for the first part of this work at [1]. The
> cover letter there also explains the second part, which this series is enabling.
>
> I intend to post an RFC for the contpte changes in due course, but it would be
> good to get the ball rolling on this enabler.
>
> There are 2 reasons that I need the encapsulation:
>
> - Prevent leaking the arch-private PTE_CONT bit to the core code. If the core
> code reads a pte that contains this bit, it could end up calling
> set_pte_at() with the bit set which would confuse the implementation. So we
> can always clear PTE_CONT in ptep_deref() (and ptep_get()) to avoid a leaky
> abstraction.
> - Contiguous ptes have a single access and dirty bit for the contiguous range.
> So we need to "mix-in" those bits when the core is dereferencing a pte that
> lies in the contig range. There is code that dereferences the pte then takes
> different actions based on access/dirty (see e.g. write_protect_page()).
>
> While ptep_get() and ptep_get_lockless() already exist, both of them are
> implemented using READ_ONCE() by default. While we could use ptep_get() instead
> of the new ptep_deref(), I didn't want to risk performance regression.
> Alternatively, all call sites that currently use ptep_get() that need the
> lockless behaviour could be upgraded to ptep_get_lockless() and ptep_get() could
> be downgraded to a simple dereference. That would be cleanest, but is a much
> bigger (and likely error prone) change because all the arch code would need to
> be updated for the new definitions of ptep_get().
>
> The series is split up as follows:
>
> patchs 1-2: Fix bugs where code was _setting_ ptes directly, rather than using
> set_pte_at() and friends.
> patch 3: Fix highmem unmapping issue I spotted while doing the work.
> patch 4: Introduce the new ptep_deref() helper with default implementation.
> patch 5: Convert all direct dereferences to use ptep_deref().
>
> [1] https://lore.kernel.org/linux-mm/20230414130303.2345383-1-ryan.roberts@arm.com/
>
> Thanks,
> Ryan
>
>
> Ryan Roberts (5):
> mm: vmalloc must set pte via arch code
> mm: damon must atomically clear young on ptes and pmds
> mm: Fix failure to unmap pte on highmem systems
> mm: Add new ptep_deref() helper to fully encapsulate pte_t
> mm: ptep_deref() conversion
>
> .../drm/i915/gem/selftests/i915_gem_mman.c | 8 +-
> drivers/misc/sgi-gru/grufault.c | 2 +-
> drivers/vfio/vfio_iommu_type1.c | 7 +-
> drivers/xen/privcmd.c | 2 +-
> fs/proc/task_mmu.c | 33 +++---
> fs/userfaultfd.c | 6 +-
> include/linux/hugetlb.h | 2 +-
> include/linux/mm_inline.h | 2 +-
> include/linux/pgtable.h | 13 ++-
> kernel/events/uprobes.c | 2 +-
> mm/damon/ops-common.c | 18 ++-
> mm/damon/ops-common.h | 4 +-
> mm/damon/paddr.c | 6 +-
> mm/damon/vaddr.c | 14 ++-
> mm/filemap.c | 2 +-
> mm/gup.c | 21 ++--
> mm/highmem.c | 12 +-
> mm/hmm.c | 2 +-
> mm/huge_memory.c | 4 +-
> mm/hugetlb.c | 2 +-
> mm/hugetlb_vmemmap.c | 6 +-
> mm/kasan/init.c | 9 +-
> mm/kasan/shadow.c | 10 +-
> mm/khugepaged.c | 24 ++--
> mm/ksm.c | 22 ++--
> mm/madvise.c | 6 +-
> mm/mapping_dirty_helpers.c | 4 +-
> mm/memcontrol.c | 4 +-
> mm/memory-failure.c | 6 +-
> mm/memory.c | 103 +++++++++---------
> mm/mempolicy.c | 6 +-
> mm/migrate.c | 14 ++-
> mm/migrate_device.c | 14 ++-
> mm/mincore.c | 2 +-
> mm/mlock.c | 6 +-
> mm/mprotect.c | 8 +-
> mm/mremap.c | 2 +-
> mm/page_table_check.c | 4 +-
> mm/page_vma_mapped.c | 26 +++--
> mm/pgtable-generic.c | 2 +-
> mm/rmap.c | 32 +++---
> mm/sparse-vmemmap.c | 8 +-
> mm/swap_state.c | 4 +-
> mm/swapfile.c | 16 +--
> mm/userfaultfd.c | 4 +-
> mm/vmalloc.c | 11 +-
> mm/vmscan.c | 14 ++-
> virt/kvm/kvm_main.c | 9 +-
> 48 files changed, 302 insertions(+), 236 deletions(-)
>
> --
> 2.25.1
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH v2] uas: Add US_FL_NO_REPORT_OPCODES for JMicron JMS583Gen 2
@ 2023-03-12 6:52 Greg Kroah-Hartman
2023-03-27 13:54 ` Yaroslav Furman
0 siblings, 1 reply; 417+ messages in thread
From: Greg Kroah-Hartman @ 2023-03-12 6:52 UTC (permalink / raw)
To: Yaroslav Furman; +Cc: Alan Stern, linux-usb, usb-storage, linux-kernel
On Sat, Mar 11, 2023 at 07:12:26PM +0200, Yaroslav Furman wrote:
> Just like other JMicron JMS5xx enclosures, it chokes on report-opcodes,
> let's avoid them.
>
> Signed-off-by: Yaroslav Furman <yaro330@gmail.com>
> ---
> drivers/usb/storage/unusual_uas.h | 7 +++++++
> 1 file changed, 7 insertions(+)
>
> diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h
> index c7b763d6d102..1f8c9b16a0fb 100644
> --- a/drivers/usb/storage/unusual_uas.h
> +++ b/drivers/usb/storage/unusual_uas.h
> @@ -111,6 +111,13 @@ UNUSUAL_DEV(0x152d, 0x0578, 0x0000, 0x9999,
> USB_SC_DEVICE, USB_PR_DEVICE, NULL,
> US_FL_BROKEN_FUA),
>
> +/* Reported by: Yaroslav Furman <yaro330@gmail.com> */
> +UNUSUAL_DEV(0x152d, 0x0583, 0x0000, 0x9999,
> + "JMicron",
> + "JMS583Gen 2",
> + USB_SC_DEVICE, USB_PR_DEVICE, NULL,
> + US_FL_NO_REPORT_OPCODES),
> +
> /* Reported-by: Thinh Nguyen <thinhn@synopsys.com> */
> UNUSUAL_DEV(0x154b, 0xf00b, 0x0000, 0x9999,
> "PNY",
> --
> 2.39.2
>
Hi,
This is the friendly patch-bot of Greg Kroah-Hartman. You have sent him
a patch that has triggered this response. He used to manually respond
to these common problems, but in order to save his sanity (he kept
writing the same thing over and over, yet to different people), I was
created. Hopefully you will not take offence and will fix the problem
in your patch and resubmit it so that it can be accepted into the Linux
kernel tree.
You are receiving this message because of the following common error(s)
as indicated below:
- This looks like a new version of a previously submitted patch, but you
did not list below the --- line any changes from the previous version.
Please read the section entitled "The canonical patch format" in the
kernel file, Documentation/process/submitting-patches.rst for what
needs to be done here to properly describe this.
If you wish to discuss this problem further, or you have questions about
how to resolve this issue, please feel free to respond to this email and
Greg will reply once he has dug out from the pending patches received
from other developers.
thanks,
greg k-h's patch email bot
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH v5 0/5] CXL Poison List Retrieval & Tracing
@ 2023-01-18 20:59 alison.schofield
2023-01-27 1:59 ` Dan Williams
0 siblings, 1 reply; 417+ messages in thread
From: alison.schofield @ 2023-01-18 20:59 UTC (permalink / raw)
To: Dan Williams, Ira Weiny, Vishal Verma, Dave Jiang, Ben Widawsky,
Steven Rostedt
Cc: Alison Schofield, linux-cxl, linux-kernel
From: Alison Schofield <alison.schofield@intel.com>
**RESENDING this cover letter previously mis-threaded.
Changes in v5:
- Rebase on cxl/next
- Use struct_size() to calc mbox cmd payload .min_out
- s/INTERNAL/INJECTED mocked poison record source
- Added Jonathan Reviewed-by tag on Patch 3
Link to v4:
https://lore.kernel.org/linux-cxl/cover.1671135967.git.alison.schofield@intel.com/
Add support for retrieving device poison lists and store the returned
error records as kernel trace events.
The handling of the poison list is guided by the CXL 3.0 Specification
Section 8.2.9.8.4.1. [1]
Example, triggered by memdev:
$ echo 1 > /sys/bus/cxl/devices/mem3/trigger_poison_list
cxl_poison: memdev=mem3 pcidev=cxl_mem.3 region= region_uuid=00000000-0000-0000-0000-000000000000 dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
Example, triggered by region:
$ echo 1 > /sys/bus/cxl/devices/region5/trigger_poison_list
cxl_poison: memdev=mem0 pcidev=cxl_mem.0 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
cxl_poison: memdev=mem1 pcidev=cxl_mem.1 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
[1]: https://www.computeexpresslink.org/download-the-specification
Alison Schofield (5):
cxl/mbox: Add GET_POISON_LIST mailbox command
cxl/trace: Add TRACE support for CXL media-error records
cxl/memdev: Add trigger_poison_list sysfs attribute
cxl/region: Add trigger_poison_list sysfs attribute
tools/testing/cxl: Mock support for Get Poison List
Documentation/ABI/testing/sysfs-bus-cxl | 28 +++++++++
drivers/cxl/core/mbox.c | 78 +++++++++++++++++++++++
drivers/cxl/core/memdev.c | 45 ++++++++++++++
drivers/cxl/core/region.c | 33 ++++++++++
drivers/cxl/core/trace.h | 83 +++++++++++++++++++++++++
drivers/cxl/cxlmem.h | 69 +++++++++++++++++++-
drivers/cxl/pci.c | 4 ++
tools/testing/cxl/test/mem.c | 42 +++++++++++++
8 files changed, 381 insertions(+), 1 deletion(-)
base-commit: 589c3357370a596ef7c99c00baca8ac799fce531
--
2.37.3
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
2023-01-18 20:59 [PATCH v5 0/5] CXL Poison List Retrieval & Tracing alison.schofield
@ 2023-01-27 1:59 ` Dan Williams
2023-01-27 16:10 ` Alison Schofield
0 siblings, 1 reply; 417+ messages in thread
From: Dan Williams @ 2023-01-27 1:59 UTC (permalink / raw)
To: alison.schofield, Dan Williams, Ira Weiny, Vishal Verma,
Dave Jiang, Ben Widawsky, Steven Rostedt
Cc: Alison Schofield, linux-cxl, linux-kernel
alison.schofield@ wrote:
> From: Alison Schofield <alison.schofield@intel.com>
>
> Subject: [PATCH v5 0/5] CXL Poison List Retrieval & Tracing
>
> Changes in v5:
> - Rebase on cxl/next
> - Use struct_size() to calc mbox cmd payload .min_out
> - s/INTERNAL/INJECTED mocked poison record source
> - Added Jonathan Reviewed-by tag on Patch 3
>
> Link to v4:
> https://lore.kernel.org/linux-cxl/cover.1671135967.git.alison.schofield@intel.com/
>
> Add support for retrieving device poison lists and store the returned
> error records as kernel trace events.
>
> The handling of the poison list is guided by the CXL 3.0 Specification
> Section 8.2.9.8.4.1. [1]
>
> Example, triggered by memdev:
> $ echo 1 > /sys/bus/cxl/devices/mem3/trigger_poison_list
> cxl_poison: memdev=mem3 pcidev=cxl_mem.3 region= region_uuid=00000000-0000-0000-0000-000000000000 dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
I think the pcidev= field wants to be called something like "host" or
"parent", because there is no strict requirement that a 'struct
cxl_memdev' is related to a 'struct pci_dev'. In fact in that example
"cxl_mem.3" is a 'struct platform_device'. Now that I think about it, I
think all CXL device events should be emitting the PCIe serial number
for the memdev.
I will look in the implementation, but do region= and region_uuid= get
populated when mem3 is a member of the region?
>
> Example, triggered by region:
> $ echo 1 > /sys/bus/cxl/devices/region5/trigger_poison_list
> cxl_poison: memdev=mem0 pcidev=cxl_mem.0 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> cxl_poison: memdev=mem1 pcidev=cxl_mem.1 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
>
> [1]: https://www.computeexpresslink.org/download-the-specification
>
> Alison Schofield (5):
> cxl/mbox: Add GET_POISON_LIST mailbox command
> cxl/trace: Add TRACE support for CXL media-error records
> cxl/memdev: Add trigger_poison_list sysfs attribute
> cxl/region: Add trigger_poison_list sysfs attribute
> tools/testing/cxl: Mock support for Get Poison List
>
> Documentation/ABI/testing/sysfs-bus-cxl | 28 +++++++++
> drivers/cxl/core/mbox.c | 78 +++++++++++++++++++++++
> drivers/cxl/core/memdev.c | 45 ++++++++++++++
> drivers/cxl/core/region.c | 33 ++++++++++
> drivers/cxl/core/trace.h | 83 +++++++++++++++++++++++++
> drivers/cxl/cxlmem.h | 69 +++++++++++++++++++-
> drivers/cxl/pci.c | 4 ++
> tools/testing/cxl/test/mem.c | 42 +++++++++++++
> 8 files changed, 381 insertions(+), 1 deletion(-)
>
>
> base-commit: 589c3357370a596ef7c99c00baca8ac799fce531
> --
> 2.37.3
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-01-27 1:59 ` Dan Williams
@ 2023-01-27 16:10 ` Alison Schofield
2023-01-27 19:16 ` Re: Dan Williams
0 siblings, 1 reply; 417+ messages in thread
From: Alison Schofield @ 2023-01-27 16:10 UTC (permalink / raw)
To: Dan Williams
Cc: Ira Weiny, Vishal Verma, Dave Jiang, Ben Widawsky,
Steven Rostedt, linux-cxl, linux-kernel
On Thu, Jan 26, 2023 at 05:59:03PM -0800, Dan Williams wrote:
> alison.schofield@ wrote:
> > From: Alison Schofield <alison.schofield@intel.com>
> >
> > Subject: [PATCH v5 0/5] CXL Poison List Retrieval & Tracing
> >
> > Changes in v5:
> > - Rebase on cxl/next
> > - Use struct_size() to calc mbox cmd payload .min_out
> > - s/INTERNAL/INJECTED mocked poison record source
> > - Added Jonathan Reviewed-by tag on Patch 3
> >
> > Link to v4:
> > https://lore.kernel.org/linux-cxl/cover.1671135967.git.alison.schofield@intel.com/
> >
> > Add support for retrieving device poison lists and store the returned
> > error records as kernel trace events.
> >
> > The handling of the poison list is guided by the CXL 3.0 Specification
> > Section 8.2.9.8.4.1. [1]
> >
> > Example, triggered by memdev:
> > $ echo 1 > /sys/bus/cxl/devices/mem3/trigger_poison_list
> > cxl_poison: memdev=mem3 pcidev=cxl_mem.3 region= region_uuid=00000000-0000-0000-0000-000000000000 dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
>
> I think the pcidev= field wants to be called something like "host" or
> "parent", because there is no strict requirement that a 'struct
> cxl_memdev' is related to a 'struct pci_dev'. In fact in that example
> "cxl_mem.3" is a 'struct platform_device'. Now that I think about it, I
> think all CXL device events should be emitting the PCIe serial number
> for the memdev.
]
Will do, 'host' and add PCIe serial no.
>
> I will look in the implementation, but do region= and region_uuid= get
> populated when mem3 is a member of the region?
Not always.
In the case above, where the trigger was by memdev, no.
Region= and region_uuid= (and in the follow-on patch, hpa=) only get
populated if the poison was triggered by region, like the case below.
It could be looked up for the by memdev cases. Is that wanted?
Thanks for the reviews Dan!
>
> >
> > Example, triggered by region:
> > $ echo 1 > /sys/bus/cxl/devices/region5/trigger_poison_list
> > cxl_poison: memdev=mem0 pcidev=cxl_mem.0 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > cxl_poison: memdev=mem1 pcidev=cxl_mem.1 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> >
> > [1]: https://www.computeexpresslink.org/download-the-specification
> >
> > Alison Schofield (5):
> > cxl/mbox: Add GET_POISON_LIST mailbox command
> > cxl/trace: Add TRACE support for CXL media-error records
> > cxl/memdev: Add trigger_poison_list sysfs attribute
> > cxl/region: Add trigger_poison_list sysfs attribute
> > tools/testing/cxl: Mock support for Get Poison List
> >
> > Documentation/ABI/testing/sysfs-bus-cxl | 28 +++++++++
> > drivers/cxl/core/mbox.c | 78 +++++++++++++++++++++++
> > drivers/cxl/core/memdev.c | 45 ++++++++++++++
> > drivers/cxl/core/region.c | 33 ++++++++++
> > drivers/cxl/core/trace.h | 83 +++++++++++++++++++++++++
> > drivers/cxl/cxlmem.h | 69 +++++++++++++++++++-
> > drivers/cxl/pci.c | 4 ++
> > tools/testing/cxl/test/mem.c | 42 +++++++++++++
> > 8 files changed, 381 insertions(+), 1 deletion(-)
> >
> >
> > base-commit: 589c3357370a596ef7c99c00baca8ac799fce531
> > --
> > 2.37.3
> >
>
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-01-27 16:10 ` Alison Schofield
@ 2023-01-27 19:16 ` Dan Williams
2023-01-27 21:36 ` Re: Alison Schofield
0 siblings, 1 reply; 417+ messages in thread
From: Dan Williams @ 2023-01-27 19:16 UTC (permalink / raw)
To: Alison Schofield, Dan Williams
Cc: Ira Weiny, Vishal Verma, Dave Jiang, Ben Widawsky,
Steven Rostedt, linux-cxl, linux-kernel
Alison Schofield wrote:
> On Thu, Jan 26, 2023 at 05:59:03PM -0800, Dan Williams wrote:
> > alison.schofield@ wrote:
> > > From: Alison Schofield <alison.schofield@intel.com>
> > >
> > > Subject: [PATCH v5 0/5] CXL Poison List Retrieval & Tracing
> > >
> > > Changes in v5:
> > > - Rebase on cxl/next
> > > - Use struct_size() to calc mbox cmd payload .min_out
> > > - s/INTERNAL/INJECTED mocked poison record source
> > > - Added Jonathan Reviewed-by tag on Patch 3
> > >
> > > Link to v4:
> > > https://lore.kernel.org/linux-cxl/cover.1671135967.git.alison.schofield@intel.com/
> > >
> > > Add support for retrieving device poison lists and store the returned
> > > error records as kernel trace events.
> > >
> > > The handling of the poison list is guided by the CXL 3.0 Specification
> > > Section 8.2.9.8.4.1. [1]
> > >
> > > Example, triggered by memdev:
> > > $ echo 1 > /sys/bus/cxl/devices/mem3/trigger_poison_list
> > > cxl_poison: memdev=mem3 pcidev=cxl_mem.3 region= region_uuid=00000000-0000-0000-0000-000000000000 dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> >
> > I think the pcidev= field wants to be called something like "host" or
> > "parent", because there is no strict requirement that a 'struct
> > cxl_memdev' is related to a 'struct pci_dev'. In fact in that example
> > "cxl_mem.3" is a 'struct platform_device'. Now that I think about it, I
> > think all CXL device events should be emitting the PCIe serial number
> > for the memdev.
> ]
>
> Will do, 'host' and add PCIe serial no.
>
> >
> > I will look in the implementation, but do region= and region_uuid= get
> > populated when mem3 is a member of the region?
>
> Not always.
> In the case above, where the trigger was by memdev, no.
> Region= and region_uuid= (and in the follow-on patch, hpa=) only get
> populated if the poison was triggered by region, like the case below.
>
> It could be looked up for the by memdev cases. Is that wanted?
Just trying to understand the semantics. However, I do think it makes sense
for a memdev trigger to lookup information on all impacted regions
across all of the device's DPA and the region trigger makes sense to
lookup all memdevs, but bounded by the DPA that contributes to that
region. I just want to avoid someone having to trigger the region to get
extra information that was readily available from a memdev listing.
>
> Thanks for the reviews Dan!
> >
> > >
> > > Example, triggered by region:
> > > $ echo 1 > /sys/bus/cxl/devices/region5/trigger_poison_list
> > > cxl_poison: memdev=mem0 pcidev=cxl_mem.0 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > > cxl_poison: memdev=mem1 pcidev=cxl_mem.1 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > >
> > > [1]: https://www.computeexpresslink.org/download-the-specification
> > >
> > > Alison Schofield (5):
> > > cxl/mbox: Add GET_POISON_LIST mailbox command
> > > cxl/trace: Add TRACE support for CXL media-error records
> > > cxl/memdev: Add trigger_poison_list sysfs attribute
> > > cxl/region: Add trigger_poison_list sysfs attribute
> > > tools/testing/cxl: Mock support for Get Poison List
> > >
> > > Documentation/ABI/testing/sysfs-bus-cxl | 28 +++++++++
> > > drivers/cxl/core/mbox.c | 78 +++++++++++++++++++++++
> > > drivers/cxl/core/memdev.c | 45 ++++++++++++++
> > > drivers/cxl/core/region.c | 33 ++++++++++
> > > drivers/cxl/core/trace.h | 83 +++++++++++++++++++++++++
> > > drivers/cxl/cxlmem.h | 69 +++++++++++++++++++-
> > > drivers/cxl/pci.c | 4 ++
> > > tools/testing/cxl/test/mem.c | 42 +++++++++++++
> > > 8 files changed, 381 insertions(+), 1 deletion(-)
> > >
> > >
> > > base-commit: 589c3357370a596ef7c99c00baca8ac799fce531
> > > --
> > > 2.37.3
> > >
> >
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-01-27 19:16 ` Re: Dan Williams
@ 2023-01-27 21:36 ` Alison Schofield
2023-01-27 22:04 ` Re: Dan Williams
0 siblings, 1 reply; 417+ messages in thread
From: Alison Schofield @ 2023-01-27 21:36 UTC (permalink / raw)
To: Dan Williams
Cc: Ira Weiny, Vishal Verma, Dave Jiang, Ben Widawsky,
Steven Rostedt, linux-cxl, linux-kernel
On Fri, Jan 27, 2023 at 11:16:49AM -0800, Dan Williams wrote:
> Alison Schofield wrote:
> > On Thu, Jan 26, 2023 at 05:59:03PM -0800, Dan Williams wrote:
> > > alison.schofield@ wrote:
> > > > From: Alison Schofield <alison.schofield@intel.com>
> > > >
> > > > Subject: [PATCH v5 0/5] CXL Poison List Retrieval & Tracing
> > > >
> > > > Changes in v5:
> > > > - Rebase on cxl/next
> > > > - Use struct_size() to calc mbox cmd payload .min_out
> > > > - s/INTERNAL/INJECTED mocked poison record source
> > > > - Added Jonathan Reviewed-by tag on Patch 3
> > > >
> > > > Link to v4:
> > > > https://lore.kernel.org/linux-cxl/cover.1671135967.git.alison.schofield@intel.com/
> > > >
> > > > Add support for retrieving device poison lists and store the returned
> > > > error records as kernel trace events.
> > > >
> > > > The handling of the poison list is guided by the CXL 3.0 Specification
> > > > Section 8.2.9.8.4.1. [1]
> > > >
> > > > Example, triggered by memdev:
> > > > $ echo 1 > /sys/bus/cxl/devices/mem3/trigger_poison_list
> > > > cxl_poison: memdev=mem3 pcidev=cxl_mem.3 region= region_uuid=00000000-0000-0000-0000-000000000000 dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > >
> > > I think the pcidev= field wants to be called something like "host" or
> > > "parent", because there is no strict requirement that a 'struct
> > > cxl_memdev' is related to a 'struct pci_dev'. In fact in that example
> > > "cxl_mem.3" is a 'struct platform_device'. Now that I think about it, I
> > > think all CXL device events should be emitting the PCIe serial number
> > > for the memdev.
> > ]
> >
> > Will do, 'host' and add PCIe serial no.
> >
> > >
> > > I will look in the implementation, but do region= and region_uuid= get
> > > populated when mem3 is a member of the region?
> >
> > Not always.
> > In the case above, where the trigger was by memdev, no.
> > Region= and region_uuid= (and in the follow-on patch, hpa=) only get
> > populated if the poison was triggered by region, like the case below.
> >
> > It could be looked up for the by memdev cases. Is that wanted?
>
> Just trying to understand the semantics. However, I do think it makes sense
> for a memdev trigger to lookup information on all impacted regions
> across all of the device's DPA and the region trigger makes sense to
> lookup all memdevs, but bounded by the DPA that contributes to that
> region. I just want to avoid someone having to trigger the region to get
> extra information that was readily available from a memdev listing.
>
Dan -
Confirming my take-away from this email, and our chat:
Remove the by-region trigger_poison_list option entirely. User space
needs to trigger by-memdev the memdevs participating in the region and
filter those events by region.
Add the region info (region name, uuid) to the TRACE_EVENTs when the
poisoned DPA is part of any region.
Alison
> >
> > Thanks for the reviews Dan!
> > >
> > > >
> > > > Example, triggered by region:
> > > > $ echo 1 > /sys/bus/cxl/devices/region5/trigger_poison_list
> > > > cxl_poison: memdev=mem0 pcidev=cxl_mem.0 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > > > cxl_poison: memdev=mem1 pcidev=cxl_mem.1 region=region5 region_uuid=bfcb7a29-890e-4a41-8236-fe22221fc75c dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > > >
> > > > [1]: https://www.computeexpresslink.org/download-the-specification
> > > >
> > > > Alison Schofield (5):
> > > > cxl/mbox: Add GET_POISON_LIST mailbox command
> > > > cxl/trace: Add TRACE support for CXL media-error records
> > > > cxl/memdev: Add trigger_poison_list sysfs attribute
> > > > cxl/region: Add trigger_poison_list sysfs attribute
> > > > tools/testing/cxl: Mock support for Get Poison List
> > > >
> > > > Documentation/ABI/testing/sysfs-bus-cxl | 28 +++++++++
> > > > drivers/cxl/core/mbox.c | 78 +++++++++++++++++++++++
> > > > drivers/cxl/core/memdev.c | 45 ++++++++++++++
> > > > drivers/cxl/core/region.c | 33 ++++++++++
> > > > drivers/cxl/core/trace.h | 83 +++++++++++++++++++++++++
> > > > drivers/cxl/cxlmem.h | 69 +++++++++++++++++++-
> > > > drivers/cxl/pci.c | 4 ++
> > > > tools/testing/cxl/test/mem.c | 42 +++++++++++++
> > > > 8 files changed, 381 insertions(+), 1 deletion(-)
> > > >
> > > >
> > > > base-commit: 589c3357370a596ef7c99c00baca8ac799fce531
> > > > --
> > > > 2.37.3
> > > >
> > >
> > >
>
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-01-27 21:36 ` Re: Alison Schofield
@ 2023-01-27 22:04 ` Dan Williams
0 siblings, 0 replies; 417+ messages in thread
From: Dan Williams @ 2023-01-27 22:04 UTC (permalink / raw)
To: Alison Schofield, Dan Williams
Cc: Ira Weiny, Vishal Verma, Dave Jiang, Ben Widawsky,
Steven Rostedt, linux-cxl, linux-kernel
Alison Schofield wrote:
> On Fri, Jan 27, 2023 at 11:16:49AM -0800, Dan Williams wrote:
> > Alison Schofield wrote:
> > > On Thu, Jan 26, 2023 at 05:59:03PM -0800, Dan Williams wrote:
> > > > alison.schofield@ wrote:
> > > > > From: Alison Schofield <alison.schofield@intel.com>
> > > > >
> > > > > Subject: [PATCH v5 0/5] CXL Poison List Retrieval & Tracing
> > > > >
> > > > > Changes in v5:
> > > > > - Rebase on cxl/next
> > > > > - Use struct_size() to calc mbox cmd payload .min_out
> > > > > - s/INTERNAL/INJECTED mocked poison record source
> > > > > - Added Jonathan Reviewed-by tag on Patch 3
> > > > >
> > > > > Link to v4:
> > > > > https://lore.kernel.org/linux-cxl/cover.1671135967.git.alison.schofield@intel.com/
> > > > >
> > > > > Add support for retrieving device poison lists and store the returned
> > > > > error records as kernel trace events.
> > > > >
> > > > > The handling of the poison list is guided by the CXL 3.0 Specification
> > > > > Section 8.2.9.8.4.1. [1]
> > > > >
> > > > > Example, triggered by memdev:
> > > > > $ echo 1 > /sys/bus/cxl/devices/mem3/trigger_poison_list
> > > > > cxl_poison: memdev=mem3 pcidev=cxl_mem.3 region= region_uuid=00000000-0000-0000-0000-000000000000 dpa=0x0 length=0x40 source=Internal flags= overflow_time=0
> > > >
> > > > I think the pcidev= field wants to be called something like "host" or
> > > > "parent", because there is no strict requirement that a 'struct
> > > > cxl_memdev' is related to a 'struct pci_dev'. In fact in that example
> > > > "cxl_mem.3" is a 'struct platform_device'. Now that I think about it, I
> > > > think all CXL device events should be emitting the PCIe serial number
> > > > for the memdev.
> > > ]
> > >
> > > Will do, 'host' and add PCIe serial no.
> > >
> > > >
> > > > I will look in the implementation, but do region= and region_uuid= get
> > > > populated when mem3 is a member of the region?
> > >
> > > Not always.
> > > In the case above, where the trigger was by memdev, no.
> > > Region= and region_uuid= (and in the follow-on patch, hpa=) only get
> > > populated if the poison was triggered by region, like the case below.
> > >
> > > It could be looked up for the by memdev cases. Is that wanted?
> >
> > Just trying to understand the semantics. However, I do think it makes sense
> > for a memdev trigger to lookup information on all impacted regions
> > across all of the device's DPA and the region trigger makes sense to
> > lookup all memdevs, but bounded by the DPA that contributes to that
> > region. I just want to avoid someone having to trigger the region to get
> > extra information that was readily available from a memdev listing.
> >
>
> Dan -
>
> Confirming my take-away from this email, and our chat:
>
> Remove the by-region trigger_poison_list option entirely. User space
> needs to trigger by-memdev the memdevs participating in the region and
> filter those events by region.
>
> Add the region info (region name, uuid) to the TRACE_EVENTs when the
> poisoned DPA is part of any region.
That's what I was thinking, yes. So the internals of
cxl_mem_get_poison() will take the cxl_region_rwsem for read and compare
the device's endpoint decoder settings against the media error records
to do the region (and later HPA) lookup.
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2022-11-21 11:11 Denis Arefev
2022-11-21 14:28 ` Jason Yan
0 siblings, 1 reply; 417+ messages in thread
From: Denis Arefev @ 2022-11-21 11:11 UTC (permalink / raw)
To: Anil Gurumurthy
Cc: Sudarsana Kalluru, James E.J. Bottomley, Martin K. Petersen,
linux-scsi, linux-kernel, trufanov, vfh
Date: Mon, 21 Nov 2022 13:29:03 +0300
Subject: [PATCH] scsi:bfa: Eliminated buffer overflow
Buffer 'cmd->adapter_hwpath' of size 32 accessed at
bfad_bsg.c:101:103 can overflow, since its index 'i'
can have value 32 that is out of range.
Signed-off-by: Denis Arefev <arefev@swemel.ru>
---
drivers/scsi/bfa/bfad_bsg.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/bfa/bfad_bsg.c b/drivers/scsi/bfa/bfad_bsg.c
index be8dfbe13e90..78615ffc62ef 100644
--- a/drivers/scsi/bfa/bfad_bsg.c
+++ b/drivers/scsi/bfa/bfad_bsg.c
@@ -98,9 +98,9 @@ bfad_iocmd_ioc_get_info(struct bfad_s *bfad, void *cmd)
/* set adapter hw path */
strcpy(iocmd->adapter_hwpath, bfad->pci_name);
- for (i = 0; iocmd->adapter_hwpath[i] != ':' && i < BFA_STRING_32; i++)
+ for (i = 0; iocmd->adapter_hwpath[i] != ':' && i < BFA_STRING_32-2; i++)
;
- for (; iocmd->adapter_hwpath[++i] != ':' && i < BFA_STRING_32; )
+ for (; iocmd->adapter_hwpath[++i] != ':' && i < BFA_STRING_32-1; )
;
iocmd->adapter_hwpath[i] = '\0';
iocmd->status = BFA_STATUS_OK;
--
2.25.1
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2022-11-21 11:11 Denis Arefev
@ 2022-11-21 14:28 ` Jason Yan
0 siblings, 0 replies; 417+ messages in thread
From: Jason Yan @ 2022-11-21 14:28 UTC (permalink / raw)
To: Denis Arefev, Anil Gurumurthy
Cc: Sudarsana Kalluru, James E.J. Bottomley, Martin K. Petersen,
linux-scsi, linux-kernel, trufanov, vfh
You may need a real subject, not a subject text in the email.
type "git help send-email" if you don't know how to use it.
On 2022/11/21 19:11, Denis Arefev wrote:
> Date: Mon, 21 Nov 2022 13:29:03 +0300
> Subject: [PATCH] scsi:bfa: Eliminated buffer overflow
>
> Buffer 'cmd->adapter_hwpath' of size 32 accessed at
> bfad_bsg.c:101:103 can overflow, since its index 'i'
> can have value 32 that is out of range.
>
> Signed-off-by: Denis Arefev <arefev@swemel.ru>
> ---
> drivers/scsi/bfa/bfad_bsg.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/scsi/bfa/bfad_bsg.c b/drivers/scsi/bfa/bfad_bsg.c
> index be8dfbe13e90..78615ffc62ef 100644
> --- a/drivers/scsi/bfa/bfad_bsg.c
> +++ b/drivers/scsi/bfa/bfad_bsg.c
> @@ -98,9 +98,9 @@ bfad_iocmd_ioc_get_info(struct bfad_s *bfad, void *cmd)
>
> /* set adapter hw path */
> strcpy(iocmd->adapter_hwpath, bfad->pci_name);
> - for (i = 0; iocmd->adapter_hwpath[i] != ':' && i < BFA_STRING_32; i++)
> + for (i = 0; iocmd->adapter_hwpath[i] != ':' && i < BFA_STRING_32-2; i++)
> ;
> - for (; iocmd->adapter_hwpath[++i] != ':' && i < BFA_STRING_32; )
> + for (; iocmd->adapter_hwpath[++i] != ':' && i < BFA_STRING_32-1; )
> ;
> iocmd->adapter_hwpath[i] = '\0';
> iocmd->status = BFA_STATUS_OK;
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2022-09-14 13:12 Amjad Ouled-Ameur
2022-09-14 13:18 ` Amjad Ouled-Ameur
0 siblings, 1 reply; 417+ messages in thread
From: Amjad Ouled-Ameur @ 2022-09-14 13:12 UTC (permalink / raw)
To: Rob Herring
Cc: Amjad Ouled-Ameur, Krzysztof Kozlowski, Matthias Brugger,
devicetree, linux-arm-kernel, linux-mediatek, linux-kernel
Subject: [PATCH] arm64: dts: mediatek: mt8183: remove thermal zones without
trips.
Thermal zones without trip point are not registered by thermal core.
tzts1 ~ tzts6 zones of mt8183 were intially introduced for test-purpose
only but are not supposed to remain on DT.
Remove the zones above and keep only cpu_thermal.
Signed-off-by: Amjad Ouled-Ameur <aouledameur@baylibre.com>
---
arch/arm64/boot/dts/mediatek/mt8183.dtsi | 57 ------------------------
1 file changed, 57 deletions(-)
diff --git a/arch/arm64/boot/dts/mediatek/mt8183.dtsi b/arch/arm64/boot/dts/mediatek/mt8183.dtsi
index 9d32871973a2..f65fae8939de 100644
--- a/arch/arm64/boot/dts/mediatek/mt8183.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt8183.dtsi
@@ -1182,63 +1182,6 @@ THERMAL_NO_LIMIT
};
};
};
-
- /* The tzts1 ~ tzts6 don't need to polling */
- /* The tzts1 ~ tzts6 don't need to thermal throttle */
-
- tzts1: tzts1 {
- polling-delay-passive = <0>;
- polling-delay = <0>;
- thermal-sensors = <&thermal 1>;
- sustainable-power = <5000>;
- trips {};
- cooling-maps {};
- };
-
- tzts2: tzts2 {
- polling-delay-passive = <0>;
- polling-delay = <0>;
- thermal-sensors = <&thermal 2>;
- sustainable-power = <5000>;
- trips {};
- cooling-maps {};
- };
-
- tzts3: tzts3 {
- polling-delay-passive = <0>;
- polling-delay = <0>;
- thermal-sensors = <&thermal 3>;
- sustainable-power = <5000>;
- trips {};
- cooling-maps {};
- };
-
- tzts4: tzts4 {
- polling-delay-passive = <0>;
- polling-delay = <0>;
- thermal-sensors = <&thermal 4>;
- sustainable-power = <5000>;
- trips {};
- cooling-maps {};
- };
-
- tzts5: tzts5 {
- polling-delay-passive = <0>;
- polling-delay = <0>;
- thermal-sensors = <&thermal 5>;
- sustainable-power = <5000>;
- trips {};
- cooling-maps {};
- };
-
- tztsABB: tztsABB {
- polling-delay-passive = <0>;
- polling-delay = <0>;
- thermal-sensors = <&thermal 6>;
- sustainable-power = <5000>;
- trips {};
- cooling-maps {};
- };
};
pwm0: pwm@1100e000 {
--
2.37.3
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2022-09-14 13:12 Amjad Ouled-Ameur
@ 2022-09-14 13:18 ` Amjad Ouled-Ameur
0 siblings, 0 replies; 417+ messages in thread
From: Amjad Ouled-Ameur @ 2022-09-14 13:18 UTC (permalink / raw)
To: Rob Herring
Cc: Krzysztof Kozlowski, Matthias Brugger, devicetree,
linux-arm-kernel, linux-mediatek, linux-kernel
Hi,
The subject has not been parsed correctly, I resent a proper patch here:
https://patchwork.kernel.org/project/linux-mediatek/patch/20220914131339.18348-1-aouledameur@baylibre.com/
Sorry for the noise.
Regards,
Amjad
On 9/14/22 15:12, Amjad Ouled-Ameur wrote:
> Subject: [PATCH] arm64: dts: mediatek: mt8183: remove thermal zones without
> trips.
>
> Thermal zones without trip point are not registered by thermal core.
>
> tzts1 ~ tzts6 zones of mt8183 were intially introduced for test-purpose
> only but are not supposed to remain on DT.
>
> Remove the zones above and keep only cpu_thermal.
>
> Signed-off-by: Amjad Ouled-Ameur <aouledameur@baylibre.com>
> ---
> arch/arm64/boot/dts/mediatek/mt8183.dtsi | 57 ------------------------
> 1 file changed, 57 deletions(-)
>
> diff --git a/arch/arm64/boot/dts/mediatek/mt8183.dtsi b/arch/arm64/boot/dts/mediatek/mt8183.dtsi
> index 9d32871973a2..f65fae8939de 100644
> --- a/arch/arm64/boot/dts/mediatek/mt8183.dtsi
> +++ b/arch/arm64/boot/dts/mediatek/mt8183.dtsi
> @@ -1182,63 +1182,6 @@ THERMAL_NO_LIMIT
> };
> };
> };
> -
> - /* The tzts1 ~ tzts6 don't need to polling */
> - /* The tzts1 ~ tzts6 don't need to thermal throttle */
> -
> - tzts1: tzts1 {
> - polling-delay-passive = <0>;
> - polling-delay = <0>;
> - thermal-sensors = <&thermal 1>;
> - sustainable-power = <5000>;
> - trips {};
> - cooling-maps {};
> - };
> -
> - tzts2: tzts2 {
> - polling-delay-passive = <0>;
> - polling-delay = <0>;
> - thermal-sensors = <&thermal 2>;
> - sustainable-power = <5000>;
> - trips {};
> - cooling-maps {};
> - };
> -
> - tzts3: tzts3 {
> - polling-delay-passive = <0>;
> - polling-delay = <0>;
> - thermal-sensors = <&thermal 3>;
> - sustainable-power = <5000>;
> - trips {};
> - cooling-maps {};
> - };
> -
> - tzts4: tzts4 {
> - polling-delay-passive = <0>;
> - polling-delay = <0>;
> - thermal-sensors = <&thermal 4>;
> - sustainable-power = <5000>;
> - trips {};
> - cooling-maps {};
> - };
> -
> - tzts5: tzts5 {
> - polling-delay-passive = <0>;
> - polling-delay = <0>;
> - thermal-sensors = <&thermal 5>;
> - sustainable-power = <5000>;
> - trips {};
> - cooling-maps {};
> - };
> -
> - tztsABB: tztsABB {
> - polling-delay-passive = <0>;
> - polling-delay = <0>;
> - thermal-sensors = <&thermal 6>;
> - sustainable-power = <5000>;
> - trips {};
> - cooling-maps {};
> - };
> };
>
> pwm0: pwm@1100e000 {
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH bpf-next 1/2] cpuidle/rcu: Making arch_cpu_idle and rcu_idle_exit noinstr
@ 2022-05-15 20:36 Jiri Olsa
2023-05-20 9:47 ` Ze Gao
0 siblings, 1 reply; 417+ messages in thread
From: Jiri Olsa @ 2022-05-15 20:36 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Masami Hiramatsu, Paul E. McKenney
Cc: netdev, bpf, lkml, Martin KaFai Lau, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Steven Rostedt
Making arch_cpu_idle and rcu_idle_exit noinstr. Both functions run
in rcu 'not watching' context and if there's tracer attached to
them, which uses rcu (e.g. kprobe multi interface) it will hit RCU
warning like:
[ 3.017540] WARNING: suspicious RCU usage
...
[ 3.018363] kprobe_multi_link_handler+0x68/0x1c0
[ 3.018364] ? kprobe_multi_link_handler+0x3e/0x1c0
[ 3.018366] ? arch_cpu_idle_dead+0x10/0x10
[ 3.018367] ? arch_cpu_idle_dead+0x10/0x10
[ 3.018371] fprobe_handler.part.0+0xab/0x150
[ 3.018374] 0xffffffffa00080c8
[ 3.018393] ? arch_cpu_idle+0x5/0x10
[ 3.018398] arch_cpu_idle+0x5/0x10
[ 3.018399] default_idle_call+0x59/0x90
[ 3.018401] do_idle+0x1c3/0x1d0
The call path is following:
default_idle_call
rcu_idle_enter
arch_cpu_idle
rcu_idle_exit
The arch_cpu_idle and rcu_idle_exit are the only ones from above
path that are traceble and cause this problem on my setup.
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
arch/x86/kernel/process.c | 2 +-
kernel/rcu/tree.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b370767f5b19..1345cb0124a6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -720,7 +720,7 @@ void arch_cpu_idle_dead(void)
/*
* Called from the generic idle code.
*/
-void arch_cpu_idle(void)
+void noinstr arch_cpu_idle(void)
{
x86_idle();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..20d529722f51 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -896,7 +896,7 @@ static void noinstr rcu_eqs_exit(bool user)
* If you add or remove a call to rcu_idle_exit(), be sure to test with
* CONFIG_RCU_EQS_DEBUG=y.
*/
-void rcu_idle_exit(void)
+void noinstr rcu_idle_exit(void)
{
unsigned long flags;
--
2.35.3
^ permalink raw reply related [flat|nested] 417+ messages in thread
* (no subject)
2022-05-15 20:36 [PATCH bpf-next 1/2] cpuidle/rcu: Making arch_cpu_idle and rcu_idle_exit noinstr Jiri Olsa
@ 2023-05-20 9:47 ` Ze Gao
2023-05-21 3:58 ` Yonghong Song
2023-05-21 8:08 ` Re: Jiri Olsa
0 siblings, 2 replies; 417+ messages in thread
From: Ze Gao @ 2023-05-20 9:47 UTC (permalink / raw)
To: jolsa
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Hao Luo,
John Fastabend, KP Singh, Martin KaFai Lau, Masami Hiramatsu,
Song Liu, Stanislav Fomichev, Steven Rostedt, Yonghong Song, bpf,
linux-kernel, linux-trace-kernel, kafai, kpsingh, netdev,
paulmck, songliubraving, Ze Gao
Hi Jiri,
Would you like to consider to add rcu_is_watching check in
to solve this from the viewpoint of kprobe_multi_link_prog_run
itself? And accounting of missed runs can be added as well
to imporve observability.
Regards,
Ze
-----------------
From 29fd3cd713e65461325c2703cf5246a6fae5d4fe Mon Sep 17 00:00:00 2001
From: Ze Gao <zegao@tencent.com>
Date: Sat, 20 May 2023 17:32:05 +0800
Subject: [PATCH] bpf: kprobe_multi runs bpf progs only when rcu_is_watching
From the perspective of kprobe_multi_link_prog_run, any traceable
functions can be attached while bpf progs need specical care and
ought to be under rcu protection. To solve the likely rcu lockdep
warns once for good, when (future) functions in idle path were
attached accidentally, we better paying some cost to check at least
in kernel-side, and return when rcu is not watching, which helps
to avoid any unpredictable results.
Signed-off-by: Ze Gao <zegao@tencent.com>
---
kernel/trace/bpf_trace.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9a050e36dc6c..3e6ea7274765 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2622,7 +2622,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
struct bpf_run_ctx *old_run_ctx;
int err;
- if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1 || !rcu_is_watching())) {
err = 0;
goto out;
}
--
2.40.1
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2023-05-20 9:47 ` Ze Gao
@ 2023-05-21 3:58 ` Yonghong Song
2023-05-21 15:10 ` Re: Ze Gao
2023-05-21 8:08 ` Re: Jiri Olsa
1 sibling, 1 reply; 417+ messages in thread
From: Yonghong Song @ 2023-05-21 3:58 UTC (permalink / raw)
To: Ze Gao, jolsa
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Hao Luo,
John Fastabend, KP Singh, Martin KaFai Lau, Masami Hiramatsu,
Song Liu, Stanislav Fomichev, Steven Rostedt, Yonghong Song, bpf,
linux-kernel, linux-trace-kernel, kafai, kpsingh, netdev,
paulmck, songliubraving, Ze Gao
On 5/20/23 2:47 AM, Ze Gao wrote:
>
> Hi Jiri,
>
> Would you like to consider to add rcu_is_watching check in
> to solve this from the viewpoint of kprobe_multi_link_prog_run
> itself? And accounting of missed runs can be added as well
> to imporve observability.
>
> Regards,
> Ze
>
>
> -----------------
> From 29fd3cd713e65461325c2703cf5246a6fae5d4fe Mon Sep 17 00:00:00 2001
> From: Ze Gao <zegao@tencent.com>
> Date: Sat, 20 May 2023 17:32:05 +0800
> Subject: [PATCH] bpf: kprobe_multi runs bpf progs only when rcu_is_watching
>
> From the perspective of kprobe_multi_link_prog_run, any traceable
> functions can be attached while bpf progs need specical care and
> ought to be under rcu protection. To solve the likely rcu lockdep
> warns once for good, when (future) functions in idle path were
> attached accidentally, we better paying some cost to check at least
> in kernel-side, and return when rcu is not watching, which helps
> to avoid any unpredictable results.
kprobe_multi/fprobe share the same set of attachments with fentry.
Currently, fentry does not filter with !rcu_is_watching, maybe
because this is an extreme corner case. Not sure whether it is
worthwhile or not.
Maybe if you can give a concrete example (e.g., attachment point)
with current code base to show what the issue you encountered and
it will make it easier to judge whether adding !rcu_is_watching()
is necessary or not.
>
> Signed-off-by: Ze Gao <zegao@tencent.com>
> ---
> kernel/trace/bpf_trace.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 9a050e36dc6c..3e6ea7274765 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2622,7 +2622,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> struct bpf_run_ctx *old_run_ctx;
> int err;
>
> - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1 || !rcu_is_watching())) {
> err = 0;
> goto out;
> }
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-21 3:58 ` Yonghong Song
@ 2023-05-21 15:10 ` Ze Gao
2023-05-21 20:26 ` Re: Jiri Olsa
0 siblings, 1 reply; 417+ messages in thread
From: Ze Gao @ 2023-05-21 15:10 UTC (permalink / raw)
To: Yonghong Song
Cc: jolsa, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Hao Luo, John Fastabend, KP Singh, Martin KaFai Lau,
Masami Hiramatsu, Song Liu, Stanislav Fomichev, Steven Rostedt,
Yonghong Song, bpf, linux-kernel, linux-trace-kernel, kafai,
kpsingh, netdev, paulmck, songliubraving, Ze Gao
> kprobe_multi/fprobe share the same set of attachments with fentry.
> Currently, fentry does not filter with !rcu_is_watching, maybe
> because this is an extreme corner case. Not sure whether it is
> worthwhile or not.
Agreed, it's rare, especially after Peter's patches which push narrow
down rcu eqs regions
in the idle path and reduce the chance of any traceable functions
happening in between.
However, from RCU's perspective, we ought to check if rcu_is_watching
theoretically
when there's a chance our code will run in the idle path and also we
need rcu to be alive,
And also we cannot simply make assumptions for any future changes in
the idle path.
You know, just like what was hit in the thread.
> Maybe if you can give a concrete example (e.g., attachment point)
> with current code base to show what the issue you encountered and
> it will make it easier to judge whether adding !rcu_is_watching()
> is necessary or not.
I can reproduce likely warnings on v6.1.18 where arch_cpu_idle is
traceable but not on the latest version
so far. But as I state above, in theory we need it. So here is a
gentle ping :) .
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-21 15:10 ` Re: Ze Gao
@ 2023-05-21 20:26 ` Jiri Olsa
2023-05-22 1:36 ` Re: Masami Hiramatsu
2023-05-22 2:07 ` Re: Ze Gao
0 siblings, 2 replies; 417+ messages in thread
From: Jiri Olsa @ 2023-05-21 20:26 UTC (permalink / raw)
To: Ze Gao
Cc: Yonghong Song, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Hao Luo, John Fastabend, KP Singh,
Martin KaFai Lau, Masami Hiramatsu, Song Liu, Stanislav Fomichev,
Steven Rostedt, Yonghong Song, bpf, linux-kernel,
linux-trace-kernel, kafai, kpsingh, netdev, paulmck,
songliubraving, Ze Gao
On Sun, May 21, 2023 at 11:10:16PM +0800, Ze Gao wrote:
> > kprobe_multi/fprobe share the same set of attachments with fentry.
> > Currently, fentry does not filter with !rcu_is_watching, maybe
> > because this is an extreme corner case. Not sure whether it is
> > worthwhile or not.
>
> Agreed, it's rare, especially after Peter's patches which push narrow
> down rcu eqs regions
> in the idle path and reduce the chance of any traceable functions
> happening in between.
>
> However, from RCU's perspective, we ought to check if rcu_is_watching
> theoretically
> when there's a chance our code will run in the idle path and also we
> need rcu to be alive,
> And also we cannot simply make assumptions for any future changes in
> the idle path.
> You know, just like what was hit in the thread.
>
> > Maybe if you can give a concrete example (e.g., attachment point)
> > with current code base to show what the issue you encountered and
> > it will make it easier to judge whether adding !rcu_is_watching()
> > is necessary or not.
>
> I can reproduce likely warnings on v6.1.18 where arch_cpu_idle is
> traceable but not on the latest version
> so far. But as I state above, in theory we need it. So here is a
> gentle ping :) .
hum, this change [1] added rcu_is_watching check to ftrace_test_recursion_trylock,
which we use in fprobe_handler and is coming to fprobe_exit_handler in [2]
I might be missing something, but it seems like we don't need another
rcu_is_watching call on kprobe_multi level
jirka
[1] d099dbfd3306 cpuidle: tracing: Warn about !rcu_is_watching()
[2] https://lore.kernel.org/bpf/20230517034510.15639-4-zegao@tencent.com/
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-21 20:26 ` Re: Jiri Olsa
@ 2023-05-22 1:36 ` Masami Hiramatsu
2023-05-22 2:07 ` Re: Ze Gao
1 sibling, 0 replies; 417+ messages in thread
From: Masami Hiramatsu @ 2023-05-22 1:36 UTC (permalink / raw)
To: Jiri Olsa
Cc: Ze Gao, Yonghong Song, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Hao Luo, John Fastabend, KP Singh,
Martin KaFai Lau, Masami Hiramatsu, Song Liu, Stanislav Fomichev,
Steven Rostedt, Yonghong Song, bpf, linux-kernel,
linux-trace-kernel, kafai, kpsingh, netdev, paulmck,
songliubraving, Ze Gao
On Sun, 21 May 2023 22:26:37 +0200
Jiri Olsa <olsajiri@gmail.com> wrote:
> On Sun, May 21, 2023 at 11:10:16PM +0800, Ze Gao wrote:
> > > kprobe_multi/fprobe share the same set of attachments with fentry.
> > > Currently, fentry does not filter with !rcu_is_watching, maybe
> > > because this is an extreme corner case. Not sure whether it is
> > > worthwhile or not.
> >
> > Agreed, it's rare, especially after Peter's patches which push narrow
> > down rcu eqs regions
> > in the idle path and reduce the chance of any traceable functions
> > happening in between.
> >
> > However, from RCU's perspective, we ought to check if rcu_is_watching
> > theoretically
> > when there's a chance our code will run in the idle path and also we
> > need rcu to be alive,
> > And also we cannot simply make assumptions for any future changes in
> > the idle path.
> > You know, just like what was hit in the thread.
> >
> > > Maybe if you can give a concrete example (e.g., attachment point)
> > > with current code base to show what the issue you encountered and
> > > it will make it easier to judge whether adding !rcu_is_watching()
> > > is necessary or not.
> >
> > I can reproduce likely warnings on v6.1.18 where arch_cpu_idle is
> > traceable but not on the latest version
> > so far. But as I state above, in theory we need it. So here is a
> > gentle ping :) .
>
> hum, this change [1] added rcu_is_watching check to ftrace_test_recursion_trylock,
> which we use in fprobe_handler and is coming to fprobe_exit_handler in [2]
>
> I might be missing something, but it seems like we don't need another
> rcu_is_watching call on kprobe_multi level
Good point! OK, then it seems we don't need it. The rethook continues to
use the rcu_is_watching() because it is also used from kprobes, but the
kprobe_multi doesn't need it.
Thank you,
>
> jirka
>
>
> [1] d099dbfd3306 cpuidle: tracing: Warn about !rcu_is_watching()
> [2] https://lore.kernel.org/bpf/20230517034510.15639-4-zegao@tencent.com/
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-21 20:26 ` Re: Jiri Olsa
2023-05-22 1:36 ` Re: Masami Hiramatsu
@ 2023-05-22 2:07 ` Ze Gao
2023-05-23 4:38 ` Re: Yonghong Song
2023-05-23 5:30 ` Re: Masami Hiramatsu
1 sibling, 2 replies; 417+ messages in thread
From: Ze Gao @ 2023-05-22 2:07 UTC (permalink / raw)
To: Jiri Olsa
Cc: Yonghong Song, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Hao Luo, John Fastabend, KP Singh,
Martin KaFai Lau, Masami Hiramatsu, Song Liu, Stanislav Fomichev,
Steven Rostedt, Yonghong Song, bpf, linux-kernel,
linux-trace-kernel, kafai, kpsingh, netdev, paulmck,
songliubraving, Ze Gao
Oops, I missed that. Thanks for pointing that out, which I thought is
conditional use of rcu_is_watching before.
One last point, I think we should double check on this
"fentry does not filter with !rcu_is_watching"
as quoted from Yonghong and argue whether it needs
the same check for fentry as well.
Regards,
Ze
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-22 2:07 ` Re: Ze Gao
@ 2023-05-23 4:38 ` Yonghong Song
2023-05-23 5:30 ` Re: Masami Hiramatsu
1 sibling, 0 replies; 417+ messages in thread
From: Yonghong Song @ 2023-05-23 4:38 UTC (permalink / raw)
To: Ze Gao, Jiri Olsa
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Hao Luo,
John Fastabend, KP Singh, Martin KaFai Lau, Masami Hiramatsu,
Song Liu, Stanislav Fomichev, Steven Rostedt, Yonghong Song, bpf,
linux-kernel, linux-trace-kernel, kafai, kpsingh, netdev,
paulmck, songliubraving, Ze Gao
On 5/21/23 7:07 PM, Ze Gao wrote:
> Oops, I missed that. Thanks for pointing that out, which I thought is
> conditional use of rcu_is_watching before.
>
> One last point, I think we should double check on this
> "fentry does not filter with !rcu_is_watching"
> as quoted from Yonghong and argue whether it needs
> the same check for fentry as well.
I would suggest that we address rcu_is_watching issue for fentry
only if we do have a reproducible case to show something goes wrong...
>
> Regards,
> Ze
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-22 2:07 ` Re: Ze Gao
2023-05-23 4:38 ` Re: Yonghong Song
@ 2023-05-23 5:30 ` Masami Hiramatsu
2023-05-23 6:59 ` Re: Paul E. McKenney
1 sibling, 1 reply; 417+ messages in thread
From: Masami Hiramatsu @ 2023-05-23 5:30 UTC (permalink / raw)
To: Ze Gao
Cc: Jiri Olsa, Yonghong Song, Alexei Starovoitov, Andrii Nakryiko,
Daniel Borkmann, Hao Luo, John Fastabend, KP Singh,
Martin KaFai Lau, Masami Hiramatsu, Song Liu, Stanislav Fomichev,
Steven Rostedt, Yonghong Song, bpf, linux-kernel,
linux-trace-kernel, kafai, kpsingh, netdev, paulmck,
songliubraving, Ze Gao
On Mon, 22 May 2023 10:07:42 +0800
Ze Gao <zegao2021@gmail.com> wrote:
> Oops, I missed that. Thanks for pointing that out, which I thought is
> conditional use of rcu_is_watching before.
>
> One last point, I think we should double check on this
> "fentry does not filter with !rcu_is_watching"
> as quoted from Yonghong and argue whether it needs
> the same check for fentry as well.
rcu_is_watching() comment says;
* if the current CPU is not in its idle loop or is in an interrupt or
* NMI handler, return true.
Thus it returns *fault* if the current CPU is in the idle loop and not
any interrupt(including NMI) context. This means if any tracable function
is called from idle loop, it can be !rcu_is_watching(). I meant, this is
'context' based check, thus fentry can not filter out that some commonly
used functions is called from that context but it can be detected.
Thank you,
>
> Regards,
> Ze
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-23 5:30 ` Re: Masami Hiramatsu
@ 2023-05-23 6:59 ` Paul E. McKenney
2023-05-25 0:13 ` Re: Masami Hiramatsu
0 siblings, 1 reply; 417+ messages in thread
From: Paul E. McKenney @ 2023-05-23 6:59 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Ze Gao, Jiri Olsa, Yonghong Song, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Hao Luo, John Fastabend,
KP Singh, Martin KaFai Lau, Song Liu, Stanislav Fomichev,
Steven Rostedt, Yonghong Song, bpf, linux-kernel,
linux-trace-kernel, kafai, kpsingh, netdev, songliubraving,
Ze Gao
On Tue, May 23, 2023 at 01:30:19PM +0800, Masami Hiramatsu wrote:
> On Mon, 22 May 2023 10:07:42 +0800
> Ze Gao <zegao2021@gmail.com> wrote:
>
> > Oops, I missed that. Thanks for pointing that out, which I thought is
> > conditional use of rcu_is_watching before.
> >
> > One last point, I think we should double check on this
> > "fentry does not filter with !rcu_is_watching"
> > as quoted from Yonghong and argue whether it needs
> > the same check for fentry as well.
>
> rcu_is_watching() comment says;
>
> * if the current CPU is not in its idle loop or is in an interrupt or
> * NMI handler, return true.
>
> Thus it returns *fault* if the current CPU is in the idle loop and not
> any interrupt(including NMI) context. This means if any tracable function
> is called from idle loop, it can be !rcu_is_watching(). I meant, this is
> 'context' based check, thus fentry can not filter out that some commonly
> used functions is called from that context but it can be detected.
It really does return false (rather than faulting?) if the current CPU
is deep within the idle loop.
In addition, the recent x86/entry rework (thank you Peter and
Thomas!) mean that the "idle loop" is quite restricted, as can be
seen by the invocations of ct_cpuidle_enter() and ct_cpuidle_exit().
For example, in default_idle_call(), these are immediately before and
after the call to arch_cpu_idle().
Would the following help? Or am I missing your point?
Thanx, Paul
------------------------------------------------------------------------
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1449cb69a0e0..fae9b4e29c93 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -679,10 +679,14 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
/**
* rcu_is_watching - see if RCU thinks that the current CPU is not idle
*
- * Return true if RCU is watching the running CPU, which means that this
- * CPU can safely enter RCU read-side critical sections. In other words,
- * if the current CPU is not in its idle loop or is in an interrupt or
- * NMI handler, return true.
+ * Return @true if RCU is watching the running CPU and @false otherwise.
+ * An @true return means that this CPU can safely enter RCU read-side
+ * critical sections.
+ *
+ * More specifically, if the current CPU is not deep within its idle
+ * loop, return @true. Note that rcu_is_watching() will return @true if
+ * invoked from an interrupt or NMI handler, even if that interrupt or
+ * NMI interrupted the CPU while it was deep within its idle loop.
*
* Make notrace because it can be called by the internal functions of
* ftrace, and making this notrace removes unnecessary recursion calls.
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2023-05-23 6:59 ` Re: Paul E. McKenney
@ 2023-05-25 0:13 ` Masami Hiramatsu
0 siblings, 0 replies; 417+ messages in thread
From: Masami Hiramatsu @ 2023-05-25 0:13 UTC (permalink / raw)
To: paulmck
Cc: Ze Gao, Jiri Olsa, Yonghong Song, Alexei Starovoitov,
Andrii Nakryiko, Daniel Borkmann, Hao Luo, John Fastabend,
KP Singh, Martin KaFai Lau, Song Liu, Stanislav Fomichev,
Steven Rostedt, Yonghong Song, bpf, linux-kernel,
linux-trace-kernel, kafai, kpsingh, netdev, songliubraving,
Ze Gao
On Mon, 22 May 2023 23:59:28 -0700
"Paul E. McKenney" <paulmck@kernel.org> wrote:
> On Tue, May 23, 2023 at 01:30:19PM +0800, Masami Hiramatsu wrote:
> > On Mon, 22 May 2023 10:07:42 +0800
> > Ze Gao <zegao2021@gmail.com> wrote:
> >
> > > Oops, I missed that. Thanks for pointing that out, which I thought is
> > > conditional use of rcu_is_watching before.
> > >
> > > One last point, I think we should double check on this
> > > "fentry does not filter with !rcu_is_watching"
> > > as quoted from Yonghong and argue whether it needs
> > > the same check for fentry as well.
> >
> > rcu_is_watching() comment says;
> >
> > * if the current CPU is not in its idle loop or is in an interrupt or
> > * NMI handler, return true.
> >
> > Thus it returns *fault* if the current CPU is in the idle loop and not
> > any interrupt(including NMI) context. This means if any tracable function
> > is called from idle loop, it can be !rcu_is_watching(). I meant, this is
> > 'context' based check, thus fentry can not filter out that some commonly
> > used functions is called from that context but it can be detected.
>
> It really does return false (rather than faulting?) if the current CPU
> is deep within the idle loop.
>
> In addition, the recent x86/entry rework (thank you Peter and
> Thomas!) mean that the "idle loop" is quite restricted, as can be
> seen by the invocations of ct_cpuidle_enter() and ct_cpuidle_exit().
> For example, in default_idle_call(), these are immediately before and
> after the call to arch_cpu_idle().
Thanks! I also found that the default_idle_call() is enough small and
it seems not happening on fentry because there are no commonly used
functions on that path.
>
> Would the following help? Or am I missing your point?
Yes, thank you for the update!
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 1449cb69a0e0..fae9b4e29c93 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -679,10 +679,14 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
> /**
> * rcu_is_watching - see if RCU thinks that the current CPU is not idle
> *
> - * Return true if RCU is watching the running CPU, which means that this
> - * CPU can safely enter RCU read-side critical sections. In other words,
> - * if the current CPU is not in its idle loop or is in an interrupt or
> - * NMI handler, return true.
> + * Return @true if RCU is watching the running CPU and @false otherwise.
> + * An @true return means that this CPU can safely enter RCU read-side
> + * critical sections.
> + *
> + * More specifically, if the current CPU is not deep within its idle
> + * loop, return @true. Note that rcu_is_watching() will return @true if
> + * invoked from an interrupt or NMI handler, even if that interrupt or
> + * NMI interrupted the CPU while it was deep within its idle loop.
> *
> * Make notrace because it can be called by the internal functions of
> * ftrace, and making this notrace removes unnecessary recursion calls.
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-20 9:47 ` Ze Gao
2023-05-21 3:58 ` Yonghong Song
@ 2023-05-21 8:08 ` Jiri Olsa
2023-05-21 10:09 ` Re: Masami Hiramatsu
1 sibling, 1 reply; 417+ messages in thread
From: Jiri Olsa @ 2023-05-21 8:08 UTC (permalink / raw)
To: Ze Gao
Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann, Hao Luo,
John Fastabend, KP Singh, Martin KaFai Lau, Masami Hiramatsu,
Song Liu, Stanislav Fomichev, Steven Rostedt, Yonghong Song, bpf,
linux-kernel, linux-trace-kernel, kafai, kpsingh, netdev,
paulmck, songliubraving, Ze Gao
On Sat, May 20, 2023 at 05:47:24PM +0800, Ze Gao wrote:
>
> Hi Jiri,
>
> Would you like to consider to add rcu_is_watching check in
> to solve this from the viewpoint of kprobe_multi_link_prog_run
I think this was discussed in here:
https://lore.kernel.org/bpf/20230321020103.13494-1-laoar.shao@gmail.com/
and was considered a bug, there's fix mentioned later in the thread
there's also this recent patchset:
https://lore.kernel.org/bpf/20230517034510.15639-3-zegao@tencent.com/
that solves related problems
> itself? And accounting of missed runs can be added as well
> to imporve observability.
right, we count fprobe->nmissed but it's not exposed, we should allow
to get 'missed' stats from both fprobe and kprobe_multi later, which
is missing now, will check
thanks,
jirka
>
> Regards,
> Ze
>
>
> -----------------
> From 29fd3cd713e65461325c2703cf5246a6fae5d4fe Mon Sep 17 00:00:00 2001
> From: Ze Gao <zegao@tencent.com>
> Date: Sat, 20 May 2023 17:32:05 +0800
> Subject: [PATCH] bpf: kprobe_multi runs bpf progs only when rcu_is_watching
>
> From the perspective of kprobe_multi_link_prog_run, any traceable
> functions can be attached while bpf progs need specical care and
> ought to be under rcu protection. To solve the likely rcu lockdep
> warns once for good, when (future) functions in idle path were
> attached accidentally, we better paying some cost to check at least
> in kernel-side, and return when rcu is not watching, which helps
> to avoid any unpredictable results.
>
> Signed-off-by: Ze Gao <zegao@tencent.com>
> ---
> kernel/trace/bpf_trace.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 9a050e36dc6c..3e6ea7274765 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -2622,7 +2622,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> struct bpf_run_ctx *old_run_ctx;
> int err;
>
> - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1 || !rcu_is_watching())) {
> err = 0;
> goto out;
> }
> --
> 2.40.1
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-21 8:08 ` Re: Jiri Olsa
@ 2023-05-21 10:09 ` Masami Hiramatsu
2023-05-21 14:19 ` Re: Ze Gao
0 siblings, 1 reply; 417+ messages in thread
From: Masami Hiramatsu @ 2023-05-21 10:09 UTC (permalink / raw)
To: Jiri Olsa
Cc: Ze Gao, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Hao Luo, John Fastabend, KP Singh, Martin KaFai Lau,
Masami Hiramatsu, Song Liu, Stanislav Fomichev, Steven Rostedt,
Yonghong Song, bpf, linux-kernel, linux-trace-kernel, kafai,
kpsingh, netdev, paulmck, songliubraving, Ze Gao
On Sun, 21 May 2023 10:08:46 +0200
Jiri Olsa <olsajiri@gmail.com> wrote:
> On Sat, May 20, 2023 at 05:47:24PM +0800, Ze Gao wrote:
> >
> > Hi Jiri,
> >
> > Would you like to consider to add rcu_is_watching check in
> > to solve this from the viewpoint of kprobe_multi_link_prog_run
>
> I think this was discussed in here:
> https://lore.kernel.org/bpf/20230321020103.13494-1-laoar.shao@gmail.com/
>
> and was considered a bug, there's fix mentioned later in the thread
>
> there's also this recent patchset:
> https://lore.kernel.org/bpf/20230517034510.15639-3-zegao@tencent.com/
>
> that solves related problems
I think this rcu_is_watching() is a bit different issue. This rcu_is_watching()
check is required if the kprobe_multi_link_prog_run() uses any RCU API.
E.g. rethook_try_get() is also checks rcu_is_watching() because it uses
call_rcu().
Thank you,
>
> > itself? And accounting of missed runs can be added as well
> > to imporve observability.
>
> right, we count fprobe->nmissed but it's not exposed, we should allow
> to get 'missed' stats from both fprobe and kprobe_multi later, which
> is missing now, will check
>
> thanks,
> jirka
>
> >
> > Regards,
> > Ze
> >
> >
> > -----------------
> > From 29fd3cd713e65461325c2703cf5246a6fae5d4fe Mon Sep 17 00:00:00 2001
> > From: Ze Gao <zegao@tencent.com>
> > Date: Sat, 20 May 2023 17:32:05 +0800
> > Subject: [PATCH] bpf: kprobe_multi runs bpf progs only when rcu_is_watching
> >
> > From the perspective of kprobe_multi_link_prog_run, any traceable
> > functions can be attached while bpf progs need specical care and
> > ought to be under rcu protection. To solve the likely rcu lockdep
> > warns once for good, when (future) functions in idle path were
> > attached accidentally, we better paying some cost to check at least
> > in kernel-side, and return when rcu is not watching, which helps
> > to avoid any unpredictable results.
> >
> > Signed-off-by: Ze Gao <zegao@tencent.com>
> > ---
> > kernel/trace/bpf_trace.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > index 9a050e36dc6c..3e6ea7274765 100644
> > --- a/kernel/trace/bpf_trace.c
> > +++ b/kernel/trace/bpf_trace.c
> > @@ -2622,7 +2622,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> > struct bpf_run_ctx *old_run_ctx;
> > int err;
> >
> > - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> > + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1 || !rcu_is_watching())) {
> > err = 0;
> > goto out;
> > }
> > --
> > 2.40.1
> >
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2023-05-21 10:09 ` Re: Masami Hiramatsu
@ 2023-05-21 14:19 ` Ze Gao
0 siblings, 0 replies; 417+ messages in thread
From: Ze Gao @ 2023-05-21 14:19 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Jiri Olsa, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
Hao Luo, John Fastabend, KP Singh, Martin KaFai Lau, Song Liu,
Stanislav Fomichev, Steven Rostedt, Yonghong Song, bpf,
linux-kernel, linux-trace-kernel, kafai, kpsingh, netdev,
paulmck, songliubraving, Ze Gao
On Sun, May 21, 2023 at 6:09 PM Masami Hiramatsu <mhiramat@kernel.org> wrote:
>
> On Sun, 21 May 2023 10:08:46 +0200
> Jiri Olsa <olsajiri@gmail.com> wrote:
>
> > On Sat, May 20, 2023 at 05:47:24PM +0800, Ze Gao wrote:
> > >
> > > Hi Jiri,
> > >
> > > Would you like to consider to add rcu_is_watching check in
> > > to solve this from the viewpoint of kprobe_multi_link_prog_run
> >
> > I think this was discussed in here:
> > https://lore.kernel.org/bpf/20230321020103.13494-1-laoar.shao@gmail.com/
> >
> > and was considered a bug, there's fix mentioned later in the thread
> >
> > there's also this recent patchset:
> > https://lore.kernel.org/bpf/20230517034510.15639-3-zegao@tencent.com/
> >
> > that solves related problems
>
> I think this rcu_is_watching() is a bit different issue. This rcu_is_watching()
> check is required if the kprobe_multi_link_prog_run() uses any RCU API.
> E.g. rethook_try_get() is also checks rcu_is_watching() because it uses
> call_rcu().
Yes, that's my point!
Regards,
Ze
>
> >
> > > itself? And accounting of missed runs can be added as well
> > > to imporve observability.
> >
> > right, we count fprobe->nmissed but it's not exposed, we should allow
> > to get 'missed' stats from both fprobe and kprobe_multi later, which
> > is missing now, will check
> >
> > thanks,
> > jirka
> >
> > >
> > > Regards,
> > > Ze
> > >
> > >
> > > -----------------
> > > From 29fd3cd713e65461325c2703cf5246a6fae5d4fe Mon Sep 17 00:00:00 2001
> > > From: Ze Gao <zegao@tencent.com>
> > > Date: Sat, 20 May 2023 17:32:05 +0800
> > > Subject: [PATCH] bpf: kprobe_multi runs bpf progs only when rcu_is_watching
> > >
> > > From the perspective of kprobe_multi_link_prog_run, any traceable
> > > functions can be attached while bpf progs need specical care and
> > > ought to be under rcu protection. To solve the likely rcu lockdep
> > > warns once for good, when (future) functions in idle path were
> > > attached accidentally, we better paying some cost to check at least
> > > in kernel-side, and return when rcu is not watching, which helps
> > > to avoid any unpredictable results.
> > >
> > > Signed-off-by: Ze Gao <zegao@tencent.com>
> > > ---
> > > kernel/trace/bpf_trace.c | 2 +-
> > > 1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > index 9a050e36dc6c..3e6ea7274765 100644
> > > --- a/kernel/trace/bpf_trace.c
> > > +++ b/kernel/trace/bpf_trace.c
> > > @@ -2622,7 +2622,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
> > > struct bpf_run_ctx *old_run_ctx;
> > > int err;
> > >
> > > - if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
> > > + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1 || !rcu_is_watching())) {
> > > err = 0;
> > > goto out;
> > > }
> > > --
> > > 2.40.1
> > >
>
>
> --
> Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <CANiq72k+5Rdj7i3Df2dcE6_OPYPXK3z5EWLKnY56sSMz4G3OvA@mail.gmail.com>]
* (no subject)
@ 2022-04-21 16:41 Yury Norov
2022-04-21 23:04 ` John Hubbard
0 siblings, 1 reply; 417+ messages in thread
From: Yury Norov @ 2022-04-21 16:41 UTC (permalink / raw)
To: Andrew Morton, Minchan Kim, John Hubbard, linux-mm, linux-kernel
Cc: Yury Norov
Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
API requires struct page **pages to be provided (not NULL). However,
the comment to pin_user_pages() says:
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
This patch fixes comments along the pin_user_pages code, and also adds
WARN_ON(!pages), so that API users will have better understanding
on how to use it.
It has been independently spotted by Minchan Kim and confirmed with
John Hubbard:
https://lore.kernel.org/all/YgWA0ghrrzHONehH@google.com/
Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
---
mm/gup.c | 26 ++++++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/mm/gup.c b/mm/gup.c
index f598a037eb04..559626457585 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2871,6 +2871,10 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ /* FOLL_PIN requires pages != NULL */
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -2893,6 +2897,10 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
*/
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return 0;
+
+ /* FOLL_PIN requires pages != NULL */
+ if (WARN_ON_ONCE(!pages))
+ return 0;
/*
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
@@ -2920,8 +2928,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
+ * Should be at least nr_pages long.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
@@ -2944,6 +2951,10 @@ long pin_user_pages_remote(struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ /* FOLL_PIN requires pages != NULL */
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
@@ -2957,8 +2968,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
+ * Should be at least nr_pages long.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
@@ -2976,6 +2986,10 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ /* FOLL_PIN requires pages != NULL */
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
@@ -2994,6 +3008,10 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ /* FOLL_PIN requires pages != NULL */
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
--
2.32.0
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2022-04-21 16:41 Yury Norov
@ 2022-04-21 23:04 ` John Hubbard
2022-04-21 23:09 ` Re: John Hubbard
2022-04-21 23:17 ` Re: Yury Norov
0 siblings, 2 replies; 417+ messages in thread
From: John Hubbard @ 2022-04-21 23:04 UTC (permalink / raw)
To: Yury Norov, Andrew Morton, Minchan Kim, linux-mm, linux-kernel
On 4/21/22 09:41, Yury Norov wrote:
> Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
>
Hi Yuri,
Thanks for picking this up. I have been distracted and didn't trust
myself to focus on this properly, so it's good to have help!
IT/admin point: somehow the first line of the commit description didn't
make it into an actual email subject. The subject line was blank when it
arrived in my inbox, and the subject is in the body here instead. Not
sure how that happened.
Maybe check your git-sendemail setup?
> pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
> API requires struct page **pages to be provided (not NULL). However,
> the comment to pin_user_pages() says:
>
> * @pages: array that receives pointers to the pages pinned.
> * Should be at least nr_pages long. Or NULL, if caller
> * only intends to ensure the pages are faulted in.
>
> This patch fixes comments along the pin_user_pages code, and also adds
> WARN_ON(!pages), so that API users will have better understanding
> on how to use it.
No need to quote the code in the commit log. Instead, just summarize.
For example:
pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
API requires struct page **pages to be provided (not NULL). However, the
comment to pin_user_pages() clearly allows for passing in a NULL @pages
argument.
Remove the incorrect comments, and add WARN_ON_ONCE(!pages) calls to
enforce the API.
>
> It has been independently spotted by Minchan Kim and confirmed with
> John Hubbard:
>
> https://lore.kernel.org/all/YgWA0ghrrzHONehH@google.com/
Let's add a Cc: line for Michan as well:
Cc: Minchan Kim <minchan@kernel.org>
>
> Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
> ---
> mm/gup.c | 26 ++++++++++++++++++++++----
> 1 file changed, 22 insertions(+), 4 deletions(-)
>
> diff --git a/mm/gup.c b/mm/gup.c
> index f598a037eb04..559626457585 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2871,6 +2871,10 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
> if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> return -EINVAL;
>
> + /* FOLL_PIN requires pages != NULL */
Please delete each and every one of these one-line comments, because
they merely echo what the code says.
> + if (WARN_ON_ONCE(!pages))
> + return -EINVAL;
> +
> gup_flags |= FOLL_PIN;
> return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
> }
> @@ -2893,6 +2897,10 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
> */
> if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> return 0;
> +
> + /* FOLL_PIN requires pages != NULL */
> + if (WARN_ON_ONCE(!pages))
> + return 0;
> /*
> * FOLL_FAST_ONLY is required in order to match the API description of
> * this routine: no fall back to regular ("slow") GUP.
> @@ -2920,8 +2928,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
> * @nr_pages: number of pages from start to pin
> * @gup_flags: flags modifying lookup behaviour
> * @pages: array that receives pointers to the pages pinned.
> - * Should be at least nr_pages long. Or NULL, if caller
> - * only intends to ensure the pages are faulted in.
> + * Should be at least nr_pages long.
> * @vmas: array of pointers to vmas corresponding to each page.
> * Or NULL if the caller does not require them.
> * @locked: pointer to lock flag indicating whether lock is held and
> @@ -2944,6 +2951,10 @@ long pin_user_pages_remote(struct mm_struct *mm,
> if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> return -EINVAL;
>
> + /* FOLL_PIN requires pages != NULL */
> + if (WARN_ON_ONCE(!pages))
> + return -EINVAL;
> +
> gup_flags |= FOLL_PIN;
> return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
> pages, vmas, locked);
> @@ -2957,8 +2968,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
> * @nr_pages: number of pages from start to pin
> * @gup_flags: flags modifying lookup behaviour
> * @pages: array that receives pointers to the pages pinned.
> - * Should be at least nr_pages long. Or NULL, if caller
> - * only intends to ensure the pages are faulted in.
> + * Should be at least nr_pages long.
> * @vmas: array of pointers to vmas corresponding to each page.
> * Or NULL if the caller does not require them.
> *
> @@ -2976,6 +2986,10 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
> if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> return -EINVAL;
>
> + /* FOLL_PIN requires pages != NULL */
> + if (WARN_ON_ONCE(!pages))
> + return -EINVAL;
> +
> gup_flags |= FOLL_PIN;
> return __gup_longterm_locked(current->mm, start, nr_pages,
> pages, vmas, gup_flags);
> @@ -2994,6 +3008,10 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> return -EINVAL;
>
> + /* FOLL_PIN requires pages != NULL */
> + if (WARN_ON_ONCE(!pages))
> + return -EINVAL;
> +
> gup_flags |= FOLL_PIN;
> return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
> }
I hope we don't break any callers with the newly enforced !pages, but it's
the right thing to do, in order to avoid misunderstandings.
thanks,
--
John Hubbard
NVIDIA
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-04-21 23:04 ` John Hubbard
@ 2022-04-21 23:09 ` John Hubbard
2022-04-21 23:17 ` Re: Yury Norov
1 sibling, 0 replies; 417+ messages in thread
From: John Hubbard @ 2022-04-21 23:09 UTC (permalink / raw)
To: Yury Norov, Andrew Morton, Minchan Kim, linux-mm, linux-kernel
On 4/21/22 16:04, John Hubbard wrote:
> On 4/21/22 09:41, Yury Norov wrote:
>> Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
>>
>
> Hi Yuri,
...and I see that I have typo'd both Yury's and Minchan's name (further
down), in the same email!
Really apologize for screwing that up. It's Yury-with-a-"y", I know. :)
thanks,
--
John Hubbard
NVIDIA
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-04-21 23:04 ` John Hubbard
2022-04-21 23:09 ` Re: John Hubbard
@ 2022-04-21 23:17 ` Yury Norov
2022-04-21 23:21 ` Re: John Hubbard
1 sibling, 1 reply; 417+ messages in thread
From: Yury Norov @ 2022-04-21 23:17 UTC (permalink / raw)
To: John Hubbard; +Cc: Andrew Morton, Minchan Kim, linux-mm, linux-kernel
On Thu, Apr 21, 2022 at 04:04:44PM -0700, John Hubbard wrote:
> On 4/21/22 09:41, Yury Norov wrote:
> > Subject: [PATCH] mm/gup: fix comments to pin_user_pages_*()
> >
>
> Hi Yuri,
>
> Thanks for picking this up. I have been distracted and didn't trust
> myself to focus on this properly, so it's good to have help!
>
> IT/admin point: somehow the first line of the commit description didn't
> make it into an actual email subject. The subject line was blank when it
> arrived in my inbox, and the subject is in the body here instead. Not
> sure how that happened.
>
> Maybe check your git-sendemail setup?
git-sendmail is OK. I just accidentally added empty line above Subject,
which broke format. My bad, sorry for this.
> > pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
> > API requires struct page **pages to be provided (not NULL). However,
> > the comment to pin_user_pages() says:
> >
> > * @pages: array that receives pointers to the pages pinned.
> > * Should be at least nr_pages long. Or NULL, if caller
> > * only intends to ensure the pages are faulted in.
> >
> > This patch fixes comments along the pin_user_pages code, and also adds
> > WARN_ON(!pages), so that API users will have better understanding
> > on how to use it.
>
> No need to quote the code in the commit log. Instead, just summarize.
> For example:
>
> pin_user_pages API forces FOLL_PIN in gup_flags, which means that the
> API requires struct page **pages to be provided (not NULL). However, the
> comment to pin_user_pages() clearly allows for passing in a NULL @pages
> argument.
>
> Remove the incorrect comments, and add WARN_ON_ONCE(!pages) calls to
> enforce the API.
>
> >
> > It has been independently spotted by Minchan Kim and confirmed with
> > John Hubbard:
> >
> > https://lore.kernel.org/all/YgWA0ghrrzHONehH@google.com/
>
> Let's add a Cc: line for Michan as well:
>
> Cc: Minchan Kim <minchan@kernel.org>
He's in CC already, I think...
> > Signed-off-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
> > ---
> > mm/gup.c | 26 ++++++++++++++++++++++----
> > 1 file changed, 22 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/gup.c b/mm/gup.c
> > index f598a037eb04..559626457585 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -2871,6 +2871,10 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
> > if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> > return -EINVAL;
> > + /* FOLL_PIN requires pages != NULL */
>
> Please delete each and every one of these one-line comments, because
> they merely echo what the code says.
Sure.
> > + if (WARN_ON_ONCE(!pages))
> > + return -EINVAL;
> > +
> > gup_flags |= FOLL_PIN;
> > return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
> > }
> > @@ -2893,6 +2897,10 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
> > */
> > if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> > return 0;
> > +
> > + /* FOLL_PIN requires pages != NULL */
> > + if (WARN_ON_ONCE(!pages))
> > + return 0;
> > /*
> > * FOLL_FAST_ONLY is required in order to match the API description of
> > * this routine: no fall back to regular ("slow") GUP.
> > @@ -2920,8 +2928,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
> > * @nr_pages: number of pages from start to pin
> > * @gup_flags: flags modifying lookup behaviour
> > * @pages: array that receives pointers to the pages pinned.
> > - * Should be at least nr_pages long. Or NULL, if caller
> > - * only intends to ensure the pages are faulted in.
> > + * Should be at least nr_pages long.
> > * @vmas: array of pointers to vmas corresponding to each page.
> > * Or NULL if the caller does not require them.
> > * @locked: pointer to lock flag indicating whether lock is held and
> > @@ -2944,6 +2951,10 @@ long pin_user_pages_remote(struct mm_struct *mm,
> > if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> > return -EINVAL;
> > + /* FOLL_PIN requires pages != NULL */
> > + if (WARN_ON_ONCE(!pages))
> > + return -EINVAL;
> > +
> > gup_flags |= FOLL_PIN;
> > return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
> > pages, vmas, locked);
> > @@ -2957,8 +2968,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
> > * @nr_pages: number of pages from start to pin
> > * @gup_flags: flags modifying lookup behaviour
> > * @pages: array that receives pointers to the pages pinned.
> > - * Should be at least nr_pages long. Or NULL, if caller
> > - * only intends to ensure the pages are faulted in.
> > + * Should be at least nr_pages long.
> > * @vmas: array of pointers to vmas corresponding to each page.
> > * Or NULL if the caller does not require them.
> > *
> > @@ -2976,6 +2986,10 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
> > if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> > return -EINVAL;
> > + /* FOLL_PIN requires pages != NULL */
> > + if (WARN_ON_ONCE(!pages))
> > + return -EINVAL;
> > +
> > gup_flags |= FOLL_PIN;
> > return __gup_longterm_locked(current->mm, start, nr_pages,
> > pages, vmas, gup_flags);
> > @@ -2994,6 +3008,10 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> > if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> > return -EINVAL;
> > + /* FOLL_PIN requires pages != NULL */
> > + if (WARN_ON_ONCE(!pages))
> > + return -EINVAL;
> > +
> > gup_flags |= FOLL_PIN;
> > return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
> > }
>
> I hope we don't break any callers with the newly enforced !pages, but it's
> the right thing to do, in order to avoid misunderstandings.
>
> thanks,
> --
> John Hubbard
> NVIDIA
Let me test v2 and resend shortly.
Thanks,
Yury
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-04-21 23:17 ` Re: Yury Norov
@ 2022-04-21 23:21 ` John Hubbard
0 siblings, 0 replies; 417+ messages in thread
From: John Hubbard @ 2022-04-21 23:21 UTC (permalink / raw)
To: Yury Norov; +Cc: Andrew Morton, Minchan Kim, linux-mm, linux-kernel
On 4/21/22 16:17, Yury Norov wrote:
>> Let's add a Cc: line for Michan as well:
>>
>> Cc: Minchan Kim <minchan@kernel.org>
>
> He's in CC already, I think...
>
Here, I am talking about attribution in the commit log, as opposed
to the email Cc. In other words, I'm suggesting that you literally
add this line to the commit description:
Cc: Minchan Kim <minchan@kernel.org>
thanks,
--
John Hubbard
NVIDIA
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2022-03-25 6:30 Michael S. Tsirkin
2022-03-25 7:52 ` Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-25 6:30 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, maz, tglx, peterz, sgarzare, keirf,
Paul E. McKenney
Bcc:
Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
Reply-To:
In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
>
> 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > This is a rework on the previous IRQ hardening that is done for
> > > virtio-pci where several drawbacks were found and were reverted:
> > >
> > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > that is used by some device such as virtio-blk
> > > 2) done only for PCI transport
> > >
> > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > by introducing a global irq_soft_enabled variable for each
> > > virtio_device. Then we can to toggle it during
> > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > the future, we may provide config_ops for the transport that doesn't
> > > use IRQ. With this, vring_interrupt() can return check and early if
> > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > but the cost should be acceptable.
> > Maybe it should be but is it? Can't we use synchronize_irq instead?
>
>
> Even if we allow the transport driver to synchornize through
> synchronize_irq() we still need a check in the vring_interrupt().
>
> We do something like the following previously:
>
> if (!READ_ONCE(vp_dev->intx_soft_enabled))
> return IRQ_NONE;
>
> But it looks like a bug since speculative read can be done before the check
> where the interrupt handler can't see the uncommitted setup which is done by
> the driver.
I don't think so - if you sync after setting the value then
you are guaranteed that any handler running afterwards
will see the new value.
Although I couldn't find anything about this in memory-barriers.txt
which surprises me.
CC Paul to help make sure I'm right.
>
> >
> > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > module parameter is introduced to enable the hardening so function
> > > hardening is disabled by default.
> > Which devices are these? How come they send an interrupt before there
> > are any buffers in any queues?
>
>
> I copied this from the commit log for 22b7050a024d7
>
> "
>
> This change will also benefit old hypervisors (before 2009)
> that send interrupts without checking DRIVER_OK: previously,
> the callback could race with driver-specific initialization.
> "
>
> If this is only for config interrupt, I can remove the above log.
This is only for config interrupt.
>
> >
> > > Note that the hardening is only done for vring interrupt since the
> > > config interrupt hardening is already done in commit 22b7050a024d7
> > > ("virtio: defer config changed notifications"). But the method that is
> > > used by config interrupt can't be reused by the vring interrupt
> > > handler because it uses spinlock to do the synchronization which is
> > > expensive.
> > >
> > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> >
> > > ---
> > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > include/linux/virtio.h | 4 ++++
> > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > index 8dde44ea044a..85e331efa9cc 100644
> > > --- a/drivers/virtio/virtio.c
> > > +++ b/drivers/virtio/virtio.c
> > > @@ -7,6 +7,12 @@
> > > #include <linux/of.h>
> > > #include <uapi/linux/virtio_ids.h>
> > > +static bool irq_hardening = false;
> > > +
> > > +module_param(irq_hardening, bool, 0444);
> > > +MODULE_PARM_DESC(irq_hardening,
> > > + "Disalbe IRQ software processing when it is not expected");
> > > +
> > > /* Unique numbering for virtio devices. */
> > > static DEFINE_IDA(virtio_index_ida);
> > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > * */
> > > void virtio_reset_device(struct virtio_device *dev)
> > > {
> > > + /*
> > > + * The below synchronize_rcu() guarantees that any
> > > + * interrupt for this line arriving after
> > > + * synchronize_rcu() has completed is guaranteed to see
> > > + * irq_soft_enabled == false.
> > News to me I did not know synchronize_rcu has anything to do
> > with interrupts. Did not you intend to use synchronize_irq?
> > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > though it's most likely is ...
>
>
> According to the comment above tree RCU version of synchronize_rcu():
>
> """
>
> * RCU read-side critical sections are delimited by rcu_read_lock()
> * and rcu_read_unlock(), and may be nested. In addition, but only in
> * v5.0 and later, regions of code across which interrupts, preemption,
> * or softirqs have been disabled also serve as RCU read-side critical
> * sections. This includes hardware interrupt handlers, softirq handlers,
> * and NMI handlers.
> """
>
> So interrupt handlers are treated as read-side critical sections.
>
> And it has the comment for explain the barrier:
>
> """
>
> * Note that this guarantee implies further memory-ordering guarantees.
> * On systems with more than one CPU, when synchronize_rcu() returns,
> * each CPU is guaranteed to have executed a full memory barrier since
> * the end of its last RCU read-side critical section whose beginning
> * preceded the call to synchronize_rcu(). In addition, each CPU having
> """
>
> So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> barrier, if the interrupt come after WRITE_ONCE() it will see the
> irq_soft_enabled as false.
>
You are right. So then
1. I do not think we need load_acquire - why is it needed? Just
READ_ONCE should do.
2. isn't synchronize_irq also doing the same thing?
> >
> > > + */
> > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > + synchronize_rcu();
> > > +
> > > dev->config->reset(dev);
> > > }
> > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > Please add comment explaining where it will be enabled.
> > Also, we *really* don't need to synch if it was already disabled,
> > let's not add useless overhead to the boot sequence.
>
>
> Ok.
>
>
> >
> >
> > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > spin_lock_init(&dev->config_lock);
> > > dev->config_enabled = false;
> > > dev->config_change_pending = false;
> > > + dev->irq_soft_check = irq_hardening;
> > > +
> > > + if (dev->irq_soft_check)
> > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > /* We always start by resetting the device, in case a previous
> > > * driver messed it up. This also tests that code path a little. */
> > one of the points of hardening is it's also helpful for buggy
> > devices. this flag defeats the purpose.
>
>
> Do you mean:
>
> 1) we need something like config_enable? This seems not easy to be
> implemented without obvious overhead, mainly the synchronize with the
> interrupt handlers
But synchronize is only on tear-down path. That is not critical for any
users at the moment, even less than probe.
> 2) enable this by default, so I don't object, but this may have some risk
> for old hypervisors
The risk if there's a driver adding buffers without setting DRIVER_OK.
So with this approach, how about we rename the flag "driver_ok"?
And then add_buf can actually test it and BUG_ON if not there (at least
in the debug build).
And going down from there, how about we cache status in the
device? Then we don't need to keep re-reading it every time,
speeding boot up a tiny bit.
>
> >
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 962f1477b1fa..0170f8c784d8 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > }
> > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > {
> > > + struct virtqueue *_vq = v;
> > > + struct virtio_device *vdev = _vq->vdev;
> > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > + return IRQ_NONE;
> > > + }
> > > +
> > > if (!more_used(vq)) {
> > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > return IRQ_NONE;
> > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > index 5464f398912a..957d6ad604ac 100644
> > > --- a/include/linux/virtio.h
> > > +++ b/include/linux/virtio.h
> > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > * @config_enabled: configuration change reporting enabled
> > > * @config_change_pending: configuration change reported while disabled
> > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > + * @irq_soft_enabled: callbacks enabled
> > > * @config_lock: protects configuration change reporting
> > > * @dev: underlying device.
> > > * @id: the device type identification (used to match it with a driver).
> > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > bool failed;
> > > bool config_enabled;
> > > bool config_change_pending;
> > > + bool irq_soft_check;
> > > + bool irq_soft_enabled;
> > > spinlock_t config_lock;
> > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > struct device dev;
> > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > index dafdc7f48c01..9c1b61f2e525 100644
> > > --- a/include/linux/virtio_config.h
> > > +++ b/include/linux/virtio_config.h
> > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > return __virtio_test_bit(vdev, fbit);
> > > }
> > > +/*
> > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > + * @vdev: the device
> > > + */
> > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > +{
> > > + if (!vdev->irq_soft_check)
> > > + return true;
> > > +
> > > + /*
> > > + * Read irq_soft_enabled before reading other device specific
> > > + * data. Paried with smp_store_relase() in
> > paired
>
>
> Will fix.
>
> Thanks
>
>
> >
> > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > + * virtio_reset_device().
> > > + */
> > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > +}
> > > +
> > > /**
> > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > * @vdev: the device
> > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > if (dev->config->enable_cbs)
> > > dev->config->enable_cbs(dev);
> > > + /*
> > > + * Commit the driver setup before enabling the virtqueue
> > > + * callbacks. Paried with smp_load_acuqire() in
> > > + * virtio_irq_soft_enabled()
> > > + */
> > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > +
> > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > }
> > > --
> > > 2.25.1
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-25 6:30 Michael S. Tsirkin
@ 2022-03-25 7:52 ` Jason Wang
2022-03-25 9:10 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-25 7:52 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> Bcc:
> Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> Reply-To:
> In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
>
> On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> >
> > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > This is a rework on the previous IRQ hardening that is done for
> > > > virtio-pci where several drawbacks were found and were reverted:
> > > >
> > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > that is used by some device such as virtio-blk
> > > > 2) done only for PCI transport
> > > >
> > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > by introducing a global irq_soft_enabled variable for each
> > > > virtio_device. Then we can to toggle it during
> > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > the future, we may provide config_ops for the transport that doesn't
> > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > but the cost should be acceptable.
> > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> >
> >
> > Even if we allow the transport driver to synchornize through
> > synchronize_irq() we still need a check in the vring_interrupt().
> >
> > We do something like the following previously:
> >
> > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > return IRQ_NONE;
> >
> > But it looks like a bug since speculative read can be done before the check
> > where the interrupt handler can't see the uncommitted setup which is done by
> > the driver.
>
> I don't think so - if you sync after setting the value then
> you are guaranteed that any handler running afterwards
> will see the new value.
The problem is not disabled but the enable. We use smp_store_relase()
to make sure the driver commits the setup before enabling the irq. It
means the read needs to be ordered as well in vring_interrupt().
>
> Although I couldn't find anything about this in memory-barriers.txt
> which surprises me.
>
> CC Paul to help make sure I'm right.
>
>
> >
> > >
> > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > module parameter is introduced to enable the hardening so function
> > > > hardening is disabled by default.
> > > Which devices are these? How come they send an interrupt before there
> > > are any buffers in any queues?
> >
> >
> > I copied this from the commit log for 22b7050a024d7
> >
> > "
> >
> > This change will also benefit old hypervisors (before 2009)
> > that send interrupts without checking DRIVER_OK: previously,
> > the callback could race with driver-specific initialization.
> > "
> >
> > If this is only for config interrupt, I can remove the above log.
>
>
> This is only for config interrupt.
Ok.
>
> >
> > >
> > > > Note that the hardening is only done for vring interrupt since the
> > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > ("virtio: defer config changed notifications"). But the method that is
> > > > used by config interrupt can't be reused by the vring interrupt
> > > > handler because it uses spinlock to do the synchronization which is
> > > > expensive.
> > > >
> > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > >
> > > > ---
> > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > include/linux/virtio.h | 4 ++++
> > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > --- a/drivers/virtio/virtio.c
> > > > +++ b/drivers/virtio/virtio.c
> > > > @@ -7,6 +7,12 @@
> > > > #include <linux/of.h>
> > > > #include <uapi/linux/virtio_ids.h>
> > > > +static bool irq_hardening = false;
> > > > +
> > > > +module_param(irq_hardening, bool, 0444);
> > > > +MODULE_PARM_DESC(irq_hardening,
> > > > + "Disalbe IRQ software processing when it is not expected");
> > > > +
> > > > /* Unique numbering for virtio devices. */
> > > > static DEFINE_IDA(virtio_index_ida);
> > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > * */
> > > > void virtio_reset_device(struct virtio_device *dev)
> > > > {
> > > > + /*
> > > > + * The below synchronize_rcu() guarantees that any
> > > > + * interrupt for this line arriving after
> > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > + * irq_soft_enabled == false.
> > > News to me I did not know synchronize_rcu has anything to do
> > > with interrupts. Did not you intend to use synchronize_irq?
> > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > though it's most likely is ...
> >
> >
> > According to the comment above tree RCU version of synchronize_rcu():
> >
> > """
> >
> > * RCU read-side critical sections are delimited by rcu_read_lock()
> > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > * v5.0 and later, regions of code across which interrupts, preemption,
> > * or softirqs have been disabled also serve as RCU read-side critical
> > * sections. This includes hardware interrupt handlers, softirq handlers,
> > * and NMI handlers.
> > """
> >
> > So interrupt handlers are treated as read-side critical sections.
> >
> > And it has the comment for explain the barrier:
> >
> > """
> >
> > * Note that this guarantee implies further memory-ordering guarantees.
> > * On systems with more than one CPU, when synchronize_rcu() returns,
> > * each CPU is guaranteed to have executed a full memory barrier since
> > * the end of its last RCU read-side critical section whose beginning
> > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > """
> >
> > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > irq_soft_enabled as false.
> >
>
> You are right. So then
> 1. I do not think we need load_acquire - why is it needed? Just
> READ_ONCE should do.
See above.
> 2. isn't synchronize_irq also doing the same thing?
Yes, but it requires a config ops since the IRQ knowledge is transport specific.
>
>
> > >
> > > > + */
> > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > + synchronize_rcu();
> > > > +
> > > > dev->config->reset(dev);
> > > > }
> > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > Please add comment explaining where it will be enabled.
> > > Also, we *really* don't need to synch if it was already disabled,
> > > let's not add useless overhead to the boot sequence.
> >
> >
> > Ok.
> >
> >
> > >
> > >
> > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > spin_lock_init(&dev->config_lock);
> > > > dev->config_enabled = false;
> > > > dev->config_change_pending = false;
> > > > + dev->irq_soft_check = irq_hardening;
> > > > +
> > > > + if (dev->irq_soft_check)
> > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > /* We always start by resetting the device, in case a previous
> > > > * driver messed it up. This also tests that code path a little. */
> > > one of the points of hardening is it's also helpful for buggy
> > > devices. this flag defeats the purpose.
> >
> >
> > Do you mean:
> >
> > 1) we need something like config_enable? This seems not easy to be
> > implemented without obvious overhead, mainly the synchronize with the
> > interrupt handlers
>
> But synchronize is only on tear-down path. That is not critical for any
> users at the moment, even less than probe.
I meant if we have vq->irq_pending, we need to call vring_interrupt()
in the virtio_device_ready() and synchronize the IRQ handlers with
spinlock or others.
>
> > 2) enable this by default, so I don't object, but this may have some risk
> > for old hypervisors
>
>
> The risk if there's a driver adding buffers without setting DRIVER_OK.
Probably not, we have devices that accept random inputs from outside,
net, console, input etc. I've done a round of audits of the Qemu
codes. They look all fine since day0.
> So with this approach, how about we rename the flag "driver_ok"?
> And then add_buf can actually test it and BUG_ON if not there (at least
> in the debug build).
This looks like a hardening of the driver in the core instead of the
device. I think it can be done but in a separate series.
>
> And going down from there, how about we cache status in the
> device? Then we don't need to keep re-reading it every time,
> speeding boot up a tiny bit.
I don't fully understand here, actually spec requires status to be
read back for validation in many cases.
Thanks
>
> >
> > >
> > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > --- a/drivers/virtio/virtio_ring.c
> > > > +++ b/drivers/virtio/virtio_ring.c
> > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > }
> > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > {
> > > > + struct virtqueue *_vq = v;
> > > > + struct virtio_device *vdev = _vq->vdev;
> > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > + return IRQ_NONE;
> > > > + }
> > > > +
> > > > if (!more_used(vq)) {
> > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > return IRQ_NONE;
> > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > index 5464f398912a..957d6ad604ac 100644
> > > > --- a/include/linux/virtio.h
> > > > +++ b/include/linux/virtio.h
> > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > * @config_enabled: configuration change reporting enabled
> > > > * @config_change_pending: configuration change reported while disabled
> > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > + * @irq_soft_enabled: callbacks enabled
> > > > * @config_lock: protects configuration change reporting
> > > > * @dev: underlying device.
> > > > * @id: the device type identification (used to match it with a driver).
> > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > bool failed;
> > > > bool config_enabled;
> > > > bool config_change_pending;
> > > > + bool irq_soft_check;
> > > > + bool irq_soft_enabled;
> > > > spinlock_t config_lock;
> > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > struct device dev;
> > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > --- a/include/linux/virtio_config.h
> > > > +++ b/include/linux/virtio_config.h
> > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > return __virtio_test_bit(vdev, fbit);
> > > > }
> > > > +/*
> > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > + * @vdev: the device
> > > > + */
> > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > +{
> > > > + if (!vdev->irq_soft_check)
> > > > + return true;
> > > > +
> > > > + /*
> > > > + * Read irq_soft_enabled before reading other device specific
> > > > + * data. Paried with smp_store_relase() in
> > > paired
> >
> >
> > Will fix.
> >
> > Thanks
> >
> >
> > >
> > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > + * virtio_reset_device().
> > > > + */
> > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > +}
> > > > +
> > > > /**
> > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > * @vdev: the device
> > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > if (dev->config->enable_cbs)
> > > > dev->config->enable_cbs(dev);
> > > > + /*
> > > > + * Commit the driver setup before enabling the virtqueue
> > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > + * virtio_irq_soft_enabled()
> > > > + */
> > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > +
> > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > }
> > > > --
> > > > 2.25.1
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-25 7:52 ` Jason Wang
@ 2022-03-25 9:10 ` Michael S. Tsirkin
2022-03-25 9:20 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-25 9:10 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > Bcc:
> > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > Reply-To:
> > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> >
> > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > >
> > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > >
> > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > that is used by some device such as virtio-blk
> > > > > 2) done only for PCI transport
> > > > >
> > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > by introducing a global irq_soft_enabled variable for each
> > > > > virtio_device. Then we can to toggle it during
> > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > but the cost should be acceptable.
> > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > >
> > >
> > > Even if we allow the transport driver to synchornize through
> > > synchronize_irq() we still need a check in the vring_interrupt().
> > >
> > > We do something like the following previously:
> > >
> > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > return IRQ_NONE;
> > >
> > > But it looks like a bug since speculative read can be done before the check
> > > where the interrupt handler can't see the uncommitted setup which is done by
> > > the driver.
> >
> > I don't think so - if you sync after setting the value then
> > you are guaranteed that any handler running afterwards
> > will see the new value.
>
> The problem is not disabled but the enable.
So a misbehaving device can lose interrupts? That's not a problem at all
imo.
> We use smp_store_relase()
> to make sure the driver commits the setup before enabling the irq. It
> means the read needs to be ordered as well in vring_interrupt().
>
> >
> > Although I couldn't find anything about this in memory-barriers.txt
> > which surprises me.
> >
> > CC Paul to help make sure I'm right.
> >
> >
> > >
> > > >
> > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > module parameter is introduced to enable the hardening so function
> > > > > hardening is disabled by default.
> > > > Which devices are these? How come they send an interrupt before there
> > > > are any buffers in any queues?
> > >
> > >
> > > I copied this from the commit log for 22b7050a024d7
> > >
> > > "
> > >
> > > This change will also benefit old hypervisors (before 2009)
> > > that send interrupts without checking DRIVER_OK: previously,
> > > the callback could race with driver-specific initialization.
> > > "
> > >
> > > If this is only for config interrupt, I can remove the above log.
> >
> >
> > This is only for config interrupt.
>
> Ok.
>
> >
> > >
> > > >
> > > > > Note that the hardening is only done for vring interrupt since the
> > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > handler because it uses spinlock to do the synchronization which is
> > > > > expensive.
> > > > >
> > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > >
> > > > > ---
> > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > include/linux/virtio.h | 4 ++++
> > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > --- a/drivers/virtio/virtio.c
> > > > > +++ b/drivers/virtio/virtio.c
> > > > > @@ -7,6 +7,12 @@
> > > > > #include <linux/of.h>
> > > > > #include <uapi/linux/virtio_ids.h>
> > > > > +static bool irq_hardening = false;
> > > > > +
> > > > > +module_param(irq_hardening, bool, 0444);
> > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > +
> > > > > /* Unique numbering for virtio devices. */
> > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > * */
> > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > {
> > > > > + /*
> > > > > + * The below synchronize_rcu() guarantees that any
> > > > > + * interrupt for this line arriving after
> > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > + * irq_soft_enabled == false.
> > > > News to me I did not know synchronize_rcu has anything to do
> > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > though it's most likely is ...
> > >
> > >
> > > According to the comment above tree RCU version of synchronize_rcu():
> > >
> > > """
> > >
> > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > * or softirqs have been disabled also serve as RCU read-side critical
> > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > * and NMI handlers.
> > > """
> > >
> > > So interrupt handlers are treated as read-side critical sections.
> > >
> > > And it has the comment for explain the barrier:
> > >
> > > """
> > >
> > > * Note that this guarantee implies further memory-ordering guarantees.
> > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > * each CPU is guaranteed to have executed a full memory barrier since
> > > * the end of its last RCU read-side critical section whose beginning
> > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > """
> > >
> > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > irq_soft_enabled as false.
> > >
> >
> > You are right. So then
> > 1. I do not think we need load_acquire - why is it needed? Just
> > READ_ONCE should do.
>
> See above.
>
> > 2. isn't synchronize_irq also doing the same thing?
>
>
> Yes, but it requires a config ops since the IRQ knowledge is transport specific.
>
> >
> >
> > > >
> > > > > + */
> > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > + synchronize_rcu();
> > > > > +
> > > > > dev->config->reset(dev);
> > > > > }
> > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > Please add comment explaining where it will be enabled.
> > > > Also, we *really* don't need to synch if it was already disabled,
> > > > let's not add useless overhead to the boot sequence.
> > >
> > >
> > > Ok.
> > >
> > >
> > > >
> > > >
> > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > spin_lock_init(&dev->config_lock);
> > > > > dev->config_enabled = false;
> > > > > dev->config_change_pending = false;
> > > > > + dev->irq_soft_check = irq_hardening;
> > > > > +
> > > > > + if (dev->irq_soft_check)
> > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > /* We always start by resetting the device, in case a previous
> > > > > * driver messed it up. This also tests that code path a little. */
> > > > one of the points of hardening is it's also helpful for buggy
> > > > devices. this flag defeats the purpose.
> > >
> > >
> > > Do you mean:
> > >
> > > 1) we need something like config_enable? This seems not easy to be
> > > implemented without obvious overhead, mainly the synchronize with the
> > > interrupt handlers
> >
> > But synchronize is only on tear-down path. That is not critical for any
> > users at the moment, even less than probe.
>
> I meant if we have vq->irq_pending, we need to call vring_interrupt()
> in the virtio_device_ready() and synchronize the IRQ handlers with
> spinlock or others.
>
> >
> > > 2) enable this by default, so I don't object, but this may have some risk
> > > for old hypervisors
> >
> >
> > The risk if there's a driver adding buffers without setting DRIVER_OK.
>
> Probably not, we have devices that accept random inputs from outside,
> net, console, input etc. I've done a round of audits of the Qemu
> codes. They look all fine since day0.
>
> > So with this approach, how about we rename the flag "driver_ok"?
> > And then add_buf can actually test it and BUG_ON if not there (at least
> > in the debug build).
>
> This looks like a hardening of the driver in the core instead of the
> device. I think it can be done but in a separate series.
>
> >
> > And going down from there, how about we cache status in the
> > device? Then we don't need to keep re-reading it every time,
> > speeding boot up a tiny bit.
>
> I don't fully understand here, actually spec requires status to be
> read back for validation in many cases.
>
> Thanks
>
> >
> > >
> > > >
> > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > }
> > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > {
> > > > > + struct virtqueue *_vq = v;
> > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > + return IRQ_NONE;
> > > > > + }
> > > > > +
> > > > > if (!more_used(vq)) {
> > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > return IRQ_NONE;
> > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > --- a/include/linux/virtio.h
> > > > > +++ b/include/linux/virtio.h
> > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > * @config_enabled: configuration change reporting enabled
> > > > > * @config_change_pending: configuration change reported while disabled
> > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > * @config_lock: protects configuration change reporting
> > > > > * @dev: underlying device.
> > > > > * @id: the device type identification (used to match it with a driver).
> > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > bool failed;
> > > > > bool config_enabled;
> > > > > bool config_change_pending;
> > > > > + bool irq_soft_check;
> > > > > + bool irq_soft_enabled;
> > > > > spinlock_t config_lock;
> > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > struct device dev;
> > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > --- a/include/linux/virtio_config.h
> > > > > +++ b/include/linux/virtio_config.h
> > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > return __virtio_test_bit(vdev, fbit);
> > > > > }
> > > > > +/*
> > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > + * @vdev: the device
> > > > > + */
> > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > +{
> > > > > + if (!vdev->irq_soft_check)
> > > > > + return true;
> > > > > +
> > > > > + /*
> > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > + * data. Paried with smp_store_relase() in
> > > > paired
> > >
> > >
> > > Will fix.
> > >
> > > Thanks
> > >
> > >
> > > >
> > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > + * virtio_reset_device().
> > > > > + */
> > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > +}
> > > > > +
> > > > > /**
> > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > * @vdev: the device
> > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > if (dev->config->enable_cbs)
> > > > > dev->config->enable_cbs(dev);
> > > > > + /*
> > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > + * virtio_irq_soft_enabled()
> > > > > + */
> > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > +
> > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > }
> > > > > --
> > > > > 2.25.1
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-25 9:10 ` Re: Michael S. Tsirkin
@ 2022-03-25 9:20 ` Jason Wang
2022-03-25 10:09 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-25 9:20 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > Bcc:
> > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > Reply-To:
> > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > >
> > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > >
> > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > >
> > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > that is used by some device such as virtio-blk
> > > > > > 2) done only for PCI transport
> > > > > >
> > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > virtio_device. Then we can to toggle it during
> > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > but the cost should be acceptable.
> > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > >
> > > >
> > > > Even if we allow the transport driver to synchornize through
> > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > >
> > > > We do something like the following previously:
> > > >
> > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > return IRQ_NONE;
> > > >
> > > > But it looks like a bug since speculative read can be done before the check
> > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > the driver.
> > >
> > > I don't think so - if you sync after setting the value then
> > > you are guaranteed that any handler running afterwards
> > > will see the new value.
> >
> > The problem is not disabled but the enable.
>
> So a misbehaving device can lose interrupts? That's not a problem at all
> imo.
It's the interrupt raised before setting irq_soft_enabled to true:
CPU 0 probe) driver specific setup (not commited)
CPU 1 IRQ handler) read the uninitialized variable
CPU 0 probe) set irq_soft_enabled to true
CPU 1 IRQ handler) read irq_soft_enable as true
CPU 1 IRQ handler) use the uninitialized variable
Thanks
>
> > We use smp_store_relase()
> > to make sure the driver commits the setup before enabling the irq. It
> > means the read needs to be ordered as well in vring_interrupt().
> >
> > >
> > > Although I couldn't find anything about this in memory-barriers.txt
> > > which surprises me.
> > >
> > > CC Paul to help make sure I'm right.
> > >
> > >
> > > >
> > > > >
> > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > module parameter is introduced to enable the hardening so function
> > > > > > hardening is disabled by default.
> > > > > Which devices are these? How come they send an interrupt before there
> > > > > are any buffers in any queues?
> > > >
> > > >
> > > > I copied this from the commit log for 22b7050a024d7
> > > >
> > > > "
> > > >
> > > > This change will also benefit old hypervisors (before 2009)
> > > > that send interrupts without checking DRIVER_OK: previously,
> > > > the callback could race with driver-specific initialization.
> > > > "
> > > >
> > > > If this is only for config interrupt, I can remove the above log.
> > >
> > >
> > > This is only for config interrupt.
> >
> > Ok.
> >
> > >
> > > >
> > > > >
> > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > expensive.
> > > > > >
> > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > >
> > > > > > ---
> > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > include/linux/virtio.h | 4 ++++
> > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > >
> > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > --- a/drivers/virtio/virtio.c
> > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > @@ -7,6 +7,12 @@
> > > > > > #include <linux/of.h>
> > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > +static bool irq_hardening = false;
> > > > > > +
> > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > +
> > > > > > /* Unique numbering for virtio devices. */
> > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > * */
> > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > {
> > > > > > + /*
> > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > + * interrupt for this line arriving after
> > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > + * irq_soft_enabled == false.
> > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > though it's most likely is ...
> > > >
> > > >
> > > > According to the comment above tree RCU version of synchronize_rcu():
> > > >
> > > > """
> > > >
> > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > * and NMI handlers.
> > > > """
> > > >
> > > > So interrupt handlers are treated as read-side critical sections.
> > > >
> > > > And it has the comment for explain the barrier:
> > > >
> > > > """
> > > >
> > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > * the end of its last RCU read-side critical section whose beginning
> > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > """
> > > >
> > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > irq_soft_enabled as false.
> > > >
> > >
> > > You are right. So then
> > > 1. I do not think we need load_acquire - why is it needed? Just
> > > READ_ONCE should do.
> >
> > See above.
> >
> > > 2. isn't synchronize_irq also doing the same thing?
> >
> >
> > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> >
> > >
> > >
> > > > >
> > > > > > + */
> > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > + synchronize_rcu();
> > > > > > +
> > > > > > dev->config->reset(dev);
> > > > > > }
> > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > Please add comment explaining where it will be enabled.
> > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > let's not add useless overhead to the boot sequence.
> > > >
> > > >
> > > > Ok.
> > > >
> > > >
> > > > >
> > > > >
> > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > spin_lock_init(&dev->config_lock);
> > > > > > dev->config_enabled = false;
> > > > > > dev->config_change_pending = false;
> > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > +
> > > > > > + if (dev->irq_soft_check)
> > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > /* We always start by resetting the device, in case a previous
> > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > one of the points of hardening is it's also helpful for buggy
> > > > > devices. this flag defeats the purpose.
> > > >
> > > >
> > > > Do you mean:
> > > >
> > > > 1) we need something like config_enable? This seems not easy to be
> > > > implemented without obvious overhead, mainly the synchronize with the
> > > > interrupt handlers
> > >
> > > But synchronize is only on tear-down path. That is not critical for any
> > > users at the moment, even less than probe.
> >
> > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > in the virtio_device_ready() and synchronize the IRQ handlers with
> > spinlock or others.
> >
> > >
> > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > for old hypervisors
> > >
> > >
> > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> >
> > Probably not, we have devices that accept random inputs from outside,
> > net, console, input etc. I've done a round of audits of the Qemu
> > codes. They look all fine since day0.
> >
> > > So with this approach, how about we rename the flag "driver_ok"?
> > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > in the debug build).
> >
> > This looks like a hardening of the driver in the core instead of the
> > device. I think it can be done but in a separate series.
> >
> > >
> > > And going down from there, how about we cache status in the
> > > device? Then we don't need to keep re-reading it every time,
> > > speeding boot up a tiny bit.
> >
> > I don't fully understand here, actually spec requires status to be
> > read back for validation in many cases.
> >
> > Thanks
> >
> > >
> > > >
> > > > >
> > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > }
> > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > {
> > > > > > + struct virtqueue *_vq = v;
> > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > + return IRQ_NONE;
> > > > > > + }
> > > > > > +
> > > > > > if (!more_used(vq)) {
> > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > return IRQ_NONE;
> > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > --- a/include/linux/virtio.h
> > > > > > +++ b/include/linux/virtio.h
> > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > * @config_lock: protects configuration change reporting
> > > > > > * @dev: underlying device.
> > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > bool failed;
> > > > > > bool config_enabled;
> > > > > > bool config_change_pending;
> > > > > > + bool irq_soft_check;
> > > > > > + bool irq_soft_enabled;
> > > > > > spinlock_t config_lock;
> > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > struct device dev;
> > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > --- a/include/linux/virtio_config.h
> > > > > > +++ b/include/linux/virtio_config.h
> > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > }
> > > > > > +/*
> > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > + * @vdev: the device
> > > > > > + */
> > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > +{
> > > > > > + if (!vdev->irq_soft_check)
> > > > > > + return true;
> > > > > > +
> > > > > > + /*
> > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > + * data. Paried with smp_store_relase() in
> > > > > paired
> > > >
> > > >
> > > > Will fix.
> > > >
> > > > Thanks
> > > >
> > > >
> > > > >
> > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > + * virtio_reset_device().
> > > > > > + */
> > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > +}
> > > > > > +
> > > > > > /**
> > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > * @vdev: the device
> > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > if (dev->config->enable_cbs)
> > > > > > dev->config->enable_cbs(dev);
> > > > > > + /*
> > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > + * virtio_irq_soft_enabled()
> > > > > > + */
> > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > +
> > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > }
> > > > > > --
> > > > > > 2.25.1
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-25 9:20 ` Re: Jason Wang
@ 2022-03-25 10:09 ` Michael S. Tsirkin
2022-03-28 4:56 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-25 10:09 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > Bcc:
> > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > Reply-To:
> > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > >
> > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > >
> > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > >
> > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > that is used by some device such as virtio-blk
> > > > > > > 2) done only for PCI transport
> > > > > > >
> > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > but the cost should be acceptable.
> > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > >
> > > > >
> > > > > Even if we allow the transport driver to synchornize through
> > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > >
> > > > > We do something like the following previously:
> > > > >
> > > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > return IRQ_NONE;
> > > > >
> > > > > But it looks like a bug since speculative read can be done before the check
> > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > the driver.
> > > >
> > > > I don't think so - if you sync after setting the value then
> > > > you are guaranteed that any handler running afterwards
> > > > will see the new value.
> > >
> > > The problem is not disabled but the enable.
> >
> > So a misbehaving device can lose interrupts? That's not a problem at all
> > imo.
>
> It's the interrupt raised before setting irq_soft_enabled to true:
>
> CPU 0 probe) driver specific setup (not commited)
> CPU 1 IRQ handler) read the uninitialized variable
> CPU 0 probe) set irq_soft_enabled to true
> CPU 1 IRQ handler) read irq_soft_enable as true
> CPU 1 IRQ handler) use the uninitialized variable
>
> Thanks
Yea, it hurts if you do it. So do not do it then ;).
irq_soft_enabled (I think driver_ok or status is a better name)
should be initialized to false *before* irq is requested.
And requesting irq commits all memory otherwise all drivers would be
broken, if it doesn't it just needs to be fixed, not worked around in
virtio.
> >
> > > We use smp_store_relase()
> > > to make sure the driver commits the setup before enabling the irq. It
> > > means the read needs to be ordered as well in vring_interrupt().
> > >
> > > >
> > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > which surprises me.
> > > >
> > > > CC Paul to help make sure I'm right.
> > > >
> > > >
> > > > >
> > > > > >
> > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > hardening is disabled by default.
> > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > are any buffers in any queues?
> > > > >
> > > > >
> > > > > I copied this from the commit log for 22b7050a024d7
> > > > >
> > > > > "
> > > > >
> > > > > This change will also benefit old hypervisors (before 2009)
> > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > the callback could race with driver-specific initialization.
> > > > > "
> > > > >
> > > > > If this is only for config interrupt, I can remove the above log.
> > > >
> > > >
> > > > This is only for config interrupt.
> > >
> > > Ok.
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > expensive.
> > > > > > >
> > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > >
> > > > > > > ---
> > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > @@ -7,6 +7,12 @@
> > > > > > > #include <linux/of.h>
> > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > +static bool irq_hardening = false;
> > > > > > > +
> > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > +
> > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > * */
> > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > {
> > > > > > > + /*
> > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > + * interrupt for this line arriving after
> > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > + * irq_soft_enabled == false.
> > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > though it's most likely is ...
> > > > >
> > > > >
> > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > >
> > > > > """
> > > > >
> > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > * and NMI handlers.
> > > > > """
> > > > >
> > > > > So interrupt handlers are treated as read-side critical sections.
> > > > >
> > > > > And it has the comment for explain the barrier:
> > > > >
> > > > > """
> > > > >
> > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > """
> > > > >
> > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > irq_soft_enabled as false.
> > > > >
> > > >
> > > > You are right. So then
> > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > READ_ONCE should do.
> > >
> > > See above.
> > >
> > > > 2. isn't synchronize_irq also doing the same thing?
> > >
> > >
> > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > >
> > > >
> > > >
> > > > > >
> > > > > > > + */
> > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > + synchronize_rcu();
> > > > > > > +
> > > > > > > dev->config->reset(dev);
> > > > > > > }
> > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > Please add comment explaining where it will be enabled.
> > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > let's not add useless overhead to the boot sequence.
> > > > >
> > > > >
> > > > > Ok.
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > dev->config_enabled = false;
> > > > > > > dev->config_change_pending = false;
> > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > +
> > > > > > > + if (dev->irq_soft_check)
> > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > devices. this flag defeats the purpose.
> > > > >
> > > > >
> > > > > Do you mean:
> > > > >
> > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > interrupt handlers
> > > >
> > > > But synchronize is only on tear-down path. That is not critical for any
> > > > users at the moment, even less than probe.
> > >
> > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > spinlock or others.
> > >
> > > >
> > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > for old hypervisors
> > > >
> > > >
> > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > >
> > > Probably not, we have devices that accept random inputs from outside,
> > > net, console, input etc. I've done a round of audits of the Qemu
> > > codes. They look all fine since day0.
> > >
> > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > in the debug build).
> > >
> > > This looks like a hardening of the driver in the core instead of the
> > > device. I think it can be done but in a separate series.
> > >
> > > >
> > > > And going down from there, how about we cache status in the
> > > > device? Then we don't need to keep re-reading it every time,
> > > > speeding boot up a tiny bit.
> > >
> > > I don't fully understand here, actually spec requires status to be
> > > read back for validation in many cases.
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > }
> > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > {
> > > > > > > + struct virtqueue *_vq = v;
> > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > + return IRQ_NONE;
> > > > > > > + }
> > > > > > > +
> > > > > > > if (!more_used(vq)) {
> > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > return IRQ_NONE;
> > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > --- a/include/linux/virtio.h
> > > > > > > +++ b/include/linux/virtio.h
> > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > * @dev: underlying device.
> > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > bool failed;
> > > > > > > bool config_enabled;
> > > > > > > bool config_change_pending;
> > > > > > > + bool irq_soft_check;
> > > > > > > + bool irq_soft_enabled;
> > > > > > > spinlock_t config_lock;
> > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > struct device dev;
> > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > }
> > > > > > > +/*
> > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > + * @vdev: the device
> > > > > > > + */
> > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > +{
> > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > + return true;
> > > > > > > +
> > > > > > > + /*
> > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > paired
> > > > >
> > > > >
> > > > > Will fix.
> > > > >
> > > > > Thanks
> > > > >
> > > > >
> > > > > >
> > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > + * virtio_reset_device().
> > > > > > > + */
> > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > +}
> > > > > > > +
> > > > > > > /**
> > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > * @vdev: the device
> > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > if (dev->config->enable_cbs)
> > > > > > > dev->config->enable_cbs(dev);
> > > > > > > + /*
> > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > + */
> > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > +
> > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > }
> > > > > > > --
> > > > > > > 2.25.1
> > > >
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-25 10:09 ` Re: Michael S. Tsirkin
@ 2022-03-28 4:56 ` Jason Wang
2022-03-28 5:59 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-28 4:56 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > Bcc:
> > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > Reply-To:
> > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > >
> > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > >
> > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > >
> > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > that is used by some device such as virtio-blk
> > > > > > > > 2) done only for PCI transport
> > > > > > > >
> > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > but the cost should be acceptable.
> > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > >
> > > > > >
> > > > > > Even if we allow the transport driver to synchornize through
> > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > >
> > > > > > We do something like the following previously:
> > > > > >
> > > > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > return IRQ_NONE;
> > > > > >
> > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > the driver.
> > > > >
> > > > > I don't think so - if you sync after setting the value then
> > > > > you are guaranteed that any handler running afterwards
> > > > > will see the new value.
> > > >
> > > > The problem is not disabled but the enable.
> > >
> > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > imo.
> >
> > It's the interrupt raised before setting irq_soft_enabled to true:
> >
> > CPU 0 probe) driver specific setup (not commited)
> > CPU 1 IRQ handler) read the uninitialized variable
> > CPU 0 probe) set irq_soft_enabled to true
> > CPU 1 IRQ handler) read irq_soft_enable as true
> > CPU 1 IRQ handler) use the uninitialized variable
> >
> > Thanks
>
> Yea, it hurts if you do it. So do not do it then ;).
>
> irq_soft_enabled (I think driver_ok or status is a better name)
I can change it to driver_ok.
> should be initialized to false *before* irq is requested.
>
> And requesting irq commits all memory otherwise all drivers would be
> broken,
So I think we might talk different issues:
1) Whether request_irq() commits the previous setups, I think the
answer is yes, since the spin_unlock of desc->lock (release) can
guarantee this though there seems no documentation around
request_irq() to say this.
And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
using smp_wmb() before the request_irq().
And even if write is ordered we still need read to be ordered to be
paired with that.
> if it doesn't it just needs to be fixed, not worked around in
> virtio.
2) virtio drivers might do a lot of setups between request_irq() and
virtio_device_ready():
request_irq()
driver specific setups
virtio_device_ready()
CPU 0 probe) request_irq()
CPU 1 IRQ handler) read the uninitialized variable
CPU 0 probe) driver specific setups
CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
CPU 1 IRQ handler) read irq_soft_enable as true
CPU 1 IRQ handler) use the uninitialized variable
Thanks
>
>
> > >
> > > > We use smp_store_relase()
> > > > to make sure the driver commits the setup before enabling the irq. It
> > > > means the read needs to be ordered as well in vring_interrupt().
> > > >
> > > > >
> > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > which surprises me.
> > > > >
> > > > > CC Paul to help make sure I'm right.
> > > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > hardening is disabled by default.
> > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > are any buffers in any queues?
> > > > > >
> > > > > >
> > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > >
> > > > > > "
> > > > > >
> > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > the callback could race with driver-specific initialization.
> > > > > > "
> > > > > >
> > > > > > If this is only for config interrupt, I can remove the above log.
> > > > >
> > > > >
> > > > > This is only for config interrupt.
> > > >
> > > > Ok.
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > expensive.
> > > > > > > >
> > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > >
> > > > > > > > ---
> > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > #include <linux/of.h>
> > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > +static bool irq_hardening = false;
> > > > > > > > +
> > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > +
> > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > * */
> > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > {
> > > > > > > > + /*
> > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > + * interrupt for this line arriving after
> > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > + * irq_soft_enabled == false.
> > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > though it's most likely is ...
> > > > > >
> > > > > >
> > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > >
> > > > > > """
> > > > > >
> > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > * and NMI handlers.
> > > > > > """
> > > > > >
> > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > >
> > > > > > And it has the comment for explain the barrier:
> > > > > >
> > > > > > """
> > > > > >
> > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > """
> > > > > >
> > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > irq_soft_enabled as false.
> > > > > >
> > > > >
> > > > > You are right. So then
> > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > READ_ONCE should do.
> > > >
> > > > See above.
> > > >
> > > > > 2. isn't synchronize_irq also doing the same thing?
> > > >
> > > >
> > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > > + */
> > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > + synchronize_rcu();
> > > > > > > > +
> > > > > > > > dev->config->reset(dev);
> > > > > > > > }
> > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > let's not add useless overhead to the boot sequence.
> > > > > >
> > > > > >
> > > > > > Ok.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > dev->config_enabled = false;
> > > > > > > > dev->config_change_pending = false;
> > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > +
> > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > devices. this flag defeats the purpose.
> > > > > >
> > > > > >
> > > > > > Do you mean:
> > > > > >
> > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > interrupt handlers
> > > > >
> > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > users at the moment, even less than probe.
> > > >
> > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > spinlock or others.
> > > >
> > > > >
> > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > for old hypervisors
> > > > >
> > > > >
> > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > >
> > > > Probably not, we have devices that accept random inputs from outside,
> > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > codes. They look all fine since day0.
> > > >
> > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > in the debug build).
> > > >
> > > > This looks like a hardening of the driver in the core instead of the
> > > > device. I think it can be done but in a separate series.
> > > >
> > > > >
> > > > > And going down from there, how about we cache status in the
> > > > > device? Then we don't need to keep re-reading it every time,
> > > > > speeding boot up a tiny bit.
> > > >
> > > > I don't fully understand here, actually spec requires status to be
> > > > read back for validation in many cases.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > }
> > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > {
> > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > + return IRQ_NONE;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > if (!more_used(vq)) {
> > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > return IRQ_NONE;
> > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > * @dev: underlying device.
> > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > bool failed;
> > > > > > > > bool config_enabled;
> > > > > > > > bool config_change_pending;
> > > > > > > > + bool irq_soft_check;
> > > > > > > > + bool irq_soft_enabled;
> > > > > > > > spinlock_t config_lock;
> > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > struct device dev;
> > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > }
> > > > > > > > +/*
> > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > + * @vdev: the device
> > > > > > > > + */
> > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > +{
> > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > + return true;
> > > > > > > > +
> > > > > > > > + /*
> > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > paired
> > > > > >
> > > > > >
> > > > > > Will fix.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > + * virtio_reset_device().
> > > > > > > > + */
> > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > /**
> > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > * @vdev: the device
> > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > + /*
> > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > + */
> > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > +
> > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > }
> > > > > > > > --
> > > > > > > > 2.25.1
> > > > >
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-28 4:56 ` Re: Jason Wang
@ 2022-03-28 5:59 ` Michael S. Tsirkin
2022-03-28 6:18 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-28 5:59 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > Bcc:
> > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > Reply-To:
> > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > >
> > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > >
> > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > >
> > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > that is used by some device such as virtio-blk
> > > > > > > > > 2) done only for PCI transport
> > > > > > > > >
> > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > but the cost should be acceptable.
> > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > >
> > > > > > >
> > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > >
> > > > > > > We do something like the following previously:
> > > > > > >
> > > > > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > return IRQ_NONE;
> > > > > > >
> > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > the driver.
> > > > > >
> > > > > > I don't think so - if you sync after setting the value then
> > > > > > you are guaranteed that any handler running afterwards
> > > > > > will see the new value.
> > > > >
> > > > > The problem is not disabled but the enable.
> > > >
> > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > imo.
> > >
> > > It's the interrupt raised before setting irq_soft_enabled to true:
> > >
> > > CPU 0 probe) driver specific setup (not commited)
> > > CPU 1 IRQ handler) read the uninitialized variable
> > > CPU 0 probe) set irq_soft_enabled to true
> > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > CPU 1 IRQ handler) use the uninitialized variable
> > >
> > > Thanks
> >
> > Yea, it hurts if you do it. So do not do it then ;).
> >
> > irq_soft_enabled (I think driver_ok or status is a better name)
>
> I can change it to driver_ok.
>
> > should be initialized to false *before* irq is requested.
> >
> > And requesting irq commits all memory otherwise all drivers would be
> > broken,
>
> So I think we might talk different issues:
>
> 1) Whether request_irq() commits the previous setups, I think the
> answer is yes, since the spin_unlock of desc->lock (release) can
> guarantee this though there seems no documentation around
> request_irq() to say this.
>
> And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> using smp_wmb() before the request_irq().
>
> And even if write is ordered we still need read to be ordered to be
> paired with that.
>
> > if it doesn't it just needs to be fixed, not worked around in
> > virtio.
>
> 2) virtio drivers might do a lot of setups between request_irq() and
> virtio_device_ready():
>
> request_irq()
> driver specific setups
> virtio_device_ready()
>
> CPU 0 probe) request_irq()
> CPU 1 IRQ handler) read the uninitialized variable
> CPU 0 probe) driver specific setups
> CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> CPU 1 IRQ handler) read irq_soft_enable as true
> CPU 1 IRQ handler) use the uninitialized variable
>
> Thanks
As I said, virtio_device_ready needs to do synchronize_irq.
That will guarantee all setup is visible to the specific IRQ, this
is what it's point is.
> >
> >
> > > >
> > > > > We use smp_store_relase()
> > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > >
> > > > > >
> > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > which surprises me.
> > > > > >
> > > > > > CC Paul to help make sure I'm right.
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > hardening is disabled by default.
> > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > are any buffers in any queues?
> > > > > > >
> > > > > > >
> > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > >
> > > > > > > "
> > > > > > >
> > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > the callback could race with driver-specific initialization.
> > > > > > > "
> > > > > > >
> > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > >
> > > > > >
> > > > > > This is only for config interrupt.
> > > > >
> > > > > Ok.
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > expensive.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > >
> > > > > > > > > ---
> > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > #include <linux/of.h>
> > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > +
> > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > +
> > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > * */
> > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > {
> > > > > > > > > + /*
> > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > though it's most likely is ...
> > > > > > >
> > > > > > >
> > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > >
> > > > > > > """
> > > > > > >
> > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > * and NMI handlers.
> > > > > > > """
> > > > > > >
> > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > >
> > > > > > > And it has the comment for explain the barrier:
> > > > > > >
> > > > > > > """
> > > > > > >
> > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > """
> > > > > > >
> > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > irq_soft_enabled as false.
> > > > > > >
> > > > > >
> > > > > > You are right. So then
> > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > READ_ONCE should do.
> > > > >
> > > > > See above.
> > > > >
> > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > >
> > > > >
> > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > + */
> > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > + synchronize_rcu();
> > > > > > > > > +
> > > > > > > > > dev->config->reset(dev);
> > > > > > > > > }
> > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > >
> > > > > > >
> > > > > > > Ok.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > dev->config_enabled = false;
> > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > +
> > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > devices. this flag defeats the purpose.
> > > > > > >
> > > > > > >
> > > > > > > Do you mean:
> > > > > > >
> > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > interrupt handlers
> > > > > >
> > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > users at the moment, even less than probe.
> > > > >
> > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > spinlock or others.
> > > > >
> > > > > >
> > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > for old hypervisors
> > > > > >
> > > > > >
> > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > >
> > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > codes. They look all fine since day0.
> > > > >
> > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > in the debug build).
> > > > >
> > > > > This looks like a hardening of the driver in the core instead of the
> > > > > device. I think it can be done but in a separate series.
> > > > >
> > > > > >
> > > > > > And going down from there, how about we cache status in the
> > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > speeding boot up a tiny bit.
> > > > >
> > > > > I don't fully understand here, actually spec requires status to be
> > > > > read back for validation in many cases.
> > > > >
> > > > > Thanks
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > }
> > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > {
> > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > + return IRQ_NONE;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > return IRQ_NONE;
> > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > * @dev: underlying device.
> > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > bool failed;
> > > > > > > > > bool config_enabled;
> > > > > > > > > bool config_change_pending;
> > > > > > > > > + bool irq_soft_check;
> > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > spinlock_t config_lock;
> > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > struct device dev;
> > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > }
> > > > > > > > > +/*
> > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > + * @vdev: the device
> > > > > > > > > + */
> > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > +{
> > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > + return true;
> > > > > > > > > +
> > > > > > > > > + /*
> > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > paired
> > > > > > >
> > > > > > >
> > > > > > > Will fix.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > + * virtio_reset_device().
> > > > > > > > > + */
> > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > /**
> > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > * @vdev: the device
> > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > + /*
> > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > + */
> > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > +
> > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > }
> > > > > > > > > --
> > > > > > > > > 2.25.1
> > > > > >
> > > >
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-28 5:59 ` Re: Michael S. Tsirkin
@ 2022-03-28 6:18 ` Jason Wang
2022-03-28 10:40 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-28 6:18 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Mon, Mar 28, 2022 at 1:59 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> > On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > Bcc:
> > > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > > Reply-To:
> > > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > > >
> > > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > > >
> > > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > > >
> > > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > > that is used by some device such as virtio-blk
> > > > > > > > > > 2) done only for PCI transport
> > > > > > > > > >
> > > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > > but the cost should be acceptable.
> > > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > > >
> > > > > > > >
> > > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > > >
> > > > > > > > We do something like the following previously:
> > > > > > > >
> > > > > > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > > return IRQ_NONE;
> > > > > > > >
> > > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > > the driver.
> > > > > > >
> > > > > > > I don't think so - if you sync after setting the value then
> > > > > > > you are guaranteed that any handler running afterwards
> > > > > > > will see the new value.
> > > > > >
> > > > > > The problem is not disabled but the enable.
> > > > >
> > > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > > imo.
> > > >
> > > > It's the interrupt raised before setting irq_soft_enabled to true:
> > > >
> > > > CPU 0 probe) driver specific setup (not commited)
> > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > CPU 0 probe) set irq_soft_enabled to true
> > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > CPU 1 IRQ handler) use the uninitialized variable
> > > >
> > > > Thanks
> > >
> > > Yea, it hurts if you do it. So do not do it then ;).
> > >
> > > irq_soft_enabled (I think driver_ok or status is a better name)
> >
> > I can change it to driver_ok.
> >
> > > should be initialized to false *before* irq is requested.
> > >
> > > And requesting irq commits all memory otherwise all drivers would be
> > > broken,
> >
> > So I think we might talk different issues:
> >
> > 1) Whether request_irq() commits the previous setups, I think the
> > answer is yes, since the spin_unlock of desc->lock (release) can
> > guarantee this though there seems no documentation around
> > request_irq() to say this.
> >
> > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > using smp_wmb() before the request_irq().
> >
> > And even if write is ordered we still need read to be ordered to be
> > paired with that.
> >
> > > if it doesn't it just needs to be fixed, not worked around in
> > > virtio.
> >
> > 2) virtio drivers might do a lot of setups between request_irq() and
> > virtio_device_ready():
> >
> > request_irq()
> > driver specific setups
> > virtio_device_ready()
> >
> > CPU 0 probe) request_irq()
> > CPU 1 IRQ handler) read the uninitialized variable
> > CPU 0 probe) driver specific setups
> > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > CPU 1 IRQ handler) read irq_soft_enable as true
> > CPU 1 IRQ handler) use the uninitialized variable
> >
> > Thanks
>
>
> As I said, virtio_device_ready needs to do synchronize_irq.
> That will guarantee all setup is visible to the specific IRQ,
Only the interrupt after synchronize_irq() returns.
>this
> is what it's point is.
What happens if an interrupt is raised in the middle like:
smp_store_release(dev->irq_soft_enabled, true)
IRQ handler
synchornize_irq()
If we don't enforce a reading order, the IRQ handler may still see the
uninitialized variable.
Thanks
>
>
> > >
> > >
> > > > >
> > > > > > We use smp_store_relase()
> > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > >
> > > > > > >
> > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > which surprises me.
> > > > > > >
> > > > > > > CC Paul to help make sure I'm right.
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > hardening is disabled by default.
> > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > are any buffers in any queues?
> > > > > > > >
> > > > > > > >
> > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > >
> > > > > > > > "
> > > > > > > >
> > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > "
> > > > > > > >
> > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > >
> > > > > > >
> > > > > > > This is only for config interrupt.
> > > > > >
> > > > > > Ok.
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > expensive.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > +
> > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > +
> > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > * */
> > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > {
> > > > > > > > > > + /*
> > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > though it's most likely is ...
> > > > > > > >
> > > > > > > >
> > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > >
> > > > > > > > """
> > > > > > > >
> > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > * and NMI handlers.
> > > > > > > > """
> > > > > > > >
> > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > >
> > > > > > > > And it has the comment for explain the barrier:
> > > > > > > >
> > > > > > > > """
> > > > > > > >
> > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > """
> > > > > > > >
> > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > irq_soft_enabled as false.
> > > > > > > >
> > > > > > >
> > > > > > > You are right. So then
> > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > READ_ONCE should do.
> > > > > >
> > > > > > See above.
> > > > > >
> > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > >
> > > > > >
> > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > + */
> > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > +
> > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > }
> > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > >
> > > > > > > >
> > > > > > > > Ok.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > +
> > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > >
> > > > > > > >
> > > > > > > > Do you mean:
> > > > > > > >
> > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > interrupt handlers
> > > > > > >
> > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > users at the moment, even less than probe.
> > > > > >
> > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > spinlock or others.
> > > > > >
> > > > > > >
> > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > for old hypervisors
> > > > > > >
> > > > > > >
> > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > >
> > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > codes. They look all fine since day0.
> > > > > >
> > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > in the debug build).
> > > > > >
> > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > device. I think it can be done but in a separate series.
> > > > > >
> > > > > > >
> > > > > > > And going down from there, how about we cache status in the
> > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > speeding boot up a tiny bit.
> > > > > >
> > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > read back for validation in many cases.
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > }
> > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > {
> > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > + }
> > > > > > > > > > +
> > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > bool failed;
> > > > > > > > > > bool config_enabled;
> > > > > > > > > > bool config_change_pending;
> > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > struct device dev;
> > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > }
> > > > > > > > > > +/*
> > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > + * @vdev: the device
> > > > > > > > > > + */
> > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > +{
> > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > + return true;
> > > > > > > > > > +
> > > > > > > > > > + /*
> > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > paired
> > > > > > > >
> > > > > > > >
> > > > > > > > Will fix.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > + */
> > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > /**
> > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > * @vdev: the device
> > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > + /*
> > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > + */
> > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > +
> > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > }
> > > > > > > > > > --
> > > > > > > > > > 2.25.1
> > > > > > >
> > > > >
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-28 6:18 ` Re: Jason Wang
@ 2022-03-28 10:40 ` Michael S. Tsirkin
2022-03-29 7:12 ` Re: Jason Wang
2022-03-29 8:35 ` Re: Thomas Gleixner
0 siblings, 2 replies; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-28 10:40 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> On Mon, Mar 28, 2022 at 1:59 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> > > On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > Bcc:
> > > > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > > > Reply-To:
> > > > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > > > >
> > > > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > > > >
> > > > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > > > >
> > > > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > > > that is used by some device such as virtio-blk
> > > > > > > > > > > 2) done only for PCI transport
> > > > > > > > > > >
> > > > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > > > but the cost should be acceptable.
> > > > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > > > >
> > > > > > > > > We do something like the following previously:
> > > > > > > > >
> > > > > > > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > > > return IRQ_NONE;
> > > > > > > > >
> > > > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > > > the driver.
> > > > > > > >
> > > > > > > > I don't think so - if you sync after setting the value then
> > > > > > > > you are guaranteed that any handler running afterwards
> > > > > > > > will see the new value.
> > > > > > >
> > > > > > > The problem is not disabled but the enable.
> > > > > >
> > > > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > > > imo.
> > > > >
> > > > > It's the interrupt raised before setting irq_soft_enabled to true:
> > > > >
> > > > > CPU 0 probe) driver specific setup (not commited)
> > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > CPU 0 probe) set irq_soft_enabled to true
> > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > >
> > > > > Thanks
> > > >
> > > > Yea, it hurts if you do it. So do not do it then ;).
> > > >
> > > > irq_soft_enabled (I think driver_ok or status is a better name)
> > >
> > > I can change it to driver_ok.
> > >
> > > > should be initialized to false *before* irq is requested.
> > > >
> > > > And requesting irq commits all memory otherwise all drivers would be
> > > > broken,
> > >
> > > So I think we might talk different issues:
> > >
> > > 1) Whether request_irq() commits the previous setups, I think the
> > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > guarantee this though there seems no documentation around
> > > request_irq() to say this.
> > >
> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > using smp_wmb() before the request_irq().
> > >
> > > And even if write is ordered we still need read to be ordered to be
> > > paired with that.
IMO it synchronizes with the CPU to which irq is
delivered. Otherwise basically all drivers would be broken,
wouldn't they be?
I don't know whether it's correct on all platforms, but if not
we need to fix request_irq.
> > >
> > > > if it doesn't it just needs to be fixed, not worked around in
> > > > virtio.
> > >
> > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > virtio_device_ready():
> > >
> > > request_irq()
> > > driver specific setups
> > > virtio_device_ready()
> > >
> > > CPU 0 probe) request_irq()
> > > CPU 1 IRQ handler) read the uninitialized variable
> > > CPU 0 probe) driver specific setups
> > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > CPU 1 IRQ handler) use the uninitialized variable
> > >
> > > Thanks
> >
> >
> > As I said, virtio_device_ready needs to do synchronize_irq.
> > That will guarantee all setup is visible to the specific IRQ,
>
> Only the interrupt after synchronize_irq() returns.
Anything else is a buggy device though.
> >this
> > is what it's point is.
>
> What happens if an interrupt is raised in the middle like:
>
> smp_store_release(dev->irq_soft_enabled, true)
> IRQ handler
> synchornize_irq()
>
> If we don't enforce a reading order, the IRQ handler may still see the
> uninitialized variable.
>
> Thanks
IMHO variables should be initialized before request_irq
to a value meaning "not a valid interrupt".
Specifically driver_ok = false.
Handler in the scenario you describe will then see !driver_ok
and exit immediately.
> >
> >
> > > >
> > > >
> > > > > >
> > > > > > > We use smp_store_relase()
> > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > >
> > > > > > > >
> > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > which surprises me.
> > > > > > > >
> > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > are any buffers in any queues?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > >
> > > > > > > > > "
> > > > > > > > >
> > > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > > "
> > > > > > > > >
> > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > >
> > > > > > > >
> > > > > > > > This is only for config interrupt.
> > > > > > >
> > > > > > > Ok.
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > expensive.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > +
> > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > +
> > > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > * */
> > > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > {
> > > > > > > > > > > + /*
> > > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > though it's most likely is ...
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > >
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > * and NMI handlers.
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > >
> > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > >
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > > """
> > > > > > > > >
> > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > irq_soft_enabled as false.
> > > > > > > > >
> > > > > > > >
> > > > > > > > You are right. So then
> > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > READ_ONCE should do.
> > > > > > >
> > > > > > > See above.
> > > > > > >
> > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > >
> > > > > > >
> > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > + */
> > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > +
> > > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > > }
> > > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Ok.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > +
> > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Do you mean:
> > > > > > > > >
> > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > interrupt handlers
> > > > > > > >
> > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > users at the moment, even less than probe.
> > > > > > >
> > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > spinlock or others.
> > > > > > >
> > > > > > > >
> > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > for old hypervisors
> > > > > > > >
> > > > > > > >
> > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > >
> > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > codes. They look all fine since day0.
> > > > > > >
> > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > > in the debug build).
> > > > > > >
> > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > device. I think it can be done but in a separate series.
> > > > > > >
> > > > > > > >
> > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > speeding boot up a tiny bit.
> > > > > > >
> > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > read back for validation in many cases.
> > > > > > >
> > > > > > > Thanks
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > }
> > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > {
> > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > > + }
> > > > > > > > > > > +
> > > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > bool failed;
> > > > > > > > > > > bool config_enabled;
> > > > > > > > > > > bool config_change_pending;
> > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > struct device dev;
> > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > }
> > > > > > > > > > > +/*
> > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > + */
> > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > +{
> > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > + return true;
> > > > > > > > > > > +
> > > > > > > > > > > + /*
> > > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > > paired
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Will fix.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > > + */
> > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > /**
> > > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > * @vdev: the device
> > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > > + /*
> > > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > > + */
> > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > +
> > > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > }
> > > > > > > > > > > --
> > > > > > > > > > > 2.25.1
> > > > > > > >
> > > > > >
> > > >
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-28 10:40 ` Re: Michael S. Tsirkin
@ 2022-03-29 7:12 ` Jason Wang
2022-03-29 14:08 ` Re: Michael S. Tsirkin
2022-03-29 8:35 ` Re: Thomas Gleixner
1 sibling, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-29 7:12 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Mon, Mar 28, 2022 at 6:41 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> > On Mon, Mar 28, 2022 at 1:59 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, Mar 28, 2022 at 12:56:41PM +0800, Jason Wang wrote:
> > > > On Fri, Mar 25, 2022 at 6:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, Mar 25, 2022 at 05:20:19PM +0800, Jason Wang wrote:
> > > > > > On Fri, Mar 25, 2022 at 5:10 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, Mar 25, 2022 at 03:52:00PM +0800, Jason Wang wrote:
> > > > > > > > On Fri, Mar 25, 2022 at 2:31 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > Bcc:
> > > > > > > > > Subject: Re: [PATCH 3/3] virtio: harden vring IRQ
> > > > > > > > > Message-ID: <20220325021422-mutt-send-email-mst@kernel.org>
> > > > > > > > > Reply-To:
> > > > > > > > > In-Reply-To: <f7046303-7d7d-e39f-3c71-3688126cc812@redhat.com>
> > > > > > > > >
> > > > > > > > > On Fri, Mar 25, 2022 at 11:04:08AM +0800, Jason Wang wrote:
> > > > > > > > > >
> > > > > > > > > > 在 2022/3/24 下午7:03, Michael S. Tsirkin 写道:
> > > > > > > > > > > On Thu, Mar 24, 2022 at 04:40:04PM +0800, Jason Wang wrote:
> > > > > > > > > > > > This is a rework on the previous IRQ hardening that is done for
> > > > > > > > > > > > virtio-pci where several drawbacks were found and were reverted:
> > > > > > > > > > > >
> > > > > > > > > > > > 1) try to use IRQF_NO_AUTOEN which is not friendly to affinity managed IRQ
> > > > > > > > > > > > that is used by some device such as virtio-blk
> > > > > > > > > > > > 2) done only for PCI transport
> > > > > > > > > > > >
> > > > > > > > > > > > In this patch, we tries to borrow the idea from the INTX IRQ hardening
> > > > > > > > > > > > in the reverted commit 080cd7c3ac87 ("virtio-pci: harden INTX interrupts")
> > > > > > > > > > > > by introducing a global irq_soft_enabled variable for each
> > > > > > > > > > > > virtio_device. Then we can to toggle it during
> > > > > > > > > > > > virtio_reset_device()/virtio_device_ready(). A synchornize_rcu() is
> > > > > > > > > > > > used in virtio_reset_device() to synchronize with the IRQ handlers. In
> > > > > > > > > > > > the future, we may provide config_ops for the transport that doesn't
> > > > > > > > > > > > use IRQ. With this, vring_interrupt() can return check and early if
> > > > > > > > > > > > irq_soft_enabled is false. This lead to smp_load_acquire() to be used
> > > > > > > > > > > > but the cost should be acceptable.
> > > > > > > > > > > Maybe it should be but is it? Can't we use synchronize_irq instead?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Even if we allow the transport driver to synchornize through
> > > > > > > > > > synchronize_irq() we still need a check in the vring_interrupt().
> > > > > > > > > >
> > > > > > > > > > We do something like the following previously:
> > > > > > > > > >
> > > > > > > > > > if (!READ_ONCE(vp_dev->intx_soft_enabled))
> > > > > > > > > > return IRQ_NONE;
> > > > > > > > > >
> > > > > > > > > > But it looks like a bug since speculative read can be done before the check
> > > > > > > > > > where the interrupt handler can't see the uncommitted setup which is done by
> > > > > > > > > > the driver.
> > > > > > > > >
> > > > > > > > > I don't think so - if you sync after setting the value then
> > > > > > > > > you are guaranteed that any handler running afterwards
> > > > > > > > > will see the new value.
> > > > > > > >
> > > > > > > > The problem is not disabled but the enable.
> > > > > > >
> > > > > > > So a misbehaving device can lose interrupts? That's not a problem at all
> > > > > > > imo.
> > > > > >
> > > > > > It's the interrupt raised before setting irq_soft_enabled to true:
> > > > > >
> > > > > > CPU 0 probe) driver specific setup (not commited)
> > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > CPU 0 probe) set irq_soft_enabled to true
> > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > Yea, it hurts if you do it. So do not do it then ;).
> > > > >
> > > > > irq_soft_enabled (I think driver_ok or status is a better name)
> > > >
> > > > I can change it to driver_ok.
> > > >
> > > > > should be initialized to false *before* irq is requested.
> > > > >
> > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > broken,
> > > >
> > > > So I think we might talk different issues:
> > > >
> > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > guarantee this though there seems no documentation around
> > > > request_irq() to say this.
> > > >
> > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > using smp_wmb() before the request_irq().
> > > >
> > > > And even if write is ordered we still need read to be ordered to be
> > > > paired with that.
>
> IMO it synchronizes with the CPU to which irq is
> delivered. Otherwise basically all drivers would be broken,
> wouldn't they be?
I guess it's because most of the drivers don't care much about the
buggy/malicious device. And most of the devices may require an extra
step to enable device IRQ after request_irq(). Or it's the charge of
the driver to do the synchronization.
> I don't know whether it's correct on all platforms, but if not
> we need to fix request_irq.
>
> > > >
> > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > virtio.
> > > >
> > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > virtio_device_ready():
> > > >
> > > > request_irq()
> > > > driver specific setups
> > > > virtio_device_ready()
> > > >
> > > > CPU 0 probe) request_irq()
> > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > CPU 0 probe) driver specific setups
> > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > CPU 1 IRQ handler) use the uninitialized variable
> > > >
> > > > Thanks
> > >
> > >
> > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > That will guarantee all setup is visible to the specific IRQ,
> >
> > Only the interrupt after synchronize_irq() returns.
>
> Anything else is a buggy device though.
Yes, but the goal of this patch is to prevent the possible attack from
buggy(malicious) devices.
>
> > >this
> > > is what it's point is.
> >
> > What happens if an interrupt is raised in the middle like:
> >
> > smp_store_release(dev->irq_soft_enabled, true)
> > IRQ handler
> > synchornize_irq()
> >
> > If we don't enforce a reading order, the IRQ handler may still see the
> > uninitialized variable.
> >
> > Thanks
>
> IMHO variables should be initialized before request_irq
> to a value meaning "not a valid interrupt".
> Specifically driver_ok = false.
> Handler in the scenario you describe will then see !driver_ok
> and exit immediately.
So just to make sure we're on the same page.
1) virtio_reset_device() will set the driver_ok to false;
2) virtio_device_ready() will set the driver_ok to true
So for virtio drivers, it often did:
1) virtio_reset_device()
2) find_vqs() which will call request_irq()
3) other driver specific setups
4) virtio_device_ready()
In virtio_device_ready(), the patch perform the following currently:
smp_store_release(driver_ok, true);
set_status(DRIVER_OK);
Per your suggestion, to add synchronize_irq() after
smp_store_release() so we had
smp_store_release(driver_ok, true);
synchornize_irq()
set_status(DRIVER_OK)
Suppose there's a interrupt raised before the synchronize_irq(), if we do:
if (READ_ONCE(driver_ok)) {
vq->callback()
}
It will see the driver_ok as true but how can we make sure
vq->callback sees the driver specific setups (3) above?
And an example is virtio_scsi():
virtio_reset_device()
virtscsi_probe()
virtscsi_init()
virtio_find_vqs()
...
virtscsi_init_vq(&vscsi->event_vq, vqs[1])
....
virtio_device_ready()
In virtscsi_event_done():
virtscsi_event_done():
virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
Thanks
>
>
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > > > We use smp_store_relase()
> > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > which surprises me.
> > > > > > > > >
> > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > >
> > > > > > > > > > "
> > > > > > > > > >
> > > > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > > > "
> > > > > > > > > >
> > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > This is only for config interrupt.
> > > > > > > >
> > > > > > > > Ok.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > expensive.
> > > > > > > > > > > >
> > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > >
> > > > > > > > > > > > ---
> > > > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > +
> > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > +
> > > > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > * */
> > > > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > {
> > > > > > > > > > > > + /*
> > > > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > though it's most likely is ...
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > >
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > * and NMI handlers.
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > >
> > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > >
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > > > """
> > > > > > > > > >
> > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > You are right. So then
> > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > READ_ONCE should do.
> > > > > > > >
> > > > > > > > See above.
> > > > > > > >
> > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > >
> > > > > > > >
> > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > + */
> > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > +
> > > > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > > > }
> > > > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Ok.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > +
> > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Do you mean:
> > > > > > > > > >
> > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > interrupt handlers
> > > > > > > > >
> > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > users at the moment, even less than probe.
> > > > > > > >
> > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > spinlock or others.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > for old hypervisors
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > >
> > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > codes. They look all fine since day0.
> > > > > > > >
> > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > > > in the debug build).
> > > > > > > >
> > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > speeding boot up a tiny bit.
> > > > > > > >
> > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > read back for validation in many cases.
> > > > > > > >
> > > > > > > > Thanks
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > }
> > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > {
> > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > > > + }
> > > > > > > > > > > > +
> > > > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > bool failed;
> > > > > > > > > > > > bool config_enabled;
> > > > > > > > > > > > bool config_change_pending;
> > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > struct device dev;
> > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > }
> > > > > > > > > > > > +/*
> > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > + */
> > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > +{
> > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > + return true;
> > > > > > > > > > > > +
> > > > > > > > > > > > + /*
> > > > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > > > paired
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Will fix.
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > > > + */
> > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > /**
> > > > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > * @vdev: the device
> > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > > > + /*
> > > > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > > > + */
> > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > +
> > > > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > }
> > > > > > > > > > > > --
> > > > > > > > > > > > 2.25.1
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 7:12 ` Re: Jason Wang
@ 2022-03-29 14:08 ` Michael S. Tsirkin
2022-03-30 2:40 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-29 14:08 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > broken,
> > > > >
> > > > > So I think we might talk different issues:
> > > > >
> > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > guarantee this though there seems no documentation around
> > > > > request_irq() to say this.
> > > > >
> > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > using smp_wmb() before the request_irq().
> > > > >
> > > > > And even if write is ordered we still need read to be ordered to be
> > > > > paired with that.
> >
> > IMO it synchronizes with the CPU to which irq is
> > delivered. Otherwise basically all drivers would be broken,
> > wouldn't they be?
>
> I guess it's because most of the drivers don't care much about the
> buggy/malicious device. And most of the devices may require an extra
> step to enable device IRQ after request_irq(). Or it's the charge of
> the driver to do the synchronization.
It is true that the use-case of malicious devices is somewhat boutique.
But I think most drivers do want to have their hotplug routines to be
robust, yes.
> > I don't know whether it's correct on all platforms, but if not
> > we need to fix request_irq.
> >
> > > > >
> > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > virtio.
> > > > >
> > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > virtio_device_ready():
> > > > >
> > > > > request_irq()
> > > > > driver specific setups
> > > > > virtio_device_ready()
> > > > >
> > > > > CPU 0 probe) request_irq()
> > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > CPU 0 probe) driver specific setups
> > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > >
> > > > > Thanks
> > > >
> > > >
> > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > That will guarantee all setup is visible to the specific IRQ,
> > >
> > > Only the interrupt after synchronize_irq() returns.
> >
> > Anything else is a buggy device though.
>
> Yes, but the goal of this patch is to prevent the possible attack from
> buggy(malicious) devices.
Right. However if a driver of a *buggy* device somehow sees driver_ok =
false even though it's actually initialized, that is not a deal breaker
as that does not open us up to an attack.
> >
> > > >this
> > > > is what it's point is.
> > >
> > > What happens if an interrupt is raised in the middle like:
> > >
> > > smp_store_release(dev->irq_soft_enabled, true)
> > > IRQ handler
> > > synchornize_irq()
> > >
> > > If we don't enforce a reading order, the IRQ handler may still see the
> > > uninitialized variable.
> > >
> > > Thanks
> >
> > IMHO variables should be initialized before request_irq
> > to a value meaning "not a valid interrupt".
> > Specifically driver_ok = false.
> > Handler in the scenario you describe will then see !driver_ok
> > and exit immediately.
>
> So just to make sure we're on the same page.
>
> 1) virtio_reset_device() will set the driver_ok to false;
> 2) virtio_device_ready() will set the driver_ok to true
>
> So for virtio drivers, it often did:
>
> 1) virtio_reset_device()
> 2) find_vqs() which will call request_irq()
> 3) other driver specific setups
> 4) virtio_device_ready()
>
> In virtio_device_ready(), the patch perform the following currently:
>
> smp_store_release(driver_ok, true);
> set_status(DRIVER_OK);
>
> Per your suggestion, to add synchronize_irq() after
> smp_store_release() so we had
>
> smp_store_release(driver_ok, true);
> synchornize_irq()
> set_status(DRIVER_OK)
>
> Suppose there's a interrupt raised before the synchronize_irq(), if we do:
>
> if (READ_ONCE(driver_ok)) {
> vq->callback()
> }
>
> It will see the driver_ok as true but how can we make sure
> vq->callback sees the driver specific setups (3) above?
>
> And an example is virtio_scsi():
>
> virtio_reset_device()
> virtscsi_probe()
> virtscsi_init()
> virtio_find_vqs()
> ...
> virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> ....
> virtio_device_ready()
>
> In virtscsi_event_done():
>
> virtscsi_event_done():
> virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
>
> We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
>
> Thanks
See response by Thomas. A simple if (!dev->driver_ok) should be enough,
it's all under a lock.
> >
> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > We use smp_store_relase()
> > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > which surprises me.
> > > > > > > > > >
> > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > >
> > > > > > > > > > > "
> > > > > > > > > > >
> > > > > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > > > > "
> > > > > > > > > > >
> > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > This is only for config interrupt.
> > > > > > > > >
> > > > > > > > > Ok.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > expensive.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > >
> > > > > > > > > > > > > ---
> > > > > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > +
> > > > > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > * */
> > > > > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > {
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > >
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > * and NMI handlers.
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > >
> > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > >
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > > > > """
> > > > > > > > > > >
> > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > You are right. So then
> > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > READ_ONCE should do.
> > > > > > > > >
> > > > > > > > > See above.
> > > > > > > > >
> > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > +
> > > > > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > > > > }
> > > > > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Ok.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Do you mean:
> > > > > > > > > > >
> > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > interrupt handlers
> > > > > > > > > >
> > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > >
> > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > spinlock or others.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > for old hypervisors
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > >
> > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > codes. They look all fine since day0.
> > > > > > > > >
> > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > > > > in the debug build).
> > > > > > > > >
> > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > >
> > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > read back for validation in many cases.
> > > > > > > > >
> > > > > > > > > Thanks
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > }
> > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > {
> > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > > > > + }
> > > > > > > > > > > > > +
> > > > > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > bool failed;
> > > > > > > > > > > > > bool config_enabled;
> > > > > > > > > > > > > bool config_change_pending;
> > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > struct device dev;
> > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > }
> > > > > > > > > > > > > +/*
> > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > +{
> > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > + return true;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > > > > paired
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Will fix.
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > +}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > /**
> > > > > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > * @vdev: the device
> > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > > > > + /*
> > > > > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > > > > + */
> > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > +
> > > > > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > }
> > > > > > > > > > > > > --
> > > > > > > > > > > > > 2.25.1
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 14:08 ` Re: Michael S. Tsirkin
@ 2022-03-30 2:40 ` Jason Wang
2022-03-30 5:14 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-30 2:40 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Tue, Mar 29, 2022 at 10:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > > broken,
> > > > > >
> > > > > > So I think we might talk different issues:
> > > > > >
> > > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > > guarantee this though there seems no documentation around
> > > > > > request_irq() to say this.
> > > > > >
> > > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > > using smp_wmb() before the request_irq().
> > > > > >
> > > > > > And even if write is ordered we still need read to be ordered to be
> > > > > > paired with that.
> > >
> > > IMO it synchronizes with the CPU to which irq is
> > > delivered. Otherwise basically all drivers would be broken,
> > > wouldn't they be?
> >
> > I guess it's because most of the drivers don't care much about the
> > buggy/malicious device. And most of the devices may require an extra
> > step to enable device IRQ after request_irq(). Or it's the charge of
> > the driver to do the synchronization.
>
> It is true that the use-case of malicious devices is somewhat boutique.
> But I think most drivers do want to have their hotplug routines to be
> robust, yes.
>
> > > I don't know whether it's correct on all platforms, but if not
> > > we need to fix request_irq.
> > >
> > > > > >
> > > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > > virtio.
> > > > > >
> > > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > > virtio_device_ready():
> > > > > >
> > > > > > request_irq()
> > > > > > driver specific setups
> > > > > > virtio_device_ready()
> > > > > >
> > > > > > CPU 0 probe) request_irq()
> > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > CPU 0 probe) driver specific setups
> > > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > >
> > > > > > Thanks
> > > > >
> > > > >
> > > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > > That will guarantee all setup is visible to the specific IRQ,
> > > >
> > > > Only the interrupt after synchronize_irq() returns.
> > >
> > > Anything else is a buggy device though.
> >
> > Yes, but the goal of this patch is to prevent the possible attack from
> > buggy(malicious) devices.
>
> Right. However if a driver of a *buggy* device somehow sees driver_ok =
> false even though it's actually initialized, that is not a deal breaker
> as that does not open us up to an attack.
>
> > >
> > > > >this
> > > > > is what it's point is.
> > > >
> > > > What happens if an interrupt is raised in the middle like:
> > > >
> > > > smp_store_release(dev->irq_soft_enabled, true)
> > > > IRQ handler
> > > > synchornize_irq()
> > > >
> > > > If we don't enforce a reading order, the IRQ handler may still see the
> > > > uninitialized variable.
> > > >
> > > > Thanks
> > >
> > > IMHO variables should be initialized before request_irq
> > > to a value meaning "not a valid interrupt".
> > > Specifically driver_ok = false.
> > > Handler in the scenario you describe will then see !driver_ok
> > > and exit immediately.
> >
> > So just to make sure we're on the same page.
> >
> > 1) virtio_reset_device() will set the driver_ok to false;
> > 2) virtio_device_ready() will set the driver_ok to true
> >
> > So for virtio drivers, it often did:
> >
> > 1) virtio_reset_device()
> > 2) find_vqs() which will call request_irq()
> > 3) other driver specific setups
> > 4) virtio_device_ready()
> >
> > In virtio_device_ready(), the patch perform the following currently:
> >
> > smp_store_release(driver_ok, true);
> > set_status(DRIVER_OK);
> >
> > Per your suggestion, to add synchronize_irq() after
> > smp_store_release() so we had
> >
> > smp_store_release(driver_ok, true);
> > synchornize_irq()
> > set_status(DRIVER_OK)
> >
> > Suppose there's a interrupt raised before the synchronize_irq(), if we do:
> >
> > if (READ_ONCE(driver_ok)) {
> > vq->callback()
> > }
> >
> > It will see the driver_ok as true but how can we make sure
> > vq->callback sees the driver specific setups (3) above?
> >
> > And an example is virtio_scsi():
> >
> > virtio_reset_device()
> > virtscsi_probe()
> > virtscsi_init()
> > virtio_find_vqs()
> > ...
> > virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> > ....
> > virtio_device_ready()
> >
> > In virtscsi_event_done():
> >
> > virtscsi_event_done():
> > virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
> >
> > We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
> >
> > Thanks
>
>
> See response by Thomas. A simple if (!dev->driver_ok) should be enough,
> it's all under a lock.
Ordered through ACQUIRE+RELEASE actually since the irq handler is not
running under the lock.
Another question, for synchronize_irq() do you prefer
1) transport specific callbacks
or
2) a simple synchornize_rcu()
Thanks
>
> > >
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > We use smp_store_relase()
> > > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > > which surprises me.
> > > > > > > > > > >
> > > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > > >
> > > > > > > > > > > > "
> > > > > > > > > > > >
> > > > > > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > > > > > "
> > > > > > > > > > > >
> > > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > This is only for config interrupt.
> > > > > > > > > >
> > > > > > > > > > Ok.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > > expensive.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > > * */
> > > > > > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > > {
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > > >
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > > * and NMI handlers.
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > > >
> > > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > > >
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > > > > > """
> > > > > > > > > > > >
> > > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > You are right. So then
> > > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > > READ_ONCE should do.
> > > > > > > > > >
> > > > > > > > > > See above.
> > > > > > > > > >
> > > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Ok.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Do you mean:
> > > > > > > > > > > >
> > > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > > interrupt handlers
> > > > > > > > > > >
> > > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > > >
> > > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > > spinlock or others.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > > for old hypervisors
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > > >
> > > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > > codes. They look all fine since day0.
> > > > > > > > > >
> > > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > > > > > in the debug build).
> > > > > > > > > >
> > > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > > >
> > > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > > read back for validation in many cases.
> > > > > > > > > >
> > > > > > > > > > Thanks
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > > {
> > > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > > > > > + }
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > > bool failed;
> > > > > > > > > > > > > > bool config_enabled;
> > > > > > > > > > > > > > bool config_change_pending;
> > > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > > struct device dev;
> > > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > +/*
> > > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > > + return true;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > > > > > paired
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Will fix.
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > /**
> > > > > > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > > * @vdev: the device
> > > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > }
> > > > > > > > > > > > > > --
> > > > > > > > > > > > > > 2.25.1
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-30 2:40 ` Re: Jason Wang
@ 2022-03-30 5:14 ` Michael S. Tsirkin
2022-03-30 5:53 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-30 5:14 UTC (permalink / raw)
To: Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Wed, Mar 30, 2022 at 10:40:59AM +0800, Jason Wang wrote:
> On Tue, Mar 29, 2022 at 10:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > > > broken,
> > > > > > >
> > > > > > > So I think we might talk different issues:
> > > > > > >
> > > > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > > > guarantee this though there seems no documentation around
> > > > > > > request_irq() to say this.
> > > > > > >
> > > > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > > > using smp_wmb() before the request_irq().
> > > > > > >
> > > > > > > And even if write is ordered we still need read to be ordered to be
> > > > > > > paired with that.
> > > >
> > > > IMO it synchronizes with the CPU to which irq is
> > > > delivered. Otherwise basically all drivers would be broken,
> > > > wouldn't they be?
> > >
> > > I guess it's because most of the drivers don't care much about the
> > > buggy/malicious device. And most of the devices may require an extra
> > > step to enable device IRQ after request_irq(). Or it's the charge of
> > > the driver to do the synchronization.
> >
> > It is true that the use-case of malicious devices is somewhat boutique.
> > But I think most drivers do want to have their hotplug routines to be
> > robust, yes.
> >
> > > > I don't know whether it's correct on all platforms, but if not
> > > > we need to fix request_irq.
> > > >
> > > > > > >
> > > > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > > > virtio.
> > > > > > >
> > > > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > > > virtio_device_ready():
> > > > > > >
> > > > > > > request_irq()
> > > > > > > driver specific setups
> > > > > > > virtio_device_ready()
> > > > > > >
> > > > > > > CPU 0 probe) request_irq()
> > > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > > CPU 0 probe) driver specific setups
> > > > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > > >
> > > > > > > Thanks
> > > > > >
> > > > > >
> > > > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > > > That will guarantee all setup is visible to the specific IRQ,
> > > > >
> > > > > Only the interrupt after synchronize_irq() returns.
> > > >
> > > > Anything else is a buggy device though.
> > >
> > > Yes, but the goal of this patch is to prevent the possible attack from
> > > buggy(malicious) devices.
> >
> > Right. However if a driver of a *buggy* device somehow sees driver_ok =
> > false even though it's actually initialized, that is not a deal breaker
> > as that does not open us up to an attack.
> >
> > > >
> > > > > >this
> > > > > > is what it's point is.
> > > > >
> > > > > What happens if an interrupt is raised in the middle like:
> > > > >
> > > > > smp_store_release(dev->irq_soft_enabled, true)
> > > > > IRQ handler
> > > > > synchornize_irq()
> > > > >
> > > > > If we don't enforce a reading order, the IRQ handler may still see the
> > > > > uninitialized variable.
> > > > >
> > > > > Thanks
> > > >
> > > > IMHO variables should be initialized before request_irq
> > > > to a value meaning "not a valid interrupt".
> > > > Specifically driver_ok = false.
> > > > Handler in the scenario you describe will then see !driver_ok
> > > > and exit immediately.
> > >
> > > So just to make sure we're on the same page.
> > >
> > > 1) virtio_reset_device() will set the driver_ok to false;
> > > 2) virtio_device_ready() will set the driver_ok to true
> > >
> > > So for virtio drivers, it often did:
> > >
> > > 1) virtio_reset_device()
> > > 2) find_vqs() which will call request_irq()
> > > 3) other driver specific setups
> > > 4) virtio_device_ready()
> > >
> > > In virtio_device_ready(), the patch perform the following currently:
> > >
> > > smp_store_release(driver_ok, true);
> > > set_status(DRIVER_OK);
> > >
> > > Per your suggestion, to add synchronize_irq() after
> > > smp_store_release() so we had
> > >
> > > smp_store_release(driver_ok, true);
> > > synchornize_irq()
> > > set_status(DRIVER_OK)
> > >
> > > Suppose there's a interrupt raised before the synchronize_irq(), if we do:
> > >
> > > if (READ_ONCE(driver_ok)) {
> > > vq->callback()
> > > }
> > >
> > > It will see the driver_ok as true but how can we make sure
> > > vq->callback sees the driver specific setups (3) above?
> > >
> > > And an example is virtio_scsi():
> > >
> > > virtio_reset_device()
> > > virtscsi_probe()
> > > virtscsi_init()
> > > virtio_find_vqs()
> > > ...
> > > virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> > > ....
> > > virtio_device_ready()
> > >
> > > In virtscsi_event_done():
> > >
> > > virtscsi_event_done():
> > > virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
> > >
> > > We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
> > >
> > > Thanks
> >
> >
> > See response by Thomas. A simple if (!dev->driver_ok) should be enough,
> > it's all under a lock.
>
> Ordered through ACQUIRE+RELEASE actually since the irq handler is not
> running under the lock.
>
> Another question, for synchronize_irq() do you prefer
>
> 1) transport specific callbacks
> or
> 2) a simple synchornize_rcu()
>
> Thanks
1) I think, and I'd add a wrapper so we can switch to 2 if we really
want to. But for now synchronizing the specific irq is obviously designed to
make any changes to memory visible to this irq. that
seems cleaner and easier to understand than memory ordering tricks
and relying on side effects of synchornize_rcu, even though
internally this all boils down to memory ordering since
memory is what's used to implement locks :).
Not to mention, synchronize_irq just scales much better from performance
POV.
> >
> > > >
> > > >
> > > > > >
> > > > > >
> > > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > We use smp_store_relase()
> > > > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > > > which surprises me.
> > > > > > > > > > > >
> > > > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > > > >
> > > > > > > > > > > > > "
> > > > > > > > > > > > >
> > > > > > > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > > > > > > "
> > > > > > > > > > > > >
> > > > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > This is only for config interrupt.
> > > > > > > > > > >
> > > > > > > > > > > Ok.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > > > expensive.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > > > * */
> > > > > > > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > > > >
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > > > * and NMI handlers.
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > > > >
> > > > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > > > >
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > > > > > > """
> > > > > > > > > > > > >
> > > > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > You are right. So then
> > > > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > > > READ_ONCE should do.
> > > > > > > > > > >
> > > > > > > > > > > See above.
> > > > > > > > > > >
> > > > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Ok.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Do you mean:
> > > > > > > > > > > > >
> > > > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > > > interrupt handlers
> > > > > > > > > > > >
> > > > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > > > >
> > > > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > > > spinlock or others.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > > > for old hypervisors
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > > > >
> > > > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > > > codes. They look all fine since day0.
> > > > > > > > > > >
> > > > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > > > > > > in the debug build).
> > > > > > > > > > >
> > > > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > > > >
> > > > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > > > read back for validation in many cases.
> > > > > > > > > > >
> > > > > > > > > > > Thanks
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > > > > > > + }
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > > > bool failed;
> > > > > > > > > > > > > > > bool config_enabled;
> > > > > > > > > > > > > > > bool config_change_pending;
> > > > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > > > struct device dev;
> > > > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > +/*
> > > > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > > > + return true;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > > > > > > paired
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Will fix.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Thanks
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > /**
> > > > > > > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > > > * @vdev: the device
> > > > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > 2.25.1
> > > > > > > > > > > >
> > > > > > > > > >
> > > > > > > >
> > > > > >
> > > >
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-30 5:14 ` Re: Michael S. Tsirkin
@ 2022-03-30 5:53 ` Jason Wang
0 siblings, 0 replies; 417+ messages in thread
From: Jason Wang @ 2022-03-30 5:53 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, linux-kernel, Marc Zyngier, Thomas Gleixner,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Wed, Mar 30, 2022 at 1:14 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Mar 30, 2022 at 10:40:59AM +0800, Jason Wang wrote:
> > On Tue, Mar 29, 2022 at 10:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Mar 29, 2022 at 03:12:14PM +0800, Jason Wang wrote:
> > > > > > > > > And requesting irq commits all memory otherwise all drivers would be
> > > > > > > > > broken,
> > > > > > > >
> > > > > > > > So I think we might talk different issues:
> > > > > > > >
> > > > > > > > 1) Whether request_irq() commits the previous setups, I think the
> > > > > > > > answer is yes, since the spin_unlock of desc->lock (release) can
> > > > > > > > guarantee this though there seems no documentation around
> > > > > > > > request_irq() to say this.
> > > > > > > >
> > > > > > > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> > > > > > > > using smp_wmb() before the request_irq().
> > > > > > > >
> > > > > > > > And even if write is ordered we still need read to be ordered to be
> > > > > > > > paired with that.
> > > > >
> > > > > IMO it synchronizes with the CPU to which irq is
> > > > > delivered. Otherwise basically all drivers would be broken,
> > > > > wouldn't they be?
> > > >
> > > > I guess it's because most of the drivers don't care much about the
> > > > buggy/malicious device. And most of the devices may require an extra
> > > > step to enable device IRQ after request_irq(). Or it's the charge of
> > > > the driver to do the synchronization.
> > >
> > > It is true that the use-case of malicious devices is somewhat boutique.
> > > But I think most drivers do want to have their hotplug routines to be
> > > robust, yes.
> > >
> > > > > I don't know whether it's correct on all platforms, but if not
> > > > > we need to fix request_irq.
> > > > >
> > > > > > > >
> > > > > > > > > if it doesn't it just needs to be fixed, not worked around in
> > > > > > > > > virtio.
> > > > > > > >
> > > > > > > > 2) virtio drivers might do a lot of setups between request_irq() and
> > > > > > > > virtio_device_ready():
> > > > > > > >
> > > > > > > > request_irq()
> > > > > > > > driver specific setups
> > > > > > > > virtio_device_ready()
> > > > > > > >
> > > > > > > > CPU 0 probe) request_irq()
> > > > > > > > CPU 1 IRQ handler) read the uninitialized variable
> > > > > > > > CPU 0 probe) driver specific setups
> > > > > > > > CPU 0 probe) smp_store_release(intr_soft_enabled, true), commit the setups
> > > > > > > > CPU 1 IRQ handler) read irq_soft_enable as true
> > > > > > > > CPU 1 IRQ handler) use the uninitialized variable
> > > > > > > >
> > > > > > > > Thanks
> > > > > > >
> > > > > > >
> > > > > > > As I said, virtio_device_ready needs to do synchronize_irq.
> > > > > > > That will guarantee all setup is visible to the specific IRQ,
> > > > > >
> > > > > > Only the interrupt after synchronize_irq() returns.
> > > > >
> > > > > Anything else is a buggy device though.
> > > >
> > > > Yes, but the goal of this patch is to prevent the possible attack from
> > > > buggy(malicious) devices.
> > >
> > > Right. However if a driver of a *buggy* device somehow sees driver_ok =
> > > false even though it's actually initialized, that is not a deal breaker
> > > as that does not open us up to an attack.
> > >
> > > > >
> > > > > > >this
> > > > > > > is what it's point is.
> > > > > >
> > > > > > What happens if an interrupt is raised in the middle like:
> > > > > >
> > > > > > smp_store_release(dev->irq_soft_enabled, true)
> > > > > > IRQ handler
> > > > > > synchornize_irq()
> > > > > >
> > > > > > If we don't enforce a reading order, the IRQ handler may still see the
> > > > > > uninitialized variable.
> > > > > >
> > > > > > Thanks
> > > > >
> > > > > IMHO variables should be initialized before request_irq
> > > > > to a value meaning "not a valid interrupt".
> > > > > Specifically driver_ok = false.
> > > > > Handler in the scenario you describe will then see !driver_ok
> > > > > and exit immediately.
> > > >
> > > > So just to make sure we're on the same page.
> > > >
> > > > 1) virtio_reset_device() will set the driver_ok to false;
> > > > 2) virtio_device_ready() will set the driver_ok to true
> > > >
> > > > So for virtio drivers, it often did:
> > > >
> > > > 1) virtio_reset_device()
> > > > 2) find_vqs() which will call request_irq()
> > > > 3) other driver specific setups
> > > > 4) virtio_device_ready()
> > > >
> > > > In virtio_device_ready(), the patch perform the following currently:
> > > >
> > > > smp_store_release(driver_ok, true);
> > > > set_status(DRIVER_OK);
> > > >
> > > > Per your suggestion, to add synchronize_irq() after
> > > > smp_store_release() so we had
> > > >
> > > > smp_store_release(driver_ok, true);
> > > > synchornize_irq()
> > > > set_status(DRIVER_OK)
> > > >
> > > > Suppose there's a interrupt raised before the synchronize_irq(), if we do:
> > > >
> > > > if (READ_ONCE(driver_ok)) {
> > > > vq->callback()
> > > > }
> > > >
> > > > It will see the driver_ok as true but how can we make sure
> > > > vq->callback sees the driver specific setups (3) above?
> > > >
> > > > And an example is virtio_scsi():
> > > >
> > > > virtio_reset_device()
> > > > virtscsi_probe()
> > > > virtscsi_init()
> > > > virtio_find_vqs()
> > > > ...
> > > > virtscsi_init_vq(&vscsi->event_vq, vqs[1])
> > > > ....
> > > > virtio_device_ready()
> > > >
> > > > In virtscsi_event_done():
> > > >
> > > > virtscsi_event_done():
> > > > virtscsi_vq_done(vscsi, &vscsi->event_vq, ...);
> > > >
> > > > We need to make sure the even_done reads driver_ok before read vscsi->event_vq.
> > > >
> > > > Thanks
> > >
> > >
> > > See response by Thomas. A simple if (!dev->driver_ok) should be enough,
> > > it's all under a lock.
> >
> > Ordered through ACQUIRE+RELEASE actually since the irq handler is not
> > running under the lock.
> >
> > Another question, for synchronize_irq() do you prefer
> >
> > 1) transport specific callbacks
> > or
> > 2) a simple synchornize_rcu()
> >
> > Thanks
>
>
> 1) I think, and I'd add a wrapper so we can switch to 2 if we really
> want to. But for now synchronizing the specific irq is obviously designed to
> make any changes to memory visible to this irq. that
> seems cleaner and easier to understand than memory ordering tricks
> and relying on side effects of synchornize_rcu, even though
> internally this all boils down to memory ordering since
> memory is what's used to implement locks :).
> Not to mention, synchronize_irq just scales much better from performance
> POV.
Ok. Let me try to do that in V2.
Thanks
>
>
> > >
> > > > >
> > > > >
> > > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > We use smp_store_relase()
> > > > > > > > > > > > to make sure the driver commits the setup before enabling the irq. It
> > > > > > > > > > > > means the read needs to be ordered as well in vring_interrupt().
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > Although I couldn't find anything about this in memory-barriers.txt
> > > > > > > > > > > > > which surprises me.
> > > > > > > > > > > > >
> > > > > > > > > > > > > CC Paul to help make sure I'm right.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > To avoid breaking legacy device which can send IRQ before DRIVER_OK, a
> > > > > > > > > > > > > > > > module parameter is introduced to enable the hardening so function
> > > > > > > > > > > > > > > > hardening is disabled by default.
> > > > > > > > > > > > > > > Which devices are these? How come they send an interrupt before there
> > > > > > > > > > > > > > > are any buffers in any queues?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I copied this from the commit log for 22b7050a024d7
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > "
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > This change will also benefit old hypervisors (before 2009)
> > > > > > > > > > > > > > that send interrupts without checking DRIVER_OK: previously,
> > > > > > > > > > > > > > the callback could race with driver-specific initialization.
> > > > > > > > > > > > > > "
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > If this is only for config interrupt, I can remove the above log.
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > This is only for config interrupt.
> > > > > > > > > > > >
> > > > > > > > > > > > Ok.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Note that the hardening is only done for vring interrupt since the
> > > > > > > > > > > > > > > > config interrupt hardening is already done in commit 22b7050a024d7
> > > > > > > > > > > > > > > > ("virtio: defer config changed notifications"). But the method that is
> > > > > > > > > > > > > > > > used by config interrupt can't be reused by the vring interrupt
> > > > > > > > > > > > > > > > handler because it uses spinlock to do the synchronization which is
> > > > > > > > > > > > > > > > expensive.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Signed-off-by: Jason Wang <jasowang@redhat.com>
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > ---
> > > > > > > > > > > > > > > > drivers/virtio/virtio.c | 19 +++++++++++++++++++
> > > > > > > > > > > > > > > > drivers/virtio/virtio_ring.c | 9 ++++++++-
> > > > > > > > > > > > > > > > include/linux/virtio.h | 4 ++++
> > > > > > > > > > > > > > > > include/linux/virtio_config.h | 25 +++++++++++++++++++++++++
> > > > > > > > > > > > > > > > 4 files changed, 56 insertions(+), 1 deletion(-)
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > > index 8dde44ea044a..85e331efa9cc 100644
> > > > > > > > > > > > > > > > --- a/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio.c
> > > > > > > > > > > > > > > > @@ -7,6 +7,12 @@
> > > > > > > > > > > > > > > > #include <linux/of.h>
> > > > > > > > > > > > > > > > #include <uapi/linux/virtio_ids.h>
> > > > > > > > > > > > > > > > +static bool irq_hardening = false;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +module_param(irq_hardening, bool, 0444);
> > > > > > > > > > > > > > > > +MODULE_PARM_DESC(irq_hardening,
> > > > > > > > > > > > > > > > + "Disalbe IRQ software processing when it is not expected");
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > /* Unique numbering for virtio devices. */
> > > > > > > > > > > > > > > > static DEFINE_IDA(virtio_index_ida);
> > > > > > > > > > > > > > > > @@ -220,6 +226,15 @@ static int virtio_features_ok(struct virtio_device *dev)
> > > > > > > > > > > > > > > > * */
> > > > > > > > > > > > > > > > void virtio_reset_device(struct virtio_device *dev)
> > > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > > + * The below synchronize_rcu() guarantees that any
> > > > > > > > > > > > > > > > + * interrupt for this line arriving after
> > > > > > > > > > > > > > > > + * synchronize_rcu() has completed is guaranteed to see
> > > > > > > > > > > > > > > > + * irq_soft_enabled == false.
> > > > > > > > > > > > > > > News to me I did not know synchronize_rcu has anything to do
> > > > > > > > > > > > > > > with interrupts. Did not you intend to use synchronize_irq?
> > > > > > > > > > > > > > > I am not even 100% sure synchronize_rcu is by design a memory barrier
> > > > > > > > > > > > > > > though it's most likely is ...
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > According to the comment above tree RCU version of synchronize_rcu():
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > * RCU read-side critical sections are delimited by rcu_read_lock()
> > > > > > > > > > > > > > * and rcu_read_unlock(), and may be nested. In addition, but only in
> > > > > > > > > > > > > > * v5.0 and later, regions of code across which interrupts, preemption,
> > > > > > > > > > > > > > * or softirqs have been disabled also serve as RCU read-side critical
> > > > > > > > > > > > > > * sections. This includes hardware interrupt handlers, softirq handlers,
> > > > > > > > > > > > > > * and NMI handlers.
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So interrupt handlers are treated as read-side critical sections.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > And it has the comment for explain the barrier:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > * Note that this guarantee implies further memory-ordering guarantees.
> > > > > > > > > > > > > > * On systems with more than one CPU, when synchronize_rcu() returns,
> > > > > > > > > > > > > > * each CPU is guaranteed to have executed a full memory barrier since
> > > > > > > > > > > > > > * the end of its last RCU read-side critical section whose beginning
> > > > > > > > > > > > > > * preceded the call to synchronize_rcu(). In addition, each CPU having
> > > > > > > > > > > > > > """
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So on SMP it provides a full barrier. And for UP/tiny RCU we don't need the
> > > > > > > > > > > > > > barrier, if the interrupt come after WRITE_ONCE() it will see the
> > > > > > > > > > > > > > irq_soft_enabled as false.
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > You are right. So then
> > > > > > > > > > > > > 1. I do not think we need load_acquire - why is it needed? Just
> > > > > > > > > > > > > READ_ONCE should do.
> > > > > > > > > > > >
> > > > > > > > > > > > See above.
> > > > > > > > > > > >
> > > > > > > > > > > > > 2. isn't synchronize_irq also doing the same thing?
> > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > Yes, but it requires a config ops since the IRQ knowledge is transport specific.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > > + WRITE_ONCE(dev->irq_soft_enabled, false);
> > > > > > > > > > > > > > > > + synchronize_rcu();
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > dev->config->reset(dev);
> > > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > > EXPORT_SYMBOL_GPL(virtio_reset_device);
> > > > > > > > > > > > > > > Please add comment explaining where it will be enabled.
> > > > > > > > > > > > > > > Also, we *really* don't need to synch if it was already disabled,
> > > > > > > > > > > > > > > let's not add useless overhead to the boot sequence.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Ok.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > @@ -427,6 +442,10 @@ int register_virtio_device(struct virtio_device *dev)
> > > > > > > > > > > > > > > > spin_lock_init(&dev->config_lock);
> > > > > > > > > > > > > > > > dev->config_enabled = false;
> > > > > > > > > > > > > > > > dev->config_change_pending = false;
> > > > > > > > > > > > > > > > + dev->irq_soft_check = irq_hardening;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > + if (dev->irq_soft_check)
> > > > > > > > > > > > > > > > + dev_info(&dev->dev, "IRQ hardening is enabled\n");
> > > > > > > > > > > > > > > > /* We always start by resetting the device, in case a previous
> > > > > > > > > > > > > > > > * driver messed it up. This also tests that code path a little. */
> > > > > > > > > > > > > > > one of the points of hardening is it's also helpful for buggy
> > > > > > > > > > > > > > > devices. this flag defeats the purpose.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Do you mean:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > 1) we need something like config_enable? This seems not easy to be
> > > > > > > > > > > > > > implemented without obvious overhead, mainly the synchronize with the
> > > > > > > > > > > > > > interrupt handlers
> > > > > > > > > > > > >
> > > > > > > > > > > > > But synchronize is only on tear-down path. That is not critical for any
> > > > > > > > > > > > > users at the moment, even less than probe.
> > > > > > > > > > > >
> > > > > > > > > > > > I meant if we have vq->irq_pending, we need to call vring_interrupt()
> > > > > > > > > > > > in the virtio_device_ready() and synchronize the IRQ handlers with
> > > > > > > > > > > > spinlock or others.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > 2) enable this by default, so I don't object, but this may have some risk
> > > > > > > > > > > > > > for old hypervisors
> > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > The risk if there's a driver adding buffers without setting DRIVER_OK.
> > > > > > > > > > > >
> > > > > > > > > > > > Probably not, we have devices that accept random inputs from outside,
> > > > > > > > > > > > net, console, input etc. I've done a round of audits of the Qemu
> > > > > > > > > > > > codes. They look all fine since day0.
> > > > > > > > > > > >
> > > > > > > > > > > > > So with this approach, how about we rename the flag "driver_ok"?
> > > > > > > > > > > > > And then add_buf can actually test it and BUG_ON if not there (at least
> > > > > > > > > > > > > in the debug build).
> > > > > > > > > > > >
> > > > > > > > > > > > This looks like a hardening of the driver in the core instead of the
> > > > > > > > > > > > device. I think it can be done but in a separate series.
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > And going down from there, how about we cache status in the
> > > > > > > > > > > > > device? Then we don't need to keep re-reading it every time,
> > > > > > > > > > > > > speeding boot up a tiny bit.
> > > > > > > > > > > >
> > > > > > > > > > > > I don't fully understand here, actually spec requires status to be
> > > > > > > > > > > > read back for validation in many cases.
> > > > > > > > > > > >
> > > > > > > > > > > > Thanks
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > > index 962f1477b1fa..0170f8c784d8 100644
> > > > > > > > > > > > > > > > --- a/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > > +++ b/drivers/virtio/virtio_ring.c
> > > > > > > > > > > > > > > > @@ -2144,10 +2144,17 @@ static inline bool more_used(const struct vring_virtqueue *vq)
> > > > > > > > > > > > > > > > return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq);
> > > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > > -irqreturn_t vring_interrupt(int irq, void *_vq)
> > > > > > > > > > > > > > > > +irqreturn_t vring_interrupt(int irq, void *v)
> > > > > > > > > > > > > > > > {
> > > > > > > > > > > > > > > > + struct virtqueue *_vq = v;
> > > > > > > > > > > > > > > > + struct virtio_device *vdev = _vq->vdev;
> > > > > > > > > > > > > > > > struct vring_virtqueue *vq = to_vvq(_vq);
> > > > > > > > > > > > > > > > + if (!virtio_irq_soft_enabled(vdev)) {
> > > > > > > > > > > > > > > > + dev_warn_once(&vdev->dev, "virtio vring IRQ raised before DRIVER_OK");
> > > > > > > > > > > > > > > > + return IRQ_NONE;
> > > > > > > > > > > > > > > > + }
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > if (!more_used(vq)) {
> > > > > > > > > > > > > > > > pr_debug("virtqueue interrupt with no work for %p\n", vq);
> > > > > > > > > > > > > > > > return IRQ_NONE;
> > > > > > > > > > > > > > > > diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> > > > > > > > > > > > > > > > index 5464f398912a..957d6ad604ac 100644
> > > > > > > > > > > > > > > > --- a/include/linux/virtio.h
> > > > > > > > > > > > > > > > +++ b/include/linux/virtio.h
> > > > > > > > > > > > > > > > @@ -95,6 +95,8 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *vq);
> > > > > > > > > > > > > > > > * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore)
> > > > > > > > > > > > > > > > * @config_enabled: configuration change reporting enabled
> > > > > > > > > > > > > > > > * @config_change_pending: configuration change reported while disabled
> > > > > > > > > > > > > > > > + * @irq_soft_check: whether or not to check @irq_soft_enabled
> > > > > > > > > > > > > > > > + * @irq_soft_enabled: callbacks enabled
> > > > > > > > > > > > > > > > * @config_lock: protects configuration change reporting
> > > > > > > > > > > > > > > > * @dev: underlying device.
> > > > > > > > > > > > > > > > * @id: the device type identification (used to match it with a driver).
> > > > > > > > > > > > > > > > @@ -109,6 +111,8 @@ struct virtio_device {
> > > > > > > > > > > > > > > > bool failed;
> > > > > > > > > > > > > > > > bool config_enabled;
> > > > > > > > > > > > > > > > bool config_change_pending;
> > > > > > > > > > > > > > > > + bool irq_soft_check;
> > > > > > > > > > > > > > > > + bool irq_soft_enabled;
> > > > > > > > > > > > > > > > spinlock_t config_lock;
> > > > > > > > > > > > > > > > spinlock_t vqs_list_lock; /* Protects VQs list access */
> > > > > > > > > > > > > > > > struct device dev;
> > > > > > > > > > > > > > > > diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > > index dafdc7f48c01..9c1b61f2e525 100644
> > > > > > > > > > > > > > > > --- a/include/linux/virtio_config.h
> > > > > > > > > > > > > > > > +++ b/include/linux/virtio_config.h
> > > > > > > > > > > > > > > > @@ -174,6 +174,24 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev,
> > > > > > > > > > > > > > > > return __virtio_test_bit(vdev, fbit);
> > > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > > +/*
> > > > > > > > > > > > > > > > + * virtio_irq_soft_enabled: whether we can execute callbacks
> > > > > > > > > > > > > > > > + * @vdev: the device
> > > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > > +static inline bool virtio_irq_soft_enabled(const struct virtio_device *vdev)
> > > > > > > > > > > > > > > > +{
> > > > > > > > > > > > > > > > + if (!vdev->irq_soft_check)
> > > > > > > > > > > > > > > > + return true;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > > + * Read irq_soft_enabled before reading other device specific
> > > > > > > > > > > > > > > > + * data. Paried with smp_store_relase() in
> > > > > > > > > > > > > > > paired
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Will fix.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Thanks
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > + * virtio_device_ready() and WRITE_ONCE()/synchronize_rcu() in
> > > > > > > > > > > > > > > > + * virtio_reset_device().
> > > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > > + return smp_load_acquire(&vdev->irq_soft_enabled);
> > > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > /**
> > > > > > > > > > > > > > > > * virtio_has_dma_quirk - determine whether this device has the DMA quirk
> > > > > > > > > > > > > > > > * @vdev: the device
> > > > > > > > > > > > > > > > @@ -236,6 +254,13 @@ void virtio_device_ready(struct virtio_device *dev)
> > > > > > > > > > > > > > > > if (dev->config->enable_cbs)
> > > > > > > > > > > > > > > > dev->config->enable_cbs(dev);
> > > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > > + * Commit the driver setup before enabling the virtqueue
> > > > > > > > > > > > > > > > + * callbacks. Paried with smp_load_acuqire() in
> > > > > > > > > > > > > > > > + * virtio_irq_soft_enabled()
> > > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > > + smp_store_release(&dev->irq_soft_enabled, true);
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > BUG_ON(status & VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > > > dev->config->set_status(dev, status | VIRTIO_CONFIG_S_DRIVER_OK);
> > > > > > > > > > > > > > > > }
> > > > > > > > > > > > > > > > --
> > > > > > > > > > > > > > > > 2.25.1
> > > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > >
> > > > > > >
> > > > >
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-28 10:40 ` Re: Michael S. Tsirkin
2022-03-29 7:12 ` Re: Jason Wang
@ 2022-03-29 8:35 ` Thomas Gleixner
2022-03-29 14:37 ` Re: Michael S. Tsirkin
2022-04-12 6:55 ` Re: Michael S. Tsirkin
1 sibling, 2 replies; 417+ messages in thread
From: Thomas Gleixner @ 2022-03-29 8:35 UTC (permalink / raw)
To: Michael S. Tsirkin, Jason Wang
Cc: virtualization, linux-kernel, Marc Zyngier, Peter Zijlstra,
Stefano Garzarella, Keir Fraser, Paul E. McKenney
On Mon, Mar 28 2022 at 06:40, Michael S. Tsirkin wrote:
> On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
>> > > So I think we might talk different issues:
>> > >
>> > > 1) Whether request_irq() commits the previous setups, I think the
>> > > answer is yes, since the spin_unlock of desc->lock (release) can
>> > > guarantee this though there seems no documentation around
>> > > request_irq() to say this.
>> > >
>> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
>> > > using smp_wmb() before the request_irq().
That's a complete bogus example especially as there is not a single
smp_rmb() which pairs with the smp_wmb().
>> > > And even if write is ordered we still need read to be ordered to be
>> > > paired with that.
>
> IMO it synchronizes with the CPU to which irq is
> delivered. Otherwise basically all drivers would be broken,
> wouldn't they be?
> I don't know whether it's correct on all platforms, but if not
> we need to fix request_irq.
There is nothing to fix:
request_irq()
raw_spin_lock_irq(desc->lock); // ACQUIRE
....
raw_spin_unlock_irq(desc->lock); // RELEASE
interrupt()
raw_spin_lock(desc->lock); // ACQUIRE
set status to IN_PROGRESS
raw_spin_unlock(desc->lock); // RELEASE
invoke handler()
So anything which the driver set up _before_ request_irq() is visible to
the interrupt handler. No?
>> What happens if an interrupt is raised in the middle like:
>>
>> smp_store_release(dev->irq_soft_enabled, true)
>> IRQ handler
>> synchornize_irq()
This is bogus. The obvious order of things is:
dev->ok = false;
request_irq();
moar_setup();
synchronize_irq(); // ACQUIRE + RELEASE
dev->ok = true;
The reverse operation on teardown:
dev->ok = false;
synchronize_irq(); // ACQUIRE + RELEASE
teardown();
So in both cases a simple check in the handler is sufficient:
handler()
if (!dev->ok)
return;
I'm not understanding what you folks are trying to "fix" here. If any
driver does this in the wrong order, then the driver is broken.
Sure, you can do the same with:
dev->ok = false;
request_irq();
moar_setup();
smp_wmb();
dev->ok = true;
for the price of a smp_rmb() in the interrupt handler:
handler()
if (!dev->ok)
return;
smp_rmb();
but that's only working for the setup case correctly and not for
teardown.
Thanks,
tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 8:35 ` Re: Thomas Gleixner
@ 2022-03-29 14:37 ` Michael S. Tsirkin
2022-03-29 18:13 ` Re: Thomas Gleixner
2022-04-12 6:55 ` Re: Michael S. Tsirkin
1 sibling, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-29 14:37 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> On Mon, Mar 28 2022 at 06:40, Michael S. Tsirkin wrote:
> > On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> >> > > So I think we might talk different issues:
> >> > >
> >> > > 1) Whether request_irq() commits the previous setups, I think the
> >> > > answer is yes, since the spin_unlock of desc->lock (release) can
> >> > > guarantee this though there seems no documentation around
> >> > > request_irq() to say this.
> >> > >
> >> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> >> > > using smp_wmb() before the request_irq().
>
> That's a complete bogus example especially as there is not a single
> smp_rmb() which pairs with the smp_wmb().
>
> >> > > And even if write is ordered we still need read to be ordered to be
> >> > > paired with that.
> >
> > IMO it synchronizes with the CPU to which irq is
> > delivered. Otherwise basically all drivers would be broken,
> > wouldn't they be?
> > I don't know whether it's correct on all platforms, but if not
> > we need to fix request_irq.
>
> There is nothing to fix:
>
> request_irq()
> raw_spin_lock_irq(desc->lock); // ACQUIRE
> ....
> raw_spin_unlock_irq(desc->lock); // RELEASE
>
> interrupt()
> raw_spin_lock(desc->lock); // ACQUIRE
> set status to IN_PROGRESS
> raw_spin_unlock(desc->lock); // RELEASE
> invoke handler()
>
> So anything which the driver set up _before_ request_irq() is visible to
> the interrupt handler. No?
> >> What happens if an interrupt is raised in the middle like:
> >>
> >> smp_store_release(dev->irq_soft_enabled, true)
> >> IRQ handler
> >> synchornize_irq()
>
> This is bogus. The obvious order of things is:
>
> dev->ok = false;
> request_irq();
>
> moar_setup();
> synchronize_irq(); // ACQUIRE + RELEASE
> dev->ok = true;
>
> The reverse operation on teardown:
>
> dev->ok = false;
> synchronize_irq(); // ACQUIRE + RELEASE
>
> teardown();
>
> So in both cases a simple check in the handler is sufficient:
>
> handler()
> if (!dev->ok)
> return;
Thanks a lot for the analysis Thomas. This is more or less what I was
thinking.
>
> I'm not understanding what you folks are trying to "fix" here.
We are trying to fix the driver since at the moment it does not
have the dev->ok flag at all.
And I suspect virtio is not alone in that.
So it would have been nice if there was a standard flag
replacing the driver-specific dev->ok above, and ideally
would also handle the case of an interrupt triggering
too early by deferring the interrupt until the flag is set.
And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
enable_irq instead of dev->ok = true, except
- it doesn't work with affinity managed IRQs
- it does not work with shared IRQs
So using dev->ok as you propose above seems better at this point.
> If any
> driver does this in the wrong order, then the driver is broken.
I agree, however:
$ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
113
$ git grep -l request_irq drivers/net/|wc -l
397
I suspect there are more drivers which in theory need the
synchronize_irq dance but in practice do not execute it.
> Sure, you can do the same with:
>
> dev->ok = false;
> request_irq();
> moar_setup();
> smp_wmb();
> dev->ok = true;
>
> for the price of a smp_rmb() in the interrupt handler:
>
> handler()
> if (!dev->ok)
> return;
> smp_rmb();
>
> but that's only working for the setup case correctly and not for
> teardown.
>
> Thanks,
>
> tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 14:37 ` Re: Michael S. Tsirkin
@ 2022-03-29 18:13 ` Thomas Gleixner
2022-03-29 22:04 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Thomas Gleixner @ 2022-03-29 18:13 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> We are trying to fix the driver since at the moment it does not
> have the dev->ok flag at all.
>
> And I suspect virtio is not alone in that.
> So it would have been nice if there was a standard flag
> replacing the driver-specific dev->ok above, and ideally
> would also handle the case of an interrupt triggering
> too early by deferring the interrupt until the flag is set.
>
> And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> enable_irq instead of dev->ok = true, except
> - it doesn't work with affinity managed IRQs
> - it does not work with shared IRQs
>
> So using dev->ok as you propose above seems better at this point.
Unless there is a big enough amount of drivers which could make use of a
generic mechanism for that.
>> If any driver does this in the wrong order, then the driver is
>> broken.
>
> I agree, however:
> $ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> 113
> $ git grep -l request_irq drivers/net/|wc -l
> 397
>
> I suspect there are more drivers which in theory need the
> synchronize_irq dance but in practice do not execute it.
That really depends on when the driver requests the interrupt, when
it actually enables the interrupt in the device itself and how the
interrupt service routine works.
So just doing that grep dance does not tell much. You really have to do
a case by case analysis.
Thanks,
tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 18:13 ` Re: Thomas Gleixner
@ 2022-03-29 22:04 ` Michael S. Tsirkin
2022-03-30 2:38 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-29 22:04 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > We are trying to fix the driver since at the moment it does not
> > have the dev->ok flag at all.
> >
> > And I suspect virtio is not alone in that.
> > So it would have been nice if there was a standard flag
> > replacing the driver-specific dev->ok above, and ideally
> > would also handle the case of an interrupt triggering
> > too early by deferring the interrupt until the flag is set.
> >
> > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > enable_irq instead of dev->ok = true, except
> > - it doesn't work with affinity managed IRQs
> > - it does not work with shared IRQs
> >
> > So using dev->ok as you propose above seems better at this point.
>
> Unless there is a big enough amount of drivers which could make use of a
> generic mechanism for that.
>
> >> If any driver does this in the wrong order, then the driver is
> >> broken.
> >
> > I agree, however:
> > $ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > 113
> > $ git grep -l request_irq drivers/net/|wc -l
> > 397
> >
> > I suspect there are more drivers which in theory need the
> > synchronize_irq dance but in practice do not execute it.
>
> That really depends on when the driver requests the interrupt, when
> it actually enables the interrupt in the device itself
This last point does not matter since we are talking about protecting
against buggy/malicious devices. They can inject the interrupt anyway
even if driver did not configure it.
> and how the
> interrupt service routine works.
>
> So just doing that grep dance does not tell much. You really have to do
> a case by case analysis.
>
> Thanks,
>
> tglx
I agree. In fact, at least for network the standard approach is to
request interrupts in the open call, virtio net is unusual
in doing it in probe. We should consider changing that.
Jason?
--
MST
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 22:04 ` Re: Michael S. Tsirkin
@ 2022-03-30 2:38 ` Jason Wang
2022-03-30 5:09 ` Re: Michael S. Tsirkin
0 siblings, 1 reply; 417+ messages in thread
From: Jason Wang @ 2022-03-30 2:38 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Thomas Gleixner, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Wed, Mar 30, 2022 at 6:04 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> > On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > > We are trying to fix the driver since at the moment it does not
> > > have the dev->ok flag at all.
> > >
> > > And I suspect virtio is not alone in that.
> > > So it would have been nice if there was a standard flag
> > > replacing the driver-specific dev->ok above, and ideally
> > > would also handle the case of an interrupt triggering
> > > too early by deferring the interrupt until the flag is set.
> > >
> > > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > > enable_irq instead of dev->ok = true, except
> > > - it doesn't work with affinity managed IRQs
> > > - it does not work with shared IRQs
> > >
> > > So using dev->ok as you propose above seems better at this point.
> >
> > Unless there is a big enough amount of drivers which could make use of a
> > generic mechanism for that.
> >
> > >> If any driver does this in the wrong order, then the driver is
> > >> broken.
> > >
> > > I agree, however:
> > > $ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > > 113
> > > $ git grep -l request_irq drivers/net/|wc -l
> > > 397
> > >
> > > I suspect there are more drivers which in theory need the
> > > synchronize_irq dance but in practice do not execute it.
> >
> > That really depends on when the driver requests the interrupt, when
> > it actually enables the interrupt in the device itself
>
> This last point does not matter since we are talking about protecting
> against buggy/malicious devices. They can inject the interrupt anyway
> even if driver did not configure it.
>
> > and how the
> > interrupt service routine works.
> >
> > So just doing that grep dance does not tell much. You really have to do
> > a case by case analysis.
> >
> > Thanks,
> >
> > tglx
>
>
> I agree. In fact, at least for network the standard approach is to
> request interrupts in the open call, virtio net is unusual
> in doing it in probe. We should consider changing that.
> Jason?
This probably works only for virtio-net and it looks like not trivial
since we don't have a specific core API to request interrupts.
Thanks
>
> --
> MST
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-30 2:38 ` Re: Jason Wang
@ 2022-03-30 5:09 ` Michael S. Tsirkin
2022-03-30 5:53 ` Re: Jason Wang
0 siblings, 1 reply; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-03-30 5:09 UTC (permalink / raw)
To: Jason Wang
Cc: Thomas Gleixner, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Wed, Mar 30, 2022 at 10:38:06AM +0800, Jason Wang wrote:
> On Wed, Mar 30, 2022 at 6:04 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> > > On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > > > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > > > We are trying to fix the driver since at the moment it does not
> > > > have the dev->ok flag at all.
> > > >
> > > > And I suspect virtio is not alone in that.
> > > > So it would have been nice if there was a standard flag
> > > > replacing the driver-specific dev->ok above, and ideally
> > > > would also handle the case of an interrupt triggering
> > > > too early by deferring the interrupt until the flag is set.
> > > >
> > > > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > > > enable_irq instead of dev->ok = true, except
> > > > - it doesn't work with affinity managed IRQs
> > > > - it does not work with shared IRQs
> > > >
> > > > So using dev->ok as you propose above seems better at this point.
> > >
> > > Unless there is a big enough amount of drivers which could make use of a
> > > generic mechanism for that.
> > >
> > > >> If any driver does this in the wrong order, then the driver is
> > > >> broken.
> > > >
> > > > I agree, however:
> > > > $ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > > > 113
> > > > $ git grep -l request_irq drivers/net/|wc -l
> > > > 397
> > > >
> > > > I suspect there are more drivers which in theory need the
> > > > synchronize_irq dance but in practice do not execute it.
> > >
> > > That really depends on when the driver requests the interrupt, when
> > > it actually enables the interrupt in the device itself
> >
> > This last point does not matter since we are talking about protecting
> > against buggy/malicious devices. They can inject the interrupt anyway
> > even if driver did not configure it.
> >
> > > and how the
> > > interrupt service routine works.
> > >
> > > So just doing that grep dance does not tell much. You really have to do
> > > a case by case analysis.
> > >
> > > Thanks,
> > >
> > > tglx
> >
> >
> > I agree. In fact, at least for network the standard approach is to
> > request interrupts in the open call, virtio net is unusual
> > in doing it in probe. We should consider changing that.
> > Jason?
>
> This probably works only for virtio-net and it looks like not trivial
> since we don't have a specific core API to request interrupts.
>
> Thanks
We'll need a new API, for sure. E.g. find vqs with no
callback on probe, and then virtio_request_vq_callbacks separately.
The existing API that specifies callbacks during find vqs
can be used by other drivers.
> >
> > --
> > MST
> >
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-30 5:09 ` Re: Michael S. Tsirkin
@ 2022-03-30 5:53 ` Jason Wang
0 siblings, 0 replies; 417+ messages in thread
From: Jason Wang @ 2022-03-30 5:53 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Thomas Gleixner, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Wed, Mar 30, 2022 at 1:09 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Mar 30, 2022 at 10:38:06AM +0800, Jason Wang wrote:
> > On Wed, Mar 30, 2022 at 6:04 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Tue, Mar 29, 2022 at 08:13:57PM +0200, Thomas Gleixner wrote:
> > > > On Tue, Mar 29 2022 at 10:37, Michael S. Tsirkin wrote:
> > > > > On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> > > > > We are trying to fix the driver since at the moment it does not
> > > > > have the dev->ok flag at all.
> > > > >
> > > > > And I suspect virtio is not alone in that.
> > > > > So it would have been nice if there was a standard flag
> > > > > replacing the driver-specific dev->ok above, and ideally
> > > > > would also handle the case of an interrupt triggering
> > > > > too early by deferring the interrupt until the flag is set.
> > > > >
> > > > > And in fact, it does kind of exist: IRQF_NO_AUTOEN, and you would call
> > > > > enable_irq instead of dev->ok = true, except
> > > > > - it doesn't work with affinity managed IRQs
> > > > > - it does not work with shared IRQs
> > > > >
> > > > > So using dev->ok as you propose above seems better at this point.
> > > >
> > > > Unless there is a big enough amount of drivers which could make use of a
> > > > generic mechanism for that.
> > > >
> > > > >> If any driver does this in the wrong order, then the driver is
> > > > >> broken.
> > > > >
> > > > > I agree, however:
> > > > > $ git grep synchronize_irq `git grep -l request_irq drivers/net/`|wc -l
> > > > > 113
> > > > > $ git grep -l request_irq drivers/net/|wc -l
> > > > > 397
> > > > >
> > > > > I suspect there are more drivers which in theory need the
> > > > > synchronize_irq dance but in practice do not execute it.
> > > >
> > > > That really depends on when the driver requests the interrupt, when
> > > > it actually enables the interrupt in the device itself
> > >
> > > This last point does not matter since we are talking about protecting
> > > against buggy/malicious devices. They can inject the interrupt anyway
> > > even if driver did not configure it.
> > >
> > > > and how the
> > > > interrupt service routine works.
> > > >
> > > > So just doing that grep dance does not tell much. You really have to do
> > > > a case by case analysis.
> > > >
> > > > Thanks,
> > > >
> > > > tglx
> > >
> > >
> > > I agree. In fact, at least for network the standard approach is to
> > > request interrupts in the open call, virtio net is unusual
> > > in doing it in probe. We should consider changing that.
> > > Jason?
> >
> > This probably works only for virtio-net and it looks like not trivial
> > since we don't have a specific core API to request interrupts.
> >
> > Thanks
>
> We'll need a new API, for sure. E.g. find vqs with no
> callback on probe, and then virtio_request_vq_callbacks separately.
>
> The existing API that specifies callbacks during find vqs
> can be used by other drivers.
Ok, I will do it.
Thanks
>
> > >
> > > --
> > > MST
> > >
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-03-29 8:35 ` Re: Thomas Gleixner
2022-03-29 14:37 ` Re: Michael S. Tsirkin
@ 2022-04-12 6:55 ` Michael S. Tsirkin
1 sibling, 0 replies; 417+ messages in thread
From: Michael S. Tsirkin @ 2022-04-12 6:55 UTC (permalink / raw)
To: Thomas Gleixner
Cc: Jason Wang, virtualization, linux-kernel, Marc Zyngier,
Peter Zijlstra, Stefano Garzarella, Keir Fraser,
Paul E. McKenney
On Tue, Mar 29, 2022 at 10:35:21AM +0200, Thomas Gleixner wrote:
> On Mon, Mar 28 2022 at 06:40, Michael S. Tsirkin wrote:
> > On Mon, Mar 28, 2022 at 02:18:22PM +0800, Jason Wang wrote:
> >> > > So I think we might talk different issues:
> >> > >
> >> > > 1) Whether request_irq() commits the previous setups, I think the
> >> > > answer is yes, since the spin_unlock of desc->lock (release) can
> >> > > guarantee this though there seems no documentation around
> >> > > request_irq() to say this.
> >> > >
> >> > > And I can see at least drivers/video/fbdev/omap2/omapfb/dss/dispc.c is
> >> > > using smp_wmb() before the request_irq().
>
> That's a complete bogus example especially as there is not a single
> smp_rmb() which pairs with the smp_wmb().
>
> >> > > And even if write is ordered we still need read to be ordered to be
> >> > > paired with that.
> >
> > IMO it synchronizes with the CPU to which irq is
> > delivered. Otherwise basically all drivers would be broken,
> > wouldn't they be?
> > I don't know whether it's correct on all platforms, but if not
> > we need to fix request_irq.
>
> There is nothing to fix:
>
> request_irq()
> raw_spin_lock_irq(desc->lock); // ACQUIRE
> ....
> raw_spin_unlock_irq(desc->lock); // RELEASE
>
> interrupt()
> raw_spin_lock(desc->lock); // ACQUIRE
> set status to IN_PROGRESS
> raw_spin_unlock(desc->lock); // RELEASE
> invoke handler()
>
> So anything which the driver set up _before_ request_irq() is visible to
> the interrupt handler. No?
>
> >> What happens if an interrupt is raised in the middle like:
> >>
> >> smp_store_release(dev->irq_soft_enabled, true)
> >> IRQ handler
> >> synchornize_irq()
>
> This is bogus. The obvious order of things is:
>
> dev->ok = false;
> request_irq();
>
> moar_setup();
> synchronize_irq(); // ACQUIRE + RELEASE
> dev->ok = true;
>
> The reverse operation on teardown:
>
> dev->ok = false;
> synchronize_irq(); // ACQUIRE + RELEASE
>
> teardown();
>
> So in both cases a simple check in the handler is sufficient:
>
> handler()
> if (!dev->ok)
> return;
Does this need to be if (!READ_ONCE(dev->ok)) ?
> I'm not understanding what you folks are trying to "fix" here. If any
> driver does this in the wrong order, then the driver is broken.
>
> Sure, you can do the same with:
>
> dev->ok = false;
> request_irq();
> moar_setup();
> smp_wmb();
> dev->ok = true;
>
> for the price of a smp_rmb() in the interrupt handler:
>
> handler()
> if (!dev->ok)
> return;
> smp_rmb();
>
> but that's only working for the setup case correctly and not for
> teardown.
>
> Thanks,
>
> tglx
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2022-01-20 15:28 Myrtle Shah
2022-01-20 15:37 ` Vitaly Wool
0 siblings, 1 reply; 417+ messages in thread
From: Myrtle Shah @ 2022-01-20 15:28 UTC (permalink / raw)
To: linux-riscv, paul.walmsley, palmer; +Cc: linux-kernel
These are some initial patches to bugs I found attempting to
get a XIP kernel working on hardware:
- 32-bit VexRiscv processor
- kernel in SPI flash, at 0x00200000
- 16MB of RAM at 0x10000000
- MMU enabled
I still have some more debugging to do, but these at least
get the kernel as far as initialising the MMU, and I would
appreciate feedback if anyone else is working on RISC-V XIP.
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-01-20 15:28 Myrtle Shah
@ 2022-01-20 15:37 ` Vitaly Wool
2022-01-20 23:29 ` Re: Damien Le Moal
2022-02-04 21:45 ` Re: Palmer Dabbelt
0 siblings, 2 replies; 417+ messages in thread
From: Vitaly Wool @ 2022-01-20 15:37 UTC (permalink / raw)
To: Myrtle Shah; +Cc: linux-riscv, Paul Walmsley, Palmer Dabbelt, LKML
Hey,
On Thu, Jan 20, 2022 at 4:30 PM Myrtle Shah <gatecat@ds0.me> wrote:
>
> These are some initial patches to bugs I found attempting to
> get a XIP kernel working on hardware:
> - 32-bit VexRiscv processor
> - kernel in SPI flash, at 0x00200000
> - 16MB of RAM at 0x10000000
> - MMU enabled
>
> I still have some more debugging to do, but these at least
> get the kernel as far as initialising the MMU, and I would
> appreciate feedback if anyone else is working on RISC-V XIP.
I'll try to support you as much as I can, unfortunately I don't have
any 32-bit RISC-V around so I was rather thinking of extending the
RISC-V XIP support to 64-bit non-MMU targets.
For now just please keep in mind that there might be some inherent
assumptions that a target is 64 bit.
Best regards,
Vitaly
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-01-20 15:37 ` Vitaly Wool
@ 2022-01-20 23:29 ` Damien Le Moal
2022-02-04 21:45 ` Re: Palmer Dabbelt
1 sibling, 0 replies; 417+ messages in thread
From: Damien Le Moal @ 2022-01-20 23:29 UTC (permalink / raw)
To: Vitaly Wool, Myrtle Shah; +Cc: linux-riscv, Paul Walmsley, Palmer Dabbelt, LKML
On 2022/01/21 0:37, Vitaly Wool wrote:
> Hey,
>
> On Thu, Jan 20, 2022 at 4:30 PM Myrtle Shah <gatecat@ds0.me> wrote:
>>
>> These are some initial patches to bugs I found attempting to
>> get a XIP kernel working on hardware:
>> - 32-bit VexRiscv processor
>> - kernel in SPI flash, at 0x00200000
>> - 16MB of RAM at 0x10000000
>> - MMU enabled
>>
>> I still have some more debugging to do, but these at least
>> get the kernel as far as initialising the MMU, and I would
>> appreciate feedback if anyone else is working on RISC-V XIP.
>
> I'll try to support you as much as I can, unfortunately I don't have
> any 32-bit RISC-V around so I was rather thinking of extending the
> RISC-V XIP support to 64-bit non-MMU targets.
That would be great ! I am completing the buildroot patches for the K210. Got
u-boot almost working for SD card boot too (fighting a problem with rootfs
kernel mount on boot when using u-boot though).
> For now just please keep in mind that there might be some inherent
> assumptions that a target is 64 bit.
>
> Best regards,
> Vitaly
>
>>
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2022-01-20 15:37 ` Vitaly Wool
2022-01-20 23:29 ` Re: Damien Le Moal
@ 2022-02-04 21:45 ` Palmer Dabbelt
1 sibling, 0 replies; 417+ messages in thread
From: Palmer Dabbelt @ 2022-02-04 21:45 UTC (permalink / raw)
To: vitaly.wool; +Cc: gatecat, linux-riscv, Paul Walmsley, linux-kernel
On Thu, 20 Jan 2022 07:37:00 PST (-0800), vitaly.wool@konsulko.com wrote:
> Hey,
>
> On Thu, Jan 20, 2022 at 4:30 PM Myrtle Shah <gatecat@ds0.me> wrote:
>>
>> These are some initial patches to bugs I found attempting to
>> get a XIP kernel working on hardware:
>> - 32-bit VexRiscv processor
>> - kernel in SPI flash, at 0x00200000
>> - 16MB of RAM at 0x10000000
>> - MMU enabled
>>
>> I still have some more debugging to do, but these at least
>> get the kernel as far as initialising the MMU, and I would
>> appreciate feedback if anyone else is working on RISC-V XIP.
>
> I'll try to support you as much as I can, unfortunately I don't have
> any 32-bit RISC-V around so I was rather thinking of extending the
> RISC-V XIP support to 64-bit non-MMU targets.
> For now just please keep in mind that there might be some inherent
> assumptions that a target is 64 bit.
I don't test any of the XIP configs, but if you guys have something that's sane
to run in QEMU I'm happy to do so. Given that there's now some folks finding
boot bugs it's probably worth getting what does boot into a regression test so
it's less likely to break moving forwards.
These are on fixes, with the second one split up so it's got a better chance of
landing in the stable trees.
Thanks!
>
> Best regards,
> Vitaly
>
>>
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <20211126221034.21331-1-lukasz.bartosik@semihalf.com--annotate>]
* Re:
[not found] <20211126221034.21331-1-lukasz.bartosik@semihalf.com--annotate>
@ 2021-11-29 21:59 ` sean.wang
0 siblings, 0 replies; 417+ messages in thread
From: sean.wang @ 2021-11-29 21:59 UTC (permalink / raw)
To: lb
Cc: marcel, johan.hedberg, luiz.dentz, upstream, linux-bluetooth,
linux-mediatek, linux-kernel, Sean Wang
From: Sean Wang <sean.wang@mediatek.com>
>Enable msft opcode for btmtksdio driver.
>
>Signed-off-by: Łukasz Bartosik <lb@semihalf.com>
>---
> drivers/bluetooth/btmtksdio.c | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index d9cf0c492e29..2a7a615663b9 100644
>--- a/drivers/bluetooth/btmtksdio.c
>+++ b/drivers/bluetooth/btmtksdio.c
>@@ -887,6 +887,7 @@ static int btmtksdio_setup(struct hci_dev *hdev)
> if (enable_autosuspend)
> pm_runtime_allow(bdev->dev);
>
>+ hci_set_msft_opcode(hdev, 0xFD30);
Hi Łukasz,
msft feature is supposed only supported on mt7921. Could you help rework the patch to enalbe msft opocde only for mt7921?
Sean
> bt_dev_info(hdev, "Device setup in %llu usecs", duration);
>
> return 0;
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH v5 00/11] Add support for X86/ACPI camera sensor/PMIC setup with clk and regulator platform data
@ 2021-11-02 9:48 Hans de Goede
2021-11-02 9:49 ` [PATCH v5 05/11] clk: Introduce clk-tps68470 driver Hans de Goede
0 siblings, 1 reply; 417+ messages in thread
From: Hans de Goede @ 2021-11-02 9:48 UTC (permalink / raw)
To: Rafael J . Wysocki, Mark Gross, Andy Shevchenko, Wolfram Sang,
Mika Westerberg, Daniel Scally, Laurent Pinchart,
Mauro Carvalho Chehab, Liam Girdwood, Mark Brown,
Michael Turquette, Stephen Boyd
Cc: Hans de Goede, Len Brown, linux-acpi, platform-driver-x86,
linux-kernel, linux-i2c, Sakari Ailus, Kate Hsuan, linux-media,
linux-clk
Here is v5 of my patch-set adding support for camera sensor connected to a
TPS68470 PMIC on x86/ACPI devices.
Changes in v5:
- Update regulator_init_data in patch 10/11 to include the VCM regulator
- Address various small review remarks from Andy
- Make a couple of functions / vars static in the clk + regulator drivers
Reported-by: kernel test robot <lkp@intel.com>
Changes in v4:
[PATCH 01/11] ACPI: delay enumeration of devices with a _DEP
pointing to an INT3472 device:
- Move the acpi_dev_ready_for_enumeration() check to acpi_bus_attach()
(replacing the acpi_device_is_present() check there)
[PATCH 04/11] regulator: Introduce tps68470-regulator driver:
- Make the top comment block use c++ style comments
- Drop the bogus builtin regulator_init_data
- Make the driver enable the PMIC clk when enabling the Core buck
regulator, this switching regulator needs the PLL to be on
- Kconfig: add || COMPILE_TEST, fix help text
[PATCH 05/11] clk: Introduce clk-tps68470 driver
- Kconfig: select REGMAP_I2C, add || COMPILE_TEST, fix help text
- tps68470_clk_prepare(): Wait for the PLL to lock before returning
- tps68470_clk_unprepare(): Remove unnecesary clearing of divider regs
- tps68470_clk_probe(): Use devm_clk_hw_register()
- Misc. small cleanups
I'm quite happy with how this works now, so from my pov this is the final
version of the device-instantiation deferral code / approach.
###
The clk and regulator frameworks expect clk/regulator consumer-devices
to have info about the consumed clks/regulators described in the device's
fw_node, but on ACPI this info is missing.
This series worksaround this by providing platform_data with the info to
the TPS68470 clk/regulator MFD cells.
Patches 1 - 2 deal with a probe-ordering problem this introduces,
since the lookups are only registered when the provider-driver binds,
trying to get these clks/regulators before then results in a -ENOENT
error for clks and a dummy regulator for regulators. See the patches
for more details.
Patch 3 adds a header file which adds tps68470_clk_platform_data and
tps68470_regulator_platform_data structs. The futher patches depend on
this new header file.
Patch 4 + 5 add the TPS68470 clk and regulator drivers
Patches 6 - 11 Modify the INT3472 driver which instantiates the MFD cells to
provide the necessary platform-data.
Assuming this series is acceptable to everyone, we need to talk about how
to merge this.
Patch 2 has already been acked by Wolfram for merging by Rafael, so patch
1 + 2 can be merged into linux-pm, independent of the rest of the series
(there are some runtime deps on other changes for everything to work,
but the camera-sensors impacted by this are not fully supported yet in
the mainline kernel anyways).
For "[PATCH 03/13] platform_data: Add linux/platform_data/tps68470.h file",
which all further patches depend on I plan to provide an immutable branch
myself (once it has been reviewed), which the clk / regulator maintainers
can then merge before merging the clk / regulator driver which depends on
this.
And I will merge that IM-branch + patches 6-11 into the pdx86 tree myself.
Regards,
Hans
Daniel Scally (1):
platform/x86: int3472: Enable I2c daisy chain
Hans de Goede (10):
ACPI: delay enumeration of devices with a _DEP pointing to an INT3472
device
i2c: acpi: Use acpi_dev_ready_for_enumeration() helper
platform_data: Add linux/platform_data/tps68470.h file
regulator: Introduce tps68470-regulator driver
clk: Introduce clk-tps68470 driver
platform/x86: int3472: Split into 2 drivers
platform/x86: int3472: Add get_sensor_adev_and_name() helper
platform/x86: int3472: Pass tps68470_clk_platform_data to the
tps68470-regulator MFD-cell
platform/x86: int3472: Pass tps68470_regulator_platform_data to the
tps68470-regulator MFD-cell
platform/x86: int3472: Deal with probe ordering issues
drivers/acpi/scan.c | 37 ++-
drivers/clk/Kconfig | 8 +
drivers/clk/Makefile | 1 +
drivers/clk/clk-tps68470.c | 257 ++++++++++++++++++
drivers/i2c/i2c-core-acpi.c | 5 +-
drivers/platform/x86/intel/int3472/Makefile | 9 +-
...lk_and_regulator.c => clk_and_regulator.c} | 2 +-
drivers/platform/x86/intel/int3472/common.c | 82 ++++++
.../{intel_skl_int3472_common.h => common.h} | 6 +-
...ntel_skl_int3472_discrete.c => discrete.c} | 51 ++--
.../intel/int3472/intel_skl_int3472_common.c | 106 --------
...ntel_skl_int3472_tps68470.c => tps68470.c} | 99 ++++++-
drivers/platform/x86/intel/int3472/tps68470.h | 25 ++
.../x86/intel/int3472/tps68470_board_data.c | 134 +++++++++
drivers/regulator/Kconfig | 9 +
drivers/regulator/Makefile | 1 +
drivers/regulator/tps68470-regulator.c | 212 +++++++++++++++
include/acpi/acpi_bus.h | 5 +-
include/linux/mfd/tps68470.h | 11 +
include/linux/platform_data/tps68470.h | 35 +++
20 files changed, 944 insertions(+), 151 deletions(-)
create mode 100644 drivers/clk/clk-tps68470.c
rename drivers/platform/x86/intel/int3472/{intel_skl_int3472_clk_and_regulator.c => clk_and_regulator.c} (99%)
create mode 100644 drivers/platform/x86/intel/int3472/common.c
rename drivers/platform/x86/intel/int3472/{intel_skl_int3472_common.h => common.h} (94%)
rename drivers/platform/x86/intel/int3472/{intel_skl_int3472_discrete.c => discrete.c} (91%)
delete mode 100644 drivers/platform/x86/intel/int3472/intel_skl_int3472_common.c
rename drivers/platform/x86/intel/int3472/{intel_skl_int3472_tps68470.c => tps68470.c} (54%)
create mode 100644 drivers/platform/x86/intel/int3472/tps68470.h
create mode 100644 drivers/platform/x86/intel/int3472/tps68470_board_data.c
create mode 100644 drivers/regulator/tps68470-regulator.c
create mode 100644 include/linux/platform_data/tps68470.h
--
2.31.1
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH v5 05/11] clk: Introduce clk-tps68470 driver
2021-11-02 9:48 [PATCH v5 00/11] Add support for X86/ACPI camera sensor/PMIC setup with clk and regulator platform data Hans de Goede
@ 2021-11-02 9:49 ` Hans de Goede
[not found] ` <163588780885.2993099.2088131017920983969@swboyd.mtv.corp.google.com>
0 siblings, 1 reply; 417+ messages in thread
From: Hans de Goede @ 2021-11-02 9:49 UTC (permalink / raw)
To: Rafael J . Wysocki, Mark Gross, Andy Shevchenko, Wolfram Sang,
Mika Westerberg, Daniel Scally, Laurent Pinchart,
Mauro Carvalho Chehab, Liam Girdwood, Mark Brown,
Michael Turquette, Stephen Boyd
Cc: Hans de Goede, Len Brown, linux-acpi, platform-driver-x86,
linux-kernel, linux-i2c, Sakari Ailus, Kate Hsuan, linux-media,
linux-clk
The TPS68470 PMIC provides Clocks, GPIOs and Regulators. At present in
the kernel the Regulators and Clocks are controlled by an OpRegion
driver designed to work with power control methods defined in ACPI, but
some platforms lack those methods, meaning drivers need to be able to
consume the resources of these chips through the usual frameworks.
This commit adds a driver for the clocks provided by the tps68470,
and is designed to bind to the platform_device registered by the
intel_skl_int3472 module.
This is based on this out of tree driver written by Intel:
https://github.com/intel/linux-intel-lts/blob/4.14/base/drivers/clk/clk-tps68470.c
with various cleanups added.
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
Changes in v5:
- Small comment cleanups based on review from Andy
Changes in v4:
- Kconfig: select REGMAP_I2C, add || COMPILE_TEST, fix help text
- tps68470_clk_prepare(): Wait for the PLL to lock before returning
- tps68470_clk_unprepare(): Remove unnecesary clearing of divider regs
- tps68470_clk_probe(): Use devm_clk_hw_register()
- Misc. small cleanups
Changes in v2:
- Update the comment on why a subsys_initcall is used to register the drv
- Fix trailing whitespice on line 100
---
drivers/clk/Kconfig | 8 ++
drivers/clk/Makefile | 1 +
drivers/clk/clk-tps68470.c | 257 +++++++++++++++++++++++++++++++++++
include/linux/mfd/tps68470.h | 11 ++
4 files changed, 277 insertions(+)
create mode 100644 drivers/clk/clk-tps68470.c
diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
index c5b3dc97396a..4e9098d79249 100644
--- a/drivers/clk/Kconfig
+++ b/drivers/clk/Kconfig
@@ -169,6 +169,14 @@ config COMMON_CLK_CDCE706
help
This driver supports TI CDCE706 programmable 3-PLL clock synthesizer.
+config COMMON_CLK_TPS68470
+ tristate "Clock Driver for TI TPS68470 PMIC"
+ depends on I2C
+ depends on INTEL_SKL_INT3472 || COMPILE_TEST
+ select REGMAP_I2C
+ help
+ This driver supports the clocks provided by the TPS68470 PMIC.
+
config COMMON_CLK_CDCE925
tristate "Clock driver for TI CDCE913/925/937/949 devices"
depends on I2C
diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index e42312121e51..6b6a88ae1425 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_COMMON_CLK_SI570) += clk-si570.o
obj-$(CONFIG_COMMON_CLK_STM32F) += clk-stm32f4.o
obj-$(CONFIG_COMMON_CLK_STM32H7) += clk-stm32h7.o
obj-$(CONFIG_COMMON_CLK_STM32MP157) += clk-stm32mp1.o
+obj-$(CONFIG_COMMON_CLK_TPS68470) += clk-tps68470.o
obj-$(CONFIG_CLK_TWL6040) += clk-twl6040.o
obj-$(CONFIG_ARCH_VT8500) += clk-vt8500.o
obj-$(CONFIG_COMMON_CLK_VC5) += clk-versaclock5.o
diff --git a/drivers/clk/clk-tps68470.c b/drivers/clk/clk-tps68470.c
new file mode 100644
index 000000000000..2ad0ac2f4096
--- /dev/null
+++ b/drivers/clk/clk-tps68470.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clock driver for TPS68470 PMIC
+ *
+ * Copyright (c) 2021 Red Hat Inc.
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Hans de Goede <hdegoede@redhat.com>
+ * Zaikuo Wang <zaikuo.wang@intel.com>
+ * Tianshu Qiu <tian.shu.qiu@intel.com>
+ * Jian Xu Zheng <jian.xu.zheng@intel.com>
+ * Yuning Pu <yuning.pu@intel.com>
+ * Antti Laakso <antti.laakso@intel.com>
+ */
+
+#include <linux/clk-provider.h>
+#include <linux/clkdev.h>
+#include <linux/kernel.h>
+#include <linux/mfd/tps68470.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/tps68470.h>
+#include <linux/regmap.h>
+
+#define TPS68470_CLK_NAME "tps68470-clk"
+
+#define to_tps68470_clkdata(clkd) \
+ container_of(clkd, struct tps68470_clkdata, clkout_hw)
+
+static struct tps68470_clkout_freqs {
+ unsigned long freq;
+ unsigned int xtaldiv;
+ unsigned int plldiv;
+ unsigned int postdiv;
+ unsigned int buckdiv;
+ unsigned int boostdiv;
+} clk_freqs[] = {
+/*
+ * The PLL is used to multiply the crystal oscillator
+ * frequency range of 3 MHz to 27 MHz by a programmable
+ * factor of F = (M/N)*(1/P) such that the output
+ * available at the HCLK_A or HCLK_B pins are in the range
+ * of 4 MHz to 64 MHz in increments of 0.1 MHz.
+ *
+ * hclk_# = osc_in * (((plldiv*2)+320) / (xtaldiv+30)) * (1 / 2^postdiv)
+ *
+ * PLL_REF_CLK should be as close as possible to 100kHz
+ * PLL_REF_CLK = input clk / XTALDIV[7:0] + 30)
+ *
+ * PLL_VCO_CLK = (PLL_REF_CLK * (plldiv*2 + 320))
+ *
+ * BOOST should be as close as possible to 2Mhz
+ * BOOST = PLL_VCO_CLK / (BOOSTDIV[4:0] + 16) *
+ *
+ * BUCK should be as close as possible to 5.2Mhz
+ * BUCK = PLL_VCO_CLK / (BUCKDIV[3:0] + 5)
+ *
+ * osc_in xtaldiv plldiv postdiv hclk_#
+ * 20Mhz 170 32 1 19.2Mhz
+ * 20Mhz 170 40 1 20Mhz
+ * 20Mhz 170 80 1 24Mhz
+ */
+ { 19200000, 170, 32, 1, 2, 3 },
+ { 20000000, 170, 40, 1, 3, 4 },
+ { 24000000, 170, 80, 1, 4, 8 },
+};
+
+struct tps68470_clkdata {
+ struct clk_hw clkout_hw;
+ struct regmap *regmap;
+ unsigned int clk_cfg_idx;
+};
+
+static int tps68470_clk_is_prepared(struct clk_hw *hw)
+{
+ struct tps68470_clkdata *clkdata = to_tps68470_clkdata(hw);
+ int val;
+
+ if (regmap_read(clkdata->regmap, TPS68470_REG_PLLCTL, &val))
+ return 0;
+
+ return val & TPS68470_PLL_EN_MASK;
+}
+
+static int tps68470_clk_prepare(struct clk_hw *hw)
+{
+ struct tps68470_clkdata *clkdata = to_tps68470_clkdata(hw);
+ unsigned int idx = clkdata->clk_cfg_idx;
+
+ regmap_write(clkdata->regmap, TPS68470_REG_BOOSTDIV, clk_freqs[idx].boostdiv);
+ regmap_write(clkdata->regmap, TPS68470_REG_BUCKDIV, clk_freqs[idx].buckdiv);
+ regmap_write(clkdata->regmap, TPS68470_REG_PLLSWR, TPS68470_PLLSWR_DEFAULT);
+ regmap_write(clkdata->regmap, TPS68470_REG_XTALDIV, clk_freqs[idx].xtaldiv);
+ regmap_write(clkdata->regmap, TPS68470_REG_PLLDIV, clk_freqs[idx].plldiv);
+ regmap_write(clkdata->regmap, TPS68470_REG_POSTDIV, clk_freqs[idx].postdiv);
+ regmap_write(clkdata->regmap, TPS68470_REG_POSTDIV2, clk_freqs[idx].postdiv);
+ regmap_write(clkdata->regmap, TPS68470_REG_CLKCFG2, TPS68470_CLKCFG2_DRV_STR_2MA);
+
+ regmap_write(clkdata->regmap, TPS68470_REG_PLLCTL,
+ TPS68470_OSC_EXT_CAP_DEFAULT << TPS68470_OSC_EXT_CAP_SHIFT |
+ TPS68470_CLK_SRC_XTAL << TPS68470_CLK_SRC_SHIFT);
+
+ regmap_write(clkdata->regmap, TPS68470_REG_CLKCFG1,
+ (TPS68470_PLL_OUTPUT_ENABLE << TPS68470_OUTPUT_A_SHIFT) |
+ (TPS68470_PLL_OUTPUT_ENABLE << TPS68470_OUTPUT_B_SHIFT));
+
+ regmap_update_bits(clkdata->regmap, TPS68470_REG_PLLCTL,
+ TPS68470_PLL_EN_MASK, TPS68470_PLL_EN_MASK);
+
+ /*
+ * The PLLCTL reg lock bit is set by the PMIC after approx. 4ms and
+ * does not indicate a true lock, so just wait 4 ms.
+ */
+ usleep_range(4000, 5000);
+
+ return 0;
+}
+
+static void tps68470_clk_unprepare(struct clk_hw *hw)
+{
+ struct tps68470_clkdata *clkdata = to_tps68470_clkdata(hw);
+
+ /* Disable clock first ... */
+ regmap_update_bits(clkdata->regmap, TPS68470_REG_PLLCTL, TPS68470_PLL_EN_MASK, 0);
+
+ /* ... and then tri-state the clock outputs. */
+ regmap_write(clkdata->regmap, TPS68470_REG_CLKCFG1, 0);
+}
+
+static unsigned long tps68470_clk_recalc_rate(struct clk_hw *hw, unsigned long parent_rate)
+{
+ struct tps68470_clkdata *clkdata = to_tps68470_clkdata(hw);
+
+ return clk_freqs[clkdata->clk_cfg_idx].freq;
+}
+
+/*
+ * This returns the index of the clk_freqs[] cfg with the closest rate for
+ * use in tps68470_clk_round_rate(). tps68470_clk_set_rate() checks that
+ * the rate of the returned cfg is an exact match.
+ */
+static unsigned int tps68470_clk_cfg_lookup(unsigned long rate)
+{
+ long diff, best_diff = LONG_MAX;
+ unsigned int i, best_idx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(clk_freqs); i++) {
+ diff = clk_freqs[i].freq - rate;
+ if (diff == 0)
+ return i;
+
+ diff = abs(diff);
+ if (diff < best_diff) {
+ best_diff = diff;
+ best_idx = i;
+ }
+ }
+
+ return best_idx;
+}
+
+static long tps68470_clk_round_rate(struct clk_hw *hw, unsigned long rate,
+ unsigned long *parent_rate)
+{
+ unsigned int idx = tps68470_clk_cfg_lookup(rate);
+
+ return clk_freqs[idx].freq;
+}
+
+static int tps68470_clk_set_rate(struct clk_hw *hw, unsigned long rate,
+ unsigned long parent_rate)
+{
+ struct tps68470_clkdata *clkdata = to_tps68470_clkdata(hw);
+ unsigned int idx = tps68470_clk_cfg_lookup(rate);
+
+ if (rate != clk_freqs[idx].freq)
+ return -EINVAL;
+
+ clkdata->clk_cfg_idx = idx;
+
+ return 0;
+}
+
+static const struct clk_ops tps68470_clk_ops = {
+ .is_prepared = tps68470_clk_is_prepared,
+ .prepare = tps68470_clk_prepare,
+ .unprepare = tps68470_clk_unprepare,
+ .recalc_rate = tps68470_clk_recalc_rate,
+ .round_rate = tps68470_clk_round_rate,
+ .set_rate = tps68470_clk_set_rate,
+};
+
+static const struct clk_init_data tps68470_clk_initdata = {
+ .name = TPS68470_CLK_NAME,
+ .ops = &tps68470_clk_ops,
+};
+
+static int tps68470_clk_probe(struct platform_device *pdev)
+{
+ struct tps68470_clk_platform_data *pdata = pdev->dev.platform_data;
+ struct tps68470_clkdata *tps68470_clkdata;
+ int ret;
+
+ tps68470_clkdata = devm_kzalloc(&pdev->dev, sizeof(*tps68470_clkdata),
+ GFP_KERNEL);
+ if (!tps68470_clkdata)
+ return -ENOMEM;
+
+ tps68470_clkdata->regmap = dev_get_drvdata(pdev->dev.parent);
+ tps68470_clkdata->clkout_hw.init = &tps68470_clk_initdata;
+ ret = devm_clk_hw_register(&pdev->dev, &tps68470_clkdata->clkout_hw);
+ if (ret)
+ return ret;
+
+ ret = devm_clk_hw_register_clkdev(&pdev->dev, &tps68470_clkdata->clkout_hw,
+ TPS68470_CLK_NAME, NULL);
+ if (ret)
+ return ret;
+
+ if (pdata) {
+ ret = devm_clk_hw_register_clkdev(&pdev->dev,
+ &tps68470_clkdata->clkout_hw,
+ pdata->consumer_con_id,
+ pdata->consumer_dev_name);
+ }
+
+ return ret;
+}
+
+static struct platform_driver tps68470_clk_driver = {
+ .driver = {
+ .name = TPS68470_CLK_NAME,
+ },
+ .probe = tps68470_clk_probe,
+};
+
+/*
+ * The ACPI tps68470 probe-ordering depends on the clk/gpio/regulator drivers
+ * registering before the drivers for the camera-sensors which use them bind.
+ * subsys_initcall() ensures this when the drivers are builtin.
+ */
+static int __init tps68470_clk_init(void)
+{
+ return platform_driver_register(&tps68470_clk_driver);
+}
+subsys_initcall(tps68470_clk_init);
+
+static void __exit tps68470_clk_exit(void)
+{
+ platform_driver_unregister(&tps68470_clk_driver);
+}
+module_exit(tps68470_clk_exit);
+
+MODULE_ALIAS("platform:tps68470-clk");
+MODULE_DESCRIPTION("clock driver for TPS68470 pmic");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/tps68470.h b/include/linux/mfd/tps68470.h
index ffe81127d91c..7807fa329db0 100644
--- a/include/linux/mfd/tps68470.h
+++ b/include/linux/mfd/tps68470.h
@@ -75,6 +75,17 @@
#define TPS68470_CLKCFG1_MODE_A_MASK GENMASK(1, 0)
#define TPS68470_CLKCFG1_MODE_B_MASK GENMASK(3, 2)
+#define TPS68470_CLKCFG2_DRV_STR_2MA 0x05
+#define TPS68470_PLL_OUTPUT_ENABLE 0x02
+#define TPS68470_CLK_SRC_XTAL BIT(0)
+#define TPS68470_PLLSWR_DEFAULT GENMASK(1, 0)
+#define TPS68470_OSC_EXT_CAP_DEFAULT 0x05
+
+#define TPS68470_OUTPUT_A_SHIFT 0x00
+#define TPS68470_OUTPUT_B_SHIFT 0x02
+#define TPS68470_CLK_SRC_SHIFT GENMASK(2, 0)
+#define TPS68470_OSC_EXT_CAP_SHIFT BIT(2)
+
#define TPS68470_GPIO_CTL_REG_A(x) (TPS68470_REG_GPCTL0A + (x) * 2)
#define TPS68470_GPIO_CTL_REG_B(x) (TPS68470_REG_GPCTL0B + (x) * 2)
#define TPS68470_GPIO_MODE_MASK GENMASK(1, 0)
--
2.31.1
^ permalink raw reply related [flat|nested] 417+ messages in thread
[parent not found: <CAP7CzPcLhtXDyLudfmR2pWR5fzSQ_jhJSoRheH=cytoDnb_ujg@mail.gmail.com>]
* Re:
[not found] <CAP7CzPcLhtXDyLudfmR2pWR5fzSQ_jhJSoRheH=cytoDnb_ujg@mail.gmail.com>
@ 2021-09-14 15:37 ` Nick Desaulniers
0 siblings, 0 replies; 417+ messages in thread
From: Nick Desaulniers @ 2021-09-14 15:37 UTC (permalink / raw)
To: zhao xc
Cc: tglx, mingo, bp, x86, hpa, nathan, tony.luck, linux, mpe,
dan.j.williams, linux-kernel, clang-built-linux
On Sun, Sep 12, 2021 at 10:42 PM zhao xc <xinchao.zhao.kernelz@gmail.com> wrote:
>
> Hi maintainer:
> This is a patch fix the unused macro definition
Hi Zhao,
Thanks for the patch. Would you mind following the standard procedure
for submitting patches to the list for review. I wrote up
https://nickdesaulniers.github.io/blog/2017/05/16/submitting-your-first-patch-to-the-linux-kernel-and-responding-to-feedback/
a while ago, but I think it's still helpful.
--
Thanks,
~Nick Desaulniers
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2021-08-12 9:21 Valdis Klētnieks
2021-08-12 9:42 ` SeongJae Park
0 siblings, 1 reply; 417+ messages in thread
From: Valdis Klētnieks @ 2021-08-12 9:21 UTC (permalink / raw)
To: SeongJae Park, Andrew Morton; +Cc: linux-mm, linux-kernel
[-- Attachment #1: Type: text/plain, Size: 832 bytes --]
In this commit:
commit fedc37448fb1be5d03e420ca7791d4286893d5ec
Author: SeongJae Park <sjpark@amazon.de>
Date: Tue Aug 10 16:55:51 2021 +1000
mm/idle_page_tracking: make PG_idle reusable
diff --git a/mm/Kconfig b/mm/Kconfig
index 504336de9a1e..d0b85dc12429 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -739,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT
lifetime of the system until these kthreads finish the
initialisation.
+config PAGE_IDLE_FLAG
+ bool "Add PG_idle and PG_young flags"
+ help
+ This feature adds PG_idle and PG_young flags in 'struct page'. PTE
+ Accessed bit writers can set the state of the bit in the flags to let
+ other PTE Accessed bit readers don't disturbed.
This needs to be converted to proper, or at least comprehensible, English....
[-- Attachment #2: Type: application/pgp-signature, Size: 494 bytes --]
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2021-08-12 9:21 Valdis Klētnieks
@ 2021-08-12 9:42 ` SeongJae Park
2021-08-12 20:19 ` Re: Andrew Morton
0 siblings, 1 reply; 417+ messages in thread
From: SeongJae Park @ 2021-08-12 9:42 UTC (permalink / raw)
To: Valdis Klētnieks
Cc: SeongJae Park, Andrew Morton, linux-mm, linux-kernel
From: SeongJae Park <sjpark@amazon.de>
Hello Valdis,
On Thu, 12 Aug 2021 05:21:57 -0400 "Valdis =?utf-8?Q?Kl=c4=93tnieks?=" <valdis.kletnieks@vt.edu> wrote:
> In this commit:
>
> commit fedc37448fb1be5d03e420ca7791d4286893d5ec
> Author: SeongJae Park <sjpark@amazon.de>
> Date: Tue Aug 10 16:55:51 2021 +1000
>
> mm/idle_page_tracking: make PG_idle reusable
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 504336de9a1e..d0b85dc12429 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -739,10 +739,18 @@ config DEFERRED_STRUCT_PAGE_INIT
> lifetime of the system until these kthreads finish the
> initialisation.
>
> +config PAGE_IDLE_FLAG
> + bool "Add PG_idle and PG_young flags"
> + help
> + This feature adds PG_idle and PG_young flags in 'struct page'. PTE
> + Accessed bit writers can set the state of the bit in the flags to let
> + other PTE Accessed bit readers don't disturbed.
>
> This needs to be converted to proper, or at least comprehensible, English....
Thank you for the comment.
How about below?
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
bool "Add PG_idle and PG_young flags"
select PAGE_EXTENSION if !64BIT
help
- This feature adds PG_idle and PG_young flags in 'struct page'. PTE
- Accessed bit writers can set the state of the bit in the flags to let
- other PTE Accessed bit readers don't disturbed.
+ This feature adds 'PG_idle' and 'PG_young' flags in 'struct page'.
+ PTE Accessed bit writers can save the state of the bit in the flags
+ to let other PTE Accessed bit readers don't get disturbed.
Thanks,
SeongJae Park
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2021-08-12 9:42 ` SeongJae Park
@ 2021-08-12 20:19 ` Andrew Morton
2021-08-13 8:14 ` Re: SeongJae Park
0 siblings, 1 reply; 417+ messages in thread
From: Andrew Morton @ 2021-08-12 20:19 UTC (permalink / raw)
To: SeongJae Park
Cc: Valdis Klētnieks , SeongJae Park, linux-mm, linux-kernel
On Thu, 12 Aug 2021 09:42:40 +0000 SeongJae Park <sj38.park@gmail.com> wrote:
> > +config PAGE_IDLE_FLAG
> > + bool "Add PG_idle and PG_young flags"
> > + help
> > + This feature adds PG_idle and PG_young flags in 'struct page'. PTE
> > + Accessed bit writers can set the state of the bit in the flags to let
> > + other PTE Accessed bit readers don't disturbed.
> >
> > This needs to be converted to proper, or at least comprehensible, English....
>
> Thank you for the comment.
>
> How about below?
>
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
> bool "Add PG_idle and PG_young flags"
> select PAGE_EXTENSION if !64BIT
> help
> - This feature adds PG_idle and PG_young flags in 'struct page'. PTE
> - Accessed bit writers can set the state of the bit in the flags to let
> - other PTE Accessed bit readers don't disturbed.
> + This feature adds 'PG_idle' and 'PG_young' flags in 'struct page'.
> + PTE Accessed bit writers can save the state of the bit in the flags
> + to let other PTE Accessed bit readers don't get disturbed.
How about this?
--- a/mm/Kconfig~mm-idle_page_tracking-make-pg_idle-reusable-fix-fix
+++ a/mm/Kconfig
@@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
bool "Add PG_idle and PG_young flags"
select PAGE_EXTENSION if !64BIT
help
- This feature adds PG_idle and PG_young flags in 'struct page'. PTE
- Accessed bit writers can set the state of the bit in the flags to let
- other PTE Accessed bit readers don't disturbed.
+ This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed
+ bit writers can set the state of the bit in the flags so that PTE
+ Accessed bit readers may avoid disturbance.
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
Also, is there any way in which we can avoid presenting this option to
the user? Because most users will have real trouble understanding what
this thing is for. Can we simply select it when needed, as dictated by
other, higher-level config options?
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2021-08-12 20:19 ` Re: Andrew Morton
@ 2021-08-13 8:14 ` SeongJae Park
0 siblings, 0 replies; 417+ messages in thread
From: SeongJae Park @ 2021-08-13 8:14 UTC (permalink / raw)
To: Andrew Morton
Cc: SeongJae Park, Valdis Klētnieks ,
SeongJae Park, linux-mm, linux-kernel
From: SeongJae Park <sjpark@amazon.de>
On Thu, 12 Aug 2021 13:19:21 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:
> On Thu, 12 Aug 2021 09:42:40 +0000 SeongJae Park <sj38.park@gmail.com> wrote:
>
> > > +config PAGE_IDLE_FLAG
> > > + bool "Add PG_idle and PG_young flags"
> > > + help
> > > + This feature adds PG_idle and PG_young flags in 'struct page'. PTE
> > > + Accessed bit writers can set the state of the bit in the flags to let
> > > + other PTE Accessed bit readers don't disturbed.
> > >
> > > This needs to be converted to proper, or at least comprehensible, English....
> >
> > Thank you for the comment.
> >
> > How about below?
> >
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
> > bool "Add PG_idle and PG_young flags"
> > select PAGE_EXTENSION if !64BIT
> > help
> > - This feature adds PG_idle and PG_young flags in 'struct page'. PTE
> > - Accessed bit writers can set the state of the bit in the flags to let
> > - other PTE Accessed bit readers don't disturbed.
> > + This feature adds 'PG_idle' and 'PG_young' flags in 'struct page'.
> > + PTE Accessed bit writers can save the state of the bit in the flags
> > + to let other PTE Accessed bit readers don't get disturbed.
>
> How about this?
>
> --- a/mm/Kconfig~mm-idle_page_tracking-make-pg_idle-reusable-fix-fix
> +++ a/mm/Kconfig
> @@ -743,9 +743,9 @@ config PAGE_IDLE_FLAG
> bool "Add PG_idle and PG_young flags"
> select PAGE_EXTENSION if !64BIT
> help
> - This feature adds PG_idle and PG_young flags in 'struct page'. PTE
> - Accessed bit writers can set the state of the bit in the flags to let
> - other PTE Accessed bit readers don't disturbed.
> + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed
> + bit writers can set the state of the bit in the flags so that PTE
> + Accessed bit readers may avoid disturbance.
>
> config IDLE_PAGE_TRACKING
> bool "Enable idle page tracking"
So good, thank you!
>
> Also, is there any way in which we can avoid presenting this option to
> the user? Because most users will have real trouble understanding what
> this thing is for. Can we simply select it when needed, as dictated by
> other, higher-level config options?
I believe this is the right way to go! I sent a patch for removing the prompt
of this option:
https://lore.kernel.org/linux-mm/20210813081238.34705-1-sj38.park@gmail.com/
Thanks,
SeongJae Park
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2021-06-06 19:19 Davidlohr Bueso
2021-06-07 16:02 ` André Almeida
0 siblings, 1 reply; 417+ messages in thread
From: Davidlohr Bueso @ 2021-06-06 19:19 UTC (permalink / raw)
To: Andr� Almeida
Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
linux-kernel, Steven Rostedt, Sebastian Andrzej Siewior, kernel,
krisman, pgriffais, z.figura12, joel, malteskarupke, linux-api,
fweimer, libc-alpha, linux-kselftest, shuah, acme, corbet,
Peter Oskolkov, Andrey Semashev, mtk.manpages
Bcc:
Subject: Re: [PATCH v4 07/15] docs: locking: futex2: Add documentation
Reply-To:
In-Reply-To: <20210603195924.361327-8-andrealmeid@collabora.com>
On Thu, 03 Jun 2021, Andr� Almeida wrote:
>Add a new documentation file specifying both userspace API and internal
>implementation details of futex2 syscalls.
I think equally important would be to provide a manpage for each new
syscall you are introducing, and keep mkt in the loop as in the past he
extensively documented and improved futex manpages, and overall has a
lot of experience with dealing with kernel interfaces.
Thanks,
Davidlohr
>
>Signed-off-by: André Almeida <andrealmeid@collabora.com>
>---
> Documentation/locking/futex2.rst | 198 +++++++++++++++++++++++++++++++
> Documentation/locking/index.rst | 1 +
> 2 files changed, 199 insertions(+)
> create mode 100644 Documentation/locking/futex2.rst
>
>diff --git a/Documentation/locking/futex2.rst b/Documentation/locking/futex2.rst
>new file mode 100644
>index 000000000000..2f74d7c97a55
>--- /dev/null
>+++ b/Documentation/locking/futex2.rst
>@@ -0,0 +1,198 @@
>+.. SPDX-License-Identifier: GPL-2.0
>+
>+======
>+futex2
>+======
>+
>+:Author: André Almeida <andrealmeid@collabora.com>
>+
>+futex, or fast user mutex, is a set of syscalls to allow userspace to create
>+performant synchronization mechanisms, such as mutexes, semaphores and
>+conditional variables in userspace. C standard libraries, like glibc, uses it
>+as a means to implement more high level interfaces like pthreads.
>+
>+The interface
>+=============
>+
>+uAPI functions
>+--------------
>+
>+.. kernel-doc:: kernel/futex2.c
>+ :identifiers: sys_futex_wait sys_futex_wake sys_futex_waitv sys_futex_requeue
>+
>+uAPI structures
>+---------------
>+
>+.. kernel-doc:: include/uapi/linux/futex.h
>+
>+The ``flag`` argument
>+---------------------
>+
>+The flag is used to specify the size of the futex word
>+(FUTEX_[8, 16, 32, 64]). It's mandatory to define one, since there's no
>+default size.
>+
>+By default, the timeout uses a monotonic clock, but can be used as a realtime
>+one by using the FUTEX_REALTIME_CLOCK flag.
>+
>+By default, futexes are of the private type, that means that this user address
>+will be accessed by threads that share the same memory region. This allows for
>+some internal optimizations, so they are faster. However, if the address needs
>+to be shared with different processes (like using ``mmap()`` or ``shm()``), they
>+need to be defined as shared and the flag FUTEX_SHARED_FLAG is used to set that.
>+
>+By default, the operation has no NUMA-awareness, meaning that the user can't
>+choose the memory node where the kernel side futex data will be stored. The
>+user can choose the node where it wants to operate by setting the
>+FUTEX_NUMA_FLAG and using the following structure (where X can be 8, 16, 32 or
>+64)::
>+
>+ struct futexX_numa {
>+ __uX value;
>+ __sX hint;
>+ };
>+
>+This structure should be passed at the ``void *uaddr`` of futex functions. The
>+address of the structure will be used to be waited on/waken on, and the
>+``value`` will be compared to ``val`` as usual. The ``hint`` member is used to
>+define which node the futex will use. When waiting, the futex will be
>+registered on a kernel-side table stored on that node; when waking, the futex
>+will be searched for on that given table. That means that there's no redundancy
>+between tables, and the wrong ``hint`` value will lead to undesired behavior.
>+Userspace is responsible for dealing with node migrations issues that may
>+occur. ``hint`` can range from [0, MAX_NUMA_NODES), for specifying a node, or
>+-1, to use the same node the current process is using.
>+
>+When not using FUTEX_NUMA_FLAG on a NUMA system, the futex will be stored on a
>+global table on allocated on the first node.
>+
>+The ``timo`` argument
>+---------------------
>+
>+As per the Y2038 work done in the kernel, new interfaces shouldn't add timeout
>+options known to be buggy. Given that, ``timo`` should be a 64-bit timeout at
>+all platforms, using an absolute timeout value.
>+
>+Implementation
>+==============
>+
>+The internal implementation follows a similar design to the original futex.
>+Given that we want to replicate the same external behavior of current futex,
>+this should be somewhat expected.
>+
>+Waiting
>+-------
>+
>+For the wait operations, they are all treated as if you want to wait on N
>+futexes, so the path for futex_wait and futex_waitv is the basically the same.
>+For both syscalls, the first step is to prepare an internal list for the list
>+of futexes to wait for (using struct futexv_head). For futex_wait() calls, this
>+list will have a single object.
>+
>+We have a hash table, where waiters register themselves before sleeping. Then
>+the wake function checks this table looking for waiters at uaddr. The hash
>+bucket to be used is determined by a struct futex_key, that stores information
>+to uniquely identify an address from a given process. Given the huge address
>+space, there'll be hash collisions, so we store information to be later used on
>+collision treatment.
>+
>+First, for every futex we want to wait on, we check if (``*uaddr == val``).
>+This check is done holding the bucket lock, so we are correctly serialized with
>+any futex_wake() calls. If any waiter fails the check above, we dequeue all
>+futexes. The check (``*uaddr == val``) can fail for two reasons:
>+
>+- The values are different, and we return -EAGAIN. However, if while
>+ dequeueing we found that some futexes were awakened, we prioritize this
>+ and return success.
>+
>+- When trying to access the user address, we do so with page faults
>+ disabled because we are holding a bucket's spin lock (and can't sleep
>+ while holding a spin lock). If there's an error, it might be a page
>+ fault, or an invalid address. We release the lock, dequeue everyone
>+ (because it's illegal to sleep while there are futexes enqueued, we
>+ could lose wakeups) and try again with page fault enabled. If we
>+ succeed, this means that the address is valid, but we need to do
>+ all the work again. For serialization reasons, we need to have the
>+ spin lock when getting the user value. Additionally, for shared
>+ futexes, we also need to recalculate the hash, since the underlying
>+ mapping mechanisms could have changed when dealing with page fault.
>+ If, even with page fault enabled, we can't access the address, it
>+ means it's an invalid user address, and we return -EFAULT. For this
>+ case, we prioritize the error, even if some futexes were awaken.
>+
>+If the check is OK, they are enqueued on a linked list in our bucket, and
>+proceed to the next one. If all waiters succeed, we put the thread to sleep
>+until a futex_wake() call, timeout expires or we get a signal. After waking up,
>+we dequeue everyone, and check if some futex was awakened. This dequeue is done
>+by iteratively walking at each element of struct futex_head list.
>+
>+All enqueuing/dequeuing operations requires to hold the bucket lock, to avoid
>+racing while modifying the list.
>+
>+Waking
>+------
>+
>+We get the bucket that's storing the waiters at uaddr, and wake the required
>+number of waiters, checking for hash collision.
>+
>+There's an optimization that makes futex_wake() not take the bucket lock if
>+there's no one to be woken on that bucket. It checks an atomic counter that each
>+bucket has, if it says 0, then the syscall exits. In order for this to work, the
>+waiter thread increases it before taking the lock, so the wake thread will
>+correctly see that there's someone waiting and will continue the path to take
>+the bucket lock. To get the correct serialization, the waiter issues a memory
>+barrier after increasing the bucket counter and the waker issues a memory
>+barrier before checking it.
>+
>+Requeuing
>+---------
>+
>+The requeue path first checks for each struct futex_requeue and their flags.
>+Then, it will compare the expected value with the one at uaddr1::uaddr.
>+Following the same serialization explained at Waking_, we increase the atomic
>+counter for the bucket of uaddr2 before taking the lock. We need to have both
>+buckets locks at same time so we don't race with other futex operation. To
>+ensure the locks are taken in the same order for all threads (and thus avoiding
>+deadlocks), every requeue operation takes the "smaller" bucket first, when
>+comparing both addresses.
>+
>+If the compare with user value succeeds, we proceed by waking ``nr_wake``
>+futexes, and then requeuing ``nr_requeue`` from bucket of uaddr1 to the uaddr2.
>+This consists in a simple list deletion/addition and replacing the old futex key
>+with the new one.
>+
>+Futex keys
>+----------
>+
>+There are two types of futexes: private and shared ones. The private are futexes
>+meant to be used by threads that share the same memory space, are easier to be
>+uniquely identified and thus can have some performance optimization. The
>+elements for identifying one are: the start address of the page where the
>+address is, the address offset within the page and the current->mm pointer.
>+
>+Now, for uniquely identifying a shared futex:
>+
>+- If the page containing the user address is an anonymous page, we can
>+ just use the same data used for private futexes (the start address of
>+ the page, the address offset within the page and the current->mm
>+ pointer); that will be enough for uniquely identifying such futex. We
>+ also set one bit at the key to differentiate if a private futex is
>+ used on the same address (mixing shared and private calls does not
>+ work).
>+
>+- If the page is file-backed, current->mm maybe isn't the same one for
>+ every user of this futex, so we need to use other data: the
>+ page->index, a UUID for the struct inode and the offset within the
>+ page.
>+
>+Note that members of futex_key don't have any particular meaning after they
>+are part of the struct - they are just bytes to identify a futex. Given that,
>+we don't need to use a particular name or type that matches the original data,
>+we only need to care about the bitsize of each component and make both private
>+and shared fit in the same memory space.
>+
>+Source code documentation
>+=========================
>+
>+.. kernel-doc:: kernel/futex2.c
>+ :no-identifiers: sys_futex_wait sys_futex_wake sys_futex_waitv sys_futex_requeue
>diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
>index 7003bd5aeff4..9bf03c7fa1ec 100644
>--- a/Documentation/locking/index.rst
>+++ b/Documentation/locking/index.rst
>@@ -24,6 +24,7 @@ locking
> percpu-rw-semaphore
> robust-futexes
> robust-futex-ABI
>+ futex2
>
> .. only:: subproject and html
>
>--
>2.31.1
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2021-06-06 19:19 Davidlohr Bueso
@ 2021-06-07 16:02 ` André Almeida
0 siblings, 0 replies; 417+ messages in thread
From: André Almeida @ 2021-06-07 16:02 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: Thomas Gleixner, Ingo Molnar, Peter Zijlstra, Darren Hart,
linux-kernel, Steven Rostedt, Sebastian Andrzej Siewior, kernel,
krisman, pgriffais, z.figura12, joel, malteskarupke, linux-api,
fweimer, libc-alpha, linux-kselftest, shuah, acme, corbet,
Peter Oskolkov, Andrey Semashev, mtk.manpages
Às 16:19 de 06/06/21, Davidlohr Bueso escreveu:
> Bcc:
> Subject: Re: [PATCH v4 07/15] docs: locking: futex2: Add documentation
> Reply-To:
> In-Reply-To: <20210603195924.361327-8-andrealmeid@collabora.com>
>
> On Thu, 03 Jun 2021, Andr� Almeida wrote:
>
>> Add a new documentation file specifying both userspace API and internal
>> implementation details of futex2 syscalls.
>
> I think equally important would be to provide a manpage for each new
> syscall you are introducing, and keep mkt in the loop as in the past he
> extensively documented and improved futex manpages, and overall has a
> lot of experience with dealing with kernel interfaces.
Right, I'll add the man pages in a future version and make sure to have
mkt in the loop, thanks for the tip.
>
> Thanks,
> Davidlohr
>
>>
>> Signed-off-by: André Almeida <andrealmeid@collabora.com>
>> ---
>> Documentation/locking/futex2.rst | 198 +++++++++++++++++++++++++++++++
>> Documentation/locking/index.rst | 1 +
>> 2 files changed, 199 insertions(+)
>> create mode 100644 Documentation/locking/futex2.rst
>>
>> diff --git a/Documentation/locking/futex2.rst
>> b/Documentation/locking/futex2.rst
>> new file mode 100644
>> index 000000000000..2f74d7c97a55
>> --- /dev/null
>> +++ b/Documentation/locking/futex2.rst
>> @@ -0,0 +1,198 @@
>> +.. SPDX-License-Identifier: GPL-2.0
>> +
>> +======
>> +futex2
>> +======
>> +
>> +:Author: André Almeida <andrealmeid@collabora.com>
>> +
>> +futex, or fast user mutex, is a set of syscalls to allow userspace to
>> create
>> +performant synchronization mechanisms, such as mutexes, semaphores and
>> +conditional variables in userspace. C standard libraries, like glibc,
>> uses it
>> +as a means to implement more high level interfaces like pthreads.
>> +
>> +The interface
>> +=============
>> +
>> +uAPI functions
>> +--------------
>> +
>> +.. kernel-doc:: kernel/futex2.c
>> + :identifiers: sys_futex_wait sys_futex_wake sys_futex_waitv
>> sys_futex_requeue
>> +
>> +uAPI structures
>> +---------------
>> +
>> +.. kernel-doc:: include/uapi/linux/futex.h
>> +
>> +The ``flag`` argument
>> +---------------------
>> +
>> +The flag is used to specify the size of the futex word
>> +(FUTEX_[8, 16, 32, 64]). It's mandatory to define one, since there's no
>> +default size.
>> +
>> +By default, the timeout uses a monotonic clock, but can be used as a
>> realtime
>> +one by using the FUTEX_REALTIME_CLOCK flag.
>> +
>> +By default, futexes are of the private type, that means that this
>> user address
>> +will be accessed by threads that share the same memory region. This
>> allows for
>> +some internal optimizations, so they are faster. However, if the
>> address needs
>> +to be shared with different processes (like using ``mmap()`` or
>> ``shm()``), they
>> +need to be defined as shared and the flag FUTEX_SHARED_FLAG is used
>> to set that.
>> +
>> +By default, the operation has no NUMA-awareness, meaning that the
>> user can't
>> +choose the memory node where the kernel side futex data will be
>> stored. The
>> +user can choose the node where it wants to operate by setting the
>> +FUTEX_NUMA_FLAG and using the following structure (where X can be 8,
>> 16, 32 or
>> +64)::
>> +
>> + struct futexX_numa {
>> + __uX value;
>> + __sX hint;
>> + };
>> +
>> +This structure should be passed at the ``void *uaddr`` of futex
>> functions. The
>> +address of the structure will be used to be waited on/waken on, and the
>> +``value`` will be compared to ``val`` as usual. The ``hint`` member
>> is used to
>> +define which node the futex will use. When waiting, the futex will be
>> +registered on a kernel-side table stored on that node; when waking,
>> the futex
>> +will be searched for on that given table. That means that there's no
>> redundancy
>> +between tables, and the wrong ``hint`` value will lead to undesired
>> behavior.
>> +Userspace is responsible for dealing with node migrations issues that
>> may
>> +occur. ``hint`` can range from [0, MAX_NUMA_NODES), for specifying a
>> node, or
>> +-1, to use the same node the current process is using.
>> +
>> +When not using FUTEX_NUMA_FLAG on a NUMA system, the futex will be
>> stored on a
>> +global table on allocated on the first node.
>> +
>> +The ``timo`` argument
>> +---------------------
>> +
>> +As per the Y2038 work done in the kernel, new interfaces shouldn't
>> add timeout
>> +options known to be buggy. Given that, ``timo`` should be a 64-bit
>> timeout at
>> +all platforms, using an absolute timeout value.
>> +
>> +Implementation
>> +==============
>> +
>> +The internal implementation follows a similar design to the original
>> futex.
>> +Given that we want to replicate the same external behavior of current
>> futex,
>> +this should be somewhat expected.
>> +
>> +Waiting
>> +-------
>> +
>> +For the wait operations, they are all treated as if you want to wait
>> on N
>> +futexes, so the path for futex_wait and futex_waitv is the basically
>> the same.
>> +For both syscalls, the first step is to prepare an internal list for
>> the list
>> +of futexes to wait for (using struct futexv_head). For futex_wait()
>> calls, this
>> +list will have a single object.
>> +
>> +We have a hash table, where waiters register themselves before
>> sleeping. Then
>> +the wake function checks this table looking for waiters at uaddr.
>> The hash
>> +bucket to be used is determined by a struct futex_key, that stores
>> information
>> +to uniquely identify an address from a given process. Given the huge
>> address
>> +space, there'll be hash collisions, so we store information to be
>> later used on
>> +collision treatment.
>> +
>> +First, for every futex we want to wait on, we check if (``*uaddr ==
>> val``).
>> +This check is done holding the bucket lock, so we are correctly
>> serialized with
>> +any futex_wake() calls. If any waiter fails the check above, we
>> dequeue all
>> +futexes. The check (``*uaddr == val``) can fail for two reasons:
>> +
>> +- The values are different, and we return -EAGAIN. However, if while
>> + dequeueing we found that some futexes were awakened, we prioritize
>> this
>> + and return success.
>> +
>> +- When trying to access the user address, we do so with page faults
>> + disabled because we are holding a bucket's spin lock (and can't sleep
>> + while holding a spin lock). If there's an error, it might be a page
>> + fault, or an invalid address. We release the lock, dequeue everyone
>> + (because it's illegal to sleep while there are futexes enqueued, we
>> + could lose wakeups) and try again with page fault enabled. If we
>> + succeed, this means that the address is valid, but we need to do
>> + all the work again. For serialization reasons, we need to have the
>> + spin lock when getting the user value. Additionally, for shared
>> + futexes, we also need to recalculate the hash, since the underlying
>> + mapping mechanisms could have changed when dealing with page fault.
>> + If, even with page fault enabled, we can't access the address, it
>> + means it's an invalid user address, and we return -EFAULT. For this
>> + case, we prioritize the error, even if some futexes were awaken.
>> +
>> +If the check is OK, they are enqueued on a linked list in our bucket,
>> and
>> +proceed to the next one. If all waiters succeed, we put the thread to
>> sleep
>> +until a futex_wake() call, timeout expires or we get a signal. After
>> waking up,
>> +we dequeue everyone, and check if some futex was awakened. This
>> dequeue is done
>> +by iteratively walking at each element of struct futex_head list.
>> +
>> +All enqueuing/dequeuing operations requires to hold the bucket lock,
>> to avoid
>> +racing while modifying the list.
>> +
>> +Waking
>> +------
>> +
>> +We get the bucket that's storing the waiters at uaddr, and wake the
>> required
>> +number of waiters, checking for hash collision.
>> +
>> +There's an optimization that makes futex_wake() not take the bucket
>> lock if
>> +there's no one to be woken on that bucket. It checks an atomic
>> counter that each
>> +bucket has, if it says 0, then the syscall exits. In order for this
>> to work, the
>> +waiter thread increases it before taking the lock, so the wake thread
>> will
>> +correctly see that there's someone waiting and will continue the path
>> to take
>> +the bucket lock. To get the correct serialization, the waiter issues
>> a memory
>> +barrier after increasing the bucket counter and the waker issues a
>> memory
>> +barrier before checking it.
>> +
>> +Requeuing
>> +---------
>> +
>> +The requeue path first checks for each struct futex_requeue and their
>> flags.
>> +Then, it will compare the expected value with the one at uaddr1::uaddr.
>> +Following the same serialization explained at Waking_, we increase
>> the atomic
>> +counter for the bucket of uaddr2 before taking the lock. We need to
>> have both
>> +buckets locks at same time so we don't race with other futex
>> operation. To
>> +ensure the locks are taken in the same order for all threads (and
>> thus avoiding
>> +deadlocks), every requeue operation takes the "smaller" bucket first,
>> when
>> +comparing both addresses.
>> +
>> +If the compare with user value succeeds, we proceed by waking
>> ``nr_wake``
>> +futexes, and then requeuing ``nr_requeue`` from bucket of uaddr1 to
>> the uaddr2.
>> +This consists in a simple list deletion/addition and replacing the
>> old futex key
>> +with the new one.
>> +
>> +Futex keys
>> +----------
>> +
>> +There are two types of futexes: private and shared ones. The private
>> are futexes
>> +meant to be used by threads that share the same memory space, are
>> easier to be
>> +uniquely identified and thus can have some performance optimization. The
>> +elements for identifying one are: the start address of the page where
>> the
>> +address is, the address offset within the page and the current->mm
>> pointer.
>> +
>> +Now, for uniquely identifying a shared futex:
>> +
>> +- If the page containing the user address is an anonymous page, we can
>> + just use the same data used for private futexes (the start address of
>> + the page, the address offset within the page and the current->mm
>> + pointer); that will be enough for uniquely identifying such futex. We
>> + also set one bit at the key to differentiate if a private futex is
>> + used on the same address (mixing shared and private calls does not
>> + work).
>> +
>> +- If the page is file-backed, current->mm maybe isn't the same one for
>> + every user of this futex, so we need to use other data: the
>> + page->index, a UUID for the struct inode and the offset within the
>> + page.
>> +
>> +Note that members of futex_key don't have any particular meaning
>> after they
>> +are part of the struct - they are just bytes to identify a futex.
>> Given that,
>> +we don't need to use a particular name or type that matches the
>> original data,
>> +we only need to care about the bitsize of each component and make
>> both private
>> +and shared fit in the same memory space.
>> +
>> +Source code documentation
>> +=========================
>> +
>> +.. kernel-doc:: kernel/futex2.c
>> + :no-identifiers: sys_futex_wait sys_futex_wake sys_futex_waitv
>> sys_futex_requeue
>> diff --git a/Documentation/locking/index.rst
>> b/Documentation/locking/index.rst
>> index 7003bd5aeff4..9bf03c7fa1ec 100644
>> --- a/Documentation/locking/index.rst
>> +++ b/Documentation/locking/index.rst
>> @@ -24,6 +24,7 @@ locking
>> percpu-rw-semaphore
>> robust-futexes
>> robust-futex-ABI
>> + futex2
>>
>> .. only:: subproject and html
>>
>> --
>> 2.31.1
>>
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2021-04-05 0:01 Mitali Borkar
2021-04-06 7:03 ` Arnd Bergmann
0 siblings, 1 reply; 417+ messages in thread
From: Mitali Borkar @ 2021-04-05 0:01 UTC (permalink / raw)
To: manish, GR-Linux-NIC-Dev, gregkh; +Cc: linux-staging, linux-kernel
outreachy-kernel@googlegroups.com, mitaliborkar810@gmail.com
Bcc:
Subject: [PATCH] staging: qlge:remove else after break
Reply-To:
Fixed Warning:- else is not needed after break
break terminates the loop if encountered. else is unnecessary and
increases indenatation
Signed-off-by: Mitali Borkar <mitaliborkar810@gmail.com>
---
drivers/staging/qlge/qlge_mpi.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/staging/qlge/qlge_mpi.c b/drivers/staging/qlge/qlge_mpi.c
index 2630ebf50341..3a49f187203b 100644
--- a/drivers/staging/qlge/qlge_mpi.c
+++ b/drivers/staging/qlge/qlge_mpi.c
@@ -935,13 +935,11 @@ static int qlge_idc_wait(struct qlge_adapter *qdev)
netif_err(qdev, drv, qdev->ndev, "IDC Success.\n");
status = 0;
break;
- } else {
- netif_err(qdev, drv, qdev->ndev,
+ } netif_err(qdev, drv, qdev->ndev,
"IDC: Invalid State 0x%.04x.\n",
mbcp->mbox_out[0]);
status = -EIO;
break;
- }
}
return status;
--
2.30.2
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2021-04-05 0:01 Mitali Borkar
@ 2021-04-06 7:03 ` Arnd Bergmann
0 siblings, 0 replies; 417+ messages in thread
From: Arnd Bergmann @ 2021-04-06 7:03 UTC (permalink / raw)
To: Mitali Borkar
Cc: manish, GR-Linux-NIC-Dev, gregkh, linux-staging,
Linux Kernel Mailing List
On Mon, Apr 5, 2021 at 2:03 AM Mitali Borkar <mitaliborkar810@gmail.com> wrote:
>
> outreachy-kernel@googlegroups.com, mitaliborkar810@gmail.com
> Bcc:
> Subject: [PATCH] staging: qlge:remove else after break
> Reply-To:
>
> Fixed Warning:- else is not needed after break
> break terminates the loop if encountered. else is unnecessary and
> increases indenatation
>
> Signed-off-by: Mitali Borkar <mitaliborkar810@gmail.com>
> ---
> drivers/staging/qlge/qlge_mpi.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/staging/qlge/qlge_mpi.c b/drivers/staging/qlge/qlge_mpi.c
> index 2630ebf50341..3a49f187203b 100644
> --- a/drivers/staging/qlge/qlge_mpi.c
> +++ b/drivers/staging/qlge/qlge_mpi.c
> @@ -935,13 +935,11 @@ static int qlge_idc_wait(struct qlge_adapter *qdev)
> netif_err(qdev, drv, qdev->ndev, "IDC Success.\n");
> status = 0;
> break;
> - } else {
> - netif_err(qdev, drv, qdev->ndev,
> + } netif_err(qdev, drv, qdev->ndev,
> "IDC: Invalid State 0x%.04x.\n",
> mbcp->mbox_out[0]);
> status = -EIO;
> break;
> - }
> }
It looks like you got this one wrong in multiple ways:
- This is not an equivalent transformation, since the errror is now
printed in the first part of the 'if()' block as well.
- The indentation is wrong now, with the netif_err() starting in the
same line as the '}'.
- The description mentions a change in indentation, but you did not
actually change it.
- The changelog text appears mangled.
Arnd
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <CAPncsNOFoUt7uEDEdihDTZY4pJsuPxt146W-L+Ju53SgZ6ezYw@mail.gmail.com>]
* (no subject)
@ 2021-01-19 0:10 David Howells
2021-01-20 14:46 ` Jarkko Sakkinen
0 siblings, 1 reply; 417+ messages in thread
From: David Howells @ 2021-01-19 0:10 UTC (permalink / raw)
To: torvalds
Cc: Tobias Markus, Tianjia Zhang, dhowells, keyrings, linux-crypto,
linux-security-module, stable, linux-kernel
From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
On the following call path, `sig->pkey_algo` is not assigned
in asymmetric_key_verify_signature(), which causes runtime
crash in public_key_verify_signature().
keyctl_pkey_verify
asymmetric_key_verify_signature
verify_signature
public_key_verify_signature
This patch simply check this situation and fixes the crash
caused by NULL pointer.
Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification")
Reported-by: Tobias Markus <tobias@markus-regensburg.de>
Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-and-tested-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: João Fonseca <jpedrofonseca@ua.pt>
Cc: stable@vger.kernel.org # v5.10+
---
crypto/asymmetric_keys/public_key.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index 8892908ad58c..788a4ba1e2e7 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -356,7 +356,8 @@ int public_key_verify_signature(const struct public_key *pkey,
if (ret)
goto error_free_key;
- if (strcmp(sig->pkey_algo, "sm2") == 0 && sig->data_size) {
+ if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 &&
+ sig->data_size) {
ret = cert_sig_digest_update(sig, tfm);
if (ret)
goto error_free_key;
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2021-01-19 0:10 David Howells
@ 2021-01-20 14:46 ` Jarkko Sakkinen
0 siblings, 0 replies; 417+ messages in thread
From: Jarkko Sakkinen @ 2021-01-20 14:46 UTC (permalink / raw)
To: David Howells
Cc: torvalds, Tobias Markus, Tianjia Zhang, keyrings, linux-crypto,
linux-security-module, stable, linux-kernel
On Tue, Jan 19, 2021 at 12:10:33AM +0000, David Howells wrote:
>
> From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
>
> On the following call path, `sig->pkey_algo` is not assigned
> in asymmetric_key_verify_signature(), which causes runtime
> crash in public_key_verify_signature().
>
> keyctl_pkey_verify
> asymmetric_key_verify_signature
> verify_signature
> public_key_verify_signature
>
> This patch simply check this situation and fixes the crash
> caused by NULL pointer.
>
> Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification")
> Reported-by: Tobias Markus <tobias@markus-regensburg.de>
> Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
> Signed-off-by: David Howells <dhowells@redhat.com>
> Reviewed-and-tested-by: Toke Høiland-Jørgensen <toke@redhat.com>
> Tested-by: João Fonseca <jpedrofonseca@ua.pt>
> Cc: stable@vger.kernel.org # v5.10+
> ---
For what it's worth
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
/Jarkko
>
> crypto/asymmetric_keys/public_key.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
> index 8892908ad58c..788a4ba1e2e7 100644
> --- a/crypto/asymmetric_keys/public_key.c
> +++ b/crypto/asymmetric_keys/public_key.c
> @@ -356,7 +356,8 @@ int public_key_verify_signature(const struct public_key *pkey,
> if (ret)
> goto error_free_key;
>
> - if (strcmp(sig->pkey_algo, "sm2") == 0 && sig->data_size) {
> + if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 &&
> + sig->data_size) {
> ret = cert_sig_digest_update(sig, tfm);
> if (ret)
> goto error_free_key;
>
>
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <CAGMNF6W8baS_zLYL8DwVsbfPWTP2ohzRB7xutW0X=MUzv93pbA@mail.gmail.com>]
* Re:
[not found] <CAGMNF6W8baS_zLYL8DwVsbfPWTP2ohzRB7xutW0X=MUzv93pbA@mail.gmail.com>
@ 2020-12-02 17:09 ` Kun Yi
0 siblings, 0 replies; 417+ messages in thread
From: Kun Yi @ 2020-12-02 17:09 UTC (permalink / raw)
To: Kun Yi, Guenter Roeck, robh+dt, Venkatesh, Supreeth
Cc: OpenBMC Maillist, linux-hwmon, linux-kernel
Much apologies for the super late reply.. I was out for an extended
period of time due to personal circumstances.
I have now addressed most of the comments in the v4 series.
Also cc'ed Supreeth who works on the AMD System Manageability stack.
On Wed, Dec 2, 2020 at 8:57 AM Kun Yi <kunyi@google.com> wrote:
>
> On Sat, Apr 04, 2020 at 08:01:16PM -0700, Kun Yi wrote:
> > SB Temperature Sensor Interface (SB-TSI) is an SMBus compatible
> > interface that reports AMD SoC's Ttcl (normalized temperature),
> > and resembles a typical 8-pin remote temperature sensor's I2C interface
> > to BMC.
> >
> > This commit adds basic support using this interface to read CPU
> > temperature, and read/write high/low CPU temp thresholds.
> >
> > To instantiate this driver on an AMD CPU with SB-TSI
> > support, the i2c bus number would be the bus connected from the board
> > management controller (BMC) to the CPU. The i2c address is specified in
> > Section 6.3.1 of the spec [1]: The SB-TSI address is normally 98h for socket 0
> > and 90h for socket 1, but it could vary based on hardware address select pins.
> >
> > [1]: https://www.amd.com/system/files/TechDocs/56255_OSRR.pdf
> >
> > Test status: tested reading temp1_input, and reading/writing
> > temp1_max/min.
> >
> > Signed-off-by: Kun Yi <kunyi at google.com>
> > ---
> > drivers/hwmon/Kconfig | 10 ++
> > drivers/hwmon/Makefile | 1 +
> > drivers/hwmon/sbtsi_temp.c | 259 +++++++++++++++++++++++++++++++++++++
> > 3 files changed, 270 insertions(+)
> > create mode 100644 drivers/hwmon/sbtsi_temp.c
> >
> > diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
> > index 05a30832c6ba..9585dcd01d1b 100644
> > --- a/drivers/hwmon/Kconfig
> > +++ b/drivers/hwmon/Kconfig
> > @@ -1412,6 +1412,16 @@ config SENSORS_RASPBERRYPI_HWMON
> > This driver can also be built as a module. If so, the module
> > will be called raspberrypi-hwmon.
> >
> > +config SENSORS_SBTSI
> > + tristate "Emulated SB-TSI temperature sensor"
> > + depends on I2C
> > + help
> > + If you say yes here you get support for emulated temperature
> > + sensors on AMD SoCs with SB-TSI interface connected to a BMC device.
> > +
> > + This driver can also be built as a module. If so, the module will
> > + be called sbtsi_temp.
> > +
> > config SENSORS_SHT15
> > tristate "Sensiron humidity and temperature sensors. SHT15 and compat."
> > depends on GPIOLIB || COMPILE_TEST
> > diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
> > index b0b9c8e57176..cd109f003ce4 100644
> > --- a/drivers/hwmon/Makefile
> > +++ b/drivers/hwmon/Makefile
> > @@ -152,6 +152,7 @@ obj-$(CONFIG_SENSORS_POWR1220) += powr1220.o
> > obj-$(CONFIG_SENSORS_PWM_FAN) += pwm-fan.o
> > obj-$(CONFIG_SENSORS_RASPBERRYPI_HWMON) += raspberrypi-hwmon.o
> > obj-$(CONFIG_SENSORS_S3C) += s3c-hwmon.o
> > +obj-$(CONFIG_SENSORS_SBTSI) += sbtsi_temp.o
> > obj-$(CONFIG_SENSORS_SCH56XX_COMMON)+= sch56xx-common.o
> > obj-$(CONFIG_SENSORS_SCH5627) += sch5627.o
> > obj-$(CONFIG_SENSORS_SCH5636) += sch5636.o
> > diff --git a/drivers/hwmon/sbtsi_temp.c b/drivers/hwmon/sbtsi_temp.c
> > new file mode 100644
> > index 000000000000..e3ad6a9f7ec1
> > --- /dev/null
> > +++ b/drivers/hwmon/sbtsi_temp.c
> > @@ -0,0 +1,259 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + * sbtsi_temp.c - hwmon driver for a SBI Temperature Sensor Interface (SB-TSI)
> > + * compliant AMD SoC temperature device.
> > + *
> > + * Copyright (c) 2020, Google Inc.
> > + * Copyright (c) 2020, Kun Yi <kunyi at google.com>
> > + */
> > +
> > +#include <linux/err.h>
> > +#include <linux/i2c.h>
> > +#include <linux/init.h>
> > +#include <linux/hwmon.h>
> > +#include <linux/module.h>
> > +#include <linux/mutex.h>
> > +#include <linux/of_device.h>
> > +#include <linux/of.h>
> > +
> > +/*
> > + * SB-TSI registers only support SMBus byte data access. "_INT" registers are
> > + * the integer part of a temperature value or limit, and "_DEC" registers are
> > + * corresponding decimal parts.
> > + */
> > +#define SBTSI_REG_TEMP_INT 0x01 /* RO */
> > +#define SBTSI_REG_STATUS 0x02 /* RO */
> > +#define SBTSI_REG_CONFIG 0x03 /* RO */
> > +#define SBTSI_REG_TEMP_HIGH_INT 0x07 /* RW */
> > +#define SBTSI_REG_TEMP_LOW_INT 0x08 /* RW */
> > +#define SBTSI_REG_TEMP_DEC 0x10 /* RW */
> > +#define SBTSI_REG_TEMP_HIGH_DEC 0x13 /* RW */
> > +#define SBTSI_REG_TEMP_LOW_DEC 0x14 /* RW */
> > +#define SBTSI_REG_REV 0xFF /* RO */
>
> The revision register is not actually used.
Thanks. Removed. I agree that the register is not well documented, at
least publicly.
It shouldn't affect functionality of this driver, so I removed the
definition altogether.
>
> > +
> > +#define SBTSI_CONFIG_READ_ORDER_SHIFT 5
> > +
> > +#define SBTSI_TEMP_MIN 0
> > +#define SBTSI_TEMP_MAX 255875
> > +#define SBTSI_REV_MAX_VALID_ID 4
>
> Not actually used, and I am not sure if it would make sense to check it.
> If at all, it would only make sense if you also check SBTSIxFE (Manufacture
> ID). Unfortunately, the actual SB-TSI specification seems to be non-public,
> so I can't check if the driver as-is supports versions 0..3 (assuming those
> exist).
Thanks. Removed.
>
> > +
> > +/* Each client has this additional data */
> > +struct sbtsi_data {
> > + struct i2c_client *client;
> > + struct mutex lock;
> > +};
> > +
> > +/*
> > + * From SB-TSI spec: CPU temperature readings and limit registers encode the
> > + * temperature in increments of 0.125 from 0 to 255.875. The "high byte"
> > + * register encodes the base-2 of the integer portion, and the upper 3 bits of
> > + * the "low byte" encode in base-2 the decimal portion.
> > + *
> > + * e.g. INT=0x19, DEC=0x20 represents 25.125 degrees Celsius
> > + *
> > + * Therefore temperature in millidegree Celsius =
> > + * (INT + DEC / 256) * 1000 = (INT * 8 + DEC / 32) * 125
> > + */
> > +static inline int sbtsi_reg_to_mc(s32 integer, s32 decimal)
> > +{
> > + return ((integer << 3) + (decimal >> 5)) * 125;
> > +}
> > +
> > +/*
> > + * Inversely, given temperature in millidegree Celsius
> > + * INT = (TEMP / 125) / 8
> > + * DEC = ((TEMP / 125) % 8) * 32
> > + * Caller have to make sure temp doesn't exceed 255875, the max valid value.
> > + */
> > +static inline void sbtsi_mc_to_reg(s32 temp, u8 *integer, u8 *decimal)
> > +{
> > + temp /= 125;
> > + *integer = temp >> 3;
> > + *decimal = (temp & 0x7) << 5;
> > +}
> > +
> > +static int sbtsi_read(struct device *dev, enum hwmon_sensor_types type,
> > + u32 attr, int channel, long *val)
> > +{
> > + struct sbtsi_data *data = dev_get_drvdata(dev);
> > + s32 temp_int, temp_dec;
> > + int err, reg_int, reg_dec;
> > + u8 read_order;
> > +
> > + if (type != hwmon_temp)
> > + return -EINVAL;
> > +
> > + read_order = 0;
> > + switch (attr) {
> > + case hwmon_temp_input:
> > + /*
> > + * ReadOrder bit specifies the reading order of integer and
> > + * decimal part of CPU temp for atomic reads. If bit == 0,
> > + * reading integer part triggers latching of the decimal part,
> > + * so integer part should be read first. If bit == 1, read
> > + * order should be reversed.
> > + */
> > + err = i2c_smbus_read_byte_data(data->client, SBTSI_REG_CONFIG);
> > + if (err < 0)
> > + return err;
> > +
> As I understand it, the idea is to set this configuration bit once and then
> just use it. Any chance to do that ? This would save an i2c read operation
> each time the temperature is read, and the if/else complexity below.
Unfortunately, the read-order register bit is read-only.
>
> > + read_order = (u8)err & BIT(SBTSI_CONFIG_READ_ORDER_SHIFT);
>
> Nit: typecast is unnecessary.
Done.
>
> > + reg_int = SBTSI_REG_TEMP_INT;
> > + reg_dec = SBTSI_REG_TEMP_DEC;
> > + break;
> > + case hwmon_temp_max:
> > + reg_int = SBTSI_REG_TEMP_HIGH_INT;
> > + reg_dec = SBTSI_REG_TEMP_HIGH_DEC;
> > + break;
> > + case hwmon_temp_min:
> > + reg_int = SBTSI_REG_TEMP_LOW_INT;
> > + reg_dec = SBTSI_REG_TEMP_LOW_DEC;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > +
> > + if (read_order == 0) {
> > + temp_int = i2c_smbus_read_byte_data(data->client, reg_int);
> > + temp_dec = i2c_smbus_read_byte_data(data->client, reg_dec);
> > + } else {
> > + temp_dec = i2c_smbus_read_byte_data(data->client, reg_dec);
> > + temp_int = i2c_smbus_read_byte_data(data->client, reg_int);
> > + }
>
> Just a thought: if you use regmap and tell it that the limit registers
> are non-volatile, this wouldn't actually read from the chip more than once.
That's a great suggestion, although in our normal use cases the limit
values are read and cached by the
userspace application. Seems changing to regmap would require some
messaging of the code. Would it
be acceptable to keep the initial driver as-is and do that in a following patch?
>
> Also, since the read involves reading two registers, and the first read
> locks the value for the second, you'll need mutex protection when reading
> the current temperature (not for limits, though).
Added mutex locking before/after the temp input reading.
>
> > +
> > + if (temp_int < 0)
> > + return temp_int;
> > + if (temp_dec < 0)
> > + return temp_dec;
> > +
> > + *val = sbtsi_reg_to_mc(temp_int, temp_dec);
> > +
> > + return 0;
> > +}
> > +
> > +static int sbtsi_write(struct device *dev, enum hwmon_sensor_types type,
> > + u32 attr, int channel, long val)
> > +{
> > + struct sbtsi_data *data = dev_get_drvdata(dev);
> > + int reg_int, reg_dec, err;
> > + u8 temp_int, temp_dec;
> > +
> > + if (type != hwmon_temp)
> > + return -EINVAL;
> > +
> > + switch (attr) {
> > + case hwmon_temp_max:
> > + reg_int = SBTSI_REG_TEMP_HIGH_INT;
> > + reg_dec = SBTSI_REG_TEMP_HIGH_DEC;
> > + break;
> > + case hwmon_temp_min:
> > + reg_int = SBTSI_REG_TEMP_LOW_INT;
> > + reg_dec = SBTSI_REG_TEMP_LOW_DEC;
> > + break;
> > + default:
> > + return -EINVAL;
> > + }
> > +
> > + val = clamp_val(val, SBTSI_TEMP_MIN, SBTSI_TEMP_MAX);
> > + mutex_lock(&data->lock);
> > + sbtsi_mc_to_reg(val, &temp_int, &temp_dec);
> > + err = i2c_smbus_write_byte_data(data->client, reg_int, temp_int);
> > + if (err)
> > + goto exit;
> > +
> > + err = i2c_smbus_write_byte_data(data->client, reg_dec, temp_dec);
> > +exit:
> > + mutex_unlock(&data->lock);
> > + return err;
> > +}
> > +
> > +static umode_t sbtsi_is_visible(const void *data,
> > + enum hwmon_sensor_types type,
> > + u32 attr, int channel)
> > +{
> > + switch (type) {
> > + case hwmon_temp:
> > + switch (attr) {
> > + case hwmon_temp_input:
> > + return 0444;
> > + case hwmon_temp_min:
> > + return 0644;
> > + case hwmon_temp_max:
> > + return 0644;
> > + }
> > + break;
> > + default:
> > + break;
> > + }
> > + return 0;
> > +}
> > +
> > +static const struct hwmon_channel_info *sbtsi_info[] = {
> > + HWMON_CHANNEL_INFO(chip,
> > + HWMON_C_REGISTER_TZ),
> > + HWMON_CHANNEL_INFO(temp,
> > + HWMON_T_INPUT | HWMON_T_MIN | HWMON_T_MAX),
>
> For your consideration: SB-TSI supports reporting high/low alerts.
> With this, it would be possible to implement respective alarm attributes.
> In conjunction with https://patchwork.kernel.org/patch/11277347/mbox/,
> it should also be possible to add interrupt and thus userspace notification
> for those attributes.
>
> SBTSI also supports setting the update rate (SBTSIx04) and setting
> the temperature offset (SBTSIx11, SBTSIx12), which could also be
> implemented as standard attributes.
>
> I won't require that for the initial version, just something to keep
> in mind.
Ack and thanks for the suggestions. I will keep in mind for future improvements.
>
> > + NULL
> > +};
> > +
> > +static const struct hwmon_ops sbtsi_hwmon_ops = {
> > + .is_visible = sbtsi_is_visible,
> > + .read = sbtsi_read,
> > + .write = sbtsi_write,
> > +};
> > +
> > +static const struct hwmon_chip_info sbtsi_chip_info = {
> > + .ops = &sbtsi_hwmon_ops,
> > + .info = sbtsi_info,
> > +};
> > +
> > +static int sbtsi_probe(struct i2c_client *client,
> > + const struct i2c_device_id *id)
> > +{
> > + struct device *dev = &client->dev;
> > + struct device *hwmon_dev;
> > + struct sbtsi_data *data;
> > +
> > + data = devm_kzalloc(dev, sizeof(struct sbtsi_data), GFP_KERNEL);
> > + if (!data)
> > + return -ENOMEM;
> > +
> > + data->client = client;
> > + mutex_init(&data->lock);
> > +
> > + hwmon_dev =
> > + devm_hwmon_device_register_with_info(dev, client->name, data,
> > + &sbtsi_chip_info, NULL);
> > +
> > + return PTR_ERR_OR_ZERO(hwmon_dev);
> > +}
> > +
> > +static const struct i2c_device_id sbtsi_id[] = {
> > + {"sbtsi", 0},
> > + {}
> > +};
> > +MODULE_DEVICE_TABLE(i2c, sbtsi_id);
> > +
> > +static const struct of_device_id __maybe_unused sbtsi_of_match[] = {
> > + {
> > + .compatible = "amd,sbtsi",
> > + },
> > + { },
> > +};
> > +MODULE_DEVICE_TABLE(of, sbtsi_of_match);
> > +
> > +static struct i2c_driver sbtsi_driver = {
> > + .class = I2C_CLASS_HWMON,
> > + .driver = {
> > + .name = "sbtsi",
> > + .of_match_table = of_match_ptr(sbtsi_of_match),
> > + },
> > + .probe = sbtsi_probe,
> > + .id_table = sbtsi_id,
> > +};
> > +
> > +module_i2c_driver(sbtsi_driver);
> > +
> > +MODULE_AUTHOR("Kun Yi <kunyi at google.com>");
> > +MODULE_DESCRIPTION("Hwmon driver for AMD SB-TSI emulated sensor");
> > +MODULE_LICENSE("GPL");
--
Regards,
Kun
^ permalink raw reply [flat|nested] 417+ messages in thread
* [PATCH] lib/find_bit: Add find_prev_*_bit functions.
@ 2020-12-02 1:10 Yun Levi
2020-12-02 9:47 ` Andy Shevchenko
0 siblings, 1 reply; 417+ messages in thread
From: Yun Levi @ 2020-12-02 1:10 UTC (permalink / raw)
To: dushistov, arnd, akpm, gustavo, vilhelm.gray, richard.weiyang,
andriy.shevchenko, joseph.qi, skalluru, yury.norov, jpoimboe
Cc: linux-kernel, linux-arch
Inspired find_next_*bit function series, add find_prev_*_bit series.
I'm not sure whether it'll be used right now But, I add these functions
for future usage.
Signed-off-by: Levi Yun <ppbuk5246@gmail.com>
---
fs/ufs/util.h | 24 +++---
include/asm-generic/bitops/find.h | 69 ++++++++++++++++
include/asm-generic/bitops/le.h | 33 ++++++++
include/linux/bitops.h | 34 +++++---
lib/find_bit.c | 126 +++++++++++++++++++++++++++++-
5 files changed, 260 insertions(+), 26 deletions(-)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 4931bec1a01c..7c87c77d10ca 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -2,7 +2,7 @@
/*
* linux/fs/ufs/util.h
*
- * Copyright (C) 1998
+ * Copyright (C) 1998
* Daniel Pirkl <daniel.pirkl@email.cz>
* Charles University, Faculty of Mathematics and Physics
*/
@@ -263,7 +263,7 @@ extern int ufs_prepare_chunk(struct page *page,
loff_t pos, unsigned len);
/*
* These functions manipulate ufs buffers
*/
-#define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size)
+#define ubh_bread(sb,fragment,size) _ubh_bread_(uspi,sb,fragment,size)
extern struct ufs_buffer_head * _ubh_bread_(struct
ufs_sb_private_info *, struct super_block *, u64 , u64);
extern struct ufs_buffer_head * ubh_bread_uspi(struct
ufs_sb_private_info *, struct super_block *, u64, u64);
extern void ubh_brelse (struct ufs_buffer_head *);
@@ -296,7 +296,7 @@ static inline void *get_usb_offset(struct
ufs_sb_private_info *uspi,
unsigned int offset)
{
unsigned int index;
-
+
index = offset >> uspi->s_fshift;
offset &= ~uspi->s_fmask;
return uspi->s_ubh.bh[index]->b_data + offset;
@@ -411,9 +411,9 @@ static inline unsigned _ubh_find_next_zero_bit_(
offset = 0;
}
return (base << uspi->s_bpfshift) + pos - begin;
-}
+}
-static inline unsigned find_last_zero_bit (unsigned char * bitmap,
+static inline unsigned __ubh_find_last_zero_bit(unsigned char * bitmap,
unsigned size, unsigned offset)
{
unsigned bit, i;
@@ -453,7 +453,7 @@ static inline unsigned _ubh_find_last_zero_bit_(
size + (uspi->s_bpf - start), uspi->s_bpf)
- (uspi->s_bpf - start);
size -= count;
- pos = find_last_zero_bit (ubh->bh[base]->b_data,
+ pos = __ubh_find_last_zero_bit(ubh->bh[base]->b_data,
start, start - count);
if (pos > start - count || !size)
break;
@@ -461,7 +461,7 @@ static inline unsigned _ubh_find_last_zero_bit_(
start = uspi->s_bpf;
}
return (base << uspi->s_bpfshift) + pos - begin;
-}
+}
#define ubh_isblockclear(ubh,begin,block)
(!_ubh_isblockset_(uspi,ubh,begin,block))
@@ -483,7 +483,7 @@ static inline int _ubh_isblockset_(struct
ufs_sb_private_info * uspi,
mask = 0x01 << (block & 0x07);
return (*ubh_get_addr (ubh, begin + (block >> 3)) &
mask) == mask;
}
- return 0;
+ return 0;
}
#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block)
@@ -492,8 +492,8 @@ static inline void _ubh_clrblock_(struct
ufs_sb_private_info * uspi,
{
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr (ubh, begin + block) = 0x00;
- return;
+ *ubh_get_addr (ubh, begin + block) = 0x00;
+ return;
case 4:
*ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f <<
((block & 0x01) << 2));
return;
@@ -531,9 +531,9 @@ static inline void ufs_fragacct (struct
super_block * sb, unsigned blockmap,
{
struct ufs_sb_private_info * uspi;
unsigned fragsize, pos;
-
+
uspi = UFS_SB(sb)->s_uspi;
-
+
fragsize = 0;
for (pos = 0; pos < uspi->s_fpb; pos++) {
if (blockmap & (1 << pos)) {
diff --git a/include/asm-generic/bitops/find.h
b/include/asm-generic/bitops/find.h
index 9fdf21302fdf..ca18b2ec954c 100644
--- a/include/asm-generic/bitops/find.h
+++ b/include/asm-generic/bitops/find.h
@@ -16,6 +16,20 @@ extern unsigned long find_next_bit(const unsigned
long *addr, unsigned long
size, unsigned long offset);
#endif
+#ifndef find_prev_bit
+/**
+ * find_prev_bit - find the prev set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the prev set bit
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_prev_bit(const unsigned long *addr, unsigned long
+ size, unsigned long offset);
+#endif
+
#ifndef find_next_and_bit
/**
* find_next_and_bit - find the next set bit in both memory regions
@@ -32,6 +46,22 @@ extern unsigned long find_next_and_bit(const
unsigned long *addr1,
unsigned long offset);
#endif
+#ifndef find_prev_and_bit
+/**
+ * find_prev_and_bit - find the prev set bit in both memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the prev set bit
+ * If no bits are set, returns @size.
+ */
+extern unsigned long find_prev_and_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size,
+ unsigned long offset);
+#endif
+
#ifndef find_next_zero_bit
/**
* find_next_zero_bit - find the next cleared bit in a memory region
@@ -46,6 +76,20 @@ extern unsigned long find_next_zero_bit(const
unsigned long *addr, unsigned
long size, unsigned long offset);
#endif
+#ifndef find_prev_zero_bit
+/**
+ * find_prev_zero_bit - find the prev cleared bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number of the prev zero bit
+ * If no bits are zero, returns @size.
+ */
+extern unsigned long find_prev_zero_bit(const unsigned long *addr, unsigned
+ long size, unsigned long offset);
+#endif
+
#ifdef CONFIG_GENERIC_FIND_FIRST_BIT
/**
@@ -80,6 +124,31 @@ extern unsigned long find_first_zero_bit(const
unsigned long *addr,
#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */
+#ifndef find_last_bit
+/**
+ * find_last_bit - find the last set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The number of bits to search
+ *
+ * Returns the bit number of the last set bit, or size.
+ */
+extern unsigned long find_last_bit(const unsigned long *addr,
+ unsigned long size);
+#endif
+
+#ifndef find_last_zero_bit
+/**
+ * find_last_zero_bit - find the last cleared bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum number of bits to search
+ *
+ * Returns the bit number of the first cleared bit.
+ * If no bits are zero, returns @size.
+ */
+extern unsigned long find_last_zero_bit(const unsigned long *addr,
+ unsigned long size);
+#endif
+
/**
* find_next_clump8 - find next 8-bit clump with set bits in a memory region
* @clump: location to store copy of found clump
diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h
index 188d3eba3ace..d0bd15bc34d9 100644
--- a/include/asm-generic/bitops/le.h
+++ b/include/asm-generic/bitops/le.h
@@ -27,6 +27,24 @@ static inline unsigned long
find_first_zero_bit_le(const void *addr,
return find_first_zero_bit(addr, size);
}
+static inline unsigned long find_prev_zero_bit_le(const void *addr,
+ unsigned long size, unsigned long offset)
+{
+ return find_prev_zero_bit(addr, size, offset);
+}
+
+static inline unsigned long find_prev_bit_le(const void *addr,
+ unsigned long size, unsigned long offset)
+{
+ return find_prev_bit(addr, size, offset);
+}
+
+static inline unsigned long find_last_zero_bit_le(const void *addr,
+ unsigned long size)
+{
+ return find_last_zero_bit(addr, size);
+}
+
#elif defined(__BIG_ENDIAN)
#define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7)
@@ -41,11 +59,26 @@ extern unsigned long find_next_bit_le(const void *addr,
unsigned long size, unsigned long offset);
#endif
+#ifndef find_prev_zero_bit_le
+extern unsigned long find_prev_zero_bit_le(const void *addr,
+ unsigned long size, unsigned long offset);
+#endif
+
+#ifndef find_prev_bit_le
+extern unsigned long find_prev_bit_le(const void *addr,
+ unsigned long size, unsigned long offset);
+#endif
+
#ifndef find_first_zero_bit_le
#define find_first_zero_bit_le(addr, size) \
find_next_zero_bit_le((addr), (size), 0)
#endif
+#ifndef find_last_zero_bit_le
+#define find_last_zero_bit_le(addr, size) \
+ find_prev_zero_bit_le((addr), (size), (size - 1))
+#endif
+
#else
#error "Please fix <asm/byteorder.h>"
#endif
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 5b74bdf159d6..124c68929861 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -50,6 +50,28 @@ extern unsigned long __sw_hweight64(__u64 w);
(bit) < (size); \
(bit) = find_next_zero_bit((addr), (size), (bit) + 1))
+#define for_each_set_bit_reverse(bit, addr, size) \
+ for ((bit) = find_last_bit((addr), (size)); \
+ (bit) < (size); \
+ (bit) = find_prev_bit((addr), (size), (bit)))
+
+/* same as for_each_set_bit_reverse() but use bit as value to start with */
+#define for_each_set_bit_from_reverse(bit, addr, size) \
+ for ((bit) = find_prev_bit((addr), (size), (bit)); \
+ (bit) < (size); \
+ (bit) = find_prev_bit((addr), (size), (bit - 1)))
+
+#define for_each_clear_bit_reverse(bit, addr, size) \
+ for ((bit) = find_last_zero_bit((addr), (size)); \
+ (bit) < (size); \
+ (bit) = find_prev_zero_bit((addr), (size), (bit)))
+
+/* same as for_each_clear_bit_reverse() but use bit as value to start with */
+#define for_each_clear_bit_from_reverse(bit, addr, size) \
+ for ((bit) = find_prev_zero_bit((addr), (size), (bit)); \
+ (bit) < (size); \
+ (bit) = find_next_zero_bit((addr), (size), (bit - 1)))
+
/**
* for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
* @start: bit offset to start search and to store the current iteration offset
@@ -283,17 +305,5 @@ static __always_inline void __assign_bit(long nr,
volatile unsigned long *addr,
})
#endif
-#ifndef find_last_bit
-/**
- * find_last_bit - find the last set bit in a memory region
- * @addr: The address to start the search at
- * @size: The number of bits to search
- *
- * Returns the bit number of the last set bit, or size.
- */
-extern unsigned long find_last_bit(const unsigned long *addr,
- unsigned long size);
-#endif
-
#endif /* __KERNEL__ */
#endif
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 4a8751010d59..cbe06abd3d21 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -69,6 +69,58 @@ static unsigned long _find_next_bit(const unsigned
long *addr1,
}
#endif
+#if !defined(find_prev_bit) || !defined(find_prev_zero_bit) ||
\
+ !defined(find_prev_bit_le) || !defined(find_prev_zero_bit_le)
|| \
+ !defined(find_prev_and_bit)
+/*
+ * This is a common helper function for find_prev_bit, find_prev_zero_bit, and
+ * find_prev_and_bit. The differences are:
+ * - The "invert" argument, which is XORed with each fetched word before
+ * searching it for one bits.
+ * - The optional "addr2", which is anded with "addr1" if present.
+ */
+static unsigned long _find_prev_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long nbits,
+ unsigned long start, unsigned long invert, unsigned long le)
+{
+ unsigned long tmp, mask;
+
+ if (unlikely(start >= nbits))
+ return nbits;
+
+ tmp = addr1[start / BITS_PER_LONG];
+ if (addr2)
+ tmp &= addr2[start / BITS_PER_LONG];
+ tmp ^= invert;
+
+ /* Handle 1st word. */
+ mask = BITMAP_LAST_WORD_MASK(start + 1);
+ if (le)
+ mask = swab(mask);
+
+ tmp &= mask;
+
+ start = round_down(start, BITS_PER_LONG);
+
+ while (!tmp) {
+ start -= BITS_PER_LONG;
+ if (start >= nbits)
+ return nbits;
+
+ tmp = addr1[start / BITS_PER_LONG];
+ if (addr2)
+ tmp &= addr2[start / BITS_PER_LONG];
+ tmp ^= invert;
+ }
+
+ if (le)
+ tmp = swab(tmp);
+
+ return start + __fls(tmp);
+}
+#endif
+
+
#ifndef find_next_bit
/*
* Find the next set bit in a memory region.
@@ -81,6 +133,18 @@ unsigned long find_next_bit(const unsigned long
*addr, unsigned long size,
EXPORT_SYMBOL(find_next_bit);
#endif
+#ifndef find_prev_bit
+/*
+ * Find the prev set bit in a memory region.
+ */
+unsigned long find_prev_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ return _find_prev_bit(addr, NULL, size, offset, 0UL, 0);
+}
+EXPORT_SYMBOL(find_prev_bit);
+#endif
+
#ifndef find_next_zero_bit
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
@@ -90,7 +154,16 @@ unsigned long find_next_zero_bit(const unsigned
long *addr, unsigned long size,
EXPORT_SYMBOL(find_next_zero_bit);
#endif
-#if !defined(find_next_and_bit)
+#ifndef find_prev_zero_bit
+unsigned long find_prev_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ return _find_prev_bit(addr, NULL, size, offset, ~0UL, 0);
+}
+EXPORT_SYMBOL(find_prev_zero_bit);
+#endif
+
+#ifndef find_next_and_bit
unsigned long find_next_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size,
unsigned long offset)
@@ -100,6 +173,16 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
EXPORT_SYMBOL(find_next_and_bit);
#endif
+#ifndef find_prev_and_bit
+unsigned long find_prev_and_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size,
+ unsigned long offset)
+{
+ return _find_prev_bit(addr1, addr2, size, offset, 0UL, 0);
+}
+EXPORT_SYMBOL(find_prev_and_bit);
+#endif
+
#ifndef find_first_bit
/*
* Find the first set bit in a memory region.
@@ -141,7 +224,7 @@ unsigned long find_last_bit(const unsigned long
*addr, unsigned long size)
{
if (size) {
unsigned long val = BITMAP_LAST_WORD_MASK(size);
- unsigned long idx = (size-1) / BITS_PER_LONG;
+ unsigned long idx = (size - 1) / BITS_PER_LONG;
do {
val &= addr[idx];
@@ -156,6 +239,27 @@ unsigned long find_last_bit(const unsigned long
*addr, unsigned long size)
EXPORT_SYMBOL(find_last_bit);
#endif
+#ifndef find_last_zero_bit
+unsigned long find_last_zero_bit(const unsigned long *addr, unsigned long size)
+{
+ if (size) {
+ unsigned long val = BITMAP_LAST_WORD_MASK(size);
+ unsigned long idx = (size - 1) / BITS_PER_LONG;
+
+ do {
+ val &= ~addr[idx];
+ if (val)
+ return idx * BITS_PER_LONG + __fls(val);
+
+ val = ~0ul;
+ } while (idx--);
+ }
+
+ return size;
+}
+EXPORT_SYMBOL(find_last_zero_bit);
+#endif
+
#ifdef __BIG_ENDIAN
#ifndef find_next_zero_bit_le
@@ -167,6 +271,15 @@ unsigned long find_next_zero_bit_le(const void
*addr, unsigned
EXPORT_SYMBOL(find_next_zero_bit_le);
#endif
+#ifndef find_prev_zero_bit_le
+unsigned long find_prev_zero_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ return _find_prev_bit(addr, NULL, size, offset, ~0UL, 1);
+}
+EXPORT_SYMBOL(find_prev_zero_bit_le);
+#endif
+
#ifndef find_next_bit_le
unsigned long find_next_bit_le(const void *addr, unsigned
long size, unsigned long offset)
@@ -176,6 +289,15 @@ unsigned long find_next_bit_le(const void *addr, unsigned
EXPORT_SYMBOL(find_next_bit_le);
#endif
+#ifdef find_prev_bit_le
+unsigned long find_prev_bit_le(const void *addr, unsigned
+ long size, unsigned long offset)
+{
+ return _find_prev_bit(addr, NULL, size, offset, 0UL, 1);
+}
+EXPORT_SYMBOL(find_prev_bit_le);
+#endif
+
#endif /* __BIG_ENDIAN */
unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr,
--
2.29.2
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02 1:10 [PATCH] lib/find_bit: Add find_prev_*_bit functions Yun Levi
@ 2020-12-02 9:47 ` Andy Shevchenko
2020-12-02 10:04 ` Rasmus Villemoes
0 siblings, 1 reply; 417+ messages in thread
From: Andy Shevchenko @ 2020-12-02 9:47 UTC (permalink / raw)
To: Yun Levi
Cc: dushistov, arnd, akpm, gustavo, vilhelm.gray, richard.weiyang,
joseph.qi, skalluru, yury.norov, jpoimboe, linux-kernel,
linux-arch
On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
> Inspired find_next_*bit function series, add find_prev_*_bit series.
> I'm not sure whether it'll be used right now But, I add these functions
> for future usage.
This patch has few issues:
- it has more things than described (should be several patches instead)
- new functionality can be split logically to couple or more pieces as well
- it proposes functionality w/o user (dead code)
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02 9:47 ` Andy Shevchenko
@ 2020-12-02 10:04 ` Rasmus Villemoes
2020-12-02 11:50 ` Yun Levi
0 siblings, 1 reply; 417+ messages in thread
From: Rasmus Villemoes @ 2020-12-02 10:04 UTC (permalink / raw)
To: Andy Shevchenko, Yun Levi
Cc: dushistov, arnd, akpm, gustavo, vilhelm.gray, richard.weiyang,
joseph.qi, skalluru, yury.norov, jpoimboe, linux-kernel,
linux-arch
On 02/12/2020 10.47, Andy Shevchenko wrote:
> On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
>> Inspired find_next_*bit function series, add find_prev_*_bit series.
>> I'm not sure whether it'll be used right now But, I add these functions
>> for future usage.
>
> This patch has few issues:
> - it has more things than described (should be several patches instead)
> - new functionality can be split logically to couple or more pieces as well
> - it proposes functionality w/o user (dead code)
Yeah, the last point means it can't be applied - please submit it again
if and when you have an actual use case. And I'll add
- it lacks extension of the bitmap test module to cover the new
functions (that also wants to be a separate patch).
Rasmus
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH] lib/find_bit: Add find_prev_*_bit functions.
2020-12-02 10:04 ` Rasmus Villemoes
@ 2020-12-02 11:50 ` Yun Levi
[not found] ` <CAAH8bW-jUeFVU-0OrJzK-MuGgKJgZv38RZugEQzFRJHSXFRRDA@mail.gmail.com>
0 siblings, 1 reply; 417+ messages in thread
From: Yun Levi @ 2020-12-02 11:50 UTC (permalink / raw)
To: Rasmus Villemoes
Cc: Andy Shevchenko, dushistov, arnd, akpm, gustavo, vilhelm.gray,
richard.weiyang, joseph.qi, skalluru, yury.norov, jpoimboe,
linux-kernel, linux-arch
Thanks for kind advice. But I'm so afraid to have questions below:
> - it proposes functionality w/o user (dead code)
Actually, I add these series functions to rewrite some of the
resource clean-up routine.
A typical case is ethtool_set_per_queue_coalesce 's rollback label.
Could this usage be an actual use case?
>- it lacks extension of the bitmap test module to cover the new
> functions (that also wants to be a separate patch).
I see, then Could I add some of testcase on lib/test_bitops.c for testing?
On Wed, Dec 2, 2020 at 7:04 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> On 02/12/2020 10.47, Andy Shevchenko wrote:
> > On Wed, Dec 02, 2020 at 10:10:09AM +0900, Yun Levi wrote:
> >> Inspired find_next_*bit function series, add find_prev_*_bit series.
> >> I'm not sure whether it'll be used right now But, I add these functions
> >> for future usage.
> >
> > This patch has few issues:
> > - it has more things than described (should be several patches instead)
> > - new functionality can be split logically to couple or more pieces as well
> > - it proposes functionality w/o user (dead code)
>
> Yeah, the last point means it can't be applied - please submit it again
> if and when you have an actual use case. And I'll add
>
> - it lacks extension of the bitmap test module to cover the new
> functions (that also wants to be a separate patch).
>
> Rasmus
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re: [PATCH v4] arm64: dts: qcom: Add support for Xiaomi Poco F1 (Beryllium)
@ 2020-08-05 11:02 Amit Pundir
2020-08-06 22:31 ` Konrad Dybcio
0 siblings, 1 reply; 417+ messages in thread
From: Amit Pundir @ 2020-08-05 11:02 UTC (permalink / raw)
To: Andy Gross, Bjorn Andersson, Rob Herring, John Stultz, Sumit Semwal
Cc: linux-arm-msm, dt, lkml
On Wed, 5 Aug 2020 at 16:21, Amit Pundir <amit.pundir@linaro.org> wrote:
>
> Add initial dts support for Xiaomi Poco F1 (Beryllium).
>
> This initial support is based on upstream Dragonboard 845c
> (sdm845) device. With this dts, Beryllium boots AOSP up to
> ADB shell over USB-C.
>
> Supported functionality includes UFS, USB-C (peripheral),
> microSD card and Vol+/Vol-/power keys. Bluetooth should work
> too but couldn't be verified from adb command line, it is
> verified when enabled from UI with few WIP display patches.
>
> Just like initial db845c support, initializing the SMMU is
> clearing the mapping used for the splash screen framebuffer,
> which causes the device to hang during boot and recovery
> needs a hard power reset. This can be worked around using:
>
> fastboot oem select-display-panel none
>
> To switch ON the display back run:
>
> fastboot oem select-display-panel
>
> But this only works on Beryllium devices running bootloader
> version BOOT.XF.2.0-00369-SDM845LZB-1 that shipped with
> Android-9 based release. Newer bootloader version do not
> support switching OFF the display panel at all. So we need
> a few additional smmu patches (under review) from here to
> boot to shell:
> https://github.com/pundiramit/linux/commits/beryllium-mainline
>
> Signed-off-by: Amit Pundir <amit.pundir@linaro.org>
> ---
> v4: Added more downstream reserved memory regions. It probably
> need more work, but for now I see adsp/cdsp/wlan remoteprocs
> powering up properly. Also removed the regulator nodes not
> required for the device, as suggested by Bjorn.
Forgot to mention that I added couple of clocks to protected clocks in v4,
which need for display to work.
> v3: Added a reserved-memory region from downstream kernel to fix
> a boot regression with recent dma-pool changes in v5.8-rc6.
> v2: Updated machine compatible string for seemingly inevitable
> future quirks.
>
> arch/arm64/boot/dts/qcom/Makefile | 1 +
> arch/arm64/boot/dts/qcom/sdm845-beryllium.dts | 383 ++++++++++++++++++++++++++
> 2 files changed, 384 insertions(+)
> create mode 100644 arch/arm64/boot/dts/qcom/sdm845-beryllium.dts
>
> diff --git a/arch/arm64/boot/dts/qcom/Makefile b/arch/arm64/boot/dts/qcom/Makefile
> index 0f2c33d611df..3ef1b48bc0cb 100644
> --- a/arch/arm64/boot/dts/qcom/Makefile
> +++ b/arch/arm64/boot/dts/qcom/Makefile
> @@ -21,6 +21,7 @@ dtb-$(CONFIG_ARCH_QCOM) += sdm845-cheza-r1.dtb
> dtb-$(CONFIG_ARCH_QCOM) += sdm845-cheza-r2.dtb
> dtb-$(CONFIG_ARCH_QCOM) += sdm845-cheza-r3.dtb
> dtb-$(CONFIG_ARCH_QCOM) += sdm845-db845c.dtb
> +dtb-$(CONFIG_ARCH_QCOM) += sdm845-beryllium.dtb
> dtb-$(CONFIG_ARCH_QCOM) += sdm845-mtp.dtb
> dtb-$(CONFIG_ARCH_QCOM) += sdm850-lenovo-yoga-c630.dtb
> dtb-$(CONFIG_ARCH_QCOM) += sm8150-mtp.dtb
> diff --git a/arch/arm64/boot/dts/qcom/sdm845-beryllium.dts b/arch/arm64/boot/dts/qcom/sdm845-beryllium.dts
> new file mode 100644
> index 000000000000..0f9f61bf9fa4
> --- /dev/null
> +++ b/arch/arm64/boot/dts/qcom/sdm845-beryllium.dts
> @@ -0,0 +1,383 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/dts-v1/;
> +
> +#include <dt-bindings/gpio/gpio.h>
> +#include <dt-bindings/pinctrl/qcom,pmic-gpio.h>
> +#include <dt-bindings/regulator/qcom,rpmh-regulator.h>
> +#include "sdm845.dtsi"
> +#include "pm8998.dtsi"
> +#include "pmi8998.dtsi"
> +
> +/ {
> + model = "Xiaomi Technologies Inc. Beryllium";
> + compatible = "xiaomi,beryllium", "qcom,sdm845";
> +
> + /* required for bootloader to select correct board */
> + qcom,board-id = <69 0>;
> + qcom,msm-id = <321 0x20001>;
> +
> + aliases {
> + hsuart0 = &uart6;
> + };
> +
> + gpio-keys {
> + compatible = "gpio-keys";
> + autorepeat;
> +
> + pinctrl-names = "default";
> + pinctrl-0 = <&vol_up_pin_a>;
> +
> + vol-up {
> + label = "Volume Up";
> + linux,code = <KEY_VOLUMEUP>;
> + gpios = <&pm8998_gpio 6 GPIO_ACTIVE_LOW>;
> + };
> + };
> +
> + vreg_s4a_1p8: vreg-s4a-1p8 {
> + compatible = "regulator-fixed";
> + regulator-name = "vreg_s4a_1p8";
> +
> + regulator-min-microvolt = <1800000>;
> + regulator-max-microvolt = <1800000>;
> + regulator-always-on;
> + };
> +};
> +
> +&adsp_pas {
> + status = "okay";
> + firmware-name = "qcom/sdm845/adsp.mdt";
> +};
> +
> +&apps_rsc {
> + pm8998-rpmh-regulators {
> + compatible = "qcom,pm8998-rpmh-regulators";
> + qcom,pmic-id = "a";
> +
> + vreg_l1a_0p875: ldo1 {
> + regulator-min-microvolt = <880000>;
> + regulator-max-microvolt = <880000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l5a_0p8: ldo5 {
> + regulator-min-microvolt = <800000>;
> + regulator-max-microvolt = <800000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l7a_1p8: ldo7 {
> + regulator-min-microvolt = <1800000>;
> + regulator-max-microvolt = <1800000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l12a_1p8: ldo12 {
> + regulator-min-microvolt = <1800000>;
> + regulator-max-microvolt = <1800000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l13a_2p95: ldo13 {
> + regulator-min-microvolt = <1800000>;
> + regulator-max-microvolt = <2960000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l17a_1p3: ldo17 {
> + regulator-min-microvolt = <1304000>;
> + regulator-max-microvolt = <1304000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l20a_2p95: ldo20 {
> + regulator-min-microvolt = <2960000>;
> + regulator-max-microvolt = <2968000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l21a_2p95: ldo21 {
> + regulator-min-microvolt = <2960000>;
> + regulator-max-microvolt = <2968000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l24a_3p075: ldo24 {
> + regulator-min-microvolt = <3088000>;
> + regulator-max-microvolt = <3088000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l25a_3p3: ldo25 {
> + regulator-min-microvolt = <3300000>;
> + regulator-max-microvolt = <3312000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> +
> + vreg_l26a_1p2: ldo26 {
> + regulator-min-microvolt = <1200000>;
> + regulator-max-microvolt = <1200000>;
> + regulator-initial-mode = <RPMH_REGULATOR_MODE_HPM>;
> + };
> + };
> +};
> +
> +&cdsp_pas {
> + status = "okay";
> + firmware-name = "qcom/sdm845/cdsp.mdt";
> +};
> +
> +&gcc {
> + protected-clocks = <GCC_QSPI_CORE_CLK>,
> + <GCC_QSPI_CORE_CLK_SRC>,
> + <GCC_QSPI_CNOC_PERIPH_AHB_CLK>,
> + <GCC_LPASS_Q6_AXI_CLK>,
> + <GCC_LPASS_SWAY_CLK>;
> +};
> +
> +&gpu {
> + zap-shader {
> + memory-region = <&gpu_mem>;
> + firmware-name = "qcom/sdm845/a630_zap.mbn";
> + };
> +};
> +
> +/* Reserved memory changes from downstream */
> +/delete-node/ &adsp_mem;
> +/delete-node/ &wlan_msa_mem;
> +/delete-node/ &mpss_region;
> +/delete-node/ &venus_mem;
> +/delete-node/ &cdsp_mem;
> +/delete-node/ &mba_region;
> +/delete-node/ &slpi_mem;
> +/delete-node/ &spss_mem;
> +/delete-node/ &rmtfs_mem;
> +/ {
> + reserved-memory {
> + // This removed_region is needed to boot the device
> + // TODO: Find out the user of this reserved memory
> + removed_region: memory@88f00000 {
> + reg = <0 0x88f00000 0 0x1a00000>;
> + no-map;
> + };
> +
> + adsp_mem: memory@8c500000 {
> + reg = <0 0x8c500000 0 0x1e00000>;
> + no-map;
> + };
> +
> + wlan_msa_mem: memory@8e300000 {
> + reg = <0 0x8e300000 0 0x100000>;
> + no-map;
> + };
> +
> + mpss_region: memory@8e400000 {
> + reg = <0 0x8e400000 0 0x7800000>;
> + no-map;
> + };
> +
> + venus_mem: memory@95c00000 {
> + reg = <0 0x95c00000 0 0x500000>;
> + no-map;
> + };
> +
> + cdsp_mem: memory@96100000 {
> + reg = <0 0x96100000 0 0x800000>;
> + no-map;
> + };
> +
> + mba_region: memory@96900000 {
> + reg = <0 0x96900000 0 0x200000>;
> + no-map;
> + };
> +
> + slpi_mem: memory@96b00000 {
> + reg = <0 0x96b00000 0 0x1400000>;
> + no-map;
> + };
> +
> + spss_mem: memory@97f00000 {
> + reg = <0 0x97f00000 0 0x100000>;
> + no-map;
> + };
> +
> + rmtfs_mem: memory@f6301000 {
> + compatible = "qcom,rmtfs-mem";
> + reg = <0 0xf6301000 0 0x200000>;
> + no-map;
> +
> + qcom,client-id = <1>;
> + qcom,vmid = <15>;
> + };
> + };
> +};
> +
> +&mss_pil {
> + status = "okay";
> + firmware-name = "qcom/sdm845/mba.mbn", "qcom/sdm845/modem.mdt";
> +};
> +
> +&pm8998_gpio {
> + vol_up_pin_a: vol-up-active {
> + pins = "gpio6";
> + function = "normal";
> + input-enable;
> + bias-pull-up;
> + qcom,drive-strength = <PMIC_GPIO_STRENGTH_NO>;
> + };
> +};
> +
> +&pm8998_pon {
> + resin {
> + compatible = "qcom,pm8941-resin";
> + interrupts = <0x0 0x8 1 IRQ_TYPE_EDGE_BOTH>;
> + debounce = <15625>;
> + bias-pull-up;
> + linux,code = <KEY_VOLUMEDOWN>;
> + };
> +};
> +
> +&qupv3_id_0 {
> + status = "okay";
> +};
> +
> +&sdhc_2 {
> + status = "okay";
> +
> + pinctrl-names = "default";
> + pinctrl-0 = <&sdc2_default_state &sdc2_card_det_n>;
> +
> + vmmc-supply = <&vreg_l21a_2p95>;
> + vqmmc-supply = <&vreg_l13a_2p95>;
> +
> + bus-width = <4>;
> + cd-gpios = <&tlmm 126 GPIO_ACTIVE_HIGH>;
> +};
> +
> +&tlmm {
> + gpio-reserved-ranges = <0 4>, <81 4>;
> +
> + sdc2_default_state: sdc2-default {
> + clk {
> + pins = "sdc2_clk";
> + bias-disable;
> +
> + /*
> + * It seems that mmc_test reports errors if drive
> + * strength is not 16 on clk, cmd, and data pins.
> + */
> + drive-strength = <16>;
> + };
> +
> + cmd {
> + pins = "sdc2_cmd";
> + bias-pull-up;
> + drive-strength = <10>;
> + };
> +
> + data {
> + pins = "sdc2_data";
> + bias-pull-up;
> + drive-strength = <10>;
> + };
> + };
> +
> + sdc2_card_det_n: sd-card-det-n {
> + pins = "gpio126";
> + function = "gpio";
> + bias-pull-up;
> + };
> +};
> +
> +&uart6 {
> + status = "okay";
> +
> + bluetooth {
> + compatible = "qcom,wcn3990-bt";
> +
> + vddio-supply = <&vreg_s4a_1p8>;
> + vddxo-supply = <&vreg_l7a_1p8>;
> + vddrf-supply = <&vreg_l17a_1p3>;
> + vddch0-supply = <&vreg_l25a_3p3>;
> + max-speed = <3200000>;
> + };
> +};
> +
> +&usb_1 {
> + status = "okay";
> +};
> +
> +&usb_1_dwc3 {
> + dr_mode = "peripheral";
> +};
> +
> +&usb_1_hsphy {
> + status = "okay";
> +
> + vdd-supply = <&vreg_l1a_0p875>;
> + vdda-pll-supply = <&vreg_l12a_1p8>;
> + vdda-phy-dpdm-supply = <&vreg_l24a_3p075>;
> +
> + qcom,imp-res-offset-value = <8>;
> + qcom,hstx-trim-value = <QUSB2_V2_HSTX_TRIM_21_6_MA>;
> + qcom,preemphasis-level = <QUSB2_V2_PREEMPHASIS_5_PERCENT>;
> + qcom,preemphasis-width = <QUSB2_V2_PREEMPHASIS_WIDTH_HALF_BIT>;
> +};
> +
> +&usb_1_qmpphy {
> + status = "okay";
> +
> + vdda-phy-supply = <&vreg_l26a_1p2>;
> + vdda-pll-supply = <&vreg_l1a_0p875>;
> +};
> +
> +&ufs_mem_hc {
> + status = "okay";
> +
> + reset-gpios = <&tlmm 150 GPIO_ACTIVE_LOW>;
> +
> + vcc-supply = <&vreg_l20a_2p95>;
> + vcc-max-microamp = <800000>;
> +};
> +
> +&ufs_mem_phy {
> + status = "okay";
> +
> + vdda-phy-supply = <&vreg_l1a_0p875>;
> + vdda-pll-supply = <&vreg_l26a_1p2>;
> +};
> +
> +&wifi {
> + status = "okay";
> +
> + vdd-0.8-cx-mx-supply = <&vreg_l5a_0p8>;
> + vdd-1.8-xo-supply = <&vreg_l7a_1p8>;
> + vdd-1.3-rfa-supply = <&vreg_l17a_1p3>;
> + vdd-3.3-ch0-supply = <&vreg_l25a_3p3>;
> +};
> +
> +/* PINCTRL - additions to nodes defined in sdm845.dtsi */
> +
> +&qup_uart6_default {
> + pinmux {
> + pins = "gpio45", "gpio46", "gpio47", "gpio48";
> + function = "qup6";
> + };
> +
> + cts {
> + pins = "gpio45";
> + bias-disable;
> + };
> +
> + rts-tx {
> + pins = "gpio46", "gpio47";
> + drive-strength = <2>;
> + bias-disable;
> + };
> +
> + rx {
> + pins = "gpio48";
> + bias-pull-up;
> + };
> +};
> --
> 2.7.4
>
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
2020-08-05 11:02 [PATCH v4] arm64: dts: qcom: Add support for Xiaomi Poco F1 (Beryllium) Amit Pundir
@ 2020-08-06 22:31 ` Konrad Dybcio
2020-08-12 13:37 ` Amit Pundir
0 siblings, 1 reply; 417+ messages in thread
From: Konrad Dybcio @ 2020-08-06 22:31 UTC (permalink / raw)
To: amit.pundir
Cc: agross, bjorn.andersson, devicetree, john.stultz, linux-arm-msm,
linux-kernel, robh+dt, sumit.semwal, Konrad Dybcio
Subject: Re: [PATCH v4] arm64: dts: qcom: Add support for Xiaomi Poco F1 (Beryllium)
>// This removed_region is needed to boot the device
> // TODO: Find out the user of this reserved memory
> removed_region: memory@88f00000 {
This region seems to belong to the Trust Zone. When Linux tries to access it, TZ bites and shuts the device down.
Konrad
^ permalink raw reply [flat|nested] 417+ messages in thread
* Re:
2020-08-06 22:31 ` Konrad Dybcio
@ 2020-08-12 13:37 ` Amit Pundir
0 siblings, 0 replies; 417+ messages in thread
From: Amit Pundir @ 2020-08-12 13:37 UTC (permalink / raw)
To: Konrad Dybcio
Cc: Andy Gross, Bjorn Andersson, dt, John Stultz, linux-arm-msm,
lkml, Rob Herring, Sumit Semwal
On Fri, 7 Aug 2020 at 04:02, Konrad Dybcio <konradybcio@gmail.com> wrote:
>
> Subject: Re: [PATCH v4] arm64: dts: qcom: Add support for Xiaomi Poco F1 (Beryllium)
>
> >// This removed_region is needed to boot the device
> > // TODO: Find out the user of this reserved memory
> > removed_region: memory@88f00000 {
>
> This region seems to belong to the Trust Zone. When Linux tries to access it, TZ bites and shuts the device down.
That is totally possible. Plus it falls right in between TZ and QSEE
reserved-memory regions. However, I do not find any credible source
of information which can confirm this. So I'm hesitant to update the
TODO item in the above comment.
>
> Konrad
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2020-06-30 17:56 Vasiliy Kupriakov
2020-07-10 20:36 ` Andy Shevchenko
0 siblings, 1 reply; 417+ messages in thread
From: Vasiliy Kupriakov @ 2020-06-30 17:56 UTC (permalink / raw)
To: Corentin Chary, Darren Hart, Andy Shevchenko
Cc: Vasiliy Kupriakov,
open list:ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS,
open list:ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS,
open list
Subject: [PATCH] platform/x86: asus-wmi: allow BAT1 battery name
The battery on my laptop ASUS TUF Gaming FX706II is named BAT1.
This patch allows battery extension to load.
Signed-off-by: Vasiliy Kupriakov <rublag-ns@yandex.ru>
---
drivers/platform/x86/asus-wmi.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 877aade19497..8f4acdc06b13 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -441,6 +441,7 @@ static int asus_wmi_battery_add(struct power_supply *battery)
* battery is named BATT.
*/
if (strcmp(battery->desc->name, "BAT0") != 0 &&
+ strcmp(battery->desc->name, "BAT1") != 0 &&
strcmp(battery->desc->name, "BATT") != 0)
return -ENODEV;
--
2.27.0
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2020-06-30 17:56 Vasiliy Kupriakov
@ 2020-07-10 20:36 ` Andy Shevchenko
0 siblings, 0 replies; 417+ messages in thread
From: Andy Shevchenko @ 2020-07-10 20:36 UTC (permalink / raw)
To: Vasiliy Kupriakov
Cc: Corentin Chary, Darren Hart, Andy Shevchenko,
open list:ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS,
open list:ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS,
open list
On Tue, Jun 30, 2020 at 8:57 PM Vasiliy Kupriakov <rublag-ns@yandex.ru> wrote:
>
> Subject: [PATCH] platform/x86: asus-wmi: allow BAT1 battery name
>
> The battery on my laptop ASUS TUF Gaming FX706II is named BAT1.
> This patch allows battery extension to load.
>
Pushed to my review and testing queue, thanks!
> Signed-off-by: Vasiliy Kupriakov <rublag-ns@yandex.ru>
> ---
> drivers/platform/x86/asus-wmi.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
> index 877aade19497..8f4acdc06b13 100644
> --- a/drivers/platform/x86/asus-wmi.c
> +++ b/drivers/platform/x86/asus-wmi.c
> @@ -441,6 +441,7 @@ static int asus_wmi_battery_add(struct power_supply *battery)
> * battery is named BATT.
> */
> if (strcmp(battery->desc->name, "BAT0") != 0 &&
> + strcmp(battery->desc->name, "BAT1") != 0 &&
> strcmp(battery->desc->name, "BATT") != 0)
> return -ENODEV;
>
> --
> 2.27.0
>
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2020-05-06 5:52 Jiaxun Yang
2020-05-06 17:17 ` Nick Desaulniers
0 siblings, 1 reply; 417+ messages in thread
From: Jiaxun Yang @ 2020-05-06 5:52 UTC (permalink / raw)
To: linux-mips
Cc: Jiaxun Yang, clang-built-linux, Maciej W . Rozycki, Fangrui Song,
Kees Cook, Nathan Chancellor, Thomas Bogendoerfer, Paul Burton,
Masahiro Yamada, Jouni Hogander, Kevin Darbyshire-Bryant,
Borislav Petkov, Heiko Carstens, linux-kernel
Subject: [PATCH v6] MIPS: Truncate link address into 32bit for 32bit kernel
In-Reply-To: <20200413062651.3992652-1-jiaxun.yang@flygoat.com>
LLD failed to link vmlinux with 64bit load address for 32bit ELF
while bfd will strip 64bit address into 32bit silently.
To fix LLD build, we should truncate load address provided by platform
into 32bit for 32bit kernel.
Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
Link: https://github.com/ClangBuiltLinux/linux/issues/786
Link: https://sourceware.org/bugzilla/show_bug.cgi?id=25784
Reviewed-by: Fangrui Song <maskray@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <natechancellor@gmail.com>
Cc: Maciej W. Rozycki <macro@linux-mips.org>
---
V2: Take MaskRay's shell magic.
V3: After spent an hour on dealing with special character issue in
Makefile, I gave up to do shell hacks and write a util in C instead.
Thanks Maciej for pointing out Makefile variable problem.
v4: Finally we managed to find a Makefile method to do it properly
thanks to Kees. As it's too far from the initial version, I removed
Review & Test tag from Nick and Fangrui and Cc instead.
v5: Care vmlinuz as well.
v6: Rename to LIKER_LOAD_ADDRESS
---
arch/mips/Makefile | 13 ++++++++++++-
arch/mips/boot/compressed/Makefile | 2 +-
arch/mips/kernel/vmlinux.lds.S | 2 +-
3 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index e1c44aed8156..68c0f22fefc0 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -288,12 +288,23 @@ ifdef CONFIG_64BIT
endif
endif
+# When linking a 32-bit executable the LLVM linker cannot cope with a
+# 32-bit load address that has been sign-extended to 64 bits. Simply
+# remove the upper 32 bits then, as it is safe to do so with other
+# linkers.
+ifdef CONFIG_64BIT
+ load-ld = $(load-y)
+else
+ load-ld = $(subst 0xffffffff,0x,$(load-y))
+endif
+
KBUILD_AFLAGS += $(cflags-y)
KBUILD_CFLAGS += $(cflags-y)
-KBUILD_CPPFLAGS += -DVMLINUX_LOAD_ADDRESS=$(load-y)
+KBUILD_CPPFLAGS += -DVMLINUX_LOAD_ADDRESS=$(load-y) -DLINKER_LOAD_ADDRESS=$(load-ld)
KBUILD_CPPFLAGS += -DDATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)
bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y) \
+ LINKER_LOAD_ADDRESS=$(load-ld) \
VMLINUX_ENTRY_ADDRESS=$(entry-y) \
PLATFORM="$(platform-y)" \
ITS_INPUTS="$(its-y)"
diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile
index 0df0ee8a298d..3d391256ab7e 100644
--- a/arch/mips/boot/compressed/Makefile
+++ b/arch/mips/boot/compressed/Makefile
@@ -90,7 +90,7 @@ ifneq ($(zload-y),)
VMLINUZ_LOAD_ADDRESS := $(zload-y)
else
VMLINUZ_LOAD_ADDRESS = $(shell $(obj)/calc_vmlinuz_load_addr \
- $(obj)/vmlinux.bin $(VMLINUX_LOAD_ADDRESS))
+ $(obj)/vmlinux.bin $(LINKER_LOAD_ADDRESS))
endif
UIMAGE_LOADADDR = $(VMLINUZ_LOAD_ADDRESS)
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index a5f00ec73ea6..5226cd8e4bee 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -55,7 +55,7 @@ SECTIONS
/* . = 0xa800000000300000; */
. = 0xffffffff80300000;
#endif
- . = VMLINUX_LOAD_ADDRESS;
+ . = LINKER_LOAD_ADDRESS;
/* read-only */
_text = .; /* Text and read-only data */
.text : {
^ permalink raw reply related [flat|nested] 417+ messages in thread
* Re:
2020-05-06 5:52 Jiaxun Yang
@ 2020-05-06 17:17 ` Nick Desaulniers
0 siblings, 0 replies; 417+ messages in thread
From: Nick Desaulniers @ 2020-05-06 17:17 UTC (permalink / raw)
To: Jiaxun Yang
Cc: linux-mips, clang-built-linux, Maciej W . Rozycki, Fangrui Song,
Kees Cook, Nathan Chancellor, Thomas Bogendoerfer, Paul Burton,
Masahiro Yamada, Jouni Hogander, Kevin Darbyshire-Bryant,
Borislav Petkov, Heiko Carstens, LKML
On Tue, May 5, 2020 at 10:52 PM Jiaxun Yang <jiaxun.yang@flygoat.com> wrote:
>
> Subject: [PATCH v6] MIPS: Truncate link address into 32bit for 32bit kernel
> In-Reply-To: <20200413062651.3992652-1-jiaxun.yang@flygoat.com>
>
> LLD failed to link vmlinux with 64bit load address for 32bit ELF
> while bfd will strip 64bit address into 32bit silently.
> To fix LLD build, we should truncate load address provided by platform
> into 32bit for 32bit kernel.
>
> Signed-off-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
> Link: https://github.com/ClangBuiltLinux/linux/issues/786
> Link: https://sourceware.org/bugzilla/show_bug.cgi?id=25784
> Reviewed-by: Fangrui Song <maskray@google.com>
> Reviewed-by: Kees Cook <keescook@chromium.org>
> Tested-by: Nathan Chancellor <natechancellor@gmail.com>
> Cc: Maciej W. Rozycki <macro@linux-mips.org>
Cool, this revision looks a bit simpler. Thanks for chasing this.
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
> ---
> V2: Take MaskRay's shell magic.
>
> V3: After spent an hour on dealing with special character issue in
> Makefile, I gave up to do shell hacks and write a util in C instead.
> Thanks Maciej for pointing out Makefile variable problem.
>
> v4: Finally we managed to find a Makefile method to do it properly
> thanks to Kees. As it's too far from the initial version, I removed
> Review & Test tag from Nick and Fangrui and Cc instead.
>
> v5: Care vmlinuz as well.
>
> v6: Rename to LIKER_LOAD_ADDRESS
> ---
> arch/mips/Makefile | 13 ++++++++++++-
> arch/mips/boot/compressed/Makefile | 2 +-
> arch/mips/kernel/vmlinux.lds.S | 2 +-
> 3 files changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/arch/mips/Makefile b/arch/mips/Makefile
> index e1c44aed8156..68c0f22fefc0 100644
> --- a/arch/mips/Makefile
> +++ b/arch/mips/Makefile
> @@ -288,12 +288,23 @@ ifdef CONFIG_64BIT
> endif
> endif
>
> +# When linking a 32-bit executable the LLVM linker cannot cope with a
> +# 32-bit load address that has been sign-extended to 64 bits. Simply
> +# remove the upper 32 bits then, as it is safe to do so with other
> +# linkers.
> +ifdef CONFIG_64BIT
> + load-ld = $(load-y)
> +else
> + load-ld = $(subst 0xffffffff,0x,$(load-y))
> +endif
> +
> KBUILD_AFLAGS += $(cflags-y)
> KBUILD_CFLAGS += $(cflags-y)
> -KBUILD_CPPFLAGS += -DVMLINUX_LOAD_ADDRESS=$(load-y)
> +KBUILD_CPPFLAGS += -DVMLINUX_LOAD_ADDRESS=$(load-y) -DLINKER_LOAD_ADDRESS=$(load-ld)
> KBUILD_CPPFLAGS += -DDATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)
>
> bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y) \
> + LINKER_LOAD_ADDRESS=$(load-ld) \
> VMLINUX_ENTRY_ADDRESS=$(entry-y) \
> PLATFORM="$(platform-y)" \
> ITS_INPUTS="$(its-y)"
> diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile
> index 0df0ee8a298d..3d391256ab7e 100644
> --- a/arch/mips/boot/compressed/Makefile
> +++ b/arch/mips/boot/compressed/Makefile
> @@ -90,7 +90,7 @@ ifneq ($(zload-y),)
> VMLINUZ_LOAD_ADDRESS := $(zload-y)
> else
> VMLINUZ_LOAD_ADDRESS = $(shell $(obj)/calc_vmlinuz_load_addr \
> - $(obj)/vmlinux.bin $(VMLINUX_LOAD_ADDRESS))
> + $(obj)/vmlinux.bin $(LINKER_LOAD_ADDRESS))
> endif
> UIMAGE_LOADADDR = $(VMLINUZ_LOAD_ADDRESS)
>
> diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
> index a5f00ec73ea6..5226cd8e4bee 100644
> --- a/arch/mips/kernel/vmlinux.lds.S
> +++ b/arch/mips/kernel/vmlinux.lds.S
> @@ -55,7 +55,7 @@ SECTIONS
> /* . = 0xa800000000300000; */
> . = 0xffffffff80300000;
> #endif
> - . = VMLINUX_LOAD_ADDRESS;
> + . = LINKER_LOAD_ADDRESS;
> /* read-only */
> _text = .; /* Text and read-only data */
> .text : {
>
> --
--
Thanks,
~Nick Desaulniers
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <5e7dc543.vYG3wru8B/me1sOV%chenanqing@oppo.com>]
* Re:
[not found] <5e7dc543.vYG3wru8B/me1sOV%chenanqing@oppo.com>
@ 2020-03-27 15:53 ` Lee Duncan
0 siblings, 0 replies; 417+ messages in thread
From: Lee Duncan @ 2020-03-27 15:53 UTC (permalink / raw)
To: chenanqing, linux-kernel, linux-scsi, open-iscsi, ceph-devel,
martin.petersen, jejb, cleech
On 3/27/20 2:20 AM, chenanqing@oppo.com wrote:
> From: Chen Anqing <chenanqing@oppo.com>
> To: Lee Duncan <lduncan@suse.com>
> Cc: Chris Leech <cleech@redhat.com>,
> "James E . J . Bottomley" <jejb@linux.ibm.com>,
> "Martin K . Petersen" <martin.petersen@oracle.com>,
> ceph-devel@vger.kernel.org,
> open-iscsi@googlegroups.com,
> linux-scsi@vger.kernel.org,
> linux-kernel@vger.kernel.org,
> chenanqing@oppo.com
> Subject: [PATCH] scsi: libiscsi: we should take compound page into account also
> Date: Fri, 27 Mar 2020 05:20:01 -0400
> Message-Id: <20200327092001.56879-1-chenanqing@oppo.com>
> X-Mailer: git-send-email 2.18.2
>
> the patch is occur at a real crash,which slab is
> come from a compound page,so we need take the compound page
> into account also.
> fixed commit 08b11eaccfcf ("scsi: libiscsi: fall back to
> sendmsg for slab pages").
>
> Signed-off-by: Chen Anqing <chenanqing@oppo.com>
> ---
> drivers/scsi/libiscsi_tcp.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c
> index 6ef93c7af954..98304e5e1f6f 100644
> --- a/drivers/scsi/libiscsi_tcp.c
> +++ b/drivers/scsi/libiscsi_tcp.c
> @@ -128,7 +128,8 @@ static void iscsi_tcp_segment_map(struct iscsi_segment *segment, int recv)
> * coalescing neighboring slab objects into a single frag which
> * triggers one of hardened usercopy checks.
> */
> - if (!recv && page_count(sg_page(sg)) >= 1 && !PageSlab(sg_page(sg)))
> + if (!recv && page_count(sg_page(sg)) >= 1 &&
> + !PageSlab(compound_head(sg_page(sg))))
> return;
>
> if (recv) {
> --
> 2.18.2
>
This is missing a proper subject ...
^ permalink raw reply [flat|nested] 417+ messages in thread
[parent not found: <5e7dbb10.ulraq/ljeOm297+z%chenanqing@oppo.com>]
* Re:
[not found] <5e7dbb10.ulraq/ljeOm297+z%chenanqing@oppo.com>
@ 2020-03-27 8:59 ` Ilya Dryomov
0 siblings, 0 replies; 417+ messages in thread
From: Ilya Dryomov @ 2020-03-27 8:59 UTC (permalink / raw)
To: chenanqing; +Cc: LKML, netdev, Ceph Development, kuba, Sage Weil, Jeff Layton
On Fri, Mar 27, 2020 at 9:36 AM <chenanqing@oppo.com> wrote:
>
> From: Chen Anqing <chenanqing@oppo.com>
> To: Ilya Dryomov <idryomov@gmail.com>
> Cc: Jeff Layton <jlayton@kernel.org>,
> Sage Weil <sage@redhat.com>,
> Jakub Kicinski <kuba@kernel.org>,
> ceph-devel@vger.kernel.org,
> netdev@vger.kernel.org,
> linux-kernel@vger.kernel.org,
> chenanqing@oppo.com
> Subject: [PATCH] libceph: we should take compound page into account also
> Date: Fri, 27 Mar 2020 04:36:30 -0400
> Message-Id: <20200327083630.36296-1-chenanqing@oppo.com>
> X-Mailer: git-send-email 2.18.2
>
> the patch is occur at a real crash,which slab is
> come from a compound page,so we need take the compound page
> into account also.
> fixed commit 7e241f647dc7 ("libceph: fall back to sendmsg for slab pages")'
>
> Signed-off-by: Chen Anqing <chenanqing@oppo.com>
> ---
> net/ceph/messenger.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index f8ca5edc5f2c..e08c1c334cd9 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -582,7 +582,7 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
> * coalescing neighboring slab objects into a single frag which
> * triggers one of hardened usercopy checks.
> */
> - if (page_count(page) >= 1 && !PageSlab(page))
> + if (page_count(page) >= 1 && !PageSlab(compound_head(page)))
> sendpage = sock->ops->sendpage;
> else
> sendpage = sock_no_sendpage;
Hi Chen,
AFAICT compound pages should already be taken into account, because
PageSlab is defined as:
__PAGEFLAG(Slab, slab, PF_NO_TAIL)
#define __PAGEFLAG(uname, lname, policy) \
TESTPAGEFLAG(uname, lname, policy) \
__SETPAGEFLAG(uname, lname, policy) \
__CLEARPAGEFLAG(uname, lname, policy)
#define TESTPAGEFLAG(uname, lname, policy) \
static __always_inline int Page##uname(struct page *page) \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }
and PF_NO_TAIL policy is defined as:
#define PF_NO_TAIL(page, enforce) ({ \
VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
PF_POISONED_CHECK(compound_head(page)); })
So compound_head() is called behind the scenes.
Could you please explain what crash did you observe in more detail?
Perhaps you backported this patch to an older kernel?
Thanks,
Ilya
^ permalink raw reply [flat|nested] 417+ messages in thread
* (no subject)
@ 2020-03-03 15:27 Gene Chen
2020-03-04 14:56 ` Matthias Brugger
0 siblings, 1 reply; 417+ messages in thread
From: Gene Chen @ 2020-03-03 15:27 UTC (permalink / raw)
To: lee.jones, matthias.bgg
Cc: linux-arm-kernel, linux-mediatek, linux-kernel, gene_chen,
Wilma.Wu, shufan_lee, cy_huang
Add mfd driver for mt6360 pmic chip include
Battery Charger/USB_PD/Flash LED/RGB LED/LDO/Buck
Signed-off-by: Gene Chen <gene_chen@richtek.com
---
drivers/mfd/Kconfig | 12 ++
drivers/mfd/Makefile | 1 +
drivers/mfd/mt6360-core.c | 425 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/mfd/mt6360.h | 240 +++++++++++++++++++++++++
4 files changed, 678 insertions(+)
create mode 100644 drivers/mfd/mt6360-core.c
create mode 100644 include/linux/mfd/mt6360.h
changelogs between v1 & v2
- include missing header file
changelogs between v2 & v3
- add changelogs
changelogs between v3 & v4
- fix Kconfig description
- replace mt6360_pmu_info with mt6360_pmu_data
- replace probe with probe_new
- remove unnecessary irq_chip variable
- remove annotation
- replace MT6360_MFD_CELL with OF_MFD_CELL
changelogs between v4 & v5
- remove unnecessary parse dt function
- use devm_i2c_new_dummy_device
- add base-commit message
changelogs between v5 & v6
- review return value
- remove i2c id_table
- use GPL license v2
changelogs between v6 & v7
- add author description
- replace MT6360_REGMAP_IRQ_REG by REGMAP_IRQ_REG_LINE
- remove mt6360-private.h
changelogs between v7 & v8
- fix kbuild auto reboot by include interrupt header
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 2b20329..0f8c341 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -857,6 +857,18 @@ config MFD_MAX8998
additional drivers must be enabled in order to use the functionality
of the device.
+config MFD_MT6360
+ tristate "Mediatek MT6360 SubPMIC"
+ select MFD_CORE
+ select REGMAP_I2C
+ select REGMAP_IRQ
+ depends on I2C
+ help
+ Say Y here to enable MT6360 PMU/PMIC/LDO functional support.
+ PMU part includes Charger, Flashlight, RGB LED
+ PMIC part includes 2-channel BUCKs and 2-channel LDOs
+ LDO part includes 4-channel LDOs
+
config MFD_MT6397
tristate "MediaTek MT6397 PMIC Support"
select MFD_CORE
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index b83f172..8c35816 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -238,6 +238,7 @@ obj-$(CONFIG_INTEL_SOC_PMIC) += intel-soc-pmic.o
obj-$(CONFIG_INTEL_SOC_PMIC_BXTWC) += intel_soc_pmic_bxtwc.o
obj-$(CONFIG_INTEL_SOC_PMIC_CHTWC) += intel_soc_pmic_chtwc.o
obj-$(CONFIG_INTEL_SOC_PMIC_CHTDC_TI) += intel_soc_pmic_chtdc_ti.o
+obj-$(CONFIG_MFD_MT6360) += mt6360-core.o
mt6397-objs := mt6397-core.o mt6397-irq.o
obj-$(CONFIG_MFD_MT6397) += mt6397.o
obj-$(CONFIG_INTEL_SOC_PMIC_MRFLD) += intel_soc_pmic_mrfld.o
diff --git a/drivers/mfd/mt6360-core.c b/drivers/mfd/mt6360-core.c
new file mode 100644
index 0000000..d1168f8
--- /dev/null
+++ b/drivers/mfd/mt6360-core.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 MediaTek Inc.
+ *
+ * Author: Gene Chen <gene_chen@richtek.com>
+ */
+
+#include <linux/i2c.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mfd/core.h>
+#include <linux/module.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/version.h>
+
+#include <linux/mfd/mt6360.h>
+
+/* reg 0 -> 0 ~ 7 */
+#define MT6360_CHG_TREG_EVT (4)
+#define MT6360_CHG_AICR_EVT (5)
+#define MT6360_CHG_MIVR_EVT (6)
+#define MT6360_PWR_RDY_EVT (7)
+/* REG 1 -> 8 ~ 15 */
+#define MT6360_CHG_BATSYSUV_EVT (9)
+#define MT6360_FLED_CHG_VINOVP_EVT (11)
+#define MT6360_CHG_VSYSUV_EVT (12)
+#define MT6360_CHG_VSYSOV_EVT (13)
+#define MT6360_CHG_VBATOV_EVT (14)
+#define MT6360_CHG_VBUSOV_EVT (15)
+/* REG 2 -> 16 ~ 23 */
+/* REG 3 -> 24 ~ 31 */
+#define MT6360_WD_PMU_DET (25)
+#define MT6360_WD_PMU_DONE (26)
+#define MT6360_CHG_TMRI (27)
+#define MT6360_CHG_ADPBADI (29)
+#define MT6360_CHG_RVPI (30)
+#define MT6360_OTPI (31)
+/* REG 4 -> 32 ~ 39 */
+#define MT6360_CHG_AICCMEASL (32)
+#define MT6360_CHGDET_DONEI (34)
+#define MT6360_WDTMRI (35)
+#define MT6360_SSFINISHI (36)
+#define MT6360_CHG_RECHGI (37)
+#define MT6360_CHG_TERMI (38)
+#define MT6360_CHG_IEOCI (39)
+/* REG 5 -> 40 ~ 47 */
+#define MT6360_PUMPX_DONEI (40)
+#define MT6360_BAT_OVP_ADC_EVT (41)
+#define MT6360_TYPEC_OTP_EVT (42)
+#define MT6360_ADC_WAKEUP_EVT (43)
+#define MT6360_ADC_DONEI (44)
+#define MT6360_BST_BATUVI (45)
+#define MT6360_BST_VBUSOVI (46)
+#define MT6360_BST_OLPI (47)
+/* REG 6 -> 48 ~ 55 */
+#define MT6360_ATTACH_I (48)
+#define MT6360_DETACH_I (49)
+#define MT6360_QC30_STPDONE (51)
+#define MT6360_QC_VBUSDET_DONE (52)
+#define MT6360_HVDCP_DET (53)
+#define MT6360_CHGDETI (54)
+#define MT6360_DCDTI (55)
+/* REG 7 -> 56 ~ 63 */
+#define MT6360_FOD_DONE_EVT (56)
+#define MT6360_FOD_OV_EVT (57)
+#define MT6360_CHRDET_UVP_EVT (58)
+#define MT6360_CHRDET_OVP_EVT (59)
+#define MT6360_CHRDET_EXT_EVT (60)
+#define MT6360_FOD_LR_EVT (61)
+#define MT6360_FOD_HR_EVT (62)
+#define MT6360_FOD_DISCHG_FAIL_EVT (63)
+/* REG 8 -> 64 ~ 71 */
+#define MT6360_USBID_EVT (64)
+#define MT6360_APWDTRST_EVT (65)
+#define MT6360_EN_EVT (66)
+#define MT6360_QONB_RST_EVT (67)
+#define MT6360_MRSTB_EVT (68)
+#define MT6360_OTP_EVT (69)
+#define MT6360_VDDAOV_EVT (70)
+#define MT6360_SYSUV_EVT (71)
+/* REG 9 -> 72 ~ 79 */
+#define MT6360_FLED_STRBPIN_EVT (72)
+#define MT6360_FLED_TORPIN_EVT (73)
+#define MT6360_FLED_TX_EVT (74)
+#define MT6360_FLED_LVF_EVT (75)
+#define MT6360_FLED2_SHORT_EVT (78)
+#define MT6360_FLED1_SHORT_EVT (79)
+/* REG 10 -> 80 ~ 87 */
+#define MT6360_FLED2_STRB_EVT (80)
+#define MT6360_FLED1_STRB_EVT (81)
+#define MT6360_FLED2_STRB_TO_EVT (82)
+#define MT6360_FLED1_STRB_TO_EVT (83)
+#define MT6360_FLED2_TOR_EVT (84)
+#define MT6360_FLED1_TOR_EVT (85)
+/* REG 11 -> 88 ~ 95 */
+/* REG 12 -> 96 ~ 103 */
+#define MT6360_BUCK1_PGB_EVT (96)
+#define MT6360_BUCK1_OC_EVT (100)
+#define MT6360_BUCK1_OV_EVT (101)
+#define MT6360_BUCK1_UV_EVT (102)
+/* REG 13 -> 104 ~ 111 */
+#define MT6360_BUCK2_PGB_EVT (104)
+#define MT6360_BUCK2_OC_EVT (108)
+#define MT6360_BUCK2_OV_EVT (109)
+#define MT6360_BUCK2_UV_EVT (110)
+/* REG 14 -> 112 ~ 119 */
+#define MT6360_LDO1_OC_EVT (113)
+#define MT6360_LDO2_OC_EVT (114)
+#define MT6360_LDO3_OC_EVT (115)
+#define MT6360_LDO5_OC_EVT (117)
+#define MT6360_LDO6_OC_EVT (118)
+#define MT6360_LDO7_OC_EVT (119)
+/* REG 15 -> 120 ~ 127 */
+#define MT6360_LDO1_PGB_EVT (121)
+#define MT6360_LDO2_PGB_EVT (122)
+#define MT6360_LDO3_PGB_EVT (123)
+#define MT6360_LDO5_PGB_EVT (125)
+#define MT6360_LDO6_PGB_EVT (126)
+#define MT6360_LDO7_PGB_EVT (127)
+
+static const struct regmap_irq mt6360_pmu_irqs[] = {
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_AICR_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_MIVR_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_PWR_RDY_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_BATSYSUV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED_CHG_VINOVP_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_VSYSUV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_VSYSOV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_VBATOV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_VBUSOV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_WD_PMU_DET, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_WD_PMU_DONE, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_TMRI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_ADPBADI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_RVPI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_OTPI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_AICCMEASL, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHGDET_DONEI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_WDTMRI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_SSFINISHI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_RECHGI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_TERMI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_IEOCI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_PUMPX_DONEI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHG_TREG_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BAT_OVP_ADC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_TYPEC_OTP_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_ADC_WAKEUP_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_ADC_DONEI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BST_BATUVI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BST_VBUSOVI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BST_OLPI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_ATTACH_I, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_DETACH_I, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_QC30_STPDONE, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_QC_VBUSDET_DONE, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_HVDCP_DET, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHGDETI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_DCDTI, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FOD_DONE_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FOD_OV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHRDET_UVP_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHRDET_OVP_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_CHRDET_EXT_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FOD_LR_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FOD_HR_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FOD_DISCHG_FAIL_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_USBID_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_APWDTRST_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_EN_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_QONB_RST_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_MRSTB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_OTP_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_VDDAOV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_SYSUV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED_STRBPIN_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED_TORPIN_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED_TX_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED_LVF_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED2_SHORT_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED1_SHORT_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED2_STRB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED1_STRB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED2_STRB_TO_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED1_STRB_TO_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED2_TOR_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_FLED1_TOR_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK1_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK1_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK1_OV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK1_UV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK2_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK2_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK2_OV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_BUCK2_UV_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO1_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO2_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO3_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO5_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO6_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO7_OC_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO1_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO2_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO3_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO5_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO6_PGB_EVT, 8),
+ REGMAP_IRQ_REG_LINE(MT6360_LDO7_PGB_EVT, 8),
+};
+
+static int mt6360_pmu_handle_post_irq(void *irq_drv_data)
+{
+ struct mt6360_pmu_data *mpd = irq_drv_data;
+
+ return regmap_update_bits(mpd->regmap,
+ MT6360_PMU_IRQ_SET, MT6360_IRQ_RETRIG, MT6360_IRQ_RETRIG);
+}
+
+static struct regmap_irq_chip mt6360_pmu_irq_chip = {
+ .irqs = mt6360_pmu_irqs,
+ .num_irqs = ARRAY_SIZE(mt6360_pmu_irqs),
+ .num_regs = MT6360_PMU_IRQ_REGNUM,
+ .mask_base = MT6360_PMU_CHG_MASK1,
+ .status_base = MT6360_PMU_CHG_IRQ1,
+ .ack_base = MT6360_PMU_CHG_IRQ1,
+ .init_ack_masked = true,
+ .use_ack = true,
+ .handle_post_irq = mt6360_pmu_handle_post_irq,
+};
+
+static const struct regmap_config mt6360_pmu_regmap_config = {
+ .reg_bits = 8,
+ .val_bits = 8,
+ .max_register = MT6360_PMU_MAXREG,
+};
+
+static const struct resource mt6360_adc_resources[] = {
+ DEFINE_RES_IRQ_NAMED(MT6360_ADC_DONEI, "adc_donei"),
+};
+
+static const struct resource mt6360_chg_resources[] = {
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_TREG_EVT, "chg_treg_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_PWR_RDY_EVT, "pwr_rdy_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_BATSYSUV_EVT, "chg_batsysuv_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_VSYSUV_EVT, "chg_vsysuv_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_VSYSOV_EVT, "chg_vsysov_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_VBATOV_EVT, "chg_vbatov_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_VBUSOV_EVT, "chg_vbusov_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_AICCMEASL, "chg_aiccmeasl"),
+ DEFINE_RES_IRQ_NAMED(MT6360_WDTMRI, "wdtmri"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_RECHGI, "chg_rechgi"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_TERMI, "chg_termi"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHG_IEOCI, "chg_ieoci"),
+ DEFINE_RES_IRQ_NAMED(MT6360_PUMPX_DONEI, "pumpx_donei"),
+ DEFINE_RES_IRQ_NAMED(MT6360_ATTACH_I, "attach_i"),
+ DEFINE_RES_IRQ_NAMED(MT6360_CHRDET_EXT_EVT, "chrdet_ext_evt"),
+};
+
+static const struct resource mt6360_led_resources[] = {
+ DEFINE_RES_IRQ_NAMED(MT6360_FLED_CHG_VINOVP_EVT, "fled_chg_vinovp_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_FLED_LVF_EVT, "fled_lvf_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_FLED2_SHORT_EVT, "fled2_short_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_FLED1_SHORT_EVT, "fled1_short_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_FLED2_STRB_TO_EVT, "fled2_strb_to_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_FLED1_STRB_TO_EVT, "fled1_strb_to_evt"),
+};
+
+static const struct resource mt6360_pmic_resources[] = {
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_PGB_EVT, "buck1_pgb_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_OC_EVT, "buck1_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_OV_EVT, "buck1_ov_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK1_UV_EVT, "buck1_uv_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_PGB_EVT, "buck2_pgb_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_OC_EVT, "buck2_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_OV_EVT, "buck2_ov_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_BUCK2_UV_EVT, "buck2_uv_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO6_OC_EVT, "ldo6_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO7_OC_EVT, "ldo7_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO6_PGB_EVT, "ldo6_pgb_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO7_PGB_EVT, "ldo7_pgb_evt"),
+};
+
+static const struct resource mt6360_ldo_resources[] = {
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO1_OC_EVT, "ldo1_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO2_OC_EVT, "ldo2_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO3_OC_EVT, "ldo3_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO5_OC_EVT, "ldo5_oc_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO1_PGB_EVT, "ldo1_pgb_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO2_PGB_EVT, "ldo2_pgb_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO3_PGB_EVT, "ldo3_pgb_evt"),
+ DEFINE_RES_IRQ_NAMED(MT6360_LDO5_PGB_EVT, "ldo5_pgb_evt"),
+};
+
+static const struct mfd_cell mt6360_devs[] = {
+ OF_MFD_CELL("mt6360_adc", mt6360_adc_resources,
+ NULL, 0, 0, "mediatek,mt6360_adc"),
+ OF_MFD_CELL("mt6360_chg", mt6360_chg_resources,
+ NULL, 0, 0, "mediatek,mt6360_chg"),
+ OF_MFD_CELL("mt6360_led", mt6360_led_resources,
+ NULL, 0, 0, "mediatek,mt6360_led"),
+ OF_MFD_CELL("mt6360_pmic", mt6360_pmic_resources,
+ NULL, 0, 0, "mediatek,mt6360_pmic"),
+ OF_MFD_CELL("mt6360_ldo", mt6360_ldo_resources,
+ NULL, 0, 0, "mediatek,mt6360_ldo"),
+ OF_MFD_CELL("mt6360_tcpc", NULL,
+ NULL, 0, 0, "mediatek,mt6360_tcpc"),
+};
+
+static const unsigned short mt6360_slave_addr[MT6360_SLAVE_MAX] = {
+ MT6360_PMU_SLAVEID,
+ MT6360_PMIC_SLAVEID,
+ MT6360_LDO_SLAVEID,
+ MT6360_TCPC_SLAVEID,
+};
+
+static int mt6360_pmu_probe(struct i2c_client *client)
+{
+ struct mt6360_pmu_data *mpd;
+ unsigned int reg_data;
+ int i, ret;
+
+ mpd = devm_kzalloc(&client->dev, sizeof(*mpd), GFP_KERNEL);
+ if (!mpd)
+ return -ENOMEM;
+
+ mpd->dev = &client->dev;
+ i2c_set_clientdata(client, mpd);
+
+ mpd->regmap = devm_regmap_init_i2c(client, &mt6360_pmu_regmap_config);
+ if (IS_ERR(mpd->regmap)) {
+ dev_err(&client->dev, "Failed to register regmap\n");
+ return PTR_ERR(mpd->regmap);
+ }
+
+ ret = regmap_read(mpd->regmap, MT6360_PMU_DEV_INFO, ®_data);
+ if (ret) {
+ dev_err(&client->dev, "Device not found\n");
+ return ret;
+ }
+
+ mpd->chip_rev = reg_data & CHIP_REV_MASK;
+ if (mpd->chip_rev != CHIP_VEN_MT6360) {
+ dev_err(&client->dev, "Device not supported\n");
+ return -ENODEV;
+ }
+
+ mt6360_pmu_irq_chip.irq_drv_data = mpd;
+ ret = devm_regmap_add_irq_chip(&client->dev, mpd->regmap, client->irq,
+ IRQF_TRIGGER_FALLING, 0,
+ &mt6360_pmu_irq_chip, &mpd->irq_data);
+ if (ret) {
+ dev_err(&client->dev, "Failed to add Regmap IRQ Chip\n");
+ return ret;
+ }
+
+ mpd->i2c[0] = client;
+ for (i = 1; i < MT6360_SLAVE_MAX; i++) {
+ mpd->i2c[i] = devm_i2c_new_dummy_device(&client->dev,
+ client->adapter,
+ mt6360_slave_addr[i]);
+ if (IS_ERR(mpd->i2c[i])) {
+ dev_err(&client->dev,
+ "Failed to get new dummy I2C device for address 0x%x",
+ mt6360_slave_addr[i]);
+ return PTR_ERR(mpd->i2c[i]);
+ }
+ i2c_set_clientdata(mpd->i2c[i], mpd);
+ }
+
+ ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO,
+ mt6360_devs, ARRAY_SIZE(mt6360_devs), NULL,
+ 0, regmap_irq_get_domain(mpd->irq_data));
+ if (ret) {
+ dev_err(&client->dev,
+ "Failed to register subordinate devices\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int __maybe_unused mt6360_pmu_suspend(struct device *dev)
+{
+ struct i2c_client *i2c = to_i2c_client(dev);
+
+ if (device_may_wakeup(dev))
+ enable_irq_wake(i2c->irq);
+
+ return 0;
+}
+
+static int __maybe_unused mt6360_pmu_resume(struct device *dev)
+{
+
+ struct i2c_client *i2c = to_i2c_client(dev);
+
+ if (device_may_wakeup(dev))
+ disable_irq_wake(i2c->irq);
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(mt6360_pmu_pm_ops,
+ mt6360_pmu_suspend, mt6360_pmu_resume);
+
+static const struct of_device_id __maybe_unused mt6360_pmu_of_id[] = {
+ { .compatible = "mediatek,mt6360_pmu", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, mt6360_pmu_of_id);
+
+static struct i2c_driver mt6360_pmu_driver = {
+ .driver = {
+ .pm = &mt6360_pmu_pm_ops,
+ .of_match_table = of_match_ptr(mt6360_pmu_of_id),
+ },
+ .probe_new = mt6360_pmu_probe,
+};
+module_i2c_driver(mt6360_pmu_driver);
+
+MODULE_AUTHOR("Gene Chen <gene_chen@richtek.com>");
+MODULE_DESCRIPTION("MT6360 PMU I2C Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mfd/mt6360.h b/include/linux/mfd/mt6360.h
new file mode 100644
index 0000000..c03e6d1
--- /dev/null
+++ b/include/linux/mfd/mt6360.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2019 MediaTek Inc.
+ */
+
+#ifndef __MT6360_H__
+#define __MT6360_H__
+
+#include <linux/regmap.h>
+
+enum {
+ MT6360_SLAVE_PMU = 0,
+ MT6360_SLAVE_PMIC,
+ MT6360_SLAVE_LDO,
+ MT6360_SLAVE_TCPC,
+ MT6360_SLAVE_MAX,
+};
+
+#define MT6360_PMU_SLAVEID (0x34)
+#define MT6360_PMIC_SLAVEID (0x1A)
+#define MT6360_LDO_SLAVEID (0x64)
+#define MT6360_TCPC_SLAVEID (0x4E)
+
+struct mt6360_pmu_data {
+ struct i2c_client *i2c[MT6360_SLAVE_MAX];
+ struct device *dev;
+ struct regmap *regmap;
+ struct regmap_irq_chip_data *irq_data;
+ unsigned int chip_rev;
+};
+
+/* PMU register defininition */
+#define MT6360_PMU_DEV_INFO (0x00)
+#define MT6360_PMU_CORE_CTRL1 (0x01)
+#define MT6360_PMU_RST1 (0x02)
+#define MT6360_PMU_CRCEN (0x03)
+#define MT6360_PMU_RST_PAS_CODE1 (0x04)
+#define MT6360_PMU_RST_PAS_CODE2 (0x05)
+#define MT6360_PMU_CORE_CTRL2 (0x06)
+#define MT6360_PMU_TM_PAS_CODE1 (0x07)
+#define MT6360_PMU_TM_PAS_CODE2 (0x08)
+#define MT6360_PMU_TM_PAS_CODE3 (0x09)
+#define MT6360_PMU_TM_PAS_CODE4 (0x0A)
+#define MT6360_PMU_IRQ_IND (0x0B)
+#define MT6360_PMU_IRQ_MASK (0x0C)
+#define MT6360_PMU_IRQ_SET (0x0D)
+#define MT6360_PMU_SHDN_CTRL (0x0E)
+#define MT6360_PMU_TM_INF (0x0F)
+#define MT6360_PMU_I2C_CTRL (0x10)
+#define MT6360_PMU_CHG_CTRL1 (0x11)
+#define MT6360_PMU_CHG_CTRL2 (0x12)
+#define MT6360_PMU_CHG_CTRL3 (0x13)
+#define MT6360_PMU_CHG_CTRL4 (0x14)
+#define MT6360_PMU_CHG_CTRL5 (0x15)
+#define MT6360_PMU_CHG_CTRL6 (0x16)
+#define MT6360_PMU_CHG_CTRL7 (0x17)
+#define MT6360_PMU_CHG_CTRL8 (0x18)
+#define MT6360_PMU_CHG_CTRL9 (0x19)
+#define MT6360_PMU_CHG_CTRL10 (0x1A)
+#define MT6360_PMU_CHG_CTRL11 (0x1B)
+#define MT6360_PMU_CHG_CTRL12 (0x1C)
+#define MT6360_PMU_CHG_CTRL13 (0x1D)
+#define MT6360_PMU_CHG_CTRL14 (0x1E)
+#define MT6360_PMU_CHG_CTRL15 (0x1F)
+#define MT6360_PMU_CHG_CTRL16 (0x20)
+#define MT6360_PMU_CHG_AICC_RESULT (0x21)
+#define MT6360_PMU_DEVICE_TYPE (0x22)
+#define MT6360_PMU_QC_CONTROL1 (0x23)
+#define MT6360_PMU_QC_CONTROL2 (0x24)
+#define MT6360_PMU_QC30_CONTROL1 (0x25)
+#define MT6360_PMU_QC30_CONTROL2 (0x26)
+#define MT6360_PMU_USB_STATUS1 (0x27)
+#define MT6360_PMU_QC_STATUS1 (0x28)
+#define MT6360_PMU_QC_STATUS2 (0x29)
+#define MT6360_PMU_CHG_PUMP (0x2A)
+#define MT6360_PMU_CHG_CTRL17 (0x2B)
+#define MT6360_PMU_CHG_CTRL18 (0x2C)
+#define MT6360_PMU_CHRDET_CTRL1 (0x2D)
+#define MT6360_PMU_CHRDET_CTRL2 (0x2E)
+#define MT6360_PMU_DPDN_CTRL (0x2F)
+#define MT6360_PMU_CHG_HIDDEN_CTRL1 (0x30)
+#define MT6360_PMU_CHG_HIDDEN_CTRL2 (0x31)
+#define MT6360_PMU_CHG_HIDDEN_CTRL3 (0x32)
+#define MT6360_PMU_CHG_HIDDEN_CTRL4 (0x33)
+#define MT6360_PMU_CHG_HIDDEN_CTRL5 (0x34)
+#define MT6360_PMU_CHG_HIDDEN_CTRL6 (0x35)
+#define MT6360_PMU_CHG_HIDDEN_CTRL7 (0x36)
+#define MT6360_PMU_CHG_HIDDEN_CTRL8 (0x37)
+#define MT6360_PMU_CHG_HIDDEN_CTRL9 (0x38)
+#define MT6360_PMU_CHG_HIDDEN_CTRL10 (0x39)
+#define MT6360_PMU_CHG_HIDDEN_CTRL11 (0x3A)
+#define MT6360_PMU_CHG_HIDDEN_CTRL12 (0x3B)
+#define MT6360_PMU_CHG_HIDDEN_CTRL13 (0x3C)
+#define MT6360_PMU_CHG_HIDDEN_CTRL14 (0x3D)
+#define MT6360_PMU_CHG_HIDDEN_CTRL15 (0x3E)
+#define MT6360_PMU_CHG_HIDDEN_CTRL16 (0x3F)
+#define MT6360_PMU_CHG_HIDDEN_CTRL17 (0x40)
+#define MT6360_PMU_CHG_HIDDEN_CTRL18 (0x41)
+#define MT6360_PMU_CHG_HIDDEN_CTRL19 (0x42)
+#define MT6360_PMU_CHG_HIDDEN_CTRL20 (0x43)
+#define MT6360_PMU_CHG_HIDDEN_CTRL21 (0x44)
+#define MT6360_PMU_CHG_HIDDEN_CTRL22 (0x45)
+#define MT6360_PMU_CHG_HIDDEN_CTRL23 (0x46)
+#define MT6360_PMU_CHG_HIDDEN_CTRL24 (0x47)
+#define MT6360_PMU_CHG_HIDDEN_CTRL25 (0x48)
+#define MT6360_PMU_BC12_CTRL (0x49)
+#define MT6360_PMU_CHG_STAT (0x4A)
+#define MT6360_PMU_RESV1 (0x4B)
+#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEH (0x4E)
+#define MT6360_PMU_TYPEC_OTP_TH_SEL_CODEL (0x4F)
+#define MT6360_PMU_TYPEC_OTP_HYST_TH (0x50)
+#define MT6360_PMU_TYPEC_OTP_CTRL (0x51)
+#define MT6360_PMU_ADC_BAT_DATA_H (0x52)
+#define MT6360_PMU_ADC_BAT_DATA_L (0x53)
+#define MT6360_PMU_IMID_BACKBST_ON (0x54)
+#define MT6360_PMU_IMID_BACKBST_OFF (0x55)
+#define MT6360_PMU_ADC_CONFIG (0x56)
+#define MT6360_PMU_ADC_EN2 (0x57)
+#define MT6360_PMU_ADC_IDLE_T (0x58)
+#define MT6360_PMU_ADC_RPT_1 (0x5A)
+#define MT6360_PMU_ADC_RPT_2 (0x5B)
+#define MT6360_PMU_ADC_RPT_3 (0x5C)
+#define MT6360_PMU_ADC_RPT_ORG1 (0x5D)
+#define MT6360_PMU_ADC_RPT_ORG2 (0x5E)
+#define MT6360_PMU_BAT_OVP_TH_SEL_CODEH (0x5F)
+#define MT6360_PMU_BAT_OVP_TH_SEL_CODEL (0x60)
+#define MT6360_PMU_CHG_CTRL19 (0x61)
+#define MT6360_PMU_VDDASUPPLY (0x62)
+#define MT6360_PMU_BC12_MANUAL (0x63)
+#define MT6360_PMU_CHGDET_FUNC (0x64)
+#define MT6360_PMU_FOD_CTRL (0x65)
+#define MT6360_PMU_CHG_CTRL20 (0x66)
+#define MT6360_PMU_CHG_HIDDEN_CTRL26 (0x67)
+#define MT6360_PMU_CHG_HIDDEN_CTRL27 (0x68)
+#define MT6360_PMU_RESV2 (0x69)
+#define MT6360_PMU_USBID_CTRL1 (0x6D)
+#define MT6360_PMU_USBID_CTRL2 (0x6E)
+#define MT6360_PMU_USBID_CTRL3 (0x6F)
+#define MT6360_PMU_FLED_CFG (0x70)
+#define MT6360_P