Linux-ext4 Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
@ 2019-08-20 17:08 Sebastian Siewior
  2019-08-20 17:17 ` Matthew Wilcox
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Siewior @ 2019-08-20 17:08 UTC (permalink / raw)
  To: LKML, linux-fsdevel, linux-ext4
  Cc: Thomas Gleixner, Peter Zijlstra, Ingo Molnar,
	Anna-Maria Gleixner, Steven Rostedt, Julia Cartwright, Jan Kara,
	Theodore Tso, Matthew Wilcox, Alexander Viro, Jan Kara,
	Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

From: Thomas Gleixner <tglx@linutronix.de>

Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
disable preemption, which is undesired for latency reasons and breaks when
regular spinlocks are taken within the bit_spinlock locked region because
regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
replaces the bit spinlocks with regular spinlocks to avoid this problem.
Bit spinlocks are also not covered by lock debugging, e.g. lockdep.

Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: remove the wrapper and use always spinlock_t]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 fs/buffer.c                 | 19 +++++++------------
 fs/ext4/page-io.c           |  8 +++-----
 fs/ntfs/aops.c              |  9 +++------
 include/linux/buffer_head.h |  6 +++---
 4 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 131d39ec7d316..eab37fbaa439f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,8 +275,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	 * decide that the page is now completely done.
 	 */
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -289,8 +288,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 
 	/*
 	 * If none of the buffers had errors and they are all
@@ -302,8 +300,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	return;
 }
 
@@ -331,8 +328,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	}
 
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->uptodate_lock, flags);
 
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
@@ -344,14 +340,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	}
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	return;
 }
 EXPORT_SYMBOL(end_buffer_async_write);
@@ -3420,6 +3414,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
+		spin_lock_init(&ret->uptodate_lock);
 		preempt_disable();
 		__this_cpu_inc(bh_accounting.nr);
 		recalc_bh_state();
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 12ceadef32c5a..7745ed23c6ad9 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -87,11 +87,10 @@ static void ext4_finish_bio(struct bio *bio)
 		}
 		bh = head = page_buffers(page);
 		/*
-		 * We check all buffers in the page under BH_Uptodate_Lock
+		 * We check all buffers in the page under uptodate_lock
 		 * to avoid races with other end io clearing async_write flags
 		 */
-		local_irq_save(flags);
-		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+		spin_lock_irqsave(&head->uptodate_lock, flags);
 		do {
 			if (bh_offset(bh) < bio_start ||
 			    bh_offset(bh) + bh->b_size > bio_end) {
@@ -103,8 +102,7 @@ static void ext4_finish_bio(struct bio *bio)
 			if (bio->bi_status)
 				buffer_io_error(bh);
 		} while ((bh = bh->b_this_page) != head);
-		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-		local_irq_restore(flags);
+		spin_unlock_irqrestore(&head->uptodate_lock, flags);
 		if (!under_io) {
 			fscrypt_free_bounce_page(bounce_page);
 			end_page_writeback(page);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7202a1e39d70c..14ca433b3a9e4 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	unlock_page(page);
 	return;
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	return;
 }
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7b73ef7f902d4..c8f2a3076ce00 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -22,9 +22,6 @@ enum bh_state_bits {
 	BH_Dirty,	/* Is dirty */
 	BH_Lock,	/* Is locked */
 	BH_Req,		/* Has been submitted for I/O */
-	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
-			  * IO completion of other buffers in the page
-			  */
 
 	BH_Mapped,	/* Has a disk mapping */
 	BH_New,		/* Disk mapping was newly created by get_block */
@@ -62,6 +59,9 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
  */
 struct buffer_head {
 	unsigned long b_state;		/* buffer state bitmap (see above) */
+	spinlock_t	uptodate_lock;	/* Used by the first bh in a page, to
+					 * serialise IO completion of other
+					 * buffers in the page */
 	struct buffer_head *b_this_page;/* circular list of page's buffers */
 	struct page *b_page;		/* the page this bh is mapped to */
 
-- 
2.23.0.rc1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-08-20 17:08 [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t Sebastian Siewior
@ 2019-08-20 17:17 ` Matthew Wilcox
  2019-08-20 18:01   ` Thomas Gleixner
  0 siblings, 1 reply; 13+ messages in thread
From: Matthew Wilcox @ 2019-08-20 17:17 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: LKML, linux-fsdevel, linux-ext4, Thomas Gleixner, Peter Zijlstra,
	Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Jan Kara, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On Tue, Aug 20, 2019 at 07:08:18PM +0200, Sebastian Siewior wrote:
> Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
> disable preemption, which is undesired for latency reasons and breaks when
> regular spinlocks are taken within the bit_spinlock locked region because
> regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
> replaces the bit spinlocks with regular spinlocks to avoid this problem.
> Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
> 
> Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> [bigeasy: remove the wrapper and use always spinlock_t]

Uhh ... always grow the buffer_head, even for non-PREEMPT_RT?  Why?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-08-20 17:17 ` Matthew Wilcox
@ 2019-08-20 18:01   ` Thomas Gleixner
  2019-10-11 11:25     ` Sebastian Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: Thomas Gleixner @ 2019-08-20 18:01 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Sebastian Siewior, LKML, linux-fsdevel, linux-ext4,
	Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Jan Kara, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On Tue, 20 Aug 2019, Matthew Wilcox wrote:
> On Tue, Aug 20, 2019 at 07:08:18PM +0200, Sebastian Siewior wrote:
> > Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
> > disable preemption, which is undesired for latency reasons and breaks when
> > regular spinlocks are taken within the bit_spinlock locked region because
> > regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
> > replaces the bit spinlocks with regular spinlocks to avoid this problem.
> > Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
> > 
> > Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
> > 
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > [bigeasy: remove the wrapper and use always spinlock_t]
> 
> Uhh ... always grow the buffer_head, even for non-PREEMPT_RT?  Why?

Christoph requested that:

  https://lkml.kernel.org/r/20190802075612.GA20962@infradead.org

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-08-20 18:01   ` Thomas Gleixner
@ 2019-10-11 11:25     ` Sebastian Siewior
  2019-11-15 14:56       ` Jan Kara
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Siewior @ 2019-10-11 11:25 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Matthew Wilcox, LKML, linux-fsdevel, linux-ext4, Peter Zijlstra,
	Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Jan Kara, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On 2019-08-20 20:01:14 [+0200], Thomas Gleixner wrote:
> On Tue, 20 Aug 2019, Matthew Wilcox wrote:
> > On Tue, Aug 20, 2019 at 07:08:18PM +0200, Sebastian Siewior wrote:
> > > Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
> > > disable preemption, which is undesired for latency reasons and breaks when
> > > regular spinlocks are taken within the bit_spinlock locked region because
> > > regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
> > > replaces the bit spinlocks with regular spinlocks to avoid this problem.
> > > Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
> > > 
> > > Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
> > > 
> > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > > [bigeasy: remove the wrapper and use always spinlock_t]
> > 
> > Uhh ... always grow the buffer_head, even for non-PREEMPT_RT?  Why?
> 
> Christoph requested that:
> 
>   https://lkml.kernel.org/r/20190802075612.GA20962@infradead.org

What do we do about this one?

> Thanks,
> 
> 	tglx

Sebastian

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-10-11 11:25     ` Sebastian Siewior
@ 2019-11-15 14:56       ` Jan Kara
  2019-11-15 17:36         ` Theodore Y. Ts'o
  2019-11-15 17:54         ` [PATCH v2] " Sebastian Siewior
  0 siblings, 2 replies; 13+ messages in thread
From: Jan Kara @ 2019-11-15 14:56 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel, linux-ext4,
	Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Jan Kara, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On Fri 11-10-19 13:25:25, Sebastian Siewior wrote:
> On 2019-08-20 20:01:14 [+0200], Thomas Gleixner wrote:
> > On Tue, 20 Aug 2019, Matthew Wilcox wrote:
> > > On Tue, Aug 20, 2019 at 07:08:18PM +0200, Sebastian Siewior wrote:
> > > > Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
> > > > disable preemption, which is undesired for latency reasons and breaks when
> > > > regular spinlocks are taken within the bit_spinlock locked region because
> > > > regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
> > > > replaces the bit spinlocks with regular spinlocks to avoid this problem.
> > > > Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
> > > > 
> > > > Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
> > > > 
> > > > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > > > [bigeasy: remove the wrapper and use always spinlock_t]
> > > 
> > > Uhh ... always grow the buffer_head, even for non-PREEMPT_RT?  Why?
> > 
> > Christoph requested that:
> > 
> >   https://lkml.kernel.org/r/20190802075612.GA20962@infradead.org
> 
> What do we do about this one?

I was thinking about this for quite some time. In the end I think the patch
is almost fine but I'd name the lock b_update_lock and put it just after
b_size element in struct buffer_head to use the hole there. That way we
don't grow struct buffer_head.

With some effort, we could even shrink struct buffer_head from 104 bytes
(on x86_64) to 96 bytes but I don't think that effort is worth it (I'd find
it better use of time to actually work on getting rid of buffer heads
completely).

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-15 14:56       ` Jan Kara
@ 2019-11-15 17:36         ` Theodore Y. Ts'o
  2019-11-18  9:35           ` Jan Kara
  2019-11-15 17:54         ` [PATCH v2] " Sebastian Siewior
  1 sibling, 1 reply; 13+ messages in thread
From: Theodore Y. Ts'o @ 2019-11-15 17:36 UTC (permalink / raw)
  To: Jan Kara
  Cc: Sebastian Siewior, Thomas Gleixner, Matthew Wilcox, LKML,
	linux-fsdevel, linux-ext4, Peter Zijlstra, Ingo Molnar,
	Anna-Maria Gleixner, Steven Rostedt, Julia Cartwright,
	Alexander Viro, Jan Kara, Mark Fasheh, Joseph Qi,
	Christoph Hellwig, Joel Becker

On Fri, Nov 15, 2019 at 03:56:38PM +0100, Jan Kara wrote:
> With some effort, we could even shrink struct buffer_head from 104 bytes
> (on x86_64) to 96 bytes but I don't think that effort is worth it (I'd find
> it better use of time to actually work on getting rid of buffer heads
> completely).

Is that really realistic?  All aside from the very large number of
file systems which use buffer_heads that would have to be reworked,
the concept of buffer heads is pretty fundamental to how jbd2 is
architected.

					- Ted

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v2] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-15 14:56       ` Jan Kara
  2019-11-15 17:36         ` Theodore Y. Ts'o
@ 2019-11-15 17:54         ` " Sebastian Siewior
  2019-11-18  9:38           ` Jan Kara
  1 sibling, 1 reply; 13+ messages in thread
From: Sebastian Siewior @ 2019-11-15 17:54 UTC (permalink / raw)
  To: Jan Kara
  Cc: Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel, linux-ext4,
	Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Theodore Tso, Alexander Viro, Jan Kara,
	Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

From: Thomas Gleixner <tglx@linutronix.de>

Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
disable preemption, which is undesired for latency reasons and breaks when
regular spinlocks are taken within the bit_spinlock locked region because
regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
replaces the bit spinlocks with regular spinlocks to avoid this problem.
Bit spinlocks are also not covered by lock debugging, e.g. lockdep.

Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: remove the wrapper and use always spinlock_t and move it into
          the padding hole]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---

v1…v2: Move the spinlock_t to the padding hole as per Jan Kara. pahole says
its total size remained unchanged, before

| atomic_t                   b_count;              /*    96     4 */
|
| /* size: 104, cachelines: 2, members: 12 */
| /* padding: 4 */
| /* last cacheline: 40 bytes */

after

| atomic_t                   b_count;              /*    96     4 */
| spinlock_t                 uptodate_lock;        /*   100     4 */
|
| /* size: 104, cachelines: 2, members: 13 */
| /* last cacheline: 40 bytes */

 fs/buffer.c                 | 19 +++++++------------
 fs/ext4/page-io.c           |  8 +++-----
 fs/ntfs/aops.c              |  9 +++------
 include/linux/buffer_head.h |  6 +++---
 4 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 0ac4ae15ea4dc..2c6f2b41a88e4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -277,8 +277,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	 * decide that the page is now completely done.
 	 */
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -291,8 +290,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 
 	/*
 	 * If none of the buffers had errors and they are all
@@ -304,8 +302,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	return;
 }
 
@@ -333,8 +330,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	}
 
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->uptodate_lock, flags);
 
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
@@ -346,14 +342,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	}
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	return;
 }
 EXPORT_SYMBOL(end_buffer_async_write);
@@ -3422,6 +3416,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
+		spin_lock_init(&ret->uptodate_lock);
 		preempt_disable();
 		__this_cpu_inc(bh_accounting.nr);
 		recalc_bh_state();
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 893913d1c2fe3..592816713e0d6 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -125,11 +125,10 @@ static void ext4_finish_bio(struct bio *bio)
 		}
 		bh = head = page_buffers(page);
 		/*
-		 * We check all buffers in the page under BH_Uptodate_Lock
+		 * We check all buffers in the page under uptodate_lock
 		 * to avoid races with other end io clearing async_write flags
 		 */
-		local_irq_save(flags);
-		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+		spin_lock_irqsave(&head->uptodate_lock, flags);
 		do {
 			if (bh_offset(bh) < bio_start ||
 			    bh_offset(bh) + bh->b_size > bio_end) {
@@ -141,8 +140,7 @@ static void ext4_finish_bio(struct bio *bio)
 			if (bio->bi_status)
 				buffer_io_error(bh);
 		} while ((bh = bh->b_this_page) != head);
-		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-		local_irq_restore(flags);
+		spin_unlock_irqrestore(&head->uptodate_lock, flags);
 		if (!under_io) {
 			fscrypt_free_bounce_page(bounce_page);
 			end_page_writeback(page);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7202a1e39d70c..14ca433b3a9e4 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	unlock_page(page);
 	return;
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->uptodate_lock, flags);
 	return;
 }
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7b73ef7f902d4..e3f8421d17bab 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -22,9 +22,6 @@ enum bh_state_bits {
 	BH_Dirty,	/* Is dirty */
 	BH_Lock,	/* Is locked */
 	BH_Req,		/* Has been submitted for I/O */
-	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
-			  * IO completion of other buffers in the page
-			  */
 
 	BH_Mapped,	/* Has a disk mapping */
 	BH_New,		/* Disk mapping was newly created by get_block */
@@ -76,6 +73,9 @@ struct buffer_head {
 	struct address_space *b_assoc_map;	/* mapping this buffer is
 						   associated with */
 	atomic_t b_count;		/* users using this buffer_head */
+	spinlock_t	uptodate_lock;	/* Used by the first bh in a page, to
+					 * serialise IO completion of other
+					 * buffers in the page */
 };
 
 /*
-- 
2.24.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-15 17:36         ` Theodore Y. Ts'o
@ 2019-11-18  9:35           ` Jan Kara
  0 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2019-11-18  9:35 UTC (permalink / raw)
  To: Theodore Y. Ts'o
  Cc: Jan Kara, Sebastian Siewior, Thomas Gleixner, Matthew Wilcox,
	LKML, linux-fsdevel, linux-ext4, Peter Zijlstra, Ingo Molnar,
	Anna-Maria Gleixner, Steven Rostedt, Julia Cartwright,
	Alexander Viro, Jan Kara, Mark Fasheh, Joseph Qi,
	Christoph Hellwig, Joel Becker

On Fri 15-11-19 12:36:34, Theodore Y. Ts'o wrote:
> On Fri, Nov 15, 2019 at 03:56:38PM +0100, Jan Kara wrote:
> > With some effort, we could even shrink struct buffer_head from 104 bytes
> > (on x86_64) to 96 bytes but I don't think that effort is worth it (I'd find
> > it better use of time to actually work on getting rid of buffer heads
> > completely).
> 
> Is that really realistic?  All aside from the very large number of
> file systems which use buffer_heads that would have to be reworked,
> the concept of buffer heads is pretty fundamental to how jbd2 is
> architected.

I think it is reasonably possible to remove buffer_heads from data path
(including direct IO path) of all filesystems. That way memory consumption
of buffer_heads becomes mostly irrelevant and we can have a look how much
from the current bh framework still makes sense...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v2] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-15 17:54         ` [PATCH v2] " Sebastian Siewior
@ 2019-11-18  9:38           ` Jan Kara
  2019-11-18 13:28             ` [PATCH v3] " Sebastian Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: Jan Kara @ 2019-11-18  9:38 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Jan Kara, Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel,
	linux-ext4, Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner,
	Steven Rostedt, Julia Cartwright, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On Fri 15-11-19 18:54:20, Sebastian Siewior wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
> disable preemption, which is undesired for latency reasons and breaks when
> regular spinlocks are taken within the bit_spinlock locked region because
> regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
> replaces the bit spinlocks with regular spinlocks to avoid this problem.
> Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
> 
> Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> [bigeasy: remove the wrapper and use always spinlock_t and move it into
>           the padding hole]
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
> 
> v1…v2: Move the spinlock_t to the padding hole as per Jan Kara. pahole says
> its total size remained unchanged, before
> 
> | atomic_t                   b_count;              /*    96     4 */
> |
> | /* size: 104, cachelines: 2, members: 12 */
> | /* padding: 4 */
> | /* last cacheline: 40 bytes */
> 
> after
> 
> | atomic_t                   b_count;              /*    96     4 */
> | spinlock_t                 uptodate_lock;        /*   100     4 */
> |
> | /* size: 104, cachelines: 2, members: 13 */
> | /* last cacheline: 40 bytes */

To follow the naming of other members in struct buffer_head, please name
this b_uptodate_lock (note the b_ prefix). Otherwise the patch looks good
to me so feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

after fixing this.

								Honza

> 
>  fs/buffer.c                 | 19 +++++++------------
>  fs/ext4/page-io.c           |  8 +++-----
>  fs/ntfs/aops.c              |  9 +++------
>  include/linux/buffer_head.h |  6 +++---
>  4 files changed, 16 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 0ac4ae15ea4dc..2c6f2b41a88e4 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -277,8 +277,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  	 * decide that the page is now completely done.
>  	 */
>  	first = page_buffers(page);
> -	local_irq_save(flags);
> -	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
> +	spin_lock_irqsave(&first->uptodate_lock, flags);
>  	clear_buffer_async_read(bh);
>  	unlock_buffer(bh);
>  	tmp = bh;
> @@ -291,8 +290,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  		}
>  		tmp = tmp->b_this_page;
>  	} while (tmp != bh);
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->uptodate_lock, flags);
>  
>  	/*
>  	 * If none of the buffers had errors and they are all
> @@ -304,8 +302,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  	return;
>  
>  still_busy:
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->uptodate_lock, flags);
>  	return;
>  }
>  
> @@ -333,8 +330,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
>  	}
>  
>  	first = page_buffers(page);
> -	local_irq_save(flags);
> -	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
> +	spin_lock_irqsave(&first->uptodate_lock, flags);
>  
>  	clear_buffer_async_write(bh);
>  	unlock_buffer(bh);
> @@ -346,14 +342,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
>  		}
>  		tmp = tmp->b_this_page;
>  	}
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->uptodate_lock, flags);
>  	end_page_writeback(page);
>  	return;
>  
>  still_busy:
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->uptodate_lock, flags);
>  	return;
>  }
>  EXPORT_SYMBOL(end_buffer_async_write);
> @@ -3422,6 +3416,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
>  	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
>  	if (ret) {
>  		INIT_LIST_HEAD(&ret->b_assoc_buffers);
> +		spin_lock_init(&ret->uptodate_lock);
>  		preempt_disable();
>  		__this_cpu_inc(bh_accounting.nr);
>  		recalc_bh_state();
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 893913d1c2fe3..592816713e0d6 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -125,11 +125,10 @@ static void ext4_finish_bio(struct bio *bio)
>  		}
>  		bh = head = page_buffers(page);
>  		/*
> -		 * We check all buffers in the page under BH_Uptodate_Lock
> +		 * We check all buffers in the page under uptodate_lock
>  		 * to avoid races with other end io clearing async_write flags
>  		 */
> -		local_irq_save(flags);
> -		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
> +		spin_lock_irqsave(&head->uptodate_lock, flags);
>  		do {
>  			if (bh_offset(bh) < bio_start ||
>  			    bh_offset(bh) + bh->b_size > bio_end) {
> @@ -141,8 +140,7 @@ static void ext4_finish_bio(struct bio *bio)
>  			if (bio->bi_status)
>  				buffer_io_error(bh);
>  		} while ((bh = bh->b_this_page) != head);
> -		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
> -		local_irq_restore(flags);
> +		spin_unlock_irqrestore(&head->uptodate_lock, flags);
>  		if (!under_io) {
>  			fscrypt_free_bounce_page(bounce_page);
>  			end_page_writeback(page);
> diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
> index 7202a1e39d70c..14ca433b3a9e4 100644
> --- a/fs/ntfs/aops.c
> +++ b/fs/ntfs/aops.c
> @@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  				"0x%llx.", (unsigned long long)bh->b_blocknr);
>  	}
>  	first = page_buffers(page);
> -	local_irq_save(flags);
> -	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
> +	spin_lock_irqsave(&first->uptodate_lock, flags);
>  	clear_buffer_async_read(bh);
>  	unlock_buffer(bh);
>  	tmp = bh;
> @@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  		}
>  		tmp = tmp->b_this_page;
>  	} while (tmp != bh);
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->uptodate_lock, flags);
>  	/*
>  	 * If none of the buffers had errors then we can set the page uptodate,
>  	 * but we first have to perform the post read mst fixups, if the
> @@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  	unlock_page(page);
>  	return;
>  still_busy:
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->uptodate_lock, flags);
>  	return;
>  }
>  
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index 7b73ef7f902d4..e3f8421d17bab 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -22,9 +22,6 @@ enum bh_state_bits {
>  	BH_Dirty,	/* Is dirty */
>  	BH_Lock,	/* Is locked */
>  	BH_Req,		/* Has been submitted for I/O */
> -	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
> -			  * IO completion of other buffers in the page
> -			  */
>  
>  	BH_Mapped,	/* Has a disk mapping */
>  	BH_New,		/* Disk mapping was newly created by get_block */
> @@ -76,6 +73,9 @@ struct buffer_head {
>  	struct address_space *b_assoc_map;	/* mapping this buffer is
>  						   associated with */
>  	atomic_t b_count;		/* users using this buffer_head */
> +	spinlock_t	uptodate_lock;	/* Used by the first bh in a page, to
> +					 * serialise IO completion of other
> +					 * buffers in the page */
>  };
>  
>  /*
> -- 
> 2.24.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v3] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-18  9:38           ` Jan Kara
@ 2019-11-18 13:28             ` " Sebastian Siewior
  2019-11-19  9:30               ` Jan Kara
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Siewior @ 2019-11-18 13:28 UTC (permalink / raw)
  To: Jan Kara
  Cc: Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel, linux-ext4,
	Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Theodore Tso, Alexander Viro, Jan Kara,
	Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

From: Thomas Gleixner <tglx@linutronix.de>

Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
disable preemption, which is undesired for latency reasons and breaks when
regular spinlocks are taken within the bit_spinlock locked region because
regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
replaces the bit spinlocks with regular spinlocks to avoid this problem.
Bit spinlocks are also not covered by lock debugging, e.g. lockdep.

Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: remove the wrapper and use always spinlock_t and move it into
          the padding hole]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
v2…v3: rename uptodate_lock to b_uptodate_lock.

v1…v2: Move the spinlock_t to the padding hole as per Jan Kara. pahole says
its total size remained unchanged, before

| atomic_t                   b_count;              /*    96     4 */
|
| /* size: 104, cachelines: 2, members: 12 */
| /* padding: 4 */
| /* last cacheline: 40 bytes */

after

| atomic_t                   b_count;              /*    96     4 */
| spinlock_t                 uptodate_lock;        /*   100     4 */
|
| /* size: 104, cachelines: 2, members: 13 */
| /* last cacheline: 40 bytes */

 fs/buffer.c                 | 19 +++++++------------
 fs/ext4/page-io.c           |  8 +++-----
 fs/ntfs/aops.c              |  9 +++------
 include/linux/buffer_head.h |  6 +++---
 4 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 86a38b9793235..4baea587981e0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,8 +275,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	 * decide that the page is now completely done.
 	 */
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -289,8 +288,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 
 	/*
 	 * If none of the buffers had errors and they are all
@@ -302,8 +300,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
@@ -331,8 +328,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	}
 
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
@@ -344,14 +340,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	}
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 EXPORT_SYMBOL(end_buffer_async_write);
@@ -3368,6 +3362,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
 	if (ret) {
 		INIT_LIST_HEAD(&ret->b_assoc_buffers);
+		spin_lock_init(&ret->b_uptodate_lock);
 		preempt_disable();
 		__this_cpu_inc(bh_accounting.nr);
 		recalc_bh_state();
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 12ceadef32c5a..64d4c06fbf836 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -87,11 +87,10 @@ static void ext4_finish_bio(struct bio *bio)
 		}
 		bh = head = page_buffers(page);
 		/*
-		 * We check all buffers in the page under BH_Uptodate_Lock
+		 * We check all buffers in the page under b_uptodate_lock
 		 * to avoid races with other end io clearing async_write flags
 		 */
-		local_irq_save(flags);
-		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+		spin_lock_irqsave(&head->b_uptodate_lock, flags);
 		do {
 			if (bh_offset(bh) < bio_start ||
 			    bh_offset(bh) + bh->b_size > bio_end) {
@@ -103,8 +102,7 @@ static void ext4_finish_bio(struct bio *bio)
 			if (bio->bi_status)
 				buffer_io_error(bh);
 		} while ((bh = bh->b_this_page) != head);
-		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
-		local_irq_restore(flags);
+		spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
 		if (!under_io) {
 			fscrypt_free_bounce_page(bounce_page);
 			end_page_writeback(page);
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7202a1e39d70c..554b744f41bf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	unlock_page(page);
 	return;
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 7b73ef7f902d4..e0b020eaf32e2 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -22,9 +22,6 @@ enum bh_state_bits {
 	BH_Dirty,	/* Is dirty */
 	BH_Lock,	/* Is locked */
 	BH_Req,		/* Has been submitted for I/O */
-	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
-			  * IO completion of other buffers in the page
-			  */
 
 	BH_Mapped,	/* Has a disk mapping */
 	BH_New,		/* Disk mapping was newly created by get_block */
@@ -76,6 +73,9 @@ struct buffer_head {
 	struct address_space *b_assoc_map;	/* mapping this buffer is
 						   associated with */
 	atomic_t b_count;		/* users using this buffer_head */
+	spinlock_t b_uptodate_lock;	/* Used by the first bh in a page, to
+					 * serialise IO completion of other
+					 * buffers in the page */
 };
 
 /*
-- 
2.24.0


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-18 13:28             ` [PATCH v3] " Sebastian Siewior
@ 2019-11-19  9:30               ` Jan Kara
  2019-11-19 10:00                 ` Sebastian Siewior
  0 siblings, 1 reply; 13+ messages in thread
From: Jan Kara @ 2019-11-19  9:30 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Jan Kara, Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel,
	linux-ext4, Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner,
	Steven Rostedt, Julia Cartwright, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On Mon 18-11-19 14:28:24, Sebastian Siewior wrote:
> From: Thomas Gleixner <tglx@linutronix.de>
> 
> Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
> disable preemption, which is undesired for latency reasons and breaks when
> regular spinlocks are taken within the bit_spinlock locked region because
> regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
> replaces the bit spinlocks with regular spinlocks to avoid this problem.
> Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
> 
> Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> [bigeasy: remove the wrapper and use always spinlock_t and move it into
>           the padding hole]
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

OK, how do we push this? Do you plan to push this through tip tree?

								Honza

> ---
> v2…v3: rename uptodate_lock to b_uptodate_lock.
> 
> v1…v2: Move the spinlock_t to the padding hole as per Jan Kara. pahole says
> its total size remained unchanged, before
> 
> | atomic_t                   b_count;              /*    96     4 */
> |
> | /* size: 104, cachelines: 2, members: 12 */
> | /* padding: 4 */
> | /* last cacheline: 40 bytes */
> 
> after
> 
> | atomic_t                   b_count;              /*    96     4 */
> | spinlock_t                 uptodate_lock;        /*   100     4 */
> |
> | /* size: 104, cachelines: 2, members: 13 */
> | /* last cacheline: 40 bytes */
> 
>  fs/buffer.c                 | 19 +++++++------------
>  fs/ext4/page-io.c           |  8 +++-----
>  fs/ntfs/aops.c              |  9 +++------
>  include/linux/buffer_head.h |  6 +++---
>  4 files changed, 16 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 86a38b9793235..4baea587981e0 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -275,8 +275,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  	 * decide that the page is now completely done.
>  	 */
>  	first = page_buffers(page);
> -	local_irq_save(flags);
> -	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
> +	spin_lock_irqsave(&first->b_uptodate_lock, flags);
>  	clear_buffer_async_read(bh);
>  	unlock_buffer(bh);
>  	tmp = bh;
> @@ -289,8 +288,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  		}
>  		tmp = tmp->b_this_page;
>  	} while (tmp != bh);
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
>  
>  	/*
>  	 * If none of the buffers had errors and they are all
> @@ -302,8 +300,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  	return;
>  
>  still_busy:
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
>  	return;
>  }
>  
> @@ -331,8 +328,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
>  	}
>  
>  	first = page_buffers(page);
> -	local_irq_save(flags);
> -	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
> +	spin_lock_irqsave(&first->b_uptodate_lock, flags);
>  
>  	clear_buffer_async_write(bh);
>  	unlock_buffer(bh);
> @@ -344,14 +340,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
>  		}
>  		tmp = tmp->b_this_page;
>  	}
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
>  	end_page_writeback(page);
>  	return;
>  
>  still_busy:
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
>  	return;
>  }
>  EXPORT_SYMBOL(end_buffer_async_write);
> @@ -3368,6 +3362,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
>  	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
>  	if (ret) {
>  		INIT_LIST_HEAD(&ret->b_assoc_buffers);
> +		spin_lock_init(&ret->b_uptodate_lock);
>  		preempt_disable();
>  		__this_cpu_inc(bh_accounting.nr);
>  		recalc_bh_state();
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index 12ceadef32c5a..64d4c06fbf836 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -87,11 +87,10 @@ static void ext4_finish_bio(struct bio *bio)
>  		}
>  		bh = head = page_buffers(page);
>  		/*
> -		 * We check all buffers in the page under BH_Uptodate_Lock
> +		 * We check all buffers in the page under b_uptodate_lock
>  		 * to avoid races with other end io clearing async_write flags
>  		 */
> -		local_irq_save(flags);
> -		bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
> +		spin_lock_irqsave(&head->b_uptodate_lock, flags);
>  		do {
>  			if (bh_offset(bh) < bio_start ||
>  			    bh_offset(bh) + bh->b_size > bio_end) {
> @@ -103,8 +102,7 @@ static void ext4_finish_bio(struct bio *bio)
>  			if (bio->bi_status)
>  				buffer_io_error(bh);
>  		} while ((bh = bh->b_this_page) != head);
> -		bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
> -		local_irq_restore(flags);
> +		spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
>  		if (!under_io) {
>  			fscrypt_free_bounce_page(bounce_page);
>  			end_page_writeback(page);
> diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
> index 7202a1e39d70c..554b744f41bf8 100644
> --- a/fs/ntfs/aops.c
> +++ b/fs/ntfs/aops.c
> @@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  				"0x%llx.", (unsigned long long)bh->b_blocknr);
>  	}
>  	first = page_buffers(page);
> -	local_irq_save(flags);
> -	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
> +	spin_lock_irqsave(&first->b_uptodate_lock, flags);
>  	clear_buffer_async_read(bh);
>  	unlock_buffer(bh);
>  	tmp = bh;
> @@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  		}
>  		tmp = tmp->b_this_page;
>  	} while (tmp != bh);
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
>  	/*
>  	 * If none of the buffers had errors then we can set the page uptodate,
>  	 * but we first have to perform the post read mst fixups, if the
> @@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
>  	unlock_page(page);
>  	return;
>  still_busy:
> -	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
> -	local_irq_restore(flags);
> +	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
>  	return;
>  }
>  
> diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
> index 7b73ef7f902d4..e0b020eaf32e2 100644
> --- a/include/linux/buffer_head.h
> +++ b/include/linux/buffer_head.h
> @@ -22,9 +22,6 @@ enum bh_state_bits {
>  	BH_Dirty,	/* Is dirty */
>  	BH_Lock,	/* Is locked */
>  	BH_Req,		/* Has been submitted for I/O */
> -	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
> -			  * IO completion of other buffers in the page
> -			  */
>  
>  	BH_Mapped,	/* Has a disk mapping */
>  	BH_New,		/* Disk mapping was newly created by get_block */
> @@ -76,6 +73,9 @@ struct buffer_head {
>  	struct address_space *b_assoc_map;	/* mapping this buffer is
>  						   associated with */
>  	atomic_t b_count;		/* users using this buffer_head */
> +	spinlock_t b_uptodate_lock;	/* Used by the first bh in a page, to
> +					 * serialise IO completion of other
> +					 * buffers in the page */
>  };
>  
>  /*
> -- 
> 2.24.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-19  9:30               ` Jan Kara
@ 2019-11-19 10:00                 ` Sebastian Siewior
  2019-11-19 11:59                   ` Jan Kara
  0 siblings, 1 reply; 13+ messages in thread
From: Sebastian Siewior @ 2019-11-19 10:00 UTC (permalink / raw)
  To: Jan Kara
  Cc: Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel, linux-ext4,
	Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner, Steven Rostedt,
	Julia Cartwright, Theodore Tso, Alexander Viro, Jan Kara,
	Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On 2019-11-19 10:30:01 [+0100], Jan Kara wrote:
> OK, how do we push this? Do you plan to push this through tip tree?

Is it reasonable to push this via the ext4 tree?

> 								Honza

Sebastian

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v3] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t
  2019-11-19 10:00                 ` Sebastian Siewior
@ 2019-11-19 11:59                   ` Jan Kara
  0 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2019-11-19 11:59 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Jan Kara, Thomas Gleixner, Matthew Wilcox, LKML, linux-fsdevel,
	linux-ext4, Peter Zijlstra, Ingo Molnar, Anna-Maria Gleixner,
	Steven Rostedt, Julia Cartwright, Theodore Tso, Alexander Viro,
	Jan Kara, Mark Fasheh, Joseph Qi, Christoph Hellwig, Joel Becker

On Tue 19-11-19 11:00:22, Sebastian Siewior wrote:
> On 2019-11-19 10:30:01 [+0100], Jan Kara wrote:
> > OK, how do we push this? Do you plan to push this through tip tree?
> 
> Is it reasonable to push this via the ext4 tree?

I don't know and I don't think it largely matters. That code is rarely
touched. Lately changes to that code have been flowing through block tree
(Jens Axboe) or iomap tree (Darrick Wong) because those were the people
needing the changes in...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, back to index

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-20 17:08 [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular spinlock_t Sebastian Siewior
2019-08-20 17:17 ` Matthew Wilcox
2019-08-20 18:01   ` Thomas Gleixner
2019-10-11 11:25     ` Sebastian Siewior
2019-11-15 14:56       ` Jan Kara
2019-11-15 17:36         ` Theodore Y. Ts'o
2019-11-18  9:35           ` Jan Kara
2019-11-15 17:54         ` [PATCH v2] " Sebastian Siewior
2019-11-18  9:38           ` Jan Kara
2019-11-18 13:28             ` [PATCH v3] " Sebastian Siewior
2019-11-19  9:30               ` Jan Kara
2019-11-19 10:00                 ` Sebastian Siewior
2019-11-19 11:59                   ` Jan Kara

Linux-ext4 Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-ext4/0 linux-ext4/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-ext4 linux-ext4/ https://lore.kernel.org/linux-ext4 \
		linux-ext4@vger.kernel.org
	public-inbox-index linux-ext4

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-ext4


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git