linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
@ 2001-09-20  7:12 Beau Kuiper
  2001-09-20 12:16 ` Chris Mason
  2001-09-21 13:26 ` Matthias Andree
  0 siblings, 2 replies; 14+ messages in thread
From: Beau Kuiper @ 2001-09-20  7:12 UTC (permalink / raw)
  To: linux-kernel; +Cc: tovarlds

Hi,

Resierfs on 2.4 has always been bog slow.

I have identified kupdated as the culprit, and have 3 patches that fix the
peformance problems I have had been suffering.

I would like these patches to be reviewed an put into the mainline kernel
so that others can testthe changes.

Patch 1.

This patch fixes reiserfs to use the kupdated code path when told to
resync its super block, like it did in 2.2.19. This is the culpit for bad
reiserfs performace in 2.4. Unfortunately, this fix relies on the second
patch to work properly.

Patch 2

This patch implements a simple mechinism to ensure that each superblock
only gets told to be flushed once. With reiserfs and the first patch, the
superblock is still dirty after being told to sync (probably becasue it
doesn't want to write out the entire journal every 5 seconds when kupdate
calls it). This caused an infinite loop because sync_supers would
always find the reiserfs superblock dirty when called from kupdated. I am
not convinced that this patch is the best one for this problem
(suggestions?)

Patch 3

This patch was generated as I was exploring the buffer cache, wondering
why reiserfs was so slow on 2.4. I found that kupdated may write buffers
that are not actually old back to disk. Eg

Imagine that there are 20 dirty buffers. 16 of them are more that 30
seconds old (and should be written back), but the other 4 are younger than
30 seconds. The current code would force all 20 out to disk, interrupting
programs still using the young 4 until the disk write was complete.

I know that it isn't a major problem, but I found it and I have written
the patch for it :-)

Please try out these patches and give comments about style, performace
ect. They fixed my problems, sliced almost a minute off 2.2.19 kernel
compile time on my duron 700 (from 4min 30sec to 3min 45sec)

Thanks
Beau Kuiper
kuib-kl@ljbc.wa.edu.au

------------------------- PATCH 1:

--- linux-2.4.10pre11/fs/reiserfs/journal.c	Fri Sep 21 14:35:25 2001
+++ linux/fs/reiserfs/journal.c	Fri Sep 21 11:04:36 2001
@@ -2388,7 +2388,7 @@
   if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
     return 0  ;
   }
-  if (!strcmp(current->comm, "kupdate")) {
+  if (!strcmp(current->comm, "kupdated")) {
     immediate = 0 ;
     keep_dirty = 1 ;
   }

------------------------- PATCH 2:

--- linux-2.4.10pre11/fs/super.c	Fri Sep 21 14:35:25 2001
+++ linux/fs/super.c	Fri Sep 21 12:15:12 2001
@@ -708,11 +708,14 @@
 		return;
 	}
 restart:
+	// since reiserfs does not garrentee super is not dirty (journal may
+	// be dirty still with kupdated), have to do this the hard way
 	spin_lock(&sb_lock);
 	sb = sb_entry(super_blocks.next);
 	while (sb != sb_entry(&super_blocks))
-		if (sb->s_dirt) {
+		if (sb->s_dirt && !(sb->s_flushed)) {
 			sb->s_count++;
+			sb->s_flushed = 1;
 			spin_unlock(&sb_lock);
 			down_read(&sb->s_umount);
 			write_super(sb);
@@ -720,6 +723,16 @@
 			goto restart;
 		} else
 			sb = sb_entry(sb->s_list.next);
+
+	// now unflush all supers
+
+	sb = sb_entry(super_blocks.next);
+	while (sb != sb_entry(&super_blocks))
+	{
+		sb->s_flushed = 0;
+		sb = sb_entry(sb->s_list.next);
+	}
+
 	spin_unlock(&sb_lock);
 }

@@ -805,6 +818,7 @@
 		sema_init(&s->s_dquot.dqio_sem, 1);
 		sema_init(&s->s_dquot.dqoff_sem, 1);
 		s->s_maxbytes = MAX_NON_LFS;
+		s->s_flushed = 0;
 	}
 	return s;
 }
--- linux-2.4.10pre11/include/linux/fs.h	Fri Sep 21 14:35:31 2001
+++ linux/include/linux/fs.h	Fri Sep 21 12:11:49 2001
@@ -689,6 +689,7 @@
 	unsigned long		s_blocksize;
 	unsigned char		s_blocksize_bits;
 	unsigned char		s_dirt;
+	unsigned char		s_flushed;
 	unsigned long long	s_maxbytes;	/* Max file size */
 	struct file_system_type	*s_type;
 	struct super_operations	*s_op;

----------------------- PATCH 3

--- linux-2.4.10pre11/fs/buffer.c	Fri Sep 21 14:35:23 2001
+++ linux/fs/buffer.c	Fri Sep 21 12:05:06 2001
@@ -201,7 +201,7 @@
  * return without it!
  */
 #define NRSYNC (32)
-static int write_some_buffers(kdev_t dev)
+static int write_some_buffers(kdev_t dev, int oldonly)
 {
 	struct buffer_head *next;
 	struct buffer_head *array[NRSYNC];
@@ -217,6 +217,8 @@

 		if (dev && bh->b_dev != dev)
 			continue;
+		if (oldonly && time_before(jiffies, bh->b_flushtime))
+			break;
 		if (test_and_set_bit(BH_Lock, &bh->b_state))
 			continue;
 		if (atomic_set_buffer_clean(bh)) {
@@ -247,7 +249,7 @@
 {
 	do {
 		spin_lock(&lru_list_lock);
-	} while (write_some_buffers(dev));
+	} while (write_some_buffers(dev, 0));
 	run_task_queue(&tq_disk);
 }

@@ -1239,7 +1241,7 @@

 	/* If we're getting into imbalance, start write-out */
 	spin_lock(&lru_list_lock);
-	write_some_buffers(NODEV);
+	write_some_buffers(NODEV, 0);

 	/*
 	 * And if we're _really_ out of balance, wait for
@@ -2768,7 +2770,7 @@
 		bh = lru_list[BUF_DIRTY];
 		if (!bh || time_before(jiffies, bh->b_flushtime))
 			break;
-		if (write_some_buffers(NODEV))
+		if (write_some_buffers(NODEV, 1))
 			continue;
 		return 0;
 	}
@@ -2867,7 +2869,7 @@
 		CHECK_EMERGENCY_SYNC

 		spin_lock(&lru_list_lock);
-		if (!write_some_buffers(NODEV) || balance_dirty_state() < 0) {
+		if (!write_some_buffers(NODEV, 0) || balance_dirty_state() < 0) {
 			wait_for_some_buffers(NODEV);
 			interruptible_sleep_on(&bdflush_wait);
 		}



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-20  7:12 [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes Beau Kuiper
@ 2001-09-20 12:16 ` Chris Mason
  2001-09-20 15:20   ` Beau Kuiper
  2001-09-21 13:26 ` Matthias Andree
  1 sibling, 1 reply; 14+ messages in thread
From: Chris Mason @ 2001-09-20 12:16 UTC (permalink / raw)
  To: Beau Kuiper, linux-kernel; +Cc: tovarlds



On Thursday, September 20, 2001 03:12:44 PM +0800 Beau Kuiper
<kuib-kl@ljbc.wa.edu.au> wrote:

> Hi,
> 
> Resierfs on 2.4 has always been bog slow.
> 
> I have identified kupdated as the culprit, and have 3 patches that fix the
> peformance problems I have had been suffering.

Thanks for sending these along.

> 
> I would like these patches to be reviewed an put into the mainline kernel
> so that others can testthe changes.
> 
> Patch 1.
> 
> This patch fixes reiserfs to use the kupdated code path when told to
> resync its super block, like it did in 2.2.19. This is the culpit for bad
> reiserfs performace in 2.4. Unfortunately, this fix relies on the second
> patch to work properly.

I promised linus I would never reactivate this code, it is just too nasty
;-)  The problem is that write_super doesn't know if it is called from sync
or from kupdated.  The right fix is to have an extra param to write_super,
or another super_block method that gets called instead of write_super when
an immediate commit is not required.

It is possible to get almost the same behaviour as 2.2.x by changing the
metadata sync interval in bdflush to 30 seconds.

> 
> Patch 2
> 
> This patch implements a simple mechinism to ensure that each superblock
> only gets told to be flushed once. With reiserfs and the first patch, the
> superblock is still dirty after being told to sync (probably becasue it
> doesn't want to write out the entire journal every 5 seconds when kupdate
> calls it). This caused an infinite loop because sync_supers would
> always find the reiserfs superblock dirty when called from kupdated. I am
> not convinced that this patch is the best one for this problem
> (suggestions?)

It is ok to leave the superblock dirty, after all, since the commit wasn't
done, the super is still dirty.  If the checks from reiserfs_write_super
are actually slowing things down, then it is probably best to fix the
checks.

> 
> Patch 3
> 
> This patch was generated as I was exploring the buffer cache, wondering
> why reiserfs was so slow on 2.4. I found that kupdated may write buffers
> that are not actually old back to disk. Eg
> 
> Imagine that there are 20 dirty buffers. 16 of them are more that 30
> seconds old (and should be written back), but the other 4 are younger than
> 30 seconds. The current code would force all 20 out to disk, interrupting
> programs still using the young 4 until the disk write was complete.
> 
> I know that it isn't a major problem, but I found it and I have written
> the patch for it :-)
> 
> Please try out these patches and give comments about style, performace
> ect. They fixed my problems, sliced almost a minute off 2.2.19 kernel
> compile time on my duron 700 (from 4min 30sec to 3min 45sec)

Doe you have the results of the individual fixes?

thanks,
Chris


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-20 12:16 ` Chris Mason
@ 2001-09-20 15:20   ` Beau Kuiper
  2001-09-20 16:15     ` Andreas Dilger
  0 siblings, 1 reply; 14+ messages in thread
From: Beau Kuiper @ 2001-09-20 15:20 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-kernel



On Thu, 20 Sep 2001, Chris Mason wrote:

>
>
> On Thursday, September 20, 2001 03:12:44 PM +0800 Beau Kuiper
> <kuib-kl@ljbc.wa.edu.au> wrote:
>
> > Hi,
> >
> > Resierfs on 2.4 has always been bog slow.
> >
> > I have identified kupdated as the culprit, and have 3 patches that fix the
> > peformance problems I have had been suffering.
>
> Thanks for sending these along.
>
> >
> > I would like these patches to be reviewed an put into the mainline kernel
> > so that others can testthe changes.
> >
> > Patch 1.
> >
> > This patch fixes reiserfs to use the kupdated code path when told to
> > resync its super block, like it did in 2.2.19. This is the culpit for bad
> > reiserfs performace in 2.4. Unfortunately, this fix relies on the second
> > patch to work properly.
>
> I promised linus I would never reactivate this code, it is just too nasty
> ;-)  The problem is that write_super doesn't know if it is called from sync
> or from kupdated.  The right fix is to have an extra param to write_super,
> or another super_block method that gets called instead of write_super when
> an immediate commit is not required.

I don't think that this could happen until 2.5.x though, as either
solution touches every file system. However, if we added an extra methed,
we could do this while only slightly touching the other filesystems (where
kupdated sync == real sync) Simply see if the method exists (is non-null)
and call that instead with a kupdate sync instead of the normal
super_sync. Are you interested in me writing a patch to do this?


>
> It is possible to get almost the same behaviour as 2.2.x by changing the
> metadata sync interval in bdflush to 30 seconds.
>

But then kupdate doesn't flush normal data as regularly as it should, plus
it is almost as messy as Patch 1 :-)

> >
> > Patch 2
> >
> > This patch implements a simple mechinism to ensure that each superblock
> > only gets told to be flushed once. With reiserfs and the first patch, the
> > superblock is still dirty after being told to sync (probably becasue it
> > doesn't want to write out the entire journal every 5 seconds when kupdate
> > calls it). This caused an infinite loop because sync_supers would
> > always find the reiserfs superblock dirty when called from kupdated. I am
> > not convinced that this patch is the best one for this problem
> > (suggestions?)
>
> It is ok to leave the superblock dirty, after all, since the commit wasn't
> done, the super is still dirty.  If the checks from reiserfs_write_super
> are actually slowing things down, then it is probably best to fix the
> checks.

I meant, there might be better wway to prevent the endless loop than
adding an extra field to the superblock data structure. I beleive (I
havn't explored reiserfs code much) the slowdown is caused by the journal
being synced with the superblock, thus causing:

1) Too much contention for disk resources.
2) A huge increase in the number of times programs must be suspended to
wait for the disk
3) Poor CPU utilization in code that uses the filesystem regularly (like
compiling)

>
> >
> > Patch 3
> >
> > This patch was generated as I was exploring the buffer cache, wondering
> > why reiserfs was so slow on 2.4. I found that kupdated may write buffers
> > that are not actually old back to disk. Eg
> >
> > Imagine that there are 20 dirty buffers. 16 of them are more that 30
> > seconds old (and should be written back), but the other 4 are younger than
> > 30 seconds. The current code would force all 20 out to disk, interrupting
> > programs still using the young 4 until the disk write was complete.
> >
> > I know that it isn't a major problem, but I found it and I have written
> > the patch for it :-)
> >
> > Please try out these patches and give comments about style, performace
> > ect. They fixed my problems, sliced almost a minute off 2.2.19 kernel
> > compile time on my duron 700 (from 4min 30sec to 3min 45sec)
>
> Doe you have the results of the individual fixes?

Patch 3 doesn't improve performace much (even in theory the number of
dirty buffers being wrongly flushed is pretty low)

Patch 2 doesn't improve performace at all (unless you apply patch 1,
without it, the computer will bog itself into the ground on the
first kupdated)

Patch 1 makes a huge difference because it stops reiserfs from reacting
badly on a kupdated.

Are there any good benchmarks you want me to run, on the plain and modded
kernels.

Thanks
Beau Kuiper
kuib-kl@ljbc.wa.edu.au

>
> thanks,
> Chris
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-20 15:20   ` Beau Kuiper
@ 2001-09-20 16:15     ` Andreas Dilger
  2001-09-20 16:22       ` Beau Kuiper
  0 siblings, 1 reply; 14+ messages in thread
From: Andreas Dilger @ 2001-09-20 16:15 UTC (permalink / raw)
  To: Beau Kuiper; +Cc: Chris Mason, linux-kernel

On Sep 20, 2001  23:20 +0800, Beau Kuiper wrote:
> Patch 3 doesn't improve performace much (even in theory the number of
> dirty buffers being wrongly flushed is pretty low)

Actually, it _may_ even make performance worse (hard to say).  Consider
the case where the "young" dirty buffers are in the same area of the
disk as the "old" dirty buffers.  Once you are forced to write the "old"
buffers, you pretty much get to write the new buffers for free (low seek
overhead).  They _could_ be merged in the elevator code.

Sadly, it is hard to tell whether this is possible or not, because the
code to do these things live in different places.  Maybe we could have
an "optimistic" elevator merge, which only added "young" buffers if
they merged with other old buffers.

Cheers, Andreas
--
Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
                 \  would they cancel out, leaving him still hungry?"
http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-20 16:15     ` Andreas Dilger
@ 2001-09-20 16:22       ` Beau Kuiper
  2001-09-21 20:12         ` Lehmann 
  0 siblings, 1 reply; 14+ messages in thread
From: Beau Kuiper @ 2001-09-20 16:22 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Chris Mason, linux-kernel



On Thu, 20 Sep 2001, Andreas Dilger wrote:

> On Sep 20, 2001  23:20 +0800, Beau Kuiper wrote:
> > Patch 3 doesn't improve performace much (even in theory the number of
> > dirty buffers being wrongly flushed is pretty low)
>
> Actually, it _may_ even make performance worse (hard to say).  Consider
> the case where the "young" dirty buffers are in the same area of the
> disk as the "old" dirty buffers.  Once you are forced to write the "old"
> buffers, you pretty much get to write the new buffers for free (low seek
> overhead).  They _could_ be merged in the elevator code.
>
> Sadly, it is hard to tell whether this is possible or not, because the
> code to do these things live in different places.  Maybe we could have
> an "optimistic" elevator merge, which only added "young" buffers if
> they merged with other old buffers.

But don't forget that when dirty buffers are being flushed, they become
locked while the disk update is occuring. That means any programs that are
still using these "young" dirty buffers must wait for disk io to complete
(including all the older buffers that have to be written to disk before
it). When several hundred other buffers are slso being flushed to disk by
kupdated , it could be a long wait.

Also, it is nicer behaviour not to write out to disk before the time
indicated in the bdflush tuning structure. It allows sys admins to better
tune the overal performace of a system. (unless the kernel need s more
free memory)

Have fun
Beau Kuiper
kuib-kl@ljbc.wa.edu.au




 >
> Cheers, Andreas
> --
> Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
>                  \  would they cancel out, leaving him still hungry?"
> http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-20  7:12 [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes Beau Kuiper
  2001-09-20 12:16 ` Chris Mason
@ 2001-09-21 13:26 ` Matthias Andree
  2001-09-22 18:38   ` Beau Kuiper
  2001-09-24 14:37   ` Stephen C. Tweedie
  1 sibling, 2 replies; 14+ messages in thread
From: Matthias Andree @ 2001-09-21 13:26 UTC (permalink / raw)
  To: Beau Kuiper; +Cc: linux-kernel, tovarlds

On Thu, 20 Sep 2001, Beau Kuiper wrote:

> Patch 3
> 
> This patch was generated as I was exploring the buffer cache, wondering
> why reiserfs was so slow on 2.4. I found that kupdated may write buffers
> that are not actually old back to disk. Eg
> 
> Imagine that there are 20 dirty buffers. 16 of them are more that 30
> seconds old (and should be written back), but the other 4 are younger than
> 30 seconds. The current code would force all 20 out to disk, interrupting
> programs still using the young 4 until the disk write was complete.
> 
> I know that it isn't a major problem, but I found it and I have written
> the patch for it :-)

Be careful! MTAs rely on this behaviour on fsync(). The official
consensus on ReiserFS and ext3 on current Linux 2.4.x kernels (x >= 9)
is that "any synchronous operation flushes all pending operations", and
if that is changed, you MUST make sure that the changed ReiserFS/ext3fs
still make all the guarantees that softupdated BSD file systems make,
lest you want people to run their mail queues off "sync" disks.

Note also, if these blocks belong to newly-opened files, you definitely
want kupdated to flush these to disk on its next run so that the files
are still there after a crash.

I'm not exactly sure what your patch 3 does and how the buffers work,
but I'm absolutely sure we want the "fsync on an open file also syncs
all pending rename/link/open operations the corresponding file has
undergone."

Softupdates FFS (BSD) does that, and ReiserFS/ext3fs also do that by
just flushing all pending operations on a synchronous one, and it's not
slow.

This may cause additional writes, but the reliability issues must be
taken into account here. We don't gain anything if files get lost over a
crash just because you want to save 4 writes.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-20 16:22       ` Beau Kuiper
@ 2001-09-21 20:12         ` Lehmann 
  2001-09-21 20:57           ` Rik van Riel
  0 siblings, 1 reply; 14+ messages in thread
From: Lehmann  @ 2001-09-21 20:12 UTC (permalink / raw)
  To: linux-kernel

On Fri, Sep 21, 2001 at 12:22:59AM +0800, Beau Kuiper <kuib-kl@ljbc.wa.edu.au> wrote:
> Also, it is nicer behaviour not to write out to disk before the time
> indicated in the bdflush tuning structure. It allows sys admins to better
> tune the overal performace of a system. (unless the kernel need s more
> free memory)

A contiguous write doesn't cost anything much, so it could be major
win if the kernel could flush dirty buffers that are behind and after
the dirty block to be written. But this would nicely conflict with
allocate-on-flush ;)

-- 
      -----==-                                             |
      ----==-- _                                           |
      ---==---(_)__  __ ____  __       Marc Lehmann      +--
      --==---/ / _ \/ // /\ \/ /       pcg@goof.com      |e|
      -=====/_/_//_/\_,_/ /_/\_\       XX11-RIPE         --+
    The choice of a GNU generation                       |
                                                         |

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-21 20:12         ` Lehmann 
@ 2001-09-21 20:57           ` Rik van Riel
  2001-09-21 22:50             ` Lehmann 
  0 siblings, 1 reply; 14+ messages in thread
From: Rik van Riel @ 2001-09-21 20:57 UTC (permalink / raw)
  To: Marc A. Lehmann; +Cc: linux-kernel

On Fri, 21 Sep 2001, Marc A. Lehmann wrote:

> A contiguous write doesn't cost anything much, so it could be major
> win if the kernel could flush dirty buffers that are behind and after
> the dirty block to be written. But this would nicely conflict with
> allocate-on-flush ;)

On the contrary, when you have a bunch of small files to sync
you just allocate them next to each other and flush them all
at once ;)

Rik
-- 
IA64: a worthy successor to i860.

http://www.surriel.com/		http://distro.conectiva.com/

Send all your spam to aardvark@nl.linux.org (spam digging piggy)


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-21 20:57           ` Rik van Riel
@ 2001-09-21 22:50             ` Lehmann 
  0 siblings, 0 replies; 14+ messages in thread
From: Lehmann  @ 2001-09-21 22:50 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-kernel

On Fri, Sep 21, 2001 at 05:57:36PM -0300, Rik van Riel <riel@conectiva.com.br> wrote:
> On the contrary, when you have a bunch of small files to sync
> you just allocate them next to each other and flush them all
> at once ;)

Different layer really: The idea was to sync blocks contiguous to the
dirty block we want to sync (regardless of filesystem, i.e. similar to
the elevator which also works at abelow-the-fs-level). (Allocated) block
numbers are necessary for this to work.

(but i don't know how relevant this is in practise. I think it might gain
a lot, though, but that doesn't mean much ;)

-- 
      -----==-                                             |
      ----==-- _                                           |
      ---==---(_)__  __ ____  __       Marc Lehmann      +--
      --==---/ / _ \/ // /\ \/ /       pcg@goof.com      |e|
      -=====/_/_//_/\_,_/ /_/\_\       XX11-RIPE         --+
    The choice of a GNU generation                       |
                                                         |

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-21 13:26 ` Matthias Andree
@ 2001-09-22 18:38   ` Beau Kuiper
  2001-09-22 19:32     ` Matthias Andree
  2001-09-23 15:24     ` Chris Mason
  2001-09-24 14:37   ` Stephen C. Tweedie
  1 sibling, 2 replies; 14+ messages in thread
From: Beau Kuiper @ 2001-09-22 18:38 UTC (permalink / raw)
  To: Matthias Andree; +Cc: linux-kernel



On Fri, 21 Sep 2001, Matthias Andree wrote:

> On Thu, 20 Sep 2001, Beau Kuiper wrote:
>
> > Patch 3
> >
> > This patch was generated as I was exploring the buffer cache, wondering
> > why reiserfs was so slow on 2.4. I found that kupdated may write buffers
> > that are not actually old back to disk. Eg
> >
> > Imagine that there are 20 dirty buffers. 16 of them are more that 30
> > seconds old (and should be written back), but the other 4 are younger than
> > 30 seconds. The current code would force all 20 out to disk, interrupting
> > programs still using the young 4 until the disk write was complete.
> >
> > I know that it isn't a major problem, but I found it and I have written
> > the patch for it :-)
>
> Be careful! MTAs rely on this behaviour on fsync(). The official
> consensus on ReiserFS and ext3 on current Linux 2.4.x kernels (x >= 9)
> is that "any synchronous operation flushes all pending operations", and
> if that is changed, you MUST make sure that the changed ReiserFS/ext3fs
> still make all the guarantees that softupdated BSD file systems make,
> lest you want people to run their mail queues off "sync" disks.

This code change does not affect the functionality of fsync(), only
kupdated. kupdated is responsible for flushing buffers that have been
sitting around too long to disk.

>
> Note also, if these blocks belong to newly-opened files, you definitely
> want kupdated to flush these to disk on its next run so that the files
> are still there after a crash.

They still are (they would be flushed out by the sync_inodes call in
sync_old_buffers. But why are the file records for any new file more
important than changes to old files? (This is what an application can
determince, the kernel should just do what the app says, and treat
everything else fairly)

>
> I'm not exactly sure what your patch 3 does and how the buffers work,
> but I'm absolutely sure we want the "fsync on an open file also syncs
> all pending rename/link/open operations the corresponding file has
> undergone."

That still works.

>
> Softupdates FFS (BSD) does that, and ReiserFS/ext3fs also do that by
> just flushing all pending operations on a synchronous one, and it's not
> slow.
>

But this isn't about sync operations, it is about kupdated (a kernel
thread)

> This may cause additional writes, but the reliability issues must be
> taken into account here. We don't gain anything if files get lost over a
> crash just because you want to save 4 writes.
>

We never gain everything in a crash. We have to set limits to the amount
of damage we can accept, and try to keep within that limit. However,
performace is another critical limit. The four writes could consume
a lot of time, and be invalidated by changes the application could
make (and written out AGAIN)

For example, given a fast hard drive, with 2 writes on one side of the
driver and the other 2 on the other side, the application could be stalled
for

	15ms for first seek and write
	8ms for second seek and write
	15ms to seek and write the 3rd buffer
	8ms for the fourth write.

Umm, 46 ms. Do this often (if you are extremely unlucky) and you could
chew through a lot of time where it would have been more sane just to sit
back and let the application finish with the file.

Beau Kuiper
kuib-kl@ljbc.wa.edu.au




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-22 18:38   ` Beau Kuiper
@ 2001-09-22 19:32     ` Matthias Andree
  2001-09-23 15:24     ` Chris Mason
  1 sibling, 0 replies; 14+ messages in thread
From: Matthias Andree @ 2001-09-22 19:32 UTC (permalink / raw)
  To: Beau Kuiper; +Cc: Matthias Andree, linux-kernel

On Sun, 23 Sep 2001, Beau Kuiper wrote:

> > Be careful! MTAs rely on this behaviour on fsync(). The official
> > consensus on ReiserFS and ext3 on current Linux 2.4.x kernels (x >= 9)
> > is that "any synchronous operation flushes all pending operations", and
> > if that is changed, you MUST make sure that the changed ReiserFS/ext3fs
> > still make all the guarantees that softupdated BSD file systems make,
> > lest you want people to run their mail queues off "sync" disks.
> 
> This code change does not affect the functionality of fsync(), only
> kupdated. kupdated is responsible for flushing buffers that have been
> sitting around too long to disk.

Sorry, I didn't grok that when writing my previous mail in this thread.
I thought kupdate was the one that writes ReiserFS transactions, but
that's kreiserfsd, I think.

> > Note also, if these blocks belong to newly-opened files, you definitely
> > want kupdated to flush these to disk on its next run so that the files
> > are still there after a crash.
> 
> They still are (they would be flushed out by the sync_inodes call in
> sync_old_buffers. But why are the file records for any new file more
> important than changes to old files? (This is what an application can
> determince, the kernel should just do what the app says, and treat
> everything else fairly)

Not really, but open is a directory operation, while writing is a file
operation. Directory operations have been flushed earlier traditionally
(5s vs. 30s).

-- 
Matthias Andree

"Those who give up essential liberties for temporary safety deserve
neither liberty nor safety." - Benjamin Franklin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-22 18:38   ` Beau Kuiper
  2001-09-22 19:32     ` Matthias Andree
@ 2001-09-23 15:24     ` Chris Mason
  1 sibling, 0 replies; 14+ messages in thread
From: Chris Mason @ 2001-09-23 15:24 UTC (permalink / raw)
  To: Matthias Andree, Beau Kuiper; +Cc: linux-kernel



On Saturday, September 22, 2001 09:32:18 PM +0200 Matthias Andree
<matthias.andree@stud.uni-dortmund.de> wrote:
> On Sun, 23 Sep 2001, Beau Kuiper wrote:
>
>> This code change does not affect the functionality of fsync(), only
>> kupdated. kupdated is responsible for flushing buffers that have been
>> sitting around too long to disk.

Correct, fsync is unchanged, but sync() does rely on the super dirty bit.
If the super isn't dirty, then sync doesn't try to force metadata to disk
at all.

> 
> Sorry, I didn't grok that when writing my previous mail in this thread.
> I thought kupdate was the one that writes ReiserFS transactions, but
> that's kreiserfsd, I think.

Well, kreiserfsd usually writes the log blocks, when a transaction ends on
its own (full, or too old).  If someone requests a synchronous commit
(fsync, any caller of write_super), that process writes the log blocks
themselves.  So, kupdated ends up writing log blocks too.

Plus, reiserfs uses write_super as a trigger for metadata writeback, so
kupdate writes those blocks as well.

-chris


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
  2001-09-21 13:26 ` Matthias Andree
  2001-09-22 18:38   ` Beau Kuiper
@ 2001-09-24 14:37   ` Stephen C. Tweedie
  1 sibling, 0 replies; 14+ messages in thread
From: Stephen C. Tweedie @ 2001-09-24 14:37 UTC (permalink / raw)
  To: Beau Kuiper, linux-kernel, tovarlds, Stephen Tweedie

Hi,

On Fri, Sep 21, 2001 at 03:26:27PM +0200, Matthias Andree wrote:

> Be careful! MTAs rely on this behaviour on fsync(). The official
> consensus on ReiserFS and ext3 on current Linux 2.4.x kernels (x >= 9)
> is that "any synchronous operation flushes all pending operations", and
> if that is changed, you MUST make sure that the changed ReiserFS/ext3fs
> still make all the guarantees that softupdated BSD file systems make,
> lest you want people to run their mail queues off "sync" disks.

Reiserfs and ext3 have their own IO ordering --- they don't commit
transactions until the log writes for _all_ of the blocks in those
transactions have been acknowledged.  Reordering outstanding IOs won't
affect the fsync guarantees at all.

--Stephen

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes
@ 2001-09-20 17:23 Chris Mason
  0 siblings, 0 replies; 14+ messages in thread
From: Chris Mason @ 2001-09-20 17:23 UTC (permalink / raw)
  To: Beau Kuiper; +Cc: linux-kernel



On Thursday, September 20, 2001 11:20:26 PM +0800 Beau Kuiper <kuib-kl@ljbc.wa.edu.au> wrote:

> I don't think that this could happen until 2.5.x though, as either
> solution touches every file system. However, if we added an extra methed,
> we could do this while only slightly touching the other filesystems (where
> kupdated sync == real sync) Simply see if the method exists (is non-null)
> and call that instead with a kupdate sync instead of the normal
> super_sync. Are you interested in me writing a patch to do this?

If the patch below doesn't fix things for you, I've got another one
that adds a new super_operation.

> 
> 
>> 
>> It is possible to get almost the same behaviour as 2.2.x by changing the
>> metadata sync interval in bdflush to 30 seconds.
>> 
> 
> But then kupdate doesn't flush normal data as regularly as it should, plus
> it is almost as messy as Patch 1 :-)

Tuning through /proc/sys/vm/bdflush is much nicer than doing a
strcmp on the current process name.

The real problem is probably the way commits are done.  When there 
are other FS writers, the commits are done outside the journal
lock, but if you hit things at just the wrong time, you might
trigger writing the commit blocks inside the lock, which slows
things down.  The patch below tries harder to keep log block io outside
the journal lock.

This way you can play with the kupdate interval to find the right
balance for your workload.

> 
>> > 
>> > Patch 2
>> > 
>> > This patch implements a simple mechinism to ensure that each superblock
>> > only gets told to be flushed once. With reiserfs and the first patch,
>> > the superblock is still dirty after being told to sync (probably
>> > becasue it doesn't want to write out the entire journal every 5
>> > seconds when kupdate calls it). This caused an infinite loop because
>> > sync_supers would always find the reiserfs superblock dirty when
>> > called from kupdated. I am not convinced that this patch is the best
>> > one for this problem (suggestions?)
>> 
>> It is ok to leave the superblock dirty, after all, since the commit
>> wasn't done, the super is still dirty.  If the checks from
>> reiserfs_write_super are actually slowing things down, then it is
>> probably best to fix the checks.
> 
> I meant, there might be better wway to prevent the endless loop than
> adding an extra field to the superblock data structure. I beleive (I
> havn't explored reiserfs code much) the slowdown is caused by the journal
> being synced with the superblock, thus causing:

The dirty bit is cleared by reiserfs_write_super when something has
actually been written to disk.  When reiserfs_write_super doesn't
do the commit, it leaves the super dirty, which is important because
sys_sync and other people rely on the dirty bit.

> 
> 1) Too much contention for disk resources.
> 2) A huge increase in the number of times programs must be suspended to
> wait for the disk
> 3) Poor CPU utilization in code that uses the filesystem regularly (like
> compiling)

The way to avoid these is to not call write_super until you really
want to see things on disk ;-)

> Patch 1 makes a huge difference because it stops reiserfs from reacting
> badly on a kupdated.

-chris

--- linux/fs/reiserfs/journal.c	Sat Sep  8 08:05:32 2001
+++ linux/fs/reiserfs/journal.c	Thu Sep 20 13:15:04 2001
@@ -2872,17 +2872,12 @@
   /* write any buffers that must hit disk before this commit is done */
   fsync_inode_buffers(&(SB_JOURNAL(p_s_sb)->j_dummy_inode)) ;
 
-  /* honor the flush and async wishes from the caller */
+  /* honor the flush wishes from the caller, simple commits can
+  ** be done outside the journal lock, they are done below
+  */
   if (flush) {
-  
     flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
     flush_journal_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;  
-  } else if (commit_now) {
-    if (wait_on_commit) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-    } else {
-      commit_flush_async(p_s_sb, orig_jindex) ; 
-    }
   }
 
   /* reset journal values for the next transaction */
@@ -2944,6 +2939,16 @@
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
   /* wake up any body waiting to join. */
   wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
+  
+  if (!flush && commit_now) {
+    if (current->need_resched)
+      schedule() ;
+    if (wait_on_commit) {
+      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
+    } else {
+      commit_flush_async(p_s_sb, orig_jindex) ; 
+    }
+  }
   return 0 ;
 }
 




^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2001-09-24 14:37 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-09-20  7:12 [PATCH] Significant performace improvements on reiserfs systems, kupdated bugfixes Beau Kuiper
2001-09-20 12:16 ` Chris Mason
2001-09-20 15:20   ` Beau Kuiper
2001-09-20 16:15     ` Andreas Dilger
2001-09-20 16:22       ` Beau Kuiper
2001-09-21 20:12         ` Lehmann 
2001-09-21 20:57           ` Rik van Riel
2001-09-21 22:50             ` Lehmann 
2001-09-21 13:26 ` Matthias Andree
2001-09-22 18:38   ` Beau Kuiper
2001-09-22 19:32     ` Matthias Andree
2001-09-23 15:24     ` Chris Mason
2001-09-24 14:37   ` Stephen C. Tweedie
2001-09-20 17:23 Chris Mason

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).