* [PATCH] concurrent block allocation for ext2 against 2.5.64
@ 2003-03-13  8:55 Alex Tomas
  2003-03-13  9:58 ` Andrew Morton
  2003-03-13 17:39 ` [Ext2-devel] " Andreas Dilger
  0 siblings, 2 replies; 39+ messages in thread
From: Alex Tomas @ 2003-03-13  8:55 UTC (permalink / raw)
  To: linux-kernel; +Cc: ext2-devel, Andrew Morton, Alex Tomas


Hi!

As Andrew said, concurrent balloc for ext3 is useless because of the BKL,
and I saw that in benchmarks. But it may be useful for ext2.

Results:
         9/100000   9/500000   16/100000  16/500000  32/100000  32/500000
ext2:    0m9.260s   0m46.160s  0m18.133s  1m33.553s  0m35.958s  3m4.164s
ext2-ca: 0m8.578s   0m42.712s  0m17.412s  1m28.637s  0m33.736s  2m53.824s

In those benchmarks, I ran two processes; each of them writes N blocks
(9, 16, 32), truncates the file, and repeats these steps M times (100000, 500000).
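
A minimal sketch of what such a benchmark loop might look like (hypothetical:
the actual test program was not posted, and the file name and 4k block size
are assumptions):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096] = { 0 };			/* one fs block */
	int fd = open("testfile", O_CREAT | O_WRONLY, 0644);
	int i, m;

	for (m = 0; m < 100000; m++) {		/* M iterations */
		for (i = 0; i < 9; i++)		/* N blocks per pass */
			write(fd, buf, sizeof(buf));
		ftruncate(fd, 0);		/* truncate and start over */
		lseek(fd, 0, SEEK_SET);
	}
	close(fd);
	return 0;
}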




diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c	Thu Mar 13 10:54:50 2003
@@ -98,9 +98,13 @@
 {
 	struct ext2_sb_info * sbi = EXT2_SB(sb);
 	struct ext2_super_block * es = sbi->s_es;
-	unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-	unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+	unsigned free_blocks;
+	unsigned root_blocks;
 
+	spin_lock(&sbi->s_alloc_lock);
+	
+	free_blocks = le32_to_cpu(es->s_free_blocks_count);
+	root_blocks = le32_to_cpu(es->s_r_blocks_count);	
 	if (free_blocks < count)
 		count = free_blocks;
 
@@ -113,11 +117,16 @@
 		 */
 		if (free_blocks > root_blocks)
 			count = free_blocks - root_blocks;
-		else
+		else {
+			spin_unlock(&sbi->s_alloc_lock);
 			return 0;
+		}
 	}
 
 	es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
+	
+	spin_unlock(&sbi->s_alloc_lock);
+	
 	mark_buffer_dirty(sbi->s_sbh);
 	sb->s_dirt = 1;
 	return count;
@@ -128,35 +137,54 @@
 	if (count) {
 		struct ext2_sb_info * sbi = EXT2_SB(sb);
 		struct ext2_super_block * es = sbi->s_es;
-		unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
+		unsigned free_blocks;
+		
+		spin_lock(&sbi->s_alloc_lock);
+		free_blocks = le32_to_cpu(es->s_free_blocks_count);
 		es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
+		spin_unlock(&sbi->s_alloc_lock);
+		
 		mark_buffer_dirty(sbi->s_sbh);
 		sb->s_dirt = 1;
 	}
 }
 
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_group_desc *desc,
 				    struct buffer_head *bh, int count)
 {
 	unsigned free_blocks;
 
-	if (!desc->bg_free_blocks_count)
+	spin_lock(&sbi->s_alloc_lock);
+	
+	if (!desc->bg_free_blocks_count) {
+		 spin_unlock(&sbi->s_alloc_lock);
 		return 0;
+	}
 
 	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 	if (free_blocks < count)
 		count = free_blocks;
 	desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+	
+	spin_unlock(&sbi->s_alloc_lock);
+
 	mark_buffer_dirty(bh);
 	return count;
 }
 
-static inline void group_release_blocks(struct ext2_group_desc *desc,
+static inline void group_release_blocks(struct ext2_sb_info *sbi, struct ext2_group_desc *desc,
 				    struct buffer_head *bh, int count)
 {
 	if (count) {
-		unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+		unsigned free_blocks;
+		
+		spin_lock(&sbi->s_alloc_lock);
+		
+		free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+		
+		spin_unlock(&sbi->s_alloc_lock);
+		
 		mark_buffer_dirty(bh);
 	}
 }
@@ -176,7 +204,6 @@
 	struct ext2_super_block * es;
 	unsigned freed = 0, group_freed;
 
-	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
@@ -224,7 +251,7 @@
 			    block, count);
 
 	for (i = 0, group_freed = 0; i < count; i++) {
-		if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+		if (!test_and_clear_bit(bit + i, (void *) bitmap_bh->b_data))
 			ext2_error (sb, "ext2_free_blocks",
 				      "bit already cleared for block %lu",
 				      block + i);
@@ -236,7 +263,7 @@
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
-	group_release_blocks(desc, bh2, group_freed);
+	group_release_blocks(EXT2_SB(sb), desc, bh2, group_freed);
 	freed += group_freed;
 
 	if (overflow) {
@@ -247,7 +274,6 @@
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	unlock_super (sb);
 	DQUOT_FREE_BLOCK(inode, freed);
 }
 
@@ -258,6 +284,8 @@
 
 	if (!ext2_test_bit(goal, map))
 		goto got_it;
+
+repeat:
 	if (goal) {
 		/*
 		 * The goal was occupied; search forward for a free 
@@ -297,7 +325,8 @@
 	}
 	return -1;
 got_it:
-	ext2_set_bit(goal, map);
+	if (test_and_set_bit(goal, (void *) map)) 
+		goto repeat;	
 	return goal;
 }
 
@@ -342,8 +371,6 @@
 
 	dq_alloc = prealloc_goal + 1;
 
-	lock_super (sb);
-
 	es_alloc = reserve_blocks(sb, dq_alloc);
 	if (!es_alloc) {
 		*err = -ENOSPC;
@@ -360,7 +387,7 @@
 	if (!desc)
 		goto io_error;
 
-	group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+	group_alloc = group_reserve_blocks(sbi, desc, gdp_bh, es_alloc);
 	if (group_alloc) {
 		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 					group_size);
@@ -375,7 +402,7 @@
 				group_size, ret_block);
 		if (ret_block >= 0)
 			goto got_block;
-		group_release_blocks(desc, gdp_bh, group_alloc);
+		group_release_blocks(sbi, desc, gdp_bh, group_alloc);
 		group_alloc = 0;
 	}
 
@@ -393,7 +420,7 @@
 		desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
 		if (!desc)
 			goto io_error;
-		group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+		group_alloc = group_reserve_blocks(sbi, desc, gdp_bh, es_alloc);
 	}
 	if (bit >= sbi->s_groups_count) {
 		*err = -ENOSPC;
@@ -452,7 +479,7 @@
 		unsigned n;
 
 		for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
-			if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+			if (test_and_set_bit(ret_block, (void *) bitmap_bh->b_data))
  				break;
 		}
 		*prealloc_block = block + 1;
@@ -471,10 +498,9 @@
 
 	*err = 0;
 out_release:
-	group_release_blocks(desc, gdp_bh, group_alloc);
+	group_release_blocks(sbi, desc, gdp_bh, group_alloc);
 	release_blocks(sb, es_alloc);
 out_unlock:
-	unlock_super (sb);
 	DQUOT_FREE_BLOCK(inode, dq_alloc);
 out:
 	brelse(bitmap_bh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c	Wed Mar 12 23:29:53 2003
@@ -564,6 +564,7 @@
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
 	memset(sbi, 0, sizeof(*sbi));
+	spin_lock_init(&sbi->s_alloc_lock);
 
 	/*
 	 * See what the current blocksize for the device is, and
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h	Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h	Wed Mar 12 22:57:30 2003
@@ -45,6 +45,7 @@
 	u32 s_next_generation;
 	unsigned long s_dir_count;
 	u8 *s_debts;
+	spinlock_t s_alloc_lock;
 };
 
 #endif	/* _LINUX_EXT2_FS_SB */



* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13  8:55 [PATCH] concurrent block allocation for ext2 against 2.5.64 Alex Tomas
@ 2003-03-13  9:58 ` Andrew Morton
  2003-03-13 19:17   ` Alex Tomas
  2003-03-13 17:39 ` [Ext2-devel] " Andreas Dilger
  1 sibling, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2003-03-13  9:58 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel, bzzz

Alex Tomas <bzzz@tmi.comex.ru> wrote:
>
> 
> Hi!
> 
> As Andrew said, concurrent balloc for ext3 is useless because of the BKL,
> and I saw that in benchmarks. But it may be useful for ext2.
> 
> Results:
>          9/100000   9/500000   16/100000  16/500000  32/100000  32/500000
> ext2:    0m9.260s   0m46.160s  0m18.133s  1m33.553s  0m35.958s  3m4.164s
> ext2-ca: 0m8.578s   0m42.712s  0m17.412s  1m28.637s  0m33.736s  2m53.824s
> 
> In those benchmarks, I ran two processes; each of them writes N blocks
> (9, 16, 32), truncates the file, and repeats these steps M times (100000, 500000).

OK.  The main gain here is from avoiding the large context-switch rate which
lock_super() can cause on big machines.

> -		if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
> +		if (!test_and_clear_bit(bit + i, (void *) bitmap_bh->b_data))

Nope.

This is an on-disk bitmap.  ext2_clear_bit() is endian-neutral - see the
ppc/ppc64/mips/etc implementations.  The code you have here will not work on
big-endian architectures.

We either need to create per-architecture atomic implementations of
ext2_foo_bit(), or use the existing ones under spinlock.

You could do:

int bzzz_set_bit(struct ext2_bg_info *bgi, void *addr, int bit)
{
#if __BIG_ENDIAN
	int ret;

	spin_lock(&bgi->s_alloc_lock);
	ret = ext2_set_bit(bit, addr);
	spin_unlock(&bgi->s_alloc_lock);
	return ret;
#else
	return test_and_set_bit(bit, addr);
#endif
}

I think that will work...

> @@ -45,6 +45,7 @@
>  	u32 s_next_generation;
>  	unsigned long s_dir_count;
>  	u8 *s_debts;
> +	spinlock_t s_alloc_lock;
>  };

You can do better than this.  A spinlock per blockgroup will scale better,
and is pretty easy.

See that s_debts thing?  That points to an array of bytes, one per
blockgroup.  Turn it into:

	struct ext2_bg_info {
		u8 s_debt;
		spinlock_t s_alloc_lock;
	};

And the locking can become per-blockgroup.

The problem with this is the fs-wide s_free_blocks_count thing.  It needs
global locking.  But do we need it?

If you look, you'll see that it's not really used for much.  When we report the
free block count to userspace you can just locklessly zoom across all the
blockgroups, adding them up.  You'll have to do the same in
find_group_orlov(), which is a bit sucky, but that's only used by mkdir.
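
For illustration, such a lockless walk could look like this (a sketch; it is
essentially what the follow-up patch later in this thread puts into
ext2_count_free_blocks()):

unsigned long ext2_count_free_blocks(struct super_block *sb)
{
	struct ext2_group_desc *desc;
	unsigned long count = 0;
	int i;

	/* no global lock: the per-group counters are only summed up here */
	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
		desc = ext2_get_group_desc(sb, i, NULL);
		if (desc)
			count += le16_to_cpu(desc->bg_free_blocks_count);
	}
	return count;
}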

The only thing left which needs the global free blocks counter is the
"reserved blocks for root" thing, which doesn't work very well anyway.  A way
to fix that would be to add a "reserved to root" field to ext2_bg_info, and
to precalculate these at mount time.

So the mount code walks across the blockgroups reserving blocks in each one
until it has reserved the required number of blocks.  This way the for-root
reservation becomes per-block-group.  It should only be dipped into if all
blockgroups are otherwise full.
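
For illustration, a rough sketch of that mount-time walk (hypothetical: the
s_bgi array and its per-group "reserved" field do not exist yet at this point
in the thread):

static void ext2_distribute_reserved(struct super_block *sb)
{
	struct ext2_sb_info *sbi = EXT2_SB(sb);
	unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count);
	int i;

	/* give each group a share of the reservation until none is left */
	for (i = 0; i < sbi->s_groups_count && reserved; i++) {
		struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL);
		unsigned int free, take;

		if (!gdp)
			continue;
		free = le16_to_cpu(gdp->bg_free_blocks_count);
		take = reserved < free ? reserved : free;
		sbi->s_bgi[i].reserved = take;	/* hypothetical field */
		reserved -= take;
	}
}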

Or something like that ;)




* Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13  8:55 [PATCH] concurrent block allocation for ext2 against 2.5.64 Alex Tomas
  2003-03-13  9:58 ` Andrew Morton
@ 2003-03-13 17:39 ` Andreas Dilger
  2003-03-13 18:43   ` Alex Tomas
  2003-03-13 19:23   ` Theodore Ts'o
  1 sibling, 2 replies; 39+ messages in thread
From: Andreas Dilger @ 2003-03-13 17:39 UTC (permalink / raw)
  To: Alex Tomas; +Cc: linux-kernel, ext2-devel, Andrew Morton

On Mar 13, 2003  11:55 +0300, Alex Tomas wrote:
> As Andrew said, concurrent balloc for ext3 is useless because of the BKL,
> and I saw that in benchmarks. But it may be useful for ext2.

Sadly, we are constantly diverging the ext2/ext3 codebases.  Lots of
features are going into ext3, but lots of fixes/improvements are only
going into ext2.  Is ext3 holding BKL for doing journal_start() still?

Looking at ext3_prepare_write() we grab the BKL for doing journal_start()
and for journal_stop(), but I don't _think_ we need BKL for journal_stop()
do we?  We may or may not need it for the journal_data case, but that is
not even working right now I think.

It also seems we are getting BKL in ext3_truncate(), which likely isn't
needed past journal_start(), although we do need to have superblock-only
lock for ext3_orphan_add/del.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/



* Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 17:39 ` [Ext2-devel] " Andreas Dilger
@ 2003-03-13 18:43   ` Alex Tomas
  2003-03-13 19:09     ` Matthew Wilcox
  2003-03-13 19:23   ` Theodore Ts'o
  1 sibling, 1 reply; 39+ messages in thread
From: Alex Tomas @ 2003-03-13 18:43 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Alex Tomas, linux-kernel, ext2-devel, Andrew Morton


fs/attr.c:
        if (ia_valid & ATTR_SIZE) {
                if (attr->ia_size == inode->i_size) {
                        if (ia_valid == ATTR_SIZE)
                                goto out;       /* we can skip lock_kernel() */
                } else {
                        lock_kernel();
                        error = vmtruncate(inode, attr->ia_size);
                        unlock_kernel();
                        if (error)
                                goto out;
                }
        }

so, all (!) truncates are serialized

>>>>> Andreas Dilger (AD) writes:

 AD> On Mar 13, 2003 11:55 +0300, Alex Tomas wrote:
 >> As Andrew said, concurrent balloc for ext3 is useless because of the
 >> BKL, and I saw that in benchmarks. But it may be useful for ext2.

 AD> Sadly, we are constantly diverging the ext2/ext3 codebases.  Lots
 AD> of features are going into ext3, but lots of fixes/improvements
 AD> are only going into ext2.  Is ext3 holding BKL for doing
 AD> journal_start() still?

 AD> Looking at ext3_prepare_write() we grab the BKL for doing
 AD> journal_start() and for journal_stop(), but I don't _think_ we
 AD> need BKL for journal_stop() do we?  We may or may not need it for
 AD> the journal_data case, but that is not even working right now I
 AD> think.

 AD> It also seems we are getting BKL in ext3_truncate(), which likely
 AD> isn't needed past journal_start(), although we do need to have
 AD> superblock-only lock for ext3_orphan_add/del.





* Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 18:43   ` Alex Tomas
@ 2003-03-13 19:09     ` Matthew Wilcox
  2003-03-13 19:39       ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: Matthew Wilcox @ 2003-03-13 19:09 UTC (permalink / raw)
  To: Alex Tomas; +Cc: Andreas Dilger, linux-kernel, ext2-devel, Andrew Morton

On Thu, Mar 13, 2003 at 09:43:05PM +0300, Alex Tomas wrote:
> 
> fs/attr.c:
>         if (ia_valid & ATTR_SIZE) {
>                 if (attr->ia_size == inode->i_size) {
>                         if (ia_valid == ATTR_SIZE)
>                                 goto out;       /* we can skip lock_kernel() */
>                 } else {
>                         lock_kernel();
>                         error = vmtruncate(inode, attr->ia_size);
>                         unlock_kernel();
>                         if (error)
>                                 goto out;
>                 }
>         }
> 
> so, all (!) truncates are serialized

This looks like a bug.  It should be safe to delete those lock_kernel() calls.  Rationale:

 - Documentation/filesystems/Locking says ->truncate is called without the BKL.
 - This isn't the only place vmtruncate() is called.  Several of the callers
   do it without the BKL (eg xfs, cifs).
 - vmtruncate() appears to handle its own locking (mapping->i_shared_sem)
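
With the BKL calls gone, the whole block would collapse to something like
this (a sketch; the size-unchanged fast path, which existed only to skip the
BKL, is dropped too):

        if (ia_valid & ATTR_SIZE) {
                if (attr->ia_size != inode->i_size) {
                        error = vmtruncate(inode, attr->ia_size);
                        if (error)
                                goto out;
                }
        }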

Comments?

-- 
"It's not Hollywood.  War is real, war is primarily not about defeat or
victory, it is about death.  I've seen thousands and thousands of dead bodies.
Do you think I want to have an academic debate on this subject?" -- Robert Fisk


* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13  9:58 ` Andrew Morton
@ 2003-03-13 19:17   ` Alex Tomas
  2003-03-13 22:25     ` Andrew Morton
  2003-03-13 23:56     ` Andreas Dilger
  0 siblings, 2 replies; 39+ messages in thread
From: Alex Tomas @ 2003-03-13 19:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Alex Tomas, linux-kernel, ext2-devel


Hi!

Here is the new version of the patch. Changes since the last one:
1) New primitives ext2_set_bit_atomic and ext2_clear_bit_atomic have been introduced.
   The primitives take an additional spinlock * parameter and are defined for every
   arch; each arch either uses the atomic test_and_set_bit/test_and_clear_bit or uses
   ext2_set_bit and ext2_clear_bit serialized by this lock.
2) Each group has its own spinlock, which is used for group counter modifications and
   may be used to implement ext2_set_bit_atomic/ext2_clear_bit_atomic.
3) sb->s_free_blocks_count isn't used any more. ext2_statfs() and find_group_orlov()
   loop over the groups to count free blocks.
4) sb->s_free_blocks_count is recalculated at mount/umount/sync_super time in order
   to check consistency and to avoid fsck warnings.
5) Reserved blocks are distributed over all groups at mount time.
6) ext2_new_block() first tries to use non-reserved blocks and, if that fails, tries
   to use reserved blocks.
7) ext2_new_block() and ext2_free_blocks() do not modify sb->s_free_blocks, and
   therefore do not call mark_buffer_dirty() for the superblock's buffer_head. I
   think this may reduce I/O a bit.

Thanks to Andrew for the idea.


diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c	Thu Mar 13 21:20:16 2003
@@ -94,69 +94,62 @@
 	return bh;
 }
 
-static inline int reserve_blocks(struct super_block *sb, int count)
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi, 
+					struct ext2_group_desc *desc,
+					struct buffer_head *bh, int count, int use_reserve)
 {
-	struct ext2_sb_info * sbi = EXT2_SB(sb);
-	struct ext2_super_block * es = sbi->s_es;
-	unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-	unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+	unsigned free_blocks;
+	unsigned root_blocks;
 
+	spin_lock(&bgi->alloc_lock);
+	
+	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 	if (free_blocks < count)
 		count = free_blocks;
+	root_blocks = bgi->reserved;
 
-	if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
-	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		/*
-		 * We are too close to reserve and we are not privileged.
-		 * Can we allocate anything at all?
-		 */
-		if (free_blocks > root_blocks)
-			count = free_blocks - root_blocks;
-		else
-			return 0;
+	if (free_blocks < root_blocks && !use_reserve) {
+		/* don't use reserved blocks */
+		spin_unlock(&bgi->alloc_lock);
+		return 0;
 	}
-
-	es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
-	mark_buffer_dirty(sbi->s_sbh);
-	sb->s_dirt = 1;
-	return count;
-}
-
-static inline void release_blocks(struct super_block *sb, int count)
-{
-	if (count) {
-		struct ext2_sb_info * sbi = EXT2_SB(sb);
-		struct ext2_super_block * es = sbi->s_es;
-		unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-		es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
-		mark_buffer_dirty(sbi->s_sbh);
-		sb->s_dirt = 1;
+	
+        if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
+            sbi->s_resuid != current->fsuid &&
+            (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+                /*
+                 * We are too close to reserve and we are not privileged.
+                 * Can we allocate anything at all?
+                 */
+                if (free_blocks > root_blocks)
+                        count = free_blocks - root_blocks;
+                else {
+			spin_unlock(&bgi->alloc_lock);
+                        return 0;
+		}
 	}
-}
-
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
-				    struct buffer_head *bh, int count)
-{
-	unsigned free_blocks;
-
-	if (!desc->bg_free_blocks_count)
-		return 0;
-
-	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
-	if (free_blocks < count)
-		count = free_blocks;
 	desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+	
+	spin_unlock(&bgi->alloc_lock);
+
 	mark_buffer_dirty(bh);
 	return count;
 }
 
-static inline void group_release_blocks(struct ext2_group_desc *desc,
-				    struct buffer_head *bh, int count)
+static inline void group_release_blocks(struct ext2_bg_info *bgi,
+					struct ext2_group_desc *desc,
+					struct buffer_head *bh, int count)
 {
 	if (count) {
-		unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+		unsigned free_blocks;
+		
+		spin_lock(&bgi->alloc_lock);
+		
+		free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+		
+		spin_unlock(&bgi->alloc_lock);
+		
 		mark_buffer_dirty(bh);
 	}
 }
@@ -172,12 +165,11 @@
 	unsigned long i;
 	unsigned long overflow;
 	struct super_block * sb = inode->i_sb;
+	struct ext2_sb_info * sbi = EXT2_SB(sb);
 	struct ext2_group_desc * desc;
-	struct ext2_super_block * es;
+	struct ext2_super_block * es = sbi->s_es;
 	unsigned freed = 0, group_freed;
 
-	lock_super (sb);
-	es = EXT2_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > le32_to_cpu(es->s_blocks_count)) {
@@ -215,16 +207,17 @@
 	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
 	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
 	    in_range (block, le32_to_cpu(desc->bg_inode_table),
-		      EXT2_SB(sb)->s_itb_per_group) ||
+		      sbi->s_itb_per_group) ||
 	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
-		      EXT2_SB(sb)->s_itb_per_group))
+		      sbi->s_itb_per_group))
 		ext2_error (sb, "ext2_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %lu, count = %lu",
 			    block, count);
 
 	for (i = 0, group_freed = 0; i < count; i++) {
-		if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+		if (!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].alloc_lock,
+					bit + i, (void *) bitmap_bh->b_data))
 			ext2_error (sb, "ext2_free_blocks",
 				      "bit already cleared for block %lu",
 				      block + i);
@@ -236,7 +229,7 @@
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
-	group_release_blocks(desc, bh2, group_freed);
+	group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed);
 	freed += group_freed;
 
 	if (overflow) {
@@ -246,18 +239,18 @@
 	}
 error_return:
 	brelse(bitmap_bh);
-	release_blocks(sb, freed);
-	unlock_super (sb);
 	DQUOT_FREE_BLOCK(inode, freed);
 }
 
-static int grab_block(char *map, unsigned size, int goal)
+static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal)
 {
 	int k;
 	char *p, *r;
 
 	if (!ext2_test_bit(goal, map))
 		goto got_it;
+
+repeat:
 	if (goal) {
 		/*
 		 * The goal was occupied; search forward for a free 
@@ -297,7 +290,8 @@
 	}
 	return -1;
 got_it:
-	ext2_set_bit(goal, map);
+	if (ext2_set_bit_atomic(lock, goal, (void *) map)) 
+		goto repeat;	
 	return goal;
 }
 
@@ -319,7 +313,7 @@
 	int ret_block;			/* j */
 	int bit;		/* k */
 	int target_block;		/* tmp */
-	int block = 0;
+	int block = 0, use_reserve = 0;
 	struct super_block *sb = inode->i_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
@@ -341,14 +335,7 @@
 		prealloc_goal--;
 
 	dq_alloc = prealloc_goal + 1;
-
-	lock_super (sb);
-
-	es_alloc = reserve_blocks(sb, dq_alloc);
-	if (!es_alloc) {
-		*err = -ENOSPC;
-		goto out_unlock;
-	}
+	es_alloc = dq_alloc;
 
 	ext2_debug ("goal=%lu.\n", goal);
 
@@ -360,7 +347,8 @@
 	if (!desc)
 		goto io_error;
 
-	group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+	group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+					desc, gdp_bh, es_alloc, 0);
 	if (group_alloc) {
 		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 					group_size);
@@ -371,11 +359,12 @@
 		
 		ext2_debug("goal is at %d:%d.\n", group_no, ret_block);
 
-		ret_block = grab_block(bitmap_bh->b_data,
+		ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+				bitmap_bh->b_data,
 				group_size, ret_block);
 		if (ret_block >= 0)
 			goto got_block;
-		group_release_blocks(desc, gdp_bh, group_alloc);
+		group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
 		group_alloc = 0;
 	}
 
@@ -385,6 +374,7 @@
 	 * Now search the rest of the groups.  We assume that 
 	 * i and desc correctly point to the last group visited.
 	 */
+repeat:
 	for (bit = 0; !group_alloc &&
 			bit < sbi->s_groups_count; bit++) {
 		group_no++;
@@ -393,7 +383,16 @@
 		desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
 		if (!desc)
 			goto io_error;
-		group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+		group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+						desc, gdp_bh, es_alloc, use_reserve);
+	}
+	if (!use_reserve) {
+		/* first time we did not try to allocate
+		 * reserved blocks. now it looks like
+		 * no more non-reserved blocks left. we
+		 * will try to allocate reserved blocks -bzzz */
+		use_reserve = 1;
+		goto repeat;
 	}
 	if (bit >= sbi->s_groups_count) {
 		*err = -ENOSPC;
@@ -404,13 +403,11 @@
 	if (!bitmap_bh)
 		goto io_error;
 
-	ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
+	ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+			bitmap_bh->b_data, group_size, 0);
 	if (ret_block < 0) {
-		ext2_error (sb, "ext2_new_block",
-			"Free blocks count corrupted for block group %d",
-				group_no);
 		group_alloc = 0;
-		goto io_error;
+		goto repeat;	
 	}
 
 got_block:
@@ -452,7 +449,8 @@
 		unsigned n;
 
 		for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
-			if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+			if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].alloc_lock,
+						ret_block, (void*) bitmap_bh->b_data))
  				break;
 		}
 		*prealloc_block = block + 1;
@@ -471,10 +469,7 @@
 
 	*err = 0;
 out_release:
-	group_release_blocks(desc, gdp_bh, group_alloc);
-	release_blocks(sb, es_alloc);
-out_unlock:
-	unlock_super (sb);
+	group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
 	DQUOT_FREE_BLOCK(inode, dq_alloc);
 out:
 	brelse(bitmap_bh);
@@ -487,11 +482,11 @@
 
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
-#ifdef EXT2FS_DEBUG
-	struct ext2_super_block * es;
-	unsigned long desc_count, bitmap_count, x;
 	struct ext2_group_desc * desc;
+	unsigned long desc_count = 0;
 	int i;
+#ifdef EXT2FS_DEBUG
+	unsigned long bitmap_count, x;
 	
 	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
@@ -519,7 +514,13 @@
 	unlock_super (sb);
 	return bitmap_count;
 #else
-	return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
+        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+                desc = ext2_get_group_desc (sb, i, NULL);
+                if (!desc)
+                        continue;
+                desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+	}
+	return desc_count;
 #endif
 }
 
diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c	Mon Mar 10 14:52:34 2003
+++ edited/fs/ext2/ialloc.c	Thu Mar 13 20:08:58 2003
@@ -278,7 +278,8 @@
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
 	int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
-	int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+	int free_blocks = ext2_count_free_blocks(sb);
+	int avefreeb = free_blocks / ngroups;
 	int blocks_per_dir;
 	int ndirs = sbi->s_dir_count;
 	int max_debt, max_dirs, min_blocks, min_inodes;
@@ -320,8 +321,7 @@
 		goto fallback;
 	}
 
-	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
-			  le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs;
 
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
 	min_inodes = avefreei - inodes_per_group / 4;
@@ -340,7 +340,7 @@
 		desc = ext2_get_group_desc (sb, group, &bh);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
-		if (sbi->s_debts[group] >= max_debt)
+		if (sbi->s_bgi[group].debts >= max_debt)
 			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
 			continue;
@@ -501,11 +501,11 @@
 		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
 
 	if (S_ISDIR(mode)) {
-		if (EXT2_SB(sb)->s_debts[group] < 255)
-			EXT2_SB(sb)->s_debts[group]++;
+		if (EXT2_SB(sb)->s_bgi[group].debts < 255)
+			EXT2_SB(sb)->s_bgi[group].debts++;
 	} else {
-		if (EXT2_SB(sb)->s_debts[group])
-			EXT2_SB(sb)->s_debts[group]--;
+		if (EXT2_SB(sb)->s_bgi[group].debts)
+			EXT2_SB(sb)->s_bgi[group].debts--;
 	}
 
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c	Thu Mar 13 17:34:35 2003
@@ -141,7 +141,7 @@
 		if (sbi->s_group_desc[i])
 			brelse (sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_debts);
+	kfree(sbi->s_bgi);
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -464,8 +464,11 @@
 	int i;
 	int desc_block = 0;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	struct ext2_super_block * es = sbi->s_es;
+	unsigned long block = le32_to_cpu(es->s_first_data_block);
 	struct ext2_group_desc * gdp = NULL;
+	unsigned int total_free = 0;
+	unsigned int reserved = le32_to_cpu(es->s_r_blocks_count);
 
 	ext2_debug ("Checking group descriptors");
 
@@ -504,6 +507,41 @@
 		block += EXT2_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
+	
+	/* restore free blocks counter in SB -bzzz */
+	total_free = ext2_count_free_blocks(sb);
+	if (le32_to_cpu(es->s_free_blocks_count) != total_free)
+		printk(KERN_INFO "EXT2-fs: last umount wasn't clean. correct free blocks counter\n");
+	es->s_free_blocks_count = cpu_to_le32(total_free);
+
+	/* distribute reserved blocks over groups -bzzz */
+	while (reserved && total_free) {
+		unsigned int per_group = reserved / sbi->s_groups_count + 1;
+		unsigned int free;
+	
+		for (i = 0; reserved && i < sbi->s_groups_count; i++) {
+			gdp = ext2_get_group_desc (sb, i, NULL);
+			if (!gdp) {
+				ext2_error (sb, "ext2_check_descriptors",
+						"can't get descriptor for group #%d", i);
+				return 0;
+			}
+			
+			free = le16_to_cpu(gdp->bg_free_blocks_count);
+			if (per_group > free)
+				per_group = free;
+			if (per_group > reserved)
+				per_group = reserved;
+			sbi->s_bgi[i].reserved += per_group;
+			reserved -= per_group;
+			total_free -= per_group;
+
+			/* correct per-group approximation */
+			if (i < sbi->s_groups_count - i)
+				per_group = reserved / (sbi->s_groups_count - i - 1) + 1;
+		}
+	}
+	
 	return 1;
 }
 
@@ -768,13 +806,17 @@
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+	sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info),
 			       GFP_KERNEL);
-	if (!sbi->s_debts) {
+	if (!sbi->s_bgi) {
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount_group_desc;
 	}
-	memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		sbi->s_bgi[i].debts = 0;
+		sbi->s_bgi[i].reserved = 0;
+		spin_lock_init(&sbi->s_bgi[i].alloc_lock);
+	}
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -820,8 +862,8 @@
 		brelse(sbi->s_group_desc[i]);
 failed_mount_group_desc:
 	kfree(sbi->s_group_desc);
-	if (sbi->s_debts)
-		kfree(sbi->s_debts);
+	if (sbi->s_bgi)
+		kfree(sbi->s_bgi);
 failed_mount:
 	brelse(bh);
 failed_sbi:
@@ -840,6 +882,7 @@
 
 static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 {
+	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -868,6 +911,7 @@
 			ext2_debug ("setting valid to 0\n");
 			es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
 						  ~EXT2_VALID_FS);
+			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 			es->s_mtime = cpu_to_le32(get_seconds());
 			ext2_sync_super(sb, es);
 		} else
@@ -929,7 +973,8 @@
 static int ext2_statfs (struct super_block * sb, struct statfs * buf)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long overhead;
+	unsigned long overhead, total_free = 0;
+	struct ext2_group_desc *desc;
 	int i;
 
 	if (test_opt (sb, MINIX_DF))
@@ -950,9 +995,14 @@
 		 * block group descriptors.  If the sparse superblocks
 		 * feature is turned on, then not all groups have this.
 		 */
-		for (i = 0; i < sbi->s_groups_count; i++)
+		for (i = 0; i < sbi->s_groups_count; i++) {
 			overhead += ext2_bg_has_super(sb, i) +
 				ext2_bg_num_gdb(sb, i);
+			
+			/* sum total free blocks -bzzz */
+			desc = ext2_get_group_desc (sb, i, NULL);
+			total_free += le16_to_cpu(desc->bg_free_blocks_count);
+		}
 
 		/*
 		 * Every block group has an inode bitmap, a block
@@ -965,7 +1015,7 @@
 	buf->f_type = EXT2_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
-	buf->f_bfree = ext2_count_free_blocks (sb);
+	buf->f_bfree = total_free;
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
 		buf->f_bavail = 0;
diff -uNr linux/include/asm-alpha/bitops.h edited/include/asm-alpha/bitops.h
--- linux/include/asm-alpha/bitops.h	Mon Mar 10 14:52:36 2003
+++ edited/include/asm-alpha/bitops.h	Thu Mar 13 14:10:18 2003
@@ -487,7 +487,9 @@
 
 
 #define ext2_set_bit                 __test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               __test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-arm/bitops.h edited/include/asm-arm/bitops.h
--- linux/include/asm-arm/bitops.h	Mon Mar 10 14:52:36 2003
+++ edited/include/asm-arm/bitops.h	Thu Mar 13 14:10:46 2003
@@ -357,8 +357,12 @@
  */
 #define ext2_set_bit(nr,p)			\
 		__test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_set_bit_atomic(lock,nr,p)          \
+                test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_clear_bit(nr,p)			\
 		__test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_clear_bit_atomic(lock,nr,p)        \
+                test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_test_bit(nr,p)			\
 		__test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_find_first_zero_bit(p,sz)		\
diff -uNr linux/include/asm-cris/bitops.h edited/include/asm-cris/bitops.h
--- linux/include/asm-cris/bitops.h	Mon Nov 11 06:28:30 2002
+++ edited/include/asm-cris/bitops.h	Thu Mar 13 14:11:15 2003
@@ -360,7 +360,9 @@
 #define hweight8(x) generic_hweight8(x)
 
 #define ext2_set_bit                 test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-i386/bitops.h edited/include/asm-i386/bitops.h
--- linux/include/asm-i386/bitops.h	Wed Dec 25 06:03:08 2002
+++ edited/include/asm-i386/bitops.h	Thu Mar 13 14:11:32 2003
@@ -479,8 +479,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+        test_and_set_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr, addr) \
+	        test_and_clear_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-ia64/bitops.h edited/include/asm-ia64/bitops.h
--- linux/include/asm-ia64/bitops.h	Thu Feb 20 16:18:21 2003
+++ edited/include/asm-ia64/bitops.h	Thu Mar 13 14:12:50 2003
@@ -453,7 +453,9 @@
 #define __clear_bit(nr, addr)        clear_bit(nr, addr)
 
 #define ext2_set_bit                 test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)    test_and_set_bit(n,a)
 #define ext2_clear_bit               test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a)  test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-m68k/bitops.h edited/include/asm-m68k/bitops.h
--- linux/include/asm-m68k/bitops.h	Mon Nov 11 06:28:33 2002
+++ edited/include/asm-m68k/bitops.h	Thu Mar 13 14:15:31 2003
@@ -355,6 +355,16 @@
 }
 
 extern __inline__ int
+ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, vaddr);
+	spin_unlock(lock);
+	return ret;
+}
+
+extern __inline__ int
 ext2_clear_bit (int nr, volatile void *vaddr)
 {
 	char retval;
@@ -366,6 +376,16 @@
 }
 
 extern __inline__ int
+ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{       
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, vaddr);
+        spin_unlock(lock);
+        return ret;
+}       
+
+extern __inline__ int
 ext2_test_bit (int nr, const volatile void *vaddr)
 {
 	return ((1U << (nr & 7)) & (((const volatile unsigned char *) vaddr)[nr >> 3])) != 0;
diff -uNr linux/include/asm-m68knommu/bitops.h edited/include/asm-m68knommu/bitops.h
--- linux/include/asm-m68knommu/bitops.h	Mon Nov 11 06:28:04 2002
+++ edited/include/asm-m68knommu/bitops.h	Thu Mar 13 14:18:21 2003
@@ -387,6 +387,16 @@
 	return retval;
 }
 
+extern __inline__ int ext2_set_bit_atomic(spinlock_t *lock, int nr,
+		volatile void * addr)
+{
+       int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_clear_bit(int nr, volatile void * addr)
 {
 	int		mask, retval;
@@ -402,6 +412,16 @@
 	return retval;
 }
 
+extern __inline__ int ext2_clear_bit_atomic(spinlock_t *lock, int nr,
+                volatile void * addr)
+{
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
 extern __inline__ int ext2_test_bit(int nr, const volatile void * addr)
 {
 	int	mask;
diff -uNr linux/include/asm-mips/bitops.h edited/include/asm-mips/bitops.h
--- linux/include/asm-mips/bitops.h	Mon Nov 11 06:28:03 2002
+++ edited/include/asm-mips/bitops.h	Thu Mar 13 14:24:52 2003
@@ -810,6 +810,15 @@
 	return retval;
 }
 
+extern __inline__ int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_clear_bit(int nr, void * addr)
 {
 	int		mask, retval, flags;
@@ -824,6 +833,15 @@
 	return retval;
 }
 
+extern __inline__ int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{       
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
 extern __inline__ int ext2_test_bit(int nr, const void * addr)
 {
 	int			mask;
@@ -890,7 +908,9 @@
 
 /* Native ext2 byte ordering, just collapse using defines. */
 #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_bit((nr), (addr))
 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_bit((nr), (addr))
 #define ext2_test_bit(nr, addr) test_bit((nr), (addr))
 #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
 #define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-mips64/bitops.h edited/include/asm-mips64/bitops.h
--- linux/include/asm-mips64/bitops.h	Mon Nov 11 06:28:29 2002
+++ edited/include/asm-mips64/bitops.h	Thu Mar 13 14:27:26 2003
@@ -517,6 +517,16 @@
 }
 
 extern inline int
+ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+        int ret;
+        spin_lock(lock);
+        ret = ext2_set_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
+extern inline int
 ext2_clear_bit(int nr, void * addr)
 {
 	int		mask, retval, flags;
@@ -532,6 +542,16 @@
 }
 
 extern inline int
+ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
+extern inline int
 ext2_test_bit(int nr, const void * addr)
 {
 	int			mask;
@@ -599,7 +619,9 @@
 
 /* Native ext2 byte ordering, just collapse using defines. */
 #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
 #define ext2_test_bit(nr, addr) test_bit((nr), (addr))
 #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
 #define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-parisc/bitops.h edited/include/asm-parisc/bitops.h
--- linux/include/asm-parisc/bitops.h	Thu Feb 20 16:18:21 2003
+++ edited/include/asm-parisc/bitops.h	Thu Mar 13 14:29:47 2003
@@ -389,10 +389,14 @@
  */
 #ifdef __LP64__
 #define ext2_set_bit(nr, addr)		test_and_set_bit((nr) ^ 0x38, addr)
+#define ext2_set_bit_atomic(l,nr,addr)  test_and_set_bit((nr) ^ 0x38, addr)
 #define ext2_clear_bit(nr, addr)	test_and_clear_bit((nr) ^ 0x38, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr)
 #else
 #define ext2_set_bit(nr, addr)		test_and_set_bit((nr) ^ 0x18, addr)
+#define ext2_set_bit_atomic(l,nr,addr)  test_and_set_bit((nr) ^ 0x18, addr)
 #define ext2_clear_bit(nr, addr)	test_and_clear_bit((nr) ^ 0x18, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr)
 #endif
 
 #endif	/* __KERNEL__ */
diff -uNr linux/include/asm-ppc/bitops.h edited/include/asm-ppc/bitops.h
--- linux/include/asm-ppc/bitops.h	Mon Jan 20 05:23:05 2003
+++ edited/include/asm-ppc/bitops.h	Thu Mar 13 14:31:00 2003
@@ -392,7 +392,9 @@
 
 
 #define ext2_set_bit(nr, addr)	__test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock, nr, addr)  test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
 #define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
 
 static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
 {
diff -uNr linux/include/asm-ppc64/bitops.h edited/include/asm-ppc64/bitops.h
--- linux/include/asm-ppc64/bitops.h	Mon Nov 11 06:28:28 2002
+++ edited/include/asm-ppc64/bitops.h	Thu Mar 13 14:32:23 2003
@@ -336,8 +336,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_le_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr,addr) \
+	        test_and_set_le_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_le_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+	        test_and_clear_le_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_le_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_le_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-s390/bitops.h edited/include/asm-s390/bitops.h
--- linux/include/asm-s390/bitops.h	Mon Mar 10 14:52:09 2003
+++ edited/include/asm-s390/bitops.h	Thu Mar 13 14:33:55 2003
@@ -805,8 +805,12 @@
 
 #define ext2_set_bit(nr, addr)       \
 	test_and_set_bit((nr)^24, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr)       \
+	        test_and_set_bit((nr)^24, (unsigned long *)addr)
 #define ext2_clear_bit(nr, addr)     \
 	test_and_clear_bit((nr)^24, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr)     \
+	        test_and_clear_bit((nr)^24, (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^24, (unsigned long *)addr)
 
diff -uNr linux/include/asm-s390x/bitops.h edited/include/asm-s390x/bitops.h
--- linux/include/asm-s390x/bitops.h	Mon Mar 10 14:52:09 2003
+++ edited/include/asm-s390x/bitops.h	Thu Mar 13 14:35:22 2003
@@ -838,8 +838,12 @@
 
 #define ext2_set_bit(nr, addr)       \
 	test_and_set_bit((nr)^56, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr)       \
+	        test_and_set_bit((nr)^56, (unsigned long *)addr)
 #define ext2_clear_bit(nr, addr)     \
 	test_and_clear_bit((nr)^56, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr)     \
+	        test_and_clear_bit((nr)^56, (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^56, (unsigned long *)addr)
 
diff -uNr linux/include/asm-sh/bitops.h edited/include/asm-sh/bitops.h
--- linux/include/asm-sh/bitops.h	Mon Nov 11 06:28:02 2002
+++ edited/include/asm-sh/bitops.h	Thu Mar 13 14:37:18 2003
@@ -265,6 +265,16 @@
 	return retval;
 }
 
+static __inline__ int ext2_set_bit_atomic(spinlock_t *lock,
+		int nr, volatile void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 static __inline__ int ext2_clear_bit(int nr, volatile void * addr)
 {
 	int		mask, retval;
@@ -280,6 +290,16 @@
 	return retval;
 }
 
+static __inline__ int ext2_clear_bit_atomic(spinlock_t *lock,
+                int nr, volatile void * addr)
+{       
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}       
+
 static __inline__ int ext2_test_bit(int nr, const volatile void * addr)
 {
 	int			mask;
diff -uNr linux/include/asm-sparc/bitops.h edited/include/asm-sparc/bitops.h
--- linux/include/asm-sparc/bitops.h	Mon Jan 20 05:23:05 2003
+++ edited/include/asm-sparc/bitops.h	Thu Mar 13 14:38:54 2003
@@ -454,7 +454,9 @@
         find_next_zero_le_bit((addr), (size), 0)
 
 #define ext2_set_bit			__test_and_set_le_bit
+#define ext2_set_bit_atomic(l,n,a)      test_and_set_le_bit(n,a)
 #define ext2_clear_bit			__test_and_clear_le_bit
+#define ext2_clear_bit_atomic(l,n,a)    test_and_clear_le_bit(n,a)
 #define ext2_test_bit			test_le_bit
 #define ext2_find_first_zero_bit	find_first_zero_le_bit
 #define ext2_find_next_zero_bit		find_next_zero_le_bit
diff -uNr linux/include/asm-sparc64/bitops.h edited/include/asm-sparc64/bitops.h
--- linux/include/asm-sparc64/bitops.h	Mon Nov 11 06:28:05 2002
+++ edited/include/asm-sparc64/bitops.h	Thu Mar 13 14:43:49 2003
@@ -351,7 +351,9 @@
 #ifdef __KERNEL__
 
 #define ext2_set_bit(nr,addr)		test_and_set_le_bit((nr),(unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
 #define ext2_clear_bit(nr,addr)		test_and_clear_le_bit((nr),(unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
 #define ext2_test_bit(nr,addr)		test_le_bit((nr),(unsigned long *)(addr))
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_le_bit((unsigned long *)(addr), (size))
diff -uNr linux/include/asm-v850/bitops.h edited/include/asm-v850/bitops.h
--- linux/include/asm-v850/bitops.h	Mon Nov 11 06:28:02 2002
+++ edited/include/asm-v850/bitops.h	Thu Mar 13 14:44:48 2003
@@ -252,7 +252,9 @@
 #define hweight8(x) 			generic_hweight8 (x)
 
 #define ext2_set_bit			test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)      test_and_set_bit(n,a)
 #define ext2_clear_bit			test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a)    test_and_clear_bit(n,a)
 #define ext2_test_bit			test_bit
 #define ext2_find_first_zero_bit	find_first_zero_bit
 #define ext2_find_next_zero_bit		find_next_zero_bit
diff -uNr linux/include/asm-x86_64/bitops.h edited/include/asm-x86_64/bitops.h
--- linux/include/asm-x86_64/bitops.h	Mon Mar 10 14:52:09 2003
+++ edited/include/asm-x86_64/bitops.h	Thu Mar 13 14:45:56 2003
@@ -487,8 +487,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+	        test_and_set_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+	        test_and_clear_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h	Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h	Thu Mar 13 15:56:52 2003
@@ -16,6 +16,12 @@
 #ifndef _LINUX_EXT2_FS_SB
 #define _LINUX_EXT2_FS_SB
 
+struct ext2_bg_info {
+	u8 debts;
+	spinlock_t alloc_lock;
+	unsigned int reserved;
+};
+
 /*
  * second extended-fs super-block data in memory
  */
@@ -44,7 +50,7 @@
 	int s_first_ino;
 	u32 s_next_generation;
 	unsigned long s_dir_count;
-	u8 *s_debts;
+	struct ext2_bg_info *s_bgi;
 };
 
 #endif	/* _LINUX_EXT2_FS_SB */




* Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 17:39 ` [Ext2-devel] " Andreas Dilger
  2003-03-13 18:43   ` Alex Tomas
@ 2003-03-13 19:23   ` Theodore Ts'o
  2003-03-13 19:44     ` Andreas Dilger
  1 sibling, 1 reply; 39+ messages in thread
From: Theodore Ts'o @ 2003-03-13 19:23 UTC (permalink / raw)
  To: sct, Alex Tomas, linux-kernel, ext2-devel, Andrew Morton

On Thu, Mar 13, 2003 at 10:39:48AM -0700, Andreas Dilger wrote:
> Sadly, we are constantly diverging the ext2/ext3 codebases.  Lots of
> features are going into ext3, but lots of fixes/improvements are only
> going into ext2.  Is ext3 holding BKL for doing journal_start() still?
> 
> Looking at ext3_prepare_write() we grab the BKL for doing journal_start()
> and for journal_stop(), but I don't _think_ we need BKL for journal_stop()
> do we?  We may or may not need it for the journal_data case, but that is
> not even working right now I think.

We badly need to remove the BKL from ext3; it's the source of massive
performance problems for ext3 on larger machines.  

Stephen, you were telling me a week or two ago that there were some
subtle issues involved with BKL removal from the jbd layer --- could
you give us a quick summary of what landminds are there for whoever
wants to try to tackle the ext3/jbd BKL removal?

						- Ted


* Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 19:09     ` Matthew Wilcox
@ 2003-03-13 19:39       ` Andrew Morton
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-13 19:39 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: bzzz, adilger, linux-kernel, ext2-devel

Matthew Wilcox <willy@debian.org> wrote:
>
> On Thu, Mar 13, 2003 at 09:43:05PM +0300, Alex Tomas wrote:
> > 
> > fs/attr.c:
> >         if (ia_valid & ATTR_SIZE) {
> >                 if (attr->ia_size == inode->i_size) {
> >                         if (ia_valid == ATTR_SIZE)
> >                                 goto out;       /* we can skip lock_kernel() */
> >                 } else {
> >                         lock_kernel();
> >                         error = vmtruncate(inode, attr->ia_size);
> >                         unlock_kernel();
> >                         if (error)
> >                                 goto out;
> >                 }
> >         }
> > 
> > so, all (!) truncates are serialized
> 
> This looks like a bug.  It should be safe to delete them.

Probably.  I was running without them for months.  But this is the
ftruncate() path and not the unlink() path, so I kinda forgot about it.

Most truncations are unlinks, and they are not under lock_kernel.


* Re: [Ext2-devel] [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 19:23   ` Theodore Ts'o
@ 2003-03-13 19:44     ` Andreas Dilger
  0 siblings, 0 replies; 39+ messages in thread
From: Andreas Dilger @ 2003-03-13 19:44 UTC (permalink / raw)
  To: Theodore Ts'o, sct, Alex Tomas, linux-kernel, ext2-devel,
	Andrew Morton

On Mar 13, 2003  14:23 -0500, Theodore Ts'o wrote:
> On Thu, Mar 13, 2003 at 10:39:48AM -0700, Andreas Dilger wrote:
> > Sadly, we are constantly diverging the ext2/ext3 codebases.  Lots of
> > features are going into ext3, but lots of fixes/improvements are only
> > going into ext2.  Is ext3 holding BKL for doing journal_start() still?
> > 
> > Looking at ext3_prepare_write() we grab the BKL for doing journal_start()
> > and for journal_stop(), but I don't _think_ we need BKL for journal_stop()
> > do we?  We may or may not need it for the journal_data case, but that is
> > not even working right now I think.
> 
> We badly need to remove the BKL from ext3; it's the source of massive
> performance problems for ext3 on larger machines.  
> 
> Stephen, you were telling me a week or two ago that there were some
> subtle issues involved with BKL removal from the jbd layer --- could
> you give us a quick summary of what landminds are there for whoever
> wants to try to tackle the ext3/jbd BKL removal?

Ted, as a start, we can move the (un)lock_kernel() calls from the ext3
code into the journal_start() and journal_stop(), and then continue to
push it down into the places where we need it and/or replace it with a
better lock.  This not only makes the lock migration easier, but also
ensures that we always have the lock when we need it.
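
Something like this as the first step (a hypothetical sketch; no such wrapper
exists yet):

static inline handle_t *ext3_journal_start(journal_t *journal, int nblocks)
{
	handle_t *handle;

	lock_kernel();		/* BKL taken here instead of in every caller */
	handle = journal_start(journal, nblocks);
	unlock_kernel();
	return handle;
}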

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/



* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 19:17   ` Alex Tomas
@ 2003-03-13 22:25     ` Andrew Morton
  2003-03-13 23:03       ` Andreas Dilger
  2003-03-13 23:03       ` Alex Tomas
  2003-03-13 23:56     ` Andreas Dilger
  1 sibling, 2 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-13 22:25 UTC (permalink / raw)
  To: Alex Tomas; +Cc: bzzz, linux-kernel, ext2-devel

Alex Tomas <bzzz@tmi.comex.ru> wrote:
>
> 
> hi!
> 
> here is the new version of the patch.

This is great work.

a) The algorithm which you are using to distribute the root-reserved
   blocks across the blockgroups will end up leaving a small number of unused
   blocks in every blockgroup.  So large files which span multiple
   blockgroups will have little gaps in them.

   I think it's probably better to just lump all the root-reserved blocks
   into as few blockgroups as possible.

   Probably these should be the last blockgroups, because those are
   nearest the spindle, and hence the slowest.  This is by no means always
   the case - some disks are backwards, but it seems that most are not.  Plus
   nearness to the superblock is good.

b) struct ext2_bg_info needs a ____cacheline_aligned_in_smp stuck on it; see
   the sketch after this list.

c) It looks like EXT2FS_DEBUG broke.  Nobody uses that much, but we should
   fix and test it sometime.
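
For (b), that would just be (a sketch reusing the field names from the
patch):

struct ext2_bg_info {
	u8 debts;
	spinlock_t alloc_lock;
	unsigned int reserved;
} ____cacheline_aligned_in_smp;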

Be expecting some benchmark numbers.  Maybe those 32-ways will be able to run
as fast as my $300 2-way now ;)




* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 22:25     ` Andrew Morton
@ 2003-03-13 23:03       ` Andreas Dilger
  2003-03-13 23:10         ` Andrew Morton
  2003-03-13 23:03       ` Alex Tomas
  1 sibling, 1 reply; 39+ messages in thread
From: Andreas Dilger @ 2003-03-13 23:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Alex Tomas, linux-kernel, ext2-devel

On Mar 13, 2003  14:25 -0800, Andrew Morton wrote:
> This is great work.

Agreed.  This is something that has been talked about but not implemented
for a long time now.  Thanks for the efforts.

> a) The algorithm which you are using to distribute the root-reserved
>    blocks across the blockgroups will end up leaving a small number of unused
>    blocks in every blockgroup.  So large files which span multiple
>    blockgroups will have little gaps in them.
> 
>    I think it's probably better to just lump all the root-reserved blocks
>    into as few blockgroups as possible.

I might disagree here.  One of the reasons for having the reserved blocks
is to prevent fragmentation, and not necessarily to reserve space for root.
For the lots of small files cases it makes more sense to leave free space
in each group to prevent fragmentation at the group level.

For the large file case, there is less need to worry about fragmentation,
so we can just ignore the group's reserved percentage for "large" files.
A heuristic which says "if this file is huge, just keep allocating from this
group, and screw the reserved blocks" makes sense.

One such heuristic is if the file is, say, larger than 1/2 or 1/4 of the
entire group in size, it is allowed to continue allocating from the same
group.
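
In code, such a check might look something like this (the helper name
and the 1/4 threshold are only illustrative, not part of the posted
patch):

/* Sketch: a file that already spans a sizable chunk of a group is
 * allowed to keep allocating there, ignoring the reserved blocks.
 */
static int file_is_huge(struct inode *inode, struct super_block *sb)
{
	/* i_blocks counts 512-byte units; convert to fs blocks */
	unsigned long blocks = inode->i_blocks >> (sb->s_blocksize_bits - 9);

	return blocks > EXT2_BLOCKS_PER_GROUP(sb) / 4;
}

The allocator could then pass the result of such a test as a
use_reserve-style hint when reserving from the goal group.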

We could also say that for the purpose of allocating new files in a directory,
anything more than 95% full is "full" and the inode should be allocated in
a different group regardless of where the parent is.  It may be that the
Orlov allocator already has such a heuristic, but I think that is a different
discussion.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 22:25     ` Andrew Morton
  2003-03-13 23:03       ` Andreas Dilger
@ 2003-03-13 23:03       ` Alex Tomas
  2003-03-13 23:25         ` Andrew Morton
  1 sibling, 1 reply; 39+ messages in thread
From: Alex Tomas @ 2003-03-13 23:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Alex Tomas, linux-kernel, ext2-devel


hi!

>>>>> Andrew Morton (AM) writes:

 AM> a) The algorithm which you are using to distribute the
 AM> root-reserved blocks across the blockgroups will end up leaving a
 AM> small number of unused blocks in every blockgroup.  So large
 AM> files which span multiple blockgroups will have little gaps in
 AM> them.

 AM>    I think it's probably better to just lump all the
 AM> root-reserved blocks into as few blockgroups as possible.

 AM>    Probably these should be the last blockgroups, because those
 AM> are nearest the spindle, and hence the slowest.  This is by no
 AM> means always the case - some disks are backwards, but it seems
 AM> that most are not.  Plus nearness to the superblock is good.

done

 AM> b) struct ext2_bg_info needs a ____cacheline_aligned_in_smp stuck
 AM> on it.

done

 AM> c) It looks like EXT2FS_DEBUG broke.  Nobody uses that much, but
 AM> we should fix and test it sometime.

I suggest this be fixed in a separate patch. do you agree?

 AM> Be expecting some benchmark numbers.  Maybe those 32-ways will be
 AM> able to run as fast as my $300 2-way now ;)

me too ;)


btw, what about the minor bug in the ext2 allocation code I posted recently?
do you agree it needs to be fixed?




diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c	Thu Mar 13 21:20:16 2003
@@ -94,69 +94,62 @@
 	return bh;
 }
 
-static inline int reserve_blocks(struct super_block *sb, int count)
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi, 
+					struct ext2_group_desc *desc,
+					struct buffer_head *bh, int count, int use_reserve)
 {
-	struct ext2_sb_info * sbi = EXT2_SB(sb);
-	struct ext2_super_block * es = sbi->s_es;
-	unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-	unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+	unsigned free_blocks;
+	unsigned root_blocks;
 
+	spin_lock(&bgi->alloc_lock);
+	
+	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 	if (free_blocks < count)
 		count = free_blocks;
+	root_blocks = bgi->reserved;
 
-	if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
-	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		/*
-		 * We are too close to reserve and we are not privileged.
-		 * Can we allocate anything at all?
-		 */
-		if (free_blocks > root_blocks)
-			count = free_blocks - root_blocks;
-		else
-			return 0;
+	if (free_blocks < root_blocks && !use_reserve) {
+		/* don't use reserved blocks */
+		spin_unlock(&bgi->alloc_lock);
+		return 0;
 	}
-
-	es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
-	mark_buffer_dirty(sbi->s_sbh);
-	sb->s_dirt = 1;
-	return count;
-}
-
-static inline void release_blocks(struct super_block *sb, int count)
-{
-	if (count) {
-		struct ext2_sb_info * sbi = EXT2_SB(sb);
-		struct ext2_super_block * es = sbi->s_es;
-		unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-		es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
-		mark_buffer_dirty(sbi->s_sbh);
-		sb->s_dirt = 1;
+	
+        if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
+            sbi->s_resuid != current->fsuid &&
+            (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+                /*
+                 * We are too close to reserve and we are not privileged.
+                 * Can we allocate anything at all?
+                 */
+                if (free_blocks > root_blocks)
+                        count = free_blocks - root_blocks;
+                else {
+			spin_unlock(&bgi->alloc_lock);
+                        return 0;
+		}
 	}
-}
-
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
-				    struct buffer_head *bh, int count)
-{
-	unsigned free_blocks;
-
-	if (!desc->bg_free_blocks_count)
-		return 0;
-
-	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
-	if (free_blocks < count)
-		count = free_blocks;
 	desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+	
+	spin_unlock(&bgi->alloc_lock);
+
 	mark_buffer_dirty(bh);
 	return count;
 }
 
-static inline void group_release_blocks(struct ext2_group_desc *desc,
-				    struct buffer_head *bh, int count)
+static inline void group_release_blocks(struct ext2_bg_info *bgi,
+					struct ext2_group_desc *desc,
+					struct buffer_head *bh, int count)
 {
 	if (count) {
-		unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+		unsigned free_blocks;
+		
+		spin_lock(&bgi->alloc_lock);
+		
+		free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+		
+		spin_unlock(&bgi->alloc_lock);
+		
 		mark_buffer_dirty(bh);
 	}
 }
@@ -172,12 +165,11 @@
 	unsigned long i;
 	unsigned long overflow;
 	struct super_block * sb = inode->i_sb;
+	struct ext2_sb_info * sbi = EXT2_SB(sb);
 	struct ext2_group_desc * desc;
-	struct ext2_super_block * es;
+	struct ext2_super_block * es = sbi->s_es;
 	unsigned freed = 0, group_freed;
 
-	lock_super (sb);
-	es = EXT2_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > le32_to_cpu(es->s_blocks_count)) {
@@ -215,16 +207,17 @@
 	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
 	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
 	    in_range (block, le32_to_cpu(desc->bg_inode_table),
-		      EXT2_SB(sb)->s_itb_per_group) ||
+		      sbi->s_itb_per_group) ||
 	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
-		      EXT2_SB(sb)->s_itb_per_group))
+		      sbi->s_itb_per_group))
 		ext2_error (sb, "ext2_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %lu, count = %lu",
 			    block, count);
 
 	for (i = 0, group_freed = 0; i < count; i++) {
-		if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+		if (!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].alloc_lock,
+					bit + i, (void *) bitmap_bh->b_data))
 			ext2_error (sb, "ext2_free_blocks",
 				      "bit already cleared for block %lu",
 				      block + i);
@@ -236,7 +229,7 @@
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
-	group_release_blocks(desc, bh2, group_freed);
+	group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed);
 	freed += group_freed;
 
 	if (overflow) {
@@ -246,18 +239,18 @@
 	}
 error_return:
 	brelse(bitmap_bh);
-	release_blocks(sb, freed);
-	unlock_super (sb);
 	DQUOT_FREE_BLOCK(inode, freed);
 }
 
-static int grab_block(char *map, unsigned size, int goal)
+static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal)
 {
 	int k;
 	char *p, *r;
 
 	if (!ext2_test_bit(goal, map))
 		goto got_it;
+
+repeat:
 	if (goal) {
 		/*
 		 * The goal was occupied; search forward for a free 
@@ -297,7 +290,8 @@
 	}
 	return -1;
 got_it:
-	ext2_set_bit(goal, map);
+	if (ext2_set_bit_atomic(lock, goal, (void *) map)) 
+		goto repeat;	
 	return goal;
 }
 
@@ -319,7 +313,7 @@
 	int ret_block;			/* j */
 	int bit;		/* k */
 	int target_block;		/* tmp */
-	int block = 0;
+	int block = 0, use_reserve = 0;
 	struct super_block *sb = inode->i_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
@@ -341,14 +335,7 @@
 		prealloc_goal--;
 
 	dq_alloc = prealloc_goal + 1;
-
-	lock_super (sb);
-
-	es_alloc = reserve_blocks(sb, dq_alloc);
-	if (!es_alloc) {
-		*err = -ENOSPC;
-		goto out_unlock;
-	}
+	es_alloc = dq_alloc;
 
 	ext2_debug ("goal=%lu.\n", goal);
 
@@ -360,7 +347,8 @@
 	if (!desc)
 		goto io_error;
 
-	group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+	group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+					desc, gdp_bh, es_alloc, 0);
 	if (group_alloc) {
 		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 					group_size);
@@ -371,11 +359,12 @@
 		
 		ext2_debug("goal is at %d:%d.\n", group_no, ret_block);
 
-		ret_block = grab_block(bitmap_bh->b_data,
+		ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+				bitmap_bh->b_data,
 				group_size, ret_block);
 		if (ret_block >= 0)
 			goto got_block;
-		group_release_blocks(desc, gdp_bh, group_alloc);
+		group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
 		group_alloc = 0;
 	}
 
@@ -385,6 +374,7 @@
 	 * Now search the rest of the groups.  We assume that 
 	 * i and desc correctly point to the last group visited.
 	 */
+repeat:
 	for (bit = 0; !group_alloc &&
 			bit < sbi->s_groups_count; bit++) {
 		group_no++;
@@ -393,7 +383,16 @@
 		desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
 		if (!desc)
 			goto io_error;
-		group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+		group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+						desc, gdp_bh, es_alloc, use_reserve);
+	}
+	if (!use_reserve) {
+		/* first time we did not try to allocate
+		 * reserved blocks. now it looks like
+		 * no more non-reserved blocks left. we
+		 * will try to allocate reserved blocks -bzzz */
+		use_reserve = 1;
+		goto repeat;
 	}
 	if (bit >= sbi->s_groups_count) {
 		*err = -ENOSPC;
@@ -404,13 +403,11 @@
 	if (!bitmap_bh)
 		goto io_error;
 
-	ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
+	ret_block = grab_block(&sbi->s_bgi[group_no].alloc_lock,
+			bitmap_bh->b_data, group_size, 0);
 	if (ret_block < 0) {
-		ext2_error (sb, "ext2_new_block",
-			"Free blocks count corrupted for block group %d",
-				group_no);
 		group_alloc = 0;
-		goto io_error;
+		goto repeat;	
 	}
 
 got_block:
@@ -452,7 +449,8 @@
 		unsigned n;
 
 		for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
-			if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+			if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].alloc_lock,
+						ret_block, (void*) bitmap_bh->b_data))
  				break;
 		}
 		*prealloc_block = block + 1;
@@ -471,10 +469,7 @@
 
 	*err = 0;
 out_release:
-	group_release_blocks(desc, gdp_bh, group_alloc);
-	release_blocks(sb, es_alloc);
-out_unlock:
-	unlock_super (sb);
+	group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
 	DQUOT_FREE_BLOCK(inode, dq_alloc);
 out:
 	brelse(bitmap_bh);
@@ -487,11 +482,11 @@
 
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
-#ifdef EXT2FS_DEBUG
-	struct ext2_super_block * es;
-	unsigned long desc_count, bitmap_count, x;
 	struct ext2_group_desc * desc;
+	unsigned long desc_count = 0;
 	int i;
+#ifdef EXT2FS_DEBUG
+	unsigned long bitmap_count, x;
 	
 	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
@@ -519,7 +514,13 @@
 	unlock_super (sb);
 	return bitmap_count;
 #else
-	return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
+        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+                desc = ext2_get_group_desc (sb, i, NULL);
+                if (!desc)
+                        continue;
+                desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+	}
+	return desc_count;
 #endif
 }
 
diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c	Fri Mar 14 01:53:36 2003
+++ edited/fs/ext2/ialloc.c	Thu Mar 13 20:08:58 2003
@@ -278,7 +278,8 @@
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
 	int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
-	int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+	int free_blocks = ext2_count_free_blocks(sb);
+	int avefreeb = free_blocks / ngroups;
 	int blocks_per_dir;
 	int ndirs = sbi->s_dir_count;
 	int max_debt, max_dirs, min_blocks, min_inodes;
@@ -320,8 +321,7 @@
 		goto fallback;
 	}
 
-	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
-			  le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs;
 
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
 	min_inodes = avefreei - inodes_per_group / 4;
@@ -340,7 +340,7 @@
 		desc = ext2_get_group_desc (sb, group, &bh);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
-		if (sbi->s_debts[group] >= max_debt)
+		if (sbi->s_bgi[group].debts >= max_debt)
 			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
 			continue;
@@ -501,11 +501,11 @@
 		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
 
 	if (S_ISDIR(mode)) {
-		if (EXT2_SB(sb)->s_debts[group] < 255)
-			EXT2_SB(sb)->s_debts[group]++;
+		if (EXT2_SB(sb)->s_bgi[group].debts < 255)
+			EXT2_SB(sb)->s_bgi[group].debts++;
 	} else {
-		if (EXT2_SB(sb)->s_debts[group])
-			EXT2_SB(sb)->s_debts[group]--;
+		if (EXT2_SB(sb)->s_bgi[group].debts)
+			EXT2_SB(sb)->s_bgi[group].debts--;
 	}
 
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c	Fri Mar 14 01:46:35 2003
@@ -141,7 +141,7 @@
 		if (sbi->s_group_desc[i])
 			brelse (sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_debts);
+	kfree(sbi->s_bgi);
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -464,8 +464,11 @@
 	int i;
 	int desc_block = 0;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	struct ext2_super_block * es = sbi->s_es;
+	unsigned long block = le32_to_cpu(es->s_first_data_block);
 	struct ext2_group_desc * gdp = NULL;
+	unsigned int total_free = 0, free;
+	unsigned int reserved = le32_to_cpu(es->s_r_blocks_count);
 
 	ext2_debug ("Checking group descriptors");
 
@@ -504,6 +507,31 @@
 		block += EXT2_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
+	
+	/* restore free blocks counter in SB -bzzz */
+	total_free = ext2_count_free_blocks(sb);
+	if (le32_to_cpu(es->s_free_blocks_count) != total_free)
+		printk(KERN_INFO "EXT2-fs: last umount wasn't clean."
+			       "correct free blocks counter\n");
+	es->s_free_blocks_count = cpu_to_le32(total_free);
+
+	/* distribute reserved blocks over groups -bzzz */
+	for(i = sbi->s_groups_count-1; reserved && total_free && i >= 0; i--) {
+		gdp = ext2_get_group_desc (sb, i, NULL);
+		if (!gdp) {
+			ext2_error (sb, "ext2_check_descriptors",
+					"cant get descriptor for group %d", i);
+			return 0;
+		}
+		
+		free = le16_to_cpu(gdp->bg_free_blocks_count);
+		if (free > reserved)
+			free = reserved;
+		sbi->s_bgi[i].reserved = free;
+		reserved -= free;
+		total_free -= free;
+	}
+	
 	return 1;
 }
 
@@ -768,13 +796,17 @@
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+	sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info),
 			       GFP_KERNEL);
-	if (!sbi->s_debts) {
+	if (!sbi->s_bgi) {
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount_group_desc;
 	}
-	memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		sbi->s_bgi[i].debts = 0;
+		sbi->s_bgi[i].reserved = 0;
+		spin_lock_init(&sbi->s_bgi[i].alloc_lock);
+	}
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -820,8 +852,8 @@
 		brelse(sbi->s_group_desc[i]);
 failed_mount_group_desc:
 	kfree(sbi->s_group_desc);
-	if (sbi->s_debts)
-		kfree(sbi->s_debts);
+	if (sbi->s_bgi)
+		kfree(sbi->s_bgi);
 failed_mount:
 	brelse(bh);
 failed_sbi:
@@ -840,6 +872,7 @@
 
 static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 {
+	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -868,6 +901,7 @@
 			ext2_debug ("setting valid to 0\n");
 			es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
 						  ~EXT2_VALID_FS);
+			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 			es->s_mtime = cpu_to_le32(get_seconds());
 			ext2_sync_super(sb, es);
 		} else
@@ -929,7 +963,8 @@
 static int ext2_statfs (struct super_block * sb, struct statfs * buf)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long overhead;
+	unsigned long overhead, total_free = 0;
+	struct ext2_group_desc *desc;
 	int i;
 
 	if (test_opt (sb, MINIX_DF))
@@ -950,9 +985,14 @@
 		 * block group descriptors.  If the sparse superblocks
 		 * feature is turned on, then not all groups have this.
 		 */
-		for (i = 0; i < sbi->s_groups_count; i++)
+		for (i = 0; i < sbi->s_groups_count; i++) {
 			overhead += ext2_bg_has_super(sb, i) +
 				ext2_bg_num_gdb(sb, i);
+			
+			/* sum total free blocks -bzzz */
+			desc = ext2_get_group_desc (sb, i, NULL);
+			total_free += le16_to_cpu(desc->bg_free_blocks_count);
+		}
 
 		/*
 		 * Every block group has an inode bitmap, a block
@@ -965,7 +1005,7 @@
 	buf->f_type = EXT2_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
-	buf->f_bfree = ext2_count_free_blocks (sb);
+	buf->f_bfree = total_free;
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
 		buf->f_bavail = 0;
diff -uNr linux/include/asm-alpha/bitops.h edited/include/asm-alpha/bitops.h
--- linux/include/asm-alpha/bitops.h	Fri Mar 14 01:53:36 2003
+++ edited/include/asm-alpha/bitops.h	Thu Mar 13 14:10:18 2003
@@ -487,7 +487,9 @@
 
 
 #define ext2_set_bit                 __test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               __test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-arm/bitops.h edited/include/asm-arm/bitops.h
--- linux/include/asm-arm/bitops.h	Fri Mar 14 01:53:36 2003
+++ edited/include/asm-arm/bitops.h	Thu Mar 13 14:10:46 2003
@@ -357,8 +357,12 @@
  */
 #define ext2_set_bit(nr,p)			\
 		__test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_set_bit_atomic(lock,nr,p)          \
+                test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_clear_bit(nr,p)			\
 		__test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_clear_bit_atomic(lock,nr,p)        \
+                test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_test_bit(nr,p)			\
 		__test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_find_first_zero_bit(p,sz)		\
diff -uNr linux/include/asm-cris/bitops.h edited/include/asm-cris/bitops.h
--- linux/include/asm-cris/bitops.h	Mon Nov 11 06:28:30 2002
+++ edited/include/asm-cris/bitops.h	Thu Mar 13 14:11:15 2003
@@ -360,7 +360,9 @@
 #define hweight8(x) generic_hweight8(x)
 
 #define ext2_set_bit                 test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-i386/bitops.h edited/include/asm-i386/bitops.h
--- linux/include/asm-i386/bitops.h	Wed Dec 25 06:03:08 2002
+++ edited/include/asm-i386/bitops.h	Thu Mar 13 14:11:32 2003
@@ -479,8 +479,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+        test_and_set_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr, addr) \
+	        test_and_clear_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-ia64/bitops.h edited/include/asm-ia64/bitops.h
--- linux/include/asm-ia64/bitops.h	Thu Feb 20 16:18:21 2003
+++ edited/include/asm-ia64/bitops.h	Thu Mar 13 14:12:50 2003
@@ -453,7 +453,9 @@
 #define __clear_bit(nr, addr)        clear_bit(nr, addr)
 
 #define ext2_set_bit                 test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-m68k/bitops.h edited/include/asm-m68k/bitops.h
--- linux/include/asm-m68k/bitops.h	Mon Nov 11 06:28:33 2002
+++ edited/include/asm-m68k/bitops.h	Thu Mar 13 14:15:31 2003
@@ -355,6 +355,16 @@
 }
 
 extern __inline__ int
+ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, vaddr);
+	spin_unlock(lock);
+	return ret;
+}
+
+extern __inline__ int
 ext2_clear_bit (int nr, volatile void *vaddr)
 {
 	char retval;
@@ -366,6 +376,16 @@
 }
 
 extern __inline__ int
+ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{       
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, vaddr);
+        spin_unlock(lock);
+        return ret;
+}       
+
+extern __inline__ int
 ext2_test_bit (int nr, const volatile void *vaddr)
 {
 	return ((1U << (nr & 7)) & (((const volatile unsigned char *) vaddr)[nr >> 3])) != 0;
diff -uNr linux/include/asm-m68knommu/bitops.h edited/include/asm-m68knommu/bitops.h
--- linux/include/asm-m68knommu/bitops.h	Mon Nov 11 06:28:04 2002
+++ edited/include/asm-m68knommu/bitops.h	Thu Mar 13 14:18:21 2003
@@ -387,6 +387,16 @@
 	return retval;
 }
 
+extern __inline__ int ext2_set_bit_atomic(spinlock_t *lock, int nr,
+		volatile void * addr)
+{
+        int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_clear_bit(int nr, volatile void * addr)
 {
 	int		mask, retval;
@@ -402,6 +412,16 @@
 	return retval;
 }
 
+extern __inline__ int ext2_clear_bit_atomic(spinlock_t *lock, int nr,
+                volatile void * addr)
+{
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
 extern __inline__ int ext2_test_bit(int nr, const volatile void * addr)
 {
 	int	mask;
diff -uNr linux/include/asm-mips/bitops.h edited/include/asm-mips/bitops.h
--- linux/include/asm-mips/bitops.h	Mon Nov 11 06:28:03 2002
+++ edited/include/asm-mips/bitops.h	Thu Mar 13 14:24:52 2003
@@ -810,6 +810,15 @@
 	return retval;
 }
 
+extern __inline__ int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_clear_bit(int nr, void * addr)
 {
 	int		mask, retval, flags;
@@ -824,6 +833,15 @@
 	return retval;
 }
 
+extern __inline__ int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{       
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
 extern __inline__ int ext2_test_bit(int nr, const void * addr)
 {
 	int			mask;
@@ -890,7 +908,9 @@
 
 /* Native ext2 byte ordering, just collapse using defines. */
 #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
 #define ext2_test_bit(nr, addr) test_bit((nr), (addr))
 #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
 #define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-mips64/bitops.h edited/include/asm-mips64/bitops.h
--- linux/include/asm-mips64/bitops.h	Mon Nov 11 06:28:29 2002
+++ edited/include/asm-mips64/bitops.h	Thu Mar 13 14:27:26 2003
@@ -517,6 +517,16 @@
 }
 
 extern inline int
+ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+        int ret;
+        spin_lock(lock);
+        ret = ext2_set_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
+extern inline int
 ext2_clear_bit(int nr, void * addr)
 {
 	int		mask, retval, flags;
@@ -532,6 +542,16 @@
 }
 
 extern inline int
+ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}
+
+extern inline int
 ext2_test_bit(int nr, const void * addr)
 {
 	int			mask;
@@ -599,7 +619,9 @@
 
 /* Native ext2 byte ordering, just collapse using defines. */
 #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
 #define ext2_test_bit(nr, addr) test_bit((nr), (addr))
 #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
 #define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-parisc/bitops.h edited/include/asm-parisc/bitops.h
--- linux/include/asm-parisc/bitops.h	Thu Feb 20 16:18:21 2003
+++ edited/include/asm-parisc/bitops.h	Thu Mar 13 14:29:47 2003
@@ -389,10 +389,14 @@
  */
 #ifdef __LP64__
 #define ext2_set_bit(nr, addr)		test_and_set_bit((nr) ^ 0x38, addr)
+#define ext2_set_bit_atomic(l,nr,addr)  test_and_set_bit((nr) ^ 0x38, addr)
 #define ext2_clear_bit(nr, addr)	test_and_clear_bit((nr) ^ 0x38, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr)
 #else
 #define ext2_set_bit(nr, addr)		test_and_set_bit((nr) ^ 0x18, addr)
+#define ext2_set_bit_atomic(l,nr,addr)  test_and_set_bit((nr) ^ 0x18, addr)
 #define ext2_clear_bit(nr, addr)	test_and_clear_bit((nr) ^ 0x18, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr)
 #endif
 
 #endif	/* __KERNEL__ */
diff -uNr linux/include/asm-ppc/bitops.h edited/include/asm-ppc/bitops.h
--- linux/include/asm-ppc/bitops.h	Mon Jan 20 05:23:05 2003
+++ edited/include/asm-ppc/bitops.h	Thu Mar 13 14:31:00 2003
@@ -392,7 +392,9 @@
 
 
 #define ext2_set_bit(nr, addr)	__test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock, nr, addr)  test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
 #define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
 
 static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
 {
diff -uNr linux/include/asm-ppc64/bitops.h edited/include/asm-ppc64/bitops.h
--- linux/include/asm-ppc64/bitops.h	Mon Nov 11 06:28:28 2002
+++ edited/include/asm-ppc64/bitops.h	Thu Mar 13 14:32:23 2003
@@ -336,8 +336,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_le_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr,addr) \
+	        test_and_set_le_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_le_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+	        test_and_clear_le_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_le_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_le_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-s390/bitops.h edited/include/asm-s390/bitops.h
--- linux/include/asm-s390/bitops.h	Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390/bitops.h	Thu Mar 13 14:33:55 2003
@@ -805,8 +805,12 @@
 
 #define ext2_set_bit(nr, addr)       \
 	test_and_set_bit((nr)^24, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr)       \
+	        test_and_set_bit((nr)^24, (unsigned long *)addr)
 #define ext2_clear_bit(nr, addr)     \
 	test_and_clear_bit((nr)^24, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr)     \
+	        test_and_clear_bit((nr)^24, (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^24, (unsigned long *)addr)
 
diff -uNr linux/include/asm-s390x/bitops.h edited/include/asm-s390x/bitops.h
--- linux/include/asm-s390x/bitops.h	Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390x/bitops.h	Thu Mar 13 14:35:22 2003
@@ -838,8 +838,12 @@
 
 #define ext2_set_bit(nr, addr)       \
 	test_and_set_bit((nr)^56, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr)       \
+	        test_and_set_bit((nr)^56, (unsigned long *)addr)
 #define ext2_clear_bit(nr, addr)     \
 	test_and_clear_bit((nr)^56, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr)     \
+	        test_and_clear_bit((nr)^56, (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^56, (unsigned long *)addr)
 
diff -uNr linux/include/asm-sh/bitops.h edited/include/asm-sh/bitops.h
--- linux/include/asm-sh/bitops.h	Mon Nov 11 06:28:02 2002
+++ edited/include/asm-sh/bitops.h	Thu Mar 13 14:37:18 2003
@@ -265,6 +265,16 @@
 	return retval;
 }
 
+static __inline__ int ext2_set_bit_atomic(spinlock_t *lock,
+		int nr, volatile void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 static __inline__ int ext2_clear_bit(int nr, volatile void * addr)
 {
 	int		mask, retval;
@@ -280,6 +290,16 @@
 	return retval;
 }
 
+static __inline__ int ext2_clear_bit_atomic(spinlock_t *lock,
+                int nr, volatile void * addr)
+{       
+        int ret;
+        spin_lock(lock);
+        ret = ext2_clear_bit(nr, addr);
+        spin_unlock(lock);
+        return ret;
+}       
+
 static __inline__ int ext2_test_bit(int nr, const volatile void * addr)
 {
 	int			mask;
diff -uNr linux/include/asm-sparc/bitops.h edited/include/asm-sparc/bitops.h
--- linux/include/asm-sparc/bitops.h	Mon Jan 20 05:23:05 2003
+++ edited/include/asm-sparc/bitops.h	Thu Mar 13 14:38:54 2003
@@ -454,7 +454,9 @@
         find_next_zero_le_bit((addr), (size), 0)
 
 #define ext2_set_bit			__test_and_set_le_bit
+#define ext2_set_bit_atomic(l,n,a)      test_and_set_le_bit(n,a)
 #define ext2_clear_bit			__test_and_clear_le_bit
+#define ext2_clear_bit_atomic(l,n,a)    test_and_clear_le_bit(n,a)
 #define ext2_test_bit			test_le_bit
 #define ext2_find_first_zero_bit	find_first_zero_le_bit
 #define ext2_find_next_zero_bit		find_next_zero_le_bit
diff -uNr linux/include/asm-sparc64/bitops.h edited/include/asm-sparc64/bitops.h
--- linux/include/asm-sparc64/bitops.h	Mon Nov 11 06:28:05 2002
+++ edited/include/asm-sparc64/bitops.h	Thu Mar 13 14:43:49 2003
@@ -351,7 +351,9 @@
 #ifdef __KERNEL__
 
 #define ext2_set_bit(nr,addr)		test_and_set_le_bit((nr),(unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
 #define ext2_clear_bit(nr,addr)		test_and_clear_le_bit((nr),(unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
 #define ext2_test_bit(nr,addr)		test_le_bit((nr),(unsigned long *)(addr))
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_le_bit((unsigned long *)(addr), (size))
diff -uNr linux/include/asm-v850/bitops.h edited/include/asm-v850/bitops.h
--- linux/include/asm-v850/bitops.h	Mon Nov 11 06:28:02 2002
+++ edited/include/asm-v850/bitops.h	Thu Mar 13 14:44:48 2003
@@ -252,7 +252,9 @@
 #define hweight8(x) 			generic_hweight8 (x)
 
 #define ext2_set_bit			test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)      test_and_set_bit(n,a)
 #define ext2_clear_bit			test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a)    test_and_clear_bit(n,a)
 #define ext2_test_bit			test_bit
 #define ext2_find_first_zero_bit	find_first_zero_bit
 #define ext2_find_next_zero_bit		find_next_zero_bit
diff -uNr linux/include/asm-x86_64/bitops.h edited/include/asm-x86_64/bitops.h
--- linux/include/asm-x86_64/bitops.h	Fri Mar 14 01:53:27 2003
+++ edited/include/asm-x86_64/bitops.h	Thu Mar 13 14:45:56 2003
@@ -487,8 +487,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+	        test_and_set_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+	        test_and_clear_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h	Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h	Fri Mar 14 01:41:31 2003
@@ -16,6 +16,12 @@
 #ifndef _LINUX_EXT2_FS_SB
 #define _LINUX_EXT2_FS_SB
 
+struct ext2_bg_info {
+	u8 debts;
+	spinlock_t alloc_lock;
+	unsigned int reserved;
+} ____cacheline_aligned_in_smp;
+
 /*
  * second extended-fs super-block data in memory
  */
@@ -44,7 +50,7 @@
 	int s_first_ino;
 	u32 s_next_generation;
 	unsigned long s_dir_count;
-	u8 *s_debts;
+	struct ext2_bg_info *s_bgi;
 };
 
 #endif	/* _LINUX_EXT2_FS_SB */


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 23:03       ` Andreas Dilger
@ 2003-03-13 23:10         ` Andrew Morton
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-13 23:10 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: bzzz, linux-kernel, ext2-devel

Andreas Dilger <adilger@clusterfs.com> wrote:
>
> >    I think it's probably better to just lump all the root-reserved blocks
> >    into as few blockgroups as possible.
> 
> I might disagree here.  One of the reasons for having the reserved blocks
> is to prevent fragmentation, and not necessarily to reserve space for root.
> For the lots of small files cases it makes more sense to leave free space
> in each group to prevent fragmentation at the group level.

Alex's approach effectively makes every blockgroup a little bit smaller.  I
don't expect it will improve fragmentation effects.  Not sure...

> ...
> We could also say that for the purpose of allocating new files in a directory,
> anything more than 95% full is "full" and the inode should be allocated in
> a different group regardless of where the parent is.  It may be that the
> Orlov allocator already has such a heuristic, but I think that is a different
> discussion.

Yes, both find_group_other() and find_group_orlov() do things like that.

But only in 2.5, or in 2.4 with Ted's backport patches.  find_group_other()
in 2.4 forgets to look at the free block count, which is rather sad.
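
The missing test is roughly this (a sketch, not a literal 2.4 patch;
the helper name is made up):

/* A candidate group in find_group_other() should pass both checks;
 * the free-blocks one is the part 2.4 forgets.
 */
static int group_usable(struct ext2_group_desc *desc)
{
	return desc && le16_to_cpu(desc->bg_free_inodes_count) &&
	       le16_to_cpu(desc->bg_free_blocks_count);
}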


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 23:03       ` Alex Tomas
@ 2003-03-13 23:25         ` Andrew Morton
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-13 23:25 UTC (permalink / raw)
  To: Alex Tomas; +Cc: bzzz, linux-kernel, ext2-devel

Alex Tomas <bzzz@tmi.comex.ru> wrote:
>
> 
> done
> done

Thanks!

> 
>  AM> c) It looks like EXT2FS_DEBUG broke.  Nobody uses that much, but
>  AM> we should fix and test it sometime.
> 
> I suggest this be fixed in a separate patch. do you agree?

Yes, that's fine.

> ...
> btw, what about minor bug in ext2 allocation code I posted recently?
> do you agree it needs to be fixed?

That's still in my inbox.  I do not silently drop stuff, but am sometimes
laggy.



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 19:17   ` Alex Tomas
  2003-03-13 22:25     ` Andrew Morton
@ 2003-03-13 23:56     ` Andreas Dilger
  2003-03-14  7:20       ` Alex Tomas
                         ` (2 more replies)
  1 sibling, 3 replies; 39+ messages in thread
From: Andreas Dilger @ 2003-03-13 23:56 UTC (permalink / raw)
  To: Alex Tomas; +Cc: Andrew Morton, linux-kernel, ext2-devel

First of all, thanks for this work, Alex.  It's been a long time in coming.

One thing I would wonder about is whether we should be implementing this in
ext2, or in ext3 only.  One of the decisions we made in the past is that we
shouldn't necessarily implement everything in ext2 (especially features that
complicate the code and are only useful on high-end systems).

There was a desire to keep ext2 small and simple, and ext3 would get the
fancy high-end features that make sense if you have a large filesystem
that you would likely be using in conjunction with ext3 anyways.

It does make sense to test this out on ext2 since it is definitely easier
to code for ext2 than ext3, and the journaling doesn't skew the performance
so much.  Of course one of the reasons that ext2 is easier to code for is
exactly _because_ we don't put all of the features into ext2...

Comments on the code inline below...

On Mar 13, 2003  22:17 +0300, Alex Tomas wrote:
> -static inline int reserve_blocks(struct super_block *sb, int count)
> +static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi, 
> +					struct ext2_group_desc *desc,
> +					struct buffer_head *bh, int count, int use_reserve)

I would suggest just hooking the ext2_group_desc (and the buffer_head in
which it lives) off of the ext2_bg_info array instead of passing both
around explicitly.  Since we have ext2_bg_info as a group_number-indexed
array already, this would essentially mean that wherever we call
ext2_get_group_desc() we could just use sbi->bgi[group].desc (or make
ext2_get_group_desc() do that, if we don't need it to populate bgi[group].desc
in the first place).
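
Roughly like this (the desc/desc_bh member names are just for
illustration, they are not in the posted patch):

struct ext2_bg_info {
	u8 debts;
	spinlock_t alloc_lock;
	unsigned int reserved;
	struct ext2_group_desc *desc;	/* cached group descriptor */
	struct buffer_head *desc_bh;	/* buffer the descriptor lives in */
} ____cacheline_aligned_in_smp;

Then sbi->s_bgi[group].desc replaces most of the desc/bh pairs that are
currently passed around explicitly.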

> +	root_blocks = bgi->reserved;

I would avoid calling this "root_blocks" and instead just use "bgi->reserved"
or "reserved_blocks" everywhere.  The original intent of these blocks was to
reduce fragmentation and not necessarily reserved-for-root.

> +        if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
> +            sbi->s_resuid != current->fsuid &&
> +            (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
> +                /*
> +                 * We are too close to reserve and we are not privileged.
> +                 * Can we allocate anything at all?
> +                 */
> +                if (free_blocks > root_blocks)
> +                        count = free_blocks - root_blocks;
> +                else {
> +			spin_unlock(&bgi->alloc_lock);
> +                        return 0;
> +		}

Per my other email, if we want to handle large files properly by allowing them
to fill the entire group, yet we want to keep the "reserved blocks" count
correct, we could always grab the lock on the last group and add reserved
blocks there.  Or, we could just ignore the reserved blocks count entirely.

>  unsigned long ext2_count_free_blocks (struct super_block * sb)
>       :
> -	return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
> +        for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
> +                desc = ext2_get_group_desc (sb, i, NULL);
> +                if (!desc)
> +                        continue;
> +                desc_count += le16_to_cpu(desc->bg_free_blocks_count);
> +	}
> +	return desc_count;
>  #endif

In general, this should be safe to do without a lock, since it is only
used for heuristics (orlov) and statfs (which is out-of-date as soon as
we call it).  Are there any other users of ext2_count_free_blocks() that
need a correct value?  I suppose mount/unmount to set s_free_blocks_count,
but those probably have exclusive access to the filesystem anyways.

PS - it looks like you are using spaces for indents instead of tabs here...

> +	if (le32_to_cpu(es->s_free_blocks_count) != total_free)
> +		printk(KERN_INFO "EXT2-fs: last umount wasn't clean. correct free blocks counter\n");

Probably no need to print this for ext2, since there is already an "uncleanly
unmounted" flag in the superblock, and e2fsck will have otherwise fixed it
up.

> +	/* distribute reserved blocks over groups -bzzz */
> +	while (reserved && total_free) {
> +		unsigned int per_group = reserved / sbi->s_groups_count + 1;
> +		unsigned int free;
> +	
> +		for (i = 0; reserved && i < sbi->s_groups_count; i++) {
> +			gdp = ext2_get_group_desc (sb, i, NULL);
> +			if (!gdp) {
> +				ext2_error (sb, "ext2_check_descriptors",
> +						"can't get descriptor for group #%d", i);
> +				return 0;
> +			}
> +			
> +			free = le16_to_cpu(gdp->bg_free_blocks_count);
> +			if (per_group > free)
> +				per_group = free;

I'm not sure whether I agree with this or not...  If a group ever exceeds
the reserved mark for some reason (e.g. full filesystem) it will never be
able to "improve itself" back to a decent amount of reserved blocks.  That
said, you may want to only reduce "reserved" by "free" in the end, so that
the total amount of reserved blocks is kept constant (which would also mean
re-calculating "per_group" on each pass of the loop).

>  extern __inline__ int
> +ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)

Please don't use "extern __inline__", as that can cause all sorts of
grief.  Either "static inline" or just "extern".
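
e.g. the m68k version quoted above would then read:

static inline int
ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
{
	int ret;

	spin_lock(lock);
	ret = ext2_set_bit(nr, vaddr);
	spin_unlock(lock);
	return ret;
}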

> +struct ext2_bg_info {
> +	u8 debts;
> +	spinlock_t alloc_lock;
> +	unsigned int reserved;
> +};

Please rename this "balloc_lock", as it is likely that we will get an
"ialloc_lock" in the future also.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 23:56     ` Andreas Dilger
@ 2003-03-14  7:20       ` Alex Tomas
  2003-03-14 20:59         ` Andreas Dilger
  2003-03-15  4:37         ` William Lee Irwin III
  2003-03-14 18:25       ` Martin J. Bligh
  2003-03-14 19:30       ` [Ext2-devel] " Daniel Phillips
  2 siblings, 2 replies; 39+ messages in thread
From: Alex Tomas @ 2003-03-14  7:20 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Alex Tomas, Andrew Morton, linux-kernel, ext2-devel


hi!

>>>>> Andreas Dilger (AD) writes:

 AD> First of all, thanks for this work, Alex.  It's been a long time
 AD> in coming.  One thing I would wonder about is whether we should
 AD> be implementing this in ext2, or in ext3 only.  One of the
 AD> decisions we made in the past is that we shouldn't necessarily
 AD> implement everything in ext2 (especially features that
 AD> complicated the code, and are only useful on high-end systems).

well. ext2 in 2.4 has a lot of BKL. ext2 in 2.5 is almost free of the
BKL. I think concurrent balloc is just one more step in that direction.

 AD> I would suggest just hooking the ext2_group_desc (and the
 AD> buffer_head in which it lives) off of the ext2_bg_info array
 AD> instead of passing both around explicitly.  Since we have
 AD> ext2_bg_info as a group_number-indexed array already, this would
 AD> essentially mean that wherever we call ext2_get_group_desc() we
 AD> could just use sbi->bgi[group].desc (or make
 AD> ext2_get_group_desc() do that, if we don't need it to populate
 AD> bgi[group].desc in the first place).

it makes sense. what about doing it as a separate patch?
just to keep concurrent-balloc.diff from growing huge


 >> + root_blocks = bgi->reserved;

 AD> I would avoid calling this "root_blocks" and instead just use
 AD> "bgi->reserved" or "reserved_blocks" everywhere.  The original
 AD> intent of these blocks was to reduce fragmentation and not
 AD> necessarily reserved-for-root.

fixed

 >> +	if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
 >> +	    sbi->s_resuid != current->fsuid &&
 >> +	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
 >> +		/*
 >> +		 * We are too close to reserve and we are not privileged.
 >> +		 * Can we allocate anything at all?
 >> +		 */
 >> +		if (free_blocks > root_blocks)
 >> +			count = free_blocks - root_blocks;
 >> +		else {
 >> +			spin_unlock(&bgi->alloc_lock);
 >> +			return 0;
 >> +		}

 AD> Per my other email, if we want to handle large files properly by
 AD> allowing them to fill the entire group, yet we want to keep the
 AD> "reserved blocks" count correct, we could always grab the lock on
 AD> the last group and add reserved blocks there.  Or, we could just
 AD> ignore the reserved blocks count entirely.

hmm. it looks like I'm missing something here. the reservation is
protected by the lock. what's the problem?

 >> unsigned long ext2_count_free_blocks (struct super_block * sb)
 >>      :
 >> -	return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
 >> +	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
 >> +		desc = ext2_get_group_desc (sb, i, NULL);
 >> +		if (!desc)
 >> +			continue;
 >> +		desc_count += le16_to_cpu(desc->bg_free_blocks_count);
 >> +	}
 >> +	return desc_count;
 >>  #endif

this code doesn't take a lock. if you mean the code under EXT2FS_DEBUG,
then Andrew already pointed out that it is broken and should be fixed.

 AD> In general, this should be safe to do without a lock, since it is
 AD> only used for heuristics (orlov) and statfs (which is out-of-date
 AD> as soon as we call it).  Are there any other users of
 AD> ext2_count_free_blocks() that need a correct value?  I suppose
 AD> mount/unmount to set s_free_blocks_count, but those probably have
 AD> exclusive access to the filesystem anyways.

there are no remaining users of ext2_count_free_blocks() that need a
precise counter. find_group_orlov() uses it, but I don't think that
routine needs the loop to be serialized against block reservation.

 AD> PS - it looks like you are using spaces for indents instead of
 AD> tabs here...

I just use vim ;)

 >> +	if (le32_to_cpu(es->s_free_blocks_count) != total_free)
 >> +		printk(KERN_INFO "EXT2-fs: last umount wasn't clean. correct free blocks counter\n");

 AD> Probably no need to print this for ext2, since there is already
 AD> an "uncleanly unmounted" flag in the superblock, and e2fsck will
 AD> have otherwise fixed it up.

fixed. in fact, it was 'debug for myself'.

 AD> I'm not sure whether I agree with this or not...  If a group ever
 AD> exceeds the reserved mark for some reason (e.g. full filesystem)
 AD> it will never be able to "improve itself" back to a decent amount
 AD> of reserved blocks.  That said, you may want to only reduce
 AD> "reserved" by "free" in the end, so that the total amount of
 AD> reserved blocks is kept constant (which would also mean
 AD> re-calculating "per_group" on each pass of the loop).

well, I believe the reserved blocks really may be _reserved_ at the end of
the fs, simply because nobody should use them until the fs is almost full.

 >>  extern __inline__ int
 >> +ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)

 AD> Please don't use "extern __inline__", as that can cause all sorts
 AD> of grief.  Either "static inline" or just "extern".

fixed

 >> +struct ext2_bg_info {
 >> +	u8 debts;
 >> +	spinlock_t alloc_lock;
 >> +	unsigned int reserved;
 >> +};

 AD> Please rename this "balloc_lock", as it is likely that we will
 AD> get an "ialloc_lock" in the future also.

this makes sense as well.


and here is the corrected patch:



diff -uNr linux/fs/ext2/balloc.c edited/fs/ext2/balloc.c
--- linux/fs/ext2/balloc.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/balloc.c	Fri Mar 14 09:54:11 2003
@@ -94,69 +94,62 @@
 	return bh;
 }
 
-static inline int reserve_blocks(struct super_block *sb, int count)
+static inline int group_reserve_blocks(struct ext2_sb_info *sbi, struct ext2_bg_info *bgi, 
+					struct ext2_group_desc *desc,
+					struct buffer_head *bh, int count, int use_reserve)
 {
-	struct ext2_sb_info * sbi = EXT2_SB(sb);
-	struct ext2_super_block * es = sbi->s_es;
-	unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-	unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
+	unsigned free_blocks;
+	unsigned root_blocks;
 
+	spin_lock(&bgi->balloc_lock);
+	
+	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 	if (free_blocks < count)
 		count = free_blocks;
+	root_blocks = bgi->reserved;
 
-	if (free_blocks < root_blocks + count && !capable(CAP_SYS_RESOURCE) &&
-	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		/*
-		 * We are too close to reserve and we are not privileged.
-		 * Can we allocate anything at all?
-		 */
-		if (free_blocks > root_blocks)
-			count = free_blocks - root_blocks;
-		else
-			return 0;
+	if (free_blocks <  bgi->reserved && !use_reserve) {
+		/* don't use reserved blocks */
+		spin_unlock(&bgi->balloc_lock);
+		return 0;
 	}
-
-	es->s_free_blocks_count = cpu_to_le32(free_blocks - count);
-	mark_buffer_dirty(sbi->s_sbh);
-	sb->s_dirt = 1;
-	return count;
-}
-
-static inline void release_blocks(struct super_block *sb, int count)
-{
-	if (count) {
-		struct ext2_sb_info * sbi = EXT2_SB(sb);
-		struct ext2_super_block * es = sbi->s_es;
-		unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
-		es->s_free_blocks_count = cpu_to_le32(free_blocks + count);
-		mark_buffer_dirty(sbi->s_sbh);
-		sb->s_dirt = 1;
+	
+        if (free_blocks <  bgi->reserved + count && !capable(CAP_SYS_RESOURCE) &&
+            sbi->s_resuid != current->fsuid &&
+            (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+                /*
+                 * We are too close to reserve and we are not privileged.
+                 * Can we allocate anything at all?
+                 */
+                if (free_blocks > bgi->reserved)
+                        count = free_blocks - bgi->reserved;
+                else {
+			spin_unlock(&bgi->balloc_lock);
+                        return 0;
+		}
 	}
-}
-
-static inline int group_reserve_blocks(struct ext2_group_desc *desc,
-				    struct buffer_head *bh, int count)
-{
-	unsigned free_blocks;
-
-	if (!desc->bg_free_blocks_count)
-		return 0;
-
-	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
-	if (free_blocks < count)
-		count = free_blocks;
 	desc->bg_free_blocks_count = cpu_to_le16(free_blocks - count);
+	
+	spin_unlock(&bgi->balloc_lock);
+
 	mark_buffer_dirty(bh);
 	return count;
 }
 
-static inline void group_release_blocks(struct ext2_group_desc *desc,
-				    struct buffer_head *bh, int count)
+static inline void group_release_blocks(struct ext2_bg_info *bgi,
+					struct ext2_group_desc *desc,
+					struct buffer_head *bh, int count)
 {
 	if (count) {
-		unsigned free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
+		unsigned free_blocks;
+		
+		spin_lock(&bgi->balloc_lock);
+		
+		free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
+		
+		spin_unlock(&bgi->balloc_lock);
+		
 		mark_buffer_dirty(bh);
 	}
 }
@@ -172,12 +165,11 @@
 	unsigned long i;
 	unsigned long overflow;
 	struct super_block * sb = inode->i_sb;
+	struct ext2_sb_info * sbi = EXT2_SB(sb);
 	struct ext2_group_desc * desc;
-	struct ext2_super_block * es;
+	struct ext2_super_block * es = sbi->s_es;
 	unsigned freed = 0, group_freed;
 
-	lock_super (sb);
-	es = EXT2_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) ||
 	    block + count < block ||
 	    block + count > le32_to_cpu(es->s_blocks_count)) {
@@ -215,16 +207,17 @@
 	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
 	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
 	    in_range (block, le32_to_cpu(desc->bg_inode_table),
-		      EXT2_SB(sb)->s_itb_per_group) ||
+		      sbi->s_itb_per_group) ||
 	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
-		      EXT2_SB(sb)->s_itb_per_group))
+		      sbi->s_itb_per_group))
 		ext2_error (sb, "ext2_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %lu, count = %lu",
 			    block, count);
 
 	for (i = 0, group_freed = 0; i < count; i++) {
-		if (!ext2_clear_bit(bit + i, bitmap_bh->b_data))
+		if (!ext2_clear_bit_atomic(&sbi->s_bgi[block_group].balloc_lock,
+					bit + i, (void *) bitmap_bh->b_data))
 			ext2_error (sb, "ext2_free_blocks",
 				      "bit already cleared for block %lu",
 				      block + i);
@@ -236,7 +229,7 @@
 	if (sb->s_flags & MS_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
-	group_release_blocks(desc, bh2, group_freed);
+	group_release_blocks(&sbi->s_bgi[block_group], desc, bh2, group_freed);
 	freed += group_freed;
 
 	if (overflow) {
@@ -246,18 +239,18 @@
 	}
 error_return:
 	brelse(bitmap_bh);
-	release_blocks(sb, freed);
-	unlock_super (sb);
 	DQUOT_FREE_BLOCK(inode, freed);
 }
 
-static int grab_block(char *map, unsigned size, int goal)
+static int grab_block(spinlock_t *lock, char *map, unsigned size, int goal)
 {
 	int k;
 	char *p, *r;
 
 	if (!ext2_test_bit(goal, map))
 		goto got_it;
+
+repeat:
 	if (goal) {
 		/*
 		 * The goal was occupied; search forward for a free 
@@ -297,7 +290,8 @@
 	}
 	return -1;
 got_it:
-	ext2_set_bit(goal, map);
+	if (ext2_set_bit_atomic(lock, goal, (void *) map)) 
+		goto repeat;	
 	return goal;
 }
 
@@ -319,7 +313,7 @@
 	int ret_block;			/* j */
 	int bit;		/* k */
 	int target_block;		/* tmp */
-	int block = 0;
+	int block = 0, use_reserve = 0;
 	struct super_block *sb = inode->i_sb;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = sbi->s_es;
@@ -341,14 +335,7 @@
 		prealloc_goal--;
 
 	dq_alloc = prealloc_goal + 1;
-
-	lock_super (sb);
-
-	es_alloc = reserve_blocks(sb, dq_alloc);
-	if (!es_alloc) {
-		*err = -ENOSPC;
-		goto out_unlock;
-	}
+	es_alloc = dq_alloc;
 
 	ext2_debug ("goal=%lu.\n", goal);
 
@@ -360,7 +347,8 @@
 	if (!desc)
 		goto io_error;
 
-	group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+	group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+					desc, gdp_bh, es_alloc, 0);
 	if (group_alloc) {
 		ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
 					group_size);
@@ -371,11 +359,12 @@
 		
 		ext2_debug("goal is at %d:%d.\n", group_no, ret_block);
 
-		ret_block = grab_block(bitmap_bh->b_data,
+		ret_block = grab_block(&sbi->s_bgi[group_no].balloc_lock,
+				bitmap_bh->b_data,
 				group_size, ret_block);
 		if (ret_block >= 0)
 			goto got_block;
-		group_release_blocks(desc, gdp_bh, group_alloc);
+		group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
 		group_alloc = 0;
 	}
 
@@ -385,6 +374,7 @@
 	 * Now search the rest of the groups.  We assume that 
 	 * i and desc correctly point to the last group visited.
 	 */
+repeat:
 	for (bit = 0; !group_alloc &&
 			bit < sbi->s_groups_count; bit++) {
 		group_no++;
@@ -393,7 +383,16 @@
 		desc = ext2_get_group_desc(sb, group_no, &gdp_bh);
 		if (!desc)
 			goto io_error;
-		group_alloc = group_reserve_blocks(desc, gdp_bh, es_alloc);
+		group_alloc = group_reserve_blocks(sbi, &sbi->s_bgi[group_no],
+						desc, gdp_bh, es_alloc, use_reserve);
+	}
+	if (!use_reserve) {
+		/* first time we did not try to allocate
+		 * reserved blocks. now it looks like
+		 * no more non-reserved blocks left. we
+		 * will try to allocate reserved blocks -bzzz */
+		use_reserve = 1;
+		goto repeat;
 	}
 	if (bit >= sbi->s_groups_count) {
 		*err = -ENOSPC;
@@ -404,13 +403,11 @@
 	if (!bitmap_bh)
 		goto io_error;
 
-	ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
+	ret_block = grab_block(&sbi->s_bgi[group_no].balloc_lock,
+			bitmap_bh->b_data, group_size, 0);
 	if (ret_block < 0) {
-		ext2_error (sb, "ext2_new_block",
-			"Free blocks count corrupted for block group %d",
-				group_no);
 		group_alloc = 0;
-		goto io_error;
+		goto repeat;	
 	}
 
 got_block:
@@ -452,7 +449,8 @@
 		unsigned n;
 
 		for (n = 0; n < group_alloc && ++ret_block < group_size; n++) {
-			if (ext2_set_bit(ret_block, bitmap_bh->b_data))
+			if (ext2_set_bit_atomic(&sbi->s_bgi[group_no].balloc_lock,
+						ret_block, (void*) bitmap_bh->b_data))
  				break;
 		}
 		*prealloc_block = block + 1;
@@ -471,10 +469,7 @@
 
 	*err = 0;
 out_release:
-	group_release_blocks(desc, gdp_bh, group_alloc);
-	release_blocks(sb, es_alloc);
-out_unlock:
-	unlock_super (sb);
+	group_release_blocks(&sbi->s_bgi[group_no], desc, gdp_bh, group_alloc);
 	DQUOT_FREE_BLOCK(inode, dq_alloc);
 out:
 	brelse(bitmap_bh);
@@ -487,11 +482,12 @@
 
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
-#ifdef EXT2FS_DEBUG
-	struct ext2_super_block * es;
-	unsigned long desc_count, bitmap_count, x;
 	struct ext2_group_desc * desc;
+	unsigned long desc_count = 0;
 	int i;
+#ifdef EXT2FS_DEBUG
+	struct ext2_super_block * es;
+	unsigned long bitmap_count, x;
 	
 	lock_super (sb);
 	es = EXT2_SB(sb)->s_es;
@@ -519,7 +515,13 @@
 	unlock_super (sb);
 	return bitmap_count;
 #else
-	return le32_to_cpu(EXT2_SB(sb)->s_es->s_free_blocks_count);
+	for (i = 0; i < EXT2_SB(sb)->s_groups_count; i++) {
+		desc = ext2_get_group_desc (sb, i, NULL);
+		if (!desc)
+			continue;
+		desc_count += le16_to_cpu(desc->bg_free_blocks_count);
+	}
+	return desc_count;
 #endif
 }
 
diff -uNr linux/fs/ext2/ialloc.c edited/fs/ext2/ialloc.c
--- linux/fs/ext2/ialloc.c	Fri Mar 14 01:53:36 2003
+++ edited/fs/ext2/ialloc.c	Thu Mar 13 20:08:58 2003
@@ -278,7 +278,8 @@
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT2_INODES_PER_GROUP(sb);
 	int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups;
-	int avefreeb = le32_to_cpu(es->s_free_blocks_count) / ngroups;
+	int free_blocks = ext2_count_free_blocks(sb);
+	int avefreeb = free_blocks / ngroups;
 	int blocks_per_dir;
 	int ndirs = sbi->s_dir_count;
 	int max_debt, max_dirs, min_blocks, min_inodes;
@@ -320,8 +321,7 @@
 		goto fallback;
 	}
 
-	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
-			  le32_to_cpu(es->s_free_blocks_count)) / ndirs;
+	blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - free_blocks) / ndirs;
 
 	max_dirs = ndirs / ngroups + inodes_per_group / 16;
 	min_inodes = avefreei - inodes_per_group / 4;
@@ -340,7 +340,7 @@
 		desc = ext2_get_group_desc (sb, group, &bh);
 		if (!desc || !desc->bg_free_inodes_count)
 			continue;
-		if (sbi->s_debts[group] >= max_debt)
+		if (sbi->s_bgi[group].debts >= max_debt)
 			continue;
 		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
 			continue;
@@ -501,11 +501,11 @@
 		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
 
 	if (S_ISDIR(mode)) {
-		if (EXT2_SB(sb)->s_debts[group] < 255)
-			EXT2_SB(sb)->s_debts[group]++;
+		if (EXT2_SB(sb)->s_bgi[group].debts < 255)
+			EXT2_SB(sb)->s_bgi[group].debts++;
 	} else {
-		if (EXT2_SB(sb)->s_debts[group])
-			EXT2_SB(sb)->s_debts[group]--;
+		if (EXT2_SB(sb)->s_bgi[group].debts)
+			EXT2_SB(sb)->s_bgi[group].debts--;
 	}
 
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
diff -uNr linux/fs/ext2/super.c edited/fs/ext2/super.c
--- linux/fs/ext2/super.c	Thu Feb 20 16:18:53 2003
+++ edited/fs/ext2/super.c	Fri Mar 14 10:10:09 2003
@@ -141,7 +141,7 @@
 		if (sbi->s_group_desc[i])
 			brelse (sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_debts);
+	kfree(sbi->s_bgi);
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi);
@@ -464,8 +464,11 @@
 	int i;
 	int desc_block = 0;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	struct ext2_super_block * es = sbi->s_es;
+	unsigned long block = le32_to_cpu(es->s_first_data_block);
 	struct ext2_group_desc * gdp = NULL;
+	unsigned int total_free = 0, free;
+	unsigned int reserved = le32_to_cpu(es->s_r_blocks_count);
 
 	ext2_debug ("Checking group descriptors");
 
@@ -504,6 +507,27 @@
 		block += EXT2_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
+	
+	/* restore free blocks counter in SB -bzzz */
+	es->s_free_blocks_count = cpu_to_le32(total_free = ext2_count_free_blocks(sb));
+
+	/* distribute reserved blocks over groups -bzzz */
+	for(i = sbi->s_groups_count-1; reserved && total_free && i >= 0; i--) {
+		gdp = ext2_get_group_desc (sb, i, NULL);
+		if (!gdp) {
+			ext2_error (sb, "ext2_check_descriptors",
+					"cant get descriptor for group %d", i);
+			return 0;
+		}
+		
+		free = le16_to_cpu(gdp->bg_free_blocks_count);
+		if (free > reserved)
+			free = reserved;
+		sbi->s_bgi[i].reserved = free;
+		reserved -= free;
+		total_free -= free;
+	}
+	
 	return 1;
 }
 
@@ -768,13 +792,17 @@
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(*sbi->s_debts),
+	sbi->s_bgi = kmalloc(sbi->s_groups_count*sizeof(struct ext2_bg_info),
 			       GFP_KERNEL);
-	if (!sbi->s_debts) {
+	if (!sbi->s_bgi) {
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount_group_desc;
 	}
-	memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(*sbi->s_debts));
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		sbi->s_bgi[i].debts = 0;
+		sbi->s_bgi[i].reserved = 0;
+		spin_lock_init(&sbi->s_bgi[i].balloc_lock);
+	}
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
@@ -820,8 +848,8 @@
 		brelse(sbi->s_group_desc[i]);
 failed_mount_group_desc:
 	kfree(sbi->s_group_desc);
-	if (sbi->s_debts)
-		kfree(sbi->s_debts);
+	if (sbi->s_bgi)
+		kfree(sbi->s_bgi);
 failed_mount:
 	brelse(bh);
 failed_sbi:
@@ -840,6 +868,7 @@
 
 static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 {
+	es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 	es->s_wtime = cpu_to_le32(get_seconds());
 	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
 	sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
@@ -868,6 +897,7 @@
 			ext2_debug ("setting valid to 0\n");
 			es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) &
 						  ~EXT2_VALID_FS);
+			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
 			es->s_mtime = cpu_to_le32(get_seconds());
 			ext2_sync_super(sb, es);
 		} else
@@ -929,7 +959,8 @@
 static int ext2_statfs (struct super_block * sb, struct statfs * buf)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long overhead;
+	unsigned long overhead, total_free = 0;
+	struct ext2_group_desc *desc;
 	int i;
 
 	if (test_opt (sb, MINIX_DF))
@@ -950,9 +981,14 @@
 		 * block group descriptors.  If the sparse superblocks
 		 * feature is turned on, then not all groups have this.
 		 */
-		for (i = 0; i < sbi->s_groups_count; i++)
+		for (i = 0; i < sbi->s_groups_count; i++) {
 			overhead += ext2_bg_has_super(sb, i) +
 				ext2_bg_num_gdb(sb, i);
+			
+			/* sum total free blocks -bzzz */
+			desc = ext2_get_group_desc (sb, i, NULL);
+			total_free += le16_to_cpu(desc->bg_free_blocks_count);
+		}
 
 		/*
 		 * Every block group has an inode bitmap, a block
@@ -965,7 +1001,7 @@
 	buf->f_type = EXT2_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = le32_to_cpu(sbi->s_es->s_blocks_count) - overhead;
-	buf->f_bfree = ext2_count_free_blocks (sb);
+	buf->f_bfree = total_free;
 	buf->f_bavail = buf->f_bfree - le32_to_cpu(sbi->s_es->s_r_blocks_count);
 	if (buf->f_bfree < le32_to_cpu(sbi->s_es->s_r_blocks_count))
 		buf->f_bavail = 0;
diff -uNr linux/include/asm-alpha/bitops.h edited/include/asm-alpha/bitops.h
--- linux/include/asm-alpha/bitops.h	Fri Mar 14 01:53:36 2003
+++ edited/include/asm-alpha/bitops.h	Thu Mar 13 14:10:18 2003
@@ -487,7 +487,9 @@
 
 
 #define ext2_set_bit                 __test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               __test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-arm/bitops.h edited/include/asm-arm/bitops.h
--- linux/include/asm-arm/bitops.h	Fri Mar 14 01:53:36 2003
+++ edited/include/asm-arm/bitops.h	Thu Mar 13 14:10:46 2003
@@ -357,8 +357,12 @@
  */
 #define ext2_set_bit(nr,p)			\
 		__test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_set_bit_atomic(lock,nr,p)          \
+                test_and_set_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_clear_bit(nr,p)			\
 		__test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
+#define ext2_clear_bit_atomic(lock,nr,p)        \
+                test_and_clear_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_test_bit(nr,p)			\
 		__test_bit(WORD_BITOFF_TO_LE(nr), (unsigned long *)(p))
 #define ext2_find_first_zero_bit(p,sz)		\
diff -uNr linux/include/asm-cris/bitops.h edited/include/asm-cris/bitops.h
--- linux/include/asm-cris/bitops.h	Mon Nov 11 06:28:30 2002
+++ edited/include/asm-cris/bitops.h	Thu Mar 13 14:11:15 2003
@@ -360,7 +360,9 @@
 #define hweight8(x) generic_hweight8(x)
 
 #define ext2_set_bit                 test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-i386/bitops.h edited/include/asm-i386/bitops.h
--- linux/include/asm-i386/bitops.h	Wed Dec 25 06:03:08 2002
+++ edited/include/asm-i386/bitops.h	Thu Mar 13 14:11:32 2003
@@ -479,8 +479,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+        test_and_set_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+        test_and_clear_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-ia64/bitops.h edited/include/asm-ia64/bitops.h
--- linux/include/asm-ia64/bitops.h	Thu Feb 20 16:18:21 2003
+++ edited/include/asm-ia64/bitops.h	Thu Mar 13 14:12:50 2003
@@ -453,7 +453,9 @@
 #define __clear_bit(nr, addr)        clear_bit(nr, addr)
 
 #define ext2_set_bit                 test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)   test_and_set_bit(n,a)
 #define ext2_clear_bit               test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a) test_and_clear_bit(n,a)
 #define ext2_test_bit                test_bit
 #define ext2_find_first_zero_bit     find_first_zero_bit
 #define ext2_find_next_zero_bit      find_next_zero_bit
diff -uNr linux/include/asm-m68k/bitops.h edited/include/asm-m68k/bitops.h
--- linux/include/asm-m68k/bitops.h	Mon Nov 11 06:28:33 2002
+++ edited/include/asm-m68k/bitops.h	Fri Mar 14 10:00:15 2003
@@ -354,6 +354,16 @@
 	return retval;
 }
 
+static inline int
+ext2_set_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, vaddr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int
 ext2_clear_bit (int nr, volatile void *vaddr)
 {
@@ -365,6 +375,16 @@
 	return retval;
 }
 
+static inline int
+ext2_clear_bit_atomic (spinlock_t *lock, int nr, volatile void *vaddr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_clear_bit(nr, vaddr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int
 ext2_test_bit (int nr, const volatile void *vaddr)
 {
diff -uNr linux/include/asm-m68knommu/bitops.h edited/include/asm-m68knommu/bitops.h
--- linux/include/asm-m68knommu/bitops.h	Mon Nov 11 06:28:04 2002
+++ edited/include/asm-m68knommu/bitops.h	Fri Mar 14 10:00:52 2003
@@ -387,6 +387,16 @@
 	return retval;
 }
 
+static inline int ext2_set_bit_atomic(spinlock_t *lock, int nr,
+		volatile void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_clear_bit(int nr, volatile void * addr)
 {
 	int		mask, retval;
@@ -402,6 +412,16 @@
 	return retval;
 }
 
+static inline int ext2_clear_bit_atomic(spinlock_t *lock, int nr,
+		volatile void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_clear_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_test_bit(int nr, const volatile void * addr)
 {
 	int	mask;
diff -uNr linux/include/asm-mips/bitops.h edited/include/asm-mips/bitops.h
--- linux/include/asm-mips/bitops.h	Mon Nov 11 06:28:03 2002
+++ edited/include/asm-mips/bitops.h	Fri Mar 14 10:01:22 2003
@@ -810,6 +810,15 @@
 	return retval;
 }
 
+static inline int ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_clear_bit(int nr, void * addr)
 {
 	int		mask, retval, flags;
@@ -824,6 +833,15 @@
 	return retval;
 }
 
+static inline int ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_clear_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern __inline__ int ext2_test_bit(int nr, const void * addr)
 {
 	int			mask;
@@ -890,7 +908,9 @@
 
 /* Native ext2 byte ordering, just collapse using defines. */
 #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
 #define ext2_test_bit(nr, addr) test_bit((nr), (addr))
 #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
 #define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-mips64/bitops.h edited/include/asm-mips64/bitops.h
--- linux/include/asm-mips64/bitops.h	Mon Nov 11 06:28:29 2002
+++ edited/include/asm-mips64/bitops.h	Fri Mar 14 10:01:46 2003
@@ -516,6 +516,16 @@
 	return retval;
 }
 
+static inline int
+ext2_set_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern inline int
 ext2_clear_bit(int nr, void * addr)
 {
@@ -531,6 +541,16 @@
 	return retval;
 }
 
+static inline int
+ext2_clear_bit_atomic(spinlock_t * lock, int nr, void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_clear_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 extern inline int
 ext2_test_bit(int nr, const void * addr)
 {
@@ -599,7 +619,9 @@
 
 /* Native ext2 byte ordering, just collapse using defines. */
 #define ext2_set_bit(nr, addr) test_and_set_bit((nr), (addr))
+#define ext2_set_bit_atomic(lock, nr, addr) test_and_set_bit((nr), (addr))
 #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr), (addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr), (addr))
 #define ext2_test_bit(nr, addr) test_bit((nr), (addr))
 #define ext2_find_first_zero_bit(addr, size) find_first_zero_bit((addr), (size))
 #define ext2_find_next_zero_bit(addr, size, offset) \
diff -uNr linux/include/asm-parisc/bitops.h edited/include/asm-parisc/bitops.h
--- linux/include/asm-parisc/bitops.h	Thu Feb 20 16:18:21 2003
+++ edited/include/asm-parisc/bitops.h	Thu Mar 13 14:29:47 2003
@@ -389,10 +389,14 @@
  */
 #ifdef __LP64__
 #define ext2_set_bit(nr, addr)		test_and_set_bit((nr) ^ 0x38, addr)
+#define ext2_set_bit_atomic(l,nr,addr)  test_and_set_bit((nr) ^ 0x38, addr)
 #define ext2_clear_bit(nr, addr)	test_and_clear_bit((nr) ^ 0x38, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x38, addr)
 #else
 #define ext2_set_bit(nr, addr)		test_and_set_bit((nr) ^ 0x18, addr)
+#define ext2_set_bit_atomic(l,nr,addr)  test_and_set_bit((nr) ^ 0x18, addr)
 #define ext2_clear_bit(nr, addr)	test_and_clear_bit((nr) ^ 0x18, addr)
+#define ext2_clear_bit_atomic(l,nr,addr) test_and_clear_bit((nr) ^ 0x18, addr)
 #endif
 
 #endif	/* __KERNEL__ */
diff -uNr linux/include/asm-ppc/bitops.h edited/include/asm-ppc/bitops.h
--- linux/include/asm-ppc/bitops.h	Mon Jan 20 05:23:05 2003
+++ edited/include/asm-ppc/bitops.h	Thu Mar 13 14:31:00 2003
@@ -392,7 +392,9 @@
 
 
 #define ext2_set_bit(nr, addr)	__test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock, nr, addr)  test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr))
 #define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock, nr, addr) test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr))
 
 static __inline__ int ext2_test_bit(int nr, __const__ void * addr)
 {
diff -uNr linux/include/asm-ppc64/bitops.h edited/include/asm-ppc64/bitops.h
--- linux/include/asm-ppc64/bitops.h	Mon Nov 11 06:28:28 2002
+++ edited/include/asm-ppc64/bitops.h	Thu Mar 13 14:32:23 2003
@@ -336,8 +336,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_le_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock, nr,addr) \
+	        test_and_set_le_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_le_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr) \
+	        test_and_clear_le_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_le_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_le_bit((unsigned long*)addr, size)
diff -uNr linux/include/asm-s390/bitops.h edited/include/asm-s390/bitops.h
--- linux/include/asm-s390/bitops.h	Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390/bitops.h	Thu Mar 13 14:33:55 2003
@@ -805,8 +805,12 @@
 
 #define ext2_set_bit(nr, addr)       \
 	test_and_set_bit((nr)^24, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr)       \
+	        test_and_set_bit((nr)^24, (unsigned long *)addr)
 #define ext2_clear_bit(nr, addr)     \
 	test_and_clear_bit((nr)^24, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr)     \
+	        test_and_clear_bit((nr)^24, (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^24, (unsigned long *)addr)
 
diff -uNr linux/include/asm-s390x/bitops.h edited/include/asm-s390x/bitops.h
--- linux/include/asm-s390x/bitops.h	Fri Mar 14 01:53:27 2003
+++ edited/include/asm-s390x/bitops.h	Thu Mar 13 14:35:22 2003
@@ -838,8 +838,12 @@
 
 #define ext2_set_bit(nr, addr)       \
 	test_and_set_bit((nr)^56, (unsigned long *)addr)
+#define ext2_set_bit_atomic(lock, nr, addr)       \
+	        test_and_set_bit((nr)^56, (unsigned long *)addr)
 #define ext2_clear_bit(nr, addr)     \
 	test_and_clear_bit((nr)^56, (unsigned long *)addr)
+#define ext2_clear_bit_atomic(lock, nr, addr)     \
+	        test_and_clear_bit((nr)^56, (unsigned long *)addr)
 #define ext2_test_bit(nr, addr)      \
 	test_bit((nr)^56, (unsigned long *)addr)
 
diff -uNr linux/include/asm-sh/bitops.h edited/include/asm-sh/bitops.h
--- linux/include/asm-sh/bitops.h	Mon Nov 11 06:28:02 2002
+++ edited/include/asm-sh/bitops.h	Fri Mar 14 10:03:08 2003
@@ -265,6 +265,16 @@
 	return retval;
 }
 
+static inline int ext2_set_bit_atomic(spinlock_t *lock,
+		int nr, volatile void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_set_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 static __inline__ int ext2_clear_bit(int nr, volatile void * addr)
 {
 	int		mask, retval;
@@ -280,6 +290,16 @@
 	return retval;
 }
 
+static inline int ext2_clear_bit_atomic(spinlock_t *lock,
+		int nr, volatile void * addr)
+{
+	int ret;
+	spin_lock(lock);
+	ret = ext2_clear_bit(nr, addr);
+	spin_unlock(lock);
+	return ret;
+}
+
 static __inline__ int ext2_test_bit(int nr, const volatile void * addr)
 {
 	int			mask;
diff -uNr linux/include/asm-sparc/bitops.h edited/include/asm-sparc/bitops.h
--- linux/include/asm-sparc/bitops.h	Mon Jan 20 05:23:05 2003
+++ edited/include/asm-sparc/bitops.h	Thu Mar 13 14:38:54 2003
@@ -454,7 +454,9 @@
         find_next_zero_le_bit((addr), (size), 0)
 
 #define ext2_set_bit			__test_and_set_le_bit
+#define ext2_set_bit_atomic(l,n,a)      test_and_set_le_bit(n,a)
 #define ext2_clear_bit			__test_and_clear_le_bit
+#define ext2_clear_bit_atomic(l,n,a)    test_and_clear_le_bit(n,a)
 #define ext2_test_bit			test_le_bit
 #define ext2_find_first_zero_bit	find_first_zero_le_bit
 #define ext2_find_next_zero_bit		find_next_zero_le_bit
diff -uNr linux/include/asm-sparc64/bitops.h edited/include/asm-sparc64/bitops.h
--- linux/include/asm-sparc64/bitops.h	Mon Nov 11 06:28:05 2002
+++ edited/include/asm-sparc64/bitops.h	Thu Mar 13 14:43:49 2003
@@ -351,7 +351,9 @@
 #ifdef __KERNEL__
 
 #define ext2_set_bit(nr,addr)		test_and_set_le_bit((nr),(unsigned long *)(addr))
+#define ext2_set_bit_atomic(lock,nr,addr) test_and_set_le_bit((nr),(unsigned long *)(addr))
 #define ext2_clear_bit(nr,addr)		test_and_clear_le_bit((nr),(unsigned long *)(addr))
+#define ext2_clear_bit_atomic(lock,nr,addr) test_and_clear_le_bit((nr),(unsigned long *)(addr))
 #define ext2_test_bit(nr,addr)		test_le_bit((nr),(unsigned long *)(addr))
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_le_bit((unsigned long *)(addr), (size))
diff -uNr linux/include/asm-v850/bitops.h edited/include/asm-v850/bitops.h
--- linux/include/asm-v850/bitops.h	Mon Nov 11 06:28:02 2002
+++ edited/include/asm-v850/bitops.h	Thu Mar 13 14:44:48 2003
@@ -252,7 +252,9 @@
 #define hweight8(x) 			generic_hweight8 (x)
 
 #define ext2_set_bit			test_and_set_bit
+#define ext2_set_bit_atomic(l,n,a)      test_and_set_bit(n,a)
 #define ext2_clear_bit			test_and_clear_bit
+#define ext2_clear_bit_atomic(l,n,a)    test_and_clear_bit(n,a)
 #define ext2_test_bit			test_bit
 #define ext2_find_first_zero_bit	find_first_zero_bit
 #define ext2_find_next_zero_bit		find_next_zero_bit
diff -uNr linux/include/asm-x86_64/bitops.h edited/include/asm-x86_64/bitops.h
--- linux/include/asm-x86_64/bitops.h	Fri Mar 14 01:53:27 2003
+++ edited/include/asm-x86_64/bitops.h	Thu Mar 13 14:45:56 2003
@@ -487,8 +487,12 @@
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
+#define ext2_set_bit_atomic(lock,nr,addr) \
+	        test_and_set_bit((nr),(unsigned long*)addr)
 #define ext2_clear_bit(nr, addr) \
 	__test_and_clear_bit((nr),(unsigned long*)addr)
+#define ext2_clear_bit_atomic(lock,nr,addr) \
+	        test_and_clear_bit((nr),(unsigned long*)addr)
 #define ext2_test_bit(nr, addr)      test_bit((nr),(unsigned long*)addr)
 #define ext2_find_first_zero_bit(addr, size) \
 	find_first_zero_bit((unsigned long*)addr, size)
diff -uNr linux/include/linux/ext2_fs_sb.h edited/include/linux/ext2_fs_sb.h
--- linux/include/linux/ext2_fs_sb.h	Mon Nov 11 06:28:30 2002
+++ edited/include/linux/ext2_fs_sb.h	Fri Mar 14 09:51:10 2003
@@ -16,6 +16,12 @@
 #ifndef _LINUX_EXT2_FS_SB
 #define _LINUX_EXT2_FS_SB
 
+struct ext2_bg_info {
+	u8 debts;
+	spinlock_t balloc_lock;
+	unsigned int reserved;
+} ____cacheline_aligned_in_smp;
+
 /*
  * second extended-fs super-block data in memory
  */
@@ -44,7 +50,7 @@
 	int s_first_ino;
 	u32 s_next_generation;
 	unsigned long s_dir_count;
-	u8 *s_debts;
+	struct ext2_bg_info *s_bgi;
 };
 
 #endif	/* _LINUX_EXT2_FS_SB */



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 23:56     ` Andreas Dilger
  2003-03-14  7:20       ` Alex Tomas
@ 2003-03-14 18:25       ` Martin J. Bligh
  2003-03-14 19:30       ` [Ext2-devel] " Daniel Phillips
  2 siblings, 0 replies; 39+ messages in thread
From: Martin J. Bligh @ 2003-03-14 18:25 UTC (permalink / raw)
  To: Andreas Dilger, Alex Tomas; +Cc: Andrew Morton, linux-kernel, ext2-devel

> First of all, thanks for this work, Alex.  It's been a long time in coming.
> 
> One thing I would wonder about is whether we should be implementing this in
> ext2, or in ext3 only.  One of the decisions we made in the past is that we
> shouldn't necessarily implement everything in ext2 (especially features that
> complicated the code, and are only useful on high-end systems).
> 
> There was a desire to keep ext2 small and simple, and ext3 would get the
> fancy high-end features that make sense if you have a large filesystem
> that you would likely be using in conjunction with ext3 anyways.

Errrm ... if you want to start advocating that sort of thing, I suggest 
you make ext3 usable on high end systems first. At the moment, that makes 
no sense whatsoever. Ext3 still doesn't scale to big systems.

M.



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-13 23:56     ` Andreas Dilger
  2003-03-14  7:20       ` Alex Tomas
  2003-03-14 18:25       ` Martin J. Bligh
@ 2003-03-14 19:30       ` Daniel Phillips
  2003-03-14 19:55         ` Andrew Morton
  2 siblings, 1 reply; 39+ messages in thread
From: Daniel Phillips @ 2003-03-14 19:30 UTC (permalink / raw)
  To: Andreas Dilger, Alex Tomas; +Cc: Andrew Morton, linux-kernel, ext2-devel

On Fri 14 Mar 03 00:56, Andreas Dilger wrote:
> There was a desire to keep ext2 small and simple, and ext3 would get the
> fancy high-end features that make sense if you have a large filesystem
> that you would likely be using in conjunction with ext3 anyways.
>
> It does make sense to test this out on ext2 since it is definitely easier
> to code for ext2 than ext3, and the journaling doesn't skew the performance
> so much.  Of course one of the reasons that ext2 is easier to code for is
> exactly _because_ we don't put all of the features into ext2...
>
> Comments on the code inline below...

Ext3 is getting to the point, or has already gotten to the point, where it's 
so reliable that it's reasonable to call it Linux's new native filesystem. At 
this point, Ext2 can become more of a crucible for new techniques, hopefully, 
techniques that simplify things, shorten up data paths, clarify the code, 
make it more parallel and so on.  For example, I can't help thinking that 
there's some fundamental improvement possible to the truncate path (hmm, I 
wonder if I'm giving Alex new ideas...) and that proving such a thing out in 
Ext2 first would make a whole lot of sense.

I do intend to pick up the Ext2 HTree patch again in due course and attempt 
some simplification of it, as well as working on the outstanding 
optimizations, i.e., improved inode allocation and delete coalescing.  HTree 
is an example of a feature that adds a few K of code, but in my opinion it's 
worth it in order to match up better with the Ext3 feature set.  Besides, 
Ext2 is still quite attractive as a host filesystem for NFS export, and would 
be still more attractive with the directory index.

(By the way, on the HTree simplification front, there's a whole lot of 
forward declaration cruft that can go away as soon as CONFIG_EXT3_INDEX
is declared to be always on.)

So anyway, the point you were making, and that I agree with, is that Ext2 is
growing into the role of experimental filesystem; Ext3 is now the stable 
filesystem.  Hopefully, over time the experiments will make Ext2 smaller, 
cleaner and at the same time more powerful.  Sort of like the role that RAMFS 
plays: besides being useful, Ext2 should be thought of as a showcase for best 
filesystem coding practices.

Regards,

Daniel

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-14 19:30       ` [Ext2-devel] " Daniel Phillips
@ 2003-03-14 19:55         ` Andrew Morton
  0 siblings, 0 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-14 19:55 UTC (permalink / raw)
  To: Daniel Phillips; +Cc: adilger, bzzz, linux-kernel, ext2-devel

Daniel Phillips <phillips@arcor.de> wrote:
>
> Ext2 should be thought of as a showcase for best 
> filesystem coding practices.

Yes.  It is the reference block-backed filesystem for the VFS and VM API.  If
a feature is added to core kernel, ext2 gets to use it first, and ext2 is the
place to look to see "how is it done".

Arguably, minixfs should be playing that role, and it is close.  But it is
now missing a few things.

ext2 is also scarily quick.


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-14  7:20       ` Alex Tomas
@ 2003-03-14 20:59         ` Andreas Dilger
  2003-03-14 21:14           ` Alex Tomas
  2003-03-15  4:37         ` William Lee Irwin III
  1 sibling, 1 reply; 39+ messages in thread
From: Andreas Dilger @ 2003-03-14 20:59 UTC (permalink / raw)
  To: Alex Tomas; +Cc: Andrew Morton, linux-kernel, ext2-devel

On Mar 14, 2003  10:20 +0300, Alex Tomas wrote:
>  AD> I would suggest just hooking the ext2_group_desc (and the
>  AD> buffer_head in which it lives) off of the ext2_bg_info array
>  AD> instead of passing both around explicitly.  Since we have
>  AD> ext2_bg_info as a group_number-indexed array already, this would
>  AD> essentially mean that wherever we call ext2_get_group_desc() we
>  AD> could just use sbi->bgi[group].desc (or make
>  AD> ext2_get_group_desc() do that, if we don't need it to populate
>  AD> bgi[group].desc in the first place).
> 
> it makes sense. what about making it a separate patch?
> just to keep concurrent-balloc.diff from growing huge

Could you make it a pre-requisite to the concurrent-alloc patch?  That
would make it a shoo-in to being accepted (cleans up code nicely).
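
For reference, a rough sketch of the hookup I have in mind (untested;
the desc/desc_bh fields are new, everything else is from your patch):

	struct ext2_bg_info {
		u8 debts;
		spinlock_t balloc_lock;
		unsigned int reserved;
		struct ext2_group_desc *desc;	/* cached group descriptor */
		struct buffer_head *desc_bh;	/* block it lives in */
	} ____cacheline_aligned_in_smp;

	/* filled in once at mount, after the descriptor blocks are read: */
	for (i = 0; i < sbi->s_groups_count; i++)
		sbi->s_bgi[i].desc =
			ext2_get_group_desc(sb, i, &sbi->s_bgi[i].desc_bh);

so that the allocators can shrink to

	desc = sbi->s_bgi[group_no].desc;
	gdp_bh = sbi->s_bgi[group_no].desc_bh;

instead of calling ext2_get_group_desc() and passing the pair around.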

>  >> + if (free_blocks < root_blocks + count &&
>  >> !capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid &&
>  >> + (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { + /* + *
>  >> We are too close to reserve and we are not privileged.  + * Can we
>  >> allocate anything at all?  + */ + if (free_blocks > root_blocks) +
>  >> count = free_blocks - root_blocks; + else { +
>  >> spin_unlock(&bgi->alloc_lock); + return 0; + }

Argh, please try not to wrap code...

>  AD> Per my other email, if we want to handle large files properly by
>  AD> allowing them to fill the entire group, yet we want to keep the
>  AD> "reserved blocks" count correct, we could always grab the lock on
>  AD> the last group and add reserved blocks there.  Or, we could just
>  AD> ignore the reserved blocks count entirely.
> 
> hmm. looks like I'm missing something here. reservation is protected by the lock.
> what's the problem?

So, what Andrew had complained about with the per-group reservation is
that it leaves "gaps" in the allocation of large files.  Small gaps,
which IMHO aren't so critical, but whatever.  To avoid those gaps in
the allocation of large files you could additionally allow allocations
from the "reserved pool" of the group in cases like:

    (inode->i_blocks >> (inode->i_blkbits - 9)) > sbi->s_blocks_per_group / 2

If we want to preserve the total reserved blocks count (in the case where
the above test is the only reason we can allocate these blocks), we can
shift any reserved blocks we are "stealing" from this group into the last
group.
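
In group_reserve_blocks() terms that could be as small as the following
(untested sketch; it assumes the inode is passed down, which the current
patch does not do):

	/* i_blocks is in 512-byte sectors.  Let a file that already
	 * covers more than half a group dip into this group's
	 * reserved pool so it can stay contiguous: */
	if ((inode->i_blocks >> (inode->i_blkbits - 9)) >
			sbi->s_blocks_per_group / 2)
		use_reserve = 1;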

>  AD> I'm not sure whether I agree with this or not...  If a group ever
>  AD> exceeds the reserved mark for some reason (e.g. full filesystem)
>  AD> it will never be able to "improve itself" back to a decent amount
>  AD> of reserved blocks.  That said, you may want to only reduce
>  AD> "reserved" by "free" in the end, so that the total amount of
>  AD> reserved blocks is kept constant (need to re-calculate
>  AD> "per_group" for each loop).
> 
> well, I believe reserved blocks may be really _reserved_ at the end of
> the fs. simply because nobody should use them until the fs is almost full.

The point of having the reserved blocks is to reduce fragmentation
in file allocation.  Having per-group reserved blocks is a good
idea, because it keeps the reserved "slack" per group, and helps file
allocations within that group have a bit of free space in which to grow.
If you are reserving all of the blocks at the end of the filesystem,
then the earlier groups will become 100% allocated prematurely and lose
any ability to keep files there contiguous.

What I was disagreeing with was reducing a group's reserved count because
it currently exceeds the per_group reserved count.  That's like saying
"the filesystem is 99% full, reduce the total reserved count to 1%".
Even if a group _currently_ exceeds the reserved limit, we should keep
the reserved limit for that group as-is, and hopefully allow it to grow
more "slack" for future allocation improvement if files are deleted.

If we are concerned about the total reserved blocks count (which I
personally am not), we can always add the shortfall in reserved blocks
for the current group to the remaining groups without reducing the
current group's reserved limit.
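
Roughly, in the distribution loop in ext2_check_descriptors() (untested
sketch; per_group and shortfall are illustrative names):

	per_group = reserved / sbi->s_groups_count;
	shortfall = 0;
	for (i = sbi->s_groups_count - 1; i >= 0; i--) {
		gdp = ext2_get_group_desc(sb, i, NULL);
		free = le16_to_cpu(gdp->bg_free_blocks_count);

		/* never reduce this group's target below per_group */
		sbi->s_bgi[i].reserved = per_group;
		if (free < per_group) {
			/* push the unbacked part onto the groups
			 * we have not visited yet */
			shortfall += per_group - free;
		} else if (shortfall) {
			unsigned extra = free - per_group;
			if (extra > shortfall)
				extra = shortfall;
			sbi->s_bgi[i].reserved += extra;
			shortfall -= extra;
		}
	}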

>  {
> -	struct ext2_sb_info * sbi = EXT2_SB(sb);
> -	struct ext2_super_block * es = sbi->s_es;
> -	unsigned free_blocks = le32_to_cpu(es->s_free_blocks_count);
> -	unsigned root_blocks = le32_to_cpu(es->s_r_blocks_count);
> +	unsigned free_blocks;
> +	unsigned root_blocks;
>  
> +	spin_lock(&bgi->balloc_lock);
> +	
> +	free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
>  	if (free_blocks < count)
>  		count = free_blocks;
> +	root_blocks = bgi->reserved;

>  >> + root_blocks = bgi->reserved;
> 
>  AD> I would avoid calling this "root_blocks" and instead just use
>  AD> "bgi->reserved" or "reserved_blocks" everywhere.  The original
>  AD> intent of these blocks was to reduce fragmentation and not
>  AD> necessarily reserved-for-root.
> 
> fixed

??

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-14 20:59         ` Andreas Dilger
@ 2003-03-14 21:14           ` Alex Tomas
  0 siblings, 0 replies; 39+ messages in thread
From: Alex Tomas @ 2003-03-14 21:14 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Alex Tomas, Andrew Morton, linux-kernel, ext2-devel

>>>>> Andreas Dilger (AD) writes:

 AD> Could you make it a pre-requisite to the concurrent-alloc patch?
 AD> That would make it a shoo-in to being accepted (cleans up code
 AD> nicely).

Andrew already asked me to wait until the next -mm

 AD> The point of having the reserved blocks is to reduce
 AD> fragmentation in file allocation.  Having per-group reserved
 AD> blocks is a good idea, because it keeps the reserved "slack" per
 AD> group, and helps file allocations within that group have a bit of
 AD> free space in which to grow.  If you are reserving all of the
 AD> blocks at the end of the filesystem, then the earlier groups will
 AD> become 100% allocated prematurely and lose any ability to keep
 AD> files there contiguous.

well. looks like I'm missing something here. I thought reservation was not
an allocation policy, but a mechanism to protect some user (root, usually)
from fs overflow


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-14  7:20       ` Alex Tomas
  2003-03-14 20:59         ` Andreas Dilger
@ 2003-03-15  4:37         ` William Lee Irwin III
  2003-03-15  4:54           ` Andrew Morton
  1 sibling, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  4:37 UTC (permalink / raw)
  To: Alex Tomas; +Cc: Andreas Dilger, Andrew Morton, linux-kernel, ext2-devel

On Fri, Mar 14, 2003 at 10:20:24AM +0300, Alex Tomas wrote:
> and corrected patch:

This patch is a godsend. Whoever's listening, please apply!

dbench on 32x/48G NUMA-Q, aic7xxx adapter, pbay disk, 32K PAGE_SIZE
(pgcl was used for benchmark feasibility purposes)

throughput:
---------- 
before:
Throughput 61.5376 MB/sec 512 procs
dbench 512  637.21s user 15739.41s system 565% cpu 48:16.28 total

after:
Throughput 104.074 MB/sec 512 procs
(GRR, didn't do time, took ca. 30 minutes)

profile:
--------
before:
vma      samples    %-age       symbol name
c0106ff4 160824916  45.1855     default_idle
c01dbfd0  49993575  14.0462     __copy_to_user_ll
c01dc038  15474349   4.34768    __copy_from_user_ll
c0108140  13603867   3.82215    .text.lock.semaphore
c0119058  10872716   3.0548     try_to_wake_up
c02647f0   7896052   2.21848    sync_buffer
c011a1bc   7539112   2.11819    schedule
c0119dac   7168574   2.01409    scheduler_tick
c011fadc   6053745   1.70086    profile_hook
c0119860   4759523   1.33724    load_balance
c0107d0c   4472105   1.25649    __down
c011c4ff   4159010   1.16852    .text.lock.sched
c013dd28   3026705   0.850385   .text.lock.vmscan
c013ece4   3016788   0.847599   check_highmem_ptes
c0113590   2406329   0.676084   mark_offset_tsc
c02649c0   2210485   0.621059   add_event_entry
c010f6b8   2195748   0.616919   timer_interrupt
c0133118   1696204   0.476566   find_get_page

after:
vma        samples  %-age       symbol name
c0106ff4   52751908 30.8696     default_idle
c01dc3b0   28988721 16.9637     __copy_to_user_ll
c01dc418    8240854  4.82242    __copy_from_user_ll
c011e472    8044716  4.70764    .text.lock.fork
c0264bd0    5666004  3.31566    sync_buffer
c013dd28    4454362  2.60662    .text.lock.vmscan
c0119058    4291999  2.51161    try_to_wake_up
c0119dac    4055412  2.37316    scheduler_tick
c011fadc    3554019  2.07976    profile_hook
c011a1bc    2866025  1.67715    schedule
c0119860    2637644  1.54351    load_balance
c0108140    2433644  1.42413    .text.lock.semaphore
c0264da0    1406704  0.823181   add_event_entry
c011c9a4    1370708  0.802117   prepare_to_wait
c0185e20    1236390  0.723516   ext2_new_block
c011c4ff    1227452  0.718285   .text.lock.sched
c013ece4    1148317  0.671977   check_highmem_ptes
c0113590    1145881  0.670551   mark_offset_tsc


vmstat (short excerpt, edited for readability):
------
before:
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b     free    buff    cache     bi   bo    in   cs    us sy id wa
12  5   38747168 484672  9049088    20  4032  1171 13148   1 22 65 12
11 11   38767264 479168  9034304    20  2908  1180 13077   1 28 52 19
 9 14   38764256 480000  9036512    24  1920  1164 13940   1 23 51 25
 7  7   38764128 480832  9035360    12  4444  1191 13784   1 24 51 24
 9  5   38764512 481664  9033024    16  2924  1220 13853   1 23 66 10
 9  6   38762208 482816  9035904     0  3404  1186 13686   1 25 62 12

after:
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b     free    buff    cache     bi    bo   in    cs   us sy id wa
60 11   38659840 533920  9226720   100  1672  2760  1853   5 66 11 18
31 23   38565472 531264  9320384   240  1020  1195  1679   2 35 37 26
23 23   38384928 521952  9503104   772  3372  5624  5093   2 62  9 27
24 31   37945664 518080  9916448  1536  5808 10449 13484   1 45 13 41
31 86   37755072 516096 10091104  1040  1916  3672  9744   2 51 15 32
24 30   37644352 512864 10192960   900  1612  3184  8414   2 49 12 36

There's a lot of odd things going on in both of the vmstat logs.


I've also collected logs of top slab consumers every 10s and full
dbench output for both runs, if that's interesting to anyone.


-- wli

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  4:37         ` William Lee Irwin III
@ 2003-03-15  4:54           ` Andrew Morton
  2003-03-15  5:30             ` William Lee Irwin III
  2003-03-15  5:49             ` William Lee Irwin III
  0 siblings, 2 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-15  4:54 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: bzzz, adilger, linux-kernel, ext2-devel

William Lee Irwin III <wli@holomorphy.com> wrote:
>
> On Fri, Mar 14, 2003 at 10:20:24AM +0300, Alex Tomas wrote:
> > and corrected patch:
> 
> This patch is a godsend. Whoever's listening, please apply!
> 
> dbench on 32x/48G NUMA-Q, aic7xxx adapter, pbay disk, 32K PAGE_SIZE
> (pgcl was used for benchmark feasibility purposes)
> 
> throughput:
> ---------- 
> before:
> Throughput 61.5376 MB/sec 512 procs
> dbench 512  637.21s user 15739.41s system 565% cpu 48:16.28 total
> 
> after:
> Throughput 104.074 MB/sec 512 procs
> (GRR, didn't do time, took ca. 30 minutes)

`dbench 512' will presumably do lots of IO and spend significant
time in I/O wait.  You should see the effects of this change more
if you use fewer clients (say, 32) so it doesn't hit disk.

On quad power4, dbench 32:

Unpatched:

	Throughput 334.372 MB/sec (NB=417.965 MB/sec  3343.72 MBit/sec)
	Throughput 331.379 MB/sec (NB=414.224 MB/sec  3313.79 MBit/sec)
	Throughput 364.151 MB/sec (NB=455.189 MB/sec  3641.51 MBit/sec)
	Throughput 333.066 MB/sec (NB=416.332 MB/sec  3330.66 MBit/sec)
	Throughput 365.335 MB/sec (NB=456.669 MB/sec  3653.35 MBit/sec)
	Throughput 335.523 MB/sec (NB=419.404 MB/sec  3355.23 MBit/sec)
	Throughput 334.457 MB/sec (NB=418.071 MB/sec  3344.57 MBit/sec)
	Throughput 329.527 MB/sec (NB=411.909 MB/sec  3295.27 MBit/sec)
	Throughput 332.721 MB/sec (NB=415.901 MB/sec  3327.21 MBit/sec)
	Throughput 328.735 MB/sec (NB=410.919 MB/sec  3287.35 MBit/sec)

patched:

	Throughput 335.262 MB/sec (NB=419.078 MB/sec  3352.62 MBit/sec)
	Throughput 334.531 MB/sec (NB=418.163 MB/sec  3345.31 MBit/sec)
	Throughput 337.366 MB/sec (NB=421.707 MB/sec  3373.66 MBit/sec)
	Throughput 334.504 MB/sec (NB=418.13 MB/sec  3345.04 MBit/sec)
	Throughput 332.482 MB/sec (NB=415.602 MB/sec  3324.82 MBit/sec)
	Throughput 334.69 MB/sec (NB=418.363 MB/sec  3346.9 MBit/sec)
	Throughput 370.14 MB/sec (NB=462.675 MB/sec  3701.4 MBit/sec)
	Throughput 333.255 MB/sec (NB=416.569 MB/sec  3332.55 MBit/sec)
	Throughput 336.065 MB/sec (NB=420.081 MB/sec  3360.65 MBit/sec)
	Throughput 334.328 MB/sec (NB=417.91 MB/sec  3343.28 MBit/sec)

No difference at all.

On the quad Xeon (after increasing dirty_ratio and dirty_background_ratio so
I/O was negligible) I was able to measure a 1.5% improvement.

I worry about the hardware you're using there.

> profile:
> --------
>
> ...
> after:
> vma        samples  %-age       symbol name
> c0106ff4   52751908 30.8696     default_idle
> c01dc3b0   28988721 16.9637     __copy_to_user_ll
> c01dc418    8240854  4.82242    __copy_from_user_ll
> c011e472    8044716  4.70764    .text.lock.fork
> c0264bd0    5666004  3.31566    sync_buffer
> c013dd28    4454362  2.60662    .text.lock.vmscan
> c0119058    4291999  2.51161    try_to_wake_up
> c0119dac    4055412  2.37316    scheduler_tick
> c011fadc    3554019  2.07976    profile_hook
> c011a1bc    2866025  1.67715    schedule
> c0119860    2637644  1.54351    load_balance
> c0108140    2433644  1.42413    .text.lock.semaphore
> c0264da0    1406704  0.823181   add_event_entry
> c011c9a4    1370708  0.802117   prepare_to_wait
> c0185e20    1236390  0.723516   ext2_new_block
> c011c4ff    1227452  0.718285   .text.lock.sched
> c013ece4    1148317  0.671977   check_highmem_ptes
> c0113590    1145881  0.670551   mark_offset_tsc

Lots of idle time.  Try it with a smaller client count, get the I/O out of
the picture.

> 
> vmstat (short excerpt, edited for readability):

With what interval?

> after:
> procs -----------memory---------- -----io---- --system-- ----cpu----
>  r  b     free    buff    cache     bi    bo   in    cs   us sy id wa
> 60 11   38659840 533920  9226720   100  1672  2760  1853   5 66 11 18
> 31 23   38565472 531264  9320384   240  1020  1195  1679   2 35 37 26
> 23 23   38384928 521952  9503104   772  3372  5624  5093   2 62  9 27
> 24 31   37945664 518080  9916448  1536  5808 10449 13484   1 45 13 41
> 31 86   37755072 516096 10091104  1040  1916  3672  9744   2 51 15 32
> 24 30   37644352 512864 10192960   900  1612  3184  8414   2 49 12 36
> 
> There's a lot of odd things going on in both of the vmstat logs.

Where are all those interrupts coming from?



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  4:54           ` Andrew Morton
@ 2003-03-15  5:30             ` William Lee Irwin III
  2003-03-15  5:43               ` Martin J. Bligh
  2003-03-15  5:49             ` William Lee Irwin III
  1 sibling, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  5:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: bzzz, adilger, linux-kernel, ext2-devel

William Lee Irwin III <wli@holomorphy.com> wrote:
>> dbench on 32x/48G NUMA-Q, aic7xxx adapter, pbay disk, 32K PAGE_SIZE
>> (pgcl was used for benchmark feasibility purposes)
>> throughput:
>> ---------- 
>> before:
>> Throughput 61.5376 MB/sec 512 procs
>> dbench 512  637.21s user 15739.41s system 565% cpu 48:16.28 total
>> after:
>> Throughput 104.074 MB/sec 512 procs
>> (GRR, didn't do time, took ca. 30 minutes)

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> `dbench 512' will presumably do lots of IO and spend significant
> time in I/O wait.  You should see the effects of this change more
> if you use fewer clients (say, 32) so it doesn't hit disk.
> On quad power4, dbench 32:

Hmm. I'm just trying to spawn enough tasks to keep the cpus busy: a thread
pool large enough that something is always runnable when a task sleeps.
There's enough idle time now that this sounds like the wrong direction
to move the task count in...


On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> Unpatched:
> 	Throughput 334.372 MB/sec (NB=417.965 MB/sec  3343.72 MBit/sec)
[...]
> patched:
> 	Throughput 335.262 MB/sec (NB=419.078 MB/sec  3352.62 MBit/sec)
[...]
> No difference at all.
> On the quad Xeon (after increasing dirty_ratio and dirty_background_ratio so
> I/O was negligible) I was able to measure a 1.5% improvement.
> I worry about the hardware you're using there.

Why? The adapter is "vaguely modern" (actually acquired as part of a
hunt for an HBA w/a less buggy driver) but the box and disks and so on
are still pretty ancient, so the absolute numbers aren't useful.

To get a real comparison we'd have to compare spindles, HBA's, and
cpus, and attempt to factor them out. The disks are actually only
capable of doing 30MB/s or 40MB/s, the buses can only do 40MB/s, and
the cpus are 700MHz P-III's. Where dbench gets its numbers faster than
wirespeed I have no idea...

This locking issue may just need more cpus to bring out.


William Lee Irwin III <wli@holomorphy.com> wrote:
>> profile:
>> --------
[...]
>> after:
>> vma        samples  %-age       symbol name
>> c0106ff4   52751908 30.8696     default_idle
>> c01dc3b0   28988721 16.9637     __copy_to_user_ll
>> c01dc418    8240854  4.82242    __copy_from_user_ll
>> c011e472    8044716  4.70764    .text.lock.fork
[...]
> Lots of idle time.  Try it with a smaller client count, get the I/O out of
> the picture.

I'll have trouble as there won't be enough tasks to keep the cpus busy.
Why do you think reducing the client count gets io out of the picture?
Why do you think reducing the client count will reduce idle time?


William Lee Irwin III <wli@holomorphy.com> wrote:
>> vmstat (short excerpt, edited for readability):

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> With what interval?

Sorry, 1s.


William Lee Irwin III <wli@holomorphy.com> wrote:
>> after:
>> procs -----------memory---------- -----io---- --system-- ----cpu----
>>  r  b     free    buff    cache     bi    bo   in    cs   us sy id wa
>> 60 11   38659840 533920  9226720   100  1672  2760  1853   5 66 11 18
>> 31 23   38565472 531264  9320384   240  1020  1195  1679   2 35 37 26
>> 23 23   38384928 521952  9503104   772  3372  5624  5093   2 62  9 27
>> 24 31   37945664 518080  9916448  1536  5808 10449 13484   1 45 13 41
>> 31 86   37755072 516096 10091104  1040  1916  3672  9744   2 51 15 32
>> 24 30   37644352 512864 10192960   900  1612  3184  8414   2 49 12 36
>> There's a lot of odd things going on in both of the vmstat logs.

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> Where are all those interrupts coming from?

Well, the timer interrupt is a killer. 1KHz*num_cpus_online() blows
goats for sufficiently large num_cpus_online(), but for some reason
things are slower without it. I suspect that scheduling response
time is somehow dependent on it.

I got a hold of an aic7xxx so io throughput is slightly better than my
usual NUMA-Q runs (i.e. oopsen). The disks are still clockwork, though.


-- wli

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  5:30             ` William Lee Irwin III
@ 2003-03-15  5:43               ` Martin J. Bligh
  2003-03-15  5:50                 ` William Lee Irwin III
  0 siblings, 1 reply; 39+ messages in thread
From: Martin J. Bligh @ 2003-03-15  5:43 UTC (permalink / raw)
  To: William Lee Irwin III, Andrew Morton
  Cc: bzzz, adilger, linux-kernel, ext2-devel

>> On the quad Xeon (after increasing dirty_ratio and dirty_background_ratio so
>> I/O was negligible) I was able to measure a 1.5% improvement.
>> I worry about the hardware you're using there.
> 
> Why? The adapter is "vaguely modern" (actually acquired as part of a
> hunt for an HBA w/a less buggy driver) but the box and disks and so on
> are still pretty ancient, so the absolute numbers aren't useful.
> 
> To get a real comparison we'd have to compare spindles, HBA's, and
> cpus, and attempt to factor them out. The disks are actually only
> capable of doing 30MB/s or 40MB/s, the buses can only do 40MB/s, and
> the cpus are 700MHz P-III's. Where dbench gets its numbers faster than
> wirespeed I have no idea...

You'd also have to stop sending all your IO over a NUMA backplane ...

> This locking issue may just need more cpus to bring out.

More than 32 CPUs? Hmmmm.

M.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  4:54           ` Andrew Morton
  2003-03-15  5:30             ` William Lee Irwin III
@ 2003-03-15  5:49             ` William Lee Irwin III
  2003-03-15  6:20               ` William Lee Irwin III
  1 sibling, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  5:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: bzzz, adilger, linux-kernel, ext2-devel

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> `dbench 512' will presumably do lots of IO and spend significant
> time in I/O wait.  You should see the effects of this change more
> if you use fewer clients (say, 32) so it doesn't hit disk.

Throughput 226.57 MB/sec 32 procs
dbench 32 2>& 1  25.04s user 515.02s system 1069% cpu 50.516 total

vma      samples  %-age       symbol name
c0106ff4 1877599  35.8654     default_idle
c01dc3b0 586997   11.2127     __copy_to_user_ll
c0108140 193213   3.6907      .text.lock.semaphore
c015249a 137467   2.62586     .text.lock.file_table
c01dc418 117981   2.25364     __copy_from_user_ll
c01dc59c 115415   2.20463     .text.lock.dec_and_lock
c016997b 106198   2.02857     .text.lock.dcache
c0119dac 98439    1.88036     scheduler_tick
c01dc510 95745    1.8289      atomic_dec_and_lock
c0119058 91746    1.75251     try_to_wake_up
c011fadc 88996    1.69998     profile_hook
c0107d0c 84514    1.61436     __down
c01522a0 70518    1.34702     file_move
c011a1bc 68364    1.30587     schedule
c011c4ff 59716    1.14068     .text.lock.sched
c0168aac 58337    1.11434     d_lookup
c015f3dc 58111    1.11002     path_lookup
c0119860 55141    1.05329     load_balance

procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free   buff    cache   bi    bo        in    cs   us sy id wa
11  0 47538048 549664 737120    0     0       1028 12123   2 33 65  0
 6  2 47534592 550880 738272    0 16312       1085 12498   2 28 67  3
15  2 47559680 552064 711936    0  2332       1111 12197   2 30 63  6
10  3 47539648 547808 737344    0  5012       1174 12683   2 28 63  8
13  4 47585600 548736 689728    0  1616       1173 12393   2 31 58  8
17  2 47575680 550432 699264    0  2252       1224 12135   2 35 54  8
31  2 47643008 550944 631712    0  2216       1189  4795   2 82 15  2
28  1 47724288 551296 548320    0  2532       1178  4297   2 77 18  4
25  2 47798464 552032 473824    0  2724       1199  3283   2 73 22  3
12  5 48026944 552096 243296    0  2272       1170  4389   2 54 37  7
 0  9 48201344 552160  69696    0  3480       1167   466   0  8 62 29
 1  4 48206720 552160  64512    0  3252       1173   152   0  0 83 16
 1  2 48210880 552160  60864    0  3232       1163   106   0  0 90  9
 2  2 48210880 552160  60864    0  3592       1163   111   0  0 93  6
 1  8 48256320 552160  36928    0  3008       1146   587   0  2 79 20
 2  7 48264128 552160  30016    0  3488       1153   170   0  0 76 24
 2  6 48268544 552160  26912    0  3012       1151   145   0  0 79 21
 2  5 48273408 552160  22400    0   312       1162   116   0  0 83 16
 4  0 48277248 552160  21056   12     8       1051   184   0  1 97  1
 0  0 48280448 552160  21280    0     0       1033    59   0  0 100 0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  5:43               ` Martin J. Bligh
@ 2003-03-15  5:50                 ` William Lee Irwin III
  0 siblings, 0 replies; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  5:50 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Andrew Morton, bzzz, adilger, linux-kernel, ext2-devel

On Fri, Mar 14, 2003 at 09:43:38PM -0800, Martin J. Bligh wrote:
> You'd also have to stop sending all your IO over a NUMA backplane ...

Oh yes, there is also that.


At some point in the past, I wrote:
>> This locking issue may just need more cpus to bring out.

On Fri, Mar 14, 2003 at 09:43:38PM -0800, Martin J. Bligh wrote:
> More than 32 CPUs? Hmmmm.

More than 4.


-- wli

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  5:49             ` William Lee Irwin III
@ 2003-03-15  6:20               ` William Lee Irwin III
  2003-03-15  6:44                 ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  6:20 UTC (permalink / raw)
  To: Andrew Morton, bzzz, adilger, linux-kernel, ext2-devel

On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> > `dbench 512' will presumably do lots of IO and spend significant
> > time in I/O wait.  You should see the effects of this change more
> > if you use fewer clients (say, 32) so it doesn't hit disk.
> 
On Fri, Mar 14, 2003 at 09:49:10PM -0800, William Lee Irwin III wrote:
> Throughput 226.57 MB/sec 32 procs
> dbench 32 2>& 1  25.04s user 515.02s system 1069% cpu 50.516 total

It's too light a load... here's dbench 128.

Looks like dbench doesn't scale. It needs to learn how to spread itself
across disks if it's not to saturate a device queue while at the same
time generating enough cpu load to saturate cpus.

Is there a better (publishable/open/whatever) benchmark?

dbench 128:
Throughput 161.237 MB/sec 128 procs
dbench 128 2>& 1  143.85s user 3311.10s system 1219% cpu 4:43.27 total

vma      samples  %-age       symbol name
c0106ff4 9134179  33.7261     default_idle
c01dc3b0 5570229  20.5669     __copy_to_user_ll
c01dc418 1773600  6.54865     __copy_from_user_ll
c0119058 731524   2.701       try_to_wake_up
c0108140 686952   2.53643     .text.lock.semaphore
c011a1bc 489415   1.80706     schedule
c0119dac 485196   1.79149     scheduler_tick
c011fadc 448048   1.65433     profile_hook
c0119860 356065   1.3147      load_balance
c0107d0c 267333   0.987072    __down
c011c4ff 249627   0.921696    .text.lock.sched
c0152ab0 223897   0.826694    __find_get_block_slow
c01dc510 222598   0.821897    atomic_dec_and_lock
c0168aac 218153   0.805485    d_lookup
c013ece4 194326   0.717509    check_highmem_ptes
c015f3dc 193112   0.713026    path_lookup
c01522a0 187115   0.690884    file_move
c010f6b8 166809   0.615908    timer_interrupt

procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 0  0      0 48280256 552768  23744    0    0     0     0 1027    16  0  0 100  0
 1  0      0 48246272 552768  24160    0    0     0   108 1027   321  0  3 97  0
12  1      0 48194752 545152  24160    0    0     0    40 1054   664  0  8 91  0
14  0      0 48061696 548672 115328    0    0     0     0 1029  1922  2 33 64  0
42  0      0 47821824 547360 366240    0    0     0     0 1026  1255  1 75 22  2
63  1      0 47603392 546624 589760    0    0     0     0 1027  1270  1 98  1  0
60  0      0 47338368 551328 853056    0    0     0     8 1027  2193  1 96  2  0
61  0      0 47074496 551680 1117952    0    0     0     0 1034  2147  1 97  2  0
11  1      0 46781376 553184 1409472    0    0     0     0 1033  5128  1 80 18  1
35  0      0 46492224 552320 1696128    0    0   116     0 1059  7890  2 59 38  1
19  0      0 46295104 554304 1890112    0    0    28     0 1031  9004  2 52 45  1
14  0      0 46097728 558848 2086368    0    0    24     0 1033  8317  2 56 40  2
22  1      0 45849344 556288 2342304    0    0    20    32 1043  8267  2 55 43  1
26  1      0 45608576 558784 2579936    0    0    20     0 1032  7990  2 50 47  1
26  0      0 45421824 557184 2767040    0    0    16     0 1032  9670  2 41 55  2
21  1      0 45297408 557696 2889696    0    0    24     0 1034  8997  2 50 47  1
21  0      0 45254528 560992 2925760    0    0     4     0 1028  9363  2 47 50  0
35  0      0 45245120 556992 2938944    0    0     4   632 1097  7463  3 59 38  1
17  0      0 45247744 560768 2931456    0    0     0     0 1119  8538  3 52 45  0
15  1      0 45269376 556416 2913952    0    0     0   624 1056  9081  2 45 52  0
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
17  0      0 45296128 559136 2884576    0    0     4    12 1152  9749  2 44 54  0
24  0      0 45296832 562560 2880832    0    0     0     0 1029  8585  2 57 41  0
22  0      0 45274496 559072 2907360    0    0     0   588 1172  7348  2 60 38  0
19  1      0 45255616 555520 2929184    0    0     0   520 1027  7362  3 61 37  0
15  0      0 45245696 559552 2935264    0    0     0    52 1167  8931  3 50 47  0
10  0      0 45241152 563232 2936032    0    0     0     0 1027  9644  2 45 53  0
18  0      0 45273216 556800 2910176    0    0     0   416 1133  9657  2 49 48  0
16  0      0 45268288 559776 2912800    0    0    12     0 1030  9502  2 45 53  0
40  0      0 45253312 562368 2925664    0    0     0     0 1029  7597  3 62 35  0
11  1      0 45233408 562528 2945280    0    0     0   912 1118  8105  3 55 41  1
30  0      0 45251136 560192 2929888    0    0     0   104 1183  8715  3 50 47  0
11  0      0 45264768 562368 2913344    0    0     0     0 1025  8622  2 53 45  0
30  0      0 45296000 564128 2880928    0    0     0   160 1067  9565  2 48 50  0
20  0      0 45296192 559776 2886176    0    0     0   620 1173  8638  2 53 45  0
31  0      0 45267072 562400 2912416    0    0     0     0 1023  7383  3 63 34  0
17  1      0 45261184 558560 2921184    0    0     0   584 1043  8113  2 56 42  0
22  0      0 45239040 561984 2941216    0    0     0    56 1169  9078  2 48 49  0
17  1      0 45246528 564192 2928800    0    0     0 18076 1067  9885  2 46 50  1
24  2      0 45263808 560864 2915840    0    0     0  3912 1077 10085  2 47 48  3
25  3      0 45250944 563456 2927744    0    0     4   164 1063  8361  2 54 40  4
24  3      0 45247616 560448 2933728    0    0     0   600 1163  8387  3 58 35  5
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
23  2      0 45263744 560928 2917184    0    0     0  1792 1225  9299  2 52 42  4
21  1      0 45259264 562816 2920864    0    0     0  2052 1201  8903  3 49 45  3
24  1      0 45282432 564192 2895968    0    0     0  2220 1205  9228  2 49 47  2
17  2      0 45289408 560640 2892832    0    0     0  1276 1181  9552  2 46 49  3
31  1      0 45281344 558304 2903456    0    0     0  1332 1244  8814  2 53 41  3
23  1      0 45267456 560192 2914784    0    0     0  2164 1199  8500  2 55 41  1
20  1      0 45257536 562880 2922336    0    0     0  2212 1193  9535  2 50 46  2
23  1      0 45257152 565728 2919712    0    0     0  4524 1180  9578  2 45 51  2
11  2      0 45265408 563200 2914144    0    0     0  1484 1136  9154  2 50 45  3
22  1      0 45273344 560160 2910048    0    0     0  1540 1196  8949  2 49 46  3
21  1      0 45269632 561600 2910496    0    0     0  4840 1130  8013  3 58 38  2
30  1      0 45280960 563328 2897856    0    0     0  4292 1113  7722  3 62 34  2
14  1      0 45264064 565056 2913664    0    0     0  3492 1129  9123  2 54 42  2
26  2      0 45289792 562432 2890816    0    0     0  6028 1109  8671  2 57 39  2
14  2      0 45301504 561600 2880640    0    0     0   364 1117  9178  2 49 44  4
17  1      0 45286976 561472 2895744    0    0     0  6884 1175  8299  2 55 41  2
15  1      0 45270528 563040 2910240    0    0     0  4504 1090  8066  2 56 40  1
24  1      0 45261952 564704 2916800    0    0     0  2980 1102  8734  2 50 46  1
14  2      0 45261760 561920 2919968    0    0     0  5524 1122  9604  3 48 47  2
28  2      0 45269056 563136 2911488    0    0     0   328 1107  9034  3 51 42  4
14  1      0 45294080 560128 2889824    0    0     0  3756 1221  9055  2 50 45  2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
28  1      0 45282944 561472 2899712    0    0     0  3556 1120  8221  3 57 39  2
23  1      0 45289856 562912 2890336    0    0     0  3176 1102  8270  3 56 40  1
27  1      0 45281984 564448 2896832    0    0     0  4364 1133  8721  3 53 42  2
22  1      0 45288576 565792 2889280    0    0     0  3756 1112  9156  3 50 45  2
20  2      0 45307968 563648 2872768    0    0     0  2564 1131  9414  2 48 47  2
20  2      0 45293376 563104 2886560    0    0     0   576 1166  8840  2 52 42  3
17  1      0 45287680 561824 2895616    0    0     0  3548 1202  8016  3 59 36  2
14  1      0 45272960 563584 2907584    0    0     0  3276 1139  8910  3 52 44  2
20  1      0 45265408 565088 2914784    0    0     0  3492 1135  9168  3 49 47  2
15  1      0 45278848 566560 2899296    0    0     0  3968 1170  9189  2 49 46  2
13  1      0 45291328 567360 2886592    0    0     0  3752 1125  9027  3 49 47  2
19  2      0 45297408 562944 2884864    0    0     0  2148 1126  8020  3 55 40  2
22  2      0 45298048 562944 2884544    0    0     0   492 1151  7937  3 56 38  3
25  1      0 45292352 562304 2891232    0    0     0  3948 1191  8045  3 54 42  2
22  1      0 45292480 563616 2888576    0    0     0  3620 1125  8714  2 52 43  2
17  1      0 45305344 564896 2874976    0    0     0  3084 1132  9214  3 49 47  2
13  1      0 45301760 565792 2877824    0    0     0  6672 1148  9488  2 46 50  2
20  1      0 45292160 566944 2884992    0    0     0  2368 1105  8931  2 53 44  1
22  2      0 45279552 564736 2899776    0    0     0  2820 1137  8201  3 59 36  2
11  2      0 45270976 564608 2910240    0    0     0   484 1150  9030  3 53 41  3
15  1      0 45297856 562432 2884544    0    0     0  4060 1176  8291  2 52 43  2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
24  1      0 45302144 563200 2880032    0    0     0  3496 1139  9189  3 51 45  2
29  1      0 45310592 564224 2871296    0    0     0  4328 1145  8822  3 53 43  2
17  1      0 45323136 564896 2857600    0    0     0  4072 1131  7628  2 53 43  2
28  1      0 45312576 566240 2867328    0    0     0  3468 1150  8090  2 56 40  1
16  1      0 45299200 567360 2879328    0    0     0  2344 1156  8063  3 56 40  2
15  1      0 45315968 568640 2860768    0    0     0  2400 1181  9660  2 47 49  2
21  2      0 45321536 565248 2858944    0    0     0  1228 1197  9757  2 48 47  3
18  1      0 45308352 562656 2875168    0    0     0  1892 1230  8413  2 57 38  3
20  1      0 45300992 563584 2881216    0    0     0  4184 1174  8691  2 53 43  2
21  1      0 45289216 564640 2890720    0    0     0  3696 1138  8479  2 53 43  2
14  1      0 45287424 565472 2891872    0    0     0  4108 1146  8578  3 54 41  2
23  1      0 45309376 566240 2869152    0    0     0  2528 1124  9247  2 48 48  2
18  1      0 45314560 566848 2865312    0    0     0  3044 1126  9376  3 51 45  2
19  1      0 45300672 567968 2876000    0    0     0  2788 1127  9059  3 53 43  2
16  1      0 45320576 568736 2856672    0    0     0  3528 1139  8816  3 56 40  2
18  2      0 45314368 564992 2866784    0    0     0  1632 1138  8406  3 55 39  3
23  2      0 45298752 563584 2883552    0    0     0   792 1193  9421  3 47 47  4
17  1      0 45326976 563008 2855712    0    0     0  3756 1180  9161  2 51 44  2
15  1      0 45318976 563872 2864192    0    0     0  2600 1125  9449  2 50 46  2
27  1      0 45297472 564800 2884512    0    0     0  3176 1134  8358  3 57 38  2
22  1      0 45303552 565600 2877024    0    0     0  3204 1150  9311  3 49 47  2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
22  0      0 45291776 566368 2888288    0    0     0  3104 1138  8800  3 51 45  2
21  1      0 45308416 567008 2871072    0    0     0  3112 1135  9005  3 51 44  2
18  1      0 45315392 567808 2863200    0    0     0  3080 1129  9499  2 50 46  2
27  1      0 45305920 568352 2871200    0    0     0  3012 1137  9396  3 49 47  2
25  1      0 45321344 569120 2854624    0    0     0  2728 1115  8623  2 55 41  1
24  2      0 45313984 565632 2867424    0    0     0  2432 1147  8838  2 53 43  2
20  1      0 45320704 562368 2864160    0    0     0  1372 1142  9033  3 50 45  3
22  1      0 45339968 563296 2843296    0    0     0  3520 1145  9056  2 49 46  2
17  1      0 45339584 563808 2843840    0    0     0  3468 1137  8871  2 54 43  1
17  1      0 45323968 564640 2859008    0    0     0  4096 1150  8350  3 57 39  2
26  1      0 45317824 565376 2864512    0    0     0  3040 1123  8397  3 56 40  1
14  1      0 45321344 566112 2859872    0    0     0  3328 1132  9070  2 52 44  2
21  1      0 45324672 566816 2855712    0    0     0  3336 1143  7997  3 59 37  1
18  1      0 45336768 567456 2844032    0    0     0  3888 1119  8689  2 55 41  2
22  1      0 45322432 567904 2858272    0    0     0  3816 1131  8632  3 51 44  2
25  1      0 45321280 568672 2857632    0    0     0  3012 1120  8792  2 56 40  2
19  1      0 45327040 569536 2851296    0    0     0  2180 1103  9192  2 51 45  1
18  1      0 45312576 570432 2863104    0    0     0  3308 1134  9535  3 50 46  2
15  2      0 45329088 566624 2850976    0    0     0   580 1127  9666  2 47 47  4
16  1      0 45335296 564064 2848288    0    0     0  3612 1210  9346  2 47 48  2
17  1      0 45331968 564640 2851008    0    0     0  3204 1129  9596  3 48 47  2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
11  1      0 45330304 565376 2851520    0    0     0  3616 1123  9132  3 53 42  2
22  1      0 45331520 565984 2849888    0    0     0  3660 1137  9305  3 50 46  2
17  1      0 45332544 566592 2847648    0    0     0  3464 1123  8743  2 53 43  1
22  1      0 45332128 567104 2848992    0    0     0  2520 1112  8055  2 57 39  1
17  1      0 45322688 567584 2856480    0    0     0  3624 1124  9194  2 50 46  1
13  1      0 45326976 568224 2852128    0    0     0  2464 1127  9158  2 51 45  2
15  1      0 45320576 568736 2858368    0    0     0  2128 1155  9505  2 46 50  2
19  1      0 45318208 569312 2860160    0    0     0  2704 1174  9129  2 50 46  2
13  1      0 45325056 570016 2853216    0    0     0  2188 1142  9297  2 51 44  2
21  1      0 45337536 570400 2838048    0    0     0  3436 1177  8373  3 53 43  2
22  2      0 45334464 565696 2847584    0    0     0  1044 1128  8705  2 55 40  2
17  1      0 45358912 565024 2824736    0    0     0  2892 1219  8876  3 56 39  2
18  1      0 45357696 565664 2824192    0    0     0  3524 1154  8886  2 50 47  1
22  1      0 45355328 566176 2826816    0    0     0  3812 1152  9006  2 54 42  2
20  1      0 45354816 566720 2827040    0    0     0  3200 1148  9004  3 50 46  2
16  1      0 45349696 566976 2831360    0    0     0  3160 1150  8772  3 51 44  2
26  1      0 45360256 567424 2819648    0    0     0  3296 1145  8908  2 49 46  2
16  1      0 45352640 567776 2827968    0    0     0  4416 1151  9085  3 51 44  2
20  1      0 45337408 568480 2842624    0    0     0  4556 1165  8596  3 54 42  1
12  1      0 45337984 568928 2840800    0    0     0  3816 1158  9757  3 45 51  2
18  1      0 45348800 569472 2829696    0    0     0  4104 1166  9528  3 48 48  2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
29  1      0 45358464 569760 2820192    0    0     0  4432 1152  8817  2 56 40  1
22  1      0 45346176 570240 2830336    0    0     0  2560 1122  8296  2 55 40  2
21  1      0 45351488 570464 2826144    0    0     0  3528 1145  9093  3 54 42  2
22  1      0 45349312 570752 2828160    0    0     0  3724 1149  8422  2 53 43  2
23  1      0 45347904 571040 2829856    0    0     0  3340 1138  9162  2 50 45  2
 9  2      0 45354304 564960 2828288    0    0     0  1652 1150  9398  2 48 46  3
23  1      0 45358720 564928 2824608    0    0     0  2320 1183  9687  2 46 49  2
22  1      0 45356224 565536 2827232    0    0     0  3424 1155  8763  2 52 44  2
13  1      0 45345152 565984 2837472    0    0     0  2760 1132  8306  2 54 42  2
13  1      0 45344192 566400 2837792    0    0     0  4292 1168  9164  2 49 47  2
17  1      0 45351872 566816 2829088    0    0     0  3556 1143  9530  2 47 49  2
19  1      0 45352192 567104 2829248    0    0     0  3116 1139  9016  2 50 46  2
14  1      0 45360000 567680 2820096    0    0     0  4560 1161  9633  2 48 48  2
25  1      0 45352896 568160 2826720    0    0     0  3272 1147  8597  2 57 39  2
20  0      0 45347264 568448 2831968    0    0     0  3236 1134  8287  3 56 39  2
16  1      0 45341184 568928 2836224    0    0     0  3408 1138  9057  2 52 44  2
12  1      0 45347584 569280 2831168    0    0     0  2724 1122  9587  2 47 49  2
19  1      0 45354240 569440 2824832    0    0     0  3332 1134  9334  2 46 49  2
16  1      0 45357120 569760 2821568    0    0     0  2884 1137  8936  2 50 45  2
26  1      0 45346176 570176 2832544    0    0     0  3236 1130  9035  2 50 46  2
14  2      0 45339072 570560 2838272    0    0     8  3016 1136  8522  3 52 43  2
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
21  2      0 45357632 570848 2819616    0    0     0  3552 1154  9181  3 48 46  3
18  3      0 45363008 565632 2819168    0    0     8  2064 1146  9593  2 47 47  4
20  2      0 45404800 563776 2779744    0    0     8  2740 1197  9447  2 50 43  4
17  2      0 45404096 564032 2779968    0    0     0  4276 1156  8744  2 53 41  3
19  2      0 45394304 564320 2789728    0    0     0  2796 1127  8453  2 54 40  4
21  2      0 45385536 564864 2798336    0    0     4  2668 1160  8473  3 56 37  4
22  2      0 45397696 566048 2784160    0    0    24  3208 1204  9178  2 51 44  3
17  3      0 45403392 566912 2777632    0    0     8  2564 1156  9782  2 47 45  6
20  3      0 45401024 567488 2780672    0    0    16  3136 1157  8877  2 58 35  4
29  3      0 45408192 568512 2772256    0    0     8  2860 1141  9097  2 56 37  5
19  4      0 45387456 569504 2792192    0    0     4  4100 1149  8595  3 57 34  7
18  4      0 45390592 570336 2787872    0    0    20  3472 1143  8755  3 52 38  7
16  5      0 45408000 571456 2769280    0    0    16  3748 1167 10441  2 46 43  9
21  4      0 45423104 567584 2757696    0    0    48  1036 1193 10248  2 48 41  9
16  5      0 45427264 565344 2755648    0    0    32  2620 1236  9347  2 54 34 10
16  5      0 45417856 567456 2763296    0    0    72  3556 1160  9557  2 51 37  9
25  6      0 45427776 569440 2751616    0    0    60  3112 1151 10403  2 45 42 10
15  4      0 45438720 571072 2738720    0    0    48  3500 1173  9942  3 50 37 10
13 10      0 45457024 568320 2723392    0    0    96  1728 1160 10681  2 48 37 13
17  8      0 45454208 565952 2728416    0    0    24  1940 1224 10121  3 50 33 14
32  8      0 45460352 568416 2718848    0    0    56  2980 1148 10187  3 54 31 13
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
19  8      0 45449664 570048 2728992    0    0    48  3268 1160  9737  3 53 32 12
41 12      0 45494208 571392 2683040    0    0    88  2628 1179 10545  2 55 30 13
35 15      0 45536448 569280 2642528    0    0   148  1480 1136  9438  2 63 21 14
83 21      0 45558784 569728 2619680    0    0   160   848 1245  6599  2 79  9  9
20 21      0 45604736 566208 2576128    0    0   376  1216 1224 10129  2 65 17 16
30 29      0 45667328 570496 2512064    0    0   604  1668 1193 14999  2 54 19 25
24 31      0 45726656 572928 2449472    0    0   544   484 1209 16814  2 51 20 26
69 41      0 45830272 576576 2341088    0    0   604   320 1333 22979  2 58 15 25
42 62      0 45950688 566944 2228224    0    0   892   340 1464 27057  2 66 10 22
23 27      0 46028832 570656 2147328    0    0   424   304 1244 18954  2 78  5 15
28 107      0 46159200 566496 2022560    0    0   808   352 1228 23538  1 71  4 24
 7 116      0 46224160 570016 1960864    0    0   600   224 1230 27775  0 13  5 82
 2 119      0 46272416 574432 1909504    0    0   560   288 1239 21885  0  7  4 88
15 109      0 46330272 564256 1862912    0    0   488   968 1232 18866  0  7  3 90
 5 114      0 46375584 568128 1816160    0    0   500  1216 1199 19986  0  7  1 91
 2 114      0 46423456 560352 1777440    0    0   452  1248 1202 16782  0  6  2 92
 2 113      0 46455392 563488 1742176    0    0   396  1768 1193 15011  0  5  3 93
 6 111      0 46492320 566752 1702944    0    0   420  1464 1185 16749  0  5  0 94
 4 112      0 46532512 570688 1659200    0    0   492  1152 1188 18447  0  6  1 92
 4 113      0 46582304 562080 1617984    0    0   400  1344 1187 15024  0  5  2 93
 2 113      0 46625312 565440 1571648    0    0   436  1568 1196 18994  0  7  1 92
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 6 109      0 46647712 567616 1548000    0    0   288  1956 1182 10422  0  4  3 93
 4 108      0 46707872 559744 1499008    0    0   480  1200 1202 20386  0  8  3 89
 9 103      0 46752672 563168 1452544    0    0   444  1360 1195 18791  0  6  3 90
 6 105      0 46792864 566240 1409216    0    0   412  1720 1186 17387  0  6  3 90
 2 75      0 46821152 568384 1381088    0    0   280  1156 1191 10829  0  4  4 92
 2 66      0 46874784 559872 1336128    0    0   432  1240 1198  9432  0  4  4 92
 3 53      0 46912480 563264 1294912    0    0   436  1360 1204  9480  0  4  3 93
 1 27      0 46918112 563648 1289056    0    0    52  2000 1154  1342  0  1 21 78
 1 26      0 46919776 563744 1287392    0    0    20  3380 1154   484  0  1 37 62
 2 25      0 46921120 563936 1285664    0    0    28  2620 1133   422  0  1 37 62
 4 24      0 46928992 564576 1277216    0    0    80  2508 1142   929  0  1 38 61
 2 25      0 46935968 564832 1270304    0    0    40  2856 1147   912  0  1 37 62
 4 97      0 46986784 568736 1221696    0    0   520  2720 1169 11680  0  6  7 87
 2 97      0 47047712 560480 1171264    0    0   444  1456 1193 17112  0  7  6 88
 2 92      0 47094304 563168 1125856    0    0   368  1444 1180 15707  0  6  3 91
 2 64      0 47138720 566048 1079008    0    0   376  1320 1192 14764  0  5  4 90
 2 53      0 47160352 567360 1055904    0    0   176  2248 1165  4756  0  2 13 85
 1 52      0 47162912 567456 1053280    0    0    16  2768 1143   889  0  1 12 87
 7 81      0 47206816 569792 1016704    0    0   304  2288 1158  8250  0  4 11 84
 2 80      0 47276128 561120 958400    0    0   432  1488 1198 17237  0  6  6 88
 8 76      0 47331552 564320 902144    0    0   440  1252 1198 15313  0  6  6 88
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 1 72      0 47406880 567840 829184    0    0   456  1676 1204 18356  0  7  6 87
 1 70      0 47457568 570624 778240    0    0   384  1508 1179 12199  0  4  3 92
 8 61      0 47532000 573312 706496    0    0   348  1704 1184 14745  0  5  9 85
 0 60      0 47596000 564896 655776    0    0   384  1516 1185  9453  0  4 10 86
 2 57      0 47636640 568160 614112    0    0   416  1512 1189  7620  0  3  8 89
 1 54      0 47689184 570016 564096    0    0   256  1996 1168  8807  0  4  7 90
 1 54      0 47689632 570016 563520    0    0     0  2656 1129   548  0  0  9 90
 1 52      0 47699616 570528 554336    0    0    64   200 1149  2042  0  1 13 86
 0 49      0 47758112 572960 497216    0    0   312  4744 1201  9579  0  4 13 83
 1 48      0 47768736 573280 487296    0    0    40  2204 1139  1964  0  1 16 83
 1 48      0 47772384 573440 483392    0    0    24  3040 1142  1150  0  1 16 83
 0 48      0 47799840 565632 463776    0    0   108  2156 1140  3420  0  2 15 83
 1 48      0 47803616 565888 459552    0    0    32  4440 1157  1138  0  1 12 87
 3 48      0 47852064 567776 409376    0    0   244  3280 1172  7173  0  3  9 88
 1 41      0 47889568 568640 377888    0    0   108  4608 1160  4626  0  2 13 84
 1 41      0 47896224 568832 371520    0    0    24  3712 1125  1348  0  1 22 77
 1 41      0 47905632 569088 361600    0    0    32  2156 1152  1743  0  1 22 77
 1 38      0 47929696 569888 338560    0    0   100  2836 1143  3432  0  2 21 77
 0 39      0 47937760 569984 330560    0    0    16  3228 1136  1328  0  1 22 77
 2 36      0 47944608 570080 326784    0    0    16  3560 1129   684  0  1 24 75
 2 36      0 47974048 570688 297024    0    0    76  2560 1119  3580  0  2 26 71
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 1 35      0 47991392 571136 280256    0    0    56  2908 1122  2047  0  1 27 72
 3 35      0 47994272 571168 277568    0    0     4  3676 1112   614  0  1 28 71
 0 35      0 48006944 571648 264480    0    0    60  2252 1124  1507  0  1 26 73
 0 35      0 48041440 572224 229440    0    0    80  2676 1137  3767  0  2 21 77
 2 35      0 48042528 572320 227968    0    0    52  2540 1189   703  0  2 24 73
 0 34      0 48061088 572544 209664    0    0    28  4124 1123  1935  0  1 24 75
 0 29      0 48089696 573152 185568    0    0    76  3720 1132  2361  0  1 25 74
 1 28      0 48097824 573248 178816    0    0    12  3616 1130   945  0  0 31 69
 3 25      0 48105056 573408 174688    0    0    20  3112 1130   628  0  0 35 64
 1 25      0 48126688 573824 152736    0    0    60  2400 1136  1972  0  1 40 59
 0 25      0 48132000 574016 147616    0    0    28  2704 1126   516  0  0 36 63
 0 24      0 48143136 574112 136832    0    0    12  2724 1126   816  0  0 35 65
 0 23      0 48160480 574496 120800    0    0    48  4436 1192  1205  0  1 35 65
 0 24      0 48155808 574752 118304    0    0   928  4848 1205   442  0  0 38 61
 0 23      0 48163744 574816 117216    0    0    44  3108 1135   285  3  0 38 60
 0 21      0 48192864 565920  99200    0    0    40  2368 1114   625  0  0 39 61
 1 21      0 48197600 566048  94304    0    0    20  3212 1126   416  0  0 41 59
 1 14      0 48206176 566048  92192    0    0     0  3244 1130   312  0  0 58 42
 2 11      0 48217504 566080  83648    0    0     4  2536 1125   532  0  0 62 38
 1 11      0 48241568 566592  59296    0    0    64  3300 1143   963  0  1 68 31
 0 11      0 48242016 566624  58880    0    0     4  2788 1126   173  0  0 69 31
procs -----------memory---------- ---swap-- -----io---- --system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in    cs us sy id wa
 0 11      0 48242016 566624  58880    0    0     0  3648 1142   156  0  0 69 31
 1 10      0 48253856 566944  46720    0    0    40  3644 1173   423  0  0 69 31
 0 11      0 48256992 567040  43232    0    0    12  2644 1165   219  0  0 69 31
 2 11      0 48261472 567136  38720    0    0    12  3048 1165   226  0  0 69 31
 1  6      0 48268192 567200  35776    0    0     8  3608 1169   138  0  0 73 27
 1  0      0 48288032 567552  22976    0    0    44     0 1132   142  0  0 88 12
 0  0      0 48288288 567552  22976    0    0     0   180 1061    23  0  0 100  0
 1  0      0 48288288 567552  22976    0    0     0     0 1026    21  0  0 100  0
 0  0      0 48288096 567552  22976    0    0     0     0 1027    43  0  0 100  0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  6:20               ` William Lee Irwin III
@ 2003-03-15  6:44                 ` Andrew Morton
  2003-03-15  7:05                   ` William Lee Irwin III
  2003-03-15  8:16                   ` [Ext2-devel] " Alex Tomas
  0 siblings, 2 replies; 39+ messages in thread
From: Andrew Morton @ 2003-03-15  6:44 UTC (permalink / raw)
  To: William Lee Irwin III; +Cc: bzzz, adilger, linux-kernel, ext2-devel

William Lee Irwin III <wli@holomorphy.com> wrote:
>
> On Fri, Mar 14, 2003 at 08:54:55PM -0800, Andrew Morton wrote:
> > > `dbench 512' will presumably do lots of IO and spend significant
> > > time in I/O wait.  You should see the effects of this change more
> > > if you use fewer clients (say, 32) so it doesn't hit disk.
> > 
> On Fri, Mar 14, 2003 at 09:49:10PM -0800, William Lee Irwin III wrote:
> > Throughput 226.57 MB/sec 32 procs
> > dbench 32 2>& 1  25.04s user 515.02s system 1069% cpu 50.516 total
> 
> It's too light a load... here's dbench 128.

OK.

> Looks like dbench doesn't scale. It needs to learn how to spread itself
> across disks if it's not to saturate a device queue while at the same
> time generating enough cpu load to saturate cpus.

Nope.  What we're trying to measure here is pure in-memory lock contention,
locked bus traffic, context switches, etc, etc.  To do that we need to get
the IO system out of the picture.

One way to do that is to increase /proc/sys/vm/dirty_ratio and
dirty_background_ratio to 70% or so.  You can still hit IO wait if someone
tries to truncate a file which pdflush is writing out, so increase
dirty_expire_centisecs and dirty_writeback_centisecs to 1000000000 or so...

Then, on the second run, when all the required metadata blocks are in
pagecache you should be able to get an IO-free run.
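
Concretely, a sketch of that tuning (the /proc paths are the 2.5-era
names used above; the exact values just need to be "large enough",
they aren't magic):

# keep writeback from ever starting during the benchmark
echo 70 > /proc/sys/vm/dirty_ratio
echo 70 > /proc/sys/vm/dirty_background_ratio
# and make dirty data effectively never expire
echo 1000000000 > /proc/sys/vm/dirty_expire_centisecs
echo 1000000000 > /proc/sys/vm/dirty_writeback_centisecs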

> Is there a better (publishable/open/whatever) benchmark?

I have lots of little testlets which can be mixed and matched.  RAM-only
dbench will do for the while.  It is showing things.

> 
> dbench 128:
> Throughput 161.237 MB/sec 128 procs
> dbench 128 2>& 1  143.85s user 3311.10s system 1219% cpu 4:43.27 total
> 
> vma      samples  %-age       symbol name
> c0106ff4 9134179  33.7261     default_idle
> c01dc3b0 5570229  20.5669     __copy_to_user_ll
> c01dc418 1773600  6.54865     __copy_from_user_ll
> c0119058 731524   2.701       try_to_wake_up
> c0108140 686952   2.53643     .text.lock.semaphore
> c011a1bc 489415   1.80706     schedule
> c0119dac 485196   1.79149     scheduler_tick
> c011fadc 448048   1.65433     profile_hook
> c0119860 356065   1.3147      load_balance
> c0107d0c 267333   0.987072    __down
> c011c4ff 249627   0.921696    .text.lock.sched

The wakeup and .text.lock.semaphore load indicates that there is a lot
of contention for a semaphore somewhere.  Still.

I'm not sure which one.  It shouldn't be a directory semaphore.  Might be
lock_super() in the inode allocator, but that seems unlikely.
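
One way to narrow it down without new instrumentation (a sketch: it
assumes an unstripped vmlinux, 2.5-era objdump output, and that __down
is the down() slow path behind those .text.lock.semaphore hits) is to
count which functions call into the semaphore slow path:

# print the enclosing function for each call to __down, most frequent first
objdump -d vmlinux |
	awk '/^c[0-9a-f]+ </ { f = $2 } /call.*<__down>/ { print f }' |
	sort | uniq -c | sort -rn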



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  6:44                 ` Andrew Morton
@ 2003-03-15  7:05                   ` William Lee Irwin III
  2003-03-15  8:24                     ` William Lee Irwin III
  2003-03-15  8:16                   ` [Ext2-devel] " Alex Tomas
  1 sibling, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  7:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: bzzz, adilger, linux-kernel, ext2-devel

William Lee Irwin III <wli@holomorphy.com> wrote:
>> Looks like dbench doesn't scale. It needs to learn how to spread itself
>> across disks if it's not to saturate a device queue while at the same
>> time generating enough cpu load to saturate cpus.

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
> Nope.  What we're trying to measure here is pure in-memory lock contention,
> locked bus traffic, context switches, etc, etc.  To do that we need to get
> the IO system out of the picture.
> One way to do that is to increase /proc/sys/vm/dirty_ratio and
> dirty_background_ratio to 70% or so.  You can still hit IO wait if someone
> tries to truncate a file which pdflush is writing out, so increase
> dirty_expire_centisecs and dirty_writeback_centisecs to 1000000000 or so...
> Then, on the second run, when all the required metadata blocks are in
> pagecache you should be able to get an IO-free run.

Oh, sorry, I did increase dirty_ratio and dirty_background_ratio to 99,
but I forgot about dirty_writeback_centisecs; I'll re-run with that.

William Lee Irwin III <wli@holomorphy.com> wrote:
>> Is there a better (publishable/open/whatever) benchmark?

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
> I have lots of little testlets which can be mixed and matched.  RAM-only
> dbench will do for the while.  It is showing things.
> 

William Lee Irwin III <wli@holomorphy.com> wrote:
>> dbench 128:
>> Throughput 161.237 MB/sec 128 procs
>> dbench 128 2>& 1  143.85s user 3311.10s system 1219% cpu 4:43.27 total
>> vma      samples  %-age       symbol name
>> c0106ff4 9134179  33.7261     default_idle
>> c01dc3b0 5570229  20.5669     __copy_to_user_ll
>> c01dc418 1773600  6.54865     __copy_from_user_ll
>> c0119058 731524   2.701       try_to_wake_up
>> c0108140 686952   2.53643     .text.lock.semaphore
>> c011a1bc 489415   1.80706     schedule
>> c0119dac 485196   1.79149     scheduler_tick
>> c011fadc 448048   1.65433     profile_hook
>> c0119860 356065   1.3147      load_balance
>> c0107d0c 267333   0.987072    __down
>> c011c4ff 249627   0.921696    .text.lock.sched

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
> The wakeup and .text.lock.semaphore load indicates that there is a lot
> of contention for a semaphore somewhere.  Still.
> I'm not sure which one.  It shouldn't be a directory semaphore.  Might be
> lock_super() in the inode allocator, but that seems unlikely.

I'm going to have to break out tools to decipher which one this is.
hlinder forward-ported lockmeter, so I'll throw that in the mix.


-- wli

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  6:44                 ` Andrew Morton
  2003-03-15  7:05                   ` William Lee Irwin III
@ 2003-03-15  8:16                   ` Alex Tomas
  2003-03-15  8:29                     ` William Lee Irwin III
  1 sibling, 1 reply; 39+ messages in thread
From: Alex Tomas @ 2003-03-15  8:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: William Lee Irwin III, bzzz, adilger, linux-kernel, ext2-devel

>>>>> Andrew Morton (AM) writes:

 AM> Nope.  What we're trying to measure here is pure in-memory lock
 AM> contention, locked bus traffic, context switches, etc, etc.  To
 AM> do that we need to get the IO system out of the picture.

I simply use my own pretty simple test. BTW, you may disable preallocation
to increase the allocation rate; see the sketch below.
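
(A hedged aside on that knob: in trees of this era ext2 preallocation
is a compile-time option, and the define below is where I believe it
lives -- verify against your own tree before rebuilding.)

# sketch, not a tested recipe: locate and comment out the define
grep -n EXT2_PREALLOCATE include/linux/ext2_fs.h
#   #define EXT2_PREALLOCATE
# comment that line out and rebuild ext2 to disable preallocation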


bash-script:
============================================================
#!/bin/sh

# args:
# 1 - how many processes to create
# 2 - how many blocks in file to be written
# 3 - how many times to repeat (write+truncate)
# for example: cd.sh 2 32 100000

let i=0
while let "i < $1"; do
        if [ ! -d /mnt/$i ]; then
                mkdir /mnt/$i
        fi
        rm -rf /mnt/$i/*
        let "i=i+1"
done

sync
sync

let i=0
while let "i < $1"; do
        time /root/cdsingle $2 $3 /mnt/$i/1 &
        let "i=i+1"
done

wait
============================================================

C program, which loops over writes and truncates:

#include <stdio.h>
#include <stdlib.h>	/* atoi(), exit() */
#include <unistd.h>	/* write(), lseek(), ftruncate(), close(), unlink() */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>


int main (int argc, char ** argv)
{
        int i, j, num, siz, err, k;
        int fd[1024];

        num = atoi(argv[2]);
        siz = atoi(argv[1]);
        
        for (i = 3; i < argc; i++) {
                fd[i] = creat(argv[i], 0666);
                if (fd[i] < 0) {
                        perror("can't create");
                        exit(1);
                }
        }

        for (j = 0; j < num; j++) {
                for (i = 3; i < argc; i++) {
                        for (k = 0; k < siz; k++)
                                /* use the program's own text as a 4K source buffer */
                                if ((err = write(fd[i], main, 4096)) < 0) {
                                        printf("err=%d\n", err);
                                        perror("can't write");
                                        exit(1);
                                }
                }
                for (i = 3; i < argc; i++) {
                        ftruncate(fd[i], 0);
                        lseek(fd[i], 0, SEEK_SET);
                }
        }

        for(i = 3; i < argc; i++) {
                close(fd[i]);
                if (unlink(argv[i]) < 0)
                        perror("can't unlink");
        }
        return 0;
}
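
A usage sketch (cd.sh and /root/cdsingle are the names used above; the
source file name and gcc flags are my assumptions):

gcc -O2 -o /root/cdsingle cdsingle.c   # build the writer
./cd.sh 2 32 100000                    # 2 procs, 32-block files, 100000 write+truncate passes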



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  7:05                   ` William Lee Irwin III
@ 2003-03-15  8:24                     ` William Lee Irwin III
  2003-03-15  9:47                       ` William Lee Irwin III
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  8:24 UTC (permalink / raw)
  To: Andrew Morton, bzzz, adilger, linux-kernel, ext2-devel

On Fri, Mar 14, 2003 at 10:44:13PM -0800, Andrew Morton wrote:
>> Nope.  What we're trying to measure here is pure in-memory lock contention,
>> locked bus traffic, context switches, etc, etc.  To do that we need to get
>> the IO system out of the picture.
>> One way to do that is to increase /proc/sys/vm/dirty_ratio and
>> dirty_background_ratio to 70% or so.  You can still hit IO wait if someone
>> tries to truncate a file which pdflush is writing out, so increase
>> dirty_expire_centisecs and dirty_writeback_centisecs to 1000000000 or so...
>> Then, on the second run, when all the required metadata blocks are in
>> pagecache you should be able to get an IO-free run.
> 
On Fri, Mar 14, 2003 at 11:05:11PM -0800, William Lee Irwin III wrote:
> Oh, sorry, I did increase dirty_ratio and dirty_background_ratio to 99,
> but I forgot about dirty_writeback_centisecs; I'll re-run with that.

Next pass involves lockmeter:

$ cat /proc/sys/vm/dirty_expire_centisecs
360000
$ cd /test/wli
$ ls
$ (time dbench 128) |& tee -a ~/dbench.output.log.7
zsh: correct '~/dbench.output.log.7' to '~/dbench.output.log.6' [nyae]? n
128 clients started
   0     62477  206.65 MB/sec
Throughput 206.651 MB/sec 128 procs
dbench 128  143.50s user 3258.66s system 1574% cpu 3:36.04 total

vma      samples  %-age       symbol name
c0106ff4 7617343  29.5286     default_idle
c01dc3b0 5212934  20.2079     __copy_to_user_ll
c01dc418 1806434  7.00263     __copy_from_user_ll
c0264bd0 1595815  6.18617     sync_buffer
c0108140 712115   2.76051     .text.lock.semaphore
c0119058 621494   2.40922     try_to_wake_up
c011a1bc 409622   1.5879      schedule
c0107d0c 278704   1.08039     __down
c011c4ff 263802   1.02263     .text.lock.sched
c0152ab0 260394   1.00942     __find_get_block_slow
c011fadc 247423   0.959134    profile_hook
c0264da0 231721   0.898265    add_event_entry
c0168aac 223276   0.865528    d_lookup
c01dc510 212968   0.825569    atomic_dec_and_lock
c0119dac 208443   0.808028    scheduler_tick
c015f3dc 192765   0.747253    path_lookup
c01522a0 191853   0.743717    file_move
c0119860 188927   0.732375    load_balance
c0122930 168016   0.651313    current_kernel_time
c010f6b8 166633   0.645952    timer_interrupt
c013ece4 160376   0.621697    check_highmem_ptes
c0133118 155858   0.604183    find_get_page


procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo   in   cs  us sy id  wa
 0  0  48309664 568288   23360    0     0 1026    18  0  0 100  0
 0  0  48274208 568288   23392    0     0 1028   256  0  1  99  0
91  0  48192928 569056   58272    0     0 1036  1790  1 20  79  0
49  0  47769920 580224  439808    0     0 1024  5071  2 94   4  0
70  0  47437248 582720  773952    0     0 1030  2366  1 95   5  0
56  1  47140576 586656 1062272    0     0 1022  1931  1 97   3  0
86  0  46901920 581568 1307168    0     0 1025  1800  0 98   2  0
55  0  46619744 584672 1585568    0     0 1028  2584  1 93   5  0
26  1  46343712 577600 1867360    0     0 1025  3387  1 87  12  1
20  0  46033184 575936 2176384    0     0 1028  5742  2 67  30  1
27  1  45834912 577056 2366304    0     0 1027  6072  2 66  31  1
24  1  45692128 577184 2504960   16     0 1029  8056  2 56  41  1
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo   in   cs  us sy id  wa
18  0  45533280 581376 2659648   32     0 1037  8107  2 57  40  1
23  1  45386464 575264 2810592   32     0 1036  8227  2 54  42  2
 4  4  45342240 577056 2852768  360     0 1112  3217  1 14  81  5
11  1  45283040 576160 2911712  140     0 1065  9003  1 26  71  2
 8  1  45221856 580064 2968128   64     0 1043 11620  2 31  66  1
15  0  45181984 577024 3009120   92     0 1050  9451  2 40  57  1
10  0  45160224 579136 3027584   32     0 1038  9296  2 47  51  0
12  2  45198176 575200 2991840    4  1156 1205  9604  3 47  47  3
19  0  45284320 571872 2909440    0    72 1180 10532  2 42  54  1
21  0  45364064 574112 2827136    0     0 1028  9547  2 49  49  0
23  0  45368224 576384 2821504    0     0 1026  7902  3 57  41  0
22  0  45354720 571776 2839616    0   512 1127  7965  3 60  37  0
16  0  45315168 575072 2874272    0     0 1054  8367  3 59  39  0
19  0  45290592 578272 2898048   68     0 1043  8835  2 45  52  0
18  0  45244768 574432 2947424    8   640 1188  9696  2 45  51  2
18  0  45217184 576064 2973184    8     0 1028 10751  2 43  55  0
17  0  45192992 578080 2995040   24     0 1031 10860  2 40  58  0
21  0  45189408 572736 3004128    0   708 1201  8661  3 56  41  0
17  0  45219744 574112 2971392    8     0 1028  9531  3 46  51  0
26  0  45270176 575232 2919456    0     0 1028  9925  3 44  53  0
28  0  45332448 576672 2855968    0     0 1026  9073  3 50  47  0
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in    cs  us sy id  wa
17  0  45365536 578496 2821440    4     0 1028  8403  3 53  44  0
14  0  45353056 572768 2840352    0   592 1173  7842  3 60  37  0
14  0  45307808 574400 2882176    0     0 1028  8073  3 60  38  0
20  0  45287776 576320 2902208    0     0 1028  8750  3 54  43  0
17  0  45245280 578240 2943424    4     0 1025 10229  2 44  54  0
19  0  45229408 571840 2965536    0   588 1173  9809  2 44  53  0
15  1  45210848 573664 2981216   12     0 1028  9992  2 46  52  0
18  0  45189024 574880 3002016   48     0 1040  8482  2 50  48  0
 9  0  45216032 576032 2972800    0     0 1026  8795  3 53  44  0
12  0  45263968 577056 2923680    0     0 1027 10132  3 43  55  0
18  0  45312992 578080 2874432    0     0 1027  8791  3 51  46  0
20  1  45362336 575168 2828000    0   896 1118  8793  2 52  45  1
29  0  45345376 573760 2847296    0    76 1172  8315  3 56  41  0
30  0  45319840 575648 2871040    0     0 1032  7647  3 61  36  0
 6  1  45289440 576864 2898432   12     0 1026  8163  2 53  44  0
16  0  45255520 578240 2932256   12     0 1030  9372  2 44  54  0
16  0  45228000 572800 2965376    0  1016 1227 10052  2 44  52  1
15  0  45216736 573920 2974880    0     0 1080 10110  2 46  51  0
21  0  45200672 574912 2990528    0     0 1025  8989  2 54  44  0
24  0  45214304 575776 2974880    0     0 1027  9274  3 50  47  0
20  0  45263712 576608 2923520    0     0 1026  8689  3 50  48  0
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in    cs  us sy id  wa
22  0  45320800 577440 2867328    0     0 1027  9487  2 47  50  0
19  0  45344416 571488 2849888    0   632 1079  8074  3 58  39  0
23  0  45344096 573184 2848384    0     0 1132  8043  3 57  40  0
25  0  45324960 574464 2866784    0     0 1026  7457  3 64  33  0
17  0  45296992 575808 2893440    0     0 1027  9124  3 51  46  0
19  0  45265952 576992 2923648    0     0 1025  9579  2 50  48  0
23  0  45237280 577728 2949344    0     0 1030 10051  2 48  50  0
15  0  45208800 578912 2978656    0     0 1026 10072  2 47  51  0
19  0  45211552 568832 2985696    0   176 1070  8537  3 55  42  0
20  0  45215328 569792 2980736    0     0 1027  9434  3 47  50  0
12  0  45245408 570432 2949472    0     0 1026  9018  3 51  46  0
19  0  45317600 570816 2875904    0     0 1025  9767  3 47  50  0
13  0  45351712 571616 2842272    0     0 1030  9012  2 51  46  0
23  0  45342752 572640 2851232    0     0 1027  8134  3 59  38  0
23  0  45332832 573536 2859616    0     0 1026  8081  3 57  40  0
15  0  45304992 574816 2886560    0     0 1024  8929  2 54  44  0
19  0  45270752 575712 2919776    0     0 1027  9503  2 49  49  0
16  0  45230560 576896 2958208    0     0 1031 10190  2 44  54  0
22  0  45205344 577664 2981312    0     0 1024  9903  2 45  53  0
16  0  45206688 578720 2981024    4     0 1028  8860  2 53  44  0
20  0  45218080 572128 2976416    0   988 1168  9526  3 50  47  1
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in    cs  us sy id  wa
20  0  45242400 572640 2950144    0     0 1133  8446  3 52  45  0
21  0  45312608 573024 2879936    0     0 1026  9444  3 48  49  0
22  0  45347104 573664 2844576    0     0 1026  8102  3 57  40  0
15  0  45344352 574368 2847584    0     0 1027  8310  3 57  40  0
27  0  45334304 575008 2856736    0     0 1026  8763  3 52  45  0
26  0  45303328 575712 2887648    0     0 1026  9057  2 53  44  0
17  0  45269984 576704 2920704    0     0 1026  9975  2 45  52  0
20  0  45222880 578240 2966080    0     0 1027 10239  2 44  53  0
23  0  45209760 573120 2983168    4   852 1138 10067  2 45  52  1
25  0  45197792 573856 2995200    0     0 1136  8914  3 51  46  0
17  0  45216288 574208 2974976    0     0 1017  9714  2 47  51  0
15  0  45255712 574720 2935488    0     0 1026  9560  3 48  49  0
20  0  45311072 575360 2879296    0     0 1027  8887  3 55  42  0
29  0  45339360 575904 2851072    0     0 1031  7836  3 61  36  0
24  0  45341280 576480 2849056    0     0 1022  7930  3 57  40  0
27  0  45330976 577504 2858400    0     0 1025  7902  3 60  37  0
25  0  45311264 578112 2877728    0     0 1027  8213  3 56  42  0
17  0  45271712 579264 2916256   20     0 1032  9453  2 47  51  0
21  0  45236064 573632 2957344    0   864 1240 10165  2 45  52  1
21  0  45214752 574336 2977152    0     0 1026 10164  2 43  54  0
28  0  45212512 574976 2979936    0     0 1026  9132  3 52  45  0
procs -----------memory--------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in    cs  us sy id  wa
22  0  45204512 575360 2985536    0     0 1028  9087  3 51  46  0
16  0  45234976 576096 2954816    0     0 1028  9246  3 52  46  0
28  0  45284640 576608 2903360    0     0 1028  8713  3 55  43  0
30  0  45327328 577120 2860992    0     0 1026  8932  3 53  44  0
23  0  45342048 577504 2847104    0     0 1026  8421  3 55  43  0
26  0  45334656 578048 2854144    0     0 1025  7808  3 56  41  0
26  0  45315616 578720 2872352    0     0 1027  8527  3 53  45  0
18  1  45280800 574528 2911712    0   524 1028  8218  2 55  42  0
22  0  45246752 573792 2946784    0   416 1256  9411  3 45  51  1
11  0  45226464 574432 2965856    0     0 1026  9924  2 44  54  0
13  0  45214432 575296 2977056    0     0 1027  9667  2 47  51  0
24  0  45214048 575872 2976544    0     0 1026  8562  2 53  45  0
26  0  45216544 576288 2973792    0     0 1026  8803  3 48  49  0
28  0  45261472 576640 2928544    0     0 1043  9363  3 50  47  0
27  0  45304352 576928 2884896    0     0 1009  8179  3 52  45  0
27  0  45349408 577472 2839488    0     0 1028  8289  3 55  42  0
17  0  45342432 577792 2845600    0     0 1033  8466  3 56  41  0
35  0  45328672 578112 2858560    0     0 1022  8160  2 56  41  0
23  0  45290720 578464 2897344    0     0 1026  8669  2 51  47  0
17  0  45249824 579360 2938080    0     0 1027  8979  2 51  47  0
14  0  45234080 572896 2960544    0   784 1215 10500  2 43  54  1
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in   cs   us sy id  wa
20  0  45218656 573344 2974624    0     0 1030 10204  2 44  54  0
35  0  45216480 573664 2976992    0     0 1049  8737  2 51  46  0
15  0  45230048 574048 2962240    0     0 1277 10935  3 52  45  0
19  0  45256928 574432 2934816    0     0 1012  8437  3 50  47  0
18  0  45306080 574720 2885344    0     0 1025  8968  3 51  46  0
11  0  45346016 574944 2846176    0     0 1026  8805  2 53  45  0
21  0  45350688 575200 2842144    0     0 1026  7943  3 57  40  0
13  0  45315552 575520 2876352    0     0 1026  8419  2 56  42  0
24  0  45292704 575872 2897824    0     0 1028  8356  2 55  43  0
23  0  45254176 576576 2936992    0     0 1029  9594  3 49  49  0
18  0  45229856 577056 2960640    0     0 1027  9921  2 46  52  0
17  0  45209632 577600 2980864    0     0 1025  9635  2 45  53  0
26  0  45221664 577888 2967232    0     0 1045  9195  2 51  47  0
16  0  45231264 578464 2957248    0     0 1264 10989  2 51  47  0
21  0  45253088 578784 2935552    0     0 1025  8312  3 55  43  0
22  0  45310240 579040 2876992    0     0 1028  8878  3 51  47  0
23  0  45346016 579392 2841696    0     0 1026  9040  3 49  48  0
19  0  45345696 579712 2842464    0     0 1026  8184  2 58  39  0
23  0  45324000 580064 2863296    0     0 1027  8464  3 53  45  0
21  0  45285024 574240 2909024    0   748 1157  8943  2 52  45  0
17  0  45251360 574848 2941888    0     0 1081  9679  3 48  50  0
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in   cs   us sy id  wa
16  0  45233760 575296 2959616    0     0 1027 10785  2 43  55  0
21  0  45208032 575904 2983936   20     0 1034  9635  2 46  52  0
11  0  45212512 576576 2977248   12     0 1030  9593  2 48  49  0
13  0  45226848 576832 2963360    0     0 1026  9295  3 47  51  0
27  0  45264544 577088 2924704    0     0 1026  8856  3 51  46  0
17  0  45308768 577280 2880608    0     0 1027  9160  3 51  46  0
28  0  45335584 577536 2854528    0     0 1030  8855  3 52  46  0
25  0  45338336 577856 2850976    0     0 1027  7700  3 57  40  0
29  0  45326816 578048 2862496    0     0 1025  7957  3 56  42  0
33  0  45291168 578400 2897952    0     0 1028  8537  2 55  43  0
17  0  45252576 578880 2936192    0     0 1024  8634  2 53  44  0
17  0  45226912 579136 2960416    0     0 1026  9436  2 49  49  0
15  0  45206240 579392 2980928    0     0 1028  9701  2 51  47  0
26  0  45210976 579520 2978112    0     0 1026  9054  3 52  45  0
28  0  45227040 579712 2960576    0     0 1025  9191  3 47  50  0
25  0  45255328 579968 2932736    0     0 1027  8116  3 55  42  0
20  0  45301152 580096 2886624    0     0 1027  9113  3 52  45  0
23  0  45324128 572960 2870272    0   612 1129  7719  3 57  40  0
22  0  45341792 573216 2852512    0     0 1075  8123  3 54  43  0
23  0  45341728 573760 2851616    0     0 1027  8674  2 55  43  0
18  0  45306720 573984 2887360    0     0 1026  8670  2 55  43  0
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in    cs  us sy id  wa
16  0  45282016 574176 2911232    0     0 1028  9431  3 50  48  0
17  0  45243040 574432 2951456    0     0 1030  9618  3 49  48  0
21  0  45218784 574656 2974240    0     0 1025  9529  2 49  49  0
21  0  45219296 574848 2971904    0     0 1027  9826  2 48  49  0
21  0  45234464 575008 2956288    0     0 1026  9718  3 48  50  0
17  0  45265376 575168 2927712    0     0 1028  8749  3 57  40  0
22  0  45278752 575232 2914176    0     0 1026  8613  3 55  42  0
31  0  45304480 575520 2887328    0     0 1025  7851  3 60  37  0
20  0  45329888 575808 2861280    0     0 1026  8646  3 54  43  0
24  0  45331872 575936 2859552    0     0 1027  8722  2 54  44  0
21  0  45305056 576192 2886976    0     0 1026  8383  2 58  40  0
24  0  45279136 576416 2912736    0     0 1027  8586  2 57  41  0
21  0  45248736 576896 2942272    0     0 1026  9470  2 51  47  0
14  0  45223392 577184 2967328    0     0 1026 10033  2 47  50  0
19  0  45219104 577376 2971392    0     0 1026  9856  2 49  49  0
17  0  45222432 577600 2967456    0     0 1027  9484  3 47  51  0
29  0  45261472 577632 2927264    0     0 1035  8997  3 54  43  0
25  0  45271008 577792 2917664    0     0 1021  8259  3 55  42  0
24  0  45291936 578016 2897632    0     0 1027  7985  3 57  40  0
22  0  45308960 578336 2880864    0     0 1027  9218  3 52  46  0
18  0  45328160 578560 2861120    0     0 1025  8907  2 51  47  0
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in    cs  us sy id  wa
22  0  45304480 578784 2884768    0     0 1026  8257  2 52  45  0
36  0  45279776 578912 2909472    0     0 1027  8651  2 54  43  0
28  0  45254624 579104 2933280    0     0 1026  9417  3 49  48  0
22  0  45238688 579328 2949920    0     0 1031  8350  2 57  41  0
20  0  45216800 579552 2972384    0     0 1023  8923  2 51  47  0
22  0  45217504 579616 2970464    0     0 1026  9383  3 50  47  0
19  0  45249760 579808 2937920    0     0 1026  9535  2 47  50  0
21  1  45283680 580480 2902336   24     0 1032  9435  3 49  49  0
26  0  45316960 575424 2873696   52   800 1237  8920  3 57  37  4
18  2  45365152 579776 2822816  220     0 1079  8644  3 56  40  1
21 10  45406880 576064 2783904  216  1104 1219  9441  3 59  30  8
19 13  45499040 576800 2691584  184   252 1229 14054  2 52  31 15
22 26  45560736 578432 2627872  544   584 1204 11215  2 63  19 16
36 49  45656672 585472 2524128  616   288 1236 21734  2 61  14 23
34 47  45793248 580544 2391296  564   384 1251 26024  2 67  10 21
20  5  45992672 582432 2189312  272   896 1338 19731  2 75   9 15
16 107 46253536 578656 1932672  832   524 1157 41761  1 54   9 36
 1 106 46416608 575776 1792800  472  1200 1212 37282  0 23  11 66
 7 100 46468000 578784 1739456  392  1776 1201 20277  0  7   4 89
 1 100 46538208 571104 1679872  524    16 1234 22199  0  8   3 88
 0 99  46624480 578016 1587840  892     0 1255 32708  0 11   3 85
procs -----------memory---------- -----io---- --system-- ----cpu----
 r  b   free     buff   cache    bi    bo  in   cs   us sy  id wa
16 95  46728928 573920 1487168  916    44 1259 35577  0 12   2 86
 1 98  46829792 570080 1392160  860    52 1261 32797  0 12   4 85
10 94  46924640 576640 1292160  860     0 1251 33826  0 12   1 87
 8 91  47017376 572576 1205824  808    24 1245 29650  0 10   2 87
 2 90  47122592 579392 1096864  904     0 1256 35883  0 13   2 86
11 85  47241056 578784  978976  844    24 1247 36740  0 13   4 83
 8 81  47411872 578720  815584  892     4 1260 46037  0 18   5 77
 5 66  47588576 576832  651200  740     4 1229 42522  0 17   9 74
 8 48  47851872 573696  413280  760     4 1237 43504  0 19  15 66
 5 24  48114848 579616  164448  760     0 1244 31623  0 15  25 60
 0  0  48296032 575456   21280  396     0 1155  7253  0  6  70 24
 0  0  48296160 575456   21408    4     0 1028    88  0  1  99  0
 0  0  48295456 575488   21664   44     0 1037   174  0  1  98  0
 0  0  48295456 575488   21664    0     0 1025    20  0  0 100  0
 0  0  48295456 575488   21664    0     0 1025    18  0  0 100  0
 0  0  48295456 575488   21664    0     0 1025    22  0  0 100  0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  8:16                   ` [Ext2-devel] " Alex Tomas
@ 2003-03-15  8:29                     ` William Lee Irwin III
  2003-03-15  8:32                       ` Alex Tomas
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  8:29 UTC (permalink / raw)
  To: Alex Tomas; +Cc: Andrew Morton, adilger, linux-kernel, ext2-devel

Andrew Morton (AM) writes:
AM> Nope.  What we're trying to measure here is pure in-memory lock
AM> contention, locked bus traffic, context switches, etc, etc.  To
AM> do that we need to get the IO system out of the picture.

On Sat, Mar 15, 2003 at 11:16:10AM +0300, Alex Tomas wrote:
> I simply use my own pretty simple test. BTW, you may disable preallocation
> to increase the allocation rate

This looks very interesting, but it may have to wait ca. 24 hours for
some benchmark time b/c of the long boot times and late hour in .us.

This also looks like it would be a much better stress test, and the
NUMA-Q is known for bringing out many rare races. There are good
reasons to run this test even aside from performance.


-- wli

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  8:29                     ` William Lee Irwin III
@ 2003-03-15  8:32                       ` Alex Tomas
  2003-03-15  9:23                         ` William Lee Irwin III
  0 siblings, 1 reply; 39+ messages in thread
From: Alex Tomas @ 2003-03-15  8:32 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Alex Tomas, Andrew Morton, adilger, linux-kernel, ext2-devel

>>>>> William Lee Irwin (WLI) writes:

 >> I simply use my own pretty simple test. BTW, you may disable
 >> preallocation to increase the allocation rate

 WLI> This looks very interesting, but it may have to wait ca. 24
 WLI> hours for some benchmark time b/c of the long boot times and
 WLI> late hour in .us.

 WLI> This also looks like it would be a much better stress test, and
 WLI> the NUMA-Q is known for bringing out many rare races. There are
 WLI> good reasons to run this test even aside from performance.

Fine. It's really interesting to see results for such big iron.


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [Ext2-devel] Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  8:32                       ` Alex Tomas
@ 2003-03-15  9:23                         ` William Lee Irwin III
  0 siblings, 0 replies; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  9:23 UTC (permalink / raw)
  To: Alex Tomas; +Cc: Andrew Morton, adilger, linux-kernel, ext2-devel

>>>>> William Lee Irwin (WLI) writes:
>> I simply use my own pretty simple test. BTW, you may disable
>> preallocation to increase the allocation rate

WLI> This looks very interesting, but it may have to wait ca. 24
WLI> hours for some benchmark time b/c of the long boot times and
WLI> late hour in .us.
WLI> This also looks like it would be a much better stress test, and
WLI> the NUMA-Q is known for bringing out many rare races. There are
WLI> good reasons to run this test even aside from performance.

On Sat, Mar 15, 2003 at 11:32:28AM +0300, Alex Tomas wrote:
> Fine. It's really interesting to see results for such big iron.
"
So maybe it's pointless to elaborate on this in particular, but...

I actually borrowed time on the extra quads for the purpose of hardening
pgcl (my forward port of Hugh Dickins' page clustering patch). Four of
the quads are primarily used by me; these systems support static
partitioning, so as long as the cabling is done right you can make four
4-quad systems from 16 quads, or two 8-quad systems, or one 16-quad
system, and so on. The other four are primarily used by Dave Hansen,
but he's been tied up this week with tasks that need him on other
systems, so he lent them to me. When the issue of lock contention came
up, I thought it would be a good idea to use the elevated cpu count to
highlight the lock contention you were trying to address with this
patch. I'd be more than happy to see an effective case made for it, or
otherwise to demonstrate its merits.

I guess it's mostly OT and/or organizational, but for those who are
interested it might give an idea of how time on these larger systems is
spent. In this case, the larger system is dynamically put together from
two smaller ones whenever the kernel hacker who normally uses the other
half isn't focusing on it and kindly gives other people time to
test/benchmark/etc. on the combined hardware. To some it might sound
inconvenient, but I'm grateful for every minute of time I get on the
things.

There are other situations or "typical patterns" for getting at the
larger systems. What's probably the most typical pattern of all is that
the vendors themselves can't afford the larger models of their own
machines for kernel hacking purposes, and so the hackers (and their
managers and other kinds of helpers) scramble to beg, borrow, and steal
time on such machines from whatever places they can.

I have no idea what possessed me to describe all this, but I'll go on.
And sorry that this is probably very irrelevant to you, Alex, but:

To all those who help get me in front of these things (Dave, Hans,
Martin, Gerrit, Hubertus, et al.), thanks a million! I love hacking on
big boxen, and, at least from the above, it's clear I can't do it alone.


-- wli


* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  8:24                     ` William Lee Irwin III
@ 2003-03-15  9:47                       ` William Lee Irwin III
  2003-03-15 11:58                         ` William Lee Irwin III
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15  9:47 UTC (permalink / raw)
  To: Andrew Morton, bzzz, adilger, linux-kernel, ext2-devel; +Cc: hawkes, hannal

On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
> Next pass involves lockmeter:

Throughput 39.2014 MB/sec 128 procs
dbench 128  142.51s user 10828.91s system 964% cpu 18:57.88 total

That's an 83% reduction in throughput from applying lockmeter.

Um, somebody should look into this. The thing is a bloody doorstop:

vma      samples  %-age       symbol name
c012fbbd 20829312 51.3129     .text.lock.lockmeter
c012eb1c 3834281  9.44573     alloc_rwlock_struct
c0106f74 2940592  7.24413     default_idle
c025174c 2837008  6.98895     sync_buffer
c012ec58 1438542  3.54384     _metered_read_lock
c012e98c 1129385  2.78223     _metered_spin_lock
c012ea94 1044869  2.57403     _metered_spin_unlock
c012e89c 982225   2.41971     lstat_update
c012efd0 702482   1.73056     _metered_write_lock
c01cca10 657780   1.62044     __copy_to_user_ll
c012e6f0 587940   1.44839     lstat_lookup
c0251910 551723   1.35917     add_event_entry
c012ee70 512327   1.26211     _metered_read_unlock
c0109a30 298994   0.73657     apic_timer_interrupt
c01cca78 202482   0.498813    __copy_from_user_ll
c0120730 159350   0.392558    current_kernel_time
c012f148 133340   0.328482    _metered_write_unlock
c02516a8 127259   0.313502    add_sample
c0251634 112579   0.277338    add_sample_entry
c0118cac 102231   0.251845    scheduler_tick
c010f080 75857    0.186873    timer_interrupt

AFAICT the actual results are also garbage.


System: Linux curly 2.5.64 #1 SMP Sat Mar 15 00:49:53 PST 2003 i686
Total counts

All (32) CPUs

Start time: Sat Mar 15 01:20:18 2003
End   time: Sat Mar 15 01:39:03 2003
Delta Time: 1129.85 sec.
Hash table slots in use:      216.
Global read lock slots in use: 999.

*************************** Warnings! ******************************
	Read Lock table overflowed.

	The data in this report may be in error due to this.
************************ End of Warnings! **************************


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SPINLOCKS         HOLD            WAIT
  UTIL  CON    MEAN(  MAX )   MEAN(  MAX )(% CPU)     TOTAL NOWAIT SPIN RJECT  NAME

        2.6%                   0us                821511883 97.4% 0.15%  2.5%  *TOTAL*

    0%    0%                   0us                      179  100%    0%    0%  [0xc0433810]
    0%    0%                   0us                      179  100%    0%    0%    uart_wait_modem_status+0x168

    0%    0%                   0us                      179  100%    0%    0%  [0xc0434260]
    0%    0%                   0us                      179  100%    0%    0%    uart_wait_modem_status+0x18c

    0%    0%                   0us                      129  100%    0%    0%  [0xee1c90f0]
    0%    0%                   0us                      129  100%    0%    0%    autofs4_dir_rmdir+0xa0

    0%    0%                   0us                        1  100%    0%    0%  [0xee66b020]
    0%    0%                   0us                        1  100%    0%    0%    inet_rtm_newaddr+0x54

    0%    0%                   0us                       49  100%    0%    0%  [0xef30b8d4]
    0%    0%                   0us                       16  100%    0%    0%    do_mmap_pgoff+0x468
    0%    0%                   0us                       33  100%    0%    0%    sys_mlockall+0x88

    0%  3.7%                   0us                    46947 96.3%  3.7%    0%  [0xef3c85f4]
    0%  3.7%                   0us                    46947 96.3%  3.7%    0%    sys_fchmod+0xb8

    0%    0%                   0us                      227  100%    0%    0%  [0xef53e994]
    0%    0%                   0us                      227  100%    0%    0%    scsi_dispatch_cmd+0x38

    0%    0%                   0us                     2770  100%    0%    0%  [0xef55011c]
    0%    0%                   0us                     2218  100%    0%    0%    __constant_c_and_count_memset+0x24
    0%    0%                   0us                      552  100%    0%    0%    _decode_session+0x130

    0%    0%                   0us                     2218  100%    0%    0%  [0xef550128]
    0%    0%                   0us                     2218  100%    0%    0%    _decode_session+0x14

    0%    0%                   0us                        2  100%    0%    0%  [0xef6455a0]
    0%    0%                   0us                        1  100%    0%    0%    filemap_sync+0xcc
    0%    0%                   0us                        1  100%    0%    0%    mprotect_fixup+0x14

    0%    0%                   0us                        2  100%    0%    0%  [0xf04c7df0]
    0%    0%                   0us                        1  100%    0%    0%    get_one_pte_map_nested+0x58
    0%    0%                   0us                        1  100%    0%    0%    insert_vm_struct+0xc

    0%  3.0%                   0us                  1543963 97.0%  3.0%    0%  __per_cpu_end+0xdc
    0%  3.6%                   0us                      220 96.4%  3.6%    0%    clear_inode+0x58
    0%  1.9%                   0us                   258185 98.1%  1.9%    0%    dentry_iput+0x24
    0%  4.3%                   0us                   119875 95.7%  4.3%    0%    do_pollfd+0x68
    0%  2.6%                   0us                   258168 97.4%  2.6%    0%    do_poll+0x7c
    0%  3.2%                   0us                   385561 96.8%  3.2%    0%    sys_poll+0x218
    0%  3.4%                   0us                   258115 96.6%  3.4%    0%    sys_select+0x7c
    0%    0%                   0us                        5  100%    0%    0%    sys_select+0x204
    0%  3.2%                   0us                   263834 96.8%  3.2%    0%    sys_select+0x340

    0%    0%                   0us                        3  100%    0%    0%  _binary_usr_initramfs_data_cpio_gz_end+0x13c
    0%    0%                   0us                        2  100%    0%    0%    do_rw_proc+0x24
    0%    0%                   0us                        1  100%    0%    0%    proc_readsys+0xc

    0% 31.8%                   0us                   904982 68.2% 31.8%    0%  log_buf+0x5c60
    0%    0%                   0us                        3  100%    0%    0%    .text.lock.eventpoll+0x6
    0% 20.0%                   0us                       80 80.0% 20.0%    0%    .text.lock.pageattr+0x3f
    0% 35.8%                   0us                     2302 64.2% 35.8%    0%    __register_serial+0x98
    0% 33.3%                   0us                        3 66.7% 33.3%    0%    aio_kick_handler+0x8
    0%  100%                   0us                        2    0%  100%    0%    block_truncate_page+0x20c
    0% 31.9%                   0us                      138 68.1% 31.9%    0%    copy_msqid_from_user+0xbc
    0% 36.4%                   0us                     2176 63.6% 36.4%    0%    ep_poll+0x34
    0% 24.4%                   0us                      639 75.6% 24.4%    0%    flock_make_lock+0x38
    0% 36.5%                   0us                      639 63.5% 36.5%    0%    flock_make_lock+0xb0
    0% 34.6%                   0us                     2331 65.4% 34.6%    0%    get_pci_port+0x14c
    0% 44.0%                   0us                      639 56.0% 44.0%    0%    locks_alloc_lock
    0% 35.6%                   0us                      225 64.4% 35.6%    0%    parse_extended+0x184
    0% 12.7%                   0us                       71 87.3% 12.7%    0%    pipe_write+0x258
    0% 31.7%                   0us                   895062 68.3% 31.7%    0%    proc_pid_make_inode+0x3c
    0% 33.3%                   0us                        3 66.7% 33.3%    0%    read_events+0xe8
    0%    0%                   0us                        2  100%    0%    0%    serial8250_type+0x8
    0% 39.4%                   0us                      635 60.6% 39.4%    0%    unuse_pmd+0x11c
    0% 12.5%                   0us                       32 87.5% 12.5%    0%    vfs_create+0x90

    0%    0%                   0us                        2  100%    0%    0%  log_buf+0x5c80
    0%    0%                   0us                        2  100%    0%    0%    kmap_atomic+0x8

    0%  1.6%                   0us                  1045171 98.4%  1.6%    0%  lru_add_active_pvecs__per_cpu+0x20
    0%  1.6%                   0us                  1045150 98.4%  1.6%    0%    bd_set_size+0x24
    0%    0%                   0us                       21  100%    0%    0%    bd_set_size+0x70

    0%    0%                   0us                        2  100%    0%    0%  pci_boards+0x22c
    0%    0%                   0us                        2  100%    0%    0%    mtrr_ioctl+0x4f8

    0%    0%                   0us                        6  100%    0%    0%  pci_boards+0x234
    0%    0%                   0us                        6  100%    0%    0%    __constant_c_and_count_memset+0x50

    0%    0%                   0us                  1129503  100%    0%    0%  pci_vendor_list+0x4200
    0%    0%                   0us                  1129503  100%    0%    0%    destroy_context+0x4c

    0%    0%                   0us                  1129503  100%    0%    0%  pci_vendor_list+0x47d0
    0%    0%                   0us                  1129503  100%    0%    0%    mtrr_write+0x194

    0%    0%                   0us                      149  100%    0%    0%  pid_hash+0x8680
    0%    0%                   0us                        4  100%    0%    0%    bio_add_page+0xa8
    0%    0%                   0us                      140  100%    0%    0%    load_balance+0x1e8
    0%    0%                   0us                        5  100%    0%    0%    scheduler_tick+0x180

    0%    0%                   0us                  1129503  100%    0%    0%  pid_hash+0x8824
    0%    0%                   0us                  1129503  100%    0%    0%    init_new_context+0xe0

    0%  3.9%                   0us                  2900686 96.1%  3.9%    0%  pid_hash+0x89c0
    0%  4.0%                   0us                  1450343 96.0%  4.0%    0%    __pagevec_lru_add+0xa4
    0%  3.8%                   0us                  1450343 96.2%  3.8%    0%    do_invalidatepage

    0%  6.5%                   0us                  2985077 93.5%  6.5%    0%  pid_hash+0x89e0
    0%  6.8%                   0us                  1492680 93.2%  6.8%    0%    sys_swapon+0xe4
    0% 50.0%                   0us                        2 50.0% 50.0%    0%    sys_swapon+0x1bc
    0%  6.1%                   0us                  1492393 93.9%  6.1%    0%    sys_swapon+0x204
    0%    0%                   0us                        2  100%    0%    0%    sys_swapon+0x244

    0%    0%                   0us                     1089  100%    0%    0%  pid_hash+0x8a00
    0%    0%                   0us                     1089  100%    0%    0%    __block_prepare_write+0x428

    0%  7.6%                   0us                  4513471 92.4%  7.6%    0%  pid_hash+0x8a20
    0% 25.0%                   0us                     4929 75.0% 25.0%    0%    .text.lock.namei+0x74
    0%  8.8%                   0us                   324046 91.2%  8.8%    0%    .text.lock.namei+0x1ac
    0%  7.2%                   0us                  2491551 92.8%  7.2%    0%    __follow_down+0x4c
    0% 50.0%                   0us                        2 50.0% 50.0%    0%    blkdev_put+0x110
    0%  2.4%                   0us                     1185 97.6%  2.4%    0%    count+0x28
    0% 11.4%                   0us                    17982 88.6% 11.4%    0%    do_fcntl+0x154
    0%    0%                   0us                        8  100%    0%    0%    do_open+0x258
    0% 10.0%                   0us                       20 90.0% 10.0%    0%    file_ioctl+0x138
    0%  8.6%                   0us                     6114 91.4%  8.6%    0%    locate_fd+0xd0
    0%  7.6%                   0us                   265244 92.4%  7.6%    0%    send_sigio+0x94
    0%  3.5%                   0us                   368709 96.5%  3.5%    0%    send_sigurg+0x24
    0%  7.4%                   0us                    54539 92.6%  7.4%    0%    send_sigurg+0x84
    0% 10.6%                   0us                   368709 89.4% 10.6%    0%    setfl+0xa8
    0%  9.0%                   0us                   608895 91.0%  9.0%    0%    setfl+0x110
    0%    0%                   0us                        2  100%    0%    0%    sys_ioctl+0xf0
    0%  7.2%                   0us                     1536 92.8%  7.2%    0%    sys_uselib+0x12c

    0%    0%                   0us                      390  100%    0%    0%  pid_hash+0x8a60
    0%    0%                   0us                      195  100%    0%    0%    csi_m+0x258
    0%    0%                   0us                      195  100%    0%    0%    cursor_report+0x2c

    0% 21.7%                   0us                    25815 78.3% 21.7%    0%  tvec_bases__per_cpu+0x24
    0%  7.1%                   0us                     4725 92.9%  7.1%    0%    in_group_p+0xc
    0% 24.9%                   0us                    21090 75.1% 24.9%    0%    sys_getgroups+0x38

    0%    0%                   0us                        1  100%    0%    0%  tvec_bases__per_cpu+0x554
    0%    0%                   0us                        1  100%    0%    0%    .text.lock.mprotect+0x41

    0%    0%                   0us                     1089  100%    0%    0%  tvec_bases__per_cpu+0xac0
    0%    0%                   0us                     1089  100%    0%    0%    get_vm_area+0x4c

    0%    0%                   0us                     1770  100%    0%    0%  tvec_bases__per_cpu+0xc70
    0%    0%                   0us                      225  100%    0%    0%    __getblk_slow+0x40
    0%    0%                   0us                      445  100%    0%    0%    __getblk_slow+0x80
    0%    0%                   0us                      220  100%    0%    0%    clear_inode+0x68
    0%    0%                   0us                      880  100%    0%    0%    clear_inode+0xbc

    0%    0%                   0us                       69  100%    0%    0%  .text.lock.mempool+0x4a
    0% 0.00%                   0us                   670256  100% 0.00%    0%  .text.lock.namei+0xac
    0%    0%                   0us                      587  100%    0%    0%  __bounce_end_io_read+0x38
    0%    0%                   0us                      140  100%    0%    0%  __constant_memcpy+0x10
    0% 0.99%                   0us                   829218 99.0% 0.99%    0%  __constant_memcpy+0xf0
    0% 0.00%                   0us                  2491547  100% 0.00%    0%  __follow_down+0x60
    0% 0.94%                   0us                    21329 99.1% 0.94%    0%  __free_pages_bulk+0x2c
    0%  100%                   0us                  2119165    0%    0%  100%  __free_pages_bulk+0x88
    0% 57.2%                   0us                    30598 42.8% 57.2%    0%  __ioremap+0x20
    0% 0.05%                   0us                    30598  100% 0.05%    0%  __ioremap+0x2c
    0% 78.7%                   0us                     7429 21.3% 78.7%    0%  __ioremap+0x68
    0% 0.01%                   0us                    28980  100% 0.01%    0%  __set_page_dirty_buffers+0x110
    0%  3.5%                   0us                    20255 96.5%  3.5%    0%  balance_dirty_pages+0xb8
    0%    0%                   0us                        4  100%    0%    0%  bio_add_page+0x104
    0%    0%                   0us                        4  100%    0%    0%  bio_alloc+0xf8
    0%    0%                   0us                    23110  100%    0%    0%  cpu_raise_softirq+0x8
    0%    0%                   0us                      142  100%    0%    0%  create_workqueue+0x144
    0%    0%                   0us                   327491  100%    0%    0%  dentry_open+0x14c
    0%    0%                   0us                        7  100%    0%    0%  do_anonymous_page+0x268
    0%    0%                   0us                        7  100%    0%    0%  do_page_fault+0xfc
    0% 0.71%                   0us                   203785 99.3% 0.71%    0%  do_proc_readlink+0x38
    0%    0%                   0us                      901  100%    0%    0%  do_wp_page+0x2cc
    0%    0%                   0us                       22  100%    0%    0%  do_wp_page+0x45c
    0%    0%                   0us                      138  100%    0%    0%  dup_mmap+0x120
    0%    0%                   0us                      138  100%    0%    0%  dup_mmap+0x158
    0%    0%                   0us                      138  100%    0%    0%  dup_mmap+0x220
    0%    0%                   0us                      138  100%    0%    0%  dup_mmap+0xc8
    0% 0.34%                   0us                     1782 99.7% 0.34%    0%  frag_show+0x40
    0%    0%                   0us                  1019899  100%    0%    0%  free_buffer_head+0x34
    0%    0%                   0us                       14  100%    0%    0%  free_one_pmd+0x168
    0% 0.01%                   0us                    11129  100% 0.01%    0%  get_dirty_limits+0x3c
    0%    0%                   0us                   392486  100%    0%    0%  get_empty_filp+0x12c
    0%    0%                   0us                      838  100%    0%    0%  handle_mm_fault+0xe0
    0%    0%                   0us                       14  100%    0%    0%  hugetlb_report_meminfo+0x34
    0%    0%                   0us                  1185123  100%    0%    0%  init_buffer_head+0x4c
    0%    0%                   0us                        7  100%    0%    0%  kmap_atomic+0x14
    0%    0%                   0us                       75  100%    0%    0%  ksoftirqd+0x100
    0%    0%                   0us                       77  100%    0%    0%  ksoftirqd+0x10c
    0%    0%                   0us                      152  100%    0%    0%  ksoftirqd+0x114
    0%    0%                   0us                        5  100%    0%    0%  kunmap+0x20
    0% 0.22%                   0us                   487201 99.8% 0.22%    0%  mem_open+0x8
    0% 0.65%                   0us                   510091 99.4% 0.65%    0%  mounts_release+0x8
    0%    0%                   0us                        2  100%    0%    0%  number+0x1fc
    0%    0%                   0us                      138  100%    0%    0%  proc_doutsstring+0x78
    0% 0.39%                   0us                     1275 99.6% 0.39%    0%  proc_info_read+0x98
    0% 0.38%                   0us                   675021 99.6% 0.38%    0%  proc_pid_cmdline+0x54
    0%    0%                   0us                      714  100%    0%    0%  pte_alloc_kernel+0x74
    0% 12.2%                   0us                   312871 87.8%    0% 12.2%  remap_area_pages+0x1f0
    0%    0%                   0us                        8  100%    0%    0%  risc_code01+0x1490
    0%    0%                   0us                     2186  100%    0%    0%  risc_code01+0x1b24
    0% 0.13%                   0us                     2225 99.9% 0.13%    0%  risc_code01+0x4f9c
    0%    0%                   0us                      129  100%    0%    0%  scheduler_tick+0x2e0
    0% 0.03%                   0us                  1282523  100% 0.03%    0%  search_exception_table+0x3c
    0%    0%                   0us                   265244  100%    0%    0%  send_sigio+0xa0
    0% 0.94%                   0us                 14301378 99.1% 0.94%    0%  send_sigio_to_task+0x30
    0%    0%                   0us                    54539  100%    0%    0%  send_sigurg+0x94
    0%    0%                   0us                        4  100%    0%    0%  sget+0x30
    0%    0%                   0us                      129  100%    0%    0%  shrink_cache+0x264
    0%    0%                   0us                       30  100%    0%    0%  shrink_list+0x420
    0% 0.00%                   0us                  1132765  100% 0.00%    0%  simd_math_error+0x28
    0%    0%                   0us                  1132765  100%    0%    0%  simd_math_error+0x94
    0% 0.10%                   0us                 10704360 99.9% 0.10%    0%  split_large_page+0x4
    0%  2.5%                   0us                    11583 97.5%  2.5%    0%  sys_access+0x150
    0%  3.0%                   0us                    41401 97.0%  3.0%    0%  sys_fstatfs+0x18
    0%    0%                   0us                       14  100%    0%    0%  sys_mincore+0x138
    0%    0%                   0us                        4  100%    0%    0%  sys_semtimedop+0x3fc
    0%    0%                   0us                   174724  100%    0%    0%  udp_queue_rcv_skb+0x5c
    0%  2.4%                   0us                761925513 97.6%    0%  2.4%  udp_recvmsg+0x26c
    0%  3.9%                   0us                  1705264 96.1%  3.9%    0%  valid_swaphandles+0x4c
    0%  1.2%                   0us                     1782 98.8%  1.2%    0%  vmstat_next+0x38
    0% 50.0%                   0us                        2 50.0% 50.0%    0%  vsnprintf+0x20
    0%    0%                   0us                     2518  100%    0%    0%  zap_pte_range+0x2e0
    0%    0%                   0us                      805  100%    0%    0%  zap_pte_range+0x9c

- - - - - - - - - - - -  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RWLOCK READS   HOLD    MAX  RDR BUSY PERIOD      WAIT
  UTIL  CON    MEAN   RDRS   MEAN(  MAX )   MEAN(  MAX )( %CPU)     TOTAL NOWAIT SPIN  NAME

       0.02%                                 0us                 26339364  100% 0.02%  *TOTAL*

 0.01%    0%   6.7us     1  6.7us( 628us)    0us                     2196  100%    0%  [0xc1be51c4]
          0%                                 0us                     2196  100%    0%    do_pipe+0x1bc

 0.00%    0%  11.5us     1   11us( 130us)    0us                        3  100%    0%  [0xee6ae8a4]
          0%                                 0us                        3  100%    0%    __constant_memcpy+0x68

 0.00%    0%  13.0us     1   13us( 286us)    0us                     1099  100%    0%  [0xee8c21c0]
          0%                                 0us                     1099  100%    0%    risc_code01+0x4e1c

 55.9%    0%  10.4us     8  271us( 231ms)    0us                        1  100%    0%  [0xef3c858c]
          0%                                 0us                        1  100%    0%    sys_setgroups16+0xcc

 0.04%    0%  19.1us     1   19us( 481us)    0us                        1  100%    0%  [0xf057b3a4]
          0%                                 0us                        1  100%    0%    sys_ioctl+0x38

 0.03%    0%  15.8us     1   16us( 337us)    0us                        1  100%    0%  [0xf057d364]
          0%                                 0us                        1  100%    0%    set_user_nice

 0.01%    0%   6.0us     2  6.1us( 337us)    0us                    11056  100%    0%  __per_cpu_end+0x1c
          0%                                 0us                    11056  100%    0%    pipe_write+0x6c

 0.07%    0% 202.6us     1  203us(1013us)    0us                     3652  100%    0%  pid_hash+0x8660
          0%                                 0us                       18  100%    0%    copy_files+0xd4
          0%                                 0us                       49  100%    0%    internal_add_timer+0x98
          0%                                 0us                     1130  100%    0%    sys_capset+0x64
          0%                                 0us                     2455  100%    0%    sys_personality+0x30

 93.4% 0.08%   7.9us     5  354us( 232ms)    0us                  3151445  100% 0.08%  pid_hash+0x8a40
          0%                                 0us                      639  100%    0%    lease_alloc+0x30
          0%                                 0us                        7  100%    0%    setup_swap_extents+0xa8
       0.08%                                 0us                  2264074  100% 0.08%    try_to_unuse+0xb8
       0.08%                                 0us                   886725  100% 0.08%    try_to_unuse+0x2a8

 1620%    0%   7.5us    15 2027us( 232ms)    0us                 13185756  100%    0%  serial_pci_tbl+0xb8c
          0%                                 0us                 13185756  100%    0%    mm_init+0xb8

          0%                                 0us                    20917  100%    0%  buffered_rmqueue+0x38
          0%                                 0us                    16081  100%    0%  find_local_symbol+0x8
          0%                                 0us                  2513995  100%    0%  get_chrfops+0x15c
          0%                                 0us                    45266  100%    0%  proc_pid_stat+0x120
       0.05%                                 0us                  2864093  100% 0.05%  sys_getgroups16+0x5c
       0.01%                                 0us                    76659  100% 0.01%  sys_setgroups16+0x64
          0%                                 0us                  4447143  100%    0%  sys_swapon+0x16c

- - - - - - - - - - -  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RWLOCK WRITES     HOLD           WAIT (ALL)           WAIT (WW) 
  UTIL  CON    MEAN(  MAX )   MEAN(  MAX )( %CPU)   MEAN(  MAX )     TOTAL NOWAIT SPIN(  WW )  NAME

          0%                   0us                   0us          16216231  100%    0%(   0%)  *TOTAL*

    0%    0%                   0us                   0us                 1  100%    0%(   0%)  [0xe0fff12c]
    0%    0%                   0us                   0us                 1  100%    0%(   0%)    task_name+0xb0

    0%    0%                   0us                   0us                 3  100%    0%(   0%)  [0xee6ae8a4]
    0%    0%                   0us                   0us                 1  100%    0%(   0%)    generic_shutdown_super+0x84
    0%    0%                   0us                   0us                 1  100%    0%(   0%)    locks_wake_up_blocks+0x58
    0%    0%                   0us                   0us                 1  100%    0%(   0%)    schedule+0x278

    0%    0%                   0us                   0us                 4  100%    0%(   0%)  [0xee78d7c4]
    0%    0%                   0us                   0us                 4  100%    0%(   0%)    free_dma+0x1c

    0%    0%                   0us                   0us            428518  100%    0%(   0%)  __per_cpu_end+0x1c8
    0%    0%                   0us                   0us            428518  100%    0%(   0%)    posix_test_lock+0x28

    0%    0%                   0us                   0us                35  100%    0%(   0%)  cpu_devices+0xb48
    0%    0%                   0us                   0us                35  100%    0%(   0%)    fn_hash_delete+0x1a8

    0%    0%                   0us                   0us                12  100%    0%(   0%)  memblk_devices+0x904
    0%    0%                   0us                   0us                 6  100%    0%(   0%)    __func__.1+0x130d8
    0%    0%                   0us                   0us                 6  100%    0%(   0%)    __func__.1+0x135a4

    0%    0%                   0us                   0us             51985  100%    0%(   0%)  pid_hash+0x8a40
    0%    0%                   0us                   0us             51985  100%    0%(   0%)    __kill_fasync+0x1c

    0%    0%                   0us                   0us                 6  100%    0%(   0%)  __func__.1+0x13104
    0%    0%                   0us                   0us                 6  100%    0%(   0%)  __func__.1+0x13d24
    0%    0%                   0us                   0us                21  100%    0%(   0%)  badness+0x30
    0%    0%                   0us                   0us            338541  100%    0%(   0%)  badness+0xdc
    0%    0%                   0us                   0us                 6  100%    0%(   0%)  de_thread+0x25c
    0%    0%                   0us                   0us            447544  100%    0%(   0%)  de_thread+0x30
    0%    0%                   0us                   0us               105  100%    0%(   0%)  fn_hash_delete+0x220
    0%    0%                   0us                   0us                 4  100%    0%(   0%)  generic_shutdown_super+0x28
    0%    0%                   0us                   0us           1825042  100%    0%(   0%)  get_swap_page+0x8c
    0%    0%                   0us                   0us                20  100%    0%(   0%)  ifind+0x5c
    0%    0%                   0us                   0us                25  100%    0%(   0%)  inode_change_ok+0x98
    0%    0%                   0us                   0us                 2  100%    0%(   0%)  lock_get_status+0x54
    0%    0%                   0us                   0us            392487  100%    0%(   0%)  param_get_intarray+0x5c
    0%    0%                   0us                   0us                20  100%    0%(   0%)  param_set_copystring+0x4c
    0%    0%                   0us                   0us            510098  100%    0%(   0%)  proc_pid_follow_link+0x38
    0%    0%                   0us                   0us           2063912  100%    0%(   0%)  proc_pid_maps_get_line+0x128
    0%    0%                   0us                   0us           2063900  100%    0%(   0%)  proc_pid_maps_get_line+0x48
    0%    0%                   0us                   0us           2332202  100%    0%(   0%)  proc_pid_status+0x120
    0%    0%                   0us                   0us           2105301  100%    0%(   0%)  proc_pid_status+0x184
    0%    0%                   0us                   0us           1492685  100%    0%(   0%)  remove_exclusive_swap_page+0xc
    0%    0%                   0us                   0us           1492406  100%    0%(   0%)  swap_entry_free+0x8
    0%    0%                   0us                   0us            332694  100%    0%(   0%)  swap_info_get+0xe0
    0%    0%                   0us                   0us            338646  100%    0%(   0%)  sys_fchown16+0x24
_________________________________________________________________________________________________________________________
Number of read locks found=10

Hanna, I suspect you're not to blame, but rather the global lock...


-- wli


* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15  9:47                       ` William Lee Irwin III
@ 2003-03-15 11:58                         ` William Lee Irwin III
  2003-03-15 12:08                           ` Andrew Morton
  0 siblings, 1 reply; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15 11:58 UTC (permalink / raw)
  To: Andrew Morton, bzzz, adilger, linux-kernel, ext2-devel, hawkes, hannal

[-- Attachment #1: brief message --]
[-- Type: text/plain, Size: 640 bytes --]

On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
>> Next pass involves lockmeter:

On Sat, Mar 15, 2003 at 01:47:58AM -0800, William Lee Irwin III wrote:
> Throughput 39.2014 MB/sec 128 procs
> dbench 128  142.51s user 10828.91s system 964% cpu 18:57.88 total
> That's an 83% reduction in throughput from applying lockmeter.
> Um, somebody should look into this. The thing is a bloody doorstop:

Okay, dump_stack() every once in a while when we schedule() in down().

No good ideas on how to script the results so that I'd have the foggiest
idea of who the bad guy is. Gzipped and MIME-attached (sorry!) for space
reasons.
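
As an illustration of the sort of instrumentation being described
(an invented sketch, not the actual debugging patch; the counter and
the modulus are arbitrary):

	/* Invented sketch: once in a while, dump the stack of a task
	 * that is about to sleep in the down() slow path, so that the
	 * hottest sleep sites show up in the log.  The bare counter is
	 * racy, which is acceptable for a throwaway diagnostic. */
	static int down_sleep_count;

	/* ... immediately before the schedule() in the slow path ... */
	if (++down_sleep_count % 1024 == 0)
		dump_stack();
	schedule();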


-- wli

[-- Attachment #2: sem.log.gz --]
[-- Type: application/octet-stream, Size: 5767 bytes --]


* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15 11:58                         ` William Lee Irwin III
@ 2003-03-15 12:08                           ` Andrew Morton
  2003-03-15 12:25                             ` William Lee Irwin III
  0 siblings, 1 reply; 39+ messages in thread
From: Andrew Morton @ 2003-03-15 12:08 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: bzzz, adilger, linux-kernel, ext2-devel, hawkes, hannal

William Lee Irwin III <wli@holomorphy.com> wrote:
>
> On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
> >> Next pass involves lockmeter:
> 
> On Sat, Mar 15, 2003 at 01:47:58AM -0800, William Lee Irwin III wrote:
> > Throughput 39.2014 MB/sec 128 procs
> > dbench 128  142.51s user 10828.91s system 964% cpu 18:57.88 total
> > That's an 83% reduction in throughput from applying lockmeter.
> > Um, somebody should look into this. The thing is a bloody doorstop:
> 
> Okay, dump_stack() every once in a while when we schedule() in down().

Thanks.

> No good ideas on how to script the results so that I'd have the foggiest
> idea of who the bad guy is. Gzipped and MIME-attached (sorry!) for space
> reasons.

lock_super() in the ext2 inode allocator mainly.  It needs the same treatment.

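
Presumably "the same treatment" means what the block allocator patch
did: trade the sleeping superblock lock for a spinlock around the
free-inode bookkeeping. A hypothetical sketch of the shape of such a
change (the real ialloc.c code differs; the lock name is taken from the
patch under discussion):

	/* Hypothetical sketch: update the inode-allocation counters
	 * under a spinlock instead of lock_super(). */
	spin_lock(&sbi->s_alloc_lock);
	es->s_free_inodes_count =
		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
	desc->bg_free_inodes_count =
		cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) - 1);
	spin_unlock(&sbi->s_alloc_lock);
	mark_buffer_dirty(sbi->s_sbh);
	mark_buffer_dirty(bh);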


* Re: [PATCH] concurrent block allocation for ext2 against 2.5.64
  2003-03-15 12:08                           ` Andrew Morton
@ 2003-03-15 12:25                             ` William Lee Irwin III
  0 siblings, 0 replies; 39+ messages in thread
From: William Lee Irwin III @ 2003-03-15 12:25 UTC (permalink / raw)
  To: Andrew Morton; +Cc: bzzz, adilger, linux-kernel, ext2-devel, hawkes, hannal

On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
>> Okay, dump_stack() every once in a while when we schedule() in down().

On Sat, Mar 15, 2003 at 04:08:19AM -0800, Andrew Morton wrote:
> Thanks.

No problem. I think we found a number of things that will help everyone.


On Sat, Mar 15, 2003 at 12:24:31AM -0800, William Lee Irwin III wrote:
>> No good ideas on how to script the results so that I'd have the
>> foggiest idea of who the bad guy is. Gzipped and MIME-attached (sorry!)
>> for space reasons.

On Sat, Mar 15, 2003 at 04:08:19AM -0800, Andrew Morton wrote:
> lock_super() in the ext2 inode allocator mainly.  It needs the same treatment.

Terrific! Not only have we resolved the 16x ext2 contention issues, we've
also identified a clear direction for 32x!

Go fs hackers go! First 2.5 VM, now 2.6/2.7 VFS. What can't you do?


-- wli


Thread overview: 39+ messages
2003-03-13  8:55 [PATCH] concurrent block allocation for ext2 against 2.5.64 Alex Tomas
2003-03-13  9:58 ` Andrew Morton
2003-03-13 19:17   ` Alex Tomas
2003-03-13 22:25     ` Andrew Morton
2003-03-13 23:03       ` Andreas Dilger
2003-03-13 23:10         ` Andrew Morton
2003-03-13 23:03       ` Alex Tomas
2003-03-13 23:25         ` Andrew Morton
2003-03-13 23:56     ` Andreas Dilger
2003-03-14  7:20       ` Alex Tomas
2003-03-14 20:59         ` Andreas Dilger
2003-03-14 21:14           ` Alex Tomas
2003-03-15  4:37         ` William Lee Irwin III
2003-03-15  4:54           ` Andrew Morton
2003-03-15  5:30             ` William Lee Irwin III
2003-03-15  5:43               ` Martin J. Bligh
2003-03-15  5:50                 ` William Lee Irwin III
2003-03-15  5:49             ` William Lee Irwin III
2003-03-15  6:20               ` William Lee Irwin III
2003-03-15  6:44                 ` Andrew Morton
2003-03-15  7:05                   ` William Lee Irwin III
2003-03-15  8:24                     ` William Lee Irwin III
2003-03-15  9:47                       ` William Lee Irwin III
2003-03-15 11:58                         ` William Lee Irwin III
2003-03-15 12:08                           ` Andrew Morton
2003-03-15 12:25                             ` William Lee Irwin III
2003-03-15  8:16                   ` [Ext2-devel] " Alex Tomas
2003-03-15  8:29                     ` William Lee Irwin III
2003-03-15  8:32                       ` Alex Tomas
2003-03-15  9:23                         ` William Lee Irwin III
2003-03-14 18:25       ` Martin J. Bligh
2003-03-14 19:30       ` [Ext2-devel] " Daniel Phillips
2003-03-14 19:55         ` Andrew Morton
2003-03-13 17:39 ` [Ext2-devel] " Andreas Dilger
2003-03-13 18:43   ` Alex Tomas
2003-03-13 19:09     ` Matthew Wilcox
2003-03-13 19:39       ` Andrew Morton
2003-03-13 19:23   ` Theodore Ts'o
2003-03-13 19:44     ` Andreas Dilger
