linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Lightweight userspace semaphores...
@ 2002-02-23  3:47 Rusty Russell
  2002-02-23 15:03 ` Ingo Molnar
                   ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: Rusty Russell @ 2002-02-23  3:47 UTC (permalink / raw)
  To: torvalds, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III
  Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 14471 bytes --]

Hi all,

	There are several lightweight userspace semaphore patches
flying around, and I'll really like to meld into one good solution we
can include in 2.5.

	This version uses wli's multiplicitive hashing.  And it has
these yummy properties:

1) Interface is: open /dev/usem, pread, pwrite.
2) No initialization required: just tell the kernel "this is a
   semaphore: down/up it".
3) No in-kernel arch-specific code.
4) Locks in mmaped are persistent (including across reboots!).

Modifications for tdb to use this interface were pretty trivial (too
bad it proved no faster than fcntl locks, more investigation needed).

User library and kernel patch attached,
Rusty.
PS. map_usem() is ugly: is there a better way to pin pages?
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/Config.help working-2.5.5-usem/drivers/char/Config.help
--- linux-2.5.5/drivers/char/Config.help	Thu Jan 31 08:17:08 2002
+++ working-2.5.5-usem/drivers/char/Config.help	Thu Feb 21 12:33:46 2002
@@ -1153,3 +1153,9 @@
 
   Not sure? It's safe to say N.
 
+CONFIG_USERSEM
+  Say Y here to have support for fast userspace semaphores, or M to
+  compile it as a module called usersem.o.
+
+  If unsure, say Y: everyone else will and you wouldn't want to miss
+  out.
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/Config.in working-2.5.5-usem/drivers/char/Config.in
--- linux-2.5.5/drivers/char/Config.in	Mon Feb 11 15:17:18 2002
+++ working-2.5.5-usem/drivers/char/Config.in	Thu Feb 21 12:33:46 2002
@@ -227,4 +227,5 @@
    tristate 'ACP Modem (Mwave) support' CONFIG_MWAVE
 fi
 
+dep_tristate 'Fast userspace semaphore support (EXPERIMENTAL)' CONFIG_USERSEM $CONFIG_EXPERIMENTAL
 endmenu
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/Makefile working-2.5.5-usem/drivers/char/Makefile
--- linux-2.5.5/drivers/char/Makefile	Mon Feb 11 15:17:18 2002
+++ working-2.5.5-usem/drivers/char/Makefile	Thu Feb 21 12:33:46 2002
@@ -229,6 +229,7 @@
 obj-$(CONFIG_SH_WDT) += shwdt.o
 obj-$(CONFIG_EUROTECH_WDT) += eurotechwdt.o
 obj-$(CONFIG_SOFT_WATCHDOG) += softdog.o
+obj-$(CONFIG_USERSEM) += usersem.o
 
 subdir-$(CONFIG_MWAVE) += mwave
 ifeq ($(CONFIG_MWAVE),y)
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/usersem.c working-2.5.5-usem/drivers/char/usersem.c
--- linux-2.5.5/drivers/char/usersem.c	Thu Jan  1 10:00:00 1970
+++ working-2.5.5-usem/drivers/char/usersem.c	Sat Feb 23 14:18:28 2002
@@ -0,0 +1,244 @@
+/*
+ *  User-accessible semaphores.
+ *  (C) Rusty Russell, IBM 2002
+ *
+ *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
+ *  enough at me, Linus for the original (flawed) idea, Matthew
+ *  Kirkwood for proof-of-concept implementation.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/devfs_fs_kernel.h>
+#include <asm/uaccess.h>
+
+struct usem
+{
+	atomic_t count;
+	unsigned int sleepers;
+};
+
+static spinlock_t usem_lock = SPIN_LOCK_UNLOCKED;
+
+/* FIXME: This may be way too small. --RR */
+#define USEM_HASHBITS 6
+/* The key for the hash is the address + index + offset within page */
+static wait_queue_head_t usem_waits[1<<USEM_HASHBITS];
+
+static inline wait_queue_head_t *hash_usem(const struct page *page,
+					   unsigned long offset)
+{
+	unsigned long h;
+
+	/* Hash based on inode number (if page is backed), has chance
+	   of persistence across reboots on sane filesystems. */
+	if (!page->mapping) h = (unsigned long)page;
+	else h = page->mapping->host->i_ino;
+
+	return &usem_waits[hash_long(h + page->index + offset, USEM_HASHBITS)];
+}
+
+/* Must be holding mmap_sem */
+static inline int pin_page(unsigned long upage_start, struct page **page)
+{
+	return get_user_pages(current, current->mm, upage_start,
+			      1 /* one page */,
+			      1 /* writable */,
+			      0 /* don't force */,
+			      page,
+			      NULL /* don't return vmas */);
+}
+
+/* Get kernel address of the two variables, and pin them in. */
+static int map_usem(unsigned long uaddr,
+		    atomic_t **count,
+		    unsigned int **sleepers,
+		    struct page *pages[2])
+{
+	struct mm_struct *mm = current->mm;
+	unsigned int num_pages;
+	unsigned long upage_start;
+	int err;
+
+	upage_start = (uaddr & PAGE_MASK);
+
+	/* Most times, whole thing on one page */
+	if (((uaddr + sizeof(struct usem) - 1) & PAGE_MASK) == upage_start) {
+		num_pages = 1;
+		pages[1] = NULL;
+	} else {
+		num_pages = 2;
+		/* ... otherwise, page boundary must be between them */
+		if ((uaddr + offsetof(struct usem, sleepers))%PAGE_SIZE != 0)
+			return -EINVAL;
+	}
+
+	down_read(&mm->mmap_sem);
+	/* Pin first page */
+	err = pin_page(upage_start, &pages[0]);
+	if (num_pages == 2 && err == 1) {
+		/* Pin second page */
+		err = pin_page(upage_start + PAGE_SIZE, &pages[1]);
+		if (err < 0)
+			put_page(pages[0]);
+	}
+	up_read(&mm->mmap_sem);
+	if (err < 0)
+		return err;
+
+	/* Set up pointers into pinned page(s) */
+	*count = page_address(pages[0]) + (uaddr%PAGE_SIZE);
+	uaddr += offsetof(struct usem, sleepers);
+	if (num_pages == 1)
+		*sleepers = page_address(pages[0]) + (uaddr%PAGE_SIZE);
+	else /* sleepers is on second page */
+		*sleepers = page_address(pages[1]) + (uaddr%PAGE_SIZE);
+	return 0;
+}
+
+/* Unpin the variables */
+static void unmap_usem(struct page *pages[2])
+{
+	put_page(pages[0]);
+	if (pages[1]) put_page(pages[1]);
+}
+
+/* Stolen from arch/i386/kernel/semaphore.c. */
+static int __usem_down(wait_queue_head_t *wq,
+		       atomic_t *count,
+		       unsigned int *sleepers)
+{
+	int retval = 0;
+	DECLARE_WAITQUEUE(wait, current);
+
+	current->state = TASK_INTERRUPTIBLE;
+	add_wait_queue_exclusive(wq, &wait);
+
+	spin_lock(&usem_lock);
+	(*sleepers)++;
+	for (;;) {
+		unsigned int sl = *sleepers;
+
+		/* With signals pending, this turns into the trylock
+		 * failure case - we won't be sleeping, and we can't
+		 * get the lock as it has contention. Just correct the
+		 * count and exit.  */
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			*sleepers = 0;
+			atomic_add(sl, count);
+			break;
+		}
+
+		/* Add "everybody else" into it. They aren't playing,
+		 * because we own the spinlock. The "-1" is because
+		 * we're still hoping to get the lock.  */
+		if (!atomic_add_negative(sl - 1, count)) {
+			*sleepers = 0;
+			break;
+		}
+		*sleepers = 1;	/* us - see -1 above */
+		spin_unlock(&usem_lock);
+
+		schedule();
+		current->state = TASK_INTERRUPTIBLE;
+		spin_lock(&usem_lock);
+	}
+	spin_unlock(&usem_lock);
+	current->state = TASK_RUNNING;
+	remove_wait_queue(wq, &wait);
+
+	/* Wake up everyone else. */
+	wake_up(wq);
+	return retval;
+}
+
+/* aka "down" */
+static ssize_t usem_read(struct file *f, char *u, size_t c, loff_t *ppos)
+{
+	struct page *pages[2];
+	wait_queue_head_t *wqhead;
+	atomic_t *count;
+	unsigned int *sleepers;
+	int ret;
+
+	/* Must not vanish underneath us. */
+	ret = map_usem(*ppos, &count, &sleepers, pages);
+	if (ret < 0)
+		return ret;
+	wqhead = hash_usem(pages[0], *ppos % PAGE_SIZE);
+	ret = __usem_down(wqhead, count, sleepers);
+	unmap_usem(pages);
+	return 0;
+}
+	
+
+/* aka "up" */
+static ssize_t
+usem_write(struct file *f, const char *u, size_t c, loff_t *ppos)
+{
+	struct page *pages[2];
+	wait_queue_head_t *wqhead;
+	atomic_t *count;
+	unsigned int *sleepers;
+	int ret;
+
+	/* Must not vanish underneath us: even if they do an up
+           without a down (userspace bug). */
+	ret = map_usem(*ppos, &count, &sleepers, pages);
+	if (ret < 0)
+		return ret;
+	wqhead = hash_usem(pages[0], *ppos % PAGE_SIZE);
+	wake_up(wqhead);
+	unmap_usem(pages);
+	return ret;
+}
+
+static struct file_operations usem_fops = {
+	owner:		THIS_MODULE,
+	read:		usem_read,
+	write:		usem_write,
+};
+
+static int usem_major;
+static devfs_handle_t usem_dev;
+
+int __init init(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(usem_waits); i++)
+		init_waitqueue_head(&usem_waits[i]);
+	usem_major = devfs_register_chrdev(0, "usem", &usem_fops);
+	usem_dev = devfs_register(NULL, "usem", DEVFS_FL_NONE, usem_major,
+				  0, S_IFCHR | 0666, &usem_fops, NULL);
+	return 0;
+}
+
+void __exit fini(void)
+{
+	devfs_unregister(usem_dev);
+	devfs_unregister_chrdev(usem_major, "usem");
+}
+
+MODULE_LICENSE("GPL");
+module_init(init);
+module_exit(fini);
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/include/linux/hash.h working-2.5.5-usem/include/linux/hash.h
--- linux-2.5.5/include/linux/hash.h	Thu Jan  1 10:00:00 1970
+++ working-2.5.5-usem/include/linux/hash.h	Thu Feb 21 12:33:46 2002
@@ -0,0 +1,58 @@
+#ifndef _LINUX_HASH_H
+#define _LINUX_HASH_H
+/* Fast hashing routine for a long.
+   (C) 2002 William Lee Irwin III, IBM */
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+#if BITS_PER_LONG == 32
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e370001UL
+#elif BITS_PER_LONG == 64
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
+#else
+#error Define GOLDEN_RATIO_PRIME for your wordsize.
+#endif
+
+static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+{
+	unsigned long hash = val;
+
+#if BITS_PER_LONG == 64
+	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
+	unsigned long n = hash;
+	n <<= 18;
+	hash -= n;
+	n <<= 33;
+	hash -= n;
+	n <<= 3;
+	hash += n;
+	n <<= 3;
+	hash -= n;
+	n <<= 4;
+	hash += n;
+	n <<= 2;
+	hash += n;
+#else
+	/* On some cpus multiply is faster, on others gcc will do shifts */
+	hash *= GOLDEN_RATIO_PRIME;
+#endif
+
+	/* High bits are more random, so use them. */
+	return hash >> (BITS_PER_LONG - bits);
+}
+	
+static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
+{
+	return hash_long((unsigned long)ptr, bits);
+}
+#endif /* _LINUX_HASH_H */
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/include/linux/mmzone.h working-2.5.5-usem/include/linux/mmzone.h
--- linux-2.5.5/include/linux/mmzone.h	Wed Feb 20 16:07:17 2002
+++ working-2.5.5-usem/include/linux/mmzone.h	Thu Feb 21 12:46:30 2002
@@ -51,8 +51,7 @@
 	/*
 	 * wait_table		-- the array holding the hash table
 	 * wait_table_size	-- the size of the hash table array
-	 * wait_table_shift	-- wait_table_size
-	 * 				== BITS_PER_LONG (1 << wait_table_bits)
+	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
 	 *
 	 * The purpose of all these is to keep track of the people
 	 * waiting for a page to become available and make them
@@ -75,7 +74,7 @@
 	 */
 	wait_queue_head_t	* wait_table;
 	unsigned long		wait_table_size;
-	unsigned long		wait_table_shift;
+	unsigned long		wait_table_bits;
 
 	/*
 	 * Discontig memory support fields.
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/kernel/ksyms.c working-2.5.5-usem/kernel/ksyms.c
--- linux-2.5.5/kernel/ksyms.c	Wed Feb 20 16:07:17 2002
+++ working-2.5.5-usem/kernel/ksyms.c	Thu Feb 21 16:48:59 2002
@@ -86,6 +86,7 @@
 EXPORT_SYMBOL(do_munmap);
 EXPORT_SYMBOL(do_brk);
 EXPORT_SYMBOL(exit_mm);
+EXPORT_SYMBOL(get_user_pages);
 
 /* internal kernel memory management */
 EXPORT_SYMBOL(_alloc_pages);
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/mm/filemap.c working-2.5.5-usem/mm/filemap.c
--- linux-2.5.5/mm/filemap.c	Wed Feb 20 16:07:17 2002
+++ working-2.5.5-usem/mm/filemap.c	Thu Feb 21 13:01:02 2002
@@ -25,6 +25,7 @@
 #include <linux/iobuf.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/hash.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -773,32 +774,8 @@
 static inline wait_queue_head_t *page_waitqueue(struct page *page)
 {
 	const zone_t *zone = page_zone(page);
-	wait_queue_head_t *wait = zone->wait_table;
-	unsigned long hash = (unsigned long)page;
 
-#if BITS_PER_LONG == 64
-	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
-	unsigned long n = hash;
-	n <<= 18;
-	hash -= n;
-	n <<= 33;
-	hash -= n;
-	n <<= 3;
-	hash += n;
-	n <<= 3;
-	hash -= n;
-	n <<= 4;
-	hash += n;
-	n <<= 2;
-	hash += n;
-#else
-	/* On some cpus multiply is faster, on others gcc will do shifts */
-	hash *= GOLDEN_RATIO_PRIME;
-#endif
-
-	hash >>= zone->wait_table_shift;
-
-	return &wait[hash];
+	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
 }
 
 /* 
diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/mm/page_alloc.c working-2.5.5-usem/mm/page_alloc.c
--- linux-2.5.5/mm/page_alloc.c	Wed Feb 20 16:07:17 2002
+++ working-2.5.5-usem/mm/page_alloc.c	Thu Feb 21 12:33:46 2002
@@ -776,8 +776,8 @@
 		 * per zone.
 		 */
 		zone->wait_table_size = wait_table_size(size);
-		zone->wait_table_shift =
-			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
+		zone->wait_table_bits =
+			wait_table_bits(zone->wait_table_size);
 		zone->wait_table = (wait_queue_head_t *)
 			alloc_bootmem_node(pgdat, zone->wait_table_size
 						* sizeof(wait_queue_head_t));


[-- Attachment #2: usem.tar.gz --]
[-- Type: application/octet-stream, Size: 10098 bytes --]

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23  3:47 [PATCH] Lightweight userspace semaphores Rusty Russell
@ 2002-02-23 15:03 ` Ingo Molnar
  2002-02-23 18:20   ` Linus Torvalds
  2002-02-24 23:29   ` Rusty Russell
  2002-02-25 15:00 ` Hubertus Franke
  2002-02-27  0:24 ` Rusty Russell
  2 siblings, 2 replies; 47+ messages in thread
From: Ingo Molnar @ 2002-02-23 15:03 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Linus Torvalds, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel


On Sat, 23 Feb 2002, Rusty Russell wrote:

> 1) Interface is: open /dev/usem, pread, pwrite.

i like the patch, but the interface is ugly IMO. Why not new syscalls? I
think these lightweight semaphores will become an important part of Linux,
so having their own syscall entries is the most correct interface,
something like:

  sys_sem_create()
  sys_sem_destroy()
  sys_sem_down()
  sys_sem_up()

/dev/usem is such an ... ioctl()-ish approach. It's a scalability problem
as well: read()/write() has (or can have) some implicit locking that is
imposed on the usem interface as well. It's a usage robustness issue as
well: chroot() environments that have no /dev directory will suddenly lose
a functionality of Linux. There is absolutely no problem with adding new
syscalls for things like this.

Plus sys_sem_create() should do some proper resource limit management,
pinning down an unlimited number of pages is bad.

	Ingo


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23 15:03 ` Ingo Molnar
@ 2002-02-23 18:20   ` Linus Torvalds
  2002-02-23 18:28     ` Larry McVoy
  2002-02-26 16:09     ` Hubertus Franke
  2002-02-24 23:29   ` Rusty Russell
  1 sibling, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2002-02-23 18:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Rusty Russell, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel



On Sat, 23 Feb 2002, Ingo Molnar wrote:
>
> On Sat, 23 Feb 2002, Rusty Russell wrote:
>
> > 1) Interface is: open /dev/usem, pread, pwrite.
>
> i like the patch, but the interface is ugly IMO. Why not new syscalls?

I agree - I dislike magic device files a whole lot more than just abotu
every alternative.

Also, one thing possibly worth looking into is to just put the actual
semaphore contents into a regular file backed setup.

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23 18:20   ` Linus Torvalds
@ 2002-02-23 18:28     ` Larry McVoy
  2002-02-23 20:31       ` Ingo Molnar
  2002-02-23 21:22       ` Alan Cox
  2002-02-26 16:09     ` Hubertus Franke
  1 sibling, 2 replies; 47+ messages in thread
From: Larry McVoy @ 2002-02-23 18:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Rusty Russell, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel

> Also, one thing possibly worth looking into is to just put the actual
> semaphore contents into a regular file backed setup.
> 
> 		Linus

Exactly.  SMP gives you coherent memory and test-and-set or some other
atomic operation.  Why not use it?
-- 
---
Larry McVoy            	 lm at bitmover.com           http://www.bitmover.com/lm 

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23 18:28     ` Larry McVoy
@ 2002-02-23 20:31       ` Ingo Molnar
  2002-02-23 21:22       ` Alan Cox
  1 sibling, 0 replies; 47+ messages in thread
From: Ingo Molnar @ 2002-02-23 20:31 UTC (permalink / raw)
  To: Larry McVoy
  Cc: Linus Torvalds, Rusty Russell, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel


On Sat, 23 Feb 2002, Larry McVoy wrote:

> Exactly.  SMP gives you coherent memory and test-and-set or some other
> atomic operation.  Why not use it?

the userspace library side does it. The kernel patch is the slowpath, the
fast path (no contention) happens in user-space, using SMP-atomic
instructions. It's all very nice and lightweight.

also as far as i can see, this implementation enables semaphores to live
anywhere within the VM, the /dev/usem is just a hack to communicate this
VM address to the kernel-space code. So i think the patch's concepts are
really nice, except the interface cleanliness issue which shouldnt be too
hard to fix - adding new syscalls is pleasant work anyway :-)

	Ingo


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23 18:28     ` Larry McVoy
  2002-02-23 20:31       ` Ingo Molnar
@ 2002-02-23 21:22       ` Alan Cox
  1 sibling, 0 replies; 47+ messages in thread
From: Alan Cox @ 2002-02-23 21:22 UTC (permalink / raw)
  To: Larry McVoy
  Cc: Linus Torvalds, Ingo Molnar, Rusty Russell, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel

> Exactly.  SMP gives you coherent memory and test-and-set or some other
> atomic operation.  Why not use it?

Coherent memory on some platforms, locks on some platforms. Fortunately both
on several important architectures. It needs a much cleaner API but in
user space to wrap the user mode/kernel mode mixed locks. You need the
kernel side for sleeping cases

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 19:31                             ` Linus Torvalds
@ 2002-02-24  4:57                               ` Daniel Phillips
  0 siblings, 0 replies; 47+ messages in thread
From: Daniel Phillips @ 2002-02-24  4:57 UTC (permalink / raw)
  To: Linus Torvalds, Alan Cox
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel, Larry McVoy

On February 25, 2002 08:31 pm, Linus Torvalds wrote:
> On Mon, 25 Feb 2002, Alan Cox wrote:
> > Ok I see where you are coming from now -- that makes sense for a few 
> > cases.
> 
> Note that the "few cases" in question imply _all_ of the current broken 
> library spinlocks, for example. 
> 
> Don't think "POSIX semaphores", but think "fast locking" in the general
> case. I will bet you $1 in small change that most normal locking by far is
> for the kind of thread-safe stuff libc does right now.

This looks like another piece of the equation needed to make Larry's smp 
cluster concept come true.  So...

-- 
Daniel

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23 15:03 ` Ingo Molnar
  2002-02-23 18:20   ` Linus Torvalds
@ 2002-02-24 23:29   ` Rusty Russell
  2002-02-24 23:48     ` Linus Torvalds
  1 sibling, 1 reply; 47+ messages in thread
From: Rusty Russell @ 2002-02-24 23:29 UTC (permalink / raw)
  To: mingo
  Cc: Linus Torvalds, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel

In message <Pine.LNX.4.33.0202231551300.4173-100000@localhost.localdomain> you 
write:
> 
> On Sat, 23 Feb 2002, Rusty Russell wrote:
> 
> > 1) Interface is: open /dev/usem, pread, pwrite.
> 
> i like the patch, but the interface is ugly IMO. Why not new syscalls? I
> think these lightweight semaphores will become an important part of Linux,
> so having their own syscall entries is the most correct interface,
> something like:
> 
>   sys_sem_create()
>   sys_sem_destroy()

There is no create and destroy (init is purely userspace).  There is
"this is a semapore: up it".  This is a feature.

>   sys_sem_down()
>   sys_sem_up()
> 
> /dev/usem is such an ... ioctl()-ish approach. It's a scalability problem
> as well: read()/write() has (or can have) some implicit locking that is
> imposed on the usem interface as well.

Agreed with implicit locking: good catch.  Disagree with neatness: I
like finding out in advance that there's no fast semaphore support.

> Plus sys_sem_create() should do some proper resource limit management,
> pinning down an unlimited number of pages is bad.

Since pages are pinned "on demand" and a process can only do one
syscall at a time, the maximum number of pinned pages per process ==
2.  Which is fine.

Will do syscall version, and see if I can actually get it to beat
fcntl locking on reasonable benchmarks (ie. tdbtorture).

Cheers!
Rusty.
PS.  Nomenclature: my fiance suggested FUS (Fast Userspace
     Semaphores), and I am legally obliged to agree.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-24 23:29   ` Rusty Russell
@ 2002-02-24 23:48     ` Linus Torvalds
  2002-02-25  1:10       ` Rusty Russell
  2002-03-02 14:54       ` Pavel Machek
  0 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2002-02-24 23:48 UTC (permalink / raw)
  To: Rusty Russell
  Cc: mingo, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel



On Mon, 25 Feb 2002, Rusty Russell wrote:
> >
> >   sys_sem_create()
> >   sys_sem_destroy()
>
> There is no create and destroy (init is purely userspace).  There is
> "this is a semapore: up it".  This is a feature.

No, that's a bug.

You have to realize that there are architectures that need special
initialization and page allocation for semaphores: they need special flags
in the TLB for "careful access", for example (sometimes the careful access
ends up being non-cached).

You can't just put semaphores anywhere and tell the kernel to try to fix
up whatever happened.

			Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-24 23:48     ` Linus Torvalds
@ 2002-02-25  1:10       ` Rusty Russell
  2002-02-25  1:23         ` Linus Torvalds
  2002-03-02 14:54       ` Pavel Machek
  1 sibling, 1 reply; 47+ messages in thread
From: Rusty Russell @ 2002-02-25  1:10 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: mingo, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel

In message <Pine.LNX.4.33.0202241543550.28708-100000@home.transmeta.com> you wr
ite:
> 
> 
> On Mon, 25 Feb 2002, Rusty Russell wrote:
> > >
> > >   sys_sem_create()
> > >   sys_sem_destroy()
> >
> > There is no create and destroy (init is purely userspace).  There is
> > "this is a semapore: up it".  This is a feature.
> 
> You have to realize that there are architectures that need special
> initialization and page allocation for semaphores: they need special flags
> in the TLB for "careful access", for example (sometimes the careful access
> ends up being non-cached).

Bugger.  How about:

	sys_sem_area(void *pagestart, size_t len)
	sys_unsem_area(void *pagestart, size_t len)

Is that sufficient?  Is sys_unsem_area required at all?

TDB has an arbitrary number of semaphores in the mmap file...
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25  1:10       ` Rusty Russell
@ 2002-02-25  1:23         ` Linus Torvalds
  2002-02-25 13:14           ` Alan Cox
  2002-03-01  4:56           ` Eric W. Biederman
  0 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2002-02-25  1:23 UTC (permalink / raw)
  To: Rusty Russell
  Cc: mingo, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel



On Mon, 25 Feb 2002, Rusty Russell wrote:
>
> Bugger.  How about:
>
> 	sys_sem_area(void *pagestart, size_t len)
> 	sys_unsem_area(void *pagestart, size_t len)
>
> Is that sufficient?  Is sys_unsem_area required at all?

The above is sufficient, but I would personally actually prefer an
interface more like

	fd = sem_initialize();
	mmap(fd, ...)
	..
	munmap(..)

which gives you a handle for the semaphore.

Note that getting a file descriptor is really quite useful - it means that
you can pass the file descriptor around through unix domain sockets, for
example, and allow sharing of the semaphore across unrelated processes
that way.

Also, the fd thus acts as an "anchor" for any in-kernel data structures
that the semaphore implementation may (or may not) want to use. Think of
it as your /dev/usem, except with a more explicit interface.

And make the initial mmap() only do a limited number of pages, so that
people don't start trying to allocate tons of memory this way.-

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25  1:23         ` Linus Torvalds
@ 2002-02-25 13:14           ` Alan Cox
  2002-02-25 16:11             ` Linus Torvalds
  2002-03-01  4:56           ` Eric W. Biederman
  1 sibling, 1 reply; 47+ messages in thread
From: Alan Cox @ 2002-02-25 13:14 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel

> 	fd = sem_initialize();
> 	mmap(fd, ...)
> 	..
> 	munmap(..)
> 
> which gives you a handle for the semaphore.

	fd = open("/dev/shm/sem....");
	mmap(fd, ...)

	munmap(..)

That lets the kernel decide what it wants to do and provide as the back
end. It allows you to think about things like mremap and growing/shrinking
the object. And finally /dev/sem looks suspiciously like a quick tweak
to /dev/shm..

> And make the initial mmap() only do a limited number of pages, so that
> people don't start trying to allocate tons of memory this way.-

If it uses the /dev/shm world then anyone running a kernel with the new
patches for resource accounting is still safe, and anyone else is still
simply going to find their shm areas hitting swap under extreme load (which
is ideal)


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23  3:47 [PATCH] Lightweight userspace semaphores Rusty Russell
  2002-02-23 15:03 ` Ingo Molnar
@ 2002-02-25 15:00 ` Hubertus Franke
  2002-03-01  4:44   ` Eric W. Biederman
  2002-02-27  0:24 ` Rusty Russell
  2 siblings, 1 reply; 47+ messages in thread
From: Hubertus Franke @ 2002-02-25 15:00 UTC (permalink / raw)
  To: Rusty Russell
  Cc: torvalds, Matthew Kirkwood, Benjamin LaHaise, David Axmark,
	William Lee Irwin III, linux-kernel

Rusty, since I supplied one of those packages available under lse.sourceforge.net
let me make some comments.

(a) why do you need to pin. I simply use the user level address (vaddr)  
    and hash with the <mm,vaddr> to obtain the internal object.
    This also gives me full protection through the general vmm mechanisms.
(b) I like the idea of mmap or shmat with special flags better than going 
    through a device driver.
    Let's make this a real extension rather than going through a device 
    interface. i.e. expand the sys call interface
(c) creation can be done on demand, that's what I do. If you never have 
    to synchronize through the kernel than you don't create the objects. 
    There should be an option to force explicite creation if desired.
(d) The kernel should simply provide waiting and wakeup functionality and 
    the bean counting should be done in user space. There is no need to 
    pin or crossmap.
(e) I really like to see multiple reader/single writer locks as well. I 
    implemented these 
(f) I also implemented convoy avoidance locks, spinning versions of 
    user semaphores.  All over the same simple interface.
    ulocks_wait(read_or_write) and ulocks_signal(read_or_write, num_to_wake_up).
    Ofcourse to cut down on the interface a single system call is enough.
(g) I have an extensive test program and regression test <ulockflex>
    that exercises the hell out of the userlevel approach. 
 
On Sat, Feb 23, 2002 at 02:47:25PM +1100, Rusty Russell wrote:
> Hi all,
> 
> 	There are several lightweight userspace semaphore patches
> flying around, and I'll really like to meld into one good solution we
> can include in 2.5.
> 
> 	This version uses wli's multiplicitive hashing.  And it has
> these yummy properties:
> 
> 1) Interface is: open /dev/usem, pread, pwrite.
> 2) No initialization required: just tell the kernel "this is a
>    semaphore: down/up it".
> 3) No in-kernel arch-specific code.
> 4) Locks in mmaped are persistent (including across reboots!).
> 
> Modifications for tdb to use this interface were pretty trivial (too
> bad it proved no faster than fcntl locks, more investigation needed).
> 
> User library and kernel patch attached,
> Rusty.
> PS. map_usem() is ugly: is there a better way to pin pages?
> --
>   Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
> 
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/Config.help working-2.5.5-usem/drivers/char/Config.help
> --- linux-2.5.5/drivers/char/Config.help	Thu Jan 31 08:17:08 2002
> +++ working-2.5.5-usem/drivers/char/Config.help	Thu Feb 21 12:33:46 2002
> @@ -1153,3 +1153,9 @@
>  
>    Not sure? It's safe to say N.
>  
> +CONFIG_USERSEM
> +  Say Y here to have support for fast userspace semaphores, or M to
> +  compile it as a module called usersem.o.
> +
> +  If unsure, say Y: everyone else will and you wouldn't want to miss
> +  out.
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/Config.in working-2.5.5-usem/drivers/char/Config.in
> --- linux-2.5.5/drivers/char/Config.in	Mon Feb 11 15:17:18 2002
> +++ working-2.5.5-usem/drivers/char/Config.in	Thu Feb 21 12:33:46 2002
> @@ -227,4 +227,5 @@
>     tristate 'ACP Modem (Mwave) support' CONFIG_MWAVE
>  fi
>  
> +dep_tristate 'Fast userspace semaphore support (EXPERIMENTAL)' CONFIG_USERSEM $CONFIG_EXPERIMENTAL
>  endmenu
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/Makefile working-2.5.5-usem/drivers/char/Makefile
> --- linux-2.5.5/drivers/char/Makefile	Mon Feb 11 15:17:18 2002
> +++ working-2.5.5-usem/drivers/char/Makefile	Thu Feb 21 12:33:46 2002
> @@ -229,6 +229,7 @@
>  obj-$(CONFIG_SH_WDT) += shwdt.o
>  obj-$(CONFIG_EUROTECH_WDT) += eurotechwdt.o
>  obj-$(CONFIG_SOFT_WATCHDOG) += softdog.o
> +obj-$(CONFIG_USERSEM) += usersem.o
>  
>  subdir-$(CONFIG_MWAVE) += mwave
>  ifeq ($(CONFIG_MWAVE),y)
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/drivers/char/usersem.c working-2.5.5-usem/drivers/char/usersem.c
> --- linux-2.5.5/drivers/char/usersem.c	Thu Jan  1 10:00:00 1970
> +++ working-2.5.5-usem/drivers/char/usersem.c	Sat Feb 23 14:18:28 2002
> @@ -0,0 +1,244 @@
> +/*
> + *  User-accessible semaphores.
> + *  (C) Rusty Russell, IBM 2002
> + *
> + *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
> + *  enough at me, Linus for the original (flawed) idea, Matthew
> + *  Kirkwood for proof-of-concept implementation.
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *  GNU General Public License for more details.
> + *
> + *  You should have received a copy of the GNU General Public License
> + *  along with this program; if not, write to the Free Software
> + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + */
> +#include <linux/config.h>
> +#include <linux/kernel.h>
> +#include <linux/wait.h>
> +#include <linux/sched.h>
> +#include <linux/mm.h>
> +#include <linux/hash.h>
> +#include <linux/module.h>
> +#include <linux/devfs_fs_kernel.h>
> +#include <asm/uaccess.h>
> +
> +struct usem
> +{
> +	atomic_t count;
> +	unsigned int sleepers;
> +};
> +
> +static spinlock_t usem_lock = SPIN_LOCK_UNLOCKED;
> +
> +/* FIXME: This may be way too small. --RR */
> +#define USEM_HASHBITS 6
> +/* The key for the hash is the address + index + offset within page */
> +static wait_queue_head_t usem_waits[1<<USEM_HASHBITS];
> +
> +static inline wait_queue_head_t *hash_usem(const struct page *page,
> +					   unsigned long offset)
> +{
> +	unsigned long h;
> +
> +	/* Hash based on inode number (if page is backed), has chance
> +	   of persistence across reboots on sane filesystems. */
> +	if (!page->mapping) h = (unsigned long)page;
> +	else h = page->mapping->host->i_ino;
> +
> +	return &usem_waits[hash_long(h + page->index + offset, USEM_HASHBITS)];
> +}
> +
> +/* Must be holding mmap_sem */
> +static inline int pin_page(unsigned long upage_start, struct page **page)
> +{
> +	return get_user_pages(current, current->mm, upage_start,
> +			      1 /* one page */,
> +			      1 /* writable */,
> +			      0 /* don't force */,
> +			      page,
> +			      NULL /* don't return vmas */);
> +}
> +
> +/* Get kernel address of the two variables, and pin them in. */
> +static int map_usem(unsigned long uaddr,
> +		    atomic_t **count,
> +		    unsigned int **sleepers,
> +		    struct page *pages[2])
> +{
> +	struct mm_struct *mm = current->mm;
> +	unsigned int num_pages;
> +	unsigned long upage_start;
> +	int err;
> +
> +	upage_start = (uaddr & PAGE_MASK);
> +
> +	/* Most times, whole thing on one page */
> +	if (((uaddr + sizeof(struct usem) - 1) & PAGE_MASK) == upage_start) {
> +		num_pages = 1;
> +		pages[1] = NULL;
> +	} else {
> +		num_pages = 2;
> +		/* ... otherwise, page boundary must be between them */
> +		if ((uaddr + offsetof(struct usem, sleepers))%PAGE_SIZE != 0)
> +			return -EINVAL;
> +	}
> +
> +	down_read(&mm->mmap_sem);
> +	/* Pin first page */
> +	err = pin_page(upage_start, &pages[0]);
> +	if (num_pages == 2 && err == 1) {
> +		/* Pin second page */
> +		err = pin_page(upage_start + PAGE_SIZE, &pages[1]);
> +		if (err < 0)
> +			put_page(pages[0]);
> +	}
> +	up_read(&mm->mmap_sem);
> +	if (err < 0)
> +		return err;
> +
> +	/* Set up pointers into pinned page(s) */
> +	*count = page_address(pages[0]) + (uaddr%PAGE_SIZE);
> +	uaddr += offsetof(struct usem, sleepers);
> +	if (num_pages == 1)
> +		*sleepers = page_address(pages[0]) + (uaddr%PAGE_SIZE);
> +	else /* sleepers is on second page */
> +		*sleepers = page_address(pages[1]) + (uaddr%PAGE_SIZE);
> +	return 0;
> +}
> +
> +/* Unpin the variables */
> +static void unmap_usem(struct page *pages[2])
> +{
> +	put_page(pages[0]);
> +	if (pages[1]) put_page(pages[1]);
> +}
> +
> +/* Stolen from arch/i386/kernel/semaphore.c. */
> +static int __usem_down(wait_queue_head_t *wq,
> +		       atomic_t *count,
> +		       unsigned int *sleepers)
> +{
> +	int retval = 0;
> +	DECLARE_WAITQUEUE(wait, current);
> +
> +	current->state = TASK_INTERRUPTIBLE;
> +	add_wait_queue_exclusive(wq, &wait);
> +
> +	spin_lock(&usem_lock);
> +	(*sleepers)++;
> +	for (;;) {
> +		unsigned int sl = *sleepers;
> +
> +		/* With signals pending, this turns into the trylock
> +		 * failure case - we won't be sleeping, and we can't
> +		 * get the lock as it has contention. Just correct the
> +		 * count and exit.  */
> +		if (signal_pending(current)) {
> +			retval = -EINTR;
> +			*sleepers = 0;
> +			atomic_add(sl, count);
> +			break;
> +		}
> +
> +		/* Add "everybody else" into it. They aren't playing,
> +		 * because we own the spinlock. The "-1" is because
> +		 * we're still hoping to get the lock.  */
> +		if (!atomic_add_negative(sl - 1, count)) {
> +			*sleepers = 0;
> +			break;
> +		}
> +		*sleepers = 1;	/* us - see -1 above */
> +		spin_unlock(&usem_lock);
> +
> +		schedule();
> +		current->state = TASK_INTERRUPTIBLE;
> +		spin_lock(&usem_lock);
> +	}
> +	spin_unlock(&usem_lock);
> +	current->state = TASK_RUNNING;
> +	remove_wait_queue(wq, &wait);
> +
> +	/* Wake up everyone else. */
> +	wake_up(wq);
> +	return retval;
> +}
> +
> +/* aka "down" */
> +static ssize_t usem_read(struct file *f, char *u, size_t c, loff_t *ppos)
> +{
> +	struct page *pages[2];
> +	wait_queue_head_t *wqhead;
> +	atomic_t *count;
> +	unsigned int *sleepers;
> +	int ret;
> +
> +	/* Must not vanish underneath us. */
> +	ret = map_usem(*ppos, &count, &sleepers, pages);
> +	if (ret < 0)
> +		return ret;
> +	wqhead = hash_usem(pages[0], *ppos % PAGE_SIZE);
> +	ret = __usem_down(wqhead, count, sleepers);
> +	unmap_usem(pages);
> +	return 0;
> +}
> +	
> +
> +/* aka "up" */
> +static ssize_t
> +usem_write(struct file *f, const char *u, size_t c, loff_t *ppos)
> +{
> +	struct page *pages[2];
> +	wait_queue_head_t *wqhead;
> +	atomic_t *count;
> +	unsigned int *sleepers;
> +	int ret;
> +
> +	/* Must not vanish underneath us: even if they do an up
> +           without a down (userspace bug). */
> +	ret = map_usem(*ppos, &count, &sleepers, pages);
> +	if (ret < 0)
> +		return ret;
> +	wqhead = hash_usem(pages[0], *ppos % PAGE_SIZE);
> +	wake_up(wqhead);
> +	unmap_usem(pages);
> +	return ret;
> +}
> +
> +static struct file_operations usem_fops = {
> +	owner:		THIS_MODULE,
> +	read:		usem_read,
> +	write:		usem_write,
> +};
> +
> +static int usem_major;
> +static devfs_handle_t usem_dev;
> +
> +int __init init(void)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(usem_waits); i++)
> +		init_waitqueue_head(&usem_waits[i]);
> +	usem_major = devfs_register_chrdev(0, "usem", &usem_fops);
> +	usem_dev = devfs_register(NULL, "usem", DEVFS_FL_NONE, usem_major,
> +				  0, S_IFCHR | 0666, &usem_fops, NULL);
> +	return 0;
> +}
> +
> +void __exit fini(void)
> +{
> +	devfs_unregister(usem_dev);
> +	devfs_unregister_chrdev(usem_major, "usem");
> +}
> +
> +MODULE_LICENSE("GPL");
> +module_init(init);
> +module_exit(fini);
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/include/linux/hash.h working-2.5.5-usem/include/linux/hash.h
> --- linux-2.5.5/include/linux/hash.h	Thu Jan  1 10:00:00 1970
> +++ working-2.5.5-usem/include/linux/hash.h	Thu Feb 21 12:33:46 2002
> @@ -0,0 +1,58 @@
> +#ifndef _LINUX_HASH_H
> +#define _LINUX_HASH_H
> +/* Fast hashing routine for a long.
> +   (C) 2002 William Lee Irwin III, IBM */
> +
> +/*
> + * Knuth recommends primes in approximately golden ratio to the maximum
> + * integer representable by a machine word for multiplicative hashing.
> + * Chuck Lever verified the effectiveness of this technique:
> + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
> + *
> + * These primes are chosen to be bit-sparse, that is operations on
> + * them can use shifts and additions instead of multiplications for
> + * machines where multiplications are slow.
> + */
> +#if BITS_PER_LONG == 32
> +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e370001UL
> +#elif BITS_PER_LONG == 64
> +/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
> +#else
> +#error Define GOLDEN_RATIO_PRIME for your wordsize.
> +#endif
> +
> +static inline unsigned long hash_long(unsigned long val, unsigned int bits)
> +{
> +	unsigned long hash = val;
> +
> +#if BITS_PER_LONG == 64
> +	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
> +	unsigned long n = hash;
> +	n <<= 18;
> +	hash -= n;
> +	n <<= 33;
> +	hash -= n;
> +	n <<= 3;
> +	hash += n;
> +	n <<= 3;
> +	hash -= n;
> +	n <<= 4;
> +	hash += n;
> +	n <<= 2;
> +	hash += n;
> +#else
> +	/* On some cpus multiply is faster, on others gcc will do shifts */
> +	hash *= GOLDEN_RATIO_PRIME;
> +#endif
> +
> +	/* High bits are more random, so use them. */
> +	return hash >> (BITS_PER_LONG - bits);
> +}
> +	
> +static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
> +{
> +	return hash_long((unsigned long)ptr, bits);
> +}
> +#endif /* _LINUX_HASH_H */
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/include/linux/mmzone.h working-2.5.5-usem/include/linux/mmzone.h
> --- linux-2.5.5/include/linux/mmzone.h	Wed Feb 20 16:07:17 2002
> +++ working-2.5.5-usem/include/linux/mmzone.h	Thu Feb 21 12:46:30 2002
> @@ -51,8 +51,7 @@
>  	/*
>  	 * wait_table		-- the array holding the hash table
>  	 * wait_table_size	-- the size of the hash table array
> -	 * wait_table_shift	-- wait_table_size
> -	 * 				== BITS_PER_LONG (1 << wait_table_bits)
> +	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
>  	 *
>  	 * The purpose of all these is to keep track of the people
>  	 * waiting for a page to become available and make them
> @@ -75,7 +74,7 @@
>  	 */
>  	wait_queue_head_t	* wait_table;
>  	unsigned long		wait_table_size;
> -	unsigned long		wait_table_shift;
> +	unsigned long		wait_table_bits;
>  
>  	/*
>  	 * Discontig memory support fields.
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/kernel/ksyms.c working-2.5.5-usem/kernel/ksyms.c
> --- linux-2.5.5/kernel/ksyms.c	Wed Feb 20 16:07:17 2002
> +++ working-2.5.5-usem/kernel/ksyms.c	Thu Feb 21 16:48:59 2002
> @@ -86,6 +86,7 @@
>  EXPORT_SYMBOL(do_munmap);
>  EXPORT_SYMBOL(do_brk);
>  EXPORT_SYMBOL(exit_mm);
> +EXPORT_SYMBOL(get_user_pages);
>  
>  /* internal kernel memory management */
>  EXPORT_SYMBOL(_alloc_pages);
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/mm/filemap.c working-2.5.5-usem/mm/filemap.c
> --- linux-2.5.5/mm/filemap.c	Wed Feb 20 16:07:17 2002
> +++ working-2.5.5-usem/mm/filemap.c	Thu Feb 21 13:01:02 2002
> @@ -25,6 +25,7 @@
>  #include <linux/iobuf.h>
>  #include <linux/compiler.h>
>  #include <linux/fs.h>
> +#include <linux/hash.h>
>  
>  #include <asm/pgalloc.h>
>  #include <asm/uaccess.h>
> @@ -773,32 +774,8 @@
>  static inline wait_queue_head_t *page_waitqueue(struct page *page)
>  {
>  	const zone_t *zone = page_zone(page);
> -	wait_queue_head_t *wait = zone->wait_table;
> -	unsigned long hash = (unsigned long)page;
>  
> -#if BITS_PER_LONG == 64
> -	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
> -	unsigned long n = hash;
> -	n <<= 18;
> -	hash -= n;
> -	n <<= 33;
> -	hash -= n;
> -	n <<= 3;
> -	hash += n;
> -	n <<= 3;
> -	hash -= n;
> -	n <<= 4;
> -	hash += n;
> -	n <<= 2;
> -	hash += n;
> -#else
> -	/* On some cpus multiply is faster, on others gcc will do shifts */
> -	hash *= GOLDEN_RATIO_PRIME;
> -#endif
> -
> -	hash >>= zone->wait_table_shift;
> -
> -	return &wait[hash];
> +	return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
>  }
>  
>  /* 
> diff -urN -I \$.*\$ --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.5/mm/page_alloc.c working-2.5.5-usem/mm/page_alloc.c
> --- linux-2.5.5/mm/page_alloc.c	Wed Feb 20 16:07:17 2002
> +++ working-2.5.5-usem/mm/page_alloc.c	Thu Feb 21 12:33:46 2002
> @@ -776,8 +776,8 @@
>  		 * per zone.
>  		 */
>  		zone->wait_table_size = wait_table_size(size);
> -		zone->wait_table_shift =
> -			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
> +		zone->wait_table_bits =
> +			wait_table_bits(zone->wait_table_size);
>  		zone->wait_table = (wait_queue_head_t *)
>  			alloc_bootmem_node(pgdat, zone->wait_table_size
>  						* sizeof(wait_queue_head_t));
> 



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 13:14           ` Alan Cox
@ 2002-02-25 16:11             ` Linus Torvalds
  2002-02-25 16:39               ` Alan Cox
  0 siblings, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2002-02-25 16:11 UTC (permalink / raw)
  To: Alan Cox
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel



On Mon, 25 Feb 2002, Alan Cox wrote:
>
> 	fd = open("/dev/shm/sem....");

Hmm.. Yes. Except I would allow a NULL backing store name for the
normal(?) case of just wanting private anonymous memory.

At the same time, I have to admit that I like the notion that Rusty had of
libraries being able to just put their semaphores anywhere (on the stack
etc), as it does work for many architectures. Ugh.

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 16:39               ` Alan Cox
@ 2002-02-25 16:32                 ` Benjamin LaHaise
  2002-02-25 17:42                   ` Alan Cox
  2002-02-25 18:23                   ` Hubertus Franke
  2002-02-25 17:06                 ` Linus Torvalds
  2002-03-03  7:07                 ` [PATCH] Lightweight userspace semaphores Rusty Russell
  2 siblings, 2 replies; 47+ messages in thread
From: Benjamin LaHaise @ 2002-02-25 16:32 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Rusty Russell, mingo, Matthew Kirkwood,
	David Axmark, William Lee Irwin III, linux-kernel

On Mon, Feb 25, 2002 at 04:39:56PM +0000, Alan Cox wrote:
> _alloca
> mmap
> 
> Still fits on the stack 8)

Are we sure that forcing semaphore overhead to the size of a page is a 
good idea?  I'd much rather see a sleep/wakeup mechanism akin to wait 
queues be exported by the kernel so that userspace can implement a rich 
set of locking functions on top of that in whatever shared memory is 
being used.

		-ben

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 16:11             ` Linus Torvalds
@ 2002-02-25 16:39               ` Alan Cox
  2002-02-25 16:32                 ` Benjamin LaHaise
                                   ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: Alan Cox @ 2002-02-25 16:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Rusty Russell, mingo, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel

> > 	fd = open("/dev/shm/sem....");
> 
> Hmm.. Yes. Except I would allow a NULL backing store name for the
> normal(?) case of just wanting private anonymous memory.

unlink()

> At the same time, I have to admit that I like the notion that Rusty had of
> libraries being able to just put their semaphores anywhere (on the stack
> etc), as it does work for many architectures. Ugh.

_alloca
mmap

Still fits on the stack 8)


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 16:39               ` Alan Cox
  2002-02-25 16:32                 ` Benjamin LaHaise
@ 2002-02-25 17:06                 ` Linus Torvalds
  2002-02-25 17:31                   ` Alan Cox
  2002-03-03  7:07                 ` [PATCH] Lightweight userspace semaphores Rusty Russell
  2 siblings, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2002-02-25 17:06 UTC (permalink / raw)
  To: Alan Cox
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel



On Mon, 25 Feb 2002, Alan Cox wrote:
> > > 	fd = open("/dev/shm/sem....");
> >
> > Hmm.. Yes. Except I would allow a NULL backing store name for the
> > normal(?) case of just wanting private anonymous memory.
>
> unlink()

Sure, but that, together with making up a unique temporary name etc just
adds extra overhead for no actual gain.

> > At the same time, I have to admit that I like the notion that Rusty had of
> > libraries being able to just put their semaphores anywhere (on the stack
> > etc), as it does work for many architectures. Ugh.
>
> _alloca
> mmap
>
> Still fits on the stack 8)

.. but is slow as hell.

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 17:31                   ` Alan Cox
@ 2002-02-25 17:20                     ` Linus Torvalds
  2002-02-25 17:50                       ` Alan Cox
  2002-02-26 12:15                       ` [PATCH] 2.5.5 IDE clean 14 Martin Dalecki
  0 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2002-02-25 17:20 UTC (permalink / raw)
  To: Alan Cox
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel



On Mon, 25 Feb 2002, Alan Cox wrote:
>
> As opposed to adding special cases to the kernel which are unswappable and
> stand to tangle up bits of the generic vfs - eg we would have a vma with
> a vm_file but that file would not be in the dcache ?

Why should they be unswappable?

It's the same thing as giving a -1 to mmap. That doesn't make it
unswappable.

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 17:06                 ` Linus Torvalds
@ 2002-02-25 17:31                   ` Alan Cox
  2002-02-25 17:20                     ` Linus Torvalds
  0 siblings, 1 reply; 47+ messages in thread
From: Alan Cox @ 2002-02-25 17:31 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Rusty Russell, mingo, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel

> > unlink()
> 
> Sure, but that, together with making up a unique temporary name etc just
> adds extra overhead for no actual gain.

As opposed to adding special cases to the kernel which are unswappable and
stand to tangle up bits of the generic vfs - eg we would have a vma with
a vm_file but that file would not be in the dcache ?

Is it really worth it. For temporary files unix has never adopted a tmpfile()
syscall because nobody has ever found mkstemp() a paticularly critical path
that justified it

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 16:32                 ` Benjamin LaHaise
@ 2002-02-25 17:42                   ` Alan Cox
  2002-02-25 18:23                   ` Hubertus Franke
  1 sibling, 0 replies; 47+ messages in thread
From: Alan Cox @ 2002-02-25 17:42 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Alan Cox, Linus Torvalds, Rusty Russell, mingo, Matthew Kirkwood,
	David Axmark, William Lee Irwin III, linux-kernel

> Are we sure that forcing semaphore overhead to the size of a page is a 
> good idea?  I'd much rather see a sleep/wakeup mechanism akin to wait 
> queues be exported by the kernel so that userspace can implement a rich 
> set of locking functions on top of that in whatever shared memory is 
> being used.

The shared memory side of it has to be page sized. It doesn't mean you have
to have 1 semaphore per page but it does mean you have to allocate in page
sized chunks for mmu granularity


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 17:50                       ` Alan Cox
@ 2002-02-25 17:44                         ` Linus Torvalds
  2002-02-25 18:06                           ` Alan Cox
  0 siblings, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2002-02-25 17:44 UTC (permalink / raw)
  To: Alan Cox
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel


On Mon, 25 Feb 2002, Alan Cox wrote:
> 
> Any kernel special cases it adds will be unswappable because they are in
> kernel space (not the semaphores here - we want them to be swappable and
> they can be)

Alan, wake up!

I'm talking about anonymous semaphores, the kernel implementation can just 
map a normal anonymous page there.

On 99.9% of all machines out there, you can have semaphores in perfectly 
normal memory.

> When you create a shared mapping by passing -1 to mmap we do

Why are you talking about shared mappings?

The most common case for any fast semaphores are for _threaded_
applications. No shared memory, no nothing.

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 17:20                     ` Linus Torvalds
@ 2002-02-25 17:50                       ` Alan Cox
  2002-02-25 17:44                         ` Linus Torvalds
  2002-02-26 12:15                       ` [PATCH] 2.5.5 IDE clean 14 Martin Dalecki
  1 sibling, 1 reply; 47+ messages in thread
From: Alan Cox @ 2002-02-25 17:50 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Rusty Russell, mingo, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel

> On Mon, 25 Feb 2002, Alan Cox wrote:
> >
> > As opposed to adding special cases to the kernel which are unswappable and
> > stand to tangle up bits of the generic vfs - eg we would have a vma with
> > a vm_file but that file would not be in the dcache ?
> 
> Why should they be unswappable?

Any kernel special cases it adds will be unswappable because they are in
kernel space (not the semaphores here - we want them to be swappable and
they can be)

> It's the same thing as giving a -1 to mmap. That doesn't make it
> unswappable.

When you create a shared mapping by passing -1 to mmap we do

        } else if (flags & MAP_SHARED) {
                error = shmem_zero_setup(vma);

shmem_zero_setup does

        file = shmem_file_setup("dev/zero", size);
        if (IS_ERR(file))
                return PTR_ERR(file);

        if (vma->vm_file)
                fput (vma->vm_file);
        vma->vm_file = file;
        vma->vm_ops = &shmem_vm_ops;

and we are back creating file names. Basically because a shared mmap in
Linux needs vma->vm_file, and vma->vm_file needs all the rest of the logic
behind it

Thats why I am saying that magic name picking is something that user space
might as well do for unnamed objects. We end up with names and vm_file
however we do it

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 17:44                         ` Linus Torvalds
@ 2002-02-25 18:06                           ` Alan Cox
  2002-02-25 19:31                             ` Linus Torvalds
  2002-02-25 19:51                             ` Hubertus Franke
  0 siblings, 2 replies; 47+ messages in thread
From: Alan Cox @ 2002-02-25 18:06 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Alan Cox, Rusty Russell, mingo, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel

> The most common case for any fast semaphores are for _threaded_
> applications. No shared memory, no nothing.

Ok I see where you are coming from now -- that makes sense for a few cases.
POSIX thread locks have to be able to work interprocess not just between
threads though, so a full posix lock implementation couldn't be done without
being able to put these things on shared pages (hence I was coming from
the using shmfs as backing store angle).  Using a subset of shmfs also got
me resource management which happens to be nice.

The other user of these kind of fast locks is databases. Oracle for example
seems not to be a single mm threaded application.

If we are talking about being able to say "make this page semaphores" then I 
agree - the namespace is a seperate problem and up to whoever allocated the
backing store in the first place, and may well not involve a naming at all.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 16:32                 ` Benjamin LaHaise
  2002-02-25 17:42                   ` Alan Cox
@ 2002-02-25 18:23                   ` Hubertus Franke
  2002-02-25 20:57                     ` Hubertus Franke
  1 sibling, 1 reply; 47+ messages in thread
From: Hubertus Franke @ 2002-02-25 18:23 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Alan Cox, Linus Torvalds, Rusty Russell, mingo, Matthew Kirkwood,
	David Axmark, William Lee Irwin III, linux-kernel

On Mon, Feb 25, 2002 at 11:32:40AM -0500, Benjamin LaHaise wrote:
> On Mon, Feb 25, 2002 at 04:39:56PM +0000, Alan Cox wrote:
> > _alloca
> > mmap
> > 
> > Still fits on the stack 8)
> 
> Are we sure that forcing semaphore overhead to the size of a page is a 
> good idea?  I'd much rather see a sleep/wakeup mechanism akin to wait 
> queues be exported by the kernel so that userspace can implement a rich 
> set of locking functions on top of that in whatever shared memory is 
> being used.
> 
> 		-ben
> -

Amen, I agree Ben. As I indicated in my previous note, one can
implement various versions of spinning, starvation, non-starvation locks
based on the appropriateness for a given app and scenario.
For instance the multiple reader/single writer requires 2 queues.
If as Ben stated something similar to the SysV implementation is desired
where a single lock holds multiple waiting queues, that should be straight
forward to implement. Waiting queues could be allocated on demand as well.

I'd like to see an implementation that facilitate that.
My implementation separates the state to the user level and the 
waiting to the kernel level. There are race conditions that need to 
be resolved with respect to wakeup. They can be all encoded into
the atomic word maintained in shared memory in user space.

For more complex locks I'd like to have compare_and_swap instructions.
As I stated, I have implemented some of the more complicated locks
(spinning, convoy avoidance, etc.) and they have all passed some rigorous
stress test.

As for allocation on the stack. If indeed there are kernel objects
associated with the address, they need to be cleared upon exit from
the issueing subroutine (at least in my implementation).


At this point, could be go through and delineate some of the requirements
first.
E.g.  (a) filedescriptors vs. vaddr
      (b) explicit vs. implicite allocation  
      (c) system call interface vs. device driver
      (d) state management in user space only or in kernel as well
	        i.e. how many are waiting, how many are woken up.
      (e) semaphores only or multiple queues
      (f) protection through an exported handle with some MAGIC or
          through virtual memory access rights
      (g) persistence on mmap or not 

Here is my point of view:
(a) vaddr 
(b) implicite
(c) syscall
(d) user only
(e) multiple queues
(f) virtual memory access rights.
(g) persistent  (if you don't want persistence you remove the underlying object)
	
I requested some input on my original message a couple of weeks regarding 
these points (but only got one on (b)).

Could everybody respond to (a)-(f) for a show of hands.
Could we also consolidate some pointers of the various implementations
that are out there and then see what the pluses and minuses of the various
implementations are  and how they score against (a)-(f).

-- Hubertus Franke


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 18:06                           ` Alan Cox
@ 2002-02-25 19:31                             ` Linus Torvalds
  2002-02-24  4:57                               ` Daniel Phillips
  2002-02-25 19:51                             ` Hubertus Franke
  1 sibling, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2002-02-25 19:31 UTC (permalink / raw)
  To: Alan Cox
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel


On Mon, 25 Feb 2002, Alan Cox wrote:
> 
> Ok I see where you are coming from now -- that makes sense for a few cases.

Note that the "few cases" in question imply _all_ of the current broken 
library spinlocks, for example. 

Don't think "POSIX semaphores", but think "fast locking" in the general
case. I will bet you $1 in small change that most normal locking by far is
for the kind of thread-safe stuff libc does right now.

		Linus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 18:06                           ` Alan Cox
  2002-02-25 19:31                             ` Linus Torvalds
@ 2002-02-25 19:51                             ` Hubertus Franke
  1 sibling, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-02-25 19:51 UTC (permalink / raw)
  To: Alan Cox
  Cc: Linus Torvalds, Rusty Russell, mingo, Matthew Kirkwood,
	Benjamin LaHaise, David Axmark, William Lee Irwin III,
	linux-kernel

On Mon, Feb 25, 2002 at 06:06:48PM +0000, Alan Cox wrote:
> > The most common case for any fast semaphores are for _threaded_
> > applications. No shared memory, no nothing.

Well, but most threaded applications should actually take care of this
without having to ditch into the kernel.

> 
> Ok I see where you are coming from now -- that makes sense for a few cases.
> POSIX thread locks have to be able to work interprocess not just between
> threads though, so a full posix lock implementation couldn't be done without
> being able to put these things on shared pages (hence I was coming from
> the using shmfs as backing store angle).  Using a subset of shmfs also got
> me resource management which happens to be nice.

With respect to global POSIX locks:
I talked to Bill Abt of the NGPT team and we planned of introducing 
an asynchronous mechanism to the user level lock package that I submitted
some 2 weeks ago. The problem here is that in NGPT like packages
the underlying kernel thread can not be blocked on a wait call, because
it constitutes a virtual CPU on which multiple user level threads are 
run. So what we were thinking of is on a wait call to the kernel
a datastructure is put in place on which to wait rather than blocking
on the calling thread. On wakeup, the initial process will be signaled
that one of the locks is available again. The user level thread scheduler
can then pick up the lock (many mechanism possible) and continue
the user level thread waiting on it.
> 

> The other user of these kind of fast locks is databases. Oracle for example
> seems not to be a single mm threaded application.
> 
> If we are talking about being able to say "make this page semaphores" then I 
> agree - the namespace is a seperate problem and up to whoever allocated the
> backing store in the first place, and may well not involve a naming at all.

What I implemented is what Linus recommended, namely indicate whether
a memory region is capable of being utilized for user level locks.
We need this for cleanup, i.e., when a process exits or dies, and the 
vma area is closed/removed, we know when to call back the kernel subsystem
holding the kernel locks associated with user level locks and clean
up lingering objets.

This works quite nice. I did this in my implementation and it requires
basically 4 line changes in the kernel.
The flag MAP_SEMAPHORE is to be introduced for mmap and shmat.
One problem is that libc seems to mask out any flags that
is currently not exposed by the kernel.

-- Hubertus

> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 18:23                   ` Hubertus Franke
@ 2002-02-25 20:57                     ` Hubertus Franke
  0 siblings, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-02-25 20:57 UTC (permalink / raw)
  To: Benjamin LaHaise
  Cc: Alan Cox, Linus Torvalds, Rusty Russell, mingo, Matthew Kirkwood,
	David Axmark, William Lee Irwin III, linux-kernel

All, I uploaded my latest version on this to lse.sourceforge.net
you can get to by http://prdownloads.sourceforge.net/lse/ulocks-2.4.17.tar.bz2

Hope you find some time to look at it and respond to the questions
and positions I outlined in the message before.

-- Hubertus Franke

On Mon, Feb 25, 2002 at 01:23:35PM -0500, Hubertus Franke wrote:
> On Mon, Feb 25, 2002 at 11:32:40AM -0500, Benjamin LaHaise wrote:
> > On Mon, Feb 25, 2002 at 04:39:56PM +0000, Alan Cox wrote:
> > > _alloca
> > > mmap
> > > 
> > > Still fits on the stack 8)
> > 
> > Are we sure that forcing semaphore overhead to the size of a page is a 
> > good idea?  I'd much rather see a sleep/wakeup mechanism akin to wait 
> > queues be exported by the kernel so that userspace can implement a rich 
> > set of locking functions on top of that in whatever shared memory is 
> > being used.
> > 
> > 		-ben
> > -
> 
> Amen, I agree Ben. As I indicated in my previous note, one can
> implement various versions of spinning, starvation, non-starvation locks
> based on the appropriateness for a given app and scenario.
> For instance the multiple reader/single writer requires 2 queues.
> If as Ben stated something similar to the SysV implementation is desired
> where a single lock holds multiple waiting queues, that should be straight
> forward to implement. Waiting queues could be allocated on demand as well.
> 
> I'd like to see an implementation that facilitate that.
> My implementation separates the state to the user level and the 
> waiting to the kernel level. There are race conditions that need to 
> be resolved with respect to wakeup. They can be all encoded into
> the atomic word maintained in shared memory in user space.
> 
> For more complex locks I'd like to have compare_and_swap instructions.
> As I stated, I have implemented some of the more complicated locks
> (spinning, convoy avoidance, etc.) and they have all passed some rigorous
> stress test.
> 
> As for allocation on the stack. If indeed there are kernel objects
> associated with the address, they need to be cleared upon exit from
> the issueing subroutine (at least in my implementation).
> 
> 
> At this point, could be go through and delineate some of the requirements
> first.
> E.g.  (a) filedescriptors vs. vaddr
>       (b) explicit vs. implicite allocation  
>       (c) system call interface vs. device driver
>       (d) state management in user space only or in kernel as well
> 	        i.e. how many are waiting, how many are woken up.
>       (e) semaphores only or multiple queues
>       (f) protection through an exported handle with some MAGIC or
>           through virtual memory access rights
>       (g) persistence on mmap or not 
> 
> Here is my point of view:
> (a) vaddr 
> (b) implicite
> (c) syscall
> (d) user only
> (e) multiple queues
> (f) virtual memory access rights.
> (g) persistent  (if you don't want persistence you remove the underlying object)
> 	
> I requested some input on my original message a couple of weeks regarding 
> these points (but only got one on (b)).
> 
> Could everybody respond to (a)-(f) for a show of hands.
> Could we also consolidate some pointers of the various implementations
> that are out there and then see what the pluses and minuses of the various
> implementations are  and how they score against (a)-(f).
> 
> -- Hubertus Franke
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH] 2.5.5 IDE clean 14
  2002-02-25 17:20                     ` Linus Torvalds
  2002-02-25 17:50                       ` Alan Cox
@ 2002-02-26 12:15                       ` Martin Dalecki
  2002-02-26 21:49                         ` Keith Owens
  1 sibling, 1 reply; 47+ messages in thread
From: Martin Dalecki @ 2002-02-26 12:15 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2931 bytes --]

Hello!

Most importantly this patch is making ide.c use the
new automagic for module initialization lists and further
preparing the rest of the code in question here for proper
module separation. Despite this the CMOS probe has been removed
as well... *Iff*, which I don't expect, this breaks anything
it can be reintroduced easely. During this effort an actual bug
in the initialization of the main module has been uncovered as well.
a quite serious BUG has been tagged in ide-scsi.c as well, but
as far as now I just didn't get along to actually fixing it.
(The patch is big enough as it is).

Details follow:

- Kill *unused* ide_media_verbose() funciton.

- Remove the unnecessary media and supports_dma fields from
   ide_driver_t.

- Remove the global name field from ide_driver_t struct by pushing it
   down to the places where it's actually used.

- Remove the unused hwif_data field from ide_hwif_t.

- Push the supports_dsc_overlap condition up to the level where it
   belongs: disk type as well.

- Make the initialization of ide main ide.c work with the new module
   initialization auto-magic instead of calling it explicitly in
   ll_rw_block.c This prevents the ide_init() from being called twice. We
   have BTW. renamed it to ata_module_init(), since  ata is more adequate
   then ide and xxx_module_init corresponds better to the naming
   conventions used elsewhere throughout the kernel.

   This BUG was there before any ide-clean.  It was worked around by a
   magic variable preventing the second call to succeed.  We have removed
   this variable in one of the previous patches and thus uncovered it.

- Kill proc_ide_read_driver() and proc_ide_write_driver(). The drivers
   already report on syslog which drives they have taken care of.  (Or
   at least they should). In esp. the proc_ide_write_driver() was just
   too offending for me.  Beleve it or not the purpose of it was to
   *request a particular* driver for a device, by echoing some magic
   values to a magic file...
   More importantly this "back door" was getting in the way of a properly
   done modularization of the IDE stuff.

- Made some not externally used functions static or not EXPORT-ed.

- Provide the start of a proper modularization between the main module
   and drivers for particular device types. Changing the name-space
   polluting DRIVER() macro to ata_ops() showed how inconsistently the
   busy (read: module  busy!) field from ide_driver_t
   is currently used across the    different device type modules.
   This has to be fixed soon.

- Make the ide code use the similar device type ID numbers as the SCSI
   code :-).  This is just tedious, but it will help in a distant
   feature. It helps reading the code anyway.

- Mark repettitive code with /* ATA-PATTERN */ comments for later
   consolidation at places where we did came across it.

- Various comments and notes added where some explanations was missing.

[-- Attachment #2: ide-clean-14.diff --]
[-- Type: text/plain, Size: 81357 bytes --]

diff -ur linux-2.5.5/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c
--- linux-2.5.5/drivers/block/ll_rw_blk.c	Wed Feb 20 03:10:55 2002
+++ linux/drivers/block/ll_rw_blk.c	Mon Feb 25 21:30:03 2002
@@ -1697,9 +1697,6 @@
 	blk_max_low_pfn = max_low_pfn;
 	blk_max_pfn = max_pfn;
 
-#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
-	ide_init();		/* this MUST precede hd_init */
-#endif
 #if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
 	hd_init();
 #endif
diff -ur linux-2.5.5/drivers/ide/aec62xx.c linux/drivers/ide/aec62xx.c
--- linux-2.5.5/drivers/ide/aec62xx.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/aec62xx.c	Tue Feb 26 01:23:43 2002
@@ -48,7 +48,6 @@
 
 static int aec62xx_get_info(char *, char **, off_t, int);
 extern int (*aec62xx_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int aec62xx_get_info (char *buffer, char **addr, off_t offset, int count)
@@ -310,8 +309,8 @@
 	unsigned long dma_base	= hwif->dma_base;
 	byte speed		= -1;
 
-	if (drive->media != ide_disk)
-		return ((int) ide_dma_off_quietly);
+	if (drive->type != ATA_DISK)
+		return ide_dma_off_quietly;
 
 	if (((id->dma_ultra & 0x0010) ||
 	     (id->dma_ultra & 0x0008) ||
@@ -356,7 +355,7 @@
 	byte speed		= -1;
 	byte ultra66		= eighty_ninty_three(drive);
 
-	if (drive->media != ide_disk)
+	if (drive->type != ATA_DISK)
 		return ((int) ide_dma_off_quietly);
 
 	if ((id->dma_ultra & 0x0010) && (ultra) && (ultra66)) {
diff -ur linux-2.5.5/drivers/ide/alim15x3.c linux/drivers/ide/alim15x3.c
--- linux-2.5.5/drivers/ide/alim15x3.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/alim15x3.c	Tue Feb 26 01:25:26 2002
@@ -278,7 +278,7 @@
 	 * PIO mode => ATA FIFO on, ATAPI FIFO off
 	 */
 	pci_read_config_byte(dev, portFIFO, &cd_dma_fifo);
-	if (drive->media==ide_disk) {
+	if (drive->type == ATA_DISK) {
 		if (hwif->index) {
 			pci_write_config_byte(dev, portFIFO, (cd_dma_fifo & 0x0F) | 0x50);
 		} else {
@@ -424,9 +424,9 @@
 	} else if ((m5229_revision < 0xC2) &&
 #ifndef CONFIG_WDC_ALI15X3
 		   ((chip_is_1543c_e && strstr(id->model, "WDC ")) ||
-		    (drive->media!=ide_disk))) {
+		    (drive->type != ATA_DISK))) {
 #else /* CONFIG_WDC_ALI15X3 */
-		   (drive->media!=ide_disk)) {
+		   (drive->type != ATA_DISK)) {
 #endif /* CONFIG_WDC_ALI15X3 */
 		return 0;
 	} else {
@@ -441,7 +441,7 @@
 	ide_dma_action_t dma_func	= ide_dma_on;
 	byte can_ultra_dma		= ali15x3_can_ultra(drive);
 
-	if ((m5229_revision<=0x20) && (drive->media!=ide_disk))
+	if ((m5229_revision<=0x20) && (drive->type != ATA_DISK))
 		return hwif->dmaproc(ide_dma_off_quietly, drive);
 
 	if ((id != NULL) && ((id->capability & 1) != 0) && hwif->autodma) {
@@ -494,7 +494,7 @@
 		case ide_dma_check:
 			return ali15x3_config_drive_for_dma(drive);
 		case ide_dma_write:
-			if ((m5229_revision < 0xC2) && (drive->media != ide_disk))
+			if ((m5229_revision < 0xC2) && (drive->type != ATA_DISK))
 				return 1;	/* try PIO instead of DMA */
 			break;
 		default:
diff -ur linux-2.5.5/drivers/ide/amd74xx.c linux/drivers/ide/amd74xx.c
--- linux-2.5.5/drivers/ide/amd74xx.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/amd74xx.c	Mon Feb 25 01:54:28 2002
@@ -34,7 +34,6 @@
 
 static int amd74xx_get_info(char *, char **, off_t, int);
 extern int (*amd74xx_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int amd74xx_get_info (char *buffer, char **addr, off_t offset, int count)
diff -ur linux-2.5.5/drivers/ide/cmd64x.c linux/drivers/ide/cmd64x.c
--- linux-2.5.5/drivers/ide/cmd64x.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/cmd64x.c	Tue Feb 26 01:27:04 2002
@@ -88,7 +88,6 @@
 static int cmd64x_get_info(char *, char **, off_t, int);
 static int cmd680_get_info(char *, char **, off_t, int);
 extern int (*cmd64x_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int cmd64x_get_info (char *buffer, char **addr, off_t offset, int count)
@@ -448,7 +447,8 @@
 	u8 regU			= 0;
 	u8 regD			= 0;
 
-	if ((drive->media != ide_disk) && (speed < XFER_SW_DMA_0))	return 1;
+	if ((drive->type != ATA_DISK) && (speed < XFER_SW_DMA_0))
+		return 1;
 
 	(void) pci_read_config_byte(dev, pciD, &regD);
 	(void) pci_read_config_byte(dev, pciU, &regU);
@@ -641,8 +641,8 @@
 			break;
 	}
 
-	if (drive->media != ide_disk) {
-		cmdprintk("CMD64X: drive->media != ide_disk at double check, inital check failed!!\n");
+	if (drive->type != ATA_DISK) {
+		cmdprintk("CMD64X: drive is not a disk at double check, inital check failed!!\n");
 		return ((int) ide_dma_off);
 	}
 
@@ -788,7 +788,7 @@
 	}
 
 	if ((id != NULL) && ((id->capability & 1) != 0) &&
-	    hwif->autodma && (drive->media == ide_disk)) {
+	    hwif->autodma && (drive->type == ATA_DISK)) {
 		/* Consult the list of known "bad" drives */
 		if (ide_dmaproc(ide_dma_bad_drive, drive)) {
 			dma_func = ide_dma_off;
diff -ur linux-2.5.5/drivers/ide/cs5530.c linux/drivers/ide/cs5530.c
--- linux-2.5.5/drivers/ide/cs5530.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/cs5530.c	Mon Feb 25 01:54:35 2002
@@ -37,7 +37,6 @@
 
 static int cs5530_get_info(char *, char **, off_t, int);
 extern int (*cs5530_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int cs5530_get_info (char *buffer, char **addr, off_t offset, int count)
diff -ur linux-2.5.5/drivers/ide/hpt34x.c linux/drivers/ide/hpt34x.c
--- linux-2.5.5/drivers/ide/hpt34x.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/hpt34x.c	Tue Feb 26 01:27:49 2002
@@ -56,7 +56,6 @@
 
 static int hpt34x_get_info(char *, char **, off_t, int);
 extern int (*hpt34x_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int hpt34x_get_info (char *buffer, char **addr, off_t offset, int count)
@@ -210,7 +209,7 @@
 	struct hd_driveid *id	= drive->id;
 	byte speed		= 0x00;
 
-	if (drive->media != ide_disk)
+	if (drive->type != ATA_DISK)
 		return ((int) ide_dma_off_quietly);
 
 	hpt34x_clear_chipset(drive);
@@ -333,7 +332,7 @@
 			outb(reading, dma_base);		/* specify r/w */
 			outb(inb(dma_base+2)|6, dma_base+2);	/* clear INTR & ERROR flags */
 			drive->waiting_for_dma = 1;
-			if (drive->media != ide_disk)
+			if (drive->type != ATA_DISK)
 				return 0;
 			ide_set_handler(drive, &ide_dma_intr, WAIT_CMD, NULL);	/* issue cmd to drive */
 			OUT_BYTE((reading == 9) ? WIN_READDMA : WIN_WRITEDMA, IDE_COMMAND_REG);
diff -ur linux-2.5.5/drivers/ide/hpt366.c linux/drivers/ide/hpt366.c
--- linux-2.5.5/drivers/ide/hpt366.c	Sun Feb 24 16:32:53 2002
+++ linux/drivers/ide/hpt366.c	Tue Feb 26 01:28:31 2002
@@ -355,7 +355,6 @@
 #if defined(DISPLAY_HPT366_TIMINGS) && defined(CONFIG_PROC_FS)
 static int hpt366_get_info(char *, char **, off_t, int);
 extern int (*hpt366_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 
 static int hpt366_get_info (char *buffer, char **addr, off_t offset, int count)
 {
@@ -579,7 +578,7 @@
 
 static int hpt3xx_tune_chipset (ide_drive_t *drive, byte speed)
 {
-	if ((drive->media != ide_disk) && (speed < XFER_SW_DMA_0))
+	if ((drive->type != ATA_DISK) && (speed < XFER_SW_DMA_0))
 		return -1;
 
 	if (!drive->init_speed)
@@ -664,7 +663,7 @@
 	byte ultra66		= eighty_ninty_three(drive);
 	int  rval;
 
-	if ((drive->media != ide_disk) && (speed < XFER_SW_DMA_0))
+	if ((drive->type != ATA_DISK) && (speed < XFER_SW_DMA_0))
 		return ((int) ide_dma_off_quietly);
 
 	if ((id->dma_ultra & 0x0020) &&
diff -ur linux-2.5.5/drivers/ide/ht6560b.c linux/drivers/ide/ht6560b.c
--- linux-2.5.5/drivers/ide/ht6560b.c	Sun Feb 24 16:32:49 2002
+++ linux/drivers/ide/ht6560b.c	Tue Feb 26 01:29:05 2002
@@ -143,7 +143,7 @@
 	if (select != current_select || timing != current_timing) {
 		current_select = select;
 		current_timing = timing;
-		if (drive->media != ide_disk || !drive->present)
+		if (drive->type != ATA_DISK || !drive->present)
 			select |= HT_PREFETCH_MODE;
 		(void) inb(HT_CONFIG_PORT);
 		(void) inb(HT_CONFIG_PORT);
diff -ur linux-2.5.5/drivers/ide/ide-cd.c linux/drivers/ide/ide-cd.c
--- linux-2.5.5/drivers/ide/ide-cd.c	Sun Feb 24 16:32:45 2002
+++ linux/drivers/ide/ide-cd.c	Tue Feb 26 01:41:58 2002
@@ -2906,14 +2906,10 @@
 	return 0;
 }
 
-int ide_cdrom_reinit (ide_drive_t *drive);
+static int ide_cdrom_reinit (ide_drive_t *drive);
 
-static ide_driver_t ide_cdrom_driver = {
-	name:			"ide-cdrom",
-	media:			ide_cdrom,
-	busy:			0,
-	supports_dma:		1,
-	supports_dsc_overlap:	1,
+static struct ata_operations ide_cdrom_driver = {
+	owner:			THIS_MODULE,
 	cleanup:		ide_cdrom_cleanup,
 	standby:		NULL,
 	flushcache:		NULL,
@@ -2937,7 +2933,7 @@
 MODULE_PARM(ignore, "s");
 MODULE_DESCRIPTION("ATAPI CD-ROM Driver");
 
-int ide_cdrom_reinit (ide_drive_t *drive)
+static int ide_cdrom_reinit (ide_drive_t *drive)
 {
 	struct cdrom_info *info;
 	int failed = 0;
@@ -2955,14 +2951,17 @@
 	}
 	memset (info, 0, sizeof (struct cdrom_info));
 	drive->driver_data = info;
-	DRIVER(drive)->busy++;
+
+	/* ATA-PATTERN */
+	ata_ops(drive)->busy++;
 	if (ide_cdrom_setup (drive)) {
-		DRIVER(drive)->busy--;
+		ata_ops(drive)->busy--;
 		if (ide_cdrom_cleanup (drive))
 			printk ("%s: ide_cdrom_cleanup failed in ide_cdrom_init\n", drive->name);
 		return 1;
 	}
-	DRIVER(drive)->busy--;
+	ata_ops(drive)->busy--;
+
 	failed--;
 
 	revalidate_drives();
@@ -2975,7 +2974,7 @@
 	ide_drive_t *drive;
 	int failed = 0;
 
-	while ((drive = ide_scan_devices (ide_cdrom, ide_cdrom_driver.name, &ide_cdrom_driver, failed)) != NULL)
+	while ((drive = ide_scan_devices(ATA_ROM, "ide-cdrom", &ide_cdrom_driver, failed)) != NULL)
 		if (ide_cdrom_cleanup (drive)) {
 			printk ("%s: cleanup_module() called while still busy\n", drive->name);
 			failed++;
@@ -2989,7 +2988,7 @@
 	int failed = 0;
 
 	MOD_INC_USE_COUNT;
-	while ((drive = ide_scan_devices (ide_cdrom, ide_cdrom_driver.name, NULL, failed++)) != NULL) {
+	while ((drive = ide_scan_devices (ATA_ROM, "ide-cdrom", NULL, failed++)) != NULL) {
 		/* skip drives that we were told to ignore */
 		if (ignore != NULL) {
 			if (strstr(ignore, drive->name)) {
@@ -3013,14 +3012,17 @@
 		}
 		memset (info, 0, sizeof (struct cdrom_info));
 		drive->driver_data = info;
-		DRIVER(drive)->busy++;
+
+		/* ATA-PATTERN */
+		ata_ops(drive)->busy++;
 		if (ide_cdrom_setup (drive)) {
-			DRIVER(drive)->busy--;
+			ata_ops(drive)->busy--;
 			if (ide_cdrom_cleanup (drive))
 				printk ("%s: ide_cdrom_cleanup failed in ide_cdrom_init\n", drive->name);
 			continue;
 		}
-		DRIVER(drive)->busy--;
+		ata_ops(drive)->busy--;
+
 		failed--;
 	}
 	revalidate_drives();
diff -ur linux-2.5.5/drivers/ide/ide-disk.c linux/drivers/ide/ide-disk.c
--- linux-2.5.5/drivers/ide/ide-disk.c	Sun Feb 24 16:32:49 2002
+++ linux/drivers/ide/ide-disk.c	Tue Feb 26 01:39:24 2002
@@ -1053,17 +1053,13 @@
 	return ide_unregister_subdriver(drive);
 }
 
-int idedisk_reinit(ide_drive_t *drive);
+static int idedisk_reinit(ide_drive_t *drive);
 
 /*
  *      IDE subdriver functions, registered with ide.c
  */
-static ide_driver_t idedisk_driver = {
-	name:			"ide-disk",
-	media:			ide_disk,
-	busy:			0,
-	supports_dma:		1,
-	supports_dsc_overlap:	0,
+static struct ata_operations idedisk_driver = {
+	owner:			THIS_MODULE,
 	cleanup:		idedisk_cleanup,
 	standby:		do_idedisk_standby,
 	flushcache:		do_idedisk_flushcache,
@@ -1083,7 +1079,7 @@
 
 MODULE_DESCRIPTION("ATA DISK Driver");
 
-int idedisk_reinit (ide_drive_t *drive)
+static int idedisk_reinit(ide_drive_t *drive)
 {
 	int failed = 0;
 
@@ -1093,15 +1089,16 @@
 		printk (KERN_ERR "ide-disk: %s: Failed to register the driver with ide.c\n", drive->name);
 		return 1;
 	}
-	DRIVER(drive)->busy++;
+
+	ata_ops(drive)->busy++;
 	idedisk_setup(drive);
 	if ((!drive->head || drive->head > 16) && !drive->select.b.lba) {
 		printk(KERN_ERR "%s: INVALID GEOMETRY: %d PHYSICAL HEADS?\n", drive->name, drive->head);
-		(void) idedisk_cleanup(drive);
-		DRIVER(drive)->busy--;
+		idedisk_cleanup(drive);
+		ata_ops(drive)->busy--;
 		return 1;
 	}
-	DRIVER(drive)->busy--;
+	ata_ops(drive)->busy--;
 	failed--;
 
 	revalidate_drives();
@@ -1114,7 +1111,7 @@
 	ide_drive_t *drive;
 	int failed = 0;
 
-	while ((drive = ide_scan_devices (ide_disk, idedisk_driver.name, &idedisk_driver, failed)) != NULL) {
+	while ((drive = ide_scan_devices(ATA_DISK, "ide-disk", &idedisk_driver, failed)) != NULL) {
 		if (idedisk_cleanup (drive)) {
 			printk (KERN_ERR "%s: cleanup_module() called while still busy\n", drive->name);
 			failed++;
@@ -1134,20 +1131,20 @@
 	int failed = 0;
 	
 	MOD_INC_USE_COUNT;
-	while ((drive = ide_scan_devices (ide_disk, idedisk_driver.name, NULL, failed++)) != NULL) {
+	while ((drive = ide_scan_devices(ATA_DISK, "ide-disk", NULL, failed++)) != NULL) {
 		if (ide_register_subdriver (drive, &idedisk_driver)) {
 			printk (KERN_ERR "ide-disk: %s: Failed to register the driver with ide.c\n", drive->name);
 			continue;
 		}
-		DRIVER(drive)->busy++;
+		ata_ops(drive)->busy++;
 		idedisk_setup(drive);
 		if ((!drive->head || drive->head > 16) && !drive->select.b.lba) {
 			printk(KERN_ERR "%s: INVALID GEOMETRY: %d PHYSICAL HEADS?\n", drive->name, drive->head);
-			(void) idedisk_cleanup(drive);
-			DRIVER(drive)->busy--;
+			idedisk_cleanup(drive);
+			ata_ops(drive)->busy--;
 			continue;
 		}
-		DRIVER(drive)->busy--;
+		ata_ops(drive)->busy--;
 		failed--;
 	}
 	revalidate_drives();
diff -ur linux-2.5.5/drivers/ide/ide-dma.c linux/drivers/ide/ide-dma.c
--- linux-2.5.5/drivers/ide/ide-dma.c	Wed Feb 20 03:10:57 2002
+++ linux/drivers/ide/ide-dma.c	Tue Feb 26 01:29:56 2002
@@ -470,7 +470,7 @@
 	ide_hwif_t *hwif = HWIF(drive);
 
 #ifdef CONFIG_IDEDMA_ONLYDISK
-	if (drive->media != ide_disk)
+	if (drive->type != ATA_DISK)
 		config_allows_dma = 0;
 #endif
 
@@ -555,7 +555,7 @@
 {
 	u64 addr = BLK_BOUNCE_HIGH;
 
-	if (on && drive->media == ide_disk && HWIF(drive)->highmem) {
+	if (on && drive->type == ATA_DISK && HWIF(drive)->highmem) {
 		if (!PCI_DMA_BUS_IS_PHYS)
 			addr = BLK_BOUNCE_ANY;
 		else
@@ -613,7 +613,7 @@
 			outb(reading, dma_base);			/* specify r/w */
 			outb(inb(dma_base+2)|6, dma_base+2);		/* clear INTR & ERROR flags */
 			drive->waiting_for_dma = 1;
-			if (drive->media != ide_disk)
+			if (drive->type != ATA_DISK)
 				return 0;
 #ifdef CONFIG_BLK_DEV_IDEDMA_TIMEOUT
 			ide_set_handler(drive, &ide_dma_intr, 2*WAIT_CMD, NULL);	/* issue cmd to drive */
diff -ur linux-2.5.5/drivers/ide/ide-features.c linux/drivers/ide/ide-features.c
--- linux-2.5.5/drivers/ide/ide-features.c	Wed Feb 20 03:11:04 2002
+++ linux/drivers/ide/ide-features.c	Mon Feb 25 01:54:49 2002
@@ -70,22 +70,6 @@
 }
 
 /*
- *
- */
-char *ide_media_verbose (ide_drive_t *drive)
-{
-	switch (drive->media) {
-		case ide_scsi:		return("scsi   ");
-		case ide_disk:		return("disk   ");
-		case ide_optical:	return("optical");
-		case ide_cdrom:		return("cdrom  ");
-		case ide_tape:		return("tape   ");
-		case ide_floppy:	return("floppy ");
-		default:		return("???????");
-	}
-}
-
-/*
  * A Verbose noise maker for debugging on the attempted dmaing calls.
  */
 char *ide_dmafunc_verbose (ide_dma_action_t dmafunc)
diff -ur linux-2.5.5/drivers/ide/ide-floppy.c linux/drivers/ide/ide-floppy.c
--- linux-2.5.5/drivers/ide/ide-floppy.c	Sun Feb 24 16:32:45 2002
+++ linux/drivers/ide/ide-floppy.c	Tue Feb 26 01:43:48 2002
@@ -1827,14 +1827,6 @@
 }
 
 /*
- *	Revalidate the new media. Should set blk_size[]
- */
-static void idefloppy_revalidate (ide_drive_t *drive)
-{
-	ide_revalidate_drive(drive);
-}
-
-/*
  *	Return the current floppy capacity to ide.c.
  */
 static unsigned long idefloppy_capacity (ide_drive_t *drive)
@@ -2046,17 +2038,13 @@
 
 #endif	/* CONFIG_PROC_FS */
 
-int idefloppy_reinit(ide_drive_t *drive);
+static int idefloppy_reinit(ide_drive_t *drive);
 
 /*
  *	IDE subdriver functions, registered with ide.c
  */
-static ide_driver_t idefloppy_driver = {
-	name:			"ide-floppy",
-	media:			ide_floppy,
-	busy:			0,
-	supports_dma:		1,
-	supports_dsc_overlap:	0,
+static struct ata_operations idefloppy_driver = {
+	owner:			THIS_MODULE,
 	cleanup:		idefloppy_cleanup,
 	standby:		NULL,
 	flushcache:		NULL,
@@ -2066,7 +2054,7 @@
 	open:			idefloppy_open,
 	release:		idefloppy_release,
 	media_change:		idefloppy_media_change,
-	revalidate:		idefloppy_revalidate,
+	revalidate:		ide_revalidate_drive,
 	pre_reset:		NULL,
 	capacity:		idefloppy_capacity,
 	special:		NULL,
@@ -2074,13 +2062,13 @@
 	driver_reinit:		idefloppy_reinit,
 };
 
-int idefloppy_reinit (ide_drive_t *drive)
+static int idefloppy_reinit (ide_drive_t *drive)
 {
 	idefloppy_floppy_t *floppy;
 	int failed = 0;
 
 	MOD_INC_USE_COUNT;
-	while ((drive = ide_scan_devices (ide_floppy, idefloppy_driver.name, NULL, failed++)) != NULL) {
+	while ((drive = ide_scan_devices(ATA_FLOPPY, "ide-floppy", NULL, failed++)) != NULL) {
 		if (!idefloppy_identify_device (drive, drive->id)) {
 			printk (KERN_ERR "ide-floppy: %s: not supported by this version of ide-floppy\n", drive->name);
 			continue;
@@ -2098,9 +2086,12 @@
 			kfree (floppy);
 			continue;
 		}
-		DRIVER(drive)->busy++;
+
+		/* ATA-PATTERN */
+		ata_ops(drive)->busy++;
 		idefloppy_setup (drive, floppy);
-		DRIVER(drive)->busy--;
+		ata_ops(drive)->busy--;
+
 		failed--;
 	}
 	revalidate_drives();
@@ -2115,7 +2106,7 @@
 	ide_drive_t *drive;
 	int failed = 0;
 
-	while ((drive = ide_scan_devices (ide_floppy, idefloppy_driver.name, &idefloppy_driver, failed)) != NULL) {
+	while ((drive = ide_scan_devices(ATA_FLOPPY, "ide-floppy", &idefloppy_driver, failed)) != NULL) {
 		if (idefloppy_cleanup (drive)) {
 			printk ("%s: cleanup_module() called while still busy\n", drive->name);
 			failed++;
@@ -2141,7 +2132,7 @@
 
 	printk("ide-floppy driver " IDEFLOPPY_VERSION "\n");
 	MOD_INC_USE_COUNT;
-	while ((drive = ide_scan_devices (ide_floppy, idefloppy_driver.name, NULL, failed++)) != NULL) {
+	while ((drive = ide_scan_devices (ATA_FLOPPY, "ide-floppy", NULL, failed++)) != NULL) {
 		if (!idefloppy_identify_device (drive, drive->id)) {
 			printk (KERN_ERR "ide-floppy: %s: not supported by this version of ide-floppy\n", drive->name);
 			continue;
@@ -2159,9 +2150,11 @@
 			kfree (floppy);
 			continue;
 		}
-		DRIVER(drive)->busy++;
+		/* ATA-PATTERN */
+		ata_ops(drive)->busy++;
 		idefloppy_setup (drive, floppy);
-		DRIVER(drive)->busy--;
+		ata_ops(drive)->busy--;
+
 		failed--;
 	}
 	revalidate_drives();
diff -ur linux-2.5.5/drivers/ide/ide-geometry.c linux/drivers/ide/ide-geometry.c
--- linux-2.5.5/drivers/ide/ide-geometry.c	Wed Feb 20 03:10:53 2002
+++ linux/drivers/ide/ide-geometry.c	Tue Feb 26 00:26:47 2002
@@ -1,94 +1,16 @@
 /*
  * linux/drivers/ide/ide-geometry.c
+ *
+ * Sun Feb 24 23:13:03 CET 2002: Patch by Andries Brouwer to remove the
+ * confused CMOS probe applied. This is solving more problems then it my
+ * (unexpectedly) introduce.
  */
+
 #include <linux/config.h>
 #include <linux/ide.h>
 #include <linux/mc146818rtc.h>
 #include <asm/io.h>
 
-#ifdef CONFIG_BLK_DEV_IDE
-
-/*
- * We query CMOS about hard disks : it could be that we have a SCSI/ESDI/etc
- * controller that is BIOS compatible with ST-506, and thus showing up in our
- * BIOS table, but not register compatible, and therefore not present in CMOS.
- *
- * Furthermore, we will assume that our ST-506 drives <if any> are the primary
- * drives in the system -- the ones reflected as drive 1 or 2.  The first
- * drive is stored in the high nibble of CMOS byte 0x12, the second in the low
- * nibble.  This will be either a 4 bit drive type or 0xf indicating use byte
- * 0x19 for an 8 bit type, drive 1, 0x1a for drive 2 in CMOS.  A non-zero value
- * means we have an AT controller hard disk for that drive.
- *
- * Of course, there is no guarantee that either drive is actually on the
- * "primary" IDE interface, but we don't bother trying to sort that out here.
- * If a drive is not actually on the primary interface, then these parameters
- * will be ignored.  This results in the user having to supply the logical
- * drive geometry as a boot parameter for each drive not on the primary i/f.
- */
-/*
- * The only "perfect" way to handle this would be to modify the setup.[cS] code
- * to do BIOS calls Int13h/Fn08h and Int13h/Fn48h to get all of the drive info
- * for us during initialization.  I have the necessary docs -- any takers?  -ml
- */
-/*
- * I did this, but it doesnt work - there is no reasonable way to find the
- * correspondence between the BIOS numbering of the disks and the Linux
- * numbering. -aeb
- *
- * The code below is bad. One of the problems is that drives 1 and 2
- * may be SCSI disks (even when IDE disks are present), so that
- * the geometry we read here from BIOS is attributed to the wrong disks.
- * Consequently, also the former "drive->present = 1" below was a mistake.
- *
- * Eventually the entire routine below should be removed.
- *
- * 17-OCT-2000 rjohnson@analogic.com Added spin-locks for reading CMOS
- * chip.
- */
-
-void probe_cmos_for_drives (ide_hwif_t *hwif)
-{
-#ifdef __i386__
-	extern struct drive_info_struct drive_info;
-	byte cmos_disks, *BIOS = (byte *) &drive_info;
-	int unit;
-	unsigned long flags;
-
-#ifdef CONFIG_BLK_DEV_PDC4030
-	if (hwif->chipset == ide_pdc4030 && hwif->channel != 0)
-		return;
-#endif /* CONFIG_BLK_DEV_PDC4030 */
-	spin_lock_irqsave(&rtc_lock, flags);
-	cmos_disks = CMOS_READ(0x12);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-	/* Extract drive geometry from CMOS+BIOS if not already setup */
-	for (unit = 0; unit < MAX_DRIVES; ++unit) {
-		ide_drive_t *drive = &hwif->drives[unit];
-
-		if ((cmos_disks & (0xf0 >> (unit*4)))
-		   && !drive->present && !drive->nobios) {
-			unsigned short cyl = *(unsigned short *)BIOS;
-			unsigned char head = *(BIOS+2);
-			unsigned char sect = *(BIOS+14);
-			if (cyl > 0 && head > 0 && sect > 0 && sect < 64) {
-				drive->cyl   = drive->bios_cyl  = cyl;
-				drive->head  = drive->bios_head = head;
-				drive->sect  = drive->bios_sect = sect;
-				drive->ctl   = *(BIOS+8);
-			} else {
-				printk("hd%c: C/H/S=%d/%d/%d from BIOS ignored\n",
-				       unit+'a', cyl, head, sect);
-			}
-		}
-
-		BIOS += 16;
-	}
-#endif
-}
-#endif /* CONFIG_BLK_DEV_IDE */
-
-
 #if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_IDE_MODULE)
 
 extern ide_drive_t * get_info_ptr(kdev_t);
@@ -105,14 +27,14 @@
 	unsigned long total;
 
 	/*
-	 * The specs say: take geometry as obtained from Identify,
-	 * compute total capacity C*H*S from that, and truncate to
-	 * 1024*255*63. Now take S=63, H the first in the sequence
-	 * 4, 8, 16, 32, 64, 128, 255 such that 63*H*1024 >= total.
-	 * [Please tell aeb@cwi.nl in case this computes a
-	 * geometry different from what OnTrack uses.]
+	 * The specs say: take geometry as obtained from Identify, compute
+	 * total capacity C*H*S from that, and truncate to 1024*255*63. Now
+	 * take S=63, H the first in the sequence 4, 8, 16, 32, 64, 128, 255
+	 * such that 63*H*1024 >= total.  [Please tell aeb@cwi.nl in case this
+	 * computes a geometry different from what OnTrack uses.]
 	 */
-	total = DRIVER(drive)->capacity(drive);
+
+	total = ata_ops(drive)->capacity(drive);
 
 	*s = 63;
 
diff -ur linux-2.5.5/drivers/ide/ide-probe.c linux/drivers/ide/ide-probe.c
--- linux-2.5.5/drivers/ide/ide-probe.c	Sun Feb 24 16:32:49 2002
+++ linux/drivers/ide/ide-probe.c	Tue Feb 26 01:19:39 2002
@@ -130,17 +130,17 @@
 		}
 #endif /* CONFIG_BLK_DEV_PDC4030 */
 		switch (type) {
-			case ide_floppy:
+			case ATA_FLOPPY:
 				if (!strstr(id->model, "CD-ROM")) {
 					if (!strstr(id->model, "oppy") && !strstr(id->model, "poyp") && !strstr(id->model, "ZIP"))
 						printk("cdrom or floppy?, assuming ");
-					if (drive->media != ide_cdrom) {
+					if (drive->type != ATA_ROM) {
 						printk ("FLOPPY");
 						break;
 					}
 				}
-				type = ide_cdrom;	/* Early cdrom models used zero */
-			case ide_cdrom:
+				type = ATA_ROM;	/* Early cdrom models used zero */
+			case ATA_ROM:
 				drive->removable = 1;
 #ifdef CONFIG_PPC
 				/* kludge for Apple PowerBook internal zip */
@@ -152,10 +152,10 @@
 #endif
 				printk ("CD/DVD-ROM");
 				break;
-			case ide_tape:
+			case ATA_TAPE:
 				printk ("TAPE");
 				break;
-			case ide_optical:
+			case ATA_MOD:
 				printk ("OPTICAL");
 				drive->removable = 1;
 				break;
@@ -164,7 +164,7 @@
 				break;
 		}
 		printk (" drive\n");
-		drive->media = type;
+		drive->type = type;
 		return;
 	}
 
@@ -184,7 +184,7 @@
 			mate->noprobe = 1;
 		}
 	}
-	drive->media = ide_disk;
+	drive->type = ATA_DISK;
 	printk("ATA DISK drive\n");
 	QUIRK_LIST(HWIF(drive),drive);
 	return;
@@ -327,12 +327,12 @@
 	int rc;
 	ide_hwif_t *hwif = HWIF(drive);
 	if (drive->present) {	/* avoid waiting for inappropriate probes */
-		if ((drive->media != ide_disk) && (cmd == WIN_IDENTIFY))
+		if ((drive->type != ATA_DISK) && (cmd == WIN_IDENTIFY))
 			return 4;
 	}
 #ifdef DEBUG
-	printk("probing for %s: present=%d, media=%d, probetype=%s\n",
-		drive->name, drive->present, drive->media,
+	printk("probing for %s: present=%d, type=%d, probetype=%s\n",
+		drive->name, drive->present, drive->type,
 		(cmd == WIN_IDENTIFY) ? "ATA" : "ATAPI");
 #endif
 	ide_delay_50ms();	/* needed for some systems (e.g. crw9624 as drive0 with disk as slave) */
@@ -421,10 +421,10 @@
 	if (!drive->present)
 		return;			/* drive not found */
 	if (drive->id == NULL) {		/* identification failed? */
-		if (drive->media == ide_disk) {
+		if (drive->type == ATA_DISK) {
 			printk ("%s: non-IDE drive, CHS=%d/%d/%d\n",
 			 drive->name, drive->cyl, drive->head, drive->sect);
-		} else if (drive->media == ide_cdrom) {
+		} else if (drive->type == ATA_ROM) {
 			printk("%s: ATAPI cdrom (?)\n", drive->name);
 		} else {
 			drive->present = 0;	/* nuke it */
@@ -481,33 +481,31 @@
 	    ((unsigned long)hwif->io_ports[IDE_STATUS_OFFSET])) {
 		ide_request_region(hwif->io_ports[IDE_DATA_OFFSET], 8, hwif->name);
 		hwif->straight8 = 1;
-		goto jump_straight8;
-	}
-
-	if (hwif->io_ports[IDE_DATA_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_DATA_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_ERROR_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_ERROR_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_NSECTOR_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_NSECTOR_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_SECTOR_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_SECTOR_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_LCYL_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_LCYL_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_HCYL_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_HCYL_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_SELECT_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_SELECT_OFFSET], 1, hwif->name);
-	if (hwif->io_ports[IDE_STATUS_OFFSET])
-		ide_request_region(hwif->io_ports[IDE_STATUS_OFFSET], 1, hwif->name);
+	} else {
+		if (hwif->io_ports[IDE_DATA_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_DATA_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_ERROR_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_ERROR_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_NSECTOR_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_NSECTOR_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_SECTOR_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_SECTOR_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_LCYL_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_LCYL_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_HCYL_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_HCYL_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_SELECT_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_SELECT_OFFSET], 1, hwif->name);
+		if (hwif->io_ports[IDE_STATUS_OFFSET])
+			ide_request_region(hwif->io_ports[IDE_STATUS_OFFSET], 1, hwif->name);
 
-jump_straight8:
+	}
 	if (hwif->io_ports[IDE_CONTROL_OFFSET])
 		ide_request_region(hwif->io_ports[IDE_CONTROL_OFFSET], 1, hwif->name);
 #if defined(CONFIG_AMIGA) || defined(CONFIG_MAC)
 	if (hwif->io_ports[IDE_IRQ_OFFSET])
 		ide_request_region(hwif->io_ports[IDE_IRQ_OFFSET], 1, hwif->name);
-#endif /* (CONFIG_AMIGA) || (CONFIG_MAC) */
+#endif
 }
 
 /*
@@ -521,13 +519,6 @@
 
 	if (hwif->noprobe)
 		return;
-#ifdef CONFIG_BLK_DEV_IDE
-	if (hwif->io_ports[IDE_DATA_OFFSET] == HD_DATA) {
-		extern void probe_cmos_for_drives(ide_hwif_t *);
-
-		probe_cmos_for_drives (hwif);
-	}
-#endif
 
 	if ((hwif->chipset != ide_4drives || !hwif->mate->present) &&
 #if CONFIG_BLK_DEV_PDC4030
@@ -545,7 +536,7 @@
 		}
 		if (!msgout)
 			printk("%s: ports already in use, skipping probe\n", hwif->name);
-		return;	
+		return;
 	}
 
 	__save_flags(flags);	/* local CPU only */
@@ -796,60 +787,51 @@
 static void init_gendisk (ide_hwif_t *hwif)
 {
 	struct gendisk *gd;
-	unsigned int unit, units, minors, i;
+	unsigned int unit, minors, i;
 	extern devfs_handle_t ide_devfs_handle;
 
-#if 1
-	units = MAX_DRIVES;
-#else
-	/* figure out maximum drive number on the interface */
-	for (units = MAX_DRIVES; units > 0; --units) {
-		if (hwif->drives[units-1].present)
-			break;
-	}
-#endif
-
-	minors = units * (1<<PARTN_BITS);
+	minors = MAX_DRIVES * (1 << PARTN_BITS);
 
 	gd = kmalloc (sizeof(struct gendisk), GFP_KERNEL);
 	if (!gd)
 		goto err_kmalloc_gd;
+
 	gd->sizes = kmalloc (minors * sizeof(int), GFP_KERNEL);
 	if (!gd->sizes)
 		goto err_kmalloc_gd_sizes;
+
 	gd->part = kmalloc (minors * sizeof(struct hd_struct), GFP_KERNEL);
 	if (!gd->part)
 		goto err_kmalloc_gd_part;
+	memset(gd->part, 0, minors * sizeof(struct hd_struct));
+
 	blksize_size[hwif->major] = kmalloc (minors*sizeof(int), GFP_KERNEL);
 	if (!blksize_size[hwif->major])
 		goto err_kmalloc_bs;
-
-	memset(gd->part, 0, minors * sizeof(struct hd_struct));
-
 	for (i = 0; i < minors; ++i)
 	    blksize_size[hwif->major][i] = BLOCK_SIZE;
-	for (unit = 0; unit < units; ++unit)
+
+	for (unit = 0; unit < MAX_DRIVES; ++unit)
 		hwif->drives[unit].part = &gd->part[unit << PARTN_BITS];
 
 	gd->major	= hwif->major;		/* our major device number */
 	gd->major_name	= IDE_MAJOR_NAME;	/* treated special in genhd.c */
 	gd->minor_shift	= PARTN_BITS;		/* num bits for partitions */
-	gd->nr_real	= units;		/* current num real drives */
+	gd->nr_real	= MAX_DRIVES;		/* current num real drives */
 	gd->next	= NULL;			/* linked list of major devs */
 	gd->fops        = ide_fops;             /* file operations */
-	gd->de_arr	= kmalloc (sizeof *gd->de_arr * units, GFP_KERNEL);
-	gd->flags	= kmalloc (sizeof *gd->flags * units, GFP_KERNEL);
+	gd->de_arr	= kmalloc(sizeof(*gd->de_arr) * MAX_DRIVES, GFP_KERNEL);
+	gd->flags	= kmalloc(sizeof(*gd->flags) * MAX_DRIVES, GFP_KERNEL);
 	if (gd->de_arr)
-		memset (gd->de_arr, 0, sizeof *gd->de_arr * units);
+		memset(gd->de_arr, 0, sizeof(*gd->de_arr) * MAX_DRIVES);
 	if (gd->flags)
-		memset (gd->flags, 0, sizeof *gd->flags * units);
+		memset(gd->flags, 0, sizeof(*gd->flags) * MAX_DRIVES);
 
 	hwif->gd = gd;
 	add_gendisk(gd);
 
-	for (unit = 0; unit < units; ++unit) {
-#if 1
-		char name[64];
+	for (unit = 0; unit < MAX_DRIVES; ++unit) {
+		char name[80];
 		ide_add_generic_settings(hwif->drives + unit);
 		hwif->drives[unit].dn = ((hwif->channel ? 2 : 0) + unit);
 		sprintf (name, "host%d/bus%d/target%d/lun%d",
@@ -858,19 +840,6 @@
 			hwif->channel, unit, hwif->drives[unit].lun);
 		if (hwif->drives[unit].present)
 			hwif->drives[unit].de = devfs_mk_dir(ide_devfs_handle, name, NULL);
-#else
-		if (hwif->drives[unit].present) {
-			char name[64];
-
-			ide_add_generic_settings(hwif->drives + unit);
-			hwif->drives[unit].dn = ((hwif->channel ? 2 : 0) + unit);
-			sprintf (name, "host%d/bus%d/target%d/lun%d",
-				 (hwif->channel && hwif->mate) ? hwif->mate->index : hwif->index,
-				 hwif->channel, unit, hwif->drives[unit].lun);
-			hwif->drives[unit].de =
-				devfs_mk_dir (ide_devfs_handle, name, NULL);
-		}
-#endif
 	}
 	return;
 
@@ -881,7 +850,7 @@
 err_kmalloc_gd_sizes:
 	kfree(gd);
 err_kmalloc_gd:
-	printk(KERN_WARNING "(ide::init_gendisk) Out of memory\n");
+	printk(KERN_CRIT "(ide::init_gendisk) Out of memory\n");
 	return;
 }
 
diff -ur linux-2.5.5/drivers/ide/ide-proc.c linux/drivers/ide/ide-proc.c
--- linux-2.5.5/drivers/ide/ide-proc.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/ide-proc.c	Tue Feb 26 01:37:25 2002
@@ -202,7 +202,7 @@
 	memset(&hobfile, 0, sizeof(struct hd_drive_hob_hdr));
 
 	taskfile.sector_count = 0x01;
-	taskfile.command = (drive->media == ide_disk) ? WIN_IDENTIFY : WIN_PIDENTIFY ;
+	taskfile.command = (drive->type == ATA_DISK) ? WIN_IDENTIFY : WIN_PIDENTIFY ;
 
 	return ide_wait_taskfile(drive, &taskfile, &hobfile, buf);
 }
@@ -346,9 +346,9 @@
 int proc_ide_read_capacity
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
-	ide_drive_t	*drive = data;
-	ide_driver_t    *driver = drive->driver;
-	int		len;
+	ide_drive_t *drive = data;
+	struct ata_operations *driver = drive->driver;
+	int len;
 
 	if (!driver)
 		len = sprintf(page, "(none)\n");
@@ -381,58 +381,31 @@
 	PROC_IDE_READ_RETURN(page,start,off,count,eof,len);
 }
 
-static int proc_ide_read_driver
-	(char *page, char **start, off_t off, int count, int *eof, void *data)
-{
-	ide_drive_t	*drive = (ide_drive_t *) data;
-	ide_driver_t	*driver = drive->driver;
-	int		len;
-
-	if (!driver)
-		len = sprintf(page, "(none)\n");
-	else
-		len = sprintf(page, "%s\n", driver->name);
-	PROC_IDE_READ_RETURN(page,start,off,count,eof,len);
-}
-
-static int proc_ide_write_driver
-	(struct file *file, const char *buffer, unsigned long count, void *data)
-{
-	ide_drive_t	*drive = data;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
-	if (ide_replace_subdriver(drive, buffer))
-		return -EINVAL;
-	return count;
-}
-
 static int proc_ide_read_media
 	(char *page, char **start, off_t off, int count, int *eof, void *data)
 {
 	ide_drive_t	*drive = data;
-	const char	*media;
+	const char	*type;
 	int		len;
 
-	switch (drive->media) {
-		case ide_disk:	media = "disk\n";
+	switch (drive->type) {
+		case ATA_DISK:	type = "disk\n";
 				break;
-		case ide_cdrom:	media = "cdrom\n";
+		case ATA_ROM:	type = "cdrom\n";
 				break;
-		case ide_tape:	media = "tape\n";
+		case ATA_TAPE:	type = "tape\n";
 				break;
-		case ide_floppy:media = "floppy\n";
+		case ATA_FLOPPY:type = "floppy\n";
 				break;
-		default:	media = "UNKNOWN\n";
+		default:	type = "UNKNOWN\n";
 				break;
 	}
-	strcpy(page,media);
-	len = strlen(media);
+	strcpy(page,type);
+	len = strlen(type);
 	PROC_IDE_READ_RETURN(page,start,off,count,eof,len);
 }
 
 static ide_proc_entry_t generic_drive_entries[] = {
-	{ "driver",	S_IFREG|S_IRUGO,	proc_ide_read_driver,	proc_ide_write_driver },
 	{ "identify",	S_IFREG|S_IRUSR,	proc_ide_read_identify,	NULL },
 	{ "media",	S_IFREG|S_IRUGO,	proc_ide_read_media,	NULL },
 	{ "model",	S_IFREG|S_IRUGO,	proc_ide_read_dmodel,	NULL },
@@ -476,7 +449,7 @@
 
 	for (d = 0; d < MAX_DRIVES; d++) {
 		ide_drive_t *drive = &hwif->drives[d];
-		ide_driver_t *driver = drive->driver;
+		struct ata_operations *driver = drive->driver;
 
 		if (!drive->present)
 			continue;
@@ -497,35 +470,9 @@
 	}
 }
 
-void recreate_proc_ide_device(ide_hwif_t *hwif, ide_drive_t *drive)
-{
-	struct proc_dir_entry *ent;
-	struct proc_dir_entry *parent = hwif->proc;
-	char name[64];
-
-	if (drive->present && !drive->proc) {
-		drive->proc = proc_mkdir(drive->name, parent);
-		if (drive->proc)
-			ide_add_proc_entries(drive->proc, generic_drive_entries, drive);
-
-/*
- * assume that we have these already, however, should test FIXME!
- * if (driver) {
- *      ide_add_proc_entries(drive->proc, generic_subdriver_entries, drive);
- *      ide_add_proc_entries(drive->proc, driver->proc, drive);
- * }
- *
- */
-		sprintf(name,"ide%d/%s", (drive->name[2]-'a')/2, drive->name);
-		ent = proc_symlink(drive->name, proc_ide_root, name);
-		if (!ent)
-			return;
-	}
-}
-
-void destroy_proc_ide_device(ide_hwif_t *hwif, ide_drive_t *drive)
+static void destroy_proc_ide_device(ide_hwif_t *hwif, ide_drive_t *drive)
 {
-	ide_driver_t *driver = drive->driver;
+	struct ata_operations *driver = drive->driver;
 
 	if (drive->proc) {
 		if (driver)
diff -ur linux-2.5.5/drivers/ide/ide-tape.c linux/drivers/ide/ide-tape.c
--- linux-2.5.5/drivers/ide/ide-tape.c	Sun Feb 24 16:32:45 2002
+++ linux/drivers/ide/ide-tape.c	Tue Feb 26 01:42:59 2002
@@ -6098,8 +6098,11 @@
 	}
 	idetape_chrdevs[minor].drive = NULL;
 	restore_flags (flags);	/* all CPUs (overkill?) */
-	DRIVER(drive)->busy = 0;
-	(void) ide_unregister_subdriver (drive);
+
+	/* FIXME: this appears to be totally wrong! */
+	ata_ops(drive)->busy = 0;
+
+	ide_unregister_subdriver (drive);
 	drive->driver_data = NULL;
 	devfs_unregister (tape->de_r);
 	devfs_unregister (tape->de_n);
@@ -6137,17 +6140,13 @@
 
 #endif
 
-int idetape_reinit(ide_drive_t *drive);
+static int idetape_reinit(ide_drive_t *drive);
 
 /*
  *	IDE subdriver functions, registered with ide.c
  */
-static ide_driver_t idetape_driver = {
-	name:			"ide-tape",
-	media:			ide_tape,
-	busy:			1,
-	supports_dma:		1,
-	supports_dsc_overlap: 	1,
+static struct ata_operations idetape_driver = {
+	owner:			THIS_MODULE,
 	cleanup:		idetape_cleanup,
 	standby:		NULL,
 	flushcache:		NULL,
@@ -6176,90 +6175,10 @@
 	release:	idetape_chrdev_release,
 };
 
-int idetape_reinit (ide_drive_t *drive)
+/* This will propably just go entierly away... */
+static int idetape_reinit (ide_drive_t *drive)
 {
-#if 0
-	idetape_tape_t *tape;
-	int minor, failed = 0, supported = 0;
-/* DRIVER(drive)->busy++; */
-	MOD_INC_USE_COUNT;
-#if ONSTREAM_DEBUG
-        printk(KERN_INFO "ide-tape: MOD_INC_USE_COUNT in idetape_init\n");
-#endif
-	if (!idetape_chrdev_present)
-		for (minor = 0; minor < MAX_HWIFS * MAX_DRIVES; minor++ )
-			idetape_chrdevs[minor].drive = NULL;
-
-	if ((drive = ide_scan_devices (ide_tape, idetape_driver.name, NULL, failed++)) == NULL) {
-		revalidate_drives();
-		MOD_DEC_USE_COUNT;
-#if ONSTREAM_DEBUG
-		printk(KERN_INFO "ide-tape: MOD_DEC_USE_COUNT in idetape_init\n");
-#endif
-		return 0;
-	}
-	if (!idetape_chrdev_present &&
-	    devfs_register_chrdev (IDETAPE_MAJOR, "ht", &idetape_fops)) {
-		printk (KERN_ERR "ide-tape: Failed to register character device interface\n");
-		MOD_DEC_USE_COUNT;
-#if ONSTREAM_DEBUG
-		printk(KERN_INFO "ide-tape: MOD_DEC_USE_COUNT in idetape_init\n");
-#endif
-		return -EBUSY;
-	}
-	do {
-		if (!idetape_identify_device (drive, drive->id)) {
-			printk (KERN_ERR "ide-tape: %s: not supported by this version of ide-tape\n", drive->name);
-			continue;
-		}
-		if (drive->scsi) {
-			if (strstr(drive->id->model, "OnStream DI-30")) {
-				printk("ide-tape: ide-scsi emulation is not supported for %s.\n", drive->id->model);
-			} else {
-				printk("ide-tape: passing drive %s to ide-scsi emulation.\n", drive->name);
-				continue;
-			}
-		}
-		tape = (idetape_tape_t *) kmalloc (sizeof (idetape_tape_t), GFP_KERNEL);
-		if (tape == NULL) {
-			printk (KERN_ERR "ide-tape: %s: Can't allocate a tape structure\n", drive->name);
-			continue;
-		}
-		if (ide_register_subdriver (drive, &idetape_driver)) {
-			printk (KERN_ERR "ide-tape: %s: Failed to register the driver with ide.c\n", drive->name);
-			kfree (tape);
-			continue;
-		}
-		for (minor = 0; idetape_chrdevs[minor].drive != NULL; minor++);
-		idetape_setup (drive, tape, minor);
-		idetape_chrdevs[minor].drive = drive;
-		tape->de_r =
-		    devfs_register (drive->de, "mt", DEVFS_FL_DEFAULT,
-				    HWIF(drive)->major, minor,
-				    S_IFCHR | S_IRUGO | S_IWUGO,
-				    &idetape_fops, NULL);
-		tape->de_n =
-		    devfs_register (drive->de, "mtn", DEVFS_FL_DEFAULT,
-				    HWIF(drive)->major, minor + 128,
-				    S_IFCHR | S_IRUGO | S_IWUGO,
-				    &idetape_fops, NULL);
-		devfs_register_tape (tape->de_r);
-		supported++; failed--;
-	} while ((drive = ide_scan_devices (ide_tape, idetape_driver.name, NULL, failed++)) != NULL);
-	if (!idetape_chrdev_present && !supported) {
-		devfs_unregister_chrdev (IDETAPE_MAJOR, "ht");
-	} else
-		idetape_chrdev_present = 1;
-	revalidate_drives();
-	MOD_DEC_USE_COUNT;
-#if ONSTREAM_DEBUG
-	printk(KERN_INFO "ide-tape: MOD_DEC_USE_COUNT in idetape_init\n");
-#endif
-
-	return 0;
-#else
 	return 1;
-#endif
 }
 
 MODULE_DESCRIPTION("ATAPI Streaming TAPE Driver");
@@ -6294,7 +6213,7 @@
 		for (minor = 0; minor < MAX_HWIFS * MAX_DRIVES; minor++ )
 			idetape_chrdevs[minor].drive = NULL;
 
-	if ((drive = ide_scan_devices (ide_tape, idetape_driver.name, NULL, failed++)) == NULL) {
+	if ((drive = ide_scan_devices(ATA_TAPE, "ide-tape", NULL, failed++)) == NULL) {
 		revalidate_drives();
 		MOD_DEC_USE_COUNT;
 #if ONSTREAM_DEBUG
@@ -6349,7 +6268,7 @@
 				    &idetape_fops, NULL);
 		devfs_register_tape (tape->de_r);
 		supported++; failed--;
-	} while ((drive = ide_scan_devices (ide_tape, idetape_driver.name, NULL, failed++)) != NULL);
+	} while ((drive = ide_scan_devices(ATA_TAPE, "ide-tape", NULL, failed++)) != NULL);
 	if (!idetape_chrdev_present && !supported) {
 		devfs_unregister_chrdev (IDETAPE_MAJOR, "ht");
 	} else
diff -ur linux-2.5.5/drivers/ide/ide-taskfile.c linux/drivers/ide/ide-taskfile.c
--- linux-2.5.5/drivers/ide/ide-taskfile.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/ide-taskfile.c	Tue Feb 26 00:31:26 2002
@@ -648,7 +648,7 @@
 
 	if (rq->errors >= ERROR_MAX) {
 		if (drive->driver != NULL)
-			DRIVER(drive)->end_request(0, HWGROUP(drive));
+			ata_ops(drive)->end_request(0, HWGROUP(drive));
 		else
 			ide_end_request(drive, 0);
 	} else {
diff -ur linux-2.5.5/drivers/ide/ide.c linux/drivers/ide/ide.c
--- linux-2.5.5/drivers/ide/ide.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/ide.c	Tue Feb 26 01:15:47 2002
@@ -1,6 +1,4 @@
 /*
- *  linux/drivers/ide/ide.c		Version 6.31	June 9, 2000
- *
  *  Copyright (C) 1994-1998  Linus Torvalds & authors (see below)
  */
 
@@ -114,17 +112,12 @@
  *			Native ATA-100 support
  *			Prep for Cascades Project
  * Version 6.32		4GB highmem support for DMA, and mapping of those for
- * 			PIO transfer (Jens Axboe)
+ *			PIO transfer (Jens Axboe)
  *
  *  Some additional driver compile-time options are in ./include/linux/ide.h
- *
- *  To do, in likely order of completion:
- *	- modify kernel to obtain BIOS geometry for drives on 2nd/3rd/4th i/f
- *
  */
 
-#define	REVISION	"Revision: 6.32"
-#define	VERSION		"Id: ide.c 6.32 2001/05/24"
+#define	VERSION	"7.0.0"
 
 #undef REALLY_SLOW_IO		/* most systems can safely undef this */
 
@@ -254,8 +247,6 @@
 /* default maximum number of failures */
 #define IDE_DEFAULT_MAX_FAILURES	1
 
-static const byte ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR, IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR };
-
 static int	idebus_parameter; /* holds the "idebus=" parameter */
 int system_bus_speed; /* holds what we think is VESA/PCI bus speed */
 static int	initializing;     /* set while initializing built-in drivers */
@@ -418,6 +409,11 @@
  */
 static void init_hwif_data (unsigned int index)
 {
+	static const byte ide_major[] = {
+		IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR,
+		IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR
+	};
+
 	unsigned int unit;
 	hw_regs_t hw;
 	ide_hwif_t *hwif = &ide_hwifs[index];
@@ -436,16 +432,13 @@
 	if (hwif->io_ports[IDE_DATA_OFFSET] == HD_DATA)
 		hwif->noprobe = 1; /* may be overridden by ide_setup() */
 #endif /* CONFIG_BLK_DEV_HD */
-	hwif->major	= ide_hwif_to_major[index];
-	hwif->name[0]	= 'i';
-	hwif->name[1]	= 'd';
-	hwif->name[2]	= 'e';
-	hwif->name[3]	= '0' + index;
+	hwif->major = ide_major[index];
+	sprintf(hwif->name, "ide%d", index);
 	hwif->bus_state = BUSSTATE_ON;
 	for (unit = 0; unit < MAX_DRIVES; ++unit) {
 		ide_drive_t *drive = &hwif->drives[unit];
 
-		drive->media			= ide_disk;
+		drive->type			= ATA_DISK;
 		drive->select.all		= (unit<<4)|0xa0;
 		drive->hwif			= hwif;
 		drive->ctl			= 0x08;
@@ -453,9 +446,7 @@
 		drive->bad_wstat		= BAD_W_STAT;
 		drive->special.b.recalibrate	= 1;
 		drive->special.b.set_geometry	= 1;
-		drive->name[0]			= 'h';
-		drive->name[1]			= 'd';
-		drive->name[2]			= 'a' + (index * MAX_DRIVES) + unit;
+		sprintf(drive->name, "hd%c", 'a' + (index * MAX_DRIVES) + unit);
 		drive->max_failures		= IDE_DEFAULT_MAX_FAILURES;
 		init_waitqueue_head(&drive->wqueue);
 	}
@@ -591,19 +582,18 @@
 }
 
 /*
- * current_capacity() returns the capacity (in sectors) of a drive
- * according to its current geometry/LBA settings.
+ * The capacity of a drive according to its current geometry/LBA settings in
+ * sectors.
  */
 unsigned long current_capacity (ide_drive_t *drive)
 {
-	if (!drive->present)
+	if (!drive->present || !drive->driver)
 		return 0;
-	if (drive->driver != NULL)
-		return DRIVER(drive)->capacity(drive);
-	return 0;
+	return ata_ops(drive)->capacity(drive);
 }
 
 extern struct block_device_operations ide_fops[];
+
 /*
  * ide_geninit() is called exactly *once* for each interface.
  */
@@ -617,7 +607,7 @@
 
 		if (!drive->present)
 			continue;
-		if (drive->media!=ide_disk && drive->media!=ide_floppy)
+		if (drive->type != ATA_DISK && drive->type != ATA_FLOPPY)
 			continue;
 		register_disk(gd,mk_kdev(hwif->major,unit<<PARTN_BITS),
 #ifdef CONFIG_BLK_DEV_ISAPNP
@@ -728,7 +718,7 @@
 static void pre_reset (ide_drive_t *drive)
 {
 	if (drive->driver != NULL)
-		DRIVER(drive)->pre_reset(drive);
+		ata_ops(drive)->pre_reset(drive);
 
 	if (!drive->keep_settings) {
 		if (drive->using_dma) {
@@ -769,7 +759,7 @@
 	__cli();		/* local CPU only */
 
 	/* For an ATAPI device, first try an ATAPI SRST. */
-	if (drive->media != ide_disk && !do_not_try_atapi) {
+	if (drive->type != ATA_DISK && !do_not_try_atapi) {
 		pre_reset(drive);
 		SELECT_DRIVE(hwif,drive);
 		udelay (20);
@@ -938,7 +928,7 @@
 		err = GET_ERR();
 		printk("%s: %s: error=0x%02x", drive->name, msg, err);
 #if FANCY_STATUS_DUMPS
-		if (drive->media == ide_disk) {
+		if (drive->type == ATA_DISK) {
 			printk(" { ");
 			if (err & ABRT_ERR)	printk("DriveStatusError ");
 			if (err & ICRC_ERR)	printk("%s", (err & ABRT_ERR) ? "BadCRC " : "BadSector ");
@@ -997,7 +987,7 @@
 {
 	int i = (drive->mult_count ? drive->mult_count : 1) * SECTOR_WORDS;
 
-	if (drive->media != ide_disk)
+	if (drive->type != ATA_DISK)
 		return;
 	while (i > 0) {
 		u32 buffer[16];
@@ -1033,7 +1023,7 @@
 	if (stat & BUSY_STAT || ((stat & WRERR_STAT) && !drive->nowerr)) { /* other bits are useless when BUSY */
 		rq->errors |= ERROR_RESET;
 	} else {
-		if (drive->media == ide_disk && (stat & ERR_STAT)) {
+		if (drive->type == ATA_DISK && (stat & ERR_STAT)) {
 			/* err has different meaning on cdrom and tape */
 			if (err == ABRT_ERR) {
 				if (drive->select.b.lba && IN_BYTE(IDE_COMMAND_REG) == WIN_SPECIFY)
@@ -1053,8 +1043,9 @@
 		OUT_BYTE(WIN_IDLEIMMEDIATE,IDE_COMMAND_REG);	/* force an abort */
 
 	if (rq->errors >= ERROR_MAX) {
+		/* ATA-PATTERN */
 		if (drive->driver != NULL)
-			DRIVER(drive)->end_request(drive, 0);
+			ata_ops(drive)->end_request(drive, 0);
 		else
 			ide_end_request(drive, 0);
 	} else {
@@ -1126,7 +1117,7 @@
 		if (tuneproc != NULL)
 			tuneproc(drive, drive->tune_req);
 	} else if (drive->driver != NULL) {
-		return DRIVER(drive)->special(drive);
+		return ata_ops(drive)->special(drive);
 	} else if (s->all) {
 		printk("%s: bad special flag: 0x%02x\n", drive->name, s->all);
 		s->all = 0;
@@ -1324,8 +1315,8 @@
 	block    = rq->sector;
 
 	/* Strange disk manager remap */
-	if ((rq->flags & REQ_CMD) && 
-	    (drive->media == ide_disk || drive->media == ide_floppy)) {
+	if ((rq->flags & REQ_CMD) &&
+	    (drive->type == ATA_DISK || drive->type == ATA_FLOPPY)) {
 		block += drive->sect0;
 	}
 	/* Yecch - this will shift the entire interval,
@@ -1340,7 +1331,7 @@
 	SELECT_DRIVE(hwif, drive);
 	if (ide_wait_stat(&startstop, drive, drive->ready_stat,
 			  BUSY_STAT|DRQ_STAT, WAIT_READY)) {
-		printk("%s: drive not ready for command\n", drive->name);
+		printk(KERN_WARNING "%s: drive not ready for command\n", drive->name);
 		return startstop;
 	}
 	if (!drive->special.all) {
@@ -1348,16 +1339,16 @@
 			return execute_drive_cmd(drive, rq);
 
 		if (drive->driver != NULL) {
-			return (DRIVER(drive)->do_request(drive, rq, block));
+			return ata_ops(drive)->do_request(drive, rq, block);
 		}
-		printk("%s: media type %d not supported\n",
-		       drive->name, drive->media);
+		printk(KERN_WARNING "%s: device type %d not supported\n",
+		       drive->name, drive->type);
 		goto kill_rq;
 	}
 	return do_special(drive);
 kill_rq:
 	if (drive->driver != NULL)
-		DRIVER(drive)->end_request(drive, 0);
+		ata_ops(drive)->end_request(drive, 0);
 	else
 		ide_end_request(drive, 0);
 	return ide_stopped;
@@ -1987,8 +1978,8 @@
 	if (res)
 		goto leave;
 
-	if (DRIVER(drive)->revalidate)
-		DRIVER(drive)->revalidate(drive);
+	if (ata_ops(drive)->revalidate)
+		ata_ops(drive)->revalidate(drive);
 
  leave:
 	drive->busy = 0;
@@ -2014,7 +2005,7 @@
 			if (drive->revalidate) {
 				drive->revalidate = 0;
 				if (!initializing)
-					(void) ide_revalidate_disk(mk_kdev(hwif->major, unit<<PARTN_BITS));
+					ide_revalidate_disk(mk_kdev(hwif->major, unit<<PARTN_BITS));
 			}
 		}
 	}
@@ -2047,27 +2038,44 @@
 		return -ENXIO;
 	if (drive->driver == NULL)
 		ide_driver_module();
+
+	/* Request a particular device type module.
+	 *
+	 * FIXME: The function which should rather requests the drivers is
+	 * ide_driver_module(), since it seems illogical and even a bit
+	 * dangerous to delay this until open time!
+	 */
+
 #ifdef CONFIG_KMOD
 	if (drive->driver == NULL) {
-		if (drive->media == ide_disk)
-			(void) request_module("ide-disk");
-		if (drive->media == ide_cdrom)
-			(void) request_module("ide-cd");
-		if (drive->media == ide_tape)
-			(void) request_module("ide-tape");
-		if (drive->media == ide_floppy)
-			(void) request_module("ide-floppy");
+		switch (drive->type) {
+			case ATA_DISK:
+				request_module("ide-disk");
+				break;
+			case ATA_ROM:
+				request_module("ide-cd");
+				break;
+			case ATA_TAPE:
+				request_module("ide-tape");
+				break;
+			case ATA_FLOPPY:
+				request_module("ide-floppy");
+				break;
 #if defined(CONFIG_BLK_DEV_IDESCSI) && defined(CONFIG_SCSI)
-		if (drive->media == ide_scsi)
-			(void) request_module("ide-scsi");
-#endif /* defined(CONFIG_BLK_DEV_IDESCSI) && defined(CONFIG_SCSI) */
+			case ATA_SCSI:
+				request_module("ide-scsi");
+				break;
+#endif
+			default:
+				/* nothing to be done about it */ ;
+		}
 	}
 #endif /* CONFIG_KMOD */
 	while (drive->busy)
 		sleep_on(&drive->wqueue);
 	drive->usage++;
 	if (drive->driver != NULL)
-		return DRIVER(drive)->open(inode, filp, drive);
+		return ata_ops(drive)->open(inode, filp, drive);
 	printk ("%s: driver not present\n", drive->name);
 	drive->usage--;
 	return -ENXIO;
@@ -2084,27 +2092,11 @@
 	if ((drive = get_info_ptr(inode->i_rdev)) != NULL) {
 		drive->usage--;
 		if (drive->driver != NULL)
-			DRIVER(drive)->release(inode, file, drive);
+			ata_ops(drive)->release(inode, file, drive);
 	}
 	return 0;
 }
 
-int ide_replace_subdriver (ide_drive_t *drive, const char *driver)
-{
-	if (!drive->present || drive->busy || drive->usage)
-		goto abort;
-	if (drive->driver != NULL && DRIVER(drive)->cleanup(drive))
-		goto abort;
-	strncpy(drive->driver_req, driver, 9);
-	ide_driver_module();
-	drive->driver_req[0] = 0;
-	ide_driver_module();
-	if (DRIVER(drive) && !strcmp(DRIVER(drive)->name, driver))
-		return 0;
-abort:
-	return 1;
-}
-
 #ifdef CONFIG_PROC_FS
 ide_proc_entry_t generic_subdriver_entries[] = {
 	{ "capacity",	S_IFREG|S_IRUGO,	proc_ide_read_capacity,	NULL },
@@ -2172,7 +2164,7 @@
 			continue;
 		if (drive->busy || drive->usage)
 			goto abort;
-		if (drive->driver != NULL && DRIVER(drive)->cleanup(drive))
+		if (drive->driver != NULL && ata_ops(drive)->cleanup(drive))
 			goto abort;
 	}
 	hwif->present = 0;
@@ -2306,7 +2298,6 @@
 	hwif->pci_dev		= old_hwif.pci_dev;
 #endif /* CONFIG_BLK_DEV_IDEPCI */
 	hwif->straight8		= old_hwif.straight8;
-	hwif->hwif_data		= old_hwif.hwif_data;
 abort:
 	restore_flags(flags);	/* all CPUs */
 }
@@ -2562,7 +2553,7 @@
 
 static int set_using_dma (ide_drive_t *drive, int arg)
 {
-	if (!drive->driver || !DRIVER(drive)->supports_dma)
+	if (!drive->driver)
 		return -EPERM;
 	if (!drive->id || !(drive->id->capability & 1) || !HWIF(drive)->dmaproc)
 		return -EPERM;
@@ -2690,7 +2681,9 @@
 		{
 			struct hd_geometry *loc = (struct hd_geometry *) arg;
 			unsigned short bios_cyl = drive->bios_cyl; /* truncate */
-			if (!loc || (drive->media != ide_disk && drive->media != ide_floppy)) return -EINVAL;
+
+			if (!loc || (drive->type != ATA_DISK && drive->type != ATA_FLOPPY))
+				return -EINVAL;
 			if (put_user(drive->bios_head, (byte *) &loc->heads)) return -EFAULT;
 			if (put_user(drive->bios_sect, (byte *) &loc->sectors)) return -EFAULT;
 			if (put_user(bios_cyl, (unsigned short *) &loc->cylinders)) return -EFAULT;
@@ -2702,7 +2695,9 @@
 		case HDIO_GETGEO_BIG:
 		{
 			struct hd_big_geometry *loc = (struct hd_big_geometry *) arg;
-			if (!loc || (drive->media != ide_disk && drive->media != ide_floppy)) return -EINVAL;
+
+			if (!loc || (drive->type != ATA_DISK && drive->type != ATA_FLOPPY))
+				return -EINVAL;
 
 			if (put_user(drive->bios_head, (byte *) &loc->heads)) return -EFAULT;
 			if (put_user(drive->bios_sect, (byte *) &loc->sectors)) return -EFAULT;
@@ -2715,7 +2710,8 @@
 		case HDIO_GETGEO_BIG_RAW:
 		{
 			struct hd_big_geometry *loc = (struct hd_big_geometry *) arg;
-			if (!loc || (drive->media != ide_disk && drive->media != ide_floppy)) return -EINVAL;
+			if (!loc || (drive->type != ATA_DISK && drive->type != ATA_FLOPPY))
+				return -EINVAL;
 			if (put_user(drive->head, (byte *) &loc->heads)) return -EFAULT;
 			if (put_user(drive->sect, (byte *) &loc->sectors)) return -EFAULT;
 			if (put_user(drive->cyl, (unsigned int *) &loc->cylinders)) return -EFAULT;
@@ -2750,15 +2746,15 @@
 		case HDIO_DRIVE_TASKFILE:
 		        if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO))
 				return -EACCES;
-			switch(drive->media) {
-				case ide_disk:
+			switch(drive->type) {
+				case ATA_DISK:
 					return ide_taskfile_ioctl(drive, inode, file, cmd, arg);
 #ifdef CONFIG_PKT_TASK_IOCTL
-				case ide_cdrom:
-				case ide_tape:
-				case ide_floppy:
+				case ATA_CDROM:
+				case ATA_TAPE:
+				case ATA_FLOPPY:
 					return pkt_taskfile_ioctl(drive, inode, file, cmd, arg);
-#endif /* CONFIG_PKT_TASK_IOCTL */
+#endif
 				default:
 					return -ENOMSG;
 			}
@@ -2791,12 +2787,11 @@
 			return 0;
 		case HDIO_SET_NICE:
 			if (!capable(CAP_SYS_ADMIN)) return -EACCES;
-			if (drive->driver == NULL)
-				return -EPERM;
 			if (arg != (arg & ((1 << IDE_NICE_DSC_OVERLAP) | (1 << IDE_NICE_1))))
 				return -EPERM;
 			drive->dsc_overlap = (arg >> IDE_NICE_DSC_OVERLAP) & 1;
-			if (drive->dsc_overlap && !DRIVER(drive)->supports_dsc_overlap) {
+			/* Only CD-ROM's and tapes support DSC overlap. */
+			if (drive->dsc_overlap && !(drive->type == ATA_ROM || drive->type == ATA_TAPE)) {
 				drive->dsc_overlap = 0;
 				return -EPERM;
 			}
@@ -2872,7 +2867,7 @@
 
 		default:
 			if (drive->driver != NULL)
-				return DRIVER(drive)->ioctl(drive, inode, file, cmd, arg);
+				return ata_ops(drive)->ioctl(drive, inode, file, cmd, arg);
 			return -EPERM;
 	}
 }
@@ -2884,7 +2879,7 @@
 	if ((drive = get_info_ptr(i_rdev)) == NULL)
 		return -ENODEV;
 	if (drive->driver != NULL)
-		return DRIVER(drive)->media_change(drive);
+		return ata_ops(drive)->media_change(drive);
 	return 0;
 }
 
@@ -3068,7 +3063,6 @@
 	const char max_drive = 'a' + ((MAX_HWIFS * MAX_DRIVES) - 1);
 	const char max_hwif  = '0' + (MAX_HWIFS - 1);
 
-	
 	if (strncmp(s,"hd",2) == 0 && s[2] == '=')	/* hd= is for hd.c   */
 		return 0;				/* driver and not us */
 
@@ -3146,7 +3140,7 @@
 				goto done;
 			case -4: /* "cdrom" */
 				drive->present = 1;
-				drive->media = ide_cdrom;
+				drive->type = ATA_ROM;
 				hwif->noprobe = 0;
 				goto done;
 			case -5: /* "serialize" */
@@ -3183,7 +3177,7 @@
 				goto bad_option;
 #endif /* defined(CONFIG_BLK_DEV_IDESCSI) && defined(CONFIG_SCSI) */
 			case 3: /* cyl,head,sect */
-				drive->media	= ide_disk;
+				drive->type	= ATA_DISK;
 				drive->cyl	= drive->bios_cyl  = vals[0];
 				drive->head	= drive->bios_head = vals[1];
 				drive->sect	= drive->bios_sect = vals[2];
@@ -3485,7 +3479,7 @@
 	}
 #endif /* __mc68000__ || CONFIG_APUS */
 
-	(void) ideprobe_init();
+	ideprobe_init();
 
 #if defined(__mc68000__) || defined(CONFIG_APUS)
 	if (ide_hwifs[0].io_ports[IDE_DATA_OFFSET]) {
@@ -3500,27 +3494,27 @@
 #endif
 
 	/*
-	 * Attempt to match drivers for the available drives
+	 * Initialize all device type driver modules.
 	 */
 #ifdef CONFIG_BLK_DEV_IDEDISK
 	idedisk_init();
-#endif /* CONFIG_BLK_DEV_IDEDISK */
+#endif
 #ifdef CONFIG_BLK_DEV_IDECD
 	ide_cdrom_init();
-#endif /* CONFIG_BLK_DEV_IDECD */
+#endif
 #ifdef CONFIG_BLK_DEV_IDETAPE
 	idetape_init();
-#endif /* CONFIG_BLK_DEV_IDETAPE */
+#endif
 #ifdef CONFIG_BLK_DEV_IDEFLOPPY
 	idefloppy_init();
-#endif /* CONFIG_BLK_DEV_IDEFLOPPY */
+#endif
 #ifdef CONFIG_BLK_DEV_IDESCSI
  #ifdef CONFIG_SCSI
 	idescsi_init();
  #else
     #warning ide scsi-emulation selected but no SCSI-subsystem in kernel
  #endif
-#endif /* CONFIG_BLK_DEV_IDESCSI */
+#endif
 }
 
 static int default_cleanup (ide_drive_t *drive)
@@ -3596,7 +3590,10 @@
 	return 0;
 }
 
-ide_drive_t *ide_scan_devices (byte media, const char *name, ide_driver_t *driver, int n)
+/*
+ * Lookup IDE devices, which requested a particular deriver
+ */
+ide_drive_t *ide_scan_devices(byte type, const char *name, struct ata_operations *driver, int n)
 {
 	unsigned int unit, index, i;
 
@@ -3609,7 +3606,7 @@
 			char *req = drive->driver_req;
 			if (*req && !strstr(name, req))
 				continue;
-			if (drive->present && drive->media == media && drive->driver == driver && ++i > n)
+			if (drive->present && drive->type == type && drive->driver == driver && ++i > n)
 				return drive;
 		}
 	}
@@ -3619,7 +3616,7 @@
 /*
  * This is in fact registering a drive not a driver.
  */
-int ide_register_subdriver (ide_drive_t *drive, ide_driver_t *driver)
+int ide_register_subdriver(ide_drive_t *drive, struct ata_operations *driver)
 {
 	unsigned long flags;
 
@@ -3663,18 +3660,24 @@
 	    driver->driver_reinit = default_driver_reinit;
 
 	restore_flags(flags);		/* all CPUs */
+	/* FIXME: Check what this magic number is supposed to be about? */
 	if (drive->autotune != 2) {
-		if (driver->supports_dma && HWIF(drive)->dmaproc != NULL) {
+		if (HWIF(drive)->dmaproc != NULL) {
+
 			/*
-			 * Force DMAing for the beginning of the check.
-			 * Some chipsets appear to do interesting things,
-			 * if not checked and cleared.
+			 * Force DMAing for the beginning of the check.  Some
+			 * chipsets appear to do interesting things, if not
+			 * checked and cleared.
+			 *
 			 *   PARANOIA!!!
 			 */
-			(void) (HWIF(drive)->dmaproc(ide_dma_off_quietly, drive));
-			(void) (HWIF(drive)->dmaproc(ide_dma_check, drive));
+
+			HWIF(drive)->dmaproc(ide_dma_off_quietly, drive);
+			HWIF(drive)->dmaproc(ide_dma_check, drive);
 		}
-		drive->dsc_overlap = (drive->next != drive && driver->supports_dsc_overlap);
+		/* Only CD-ROMs and tape drives support DSC overlap. */
+		drive->dsc_overlap = (drive->next != drive
+				&& (drive->type == ATA_ROM || drive->type == ATA_TAPE));
 		drive->nice1 = 1;
 	}
 	drive->revalidate = 1;
@@ -3686,13 +3689,13 @@
 	return 0;
 }
 
-int ide_unregister_subdriver (ide_drive_t *drive)
+int ide_unregister_subdriver(ide_drive_t *drive)
 {
 	unsigned long flags;
 
 	save_flags(flags);		/* all CPUs */
 	cli();				/* all CPUs */
-	if (drive->usage || drive->busy || drive->driver == NULL || DRIVER(drive)->busy) {
+	if (drive->usage || drive->busy || drive->driver == NULL || ata_ops(drive)->busy) {
 		restore_flags(flags);	/* all CPUs */
 		return 1;
 	}
@@ -3700,7 +3703,7 @@
 	pnpide_init(0);
 #endif /* CONFIG_BLK_DEV_ISAPNP */
 #ifdef CONFIG_PROC_FS
-	ide_remove_proc_entries(drive->proc, DRIVER(drive)->proc);
+	ide_remove_proc_entries(drive->proc, ata_ops(drive)->proc);
 	ide_remove_proc_entries(drive->proc, generic_subdriver_entries);
 #endif
 	auto_remove_settings(drive);
@@ -3709,6 +3712,29 @@
 	return 0;
 }
 
+
+/*
+ * Register an ATA driver for a particular device type.
+ */
+
+int register_ata_driver(unsigned int type, struct ata_operations *driver)
+{
+	return 0;
+}
+
+EXPORT_SYMBOL(register_ata_driver);
+
+/*
+ * Unregister an ATA driver for a particular device type.
+ */
+
+int unregister_ata_driver(unsigned int type, struct ata_operations *driver)
+{
+	return 0;
+}
+
+EXPORT_SYMBOL(unregister_ata_driver);
+
 struct block_device_operations ide_fops[] = {{
 	owner:			THIS_MODULE,
 	open:			ide_open,
@@ -3731,10 +3757,8 @@
 EXPORT_SYMBOL(drive_is_flashcard);
 EXPORT_SYMBOL(ide_timer_expiry);
 EXPORT_SYMBOL(ide_intr);
-EXPORT_SYMBOL(ide_fops);
 EXPORT_SYMBOL(ide_get_queue);
 EXPORT_SYMBOL(ide_add_generic_settings);
-EXPORT_SYMBOL(ide_devfs_handle);
 EXPORT_SYMBOL(do_ide_request);
 /*
  * Driver module
@@ -3742,7 +3766,6 @@
 EXPORT_SYMBOL(ide_scan_devices);
 EXPORT_SYMBOL(ide_register_subdriver);
 EXPORT_SYMBOL(ide_unregister_subdriver);
-EXPORT_SYMBOL(ide_replace_subdriver);
 EXPORT_SYMBOL(ide_set_handler);
 EXPORT_SYMBOL(ide_dump_status);
 EXPORT_SYMBOL(ide_error);
@@ -3766,9 +3789,6 @@
 EXPORT_SYMBOL(ide_add_proc_entries);
 EXPORT_SYMBOL(ide_remove_proc_entries);
 EXPORT_SYMBOL(proc_ide_read_geometry);
-EXPORT_SYMBOL(create_proc_ide_interfaces);
-EXPORT_SYMBOL(recreate_proc_ide_device);
-EXPORT_SYMBOL(destroy_proc_ide_device);
 #endif
 EXPORT_SYMBOL(ide_add_setting);
 EXPORT_SYMBOL(ide_remove_setting);
@@ -3809,10 +3829,10 @@
 			/* set the drive to standby */
 			printk("%s ", drive->name);
 			if (event != SYS_RESTART)
-				if (drive->driver != NULL && DRIVER(drive)->standby(drive))
-				continue;
+				if (drive->driver != NULL && ata_ops(drive)->standby(drive))
+					continue;
 
-			if (drive->driver != NULL && DRIVER(drive)->cleanup(drive))
+			if (drive->driver != NULL && ata_ops(drive)->cleanup(drive))
 				continue;
 		}
 	}
@@ -3827,13 +3847,14 @@
 };
 
 /*
- * This is gets invoked once during initialization, to set *everything* up
+ * This is the global initialization entry point.
  */
-int __init ide_init(void)
+static int __init ata_module_init(void)
 {
 	int i;
 
-	printk(KERN_INFO "Uniform Multi-Platform E-IDE driver " REVISION "\n");
+	printk(KERN_INFO "Uniform Multi-Platform E-IDE driver ver.:" VERSION "\n");
+
 	ide_devfs_handle = devfs_mk_dir (NULL, "ide", NULL);
 
 	/* Initialize system bus speed.
@@ -3859,7 +3880,9 @@
 	init_ide_data ();
 
 	initializing = 1;
+
 	ide_init_builtin_drivers();
+
 	initializing = 0;
 
 	for (i = 0; i < MAX_HWIFS; ++i) {
@@ -3872,8 +3895,7 @@
 	return 0;
 }
 
-#ifdef MODULE
-char *options = NULL;
+static char *options = NULL;
 MODULE_PARM(options,"s");
 MODULE_LICENSE("GPL");
 
@@ -3884,40 +3906,44 @@
 	if (line == NULL || !*line)
 		return;
 	while ((line = next) != NULL) {
- 		if ((next = strchr(line,' ')) != NULL)
+		if ((next = strchr(line,' ')) != NULL)
 			*next++ = 0;
 		if (!ide_setup(line))
 			printk ("Unknown option '%s'\n", line);
 	}
 }
 
-int init_module (void)
+static int __init init_ata (void)
 {
 	parse_options(options);
-	return ide_init();
+	return ata_module_init();
 }
 
-void cleanup_module (void)
+static void __exit cleanup_ata (void)
 {
 	int index;
 
 	unregister_reboot_notifier(&ide_notifier);
 	for (index = 0; index < MAX_HWIFS; ++index) {
 		ide_unregister(index);
-#if defined(CONFIG_BLK_DEV_IDEDMA) && !defined(CONFIG_DMA_NONPCI)
+# if defined(CONFIG_BLK_DEV_IDEDMA) && !defined(CONFIG_DMA_NONPCI)
 		if (ide_hwifs[index].dma_base)
 			(void) ide_release_dma(&ide_hwifs[index]);
-#endif /* (CONFIG_BLK_DEV_IDEDMA) && !(CONFIG_DMA_NONPCI) */
+# endif /* (CONFIG_BLK_DEV_IDEDMA) && !(CONFIG_DMA_NONPCI) */
 	}
 
-#ifdef CONFIG_PROC_FS
+# ifdef CONFIG_PROC_FS
 	proc_ide_destroy();
-#endif
+# endif
 	devfs_unregister (ide_devfs_handle);
 }
 
-#else /* !MODULE */
+module_init(init_ata);
+module_exit(cleanup_ata);
 
+#ifndef MODULE
+
+/* command line option parser */
 __setup("", ide_setup);
 
-#endif /* MODULE */
+#endif
diff -ur linux-2.5.5/drivers/ide/ns87415.c linux/drivers/ide/ns87415.c
--- linux-2.5.5/drivers/ide/ns87415.c	Wed Feb 20 03:11:00 2002
+++ linux/drivers/ide/ns87415.c	Tue Feb 26 01:30:43 2002
@@ -103,7 +103,7 @@
 			ns87415_prepare_drive(drive, 0);	/* DMA failed: select PIO xfer */
 			return 1;
 		case ide_dma_check:
-			if (drive->media != ide_disk)
+			if (drive->type != ATA_DISK)
 				return ide_dmaproc(ide_dma_off_quietly, drive);
 			/* Fallthrough... */
 		default:
diff -ur linux-2.5.5/drivers/ide/pdc202xx.c linux/drivers/ide/pdc202xx.c
--- linux-2.5.5/drivers/ide/pdc202xx.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/pdc202xx.c	Tue Feb 26 01:32:17 2002
@@ -63,7 +63,6 @@
 
 static int pdc202xx_get_info(char *, char **, off_t, int);
 extern int (*pdc202xx_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 char *pdc202xx_pio_verbose (u32 drive_pci)
@@ -412,7 +411,8 @@
 		default: return -1;
 	}
 
-	if ((drive->media != ide_disk) && (speed < XFER_SW_DMA_0))	return -1;
+	if ((drive->type != ATA_DISK) && (speed < XFER_SW_DMA_0))
+		return -1;
 
 	pci_read_config_dword(dev, drive_pci, &drive_conf);
 	pci_read_config_byte(dev, (drive_pci), &AP);
@@ -820,7 +820,8 @@
 	}
 
 	if (jumpbit) {
-		if (drive->media != ide_disk)	return ide_dma_off_quietly;
+		if (drive->type != ATA_DISK)
+			return ide_dma_off_quietly;
 		if (id->capability & 4) {	/* IORDY_EN & PREFETCH_EN */
 			OUT_BYTE((iordy + adj), indexreg);
 			OUT_BYTE((IN_BYTE(datareg)|0x03), datareg);
@@ -869,13 +870,14 @@
 
 chipset_is_set:
 
-	if (drive->media != ide_disk)	return ide_dma_off_quietly;
+	if (drive->type != ATA_DISK)
+		return ide_dma_off_quietly;
 
 	pci_read_config_byte(dev, (drive_pci), &AP);
 	if (id->capability & 4)	/* IORDY_EN */
 		pci_write_config_byte(dev, (drive_pci), AP|IORDY_EN);
 	pci_read_config_byte(dev, (drive_pci), &AP);
-	if (drive->media == ide_disk)	/* PREFETCH_EN */
+	if (drive->type == ATA_DISK)	/* PREFETCH_EN */
 		pci_write_config_byte(dev, (drive_pci), AP|PREFETCH_EN);
 
 jumpbit_is_set:
diff -ur linux-2.5.5/drivers/ide/pdcadma.c linux/drivers/ide/pdcadma.c
--- linux-2.5.5/drivers/ide/pdcadma.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/pdcadma.c	Mon Feb 25 01:54:02 2002
@@ -34,7 +34,6 @@
 
 static int pdcadma_get_info(char *, char **, off_t, int);
 extern int (*pdcadma_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int pdcadma_get_info (char *buffer, char **addr, off_t offset, int count)
diff -ur linux-2.5.5/drivers/ide/piix.c linux/drivers/ide/piix.c
--- linux-2.5.5/drivers/ide/piix.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/piix.c	Mon Feb 25 01:54:22 2002
@@ -77,7 +77,6 @@
 
 static int piix_get_info(char *, char **, off_t, int);
 extern int (*piix_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int piix_get_info (char *buffer, char **addr, off_t offset, int count)
diff -ur linux-2.5.5/drivers/ide/qd65xx.c linux/drivers/ide/qd65xx.c
--- linux-2.5.5/drivers/ide/qd65xx.c	Sun Feb 24 16:32:49 2002
+++ linux/drivers/ide/qd65xx.c	Tue Feb 26 01:32:52 2002
@@ -293,7 +293,7 @@
 		printk(KERN_INFO "%s: PIO mode%d\n",drive->name,pio);
 	}
 
-	if (!HWIF(drive)->channel && drive->media != ide_disk) {
+	if (!HWIF(drive)->channel && drive->type != ATA_DISK) {
 		qd_write_reg(0x5f,QD_CONTROL_PORT);
 		printk(KERN_WARNING "%s: ATAPI: disabled read-ahead FIFO and post-write buffer on %s.\n",drive->name,HWIF(drive)->name);
 	}
diff -ur linux-2.5.5/drivers/ide/serverworks.c linux/drivers/ide/serverworks.c
--- linux-2.5.5/drivers/ide/serverworks.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/serverworks.c	Mon Feb 25 01:52:46 2002
@@ -105,7 +105,6 @@
 
 static int svwks_get_info(char *, char **, off_t, int);
 extern int (*svwks_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 
 static int svwks_get_info (char *buffer, char **addr, off_t offset, int count)
 {
diff -ur linux-2.5.5/drivers/ide/sis5513.c linux/drivers/ide/sis5513.c
--- linux-2.5.5/drivers/ide/sis5513.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/sis5513.c	Tue Feb 26 01:33:19 2002
@@ -238,7 +238,7 @@
 	byte rw_prefetch	= (0x11 << drive->dn);
 
 	pci_read_config_byte(dev, 0x4b, &reg4bh);
-	if (drive->media != ide_disk)
+	if (drive->type != ATA_DISK)
 		return;
 	
 	if ((reg4bh & rw_prefetch) != rw_prefetch)
diff -ur linux-2.5.5/drivers/ide/slc90e66.c linux/drivers/ide/slc90e66.c
--- linux-2.5.5/drivers/ide/slc90e66.c	Sun Feb 24 16:32:54 2002
+++ linux/drivers/ide/slc90e66.c	Mon Feb 25 01:53:00 2002
@@ -60,7 +60,6 @@
 
 static int slc90e66_get_info(char *, char **, off_t, int);
 extern int (*slc90e66_display_info)(char *, char **, off_t, int); /* ide-proc.c */
-extern char *ide_media_verbose(ide_drive_t *);
 static struct pci_dev *bmide_dev;
 
 static int slc90e66_get_info (char *buffer, char **addr, off_t offset, int count)
diff -ur linux-2.5.5/drivers/ide/trm290.c linux/drivers/ide/trm290.c
--- linux-2.5.5/drivers/ide/trm290.c	Wed Feb 20 03:10:57 2002
+++ linux/drivers/ide/trm290.c	Tue Feb 26 01:33:48 2002
@@ -192,7 +192,7 @@
 			outl(hwif->dmatable_dma|reading|writing, hwif->dma_base);
 			drive->waiting_for_dma = 1;
 			outw((count * 2) - 1, hwif->dma_base+2); /* start DMA */
-			if (drive->media != ide_disk)
+			if (drive->type != ATA_DISK)
 				return 0;
 			ide_set_handler(drive, &ide_dma_intr, WAIT_CMD, NULL);
 			OUT_BYTE(reading ? WIN_READDMA : WIN_WRITEDMA, IDE_COMMAND_REG);
diff -ur linux-2.5.5/drivers/scsi/ide-scsi.c linux/drivers/scsi/ide-scsi.c
--- linux-2.5.5/drivers/scsi/ide-scsi.c	Sun Feb 24 16:32:45 2002
+++ linux/drivers/scsi/ide-scsi.c	Tue Feb 26 01:45:37 2002
@@ -182,7 +182,7 @@
 
 	if (!test_bit(PC_TRANSFORM, &pc->flags))
 		return;
-	if (drive->media == ide_cdrom || drive->media == ide_optical) {
+	if (drive->type == ATA_ROM || drive->type == ATA_MOD) {
 		if (c[0] == READ_6 || c[0] == WRITE_6) {
 			c[8] = c[4];		c[5] = c[3];		c[4] = c[2];
 			c[3] = c[1] & 0x1f;	c[2] = 0;		c[1] &= 0xe0;
@@ -221,7 +221,7 @@
 
 	if (!test_bit(PC_TRANSFORM, &pc->flags))
 		return;
-	if (drive->media == ide_cdrom || drive->media == ide_optical) {
+	if (drive->type == ATA_ROM || drive->type == ATA_MOD) {
 		if (pc->c[0] == MODE_SENSE_10 && sc[0] == MODE_SENSE) {
 			scsi_buf[0] = atapi_buf[1];		/* Mode data length */
 			scsi_buf[1] = atapi_buf[2];		/* Medium type */
@@ -508,7 +508,8 @@
  */
 static void idescsi_setup (ide_drive_t *drive, idescsi_scsi_t *scsi, int id)
 {
-	DRIVER(drive)->busy++;
+	ata_ops(drive)->busy++;
+
 	idescsi_drives[id] = drive;
 	drive->driver_data = scsi;
 	drive->ready_stat = 0;
@@ -535,17 +536,13 @@
 	return 0;
 }
 
-int idescsi_reinit(ide_drive_t *drive);
+static int idescsi_reinit(ide_drive_t *drive);
 
 /*
  *	IDE subdriver functions, registered with ide.c
  */
-static ide_driver_t idescsi_driver = {
-	name:			"ide-scsi",
-	media:			ide_scsi,
-	busy:			0,
-	supports_dma:		1,
-	supports_dsc_overlap:	0,
+static struct ata_operations idescsi_driver = {
+	owner:			THIS_MODULE,
 	cleanup:		idescsi_cleanup,
 	standby:		NULL,
 	flushcache:		NULL,
@@ -563,50 +560,20 @@
 	driver_reinit:		idescsi_reinit,
 };
 
-int idescsi_reinit (ide_drive_t *drive)
+static int idescsi_reinit (ide_drive_t *drive)
 {
-#if 0
-	idescsi_scsi_t *scsi;
-	byte media[] = {TYPE_DISK, TYPE_TAPE, TYPE_PROCESSOR, TYPE_WORM, TYPE_ROM, TYPE_SCANNER, TYPE_MOD, 255};
-	int i, failed, id;
-
-	if (!idescsi_initialized)
-		return 0;
-	for (i = 0; i < MAX_HWIFS * MAX_DRIVES; i++)
-		idescsi_drives[i] = NULL;
-
-	MOD_INC_USE_COUNT;
-	for (i = 0; media[i] != 255; i++) {
-		failed = 0;
-		while ((drive = ide_scan_devices (media[i], idescsi_driver.name, NULL, failed++)) != NULL) {
-
-			if ((scsi = (idescsi_scsi_t *) kmalloc (sizeof (idescsi_scsi_t), GFP_KERNEL)) == NULL) {
-				printk (KERN_ERR "ide-scsi: %s: Can't allocate a scsi structure\n", drive->name);
-				continue;
-			}
-			if (ide_register_subdriver (drive, &idescsi_driver)) {
-				printk (KERN_ERR "ide-scsi: %s: Failed to register the driver with ide.c\n", drive->name);
-				kfree (scsi);
-				continue;
-			}
-			for (id = 0; id < MAX_HWIFS * MAX_DRIVES && idescsi_drives[id]; id++);
-				idescsi_setup (drive, scsi, id);
-			failed--;
-		}
-	}
-	revalidate_drives();
-	MOD_DEC_USE_COUNT;
-#endif
 	return 0;
 }
 
 /*
  *	idescsi_init will register the driver for each scsi.
  */
-int idescsi_init (void)
+static int idescsi_init(void)
 {
 	ide_drive_t *drive;
 	idescsi_scsi_t *scsi;
+	/* FIXME: The following is just plain wrong, since those are definitely *not* the
+	 * media types supported by the ATA layer */
 	byte media[] = {TYPE_DISK, TYPE_TAPE, TYPE_PROCESSOR, TYPE_WORM, TYPE_ROM, TYPE_SCANNER, TYPE_MOD, 255};
 	int i, failed, id;
 
@@ -618,7 +585,7 @@
 	MOD_INC_USE_COUNT;
 	for (i = 0; media[i] != 255; i++) {
 		failed = 0;
-		while ((drive = ide_scan_devices (media[i], idescsi_driver.name, NULL, failed++)) != NULL) {
+		while ((drive = ide_scan_devices (media[i], "ide-scsi", NULL, failed++)) != NULL) {
 
 			if ((scsi = (idescsi_scsi_t *) kmalloc (sizeof (idescsi_scsi_t), GFP_KERNEL)) == NULL) {
 				printk (KERN_ERR "ide-scsi: %s: Can't allocate a scsi structure\n", drive->name);
@@ -666,7 +633,7 @@
 	for (id = 0; id < MAX_HWIFS * MAX_DRIVES; id++) {
 		drive = idescsi_drives[id];
 		if (drive)
-			DRIVER(drive)->busy--;
+			ata_ops(drive)->busy--;
 	}
 	return 0;
 }
@@ -896,9 +863,17 @@
 	int i, failed;
 
 	scsi_unregister_host(&idescsi_template);
+
+	/* FIXME: The media types scanned here have literally nothing to do
+	 * with the media types used by the overall ATA code!
+	 *
+	 * This is basically showing us, that there is something wrong with the
+	 * ide_scan_devices function.
+	 */
+
 	for (i = 0; media[i] != 255; i++) {
 		failed = 0;
-		while ((drive = ide_scan_devices (media[i], idescsi_driver.name, &idescsi_driver, failed)) != NULL)
+		while ((drive = ide_scan_devices (media[i], "ide-scsi", &idescsi_driver, failed)) != NULL)
 			if (idescsi_cleanup (drive)) {
 				printk ("%s: exit_idescsi_module() called while still busy\n", drive->name);
 				failed++;
diff -ur linux-2.5.5/include/linux/ide.h linux/include/linux/ide.h
--- linux-2.5.5/include/linux/ide.h	Sun Feb 24 22:48:39 2002
+++ linux/include/linux/ide.h	Tue Feb 26 01:22:15 2002
@@ -1,9 +1,7 @@
 #ifndef _IDE_H
 #define _IDE_H
 /*
- *  linux/include/linux/ide.h
- *
- *  Copyright (C) 1994-1998  Linus Torvalds & authors
+ *  Copyright (C) 1994-2002  Linus Torvalds & authors
  */
 
 #include <linux/config.h>
@@ -350,12 +348,13 @@
  * Now for the data we need to maintain per-drive:  ide_drive_t
  */
 
-#define ide_scsi	0x21
-#define ide_disk	0x20
-#define ide_optical	0x7
-#define ide_cdrom	0x5
-#define ide_tape	0x1
-#define ide_floppy	0x0
+#define ATA_DISK        0x20
+#define ATA_TAPE        0x01
+#define ATA_ROM         0x05	/* CD-ROM */
+#define ATA_MOD		0x07    /* optical */
+#define ATA_FLOPPY	0x00
+#define ATA_SCSI	0x21
+#define ATA_NO_LUN      0x7f
 
 typedef union {
 	unsigned all			: 8;	/* all of the bits together */
@@ -371,7 +370,14 @@
 struct ide_settings_s;
 
 typedef struct ide_drive_s {
-	request_queue_t		 queue;	/* request queue */
+	char type; /* distingiush different devices: disk, cdrom, tape, floppy, ... */
+
+	/* NOTE: If we had proper separation between channel and host chip, we
+	 * could move this to the chanell and many sync problems would
+	 * magically just go away.
+	 */
+	request_queue_t		 queue;	/* per device request queue */
+
 	struct ide_drive_s	*next;	/* circular list of hwgroup drives */
 	unsigned long sleep;		/* sleep until this time */
 	unsigned long service_start;	/* time we started last request */
@@ -406,7 +412,6 @@
 	unsigned ata_flash	: 1;	/* 1=present, 0=default */
 	unsigned	addressing;	/* : 2; 0=28-bit, 1=48-bit, 2=64-bit */
 	byte		scsi;		/* 0=default, 1=skip current ide-subdriver for ide-scsi emulation */
-	byte		media;		/* disk, cdrom, tape, floppy, ... */
 	select_t	select;		/* basic drive/head select reg value */
 	byte		ctl;		/* "normal" value for IDE_CONTROL_REG */
 	byte		ready_stat;	/* min status value for drive ready */
@@ -432,7 +437,7 @@
 	struct hd_driveid *id;		/* drive model identification info */
 	struct hd_struct  *part;	/* drive partition table */
 	char		name[4];	/* drive name, such as "hda" */
-	struct ide_driver_s *driver;	/* (ide_driver_t *) */
+	struct ata_operations *driver;
 	void		*driver_data;	/* extra driver data */
 	devfs_handle_t	de;		/* directory for device */
 	struct proc_dir_entry *proc;	/* /proc/ide/ directory entry */
@@ -570,7 +575,6 @@
 	unsigned long	last_time;	/* time when previous rq was done */
 #endif
 	byte		straight8;	/* Alan's straight 8 check */
-	void		*hwif_data;	/* extra hwif data */
 	ide_busproc_t	*busproc;	/* driver soft-power interface */
 	byte		bus_state;	/* power state of the IDE bus */
 	struct device	device;		/* global device tree handle */
@@ -663,8 +667,6 @@
 #ifdef CONFIG_PROC_FS
 void proc_ide_create(void);
 void proc_ide_destroy(void);
-void recreate_proc_ide_device(ide_hwif_t *, ide_drive_t *);
-void destroy_proc_ide_device(ide_hwif_t *, ide_drive_t *);
 void destroy_proc_ide_drives(ide_hwif_t *);
 void create_proc_ide_interfaces(void);
 void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data);
@@ -692,32 +694,54 @@
 #endif
 
 /*
- * Subdrivers support.
+ * This structure describes the operations possible on a particular device type
+ * (CD-ROM, tape, DISK and so on).
+ *
+ * This is the main hook for device type support submodules.
  */
-typedef struct ide_driver_s {
-	const char			*name;
-	byte				media;
-	unsigned busy			: 1;
-	unsigned supports_dma		: 1;
-	unsigned supports_dsc_overlap	: 1;
+
+struct ata_operations {
+	struct module *owner;
+	unsigned busy: 1; /* FIXME: this will go soon away... */
 	int (*cleanup)(ide_drive_t *);
 	int (*standby)(ide_drive_t *);
 	int (*flushcache)(ide_drive_t *);
 	ide_startstop_t	(*do_request)(ide_drive_t *, struct request *, unsigned long);
 	int (*end_request)(ide_drive_t *drive, int uptodate);
+
 	int (*ioctl)(ide_drive_t *, struct inode *, struct file *, unsigned int, unsigned long);
 	int (*open)(struct inode *, struct file *, ide_drive_t *);
 	void (*release)(struct inode *, struct file *, ide_drive_t *);
 	int (*media_change)(ide_drive_t *);
 	void (*revalidate)(ide_drive_t *);
+
 	void (*pre_reset)(ide_drive_t *);
 	unsigned long (*capacity)(ide_drive_t *);
 	ide_startstop_t	(*special)(ide_drive_t *);
 	ide_proc_entry_t		*proc;
 	int (*driver_reinit)(ide_drive_t *);
-} ide_driver_t;
+};
+
+/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
+#define ata_get(ata) \
+	(((ata) && (ata)->owner)	\
+		? ( try_inc_mod_count((ata)->owner) ? (ata) : NULL ) \
+		: (ata))
+
+#define ata_put(ata) \
+do {	\
+	if ((ata) && (ata)->owner) \
+		__MOD_DEC_USE_COUNT((ata)->owner);	\
+} while(0)
 
-#define DRIVER(drive)		((drive)->driver)
+
+/* FIXME: Actually implement and use them as soon as possible!  to make the
+ * ide_scan_devices() go away! */
+
+extern int unregister_ata_driver(unsigned int type, struct ata_operations *driver);
+extern int register_ata_driver(unsigned int type, struct ata_operations *driver);
+
+#define ata_ops(drive)		((drive)->driver)
 
 /*
  * ide_hwifs[] is the master data structure used to keep track
@@ -994,30 +1018,24 @@
 extern int ideprobe_init (void);
 #endif
 #ifdef CONFIG_BLK_DEV_IDEDISK
-extern int idedisk_reinit (ide_drive_t *drive);
 extern int idedisk_init (void);
-#endif /* CONFIG_BLK_DEV_IDEDISK */
+#endif
 #ifdef CONFIG_BLK_DEV_IDECD
-extern int ide_cdrom_reinit (ide_drive_t *drive);
 extern int ide_cdrom_init (void);
-#endif /* CONFIG_BLK_DEV_IDECD */
+#endif
 #ifdef CONFIG_BLK_DEV_IDETAPE
-extern int idetape_reinit (ide_drive_t *drive);
 extern int idetape_init (void);
-#endif /* CONFIG_BLK_DEV_IDETAPE */
+#endif
 #ifdef CONFIG_BLK_DEV_IDEFLOPPY
-extern int idefloppy_reinit (ide_drive_t *drive);
 extern int idefloppy_init (void);
-#endif /* CONFIG_BLK_DEV_IDEFLOPPY */
+#endif
 #ifdef CONFIG_BLK_DEV_IDESCSI
-extern int idescsi_reinit (ide_drive_t *drive);
 extern int idescsi_init (void);
-#endif /* CONFIG_BLK_DEV_IDESCSI */
+#endif
 
-ide_drive_t *ide_scan_devices (byte media, const char *name, ide_driver_t *driver, int n);
-extern int ide_register_subdriver(ide_drive_t *drive, ide_driver_t *driver);
+ide_drive_t *ide_scan_devices (byte media, const char *name, struct ata_operations *driver, int n);
+extern int ide_register_subdriver(ide_drive_t *drive, struct ata_operations *driver);
 extern int ide_unregister_subdriver(ide_drive_t *drive);
-extern int ide_replace_subdriver(ide_drive_t *drive, const char *driver);
 
 #ifdef CONFIG_BLK_DEV_IDEPCI
 #define ON_BOARD		1

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23 18:20   ` Linus Torvalds
  2002-02-23 18:28     ` Larry McVoy
@ 2002-02-26 16:09     ` Hubertus Franke
  1 sibling, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-02-26 16:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, Rusty Russell, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel, lse

On Sat, Feb 23, 2002 at 10:20:04AM -0800, Linus Torvalds wrote:
> 
> 

In the hope to be included into this discussion and keep this discussion
active.

As previously reported, I wrote a user level locking or fast semaphore
package, which resolves the uncontended case in shared memory and through
atomic operations, while the contended case falls back into the kernel.
Yesterday I layed out some of my assumptions/approach of my implementation.

download available under 
	http://prdownloads.sourceforge.net/lse/ulocks-2.4.17.tar.bz2.

> On Sat, 23 Feb 2002, Ingo Molnar wrote:
> >
> > On Sat, 23 Feb 2002, Rusty Russell wrote:
> >
> > > 1) Interface is: open /dev/usem, pread, pwrite.
> >
> > i like the patch, but the interface is ugly IMO. Why not new syscalls?
> 
> I agree - I dislike magic device files a whole lot more than just abotu
> every alternative.
> 
> Also, one thing possibly worth looking into is to just put the actual
> semaphore contents into a regular file backed setup.
> 

As of Linus's request to put the content into a regular file backe setup.
I support that as well. I also support different virtual addresses in
the various processes. It should simply be shared memory wether through
shmat or mmap.

> 		Linus
> 

Currently I was fixated on semaphores/calocks/rwlocks which require at most
two queues. Ben LeHaise yesterday more or less stated that any number of queues
is desirable to implement whatever-you-like lock. I second that motion and
will modify the code accordingly, should be a quick hack.

In the following I first describe again what my implementation provides
I will follow with performance numbers, lock contention, retry rates, and fairness
under various load conditions for a synthetic benchmark.


WHAT HAS BEEN IMPLEMENTED
=========================

(a)  EXCLUSIVE LOCKS
   (a1)  SysV IPC semaphores: I need these to have a base for comparision.
   (FAIR)
   (a2)  usema: semaphore implementation. (FAIR)
   (a3)  usema_spin: same as above, however, I will spin for a period of
   time, called patience,before actually entering into the kernel on
   contention. (FAIR)
   (a4)  calock: convoy avoidance lock. This lock is not fair. On unlock
   operation, the lock is made available even if a waiting process has to
   be woken up. The woken process then must content for the lock again,
   which (a) is unfair and (b) can result into multiple reentrances into
   the kernel for waiting  (NOT FAIR).
   (a5) calock_spin: same as above, but we spin for a while.

   Some notes here: all but for (a1,a2) all locks required compare_exchange
   functionality.

(b) SHARED LOCKS
   (b1) rwlock:  multiple reader/single writer. On wakeup preference is
   given to the readers, not the writers. (FAIR)


What has been tested and how?

I wrote a micro benchmark, that allows me to
(a) test the correctness of the locks (and believe me that helped, some
obvious solutions particularly for the spinning just don't work) and
(b) measure the performance. I wrote a program, called ulockflex. It allows
me to control the type of lock used, the number of children, the mean lock
hold time and non-lock hold time, the patience.

In return it tells me a lot of things about the individual locks.
I have integrated lock contention measurement right into the base lock
routines.
They come extremely cheap, in most cases a single atomic increment or
simple increment and don't really show up at the application level at all
(sometimes even help due to the spinning affect).

The performance of the locks is measured by the cummulative throughput/sec
of the children through the following loop.
for (;;) {
      run_lock_not_held(uniform(mean_non_locktime));
      lock();
      run_lock_held(uniform(mean_non_locktime));
      unlock();
}

uniform(x) gives me a uniformly distributed random value in the interval
[ .5x .. 1.5x].
I also measure the fairness of the locks among children. A fair lock should
give everybody the same number of loops over a meaningful execution time.
small changes are due to the random distribution. This is measured as  the
coefficient of variance
which is a normalized value compared to the mean. 0.0 is fair, and the
higher the value the less the fair it becomes.

So without further due, here is the output of my regression/performance
test. I split it up with some comments and point out some irregularities.
All tests run on a 2-way PIII 550MHZ machine.

PERFORMANCE
===========


Wed Feb 20 23:02:33 EST 2002:  calibrated at 273
base arguments are <-t 10 -o 5 -m 8 -R 273 -P>

First the legend
 c #number of children
 a #number of locks
 t #second of each run
 i #number of iterations (if less than 7) we reached 95% confidence
interval. (1 warmup run)
                        otherwise the result is statistically not
significant. I don't have
                        the exact confidence number.
 L locktype (capital letter means spinning version)
            e ::= no locks
            k ::= sysv ipc sem
            f ::= usema
            F ::= usema_spinning
            c ::= calock
            C ::= calock_spinning
            s<x> ::= with <x> the percentage of read sharing lock requests

 p patience of spinning if any
 WC lock contention on write lock
 RC read contention on read lock (for shared lock only)
 R  percentage of failed lock attempts resolved by spinning OR
    average times in calock lock attempt that failed needed to go into the
kernel (1 min).
 COV coefficient of variance between children (fairness)
 ThPut Cummulative throughput achieved per second for all children

 c  a t   i L      p   r  x     WC      RC      R     COV       ThPut

 1  1 10  5 e      0   0   0    0.00   0.00   0.00 0.000000  1398321  (A1)
 1  1 10  5 k      0   0   0    0.00   0.00   0.00 0.000000   309423
 1  1 10  5 f      0   0   0    0.00   0.00   0.00 0.000000  1157684
 1  1 10  5 F      0   0   0    0.00   0.00   0.00 0.000000  1136268
 1  1 10  5 F      3   0   0    0.00   0.00   0.00 0.000000  1136147
 1  1 10  5 c      0   0   0    0.00   0.00   0.00 0.000000  1115431
 1  1 10  5 C      0   0   0    0.00   0.00   0.00 0.000000  1150488
 1  1 10  5 C      3   0   0    0.00   0.00   0.00 0.000000  1150368

(A1) this is the base number without any locking. Every loop takes about 400 cycles
    We can see that he sysv kernel implementation has high overhead just getting in
    and out of the kernel and inspecting the ipc object namely ~1400 cycles of overhead.
    The remainder are all very much the same, since there is no contention, neither
    spinning nor any kernel calls are issued, simply an atomic instruction.

 1  1 10  5 e      0   0  10    0.00   0.00   0.00 0.000000    93978
 1  1 10  5 k      0   0  10    0.00   0.00   0.00 0.000000    76133
 1  1 10  5 f      0   0  10    0.00   0.00   0.00 0.000000    93136
 1  1 10  5 F      0   0  10    0.00   0.00   0.00 0.000000    92945
 1  1 10  5 F      3   0  10    0.00   0.00   0.00 0.000000    92968
 1  1 10  5 c      0   0  10    0.00   0.00   0.00 0.000000    63579 (*)
 1  1 10  5 C      0   0  10    0.00   0.00   0.00 0.000000    92993
 1  1 10  5 C      3   0  10    0.00   0.00   0.00 0.000000    93018

 1  1 10  5 e      0   7   3    0.00   0.00   0.00 0.000000    93850
 1  1 10  5 k      0   7   3    0.00   0.00   0.00 0.000000    76055
 1  1 10  5 f      0   7   3    0.00   0.00   0.00 0.000000    70352 (*)
 1  1 10  5 F      0   7   3    0.00   0.00   0.00 0.000000    92848
 1  1 10  5 F      3   7   3    0.00   0.00   0.00 0.000000    92858
 1  1 10  5 c      0   7   3    0.00   0.00   0.00 0.000000    81495 (*)
 1  1 10  5 C      0   7   3    0.00   0.00   0.00 0.000000    92882
 1  1 10  5 C      3   7   3    0.00   0.00   0.00 0.000000    92900

 1  1 10  5 e      0   9   1    0.00   0.00   0.00 0.000000    93882
 1  1 10  5 k      0   9   1    0.00   0.00   0.00 0.000000    76137
 1  1 10  5 f      0   9   1    0.00   0.00   0.00 0.000000    65751 (*)
 1  1 10  5 F      0   9   1    0.00   0.00   0.00 0.000000    92844
 1  1 10  5 F      3   9   1    0.00   0.00   0.00 0.000000    92846
 1  1 10  5 c      0   9   1    0.00   0.00   0.00 0.000000    88650 (*)
 1  1 10  5 C      0   9   1    0.00   0.00   0.00 0.000000    92875
 1  1 10  5 C      3   9   1    0.00   0.00   0.00 0.000000    92877

Same case, no contention, but we increase the loop time to 10usecs,
splitting it up into (0,10) (7,3) and (10,0) for (nonlock-time,lock time).
This will serve as a base number again. Surprisingly (*) have
subpar performance, this indicates some scheduler irregularities, since it
doesn't really do anything different than in (A). This observation repeats
itself several times and I'll try to hunt it down. I'll try the newest MQ
scheduler. Nevertheless, the overhead of sysv-ipc is apparent.

Now let's turn to the real cases, 2 children with lock contention.

 2  1 10  5 e      0   0   0    0.00   0.00   0.00 0.003351  2696755
 2  1 10  5 k      0   0   0   99.44   0.00   0.00 0.000786   159238
 2  1 10  6 f      0   0   0   42.33   0.00   0.00 0.034375   391065
 2  1 10  5 F      0   0   0   71.06   0.00   4.41 0.004886   242038
 2  1 10  5 F      3   0   0   46.02   0.00  26.55 0.043065   254974
 2  1 10  5 c      0   0   0    7.58   0.00   0.18 0.007522   695978
 2  1 10  5 C      0   0   0    3.84   0.00   0.12 0.004054   713464
 2  1 10  5 C      3   0   0    6.22   0.00   0.00 0.007328   706955

In this tied race we can clearly see the superior performance that can
be achieved with f and c locks. Note that most of the time is spent in
the loop preparation of determining the random values etc. Its also astounding
how little contention there exists in the calock.

Now looking at the 2 children 2 locks, there ofcourse is no contention
and these numbers should be double the 1 child/1 lock case and they are.
The strange cases (*) are pointed out again.

 2  2 10  5 e      0   0  10    0.00   0.00   0.00 0.002096   127205
 2  2 10  5 k      0   0  10    0.00   0.00   0.00 0.000616   145450
 2  2 10  5 f      0   0  10    0.00   0.00   0.00 0.002025   185794
 2  2 10  5 F      0   0  10    0.00   0.00   0.00 0.000511   185242
 2  2 10  5 F      3   0  10    0.00   0.00   0.00 0.000798   185239
 2  2 10  5 c      0   0  10    0.00   0.00   0.00 0.001097   126782 (*)
 2  2 10  5 C      0   0  10    0.00   0.00   0.00 0.000506   185478
 2  2 10  5 C      3   0  10    0.00   0.00   0.00 0.002143   185473

 2  2 10  5 e      0   7   3    0.00   0.00   0.00 0.002369   127126
 2  2 10  5 k      0   7   3    0.00   0.00   0.00 0.000970   143495
 2  2 10  5 f      0   7   3    0.00   0.00   0.00 0.000154   140287
 2  2 10  5 F      0   7   3    0.00   0.00   0.00 0.000251   185045
 2  2 10  5 F      3   7   3    0.00   0.00   0.00 0.000295   185046
 2  2 10  5 c      0   7   3    0.00   0.00   0.00 0.001159   162488 (*)
 2  2 10  5 C      0   7   3    0.00   0.00   0.00 0.001096   185233
 2  2 10  5 C      3   7   3    0.00   0.00   0.00 0.001196   185249

 2  2 10  5 e      0   9   1    0.00   0.00   0.00 0.000680   127125
 2  2 10  5 k      0   9   1    0.00   0.00   0.00 0.001323   144927
 2  2 10  5 f      0   9   1    0.00   0.00   0.00 0.000906   131147 (*)
 2  2 10  5 F      0   9   1    0.00   0.00   0.00 0.002069   185095
 2  2 10  5 F      3   9   1    0.00   0.00   0.00 0.000214   185082
 2  2 10  5 c      0   9   1    0.00   0.00   0.00 0.000668   176828 (*)
 2  2 10  5 C      0   9   1    0.00   0.00   0.00 0.003634   185237
 2  2 10  5 C      3   9   1    0.00   0.00   0.00 0.002531   185207

 2  1 10  5 k      0   0  10   99.92   0.00   0.00 0.039162    59885
 2  1 10  5 f      0   0  10   99.91   0.00   0.00 0.031926    59141
 2  1 10  5 F      0   0  10   99.90   0.00   0.00 0.058318    58287
 2  1 10  5 F      3   0  10   99.63   0.00   0.01 0.000769    49363 (C1)
 2  1 10  5 c      0   0  10    0.07   0.00 1365.3 0.010231    48821 (C2)
 2  1 10  5 C      0   0  10    1.89   0.00  52.95 0.274067    64882 (C3)
 2  1 10  5 C      3   0  10    2.43   0.00  41.12 0.227824    64425 (C4)

Above is a really interesting case, where the lock is 100% contended.
In that case spinning (C1) doesn't pay. Interesting are the cases
C2-C4, take a look at the R column, which tells me that on average a
contended case (RW) needed to retry 1365, 52, 41 times respectively.
Though the contention was minimal, this is high and results in unfair
locking, which reflects itself in an extremely high COV value (22%).


 2  1 10  5 k      0   7   3   42.65   0.00   0.00 0.000456    92178
 2  1 10  5 f      0   7   3   20.55   0.00   0.00 0.001683   110963
 2  1 10  5 F      0   7   3   33.35   0.00   8.62 0.002107   121134
 2  1 10  5 F      3   7   3   21.84   0.00  49.51 0.002057   161271
 2  1 10  5 c      0   7   3   28.93   0.00   2.24 0.001609   132291
 2  1 10  5 C      0   7   3   22.22   0.00   1.72 0.001753   150395
 2  1 10  5 C      3   7   3   20.94   0.00   0.06 0.003021   147583

 2  1 10  5 k      0   9   1   18.30   0.00   0.00 0.000911   114569
 2  1 10  5 f      0   9   1    6.62   0.00   0.00 0.003179   121252
 2  1 10  5 F      0   9   1   10.48   0.00  23.20 0.002029   163215
 2  1 10  5 F      3   9   1    9.49   0.00  49.94 0.002685   172780
 2  1 10  5 c      0   9   1   12.82   0.00   1.01 0.002806   150094
 2  1 10  5 C      0   9   1    9.79   0.00   0.77 0.003529   157167
 2  1 10  5 C      3   9   1    9.19   0.00   0.03 0.003280   154746

 3  1 10  5 k      0   0  10   99.99   0.00   0.00 0.005466    59116
 3  1 10  5 f      0   0  10   99.99   0.00   0.00 0.071828    48826
 3  1 10  5 F      0   0  10   99.98   0.00   0.00 0.061724    49281
 3  1 10  5 F      3   0  10   99.96   0.00   0.00 0.030956    49393
 3  1 10  5 c      0   0  10    4.04   0.00  24.75 0.111655    47918
 3  1 10  5 C      0   0  10   14.32   0.00   6.98 0.050590    62455
 3  1 10  5 C      3   0  10   16.63   0.00   6.01 0.057665    63155

 3  1 10  5 k      0   7   3   97.59   0.00   0.00 0.004165    78487
 3  1 10  5 f      0   7   3   85.77   0.00   0.00 0.022811    98692
 3  1 10  5 F      0   7   3   96.15   0.00   0.20 0.013047   103924
 3  1 10  7 F      3   7   3   43.21   0.00  26.81 0.018630   124829
 3  1 10  5 c      0   7   3   29.11   0.00   3.43 0.002640   126916
 3  1 10  5 C      0   7   3   22.50   0.00   4.44 0.019857   138407
 3  1 10  7 C      3   7   3   20.95   0.00   4.76 0.014293   129526

 3  1 10  5 k      0   9   1   88.33   0.00   0.00 0.022545   110836
 3  1 10  5 f      0   9   1   73.99   0.00   0.00 0.007920   107787  (*)
 3  1 10  5 F      0   9   1   76.47   0.00   1.08 0.010588   135066
 3  1 10  5 F      3   9   1   11.39   0.00  44.85 0.011170   170034
 3  1 10  7 c      0   9   1   12.82   0.00   7.23 0.007448   134004
 3  1 10  7 C      0   9   1    9.78   0.00   6.90 0.014131   144045
 3  1 10  7 C      3   9   1    9.14   0.00   7.54 0.020201   138368

 4  1 10  5 k      0   0  10  100.00   0.00   0.00 0.065016    57854
 4  1 10  5 f      0   0  10  100.00   0.00   0.00 0.059136    49980
 4  1 10  5 F      0   0  10  100.00   0.00   0.00 0.084718    49467
 4  1 10  5 F      3   0  10  100.00   0.00   0.00 0.026923    48819
 4  1 10  5 c      0   0  10    3.12   0.00  32.05 0.267046    47519
 4  1 10  5 C      0   0  10   14.91   0.00   6.71 0.022086    61074
 4  1 10  5 C      3   0  10   18.18   0.00   5.50 0.039526    60815

 4  1 10  6 k      0   7   3   99.91   0.00   0.00 0.003822    85681
 4  1 10  5 f      0   7   3   94.27   0.00   0.00 0.010577    98893
 4  1 10  5 F      0   7   3   97.29   0.00   0.13 0.032502   107599
 4  1 10  5 F      3   7   3   69.03   0.00  11.04 0.032696   119276
 4  1 10  5 c      0   7   3   29.09   0.00   3.43 0.012937   126950
 4  1 10  5 C      0   7   3   22.42   0.00   4.43 0.011381   138405
 4  1 10  5 C      3   7   3   20.95   0.00   4.75 0.012371   129167

 4  1 10  5 k      0   9   1   97.52   0.00   0.00 0.008442   114210
 4  1 10  5 f      0   9   1   94.32   0.00   0.00 0.015333   104639
 4  1 10  5 F      0   9   1   93.78   0.00   0.23 0.007120   131777
 4  1 10  5 F      3   9   1   22.86   0.00  26.04 0.010409   161190
 4  1 10  5 c      0   9   1   12.84   0.00   7.77 0.017105   132406
 4  1 10  6 C      0   9   1    9.75   0.00  10.22 0.010480   137857
 4  1 10  7 C      3   9   1    9.23   0.00  10.46 0.029786   135441

To summarize the small process(children) case. Not counting the (*) cases
we clearly see that the user level locks substantially outperform (+20%)
the kernel implementation if lock contention does not hoover above
95%, a case we don't optimize for anyway.
Surprisingly, they are even reasonable fair in most cases, (c) in various
cases had a high COV value, but that's what the lock is for.
Spinning helps (sometimes).


Now let's turn our attention to the 100 process case.

100  1 10  5 k      0   0  10  100.00   0.00   0.00 0.756951    16866
100  1 10  5 f      0   0  10  100.00   0.00   0.00 0.536551    49233
100  1 10  5 F      0   0  10  100.00   0.00   0.00 0.516478    49096
100  1 10  5 F      3   0  10  100.00   0.00   0.00 0.592092    49216
100  1 10  5 c      0   0  10    5.19   0.00  19.27 0.324299    46064
100  1 10  7 C      0   0  10   15.17   0.00   6.59 0.071464    53429
100  1 10  7 C      3   0  10   16.96   0.00   5.90 0.075513    52285

not a lot of difference in the fully contended case. Striking again how
often the lock acquisition in the calock succeeds even for this fully
contended case. The fairness is totally down the drain even in the
k and f lock, which indicates to me that the run time might not have been long
enough.

100  1 10  7 k      0   7   3  100.00   0.00   0.00 0.028202    12168
100  1 10  5 f      0   7   3  100.00   0.00   0.00 0.082430    24693
100  1 10  5 F      0   7   3  100.00   0.00   0.00 0.083701    26918
100  1 10  5 F      3   7   3  100.00   0.00   0.00 0.084006    26489
100  1 10  5 c      0   7   3   29.10   0.00   3.43 0.253149   127234
100  1 10  5 C      0   7   3   22.23   0.00   4.49 0.198701   138257
100  1 10  5 C      3   7   3   20.81   0.00   4.79 0.235070   129320


100  1 10  7 k      0   9   1  100.00   0.00   0.00 0.014891    12297
100  1 10  5 f      0   9   1  100.00   0.00   0.00 0.079500    67188 (E1)
100  1 10  5 F      0   9   1  100.00   0.00   0.00 0.077384    25207 (E2)
100  1 10  5 F      3   9   1  100.00   0.00   0.00 0.077485    25148
100  1 10  5 c      0   9   1   12.73   0.00   7.81 0.194209   133376
100  1 10  5 C      0   9   1    9.71   0.00  10.28 0.153369   137555
100  1 10  5 C      3   9   1    8.85   0.00  11.24 0.163615   134339

The last two cases are SHOWTIME. Clearly we get superior behavior user
level
locking. (E1)is very high as compared to (E2). I don't have any explanation
for this.

Userlocks on VIAGRA: 1000 processes contending for a single lock.

1000  1 10  5 k      0   9   1  100.00   0.00   0.00 0.026477     1358
1000  1 10  7 f      0   9   1  100.00   0.00   0.00 0.544364     2640
1000  1 10  7 F      0   9   1   99.84   0.00   0.00 0.588966     2507
1000  1 10  7 F      3   9   1   99.45   0.00   0.03 0.362092     2245
1000  1 10  7 c      0   9   1    6.87   0.00  14.38 0.596546   104738
1000  1 10  6 C      0   9   1    9.22   0.00  10.78 0.696058   136075
1000  1 10  5 C      3   9   1    7.74   0.00  12.71 0.749075   134492

Note we did not get to statistically significant runs for f,F,c.
The results speak for themselves!!!!

SHARED LOCKS
------------

Here I check how the shared locks work for (0,10) (10,0) case 2 processes.
I increase the lock sharing from 0 to 100 %. As can be seen the performance
increases with it. Interesting again is that the (0,10) case is faster
than the the (10,0) case for 100% sharing ?

 2  1 10  5 s  0   0   0  10   99.90   0.00   0.00 0.020436    58331
 2  1 10  5 s 20   0   0  10   99.00  79.67   0.00 0.003076    55887
 2  1 10  5 s 40   0   0  10   97.08  59.75   0.00 0.001201    59823
 2  1 10  5 s 60   0   0  10   94.34  39.78   0.00 0.000268    71217
 2  1 10  5 s 80   0   0  10   91.42  19.91   0.00 0.001100    96733
 2  1 10  5 s100   0   0  10    0.00   0.00   0.00 0.001839   172432

 2  1 10  5 s  0   0  10   0    0.49   0.00   0.00 0.001672   121785
 2  1 10  5 s 20   0  10   0    0.51   0.58   0.00 0.000147   121616
 2  1 10  5 s 40   0  10   0    0.67   0.51   0.00 0.001209   121057
 2  1 10  5 s 60   0  10   0    0.88   0.41   0.00 0.001455   120407
 2  1 10  5 s 80   0  10   0    0.96   0.22   0.00 0.002061   120161
 2  1 10  5 s100   0  10   0    0.00   0.00   0.00 0.000782   120306



Comments particular to the questions previously posted and posted here again.

At this point, could be go through and delineate some of the requirements
first.
E.g.  (a) filedescriptors vs. vaddr
      (b) explicit vs. implicite allocation
      (c) system call interface vs. device driver
      (d) state management in user space only or in kernel as well
                i.e. how many are waiting, how many are woken up.
      (e) semaphores only or multiple queues
      (f) protection through an exported handle with some MAGIC or
          through virtual memory access rights
      (g) persistence on mmap or not
 
Here is my point of view:
(a) vaddr
(b) implicite
(c) syscall
(d) user only
(e) multiple queues
(f) virtual memory access rights.
(g) persistent  (if you don't want persistence you remove the underlying object)



-- Hubertus Franke

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] 2.5.5 IDE clean 14
  2002-02-26 12:15                       ` [PATCH] 2.5.5 IDE clean 14 Martin Dalecki
@ 2002-02-26 21:49                         ` Keith Owens
  2002-02-27 10:08                           ` Martin Dalecki
  0 siblings, 1 reply; 47+ messages in thread
From: Keith Owens @ 2002-02-26 21:49 UTC (permalink / raw)
  To: Martin Dalecki; +Cc: linux-kernel

Martin, please fix your mailer on 195.63.194.11.  It retries the send
every minute which is far too fast.  It is absolutely broken because it
ignores 5xx response codes.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-23  3:47 [PATCH] Lightweight userspace semaphores Rusty Russell
  2002-02-23 15:03 ` Ingo Molnar
  2002-02-25 15:00 ` Hubertus Franke
@ 2002-02-27  0:24 ` Rusty Russell
  2002-02-27 15:53   ` Hubertus Franke
                     ` (3 more replies)
  2 siblings, 4 replies; 47+ messages in thread
From: Rusty Russell @ 2002-02-27  0:24 UTC (permalink / raw)
  To: Hubertus Franke; +Cc: torvalds, matthew, bcrl, david, wli, linux-kernel

On Mon, 25 Feb 2002 10:00:25 -0500
Hubertus Franke <frankeh@watson.ibm.com> wrote:

> Rusty, since I supplied one of those packages available under lse.sourceforge.net
> let me make some comments.
> 
> (a) why do you need to pin. I simply use the user level address (vaddr)  
>     and hash with the <mm,vaddr> to obtain the internal object.
>     This also gives me full protection through the general vmm mechanisms.

I pin while sleeping for convenience, so I can get a kernel address.  It's
only one page (maybe 2).  I could look up the address every time, but then I
need to swap the page back in anyway to look at it.

> (b) I like the idea of mmap or shmat with special flags better than going 
>     through a device driver.

Me too, but I'd rather have people saying "make it a syscall" than "eww...
not another special purpose syscall!" 8)

> (c) creation can be done on demand, that's what I do. If you never have 
>     to synchronize through the kernel than you don't create the objects. 
>     There should be an option to force explicite creation if desired.

Absolutely, except there is no real "creation" event.  Adding a "here be
semaphores" syscall is sufficient and useful (and also makes it easy to
detect that there is no FUS support in the kernel).

> (d) The kernel should simply provide waiting and wakeup functionality and 
>     the bean counting should be done in user space. There is no need to 
>     pin or crossmap.

See above.

> (e) I really like to see multiple reader/single writer locks as well. I 
>     implemented these 

Hmmm... my current implementatino only allows down one and up one
operations, but off the top of my head I don't see a no reason this couldn't
be generalized.   Then:
1) Initialize at INT_MAX
2) down_read = down 1
3) down_write = down INT_MAX

Sufficient?

> (f) I also implemented convoy avoidance locks, spinning versions of 
>     user semaphores.  All over the same simple interface.
>     ulocks_wait(read_or_write) and ulocks_signal(read_or_write, num_to_wake_up).
>     Ofcourse to cut down on the interface a single system call is enough.

Interesting.  Something like this might be needed for backwards
compatibility anyway (spin & yield, at least).

> (g) I have an extensive test program and regression test <ulockflex>
>     that exercises the hell out of the userlevel approach. 

Excellent.   I shall grab it and take a look!

Thanks for the feedback,
Rusty.
-- 
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] 2.5.5 IDE clean 14
  2002-02-26 21:49                         ` Keith Owens
@ 2002-02-27 10:08                           ` Martin Dalecki
  0 siblings, 0 replies; 47+ messages in thread
From: Martin Dalecki @ 2002-02-27 10:08 UTC (permalink / raw)
  To: Keith Owens; +Cc: linux-kernel

Keith Owens wrote:
> Martin, please fix your mailer on 195.63.194.11.  It retries the send
> every minute which is far too fast.  It is absolutely broken because it
> ignores 5xx response codes.

This is *not my* responsibility. The resposible admin in question
already knows about it. OK?


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-27  0:24 ` Rusty Russell
@ 2002-02-27 15:53   ` Hubertus Franke
  2002-03-01  0:24     ` Richard Henderson
  2002-02-27 16:29   ` Hubertus Franke
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 47+ messages in thread
From: Hubertus Franke @ 2002-02-27 15:53 UTC (permalink / raw)
  To: Rusty Russell; +Cc: torvalds, matthew, bcrl, david, wli, linux-kernel

On Wed, Feb 27, 2002 at 11:24:17AM +1100, Rusty Russell wrote:
> On Mon, 25 Feb 2002 10:00:25 -0500
> Hubertus Franke <frankeh@watson.ibm.com> wrote:
> 
> > Rusty, since I supplied one of those packages available under lse.sourceforge.net
> > let me make some comments.
> > 
> > (a) why do you need to pin. I simply use the user level address (vaddr)  
> >     and hash with the <mm,vaddr> to obtain the internal object.
> >     This also gives me full protection through the general vmm mechanisms.
> 
> I pin while sleeping for convenience, so I can get a kernel address.  It's
> only one page (maybe 2).  I could look up the address every time, but then I
> need to swap the page back in anyway to look at it.

As stated above, I allocate a kernel object <kulock_t> on demand and
hash it. This way I don't have to pin any user address. What does everybody
think about the merit of this approach versus the pinning approach?
Most importantly I think the merit of both approaches stems from the fact
that the access is purely through the userlevel address rather than
through a handle. In your case, can the lock be allocated at different
virtual addresses in the various address spaces.
How do you cleanup ? Do you search through the hashtable ? In other words
what happens if you SIGTERM a process ?
> 
> > (b) I like the idea of mmap or shmat with special flags better than going 
> >     through a device driver.
> 
> Me too, but I'd rather have people saying "make it a syscall" than "eww...
> not another special purpose syscall!" 8)

One more syscall its all that needed like semop(). I multiplex all operations
through this interface. Agree, its more scalable then a device driver as
somebody pointed out that the device has locks associated with it.
> 
> > (c) creation can be done on demand, that's what I do. If you never have 
> >     to synchronize through the kernel than you don't create the objects. 
> >     There should be an option to force explicite creation if desired.
> 
> Absolutely, except there is no real "creation" event.  Adding a "here be
> semaphores" syscall is sufficient and useful (and also makes it easy to
> detect that there is no FUS support in the kernel).

That's what I have done. All internal objects are created on the fly.
Somebody pointed out to me that RT programs might want to allocate the 
internal object apriori, in that case the forcing the creation would be
required.

> 
> > (d) The kernel should simply provide waiting and wakeup functionality and 
> >     the bean counting should be done in user space. There is no need to 
> >     pin or crossmap.
> 
> See above.

Yip, no state sharing required between kernel and user. I analyzed this 
vary carefully, particularly for spinning locks, I can show you sequences
of events that get you in real trouble because the atomic update in user
space and the waiting are non atomic. As far as I am concerned separation
is imperative.

> 
> > (e) I really like to see multiple reader/single writer locks as well. I 
> >     implemented these 
> 
> Hmmm... my current implementatino only allows down one and up one
> operations, but off the top of my head I don't see a no reason this couldn't
> be generalized.   Then:
> 1) Initialize at INT_MAX
> 2) down_read = down 1
> 3) down_write = down INT_MAX
> 
> Sufficient?
> 

No, you effectively need two queues or you have to basically replicate
the low level semaphore routines to walk the list and identify the
sequence (R*W)* and only wake those up that are the same. There are many
issues in the wakeup. Do I want to wakeup all readers waiting regardless
of intermittent writers....

> > (f) I also implemented convoy avoidance locks, spinning versions of 
> >     user semaphores.  All over the same simple interface.
> >     ulocks_wait(read_or_write) and ulocks_signal(read_or_write, num_to_wake_up).
> >     Ofcourse to cut down on the interface a single system call is enough.
> 
> Interesting.  Something like this might be needed for backwards
> compatibility anyway (spin & yield, at least).

This is a must. I talked to database folks, they can see huge utilization
out of these locks. I hence echo again Ben LaHaises opinion, the kernel
mechanism must be flexible enough to support more complex locking stuff.
I have added in my next update on lse download support for queues.

> 
> > (g) I have an extensive test program and regression test <ulockflex>
> >     that exercises the hell out of the userlevel approach. 
> 
> Excellent.   I shall grab it and take a look!

Indeed this is/was a lifesaver, it points out so many interesting things
with respect to bugs and performance when the rubber hits the road.

> 
> Thanks for the feedback,
> Rusty.

Let's keep the sync going. At this point we need to figure out what
is the right approach for the kernel. Is the explicite kernel object 
approach I have chosen better/worse/equal to the pinning approach ?
Inputs from everyone please. We need to score against
	codepath length, 
	cleanup, 
	memory requirements

> -- 
>   Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

Cheers.
-- Hubertus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-27  0:24 ` Rusty Russell
  2002-02-27 15:53   ` Hubertus Franke
@ 2002-02-27 16:29   ` Hubertus Franke
  2002-03-02 14:50   ` Hubertus Franke
  2002-03-03 13:30   ` Rusty Russell
  3 siblings, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-02-27 16:29 UTC (permalink / raw)
  To: Rusty Russell; +Cc: torvalds, matthew, bcrl, david, wli, linux-kernel

On Wed, Feb 27, 2002 at 11:24:17AM +1100, Rusty Russell wrote:
> On Mon, 25 Feb 2002 10:00:25 -0500
> Hubertus Franke <frankeh@watson.ibm.com> wrote:
> 
> > Rusty, since I supplied one of those packages available under lse.sourceforge.net
> > let me make some comments.
> > 
> > (a) why do you need to pin. I simply use the user level address (vaddr)  
> >     and hash with the <mm,vaddr> to obtain the internal object.
> >     This also gives me full protection through the general vmm mechanisms.
> 
> I pin while sleeping for convenience, so I can get a kernel address.  It's
> only one page (maybe 2).  I could look up the address every time, but then I
> need to swap the page back in anyway to look at it.

Lookup is cheap. I integrated a <hit meter> into my hashing mechanism
that reports the hits/misses and the average length of traversal down
a hash chain, it's insignificant for the cases I tried, but again
no big progam has be tried against any of these approaches.

I (think) I now also see the merit of your approach, in that you really don't
need to allocate a kernel object. You actually allocate the object
right into the shared user page and pin it down. Your argument is that
you only need to pin while somebody is sleeping on the lock.
Is that correct? Very tricky. 
Let me also point out some shortcomings of this. If indeed there is a 
shared page between user and kernel then how does the system react to 
buggy userlevel code, e.g wild write that corrupt the wait queue.

In my explicite kernel object approach, I won't have this problem.

I hope I am not missing something here.

-- Hubertus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-27 15:53   ` Hubertus Franke
@ 2002-03-01  0:24     ` Richard Henderson
  2002-03-01  2:00       ` Hubertus Franke
  0 siblings, 1 reply; 47+ messages in thread
From: Richard Henderson @ 2002-03-01  0:24 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Rusty Russell, torvalds, matthew, bcrl, david, wli, linux-kernel

On Wed, Feb 27, 2002 at 10:53:11AM -0500, Hubertus Franke wrote:
> As stated above, I allocate a kernel object <kulock_t> on demand and
> hash it. This way I don't have to pin any user address. What does everybody
> think about the merit of this approach versus the pinning approach?
[...]
> In your case, can the lock be allocated at different
> virtual addresses in the various address spaces.

I think this is a relatively important feature.  It may not be
possible to use the same virtual address in different processes.


r~

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-03-01  0:24     ` Richard Henderson
@ 2002-03-01  2:00       ` Hubertus Franke
  0 siblings, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-03-01  2:00 UTC (permalink / raw)
  To: Rusty Russell, torvalds, matthew, bcrl, david, wli, linux-kernel,
	lse-tech

On Thu, Feb 28, 2002 at 04:24:22PM -0800, Richard Henderson wrote:
> On Wed, Feb 27, 2002 at 10:53:11AM -0500, Hubertus Franke wrote:
> > As stated above, I allocate a kernel object <kulock_t> on demand and
> > hash it. This way I don't have to pin any user address. What does everybody
> > think about the merit of this approach versus the pinning approach?
> [...]
> > In your case, can the lock be allocated at different
> > virtual addresses in the various address spaces.
> 
> I think this is a relatively important feature.  It may not be
> possible to use the same virtual address in different processes.
> 
> 
> r~

I think so too. However let me point that Linus's initial recommendation
of a handle, comprised of a kernel pointer and a signature also has
that property.
Just pointing out the merits of the various approaches.

-- Hubertus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 15:00 ` Hubertus Franke
@ 2002-03-01  4:44   ` Eric W. Biederman
  0 siblings, 0 replies; 47+ messages in thread
From: Eric W. Biederman @ 2002-03-01  4:44 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: Rusty Russell, torvalds, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel

Hubertus Franke <frankeh@watson.ibm.com> writes:

> Rusty, since I supplied one of those packages available under
> lse.sourceforge.net
> 
> let me make some comments.
> 
> (a) why do you need to pin. I simply use the user level address (vaddr)  
>     and hash with the <mm,vaddr> to obtain the internal object.
>     This also gives me full protection through the general vmm mechanisms.

Do: 
if (page->mapping) 
    hash(page->mapping, page->offset, vaddr & ~PAGE_MASK)
else
    hash(mm, vaddr)
This handles the semaphores in shared memory case.  So the semaphores
will work across processes as well.

> (c) creation can be done on demand, that's what I do. If you never have 
>     to synchronize through the kernel than you don't create the objects. 
>     There should be an option to force explicite creation if desired.

Which makes the uncontended (common) case fast.

> (d) The kernel should simply provide waiting and wakeup functionality and 
>     the bean counting should be done in user space. There is no need to 
>     pin or crossmap.

Agreed.

Eric

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25  1:23         ` Linus Torvalds
  2002-02-25 13:14           ` Alan Cox
@ 2002-03-01  4:56           ` Eric W. Biederman
  1 sibling, 0 replies; 47+ messages in thread
From: Eric W. Biederman @ 2002-03-01  4:56 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel

Linus Torvalds <torvalds@transmeta.com> writes:

> On Mon, 25 Feb 2002, Rusty Russell wrote:
> >
> > Bugger.  How about:
> >
> > 	sys_sem_area(void *pagestart, size_t len)
> > 	sys_unsem_area(void *pagestart, size_t len)
> >
> > Is that sufficient?  Is sys_unsem_area required at all?
> 
> The above is sufficient, but I would personally actually prefer an
> interface more like

Hmm.  My preference is for something like
mprotect(start, len, PROT_SEM | PROT_READ | PROT_WRITE);

And then 
#ifdef PROT_SEM && PROT_SEM
mprotect ....
#else
/* This architecture needs not special support skip the mprotect...
#endif

> 	fd = sem_initialize();
> 	mmap(fd, ...)
> 	..
> 	munmap(..)
> 
> which gives you a handle for the semaphore.

Ouch.  

The common case for a decent lock is the uncontended case.  In which
case you only need kernel support on demand.  What you suggest would
create kernel data structures for all of the uncontended locks.  That
sounds heavy.  Especially as the memory on most architectures is already
safe to use for locks.

So if nothing else can we separate the two cases of having user space
memory safe for user space spin locks.   And how to setup a wait queue
of user space waiters on when we need to wait?

Eric


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-27  0:24 ` Rusty Russell
  2002-02-27 15:53   ` Hubertus Franke
  2002-02-27 16:29   ` Hubertus Franke
@ 2002-03-02 14:50   ` Hubertus Franke
  2002-03-03 13:30   ` Rusty Russell
  3 siblings, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-03-02 14:50 UTC (permalink / raw)
  To: Rusty Russell; +Cc: torvalds, matthew, bcrl, david, wli, linux-kernel, lse-tech

On Wed, Feb 27, 2002 at 11:24:17AM +1100, Rusty Russell wrote:
> OK.  New implementation.  Compiles (PPC), but due to personal reasons,
> UNTESTED.  Thanks especially to Hubertus for his prior work and
> feedback.
>
> 1) Turned into syscalls.
> 2) Added arch-specific sys_futex_region() to enable mutexes on region,
>    and give a chance to detect lack of support via -ENOSYS.
> 3) Just a single atomic_t variable for mutex (thanks Paul M).
> - This means -ENOSYS on SMP 386s, as we need cmpxchg.


> - We can no longer do generic semaphores: mutexes only.
> - Requires arch __update_count atomic op.

    I don't see that, you can use atomic_inc on 386s ..

> 4) Current valid args to sys_futex are -1 and +1: we can implement
>    other lock types by using other values later (eg. rw locks).
>

  (seem below) 

> Size of futex.c dropped from 244 to 161 lines, so it's clearly 40%
> better!
>
> Get your complaints in now...

  (plenty below :-)

> Rusty.
> --
>   Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
>


Rusty, as indicated I think there is quite some merit to this solution
and it certainly cuts down on some of the complexity that is inherent in
what I submitted.
First, what I like is the fact that by pinning the page down you avoid the
extra indirection that I went through with the kulock_ref_t objects.

There are some limitations to your approach however.

(a) the hashing is to simple. As a consequence there are two limitations
     -  the total number of locks that can be waited for is 
        (1<< USEM_HASHBITS).
        that is way not enough (as you pointed out).
     - this has to be bumped up to roughly the number of tasks in the
       system.
       since each wait_queue_head_t element has 3 words, MAX_PID = 32K
       this would end up with about 400K of memory for this. This would be
       the worst case scenario, namely that every task in the system is 
       waiting on a different lock (hey shit happens).

more seriously though is the following.
(b) there is no provision to what to do when there is a hash collision?
      this seems too rudimentary that I assume I am missing something, 
      in case not:
      if <page1,offset1> and <page2,offset2> both result in the same
      hash value then they will be both waiting on the same wait queue, 
      which is incorrect.

     A solution to this would be to introduce hash chaining as I have done
     through my access vector cache and allocate the waitqueue objects from 
     the cachep object and link them through it. An alternative solution is
     to provide a sufficiently large queue as described in (a) and on collision
     go for a secondary hash function and if that fails search for an empty 
     bucket. In the current situation, I think the second solution might be
     more appropriate.

(c) your implementation could be easily expanded to include Ben's
    suggestion of multiple queues (which I already implement in my 
    submission), by including the queue index into the hashing mechanism.
    Since you are rewriting the wait routine, the problem can be circumvented
    if we can settle for mutex, semaphores, and rwlocks only. Ben what's your
    take. In rwlocks you signal continuation of the next class of holders.
    Then you wake up one writer or the next set of readers, or based on 
    some other parameter, if the first candidate you find is a reader you
    walk the entire list to wake up all readers. Lot's of stuff that can be
    done here. What you need then is to expand your wait structure in this
    case to indicate why you are sleeping, waiting for read or waiting for
    write.   What do you think.


(d) If (b) is magically solved, then I have no take yet regarding cleanup.
    If not, then given (b), your cleanup will become expensive and you 
    will end up in a similar solution as I have it, namely a callback on 
    vmarea destruction and some ref counting. 

(e) In your solution you use throughout cmpxchg, which limits you from 
    utilizing atomic_inc/dec functions on i386.
    I have based my mutex implementation in user space on atomic_inc
    Spinning versions require cmpxchg though.
    Maybe it might be useful to differentiate these cases and provide at
    least non spinning versions for i386.

(f) Why get away from semaphores and only stick to mutexes. Isn't there
    some value to provide counting semaphores ? Effectively, mutexes are
    a subclass of semaphores with a max-count of 1.

(g) I don't understand the business of rechecking in the kernel. In your
    case it comes cheap as you pinned the page already. In general that's 
    true, since the page was just touched the pinning itself is reasonable
    cheap. Nevertheless, I am concerned that now you need intrinsic knowledge
    how the lock word is used in user space. For instance, I use it in 
    different fashions, dependent whether its a semaphore or rwlock.
    Why can't we keep that separated from the kernel function of waiting
    on an event that is signaled along the lock word, the content of the lock 
    word should not come into play in the kernel. 
    If you want this, you use spinning locks.

(h) how do you deal with signals ? E.g. SIGIO or something like it.

Overall, pinning down the page (assuming that not a lot of tasks are
waiting on a large number of queues at the same time) is acceptable, 
and it cuts out one additional level of indirection that is present 
in my submission.

-- Hubertus Franke (frankeh@watson.ibm.com)



^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-24 23:48     ` Linus Torvalds
  2002-02-25  1:10       ` Rusty Russell
@ 2002-03-02 14:54       ` Pavel Machek
  1 sibling, 0 replies; 47+ messages in thread
From: Pavel Machek @ 2002-03-02 14:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Rusty Russell, mingo, Matthew Kirkwood, Benjamin LaHaise,
	David Axmark, William Lee Irwin III, linux-kernel

Hi!

> > >   sys_sem_create()
> > >   sys_sem_destroy()
> >
> > There is no create and destroy (init is purely userspace).  There is
> > "this is a semapore: up it".  This is a feature.
> 
> No, that's a bug.
> 
> You have to realize that there are architectures that need special
> initialization and page allocation for semaphores: they need special flags
> in the TLB for "careful access", for example (sometimes the careful access
> ends up being non-cached).

Your user part is arch-dependend, anyway. So it can just mmap(..., O_CAREFULL).

									Pavel

-- 
Philips Velo 1: 1"x4"x8", 300gram, 60, 12MB, 40bogomips, linux, mutt,
details at http://atrey.karlin.mff.cuni.cz/~pavel/velo/index.html.


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-25 16:39               ` Alan Cox
  2002-02-25 16:32                 ` Benjamin LaHaise
  2002-02-25 17:06                 ` Linus Torvalds
@ 2002-03-03  7:07                 ` Rusty Russell
  2 siblings, 0 replies; 47+ messages in thread
From: Rusty Russell @ 2002-03-03  7:07 UTC (permalink / raw)
  To: Benjamin LaHaise; +Cc: alan, torvalds, mingo, matthew, david, wli, linux-kernel

On Mon, 25 Feb 2002 11:32:40 -0500
Benjamin LaHaise <bcrl@redhat.com> wrote:

> On Mon, Feb 25, 2002 at 04:39:56PM +0000, Alan Cox wrote:
> > _alloca
> > mmap
> > 
> > Still fits on the stack 8)
> 
> Are we sure that forcing semaphore overhead to the size of a page is a 
> good idea?  I'd much rather see a sleep/wakeup mechanism akin to wait 
> queues be exported by the kernel so that userspace can implement a rich 
> set of locking functions on top of that in whatever shared memory is 
> being used.

Unfortunately, no.  You need to know what userspace is using them for so
you can check to avoid the "add to waitqueue" race.

AFAICT a mutex is the simplest useful primitive that can be realistically
exported.

Hope that helps,
Rusty.
-- 
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-02-27  0:24 ` Rusty Russell
                     ` (2 preceding siblings ...)
  2002-03-02 14:50   ` Hubertus Franke
@ 2002-03-03 13:30   ` Rusty Russell
  2002-03-04 16:51     ` Hubertus Franke
  2002-03-05  4:41     ` Rusty Russell
  3 siblings, 2 replies; 47+ messages in thread
From: Rusty Russell @ 2002-03-03 13:30 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: torvalds, matthew, bcrl, david, wli, linux-kernel, lse-tech

On Sat, 2 Mar 2002 09:50:31 -0500
Hubertus Franke <frankeh@watson.ibm.com> wrote:

> On Wed, Feb 27, 2002 at 11:24:17AM +1100, Rusty Russell wrote:
> > - We can no longer do generic semaphores: mutexes only.
> > - Requires arch __update_count atomic op.
> 
>     I don't see that, you can use atomic_inc on 386s ..

__update_count needed to be able to do the decrement from 0 or
the count, whichever was greater.  I have eliminated this in the
new patch (patch III).

> (a) the hashing is to simple. As a consequence there are two limitations
>      -  the total number of locks that can be waited for is 
>         (1<< USEM_HASHBITS).
>         that is way not enough (as you pointed out).
>      - this has to be bumped up to roughly the number of tasks in the
>        system.

OK.  We dealt with hash collisions by sharing waitqueues.  This works, but
means we have to do wake-all.  My new patch uses its own queues: we still
share queues, but we only wake the first one waiting on our counter.

> (c) your implementation could be easily expanded to include Ben's
>     suggestion of multiple queues (which I already implement in my 
>     submission), by including the queue index into the hashing mechanism.

Hmmm.... it should be pretty easy to extend: the queues can be shared.

> (f) Why get away from semaphores and only stick to mutexes. Isn't there
>     some value to provide counting semaphores ? Effectively, mutexes are
>     a subclass of semaphores with a max-count of 1.

Yes, but counting semaphores need two variables, or cmpxchg.  Mutexes do not
need either (proof by implementation 8).  While general counting semaphores
may be useful, the fact that they are not implemented in the kernel suggests
they are not worth paying extra for.

> (g) I don't understand the business of rechecking in the kernel. In your
>     case it comes cheap as you pinned the page already. In general that's 
>     true, since the page was just touched the pinning itself is reasonable
>     cheap. Nevertheless, I am concerned that now you need intrinsic knowledge
>     how the lock word is used in user space. For instance, I use it in 
>     different fashions, dependent whether its a semaphore or rwlock.

This is a definite advantage: my old fd-based code never looked at the
userspace counter: it had a kernel sempahore to sleep on for each
userspace lock. OTOH, this involves kernel allocation, with the complexity
that brings.

> (h) how do you deal with signals ? E.g. SIGIO or something like it.

They're interruptible, so you'll get -ERESTARTSYS.  Should be fine.
 
> Overall, pinning down the page (assuming that not a lot of tasks are
> waiting on a large number of queues at the same time) is acceptable, 
> and it cuts out one additional level of indirection that is present 
> in my submission.

I agree.  While originally reluctant, when it's one page per sleeping
process it's rather neat.

Cheers!
Rusty.
-- 
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-03-03 13:30   ` Rusty Russell
@ 2002-03-04 16:51     ` Hubertus Franke
  2002-03-05  4:41     ` Rusty Russell
  1 sibling, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-03-04 16:51 UTC (permalink / raw)
  To: Rusty Russell; +Cc: torvalds, matthew, bcrl, david, wli, linux-kernel, lse-tech

On Mon, Mar 04, 2002 at 12:30:40AM +1100, Rusty Russell wrote:
> On Sat, 2 Mar 2002 09:50:31 -0500
> Hubertus Franke <frankeh@watson.ibm.com> wrote:
> 

I am pretty much ready to support Rusty's latest patch. I think
much of the stuff I mentioned as been addressed.
The hash collision elimination the biggest one.
The page pinning is a great idea and I see that to be a great 
advantage over what I tried to push.

> > On Wed, Feb 27, 2002 at 11:24:17AM +1100, Rusty Russell wrote:
> > > - We can no longer do generic semaphores: mutexes only.
> > > - Requires arch __update_count atomic op.
> > 
> >     I don't see that, you can use atomic_inc on 386s ..
> 
> __update_count needed to be able to do the decrement from 0 or
> the count, whichever was greater.  I have eliminated this in the
> new patch (patch III).
> 
> > (a) the hashing is to simple. As a consequence there are two limitations
> >      -  the total number of locks that can be waited for is 
> >         (1<< USEM_HASHBITS).
> >         that is way not enough (as you pointed out).
> >      - this has to be bumped up to roughly the number of tasks in the
> >        system.
> 
> OK.  We dealt with hash collisions by sharing waitqueues.  This works, but
> means we have to do wake-all.  My new patch uses its own queues: we still
> share queues, but we only wake the first one waiting on our counter.
> 
> > (c) your implementation could be easily expanded to include Ben's
> >     suggestion of multiple queues (which I already implement in my 
> >     submission), by including the queue index into the hashing mechanism.
> 
> Hmmm.... it should be pretty easy to extend: the queues can be shared.
> 

Correct, should we bring that into the interface already. 
Expand the syscall with an additional parameter identifying the queue.

> > (f) Why get away from semaphores and only stick to mutexes. Isn't there
> >     some value to provide counting semaphores ? Effectively, mutexes are
> >     a subclass of semaphores with a max-count of 1.
> 
> Yes, but counting semaphores need two variables, or cmpxchg.  Mutexes do not
> need either (proof by implementation 8).  While general counting semaphores
> may be useful, the fact that they are not implemented in the kernel suggests
> they are not worth paying extra for.

Not exactly, they need one atomic in user space and a semaphore in the kernel
which as you stated uses (atomic and sleepers).

> 
> > (g) I don't understand the business of rechecking in the kernel. In your
> >     case it comes cheap as you pinned the page already. In general that's 
> >     true, since the page was just touched the pinning itself is reasonable
> >     cheap. Nevertheless, I am concerned that now you need intrinsic knowledge
> >     how the lock word is used in user space. For instance, I use it in 
> >     different fashions, dependent whether its a semaphore or rwlock.
> 
> This is a definite advantage: my old fd-based code never looked at the
> userspace counter: it had a kernel sempahore to sleep on for each
> userspace lock. OTOH, this involves kernel allocation, with the complexity
> that brings.
> 

???, kernel allocation is only required in under contention. If you take a look 
at how I used the atomic word for semaphores and for rwlock_t, its different. 
If you recheck in the kernel you need to know how this is used.
In my approach I simply allocated two queues on demand.

I am OK with only supplying semaphores and rwlocks, if there is a general consent
about it. Nevertheless, I think the multiqueue approach is somewhat more elegant
and expandable.

> > (h) how do you deal with signals ? E.g. SIGIO or something like it.
> 
> They're interruptible, so you'll get -ERESTARTSYS.  Should be fine.
>  

That's what I did too, but some folks pointed out they might want to 
interrupt a waking task, so that it can get back into user space and
fail with EAGAIN or so and do not automatically restart the syscall.

> > Overall, pinning down the page (assuming that not a lot of tasks are
> > waiting on a large number of queues at the same time) is acceptable, 
> > and it cuts out one additional level of indirection that is present 
> > in my submission.
> 
> I agree.  While originally reluctant, when it's one page per sleeping
> process it's rather neat.
> 

Yipp, most applications that will take advantage of this will allocate
the locks anyway in a shared region, i.e. there will be more than
one lock per page.

> Cheers!
> Rusty.
> -- 

As I said above, I am favor for the general infrastructure that you have in 
place now. Can we hash out the rwlock issues again.
They can still be folded into a single queue (logical) and you wakeup
eiher the next writer or set of readers, or all readers ....
We need a mean to bring that through the API. I think about it.

Great job Rusty, pulling all these issues together with a comprehensive patch.
The few remaining issues are cravy :-)

>   Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

-- Hubertus  

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
  2002-03-03 13:30   ` Rusty Russell
  2002-03-04 16:51     ` Hubertus Franke
@ 2002-03-05  4:41     ` Rusty Russell
  1 sibling, 0 replies; 47+ messages in thread
From: Rusty Russell @ 2002-03-05  4:41 UTC (permalink / raw)
  To: Hubertus Franke
  Cc: torvalds, matthew, bcrl, david, wli, linux-kernel, lse-tech

On Mon, 4 Mar 2002 11:51:43 -0500
Hubertus Franke <frankeh@watson.ibm.com> wrote:
> > This is a definite advantage: my old fd-based code never looked at the
> > userspace counter: it had a kernel sempahore to sleep on for each
> > userspace lock. OTOH, this involves kernel allocation, with the complexity
> > that brings.
> > 
> 
> ???, kernel allocation is only required in under contention. If you take a look 
> at how I used the atomic word for semaphores and for rwlock_t, its different. 
> If you recheck in the kernel you need to know how this is used.
> In my approach I simply allocated two queues on demand.

Yes, but no allocation is even better than on-demand allocation, if you can
get away with it.

> I am OK with only supplying semaphores and rwlocks, if there is a general consent
> about it. Nevertheless, I think the multiqueue approach is somewhat more elegant
> and expandable.

Well, it's pretty easy to allow other values than -1/1 later as required.
I don't think the switch() overhead will be measurable for the first dozen
lock types 8).

> > > (h) how do you deal with signals ? E.g. SIGIO or something like it.
> > 
> > They're interruptible, so you'll get -ERESTARTSYS.  Should be fine.
> >  
> 
> That's what I did too, but some folks pointed out they might want to 
> interrupt a waking task, so that it can get back into user space and
> fail with EAGAIN or so and do not automatically restart the syscall.

Sorry, my bad.  I use -EINTR, so they don't get restarted, for exactly that
reason (eg. implementing timeouts on mutexes).

Cheers!
Rusty.
-- 
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
       [not found]       ` <3C7FDF76.9040903@dlr.de>
@ 2002-03-02 14:08         ` Hubertus Franke
  0 siblings, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-03-02 14:08 UTC (permalink / raw)
  To: Martin Wirth; +Cc: rusty, linux-kernel, lse-tech

On Fri, Mar 01, 2002 at 09:07:18PM +0100, Martin Wirth wrote:
> 
> 
> Hubertus Franke Worte:
> 
> >
> >So, it works and is correct. Hope that clarifies it, if not let me know.
> >Interestingly enough. This scheme doesn't work for spinning locks.
> >Goto lib/ulocks.c[91-133], I have integrated this dead code here
> >to show how not to do it. A similar analysis as above shows
> >that this approach wouldn't work. You need cmpxchg for these scenarios
> >(more or less).
> >
> 
> You are right, I falsely assumed the initial state to be [1,1].
> 
> But as mentioned in your README, your approach is currently is not able 
> to manage signal handling correctly.
> You have to ignore all non-fatal signals by using ESYSRESTART and a 
> SIG_KILL sent to one of the processes
> may corrupt your user-kernel-syncronisation. 
> 
> I don't think a user space semaphore implementation is acceptable until 
> it provides (signal-) interruptability and
> timeouts. But maybe you have some idea how to manage this.
> 
> Martin Wirth
> 
>  

As of the signal handling and ESYSRESTART.
The user code on the slow path can check for return code and
has two choices (a) reenter the kernel and wait or (b) correct the
status word, because its still counted as a waiting process and return
0. I have chosen (a) and I automated it.
I have tried to send signals (e.g. SIGIO) and it 
seems to work fine. The signal is handled in user space and returns back
to the kernel section. (b) is feasible but a requires a bit more exception
work in the routine.
As of the corruption case. There is flatout (almost) nothing you can do.
For instance, if a process graps the locks and exits, then another process
tries to acquire it you got a deadlock. At the least on the mailing list
most people feel that you are dead in the water anyway.

I have been thinking about the problem of corruption.
The owner of the lock could register its pid in the lock aread (2nd word).
That still leaves non-atomic windows open and is a half-ass solution.

But more conceptually, if you process dies while holding a lock that protects
some shared data, there is no guarantee that you can make about the data
itself if the updating process dies. In that kind of scenario, why
trying to continue as if nothing happened. 
It seems the general consent on the list is that your app is toast at that
point.


-- Hubertus


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
       [not found]   ` <20020227173307.GH322@reload.nmd.msu.ru>
@ 2002-02-27 22:09     ` Hubertus Franke
  0 siblings, 0 replies; 47+ messages in thread
From: Hubertus Franke @ 2002-02-27 22:09 UTC (permalink / raw)
  To: Joshua MacDonald; +Cc: linux-kernel, lse-tech

To followup on this. I have posted the latest version of this under
http://prdownloads.sourceforge.net/lse/ulocks-2.4.17.tar.bz2

- I followed the suggestion below to minimize ulock_t datastructure
  to do so go to ./make.setup and comment out the -DLOCK_STATISTICS flag
  this will bring the ulockt_t down to 2 words, otherwise 28 bytes.
- The ulockflex program will irrespectively allocate all locks on cacheline
  boundaries regardless of size.
- I fixed a problem in ulockflex that was introduced in the last version.
- I have now number of queues instead of an explicite lock type. Hence
  the kernel only knows about number of queues to maintain.

-- Hubertus Franke

On Wed, Feb 27, 2002 at 08:33:07PM +0300, Joshua MacDonald wrote:
> On Wed, Feb 27, 2002 at 11:58:42AM -0500, Hubertus Franke wrote:
> > On Wed, Feb 27, 2002 at 07:38:34PM +0300, Joshua MacDonald wrote:
> > > Hi Hubertus,
> > > 
> > > I have a question for you about these semaphores.  I took a glance at
> > > <linux/ulocks.h> to try and find out how large an object the
> > > user-space lock is.  I see that you have it written like this:
> > > 
> > > typedef struct ulock_t {
> > > 	unsigned long  status;
> > > 	unsigned long  type;
> > > 	unsigned long  counters[ULOCK_STAT_COUNTERS];
> > > 	char           pad[SMP_CACHE_BYTES - 
> > > 			   (ULOCK_STAT_COUNTERS+2)*sizeof(unsigned long)];
> > > } ulock_t ____cacheline_aligned;
> > > 
> > > I would like to suggest that you offer a version of the ulock_t that
> > > is as small as possible so that the user can make use of the entire
> > > cacheline rather than waste it on padding.
> > > 
> > > The reason I'm interested is that I have written a concurrent,
> > > cache-optimized skip list and I did all of my testing using the
> > > standard Linux spinlocks.  Those are 4 bytes per lock.  I use one lock
> > > per cacheline-sized node of the data structure.  If you can get your
> > > locks down to one or two words that would be really interesting, since
> > > spinlocks don't work terribly well in user-space.  I would really like
> > > to be able to use this data structure outside of the kernel, and your
> > > locks might just solve my problem, but only if they are small enough.
> > > 
> > > Do you see my point?  You can find my latest skiplist code at:
> > 
> > I have seen the light. Seriously, I initially had it as you said, but
> > Linus was strongly recommending cacheline size objects to 
> > avoid false sharing other than on the lock word
> > However, there should be no problem whatsoever to bring that back
> > to 2 words.
> 
> Sounds good.  There is nothing to prevent the declaration of a ulock_t
> variable from using __cacheline_aligned, so I think Linus is off on
> this one.
> 
> I'd like to test this out sometime.  The SLPC code uses a lots of CPP
> trickery to configure itself with various different read/write or
> exclusive locking packages, so its fairly easy to port to a new
> locking primitive.
> 
> -josh


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH] Lightweight userspace semaphores...
       [not found] <20020227163834.GF322@reload.nmd.msu.ru>
@ 2002-02-27 16:58 ` Hubertus Franke
       [not found]   ` <20020227173307.GH322@reload.nmd.msu.ru>
  0 siblings, 1 reply; 47+ messages in thread
From: Hubertus Franke @ 2002-02-27 16:58 UTC (permalink / raw)
  To: Joshua MacDonald; +Cc: rusty, linux-kernel, lse-tech

On Wed, Feb 27, 2002 at 07:38:34PM +0300, Joshua MacDonald wrote:
> Hi Hubertus,
> 
> I have a question for you about these semaphores.  I took a glance at
> <linux/ulocks.h> to try and find out how large an object the
> user-space lock is.  I see that you have it written like this:
> 
> typedef struct ulock_t {
> 	unsigned long  status;
> 	unsigned long  type;
> 	unsigned long  counters[ULOCK_STAT_COUNTERS];
> 	char           pad[SMP_CACHE_BYTES - 
> 			   (ULOCK_STAT_COUNTERS+2)*sizeof(unsigned long)];
> } ulock_t ____cacheline_aligned;
> 
> I would like to suggest that you offer a version of the ulock_t that
> is as small as possible so that the user can make use of the entire
> cacheline rather than waste it on padding.
> 
> The reason I'm interested is that I have written a concurrent,
> cache-optimized skip list and I did all of my testing using the
> standard Linux spinlocks.  Those are 4 bytes per lock.  I use one lock
> per cacheline-sized node of the data structure.  If you can get your
> locks down to one or two words that would be really interesting, since
> spinlocks don't work terribly well in user-space.  I would really like
> to be able to use this data structure outside of the kernel, and your
> locks might just solve my problem, but only if they are small enough.
> 
> Do you see my point?  You can find my latest skiplist code at:

I have seen the light. Seriously, I initially had it as you said, but
Linus was strongly recommending cacheline size objects to 
avoid false sharing other than on the lock word
However, there should be no problem whatsoever to bring that back
to 2 words.

> typedef struct ulock_t {
>       unsigned long  status;
>       unsigned long  qcount;   /* this will change instead of type */
> }

Counters are only there for lock statistics.
padding is only there for filling the cacheline (might be obsolute anyway.
What can be done is to make the ____cacheline_aligned a configuration
option. Nothing in the system relies on this. I'll see how I'll
do this.

Also, this can be brought down to one word, if we don't have
demand based kernel object allocation and/or multiple queue requirements
as requested by BenLeHaise.

Rusty, how would your pin based approach be effected by this ?

Comments ?

-- Hubertus

> 
>     http://sourceforge.net/projects/skiplist
> 
> I have put test results up on that site as well, but those tests were
> made using spinlocks at user-level!  In otherwords, I don't really
> believe my results are meaningful.
> 
> (And let me warn you that there's a bug and haven't uploaded the
> latest version yet...)
> 
> -josh

^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2002-03-05  4:38 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-02-23  3:47 [PATCH] Lightweight userspace semaphores Rusty Russell
2002-02-23 15:03 ` Ingo Molnar
2002-02-23 18:20   ` Linus Torvalds
2002-02-23 18:28     ` Larry McVoy
2002-02-23 20:31       ` Ingo Molnar
2002-02-23 21:22       ` Alan Cox
2002-02-26 16:09     ` Hubertus Franke
2002-02-24 23:29   ` Rusty Russell
2002-02-24 23:48     ` Linus Torvalds
2002-02-25  1:10       ` Rusty Russell
2002-02-25  1:23         ` Linus Torvalds
2002-02-25 13:14           ` Alan Cox
2002-02-25 16:11             ` Linus Torvalds
2002-02-25 16:39               ` Alan Cox
2002-02-25 16:32                 ` Benjamin LaHaise
2002-02-25 17:42                   ` Alan Cox
2002-02-25 18:23                   ` Hubertus Franke
2002-02-25 20:57                     ` Hubertus Franke
2002-02-25 17:06                 ` Linus Torvalds
2002-02-25 17:31                   ` Alan Cox
2002-02-25 17:20                     ` Linus Torvalds
2002-02-25 17:50                       ` Alan Cox
2002-02-25 17:44                         ` Linus Torvalds
2002-02-25 18:06                           ` Alan Cox
2002-02-25 19:31                             ` Linus Torvalds
2002-02-24  4:57                               ` Daniel Phillips
2002-02-25 19:51                             ` Hubertus Franke
2002-02-26 12:15                       ` [PATCH] 2.5.5 IDE clean 14 Martin Dalecki
2002-02-26 21:49                         ` Keith Owens
2002-02-27 10:08                           ` Martin Dalecki
2002-03-03  7:07                 ` [PATCH] Lightweight userspace semaphores Rusty Russell
2002-03-01  4:56           ` Eric W. Biederman
2002-03-02 14:54       ` Pavel Machek
2002-02-25 15:00 ` Hubertus Franke
2002-03-01  4:44   ` Eric W. Biederman
2002-02-27  0:24 ` Rusty Russell
2002-02-27 15:53   ` Hubertus Franke
2002-03-01  0:24     ` Richard Henderson
2002-03-01  2:00       ` Hubertus Franke
2002-02-27 16:29   ` Hubertus Franke
2002-03-02 14:50   ` Hubertus Franke
2002-03-03 13:30   ` Rusty Russell
2002-03-04 16:51     ` Hubertus Franke
2002-03-05  4:41     ` Rusty Russell
2002-02-27  8:43 [PATCH] Lightweight userspace semphores Martin Wirth
2002-02-27 15:24 ` Hubertus Franke
2002-02-27 17:17   ` Martin Wirth
2002-02-27 19:04     ` Hubertus Franke
     [not found]       ` <3C7FDF76.9040903@dlr.de>
2002-03-02 14:08         ` [PATCH] Lightweight userspace semaphores Hubertus Franke
     [not found] <20020227163834.GF322@reload.nmd.msu.ru>
2002-02-27 16:58 ` Hubertus Franke
     [not found]   ` <20020227173307.GH322@reload.nmd.msu.ru>
2002-02-27 22:09     ` Hubertus Franke

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).