linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] one of $BIGNUM devfs races
@ 2001-08-06 11:58 Alexander Viro
  2001-08-06 16:47 ` Richard Gooch
  2001-08-06 23:50 ` Richard Gooch
  0 siblings, 2 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-06 11:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Richard Gooch, Alan Cox, linux-kernel

	OK, folks - that's it.  By all reasonable standards a year _is_
sufficient time to fix an obvious race.  One in devfs/base.c::create_entry()
had been described to Richard more than a year ago.  While I respect the
"I'll do it myself, don't spoil the fun" stance, it's clearly over the
bleedin' top.  Patch for that one is in the end of posting.  Linus, see
if it looks sane for you.

	Richard, _please_, stop adding features and spend some of the
freed time fixing the long-standing security holes.  E.g., readlink()
on devfs is a big "boys, come and get some" sign on the kernel's arse.
_Anyone_ can crash the box with devfs mounted on it as soon as rmmod
removes a symlink.  Yes, that one had been pointed out to you only
a couple of months ago.  It's less than a year, but could we please
get it fixed in some reasonable time?  By the end of December or
something...

	When devfs went into the tree, the word was "at least it will
make people look at the code".  Well, it did.  Veni, vidi, vomere.
There are tons of bad races in devfs/base.c.  Reported to you many times -
just look through your mailbox.  Richard, please, either fix the crap
yourself or step down and admit that devfs is unmaintained.  Saying
that you'll fix it yourself is nice, but there's a point when it gets
really old.  And that point had been crossed _way_ back.

--- S8-pre4/fs/devfs/base.c	Sun Jul 29 01:54:47 2001
+++ /tmp/base.c	Mon Aug  6 07:14:09 2001
@@ -789,47 +789,70 @@
 static struct devfs_entry *create_entry (struct devfs_entry *parent,
 					 const char *name,unsigned int namelen)
 {
-    struct devfs_entry *new, **table;
+	struct devfs_entry *new;
 
-    /*  First ensure table size is enough  */
-    if (fs_info.num_inodes >= fs_info.table_size)
-    {
-	if ( ( table = kmalloc (sizeof *table *
-				(fs_info.table_size + INODE_TABLE_INC),
-				GFP_KERNEL) ) == NULL ) return NULL;
-	fs_info.table_size += INODE_TABLE_INC;
+	if (name && namelen<1)
+		namelen = strlen (name);
+
+	new = kmalloc(sizeof(*new) + namelen, GFP_KERNEL);
+
+	if (!new)
+		return NULL;
+
+	/*
+	 * Magic: this will set the ctime to zero, thus subsequent lookups
+	 * will trigger the call to <update_devfs_inode_from_entry>
+	 */
+	memset (new, 0, sizeof *new + namelen);
+	new->parent = parent;
+	if (name)
+		memcpy (new->name, name, namelen);
+	new->namelen = namelen;
+	new->inode.nlink = 1;
+
+	/*  Ensure table size is enough  */
+	while (fs_info.num_inodes >= fs_info.table_size) {
+		unsigned new_size = fs_info.table_size + INODE_TABLE_INC;
+		struct devfs_entry **table;
+
+		table = kmalloc(sizeof(*table) * new_size, GFP_KERNEL);
+
+		if (new_size <= fs_info.table_size) {
+			kfree(table);
+			continue;
+		}
+		if (!table) {
+			kfree(new);
+			return NULL;
+		}
+		fs_info.table_size = new_size;
+		if (!fs_info.table) {
+			fs_info.table = table;
+			break;
+		}
+		memcpy(table, fs_info.table, sizeof(*table)*fs_info.num_inodes);
+		kfree (fs_info.table);
+		fs_info.table = table;
 #ifdef CONFIG_DEVFS_DEBUG
-	if (devfs_debug & DEBUG_I_CREATE)
-	    printk ("%s: create_entry(): grew inode table to: %u entries\n",
-		    DEVFS_NAME, fs_info.table_size);
+		if (devfs_debug & DEBUG_I_CREATE)
+			printk("%s: create_entry(): grew inode table to:"
+				"%u entries\n", DEVFS_NAME, new_size);
 #endif
-	if (fs_info.table)
-	{
-	    memcpy (table, fs_info.table, sizeof *table *fs_info.num_inodes);
-	    kfree (fs_info.table);
 	}
-	fs_info.table = table;
-    }
-    if ( name && (namelen < 1) ) namelen = strlen (name);
-    if ( ( new = kmalloc (sizeof *new + namelen, GFP_KERNEL) ) == NULL )
-	return NULL;
-    /*  Magic: this will set the ctime to zero, thus subsequent lookups will
-	trigger the call to <update_devfs_inode_from_entry>  */
-    memset (new, 0, sizeof *new + namelen);
-    new->parent = parent;
-    if (name) memcpy (new->name, name, namelen);
-    new->namelen = namelen;
-    new->inode.ino = fs_info.num_inodes + FIRST_INODE;
-    new->inode.nlink = 1;
-    fs_info.table[fs_info.num_inodes] = new;
-    ++fs_info.num_inodes;
-    if (parent == NULL) return new;
-    new->prev = parent->u.dir.last;
-    /*  Insert into the parent directory's list of children  */
-    if (parent->u.dir.first == NULL) parent->u.dir.first = new;
-    else parent->u.dir.last->next = new;
-    parent->u.dir.last = new;
-    return new;
+
+	new->inode.ino = fs_info.num_inodes + FIRST_INODE;
+	fs_info.table[fs_info.num_inodes++] = new;
+
+	if (parent) {
+		new->prev = parent->u.dir.last;
+		/*  Insert into the parent directory's list of children  */
+		if (parent->u.dir.first)
+			parent->u.dir.last->next = new;
+		else
+			parent->u.dir.first = new;
+		parent->u.dir.last = new;
+	}
+	return new;
 }   /*  End Function create_entry  */
 
 static void update_devfs_inode_from_entry (struct devfs_entry *de)


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-06 11:58 [PATCH] one of $BIGNUM devfs races Alexander Viro
@ 2001-08-06 16:47 ` Richard Gooch
  2001-08-06 16:56   ` Rik van Riel
  2001-08-06 17:11   ` Richard Gooch
  2001-08-06 23:50 ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-06 16:47 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 	OK, folks - that's it.  By all reasonable standards a year
> _is_ sufficient time to fix an obvious race.  One in
> devfs/base.c::create_entry() had been described to Richard more than
> a year ago.  While I respect the "I'll do it myself, don't spoil the
> fun" stance, it's clearly over the bleedin' top.  Patch for that one
> is in the end of posting.  Linus, see if it looks sane for you.

Well, funny you send this today, Al, as today was supposed to be the
day I start work on fixing a pile of races. I've got the most
important features in before 2.5 is forked, and I've got a free day to
get started on this.

I'll look at your patch after breakfast :-)

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-06 16:47 ` Richard Gooch
@ 2001-08-06 16:56   ` Rik van Riel
  2001-08-06 17:11   ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Rik van Riel @ 2001-08-06 16:56 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alexander Viro, linux-kernel

On Mon, 6 Aug 2001, Richard Gooch wrote:

> Well, funny you send this today, Al, as today was supposed to be
> the day I start work on fixing a pile of races. I've got the
> most important features in before 2.5 is forked, and I've got a
> free day to get started on this.

"most important features in before 2.5 is forked" ?!?!

While you leave bugfixes lying around for a YEAR ?!?!

Can you get any more irresponsible as a maintainer ?

Rik
--
Executive summary of a recent Microsoft press release:
   "we are concerned about the GNU General Public License (GPL)"


		http://www.surriel.com/
http://www.conectiva.com/	http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-06 16:47 ` Richard Gooch
  2001-08-06 16:56   ` Rik van Riel
@ 2001-08-06 17:11   ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-06 17:11 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Alexander Viro, linux-kernel

Rik van Riel writes:
> On Mon, 6 Aug 2001, Richard Gooch wrote:
> 
> > Well, funny you send this today, Al, as today was supposed to be
> > the day I start work on fixing a pile of races. I've got the
> > most important features in before 2.5 is forked, and I've got a
> > free day to get started on this.
> 
> "most important features in before 2.5 is forked" ?!?!
> 
> While you leave bugfixes lying around for a YEAR ?!?!
> 
> Can you get any more irresponsible as a maintainer ?

We all have different priorities. And I've had an extemely bad year (a
dozen different trips, I've moved twice (not by choice), I've got an
ongoing flooding problem where I live now (and the property manager
isn't fixing it right, he's just doing band-aid solutions to reduce
the problem)). On top of all that, I also have a life.

So don't start a stupid flamewar. I could rant about how the VM has
sucked for, what, 3 years now? Or is it longer? It's been so bad for
so long I can't remember anymore when it was good. The only changes
I've noticed are oscillations between bad and worse.

Get off my back.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-06 11:58 [PATCH] one of $BIGNUM devfs races Alexander Viro
  2001-08-06 16:47 ` Richard Gooch
@ 2001-08-06 23:50 ` Richard Gooch
  2001-08-07  0:34   ` Alexander Viro
  2001-08-07  0:51   ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-06 23:50 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 	OK, folks - that's it.  By all reasonable standards a year
> _is_ sufficient time to fix an obvious race.  One in
> devfs/base.c::create_entry() had been described to Richard more than
> a year ago.  While I respect the "I'll do it myself, don't spoil the
> fun" stance, it's clearly over the bleedin' top.  Patch for that one
> is in the end of posting.  Linus, see if it looks sane for you.

Linus: please don't apply.
Alan: I notice you've put Al's patch into 2.4.7-ac8. Please remove it.

This patch has the following ugly construct:

> +	/*  Ensure table size is enough  */
> +	while (fs_info.num_inodes >= fs_info.table_size) {

Putting the allocation inside a while loop is horrible, and isn't a
perfect solution anyway. I'm fixing this (and other races) with proper
locking. If you went to the trouble to start patching, why at least
didn't you do it cleanly with a lock?

Furthermore, the patch makes gratuitous formatting changes (which made
it harder to see what it actually *changed*).

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-06 23:50 ` Richard Gooch
@ 2001-08-07  0:34   ` Alexander Viro
  2001-08-07  0:51   ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  0:34 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel


On Mon, 6 Aug 2001, Richard Gooch wrote:

> This patch has the following ugly construct:
> 
> > +	/*  Ensure table size is enough  */
> > +	while (fs_info.num_inodes >= fs_info.table_size) {
> 
> Putting the allocation inside a while loop is horrible, and isn't a

Why, exactly? I can show you quite a few places where we do exactly that
(allocate and if somebody else had done it before us - free and repeat).
Pretty common for situations when we want low-contention spinlocks
to protect actual reassignment of buffer (in this case BKL acts as such).

> perfect solution anyway. I'm fixing this (and other races) with proper
> locking. If you went to the trouble to start patching, why at least
> didn't you do it cleanly with a lock?

Because it means adding a per-superblock lock for no good reason.
 
> Furthermore, the patch makes gratuitous formatting changes (which made
> it harder to see what it actually *changed*).

_Gratitious_?  You want your style (which, BTW, flies in the face of
Documentation/CodingStyle) - you do it in some vaguely reasonable time.
Excuse me, but I might be inclined to follow your style half a year
ago.  By now, IMO, you've lost any grounds for complaining.  There is a
bunch of holes.  Holes that need fixing.  If you "have other priorities"
for that long - expect other folks to start fixing them without any
respect to your opinion on style.

/me is sorely tempted to say "screw it" and just do fork'n'rewrite...

PS: ObYourPropertyManager: karmic retribution?


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-06 23:50 ` Richard Gooch
  2001-08-07  0:34   ` Alexander Viro
@ 2001-08-07  0:51   ` Richard Gooch
  2001-08-07  1:10     ` Alexander Viro
  2001-08-07  1:27     ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  0:51 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> On Mon, 6 Aug 2001, Richard Gooch wrote:
> 
> > This patch has the following ugly construct:
> > 
> > > +	/*  Ensure table size is enough  */
> > > +	while (fs_info.num_inodes >= fs_info.table_size) {
> > 
> > Putting the allocation inside a while loop is horrible, and isn't a
> 
> Why, exactly? I can show you quite a few places where we do exactly
> that (allocate and if somebody else had done it before us - free and
> repeat).  Pretty common for situations when we want low-contention
> spinlocks to protect actual reassignment of buffer (in this case BKL
> acts as such).

Even if it were only a situation of allocating, I don't like the loop,
because it means you can end up allocating twice for no reason.

More importantly, the loop you used doesn't protect insertions into
the table. So it's not safe on SMP.

Anyway, I've already fixed the allocation race you're concerned about,
plus the insertion race, in my tree (using a semaphore).

> > perfect solution anyway. I'm fixing this (and other races) with proper
> > locking. If you went to the trouble to start patching, why at least
> > didn't you do it cleanly with a lock?
> 
> Because it means adding a per-superblock lock for no good reason.

Well, it's just a single lock (only one devfs superblock possible).
And this is generally a low-contention case (new devfs entries are not
created that often). Using the lock also keeps the code simpler.

> PS: ObYourPropertyManager: karmic retribution?

Um, retribution for putting in an awful lot of time developing devfs
(despite extreme hostility), at considerable personal sacrifice, and
being patient and civilised to those who flamed against it? My, how
I've been such a horrible person.

I find your comment on karmic retribution repugnant.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  0:51   ` Richard Gooch
@ 2001-08-07  1:10     ` Alexander Viro
  2001-08-07  1:27     ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  1:10 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Mon, 6 Aug 2001, Richard Gooch wrote:

> More importantly, the loop you used doesn't protect insertions into
> the table. So it's not safe on SMP.

Nope.  Allocation of entry itself is moved ahead of the loop, so
we insert immediately after expanding the thing.

> > PS: ObYourPropertyManager: karmic retribution?
> 
> Um, retribution for putting in an awful lot of time developing devfs
> (despite extreme hostility), at considerable personal sacrifice, and
> being patient and civilised to those who flamed against it? My, how
> I've been such a horrible person.

<tongue-in-cheek>
Nah, not that. Not plugging the holes that need to be plugged. Admit
it, there is some poetic justice in the situation.
</tongue-in-cheek>

As for the repugnant comments - IMO your "On the top of that, I have
a life" used in the context it was used in counts pretty high on that
scale. You know, you are _not_ unique in that respect.

Whatever.  I just hope that this time all that mess will be fixed and by
now I really don't care who does it and what does it take.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  0:51   ` Richard Gooch
  2001-08-07  1:10     ` Alexander Viro
@ 2001-08-07  1:27     ` Richard Gooch
  2001-08-07  1:42       ` Alexander Viro
                         ` (2 more replies)
  1 sibling, 3 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  1:27 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> 
> On Mon, 6 Aug 2001, Richard Gooch wrote:
> 
> > More importantly, the loop you used doesn't protect insertions into
> > the table. So it's not safe on SMP.
> 
> Nope.  Allocation of entry itself is moved ahead of the loop, so
> we insert immediately after expanding the thing.

I'm referring specifically to this code:
    new->inode.ino = fs_info.num_inodes + FIRST_INODE;
    fs_info.table[fs_info.num_inodes++] = new;

This is not SMP safe. Besides, even the allocation loop isn't SMP
safe. If two tasks both allocate a table, they each could end up
calling:
	kfree (fs_info.table);
for the same value. Or for a different one (which is also bad).

> > > PS: ObYourPropertyManager: karmic retribution?
> > 
> > Um, retribution for putting in an awful lot of time developing devfs
> > (despite extreme hostility), at considerable personal sacrifice, and
> > being patient and civilised to those who flamed against it? My, how
> > I've been such a horrible person.
> 
> <tongue-in-cheek>
> Nah, not that. Not plugging the holes that need to be plugged. Admit
> it, there is some poetic justice in the situation.
> </tongue-in-cheek>

Ah. I didn't get that one. OK. I guess seen that way it is funny.
Unfortunately I've been living with this flooding problem for several
months, so I'm a bit touchy on the subject. It is incredibly
disruptive to getting work done.

> As for the repugnant comments - IMO your "On the top of that, I have
> a life" used in the context it was used in counts pretty high on that
> scale. You know, you are _not_ unique in that respect.

Oh, that's not what I meant. I get the impression from some quarters
that one is expected to always put work ahead of personal
considerations. That's what I was railing against.

I apologise if it came across the other way (i.e. implying that others
don't have a life: it's obviously incorrect, and besides, I don't make
those kinds of cheap shots).

> Whatever.  I just hope that this time all that mess will be fixed
> and by now I really don't care who does it and what does it take.

Well, I *am* doing it. Like I said, today was due to be the day.
Really.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  1:27     ` Richard Gooch
@ 2001-08-07  1:42       ` Alexander Viro
  2001-08-07  2:00       ` Richard Gooch
  2001-08-07  5:53       ` Richard Gooch
  2 siblings, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  1:42 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Mon, 6 Aug 2001, Richard Gooch wrote:

> I'm referring specifically to this code:
>     new->inode.ino = fs_info.num_inodes + FIRST_INODE;
>     fs_info.table[fs_info.num_inodes++] = new;
> 
> This is not SMP safe. Besides, even the allocation loop isn't SMP
> safe. If two tasks both allocate a table, they each could end up
> calling:
> 	kfree (fs_info.table);
> for the same value. Or for a different one (which is also bad).

BKL. kfree() is non-blocking. IOW, critical area can be placed under a
spinlock and BKL acts as such. We can trivially replace it with
a spinlock (static in function).

Actually, there is another problem with that code and it has nothing to
SMP. You never shrink that table and AFAICS you never reuse the entries.
IOW, you've got a leak there.

Why on the Earth do you need it, in the first place? Just put the
pointer to entry into inode->u.generic_ip and be done with that -
it kills all that mess for good. AFAICS the only places where you
really use that table is your get_devfs_entry_from_vfs_inode()
and devfs_write_inode(). In both cases pointer would be obviously more
convenient.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  1:27     ` Richard Gooch
  2001-08-07  1:42       ` Alexander Viro
@ 2001-08-07  2:00       ` Richard Gooch
  2001-08-07  2:15         ` Alexander Viro
  2001-08-07  5:17         ` Richard Gooch
  2001-08-07  5:53       ` Richard Gooch
  2 siblings, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  2:00 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> 
> On Mon, 6 Aug 2001, Richard Gooch wrote:
> 
> > I'm referring specifically to this code:
> >     new->inode.ino = fs_info.num_inodes + FIRST_INODE;
> >     fs_info.table[fs_info.num_inodes++] = new;
> > 
> > This is not SMP safe. Besides, even the allocation loop isn't SMP
> > safe. If two tasks both allocate a table, they each could end up
> > calling:
> > 	kfree (fs_info.table);
> > for the same value. Or for a different one (which is also bad).
> 
> BKL. kfree() is non-blocking. IOW, critical area can be placed under a
> spinlock and BKL acts as such. We can trivially replace it with
> a spinlock (static in function).
> 
> Actually, there is another problem with that code and it has nothing
> to SMP. You never shrink that table and AFAICS you never reuse the
> entries.  IOW, you've got a leak there.

The devfs entries are reused *for the same name*. But yes, if "fred"
is registered and unregistered, and is never registered again, it will
indeed stick around forever. In general, this is not a significant
problem, since the same name tends to be re-registered later. Said
another way: there aren't many different temporary entries.

The reason for not freeing stuff is simplicity. Without proper
locking, I was able to avoid a lot of races. Now that I'm putting
locks in, I can consider freeing stuff (after the locks are in).

> Why on the Earth do you need it, in the first place? Just put the
> pointer to entry into inode->u.generic_ip and be done with that - it
> kills all that mess for good. AFAICS the only places where you
> really use that table is your get_devfs_entry_from_vfs_inode() and
> devfs_write_inode(). In both cases pointer would be obviously more
> convenient.

Again, historical reasons. When I wrote devfs, the pipe data trampled
the inode->u.generic_ip pointer. So that's no good. I see that the
pipe data has been moved away. Good. Hm. But there's still the
inode->u.socket_i structure. I'd need to check where that gets
trampled.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  2:00       ` Richard Gooch
@ 2001-08-07  2:15         ` Alexander Viro
  2001-08-07  5:17         ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  2:15 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Mon, 6 Aug 2001, Richard Gooch wrote:

> Again, historical reasons. When I wrote devfs, the pipe data trampled
> the inode->u.generic_ip pointer. So that's no good. I see that the
> pipe data has been moved away. Good. Hm. But there's still the
> inode->u.socket_i structure. I'd need to check where that gets
> trampled.

It isn't. socket_i is used only in inodes allocated by sock_alloc().
It is not used in the inodes that live on any fs other than sockfs.
For local-domain socket you get _two_ kinds of inodes, both with
S_IFSOCK in ->i_mode: one on the filesystem (acting like an meeting
place) and another - bearing the actual socket and used for all IO.

In other words, the only kind you can get from mknod(2) never uses
->i_socket. It's used only by bind() and connect() - and only as
a place in namespace. The only thing we ever look at is ownership
and permissions - they determine who can bind()/connect() here.

So ->u.generic_ip is safe.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  2:00       ` Richard Gooch
  2001-08-07  2:15         ` Alexander Viro
@ 2001-08-07  5:17         ` Richard Gooch
  2001-08-07  5:28           ` Alexander Viro
                             ` (2 more replies)
  1 sibling, 3 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  5:17 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> 
> On Mon, 6 Aug 2001, Richard Gooch wrote:
> 
> > Again, historical reasons. When I wrote devfs, the pipe data trampled
> > the inode->u.generic_ip pointer. So that's no good. I see that the
> > pipe data has been moved away. Good. Hm. But there's still the
> > inode->u.socket_i structure. I'd need to check where that gets
> > trampled.
> 
> It isn't. socket_i is used only in inodes allocated by sock_alloc().
> It is not used in the inodes that live on any fs other than sockfs.

OK. Although, where is sockfs?

> For local-domain socket you get _two_ kinds of inodes, both with
> S_IFSOCK in ->i_mode: one on the filesystem (acting like an meeting
> place) and another - bearing the actual socket and used for all IO.
> 
> In other words, the only kind you can get from mknod(2) never uses
> ->i_socket. It's used only by bind() and connect() - and only as
> a place in namespace. The only thing we ever look at is ownership
> and permissions - they determine who can bind()/connect() here.
> 
> So ->u.generic_ip is safe.

Great! That table is toast! All I need is a spinlock-protected
incrementing counter :-)

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  5:17         ` Richard Gooch
@ 2001-08-07  5:28           ` Alexander Viro
  2001-08-07 10:21           ` Anton Altaparmakov
  2001-08-07 17:09           ` Richard Gooch
  2 siblings, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  5:28 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Mon, 6 Aug 2001, Richard Gooch wrote:

> > It isn't. socket_i is used only in inodes allocated by sock_alloc().
> > It is not used in the inodes that live on any fs other than sockfs.
> 
> OK. Although, where is sockfs?

net/socket.c. It's a pseudo-fs where all real sockets go. Nothing
spectacular - just a superblock + directory + bunch of dentries.
When we allocate a socket (socket(2), accept(2), socketpair(2), etc.)
its dentry is made a child of sockfs root and given the right name,
so that readlink on /proc/<pid>/fd/<n> would do the right thing
without any special-casing. Besides, we can get rid of checks for
inode->i_sb != NULL - it's _always_ non-NULL now (we have a similar
one for pipe(2)-created pipes). We also get pipe and socket struct file
out of the anonymous list - they live on the ->s_files of their
superblocks. Neither sockfs nor pipefs can be mounted by user, so it's 
not visible to user, or, for that matter, to the rest of the kernel.
Some special-casing is gone - that's it.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  1:27     ` Richard Gooch
  2001-08-07  1:42       ` Alexander Viro
  2001-08-07  2:00       ` Richard Gooch
@ 2001-08-07  5:53       ` Richard Gooch
  2001-08-07  6:23         ` Alexander Viro
  2001-08-07  6:36         ` Richard Gooch
  2 siblings, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  5:53 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> Why on the Earth do you need it, in the first place? Just put the
> pointer to entry into inode->u.generic_ip and be done with that -
> it kills all that mess for good. AFAICS the only places where you
> really use that table is your get_devfs_entry_from_vfs_inode()
> and devfs_write_inode(). In both cases pointer would be obviously more
> convenient.

Damn. I've just run into a snag. My read_inode() needs to dereference
inode->u.generic_ip, however, I can only initialise this *after* the
call to iget() finishes. Now, I could shoehorn my pointer into
inode->ino (thanks to it being an unsigned long), but that's pretty
gross.

I also notice iget4() and the read_inode2() method, however, from the
comments, it looks like those are reiserfs-specific, and will die
soon. At the very least, it seems use thereof is discouraged.

Suggestions?

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  5:53       ` Richard Gooch
@ 2001-08-07  6:23         ` Alexander Viro
  2001-08-07  6:36         ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  6:23 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Mon, 6 Aug 2001, Richard Gooch wrote:

> Damn. I've just run into a snag. My read_inode() needs to dereference
> inode->u.generic_ip, however, I can only initialise this *after* the
> call to iget() finishes. Now, I could shoehorn my pointer into
> inode->ino (thanks to it being an unsigned long), but that's pretty
> gross.
> 
> I also notice iget4() and the read_inode2() method, however, from the
> comments, it looks like those are reiserfs-specific, and will die
> soon. At the very least, it seems use thereof is discouraged.
> 
> Suggestions?

Lose ->read_inode(). Since your inode numbers are not stable across
reboot you can't use iget for NFS-exporting devfs (even if you would
want to export it in the first place). So there is no reason whatsoever
to use it.

Add put_inode: force_delete, into your super_operations and replace
your call of iget() with

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = whatever;
		/* stuff you've used to do in devfs_read_inode */
	}

Notice that here you have pointer to 'entry', so there is no problem
with passing it. ->read_inode() simply goes away. Besides, that way
you don't pollute icache hash chains - devfs inodes stay out of hash.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  5:53       ` Richard Gooch
  2001-08-07  6:23         ` Alexander Viro
@ 2001-08-07  6:36         ` Richard Gooch
  2001-08-07  6:40           ` Alexander Viro
  2001-08-07  6:47           ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  6:36 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> 
> On Mon, 6 Aug 2001, Richard Gooch wrote:
> 
> > Damn. I've just run into a snag. My read_inode() needs to dereference
> > inode->u.generic_ip, however, I can only initialise this *after* the
> > call to iget() finishes. Now, I could shoehorn my pointer into
> > inode->ino (thanks to it being an unsigned long), but that's pretty
> > gross.
> > 
> > I also notice iget4() and the read_inode2() method, however, from the
> > comments, it looks like those are reiserfs-specific, and will die
> > soon. At the very least, it seems use thereof is discouraged.
> > 
> > Suggestions?
> 
> Lose ->read_inode(). Since your inode numbers are not stable across
> reboot you can't use iget for NFS-exporting devfs (even if you would
> want to export it in the first place). So there is no reason whatsoever
> to use it.

OK...

> Add put_inode: force_delete, into your super_operations and replace
> your call of iget() with
> 
> 	inode = new_inode(sb);
> 	if (inode) {
> 		inode->i_ino = whatever;
> 		/* stuff you've used to do in devfs_read_inode */
> 	}
> 
> Notice that here you have pointer to 'entry', so there is no problem
> with passing it. ->read_inode() simply goes away. Besides, that way
> you don't pollute icache hash chains - devfs inodes stay out of hash.

Um, what will happen to inode change events? What exactly is the
purpose of these hash chains?

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  6:36         ` Richard Gooch
@ 2001-08-07  6:40           ` Alexander Viro
  2001-08-07  6:47           ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  6:40 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Tue, 7 Aug 2001, Richard Gooch wrote:

> > Notice that here you have pointer to 'entry', so there is no problem
> > with passing it. ->read_inode() simply goes away. Besides, that way
> > you don't pollute icache hash chains - devfs inodes stay out of hash.
> 
> Um, what will happen to inode change events? What exactly is the
> purpose of these hash chains?

iget() uses them to find inode by superblock and inode number.
If you don't use iget()...

I'm not sure what do you call an inode change event, though. Stuff
like chmod() and friends? They call ->setattr() (devfs_notify_change(),
in your case) and that has nothing to icache (you already have the
inode). Or had I completely misparsed you?


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  6:36         ` Richard Gooch
  2001-08-07  6:40           ` Alexander Viro
@ 2001-08-07  6:47           ` Richard Gooch
  2001-08-07  7:31             ` Alexander Viro
  2001-08-07 18:11             ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07  6:47 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> 
> On Tue, 7 Aug 2001, Richard Gooch wrote:
> 
> > > Notice that here you have pointer to 'entry', so there is no problem
> > > with passing it. ->read_inode() simply goes away. Besides, that way
> > > you don't pollute icache hash chains - devfs inodes stay out of hash.
> > 
> > Um, what will happen to inode change events? What exactly is the
> > purpose of these hash chains?
> 
> iget() uses them to find inode by superblock and inode number.
> If you don't use iget()...

OK.

> I'm not sure what do you call an inode change event, though. Stuff
> like chmod() and friends?

Yes, that's what I meant.

> They call ->setattr() (devfs_notify_change(), in your case) and that
> has nothing to icache (you already have the inode). Or had I
> completely misparsed you?

You parsed correctly. I know ->setattr() is called. I just wanted to
make sure that the icache didn't have some subtle interaction I was
missing. Such as ->write_inode() not being called.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  6:47           ` Richard Gooch
@ 2001-08-07  7:31             ` Alexander Viro
  2001-08-07 18:11             ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07  7:31 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Tue, 7 Aug 2001, Richard Gooch wrote:

> > They call ->setattr() (devfs_notify_change(), in your case) and that
> > has nothing to icache (you already have the inode). Or had I
> > completely misparsed you?
> 
> You parsed correctly. I know ->setattr() is called. I just wanted to
> make sure that the icache didn't have some subtle interaction I was
> missing. Such as ->write_inode() not being called.

Oh... That actually begs for another question - why the heck do you
need ->write_inode()?

Sorry - I had missed the presense of that animal. Hmm... OK.
Variant 1:
	insert_inode_hash(inode). Silly, but will work
Variant 2:
	kill devfs_write_inode() - do its equivalent in notify_change()
_and_ set the de->atime in ->readlink() and ->follow_link(). The latter
is due to update_atime() logics. And no, I don't like it.

Actually, it stinks - we have UPDATE_ATIME implemented by direct assignment to
->i_atime instead of the proper way (->setattr()). Linus, would you mind if
I change that? That would be the Right Thing(tm), but it might be too heavy.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  5:17         ` Richard Gooch
  2001-08-07  5:28           ` Alexander Viro
@ 2001-08-07 10:21           ` Anton Altaparmakov
  2001-08-07 17:09           ` Richard Gooch
  2 siblings, 0 replies; 41+ messages in thread
From: Anton Altaparmakov @ 2001-08-07 10:21 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alexander Viro, Linus Torvalds, Alan Cox, linux-kernel

On Mon, 6 Aug 2001, Richard Gooch wrote:
> Alexander Viro writes:
> > On Mon, 6 Aug 2001, Richard Gooch wrote:
> > For local-domain socket you get _two_ kinds of inodes, both with
> > S_IFSOCK in ->i_mode: one on the filesystem (acting like an meeting
> > place) and another - bearing the actual socket and used for all IO.
> > 
> > In other words, the only kind you can get from mknod(2) never uses
> > ->i_socket. It's used only by bind() and connect() - and only as
> > a place in namespace. The only thing we ever look at is ownership
> > and permissions - they determine who can bind()/connect() here.
> > 
> > So ->u.generic_ip is safe.
> 
> Great! That table is toast! All I need is a spinlock-protected
> incrementing counter :-)

If you are introducing a spinlock for the sole purpose of protecting
a counter, I would suggest to drop the spinlock, make the counter an 
atomic_t and just use atomic operations on it. That should be both faster
and generate shorter code.

Best regards,

	Anton
-- 
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Linux NTFS maintainer / WWW: http://linux-ntfs.sf.net/
ICQ: 8561279 / WWW: http://www-stu.christs.cam.ac.uk/~aia21/


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  5:17         ` Richard Gooch
  2001-08-07  5:28           ` Alexander Viro
  2001-08-07 10:21           ` Anton Altaparmakov
@ 2001-08-07 17:09           ` Richard Gooch
  2001-08-07 17:27             ` Arjan van de Ven
  2001-08-07 17:28             ` Richard Gooch
  2 siblings, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07 17:09 UTC (permalink / raw)
  To: Anton Altaparmakov; +Cc: Alexander Viro, Linus Torvalds, Alan Cox, linux-kernel

Anton Altaparmakov writes:
> If you are introducing a spinlock for the sole purpose of protecting
> a counter, I would suggest to drop the spinlock, make the counter an
> atomic_t and just use atomic operations on it. That should be both
> faster and generate shorter code.

Firstly, I don't see an atomic_get_and_inc(). Sure, I can atomically
increment, and I can atomically read. But I can't read and increment
atomically.

Secondly, the range is 24 bits. While 24 bits is "probably enough",
I'd prefer not to waste bits.

Finally, a spinlock is less code (0) and faster on UP.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 17:09           ` Richard Gooch
@ 2001-08-07 17:27             ` Arjan van de Ven
  2001-08-07 17:28             ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Arjan van de Ven @ 2001-08-07 17:27 UTC (permalink / raw)
  To: Richard Gooch, linux-kernel

Richard Gooch wrote:
 
> Finally, a spinlock is less code (0) and faster on UP.

Ehm not if you require protection against IRQ events......

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 17:09           ` Richard Gooch
  2001-08-07 17:27             ` Arjan van de Ven
@ 2001-08-07 17:28             ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07 17:28 UTC (permalink / raw)
  To: arjanv; +Cc: linux-kernel

Arjan van de Ven writes:
> Richard Gooch wrote:
>  
> > Finally, a spinlock is less code (0) and faster on UP.
> 
> Ehm not if you require protection against IRQ events......

Obviously. But I don't, in this case.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07  6:47           ` Richard Gooch
  2001-08-07  7:31             ` Alexander Viro
@ 2001-08-07 18:11             ` Richard Gooch
  2001-08-07 18:23               ` Alexander Viro
  2001-08-07 18:55               ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07 18:11 UTC (permalink / raw)
  To: Alexander Viro; +Cc: Linus Torvalds, Alan Cox, linux-kernel

Alexander Viro writes:
> 
> On Tue, 7 Aug 2001, Richard Gooch wrote:
> 
> > > They call ->setattr() (devfs_notify_change(), in your case) and that
> > > has nothing to icache (you already have the inode). Or had I
> > > completely misparsed you?
> > 
> > You parsed correctly. I know ->setattr() is called. I just wanted to
> > make sure that the icache didn't have some subtle interaction I was
> > missing. Such as ->write_inode() not being called.
> 
> Oh... That actually begs for another question - why the heck do you
> need ->write_inode()?
> 
> Sorry - I had missed the presense of that animal. Hmm... OK.
> Variant 1:
> 	insert_inode_hash(inode). Silly, but will work
> Variant 2:
> 	kill devfs_write_inode() - do its equivalent in notify_change()
> _and_ set the de->atime in ->readlink() and ->follow_link(). The latter
> is due to update_atime() logics. And no, I don't like it.

OK, I've implemented variant 2. Everything looked OK, but then I
noticed that pwd no longer works in subdirectories in devfs. Sigh.
I'll have to get back to it tonight and track this one down. How
annoying. Still, it was satisfying to rip out the table code.

BTW: I'm not bothering to update atime in the symlink methods. It's
not important anyway. And if you get you VFS change in, then it should
just magically save atimes for symlinks again, right?

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 18:11             ` Richard Gooch
@ 2001-08-07 18:23               ` Alexander Viro
  2001-08-07 18:55               ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07 18:23 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Linus Torvalds, Alan Cox, linux-kernel



On Tue, 7 Aug 2001, Richard Gooch wrote:

> OK, I've implemented variant 2. Everything looked OK, but then I
> noticed that pwd no longer works in subdirectories in devfs. Sigh.

Very interesting. pwd should be using getcwd(2), which doesn't
give a damn for inode numbers. If you have seriously old pwd binary
that tries to track the thing down to root by hands - yes, it doesn't
work.

> I'll have to get back to it tonight and track this one down. How
> annoying. Still, it was satisfying to rip out the table code.
> 
> BTW: I'm not bothering to update atime in the symlink methods. It's
> not important anyway. And if you get you VFS change in, then it should
> just magically save atimes for symlinks again, right?

Yes.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 18:11             ` Richard Gooch
  2001-08-07 18:23               ` Alexander Viro
@ 2001-08-07 18:55               ` Richard Gooch
  2001-08-07 19:10                 ` Alexander Viro
  2001-08-07 22:13                 ` Richard Gooch
  1 sibling, 2 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07 18:55 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-kernel

[Removed Linus and Alan from the Cc: list: I'm sure they're getting
bored by now]

Alexander Viro writes:
> 
> 
> On Tue, 7 Aug 2001, Richard Gooch wrote:
> 
> > OK, I've implemented variant 2. Everything looked OK, but then I
> > noticed that pwd no longer works in subdirectories in devfs. Sigh.
> 
> Very interesting. pwd should be using getcwd(2), which doesn't
> give a damn for inode numbers. If you have seriously old pwd binary
> that tries to track the thing down to root by hands - yes, it doesn't
> work.

Hm. strace suggests my pwd is walking up the path. But WTF would it
break? 2.4.7 was fine. What did I break?

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 18:55               ` Richard Gooch
@ 2001-08-07 19:10                 ` Alexander Viro
  2001-08-07 22:13                 ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07 19:10 UTC (permalink / raw)
  To: Richard Gooch; +Cc: linux-kernel



On Tue, 7 Aug 2001, Richard Gooch wrote:

> Hm. strace suggests my pwd is walking up the path. But WTF would it
> break? 2.4.7 was fine. What did I break?

Walking the path works so:

open ..
read it, entry by entry
find an entry that would have inode number equal to that of our directory.
We had found the last component of our name. Lather, rinse, repeat.

It relies on numbers from stat() and numbers from readdir() being
in sync.  It's not true on so many filesystems that this algorithm
is laughable.  Anything with synthetic inode numbers breaks it.

IOW, your pwd(1) behaviour is b0rken. Aside of being unable to work on a
lot of filesystems, it's doing a lot of extra work even on the filesystems
where it happens to work.

Check what your getcwd(3) is doing. It should at least try to call
getcwd(2) before reverting to that horror. Notice that you need
to walk the path only on 2.0 - 2.2 and above have a syscall that
works without any IO and regardless of the inumber assignment policy.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 18:55               ` Richard Gooch
  2001-08-07 19:10                 ` Alexander Viro
@ 2001-08-07 22:13                 ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-07 22:13 UTC (permalink / raw)
  To: Alexander Viro; +Cc: linux-kernel

Alexander Viro writes:
> 
> 
> On Tue, 7 Aug 2001, Richard Gooch wrote:
> 
> > Hm. strace suggests my pwd is walking up the path. But WTF would it
> > break? 2.4.7 was fine. What did I break?
> 
> Walking the path works so:
> 
> open ..
> read it, entry by entry
> find an entry that would have inode number equal to that of our directory.
> We had found the last component of our name. Lather, rinse, repeat.

Yes, I know the algorithm.

> It relies on numbers from stat() and numbers from readdir() being
> in sync.  It's not true on so many filesystems that this algorithm
> is laughable.  Anything with synthetic inode numbers breaks it.

Sure, but devfs does keep inums in sync. So it shouldn't be an issue.

[From another reply]
> So fix getcwd(3) in libc5.

Well, I might just do that one day. But I've got some devfs races to
fix first :-)

> Or use your ->dentry in devfs_readdir() - then you can get the
> consistency you want for existing inodes and that will allow b0rken
> getcwd() to work.

Yes, devfs_readdir() already uses de->inode.ino. Anyway, when I get
back to that machine I'll be able to dig further.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-08 21:16       ` H. Peter Anvin
  2001-08-08 21:47         ` Alexander Viro
@ 2001-08-08 23:29         ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-08 23:29 UTC (permalink / raw)
  To: Alexander Viro; +Cc: H. Peter Anvin, linux-kernel

Alexander Viro writes:
> 
> On 8 Aug 2001, H. Peter Anvin wrote:
> 
> > Followup to:  <Pine.GSO.4.21.0108071510390.18565-100000@weyl.math.psu.edu>
> > By author:    Alexander Viro <viro@math.psu.edu>
> > In newsgroup: linux.dev.kernel
> > > 
> > > It is not reliable. E.g. on NFS inumbers are not unique - 32 bits is
> > > not enough.
> > 
> > Unfortunately there is a whole bunch of other things too that rely on
> > it, and *HAVE* to rely on it -- (st_dev, st_ino) are defined to
> > specify the identity of a file, and if the current types aren't large
> > enough we *HAVE* to go to new types.  THERE IS NO OTHER WAY TO TEST
> > FOR FILE IDENTITY IN UNIX, and being able to perform such a test is
> > vital for many things, including security and hard link detection
> 
> Indeed, but it still doesn't help libc5 getcwd(3), which uses 32 bit
> values.

FYI: the problem that spawned this sub-thread is fixed. The
devfs-patch-v185 that I released last night fixes this. So the libc5
getcwd(3) is fine with 32 bit inums on devfs.

Filesystems with larger inums are left as an exercise for the reader
:-)

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-08 21:16       ` H. Peter Anvin
@ 2001-08-08 21:47         ` Alexander Viro
  2001-08-08 23:29         ` Richard Gooch
  1 sibling, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-08 21:47 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: linux-kernel



On 8 Aug 2001, H. Peter Anvin wrote:

> Followup to:  <Pine.GSO.4.21.0108071510390.18565-100000@weyl.math.psu.edu>
> By author:    Alexander Viro <viro@math.psu.edu>
> In newsgroup: linux.dev.kernel
> > 
> > It is not reliable. E.g. on NFS inumbers are not unique - 32 bits is
> > not enough.
> > 
> 
> Unfortunately there is a whole bunch of other things too that rely on
> it, and *HAVE* to rely on it -- (st_dev, st_ino) are defined to
> specify the identity of a file, and if the current types aren't large
> enough we *HAVE* to go to new types.  THERE IS NO OTHER WAY TO TEST
> FOR FILE IDENTITY IN UNIX, and being able to perform such a test is
> vital for many things, including security and hard link detection

Indeed, but it still doesn't help libc5 getcwd(3), which uses 32 bit
values.

> (think tar, cpio, cp -a.)

I'd rather not.  Too bloody depressive... (If you want details - let's
take it off-list).


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 19:16     ` Alexander Viro
@ 2001-08-08 21:16       ` H. Peter Anvin
  2001-08-08 21:47         ` Alexander Viro
  2001-08-08 23:29         ` Richard Gooch
  0 siblings, 2 replies; 41+ messages in thread
From: H. Peter Anvin @ 2001-08-08 21:16 UTC (permalink / raw)
  To: linux-kernel

Followup to:  <Pine.GSO.4.21.0108071510390.18565-100000@weyl.math.psu.edu>
By author:    Alexander Viro <viro@math.psu.edu>
In newsgroup: linux.dev.kernel
> 
> It is not reliable. E.g. on NFS inumbers are not unique - 32 bits is
> not enough.
> 

Unfortunately there is a whole bunch of other things too that rely on
it, and *HAVE* to rely on it -- (st_dev, st_ino) are defined to
specify the identity of a file, and if the current types aren't large
enough we *HAVE* to go to new types.  THERE IS NO OTHER WAY TO TEST
FOR FILE IDENTITY IN UNIX, and being able to perform such a test is
vital for many things, including security and hard link detection
(think tar, cpio, cp -a.)

	-hpa

-- 
<hpa@transmeta.com> at work, <hpa@zytor.com> in private!
"Unix gives you enough rope to shoot yourself in the foot."
http://www.zytor.com/~hpa/puzzle.txt	<amsp@zytor.com>

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 19:09   ` Richard Gooch
@ 2001-08-07 19:20     ` Alexander Viro
  0 siblings, 0 replies; 41+ messages in thread
From: Alexander Viro @ 2001-08-07 19:20 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alan Cox, linux-kernel



On Tue, 7 Aug 2001, Richard Gooch wrote:

> Yes, I use libc5. And I don't care about old pwd being slower. And I

So fix getcwd(3) in libc5. BFD... Or use your ->dentry in devfs_readdir() -
then you can get the consistency you want for existing inodes and that
will allow b0rken getcwd() to work.

It _is_ b0rken - it relies on unique 32-bit number for inodes. That's
not guaranteed on NFS and there's nothing we could do about that.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
  2001-08-07 19:04   ` Alan Cox
@ 2001-08-07 19:16     ` Alexander Viro
  2001-08-08 21:16       ` H. Peter Anvin
  0 siblings, 1 reply; 41+ messages in thread
From: Alexander Viro @ 2001-08-07 19:16 UTC (permalink / raw)
  To: Alan Cox; +Cc: Richard Gooch, linux-kernel



On Tue, 7 Aug 2001, Alan Cox wrote:

> > > Very interesting. pwd should be using getcwd(2), which doesn't
> > > give a damn for inode numbers. If you have seriously old pwd binary
> > > that tries to track the thing down to root by hands - yes, it doesn't
> > > work.
> > 
> > Hm. strace suggests my pwd is walking up the path. But WTF would it
> > break? 2.4.7 was fine. What did I break?
> 
> Sounds like you are using libc5. The old style pwd should be reliable but
> its much slower and can't see across protected directory paths

It is not reliable. E.g. on NFS inumbers are not unique - 32 bits is
not enough.


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
                     ` (4 preceding siblings ...)
  2001-08-07 19:04   ` Alan Cox
@ 2001-08-07 19:09   ` Richard Gooch
  2001-08-07 19:20     ` Alexander Viro
  5 siblings, 1 reply; 41+ messages in thread
From: Richard Gooch @ 2001-08-07 19:09 UTC (permalink / raw)
  To: Alan Cox; +Cc: Alexander Viro, linux-kernel

Alan Cox writes:
> > > Very interesting. pwd should be using getcwd(2), which doesn't
> > > give a damn for inode numbers. If you have seriously old pwd binary
> > > that tries to track the thing down to root by hands - yes, it doesn't
> > > work.
> > 
> > Hm. strace suggests my pwd is walking up the path. But WTF would it
> > break? 2.4.7 was fine. What did I break?
> 
> Sounds like you are using libc5. The old style pwd should be
> reliable but its much slower and can't see across protected
> directory paths

Yes, I use libc5. And I don't care about old pwd being slower. And I
certainly don't want to break it, even if I wasn't using it.
By "protected directory paths", you mean a directory with read access?

Well, rx access is available for the whole path. And the inums looked
fine. So the breakage is odd.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
                     ` (3 preceding siblings ...)
  2001-08-07 16:22   ` Alan Cox
@ 2001-08-07 19:04   ` Alan Cox
  2001-08-07 19:16     ` Alexander Viro
  2001-08-07 19:09   ` Richard Gooch
  5 siblings, 1 reply; 41+ messages in thread
From: Alan Cox @ 2001-08-07 19:04 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alexander Viro, linux-kernel

> > Very interesting. pwd should be using getcwd(2), which doesn't
> > give a damn for inode numbers. If you have seriously old pwd binary
> > that tries to track the thing down to root by hands - yes, it doesn't
> > work.
> 
> Hm. strace suggests my pwd is walking up the path. But WTF would it
> break? 2.4.7 was fine. What did I break?

Sounds like you are using libc5. The old style pwd should be reliable but
its much slower and can't see across protected directory paths

Alan

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
                     ` (2 preceding siblings ...)
  2001-08-06 23:59   ` Richard Gooch
@ 2001-08-07 16:22   ` Alan Cox
  2001-08-07 19:04   ` Alan Cox
  2001-08-07 19:09   ` Richard Gooch
  5 siblings, 0 replies; 41+ messages in thread
From: Alan Cox @ 2001-08-07 16:22 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alan Cox, Alexander Viro, Linus Torvalds, linux-kernel

> > unless asked too - hence joystick. input and much of USB are so far
> > behind in Linus tree
> 
> So does that mean you won't try to merge Al's patch?

Correct. I'd just get in your way if I did

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
@ 2001-08-06 23:59 Alan Cox
       [not found] ` <no.id>
  5 siblings, 1 reply; 41+ messages in thread
From: Alan Cox @ 2001-08-06 23:59 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alan Cox, Alexander Viro, Linus Torvalds, linux-kernel

> OK, fair enough. When is your next merge with Linus scheduled? I'd
> prefer to get a few races fixed before shipping a patch, but I can try
> to plan for an earlier release if necessary.

I send stuff Linus regularly and sometimes it goes in and sometimes it
doesn't. Stuff with active maintainers I don't send on to Linus unless asked
too - hence joystick. input and much of USB are so far behind in Linus tree

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
  2001-08-06 23:54   ` Alan Cox
  2001-08-06 23:55   ` Richard Gooch
@ 2001-08-06 23:59   ` Richard Gooch
  2001-08-07 16:22   ` Alan Cox
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-06 23:59 UTC (permalink / raw)
  To: Alan Cox; +Cc: Alexander Viro, Linus Torvalds, linux-kernel

Alan Cox writes:
> > OK, fair enough. When is your next merge with Linus scheduled? I'd
> > prefer to get a few races fixed before shipping a patch, but I can try
> > to plan for an earlier release if necessary.
> 
> I send stuff Linus regularly and sometimes it goes in and sometimes
> it doesn't. Stuff with active maintainers I don't send on to Linus
> unless asked too - hence joystick. input and much of USB are so far
> behind in Linus tree

So does that mean you won't try to merge Al's patch?

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
  2001-08-06 23:54   ` Alan Cox
@ 2001-08-06 23:55   ` Richard Gooch
  2001-08-06 23:59   ` Richard Gooch
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Richard Gooch @ 2001-08-06 23:55 UTC (permalink / raw)
  To: Alan Cox; +Cc: Alexander Viro, Linus Torvalds, linux-kernel

Alan Cox writes:
> > Linus: please don't apply.
> > Alan: I notice you've put Al's patch into 2.4.7-ac8. Please remove it.
> 
> I'll remove it when your preferred fixes are ready. Until then its
> better than leaving it broken.

OK, fair enough. When is your next merge with Linus scheduled? I'd
prefer to get a few races fixed before shipping a patch, but I can try
to plan for an earlier release if necessary.

				Regards,

					Richard....
Permanent: rgooch@atnf.csiro.au
Current:   rgooch@ras.ucalgary.ca

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH] one of $BIGNUM devfs races
       [not found] ` <no.id>
@ 2001-08-06 23:54   ` Alan Cox
  2001-08-06 23:55   ` Richard Gooch
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Alan Cox @ 2001-08-06 23:54 UTC (permalink / raw)
  To: Richard Gooch; +Cc: Alexander Viro, Linus Torvalds, Alan Cox, linux-kernel

> Linus: please don't apply.
> Alan: I notice you've put Al's patch into 2.4.7-ac8. Please remove it.

I'll remove it when your preferred fixes are ready. Until then its better
than leaving it broken.


^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2001-08-08 23:30 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-08-06 11:58 [PATCH] one of $BIGNUM devfs races Alexander Viro
2001-08-06 16:47 ` Richard Gooch
2001-08-06 16:56   ` Rik van Riel
2001-08-06 17:11   ` Richard Gooch
2001-08-06 23:50 ` Richard Gooch
2001-08-07  0:34   ` Alexander Viro
2001-08-07  0:51   ` Richard Gooch
2001-08-07  1:10     ` Alexander Viro
2001-08-07  1:27     ` Richard Gooch
2001-08-07  1:42       ` Alexander Viro
2001-08-07  2:00       ` Richard Gooch
2001-08-07  2:15         ` Alexander Viro
2001-08-07  5:17         ` Richard Gooch
2001-08-07  5:28           ` Alexander Viro
2001-08-07 10:21           ` Anton Altaparmakov
2001-08-07 17:09           ` Richard Gooch
2001-08-07 17:27             ` Arjan van de Ven
2001-08-07 17:28             ` Richard Gooch
2001-08-07  5:53       ` Richard Gooch
2001-08-07  6:23         ` Alexander Viro
2001-08-07  6:36         ` Richard Gooch
2001-08-07  6:40           ` Alexander Viro
2001-08-07  6:47           ` Richard Gooch
2001-08-07  7:31             ` Alexander Viro
2001-08-07 18:11             ` Richard Gooch
2001-08-07 18:23               ` Alexander Viro
2001-08-07 18:55               ` Richard Gooch
2001-08-07 19:10                 ` Alexander Viro
2001-08-07 22:13                 ` Richard Gooch
2001-08-06 23:59 Alan Cox
     [not found] ` <no.id>
2001-08-06 23:54   ` Alan Cox
2001-08-06 23:55   ` Richard Gooch
2001-08-06 23:59   ` Richard Gooch
2001-08-07 16:22   ` Alan Cox
2001-08-07 19:04   ` Alan Cox
2001-08-07 19:16     ` Alexander Viro
2001-08-08 21:16       ` H. Peter Anvin
2001-08-08 21:47         ` Alexander Viro
2001-08-08 23:29         ` Richard Gooch
2001-08-07 19:09   ` Richard Gooch
2001-08-07 19:20     ` Alexander Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).