linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [git pull] VFS patches, the first series
@ 2008-07-27  1:22 Al Viro
  2008-08-21  7:40 ` Eric W. Biederman
  0 siblings, 1 reply; 7+ messages in thread
From: Al Viro @ 2008-07-27  1:22 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

The first part of huge pile.  Mostly it's untangling nameidata handling,
digging towards the pieces that kill intents and cleaning pathname
resolution in general.  ->permission() sanitizing and sysctl procfs
treatment rewrite needed for it.  A bunch of descriptor handling fixes.
Plus part of assorted patched from the last cycle sent by other folks.
A _lot_ more is still pending; this is what I'd managed to pull into
a series by this point.  Please, pull from
git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6.git/ for-linus

Shortlog:
Al Viro (24):
      beginning of sysctl cleanup - ctl_table_set
      allow delayed freeing of ctl_table_header
      sysctl: make sure that /proc/sys/net/ipv4 appears before per-ns ones
      sysctl: keep track of tree relationships
      sanitize proc_sysctl
      sanitize ->permission() prototype
      permission checks for chdir need special treatment only on the last step
      kill altroot
      fix MAY_CHDIR/MAY_ACCESS/LOOKUP_ACCESS mess
      pass MAY_OPEN to vfs_permission() explicitly
      more nameidata removal: exec_permission_lite() doesn't need it
      take noexec checks to very few callers that care
      kill nameidata passing to permission(), rename to inode_permission()
      preparation to __user_walk_fd cleanup
      sanitize __user_walk_fd() et.al.
      new (local) helper: user_path_parent()
      don't pass nameidata to gfs2_lookupi()
      don't pass nameidata to __ncp_lookup_validate()
      f_count may wrap around
      get rid of __user_path_lookup_open
      get rid of indirect users of namei.h
      remove remaining namei_{32,64}.h crap
      get rid of corner case in dup3() entirely
      fix RLIM_NOFILE handling

Christoph Hellwig (1):
      Re: [PATCH 3/6] vfs: open_exec cleanup

Denys Vlasenko (1):
      reuse xxx_fifo_fops for xxx_pipe_fops

Li Zefan (1):
      vfs: use kstrdup() and check failing allocation

Miklos Szeredi (10):
      [patch] vfs: fix lookup on deleted directory
      [patch] hppfs: remove hppfs_permission
      [patch 05/14] hpfs: dont call permission()
      [patch 1/5] vfs: truncate: dont check immutable twice
      [patch 3/5] vfs: change remove_suid() to file_remove_suid()
      [patch 5/5] vfs: remove mode parameter from vfs_symlink()
      [patch 1/4] vfs: utimes: move owner check into inode_change_ok()
      [patch 2/4] vfs: utimes cleanup
      [patch 3/4] fat: dont call notify_change
      [patch 4/4] vfs: immutable inode checking cleanup

Tetsuo Handa (1):
      [patch 4/5] vfs: reuse local variable in vfs_link()

Ulrich Drepper (1):
      dup3 fix

Diffstat:
 arch/alpha/kernel/osf_sys.c    |   10 +-
 arch/parisc/hpux/sys_hpux.c    |   10 +-
 drivers/net/ppp_generic.c      |    6 +-
 fs/affs/file.c                 |    4 -
 fs/afs/internal.h              |    4 +-
 fs/afs/security.c              |    2 +-
 fs/aio.c                       |    6 +-
 fs/attr.c                      |    7 +-
 fs/bad_inode.c                 |    3 +-
 fs/cifs/cifsfs.c               |    2 +-
 fs/coda/dir.c                  |    4 +-
 fs/coda/pioctl.c               |   20 +-
 fs/compat.c                    |   20 +-
 fs/ecryptfs/inode.c            |   21 +--
 fs/exec.c                      |   81 +++++---
 fs/ext2/acl.c                  |    2 +-
 fs/ext2/acl.h                  |    2 +-
 fs/ext3/acl.c                  |    2 +-
 fs/ext3/acl.h                  |    2 +-
 fs/ext4/acl.c                  |    2 +-
 fs/ext4/acl.h                  |    2 +-
 fs/fat/file.c                  |   15 ++-
 fs/fcntl.c                     |   33 ++--
 fs/fifo.c                      |    8 +-
 fs/file.c                      |    9 +
 fs/file_table.c                |   10 +-
 fs/fuse/dir.c                  |    6 +-
 fs/fuse/file.c                 |    2 +-
 fs/gfs2/inode.c                |    6 +-
 fs/gfs2/inode.h                |    2 +-
 fs/gfs2/ops_export.c           |    2 +-
 fs/gfs2/ops_inode.c            |   16 +-
 fs/gfs2/super.c                |    2 +-
 fs/hfs/inode.c                 |    7 +-
 fs/hfsplus/inode.c             |    6 +-
 fs/hostfs/hostfs_kern.c        |    2 +-
 fs/hpfs/namei.c                |    2 +-
 fs/hppfs/hppfs.c               |    7 -
 fs/inotify_user.c              |   22 +-
 fs/jffs2/acl.c                 |    2 +-
 fs/jffs2/acl.h                 |    2 +-
 fs/jfs/acl.c                   |    2 +-
 fs/jfs/jfs_acl.h               |    2 +-
 fs/namei.c                     |  354 +++++++++++----------------------
 fs/namespace.c                 |  106 +++++------
 fs/ncpfs/dir.c                 |    4 +-
 fs/nfs/dir.c                   |   11 +-
 fs/nfsd/nfsctl.c               |    1 +
 fs/nfsd/nfsfh.c                |    2 +-
 fs/nfsd/vfs.c                  |   14 +-
 fs/ntfs/file.c                 |    2 +-
 fs/ocfs2/file.c                |    2 +-
 fs/ocfs2/file.h                |    3 +-
 fs/open.c                      |  179 ++++++++---------
 fs/pipe.c                      |   51 +----
 fs/proc/base.c                 |    3 +-
 fs/proc/inode.c                |    5 +
 fs/proc/proc_sysctl.c          |  429 ++++++++++++++++++----------------------
 fs/reiserfs/xattr.c            |    2 +-
 fs/smbfs/file.c                |    4 +-
 fs/splice.c                    |    4 +-
 fs/stat.c                      |   32 ++--
 fs/ubifs/file.c                |    1 +
 fs/utimes.c                    |  139 +++++++-------
 fs/xattr.c                     |   98 +++++-----
 fs/xfs/linux-2.6/xfs_ioctl.c   |   14 +-
 fs/xfs/linux-2.6/xfs_iops.c    |    3 +-
 fs/xfs/linux-2.6/xfs_lrw.c     |    2 +-
 include/asm-alpha/namei.h      |   17 --
 include/asm-arm/namei.h        |   25 ---
 include/asm-avr32/namei.h      |    7 -
 include/asm-blackfin/namei.h   |   19 --
 include/asm-cris/namei.h       |   17 --
 include/asm-frv/namei.h        |   18 --
 include/asm-h8300/namei.h      |   17 --
 include/asm-ia64/namei.h       |   25 ---
 include/asm-m32r/namei.h       |   17 --
 include/asm-m68k/namei.h       |   17 --
 include/asm-m68knommu/namei.h  |    1 -
 include/asm-mips/namei.h       |   11 -
 include/asm-mn10300/namei.h    |   22 --
 include/asm-parisc/namei.h     |   17 --
 include/asm-powerpc/namei.h    |   20 --
 include/asm-s390/namei.h       |   21 --
 include/asm-sh/namei.h         |   17 --
 include/asm-sparc/namei.h      |    8 -
 include/asm-sparc/namei_32.h   |   13 --
 include/asm-sparc/namei_64.h   |   13 --
 include/asm-sparc64/namei.h    |    1 -
 include/asm-um/namei.h         |    6 -
 include/asm-v850/namei.h       |   17 --
 include/asm-x86/namei.h        |   11 -
 include/asm-xtensa/namei.h     |   26 ---
 include/linux/coda_linux.h     |    2 +-
 include/linux/fs.h             |   57 +++---
 include/linux/fs_struct.h      |    3 +-
 include/linux/mount.h          |    2 +-
 include/linux/namei.h          |   19 +-
 include/linux/nfs_fs.h         |    3 +-
 include/linux/proc_fs.h        |    5 +
 include/linux/reiserfs_xattr.h |    2 +-
 include/linux/security.h       |    7 +-
 include/linux/shmem_fs.h       |    2 +-
 include/linux/sysctl.h         |   25 +++-
 include/net/af_unix.h          |    2 +-
 include/net/ip.h               |    2 +
 include/net/net_namespace.h    |    4 +-
 ipc/mqueue.c                   |    2 +-
 kernel/cgroup.c                |    1 +
 kernel/exec_domain.c           |    1 -
 kernel/exit.c                  |    2 -
 kernel/fork.c                  |    7 -
 kernel/sysctl.c                |  166 ++++++++++++++--
 mm/filemap.c                   |    7 +-
 mm/filemap_xip.c               |    2 +-
 mm/shmem_acl.c                 |    2 +-
 net/ipv4/af_inet.c             |    4 +
 net/ipv4/sysctl_net_ipv4.c     |    7 +
 net/sched/sch_atm.c            |    4 +-
 net/sysctl_net.c               |   22 +--
 net/unix/af_unix.c             |    2 +-
 net/unix/garbage.c             |   18 +-
 security/capability.c          |    3 +-
 security/security.c            |    5 +-
 security/selinux/hooks.c       |    5 +-
 security/smack/smack_lsm.c     |    3 +-
 126 files changed, 1088 insertions(+), 1535 deletions(-)
 delete mode 100644 include/asm-alpha/namei.h
 delete mode 100644 include/asm-arm/namei.h
 delete mode 100644 include/asm-avr32/namei.h
 delete mode 100644 include/asm-blackfin/namei.h
 delete mode 100644 include/asm-cris/namei.h
 delete mode 100644 include/asm-frv/namei.h
 delete mode 100644 include/asm-h8300/namei.h
 delete mode 100644 include/asm-ia64/namei.h
 delete mode 100644 include/asm-m32r/namei.h
 delete mode 100644 include/asm-m68k/namei.h
 delete mode 100644 include/asm-m68knommu/namei.h
 delete mode 100644 include/asm-mips/namei.h
 delete mode 100644 include/asm-mn10300/namei.h
 delete mode 100644 include/asm-parisc/namei.h
 delete mode 100644 include/asm-powerpc/namei.h
 delete mode 100644 include/asm-s390/namei.h
 delete mode 100644 include/asm-sh/namei.h
 delete mode 100644 include/asm-sparc/namei.h
 delete mode 100644 include/asm-sparc/namei_32.h
 delete mode 100644 include/asm-sparc/namei_64.h
 delete mode 100644 include/asm-sparc64/namei.h
 delete mode 100644 include/asm-um/namei.h
 delete mode 100644 include/asm-v850/namei.h
 delete mode 100644 include/asm-x86/namei.h
 delete mode 100644 include/asm-xtensa/namei.h

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [git pull] VFS patches, the first series
  2008-07-27  1:22 [git pull] VFS patches, the first series Al Viro
@ 2008-08-21  7:40 ` Eric W. Biederman
  2008-08-21 17:14   ` Eric W. Biederman
  0 siblings, 1 reply; 7+ messages in thread
From: Eric W. Biederman @ 2008-08-21  7:40 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-kernel

Al Viro <viro@ZenIV.linux.org.uk> writes:

> The first part of huge pile.  Mostly it's untangling nameidata handling,
> digging towards the pieces that kill intents and cleaning pathname
> resolution in general.  ->permission() sanitizing and sysctl procfs
> treatment rewrite needed for it.  A bunch of descriptor handling fixes.
> Plus part of assorted patched from the last cycle sent by other folks.
> A _lot_ more is still pending; this is what I'd managed to pull into
> a series by this point.  Please, pull from
> git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6.git/ for-linus

Al a quick heads up.  In testing movement of network devices between
namespaces  I hit the recently added WARN_ON in unregister_sysctl_table.

More tomorrow when I get a after I have dug into this farther.

Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [git pull] VFS patches, the first series
  2008-08-21  7:40 ` Eric W. Biederman
@ 2008-08-21 17:14   ` Eric W. Biederman
  2008-08-22  0:08     ` Eric W. Biederman
  0 siblings, 1 reply; 7+ messages in thread
From: Eric W. Biederman @ 2008-08-21 17:14 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-kernel

ebiederm@xmission.com (Eric W. Biederman) writes:

> Al Viro <viro@ZenIV.linux.org.uk> writes:
>
>> The first part of huge pile.  Mostly it's untangling nameidata handling,
>> digging towards the pieces that kill intents and cleaning pathname
>> resolution in general.  ->permission() sanitizing and sysctl procfs
>> treatment rewrite needed for it.  A bunch of descriptor handling fixes.
>> Plus part of assorted patched from the last cycle sent by other folks.
>> A _lot_ more is still pending; this is what I'd managed to pull into
>> a series by this point.  Please, pull from
>> git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6.git/ for-linus
>
> Al a quick heads up.  In testing movement of network devices between
> namespaces  I hit the recently added WARN_ON in unregister_sysctl_table.

It seems to be some new oddness when destroying a network namespace.

If I don't have network devices to push out of the network namespace
when I clean it up nothing happens.  When I do I get this nice beautiful
backtrace.

I will dig into the sysctl code in a bit and see if I can understand
why this is happening.

------------[ cut here ]------------
WARNING: at /home/eric/projects/linux/linux-2.6-arastra-ns/kernel/sysctl.c:1929 unregister_sysctl_table+0xb5/0x
e5()
Modules linked in:
Pid: 22, comm: netns Tainted: G        W 2.6.27-rc3x86_64 #48

Call Trace:
 [<ffffffff802361da>] warn_on_slowpath+0x51/0x77
 [<ffffffff8023c631>] unregister_sysctl_table+0x34/0xe5
 [<ffffffff8023c6b2>] unregister_sysctl_table+0xb5/0xe5
 [<ffffffff80523f8c>] neigh_sysctl_unregister+0x1a/0x31
 [<ffffffff80559635>] inetdev_event+0x2b4/0x3d1
 [<ffffffff8024b850>] notifier_call_chain+0x29/0x56
 [<ffffffff8051fa2a>] dev_change_net_namespace+0x1bb/0x1da
 [<ffffffff8051fa9d>] default_device_exit+0x54/0xa2
 [<ffffffff8052105e>] netdev_run_todo+0x1fd/0x206
 [<ffffffff8051ce0b>] cleanup_net+0x0/0x95
 [<ffffffff8051ce6f>] cleanup_net+0x64/0x95
 [<ffffffff80244e58>] run_workqueue+0xf1/0x1ee
 [<ffffffff80244e02>] run_workqueue+0x9b/0x1ee
 [<ffffffff802459ee>] worker_thread+0xd8/0xe3
 [<ffffffff80248426>] autoremove_wake_function+0x0/0x2e
 [<ffffffff80245916>] worker_thread+0x0/0xe3
 [<ffffffff80248311>] kthread+0x47/0x76
 [<ffffffff805c7529>] trace_hardirqs_on_thunk+0x3a/0x3f
 [<ffffffff8020cdc9>] child_rip+0xa/0x11
 [<ffffffff8020c3ff>] restore_args+0x0/0x30
 [<ffffffff80230bcd>] finish_task_switch+0x0/0xc4
 [<ffffffff802482ca>] kthread+0x0/0x76
 [<ffffffff8020cdbf>] child_rip+0x0/0x11

---[ end trace f9cc56de378eb3ce ]---


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [git pull] VFS patches, the first series
  2008-08-21 17:14   ` Eric W. Biederman
@ 2008-08-22  0:08     ` Eric W. Biederman
  2008-08-23  3:33       ` Al Viro
  2008-08-23  7:24       ` Al Viro
  0 siblings, 2 replies; 7+ messages in thread
From: Eric W. Biederman @ 2008-08-22  0:08 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-kernel, netdev, Denis V. Lunev

ebiederm@xmission.com (Eric W. Biederman) writes:

> ebiederm@xmission.com (Eric W. Biederman) writes:
>
>> Al Viro <viro@ZenIV.linux.org.uk> writes:
>>
>>> The first part of huge pile.  Mostly it's untangling nameidata handling,
>>> digging towards the pieces that kill intents and cleaning pathname
>>> resolution in general.  ->permission() sanitizing and sysctl procfs
>>> treatment rewrite needed for it.  A bunch of descriptor handling fixes.
>>> Plus part of assorted patched from the last cycle sent by other folks.
>>> A _lot_ more is still pending; this is what I'd managed to pull into
>>> a series by this point.  Please, pull from
>>> git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6.git/ for-linus
>>
>> Al a quick heads up.  In testing movement of network devices between
>> namespaces  I hit the recently added WARN_ON in unregister_sysctl_table.
>
> It seems to be some new oddness when destroying a network namespace.
>
> If I don't have network devices to push out of the network namespace
> when I clean it up nothing happens.  When I do I get this nice beautiful
> backtrace.
>
> I will dig into the sysctl code in a bit and see if I can understand
> why this is happening.

Ok.  The situation is now clear.

/proc/sys/net/ipv4/neigh/default does not currently exist in network
namespaces.    This looks like an oversight.

In my network namespace I have the interfaces lo, sit, veth0

We have the result that /proc/sys/net/ipv4/neigh/veth0 has /proc/sys/net/ipv4/neigh/lo.

lo gets unregistered before veth0 when we bring the network namespaces down.
Then when veth gets unregistered we have a problem.

I'm not certain what to do about it.  The semantics of the how the
sysctl tables are access have changed significantly.  Now the first 
sysctl table to describe a directory must remain until there are no
other tables that have entries in that directory and a sysctl table
must have a pure path of directories for any portion of the address
space it shares with an earlier sysctl table.  This is noticeably
different from the union mount semantics we have had previously for
the sysctl tables.

If it doesn't look to bad to maintain the new semantics it looks like
the right thing to do is to add some additional checks so we get more
precise warnings (who knows what out of tree sysctl code will do) and
to find someplace I can insert a net/ipv4/neigh sysctl directory into
(ipv4_net_table looks like it will work) to keep the network namespace
code working safely.

Al btw nice trick using compare to keep the dentries separate allowing
us to cache everything in /proc.  I feel silly for missing that one.
Want to get together in the next couple of weeks and build a tree that
updates the sysctls infrastructure to suck less?

Eric

> ------------[ cut here ]------------
> WARNING: at /home/eric/projects/linux/linux-2.6-arastra-ns/kernel/sysctl.c:1929
> unregister_sysctl_table+0xb5/0x
> e5()
> Modules linked in:
> Pid: 22, comm: netns Tainted: G        W 2.6.27-rc3x86_64 #48
>
> Call Trace:
>  [<ffffffff802361da>] warn_on_slowpath+0x51/0x77
>  [<ffffffff8023c631>] unregister_sysctl_table+0x34/0xe5
>  [<ffffffff8023c6b2>] unregister_sysctl_table+0xb5/0xe5
>  [<ffffffff80523f8c>] neigh_sysctl_unregister+0x1a/0x31
>  [<ffffffff80559635>] inetdev_event+0x2b4/0x3d1
>  [<ffffffff8024b850>] notifier_call_chain+0x29/0x56
>  [<ffffffff8051fa2a>] dev_change_net_namespace+0x1bb/0x1da
>  [<ffffffff8051fa9d>] default_device_exit+0x54/0xa2
>  [<ffffffff8052105e>] netdev_run_todo+0x1fd/0x206
>  [<ffffffff8051ce0b>] cleanup_net+0x0/0x95
>  [<ffffffff8051ce6f>] cleanup_net+0x64/0x95
>  [<ffffffff80244e58>] run_workqueue+0xf1/0x1ee
>  [<ffffffff80244e02>] run_workqueue+0x9b/0x1ee
>  [<ffffffff802459ee>] worker_thread+0xd8/0xe3
>  [<ffffffff80248426>] autoremove_wake_function+0x0/0x2e
>  [<ffffffff80245916>] worker_thread+0x0/0xe3
>  [<ffffffff80248311>] kthread+0x47/0x76
>  [<ffffffff805c7529>] trace_hardirqs_on_thunk+0x3a/0x3f
>  [<ffffffff8020cdc9>] child_rip+0xa/0x11
>  [<ffffffff8020c3ff>] restore_args+0x0/0x30
>  [<ffffffff80230bcd>] finish_task_switch+0x0/0xc4
>  [<ffffffff802482ca>] kthread+0x0/0x76
>  [<ffffffff8020cdbf>] child_rip+0x0/0x11
>
> ---[ end trace f9cc56de378eb3ce ]---

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [git pull] VFS patches, the first series
  2008-08-22  0:08     ` Eric W. Biederman
@ 2008-08-23  3:33       ` Al Viro
  2008-08-23  5:22         ` Eric W. Biederman
  2008-08-23  7:24       ` Al Viro
  1 sibling, 1 reply; 7+ messages in thread
From: Al Viro @ 2008-08-23  3:33 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-kernel, netdev, Denis V. Lunev

On Thu, Aug 21, 2008 at 05:08:25PM -0700, Eric W. Biederman wrote:

> I'm not certain what to do about it.  The semantics of the how the
> sysctl tables are access have changed significantly.  Now the first 
> sysctl table to describe a directory must remain until there are no
> other tables that have entries in that directory and a sysctl table
> must have a pure path of directories for any portion of the address
> space it shares with an earlier sysctl table.  This is noticeably
> different from the union mount semantics we have had previously for
> the sysctl tables.

Note that the old semantics had a lovely inherent problem (leaving aside
the utterly insane amount of walking and re-walking the trees, as you've
found out the hard way - don't tell me you hadn't cursed it when writing
the previous version of proc_sysctl.c): there's redundancy between the
trees.  At the very least, just what are we supposed to get when the
stems do not match each other - either in permissions or in ctl_name?

> If it doesn't look to bad to maintain the new semantics it looks like
> the right thing to do is to add some additional checks so we get more
> precise warnings (who knows what out of tree sysctl code will do) and
> to find someplace I can insert a net/ipv4/neigh sysctl directory into
> (ipv4_net_table looks like it will work) to keep the network namespace
> code working safely.
> 
> Al btw nice trick using compare to keep the dentries separate allowing
> us to cache everything in /proc.  I feel silly for missing that one.
> Want to get together in the next couple of weeks and build a tree that
> updates the sysctls infrastructure to suck less?

Fine by me...  BTW, fixing that particular crap is not hard - you need
to have the entry in question show up before either interface, that's all.
I missed that part of ordering mess, to be honest.  I'll look into that,
hopefully will post the fix later tonight.

FWIW, I'd very much prefer ->d_compare() trick to the horrors you guys
are doing around sysfs; it might or might not be feasible depending on
what visibility rules you end up with there, but if it's feasible at all
I'd rather go for it and avoid the entire 'separate backing store' mess.
IIRC, I had described that scheme to you quite a few months ago in sysfs
context; got no response back then...

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [git pull] VFS patches, the first series
  2008-08-23  3:33       ` Al Viro
@ 2008-08-23  5:22         ` Eric W. Biederman
  0 siblings, 0 replies; 7+ messages in thread
From: Eric W. Biederman @ 2008-08-23  5:22 UTC (permalink / raw)
  To: Al Viro; +Cc: Linus Torvalds, linux-kernel, netdev, Denis V. Lunev

Al Viro <viro@ZenIV.linux.org.uk> writes:

> On Thu, Aug 21, 2008 at 05:08:25PM -0700, Eric W. Biederman wrote:
>
>> I'm not certain what to do about it.  The semantics of the how the
>> sysctl tables are access have changed significantly.  Now the first 
>> sysctl table to describe a directory must remain until there are no
>> other tables that have entries in that directory and a sysctl table
>> must have a pure path of directories for any portion of the address
>> space it shares with an earlier sysctl table.  This is noticeably
>> different from the union mount semantics we have had previously for
>> the sysctl tables.
>
> Note that the old semantics had a lovely inherent problem (leaving aside
> the utterly insane amount of walking and re-walking the trees, as you've
> found out the hard way - don't tell me you hadn't cursed it when writing
> the previous version of proc_sysctl.c): there's redundancy between the
> trees.  At the very least, just what are we supposed to get when the
> stems do not match each other - either in permissions or in ctl_name?

That case is simple.  We never allowed overlapping leaves, and all of
the directories had essentially the same permissions.  Beyond that
I added checks in sysctl_check to make certain we are never out of
sync. 

As for the walking and rewalking I was never fond of it but it was
simple and worked.

So far I am not a fan of the new semantics.

>> If it doesn't look to bad to maintain the new semantics it looks like
>> the right thing to do is to add some additional checks so we get more
>> precise warnings (who knows what out of tree sysctl code will do) and
>> to find someplace I can insert a net/ipv4/neigh sysctl directory into
>> (ipv4_net_table looks like it will work) to keep the network namespace
>> code working safely.
>> 
>> Al btw nice trick using compare to keep the dentries separate allowing
>> us to cache everything in /proc.  I feel silly for missing that one.
>> Want to get together in the next couple of weeks and build a tree that
>> updates the sysctls infrastructure to suck less?
>
> Fine by me...  BTW, fixing that particular crap is not hard - you need
> to have the entry in question show up before either interface, that's all.
> I missed that part of ordering mess, to be honest.  I'll look into that,
> hopefully will post the fix later tonight.

Thanks for looking.

The ordering problem is self inflicted as you introduced an ordering
constraint where none existed previously, and it seems unnecessary.

I'm currently tearing my hair out trying to think of a reasonable
way to audit the current sysctl usage to see if there is anything
else that was missed.

> FWIW, I'd very much prefer ->d_compare() trick to the horrors you guys
> are doing around sysfs; it might or might not be feasible depending on
> what visibility rules you end up with there, but if it's feasible at all
> I'd rather go for it and avoid the entire 'separate backing store' mess.
> IIRC, I had described that scheme to you quite a few months ago in sysfs
> context; got no response back then...

Weird.  I must have missed seeing it, as I don't have any recollection of
it.

There are two pieces of the problem.  
- How do we get a dentry tree that the vfs won't gag on.  Without
  knowing how to successfully implement the dcompare trick it required
  2 dentry trees.

- Monitoring.  It is desirable to be able to mount the filesystem such that
  someone outside the namespace can get a view of what the folks inside the
  namespace see.  Roughly like is done with /proc/net today.

Neither of those two cases requires multiple dentry trees and the
tagged sysfs dirents can easily support an operation like is_seen.

I don't think the dcompare trick is general enough to support discriminating
on something besides the current process.  Which leads to problems with
monitoring.

Eric

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [git pull] VFS patches, the first series
  2008-08-22  0:08     ` Eric W. Biederman
  2008-08-23  3:33       ` Al Viro
@ 2008-08-23  7:24       ` Al Viro
  1 sibling, 0 replies; 7+ messages in thread
From: Al Viro @ 2008-08-23  7:24 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Linus Torvalds, linux-kernel, netdev, Denis V. Lunev

On Thu, Aug 21, 2008 at 05:08:25PM -0700, Eric W. Biederman wrote:

> Ok.  The situation is now clear.
> 
> /proc/sys/net/ipv4/neigh/default does not currently exist in network
> namespaces.    This looks like an oversight.
> 
> In my network namespace I have the interfaces lo, sit, veth0
> 
> We have the result that /proc/sys/net/ipv4/neigh/veth0 has /proc/sys/net/ipv4/neigh/lo.
> 
> lo gets unregistered before veth0 when we bring the network namespaces down.
> Then when veth gets unregistered we have a problem.
> 
> I'm not certain what to do about it.

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 16fc6f4..d3c156e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3054,14 +3054,23 @@ static ctl_table ipv4_route_table[] = {
 	{ .ctl_name = 0 }
 };
 
-static __net_initdata struct ctl_path ipv4_route_path[] = {
+static struct ctl_table empty[1];
+
+static struct ctl_table ipv4_skeleton[] =
+{
+	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
+	  .child = ipv4_route_table},
+	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
+	  .child = empty},
+	{ }
+};
+
+static __net_initdata struct ctl_path ipv4_path[] = {
 	{ .procname = "net", .ctl_name = CTL_NET, },
 	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
-	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
 	{ },
 };
 
-
 static struct ctl_table ipv4_route_flush_table[] = {
 	{
 		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
@@ -3074,6 +3083,13 @@ static struct ctl_table ipv4_route_flush_table[] = {
 	{ .ctl_name = 0 },
 };
 
+static __net_initdata struct ctl_path ipv4_route_path[] = {
+	{ .procname = "net", .ctl_name = CTL_NET, },
+	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
+	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
+	{ },
+};
+
 static __net_init int sysctl_route_net_init(struct net *net)
 {
 	struct ctl_table *tbl;
@@ -3223,7 +3239,7 @@ int __init ip_rt_init(void)
  */
 void __init ip_static_sysctl_init(void)
 {
-	register_sysctl_paths(ipv4_route_path, ipv4_route_table);
+	register_sysctl_paths(ipv4_path, ipv4_skeleton);
 }
 #endif
 

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2008-08-23  7:24 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-27  1:22 [git pull] VFS patches, the first series Al Viro
2008-08-21  7:40 ` Eric W. Biederman
2008-08-21 17:14   ` Eric W. Biederman
2008-08-22  0:08     ` Eric W. Biederman
2008-08-23  3:33       ` Al Viro
2008-08-23  5:22         ` Eric W. Biederman
2008-08-23  7:24       ` Al Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).