All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
@ 2013-06-19 15:09 Dwight Engen
  2013-06-19 20:35 ` Eric W. Biederman
  2013-06-20  0:13 ` Dave Chinner
  0 siblings, 2 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-19 15:09 UTC (permalink / raw)
  To: xfs; +Cc: Eric W. Biederman

Use uint32 from init_user_ns for xfs internal uid/gid representation in
acl, xfs_icdinode. Conversion of kuid/gid is done at the vfs boundary,
other user visible xfs specific interfaces (bulkstat, eofblocks filter)
expect uint32 init_user_ns uid/gid values.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_acl.c      | 24 ++++++++++++++++++++----
 fs/xfs/xfs_fs.h       |  4 ++--
 fs/xfs/xfs_icache.c   |  2 +-
 fs/xfs/xfs_inode.c    |  6 +++---
 fs/xfs/xfs_ioctl.c    |  2 +-
 fs/xfs/xfs_iops.c     | 38 ++++++++++++++++++++------------------
 fs/xfs/xfs_qm.c       | 16 ++++++++--------
 fs/xfs/xfs_quota.h    |  9 +++++----
 fs/xfs/xfs_symlink.c  |  4 +++-
 fs/xfs/xfs_vnodeops.c |  4 +++-
 init/Kconfig          | 13 -------------
 11 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 306d883..fd2854e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -68,14 +68,17 @@ xfs_acl_from_disk(
 
 		switch (acl_e->e_tag) {
 		case ACL_USER:
+			acl_e->e_uid = make_kuid(&init_user_ns,
+						 be32_to_cpu(ace->ae_id));
+			break;
 		case ACL_GROUP:
-			acl_e->e_id = be32_to_cpu(ace->ae_id);
+			acl_e->e_gid = make_kgid(&init_user_ns,
+						 be32_to_cpu(ace->ae_id));
 			break;
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			acl_e->e_id = ACL_UNDEFINED_ID;
 			break;
 		default:
 			goto fail;
@@ -101,7 +104,20 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
 		acl_e = &acl->a_entries[i];
 
 		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
-		ace->ae_id = cpu_to_be32(acl_e->e_id);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			ace->ae_id = cpu_to_be32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			break;
+		case ACL_GROUP:
+			ace->ae_id = cpu_to_be32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			break;
+		default:
+			ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+			break;
+		}
+
 		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
 	}
 }
@@ -360,7 +376,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 		return -EINVAL;
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return value ? -EACCES : 0;
-	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (!value)
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d046955..bf0a6f8 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -347,8 +347,8 @@ typedef struct xfs_error_injection {
 struct xfs_eofblocks {
 	__u32		eof_version;
 	__u32		eof_flags;
-	uid_t		eof_uid;
-	gid_t		eof_gid;
+	__u32		eof_uid;
+	__u32		eof_gid;
 	prid_t		eof_prid;
 	__u32		pad32;
 	__u64		eof_min_file_size;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e..70ba410 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -617,7 +617,7 @@ restart:
 
 /*
  * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
  */
 STATIC void
 xfs_queue_eofblocks(
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f..8049976 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1268,8 +1268,8 @@ xfs_ialloc(
 	ip->i_d.di_onlink = 0;
 	ip->i_d.di_nlink = nlink;
 	ASSERT(ip->i_d.di_nlink == nlink);
-	ip->i_d.di_uid = current_fsuid();
-	ip->i_d.di_gid = current_fsgid();
+	ip->i_d.di_uid = from_kuid(&init_user_ns, current_fsuid());
+	ip->i_d.di_gid = from_kgid(&init_user_ns, current_fsgid());
 	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
@@ -1308,7 +1308,7 @@ xfs_ialloc(
 	 */
 	if ((irix_sgid_inherit) &&
 	    (ip->i_d.di_mode & S_ISGID) &&
-	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
+	    (!in_group_p(make_kgid(&init_user_ns, ip->i_d.di_gid)))) {
 		ip->i_d.di_mode &= ~S_ISGID;
 	}
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e99968..daa6127 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -981,7 +981,7 @@ xfs_ioctl_setattr(
 	 * to the file owner ID, except in cases where the
 	 * CAP_FSETID capability is applicable.
 	 */
-	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+	if (!inode_owner_or_capable(&ip->i_vnode)) {
 		code = XFS_ERROR(EPERM);
 		goto error_return;
 	}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa..bf96cf8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -420,8 +420,8 @@ xfs_vn_getattr(
 	stat->dev = inode->i_sb->s_dev;
 	stat->mode = ip->i_d.di_mode;
 	stat->nlink = ip->i_d.di_nlink;
-	stat->uid = ip->i_d.di_uid;
-	stat->gid = ip->i_d.di_gid;
+	stat->uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
+	stat->gid = make_kgid(&init_user_ns, ip->i_d.di_gid);
 	stat->ino = ip->i_ino;
 	stat->atime = inode->i_atime;
 	stat->mtime = inode->i_mtime;
@@ -488,8 +488,8 @@ xfs_setattr_nonsize(
 	int			mask = iattr->ia_valid;
 	xfs_trans_t		*tp;
 	int			error;
-	uid_t			uid = 0, iuid = 0;
-	gid_t			gid = 0, igid = 0;
+	kuid_t			uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+	kgid_t			gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
 	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
 	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
 
@@ -522,13 +522,13 @@ xfs_setattr_nonsize(
 			uid = iattr->ia_uid;
 			qflags |= XFS_QMOPT_UQUOTA;
 		} else {
-			uid = ip->i_d.di_uid;
+			uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
 		}
 		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
 			gid = iattr->ia_gid;
 			qflags |= XFS_QMOPT_GQUOTA;
 		}  else {
-			gid = ip->i_d.di_gid;
+			gid = make_kgid(&init_user_ns, ip->i_d.di_gid);
 		}
 
 		/*
@@ -538,8 +538,10 @@ xfs_setattr_nonsize(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
-					 qflags, &udqp, &gdqp);
+		error = xfs_qm_vop_dqalloc(ip, from_kuid(&init_user_ns, uid),
+					   from_kgid(&init_user_ns, gid),
+					   xfs_get_projid(ip),
+					   qflags, &udqp, &gdqp);
 		if (error)
 			return error;
 	}
@@ -561,8 +563,8 @@ xfs_setattr_nonsize(
 		 * while we didn't have the inode locked, inode's dquot(s)
 		 * would have changed also.
 		 */
-		iuid = ip->i_d.di_uid;
-		igid = ip->i_d.di_gid;
+		iuid = make_kuid(&init_user_ns, ip->i_d.di_uid);
+		igid = make_kgid(&init_user_ns, ip->i_d.di_gid);
 		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
 		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 
@@ -571,8 +573,8 @@ xfs_setattr_nonsize(
 		 * going to change.
 		 */
 		if (XFS_IS_QUOTA_RUNNING(mp) &&
-		    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-		     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+		    ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+		     (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
 			ASSERT(tp);
 			error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
@@ -602,17 +604,17 @@ xfs_setattr_nonsize(
 		 * Change the ownerships and register quota modifications
 		 * in the transaction.
 		 */
-		if (iuid != uid) {
+		if (!uid_eq(iuid, uid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 				ASSERT(mask & ATTR_UID);
 				ASSERT(udqp);
 				olddquot1 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_udquot, udqp);
 			}
-			ip->i_d.di_uid = uid;
+			ip->i_d.di_uid = from_kuid(&init_user_ns, uid);
 			inode->i_uid = uid;
 		}
-		if (igid != gid) {
+		if (!gid_eq(igid, gid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 				ASSERT(!XFS_IS_PQUOTA_ON(mp));
 				ASSERT(mask & ATTR_GID);
@@ -620,7 +622,7 @@ xfs_setattr_nonsize(
 				olddquot2 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
-			ip->i_d.di_gid = gid;
+			ip->i_d.di_gid = from_kgid(&init_user_ns, gid);
 			inode->i_gid = gid;
 		}
 	}
@@ -1172,8 +1174,8 @@ xfs_setup_inode(
 
 	inode->i_mode	= ip->i_d.di_mode;
 	set_nlink(inode, ip->i_d.di_nlink);
-	inode->i_uid	= ip->i_d.di_uid;
-	inode->i_gid	= ip->i_d.di_gid;
+	inode->i_uid	= make_kuid(&init_user_ns, ip->i_d.di_uid);
+	inode->i_gid	= make_kgid(&init_user_ns, ip->i_d.di_gid);
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFBLK:
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb..94a2a8f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
 int
 xfs_qm_vop_dqalloc(
 	struct xfs_inode	*ip,
-	uid_t			uid,
-	gid_t			gid,
+	__uint32_t		di_uid,
+	__uint32_t		di_gid,
 	prid_t			prid,
 	uint			flags,
 	struct xfs_dquot	**O_udqpp,
@@ -1670,7 +1670,7 @@ xfs_qm_vop_dqalloc(
 	xfs_ilock(ip, lockflags);
 
 	if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
-		gid = ip->i_d.di_gid;
+		di_gid = ip->i_d.di_gid;
 
 	/*
 	 * Attach the dquot(s) to this inode, doing a dquot allocation
@@ -1686,7 +1686,7 @@ xfs_qm_vop_dqalloc(
 
 	uq = gq = NULL;
 	if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
-		if (ip->i_d.di_uid != uid) {
+		if (ip->i_d.di_uid != di_uid) {
 			/*
 			 * What we need is the dquot that has this uid, and
 			 * if we send the inode to dqget, the uid of the inode
@@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
 			 * holding ilock.
 			 */
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)di_uid,
 						 XFS_DQ_USER,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
@@ -1721,9 +1721,9 @@ xfs_qm_vop_dqalloc(
 		}
 	}
 	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
-		if (ip->i_d.di_gid != gid) {
+		if (ip->i_d.di_gid != di_gid) {
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)di_gid,
 						 XFS_DQ_GROUP,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
@@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
 			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
 	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
 		delblksudq = udqp;
 		/*
 		 * If there are delayed allocation blocks, then we have to
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f..0464d77 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -320,8 +320,8 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
 		struct xfs_mount *, struct xfs_dquot *,
 		struct xfs_dquot *, long, long, uint);
 
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-		struct xfs_dquot **, struct xfs_dquot **);
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, __uint32_t, __uint32_t,
+		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **);
 extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
 		struct xfs_dquot *, struct xfs_dquot *);
 extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -341,8 +341,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
 #else
 static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
-		uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, __uint32_t uid, __uint32_t gid,
+		prid_t prid, uint flags, struct xfs_dquot **udqp,
+		struct xfs_dquot **gdqp)
 {
 	*udqp = NULL;
 	*gdqp = NULL;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403..3f7cfb3 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -384,7 +384,9 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp,
+			from_kuid(&init_user_ns, current_fsuid()),
+			from_kgid(&init_user_ns, current_fsgid()), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb2..37e9d4a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -515,7 +515,9 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp,
+			from_kuid(&init_user_ns, current_fsuid()),
+			from_kgid(&init_user_ns, current_fsgid()), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		return error;
diff --git a/init/Kconfig b/init/Kconfig
index 9d3a788..fe29801 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1065,7 +1065,6 @@ config IPC_NS
 
 config USER_NS
 	bool "User namespace"
-	depends on UIDGID_CONVERTED
 	select UIDGID_STRICT_TYPE_CHECKS
 
 	default n
@@ -1099,20 +1098,8 @@ config NET_NS
 
 endif # NAMESPACES
 
-config UIDGID_CONVERTED
-	# True if all of the selected software conmponents are known
-	# to have uid_t and gid_t converted to kuid_t and kgid_t
-	# where appropriate and are otherwise safe to use with
-	# the user namespace.
-	bool
-	default y
-
-	# Filesystems
-	depends on XFS_FS = n
-
 config UIDGID_STRICT_TYPE_CHECKS
 	bool "Require conversions between uid/gids and their internal representation"
-	depends on UIDGID_CONVERTED
 	default n
 	help
 	 While the nececessary conversions are being added to all subsystems this option allows
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-19 15:09 [PATCH] userns: Convert xfs to use kuid/kgid where appropriate Dwight Engen
@ 2013-06-19 20:35 ` Eric W. Biederman
  2013-06-20  1:41   ` Dave Chinner
  2013-06-20  0:13 ` Dave Chinner
  1 sibling, 1 reply; 46+ messages in thread
From: Eric W. Biederman @ 2013-06-19 20:35 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, xfs


I am copying my gmail address so I have a chance of seeing replies from
Dave Chinner.  So far the only way I have been able to read his replies
has been to read mailing lists.  Which has not been conducive to having
this code discussed properly.  Hopefully copying my gmail address will
allow us to have a reasonable and timely conversation.


Dwight Engen <dwight.engen@oracle.com> writes:

> Use uint32 from init_user_ns for xfs internal uid/gid representation in
> acl, xfs_icdinode. 

>From my review of the code earlier that just isn't safe.  It allows all
kinds of things to slip through.

> Conversion of kuid/gid is done at the vfs boundary,
> other user visible xfs specific interfaces (bulkstat, eofblocks filter)
> expect uint32 init_user_ns uid/gid values.

>From my earlier review of the code conversion at the vfs boundary is
not safe.    

First off kuid_t and kgid_t are not vfs concepts, they are linux
kernel concepts, and xfs is in the linux kernel.  What makes this
relevant is not all filesystem accesses are through the vfs so all of
the necessary conversions for security and a consistent user experience
can be had by only performing conversions at the user/kernel boundary.

In particular by being sloppy and not pushing kuid_t/kgid_t further down
you did not handle all of the conversions needed at the user/kernel
boundary in XFS_IOC_FREE_EOFBLOCKS.  Which can be called by an
unprivileged user.

I am a little dubious about XFS_IOC_FREE_EOFBLOCKS allowing any
user to affect any other user.  Your changes just seem to make
it guaranteed that when called from a user namespace the wrong
user will be affected.

I honestly don't think avoiding the push down of kuid_t and kgid_t to
all of the xfs in-core data structures is safe.  Even if the initial
patch is safe I expect there will be silent breakage when the next ioctl
that bypasses the vfs is added.

Eric

> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> ---
>  fs/xfs/xfs_acl.c      | 24 ++++++++++++++++++++----
>  fs/xfs/xfs_fs.h       |  4 ++--
>  fs/xfs/xfs_icache.c   |  2 +-
>  fs/xfs/xfs_inode.c    |  6 +++---
>  fs/xfs/xfs_ioctl.c    |  2 +-
>  fs/xfs/xfs_iops.c     | 38 ++++++++++++++++++++------------------
>  fs/xfs/xfs_qm.c       | 16 ++++++++--------
>  fs/xfs/xfs_quota.h    |  9 +++++----
>  fs/xfs/xfs_symlink.c  |  4 +++-
>  fs/xfs/xfs_vnodeops.c |  4 +++-
>  init/Kconfig          | 13 -------------
>  11 files changed, 66 insertions(+), 56 deletions(-)
>
> diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
> index 306d883..fd2854e 100644
> --- a/fs/xfs/xfs_acl.c
> +++ b/fs/xfs/xfs_acl.c
> @@ -68,14 +68,17 @@ xfs_acl_from_disk(
>  
>  		switch (acl_e->e_tag) {
>  		case ACL_USER:
> +			acl_e->e_uid = make_kuid(&init_user_ns,
> +						 be32_to_cpu(ace->ae_id));
> +			break;
>  		case ACL_GROUP:
> -			acl_e->e_id = be32_to_cpu(ace->ae_id);
> +			acl_e->e_gid = make_kgid(&init_user_ns,
> +						 be32_to_cpu(ace->ae_id));
>  			break;
>  		case ACL_USER_OBJ:
>  		case ACL_GROUP_OBJ:
>  		case ACL_MASK:
>  		case ACL_OTHER:
> -			acl_e->e_id = ACL_UNDEFINED_ID;
>  			break;
>  		default:
>  			goto fail;
> @@ -101,7 +104,20 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
>  		acl_e = &acl->a_entries[i];
>  
>  		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
> -		ace->ae_id = cpu_to_be32(acl_e->e_id);
> +		switch (acl_e->e_tag) {
> +		case ACL_USER:
> +			ace->ae_id = cpu_to_be32(
> +				from_kuid(&init_user_ns, acl_e->e_uid));
> +			break;
> +		case ACL_GROUP:
> +			ace->ae_id = cpu_to_be32(
> +				from_kgid(&init_user_ns, acl_e->e_gid));
> +			break;
> +		default:
> +			ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
> +			break;
> +		}
> +
>  		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
>  	}
>  }
> @@ -360,7 +376,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
>  		return -EINVAL;
>  	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
>  		return value ? -EACCES : 0;
> -	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
> +	if (!inode_owner_or_capable(inode))
>  		return -EPERM;
>  
>  	if (!value)
> diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
> index d046955..bf0a6f8 100644
> --- a/fs/xfs/xfs_fs.h
> +++ b/fs/xfs/xfs_fs.h
> @@ -347,8 +347,8 @@ typedef struct xfs_error_injection {
>  struct xfs_eofblocks {
>  	__u32		eof_version;
>  	__u32		eof_flags;
> -	uid_t		eof_uid;
> -	gid_t		eof_gid;
> +	__u32		eof_uid;
> +	__u32		eof_gid;
>  	prid_t		eof_prid;
>  	__u32		pad32;
>  	__u64		eof_min_file_size;
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 96e344e..70ba410 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -617,7 +617,7 @@ restart:
>  
>  /*
>   * Background scanning to trim post-EOF preallocated space. This is queued
> - * based on the 'background_prealloc_discard_period' tunable (5m by default).
> + * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
>   */
>  STATIC void
>  xfs_queue_eofblocks(
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 7f7be5f..8049976 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1268,8 +1268,8 @@ xfs_ialloc(
>  	ip->i_d.di_onlink = 0;
>  	ip->i_d.di_nlink = nlink;
>  	ASSERT(ip->i_d.di_nlink == nlink);
> -	ip->i_d.di_uid = current_fsuid();
> -	ip->i_d.di_gid = current_fsgid();
> +	ip->i_d.di_uid = from_kuid(&init_user_ns, current_fsuid());
> +	ip->i_d.di_gid = from_kgid(&init_user_ns, current_fsgid());
>  	xfs_set_projid(ip, prid);
>  	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
>  
> @@ -1308,7 +1308,7 @@ xfs_ialloc(
>  	 */
>  	if ((irix_sgid_inherit) &&
>  	    (ip->i_d.di_mode & S_ISGID) &&
> -	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
> +	    (!in_group_p(make_kgid(&init_user_ns, ip->i_d.di_gid)))) {
>  		ip->i_d.di_mode &= ~S_ISGID;
>  	}
>  
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 5e99968..daa6127 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -981,7 +981,7 @@ xfs_ioctl_setattr(
>  	 * to the file owner ID, except in cases where the
>  	 * CAP_FSETID capability is applicable.
>  	 */
> -	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
> +	if (!inode_owner_or_capable(&ip->i_vnode)) {
>  		code = XFS_ERROR(EPERM);
>  		goto error_return;
>  	}
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index ca9ecaa..bf96cf8 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -420,8 +420,8 @@ xfs_vn_getattr(
>  	stat->dev = inode->i_sb->s_dev;
>  	stat->mode = ip->i_d.di_mode;
>  	stat->nlink = ip->i_d.di_nlink;
> -	stat->uid = ip->i_d.di_uid;
> -	stat->gid = ip->i_d.di_gid;
> +	stat->uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
> +	stat->gid = make_kgid(&init_user_ns, ip->i_d.di_gid);
>  	stat->ino = ip->i_ino;
>  	stat->atime = inode->i_atime;
>  	stat->mtime = inode->i_mtime;
> @@ -488,8 +488,8 @@ xfs_setattr_nonsize(
>  	int			mask = iattr->ia_valid;
>  	xfs_trans_t		*tp;
>  	int			error;
> -	uid_t			uid = 0, iuid = 0;
> -	gid_t			gid = 0, igid = 0;
> +	kuid_t			uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
> +	kgid_t			gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
>  	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
>  	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
>  
> @@ -522,13 +522,13 @@ xfs_setattr_nonsize(
>  			uid = iattr->ia_uid;
>  			qflags |= XFS_QMOPT_UQUOTA;
>  		} else {
> -			uid = ip->i_d.di_uid;
> +			uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
>  		}
>  		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
>  			gid = iattr->ia_gid;
>  			qflags |= XFS_QMOPT_GQUOTA;
>  		}  else {
> -			gid = ip->i_d.di_gid;
> +			gid = make_kgid(&init_user_ns, ip->i_d.di_gid);
>  		}
>  
>  		/*
> @@ -538,8 +538,10 @@ xfs_setattr_nonsize(
>  		 */
>  		ASSERT(udqp == NULL);
>  		ASSERT(gdqp == NULL);
> -		error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
> -					 qflags, &udqp, &gdqp);
> +		error = xfs_qm_vop_dqalloc(ip, from_kuid(&init_user_ns, uid),
> +					   from_kgid(&init_user_ns, gid),
> +					   xfs_get_projid(ip),
> +					   qflags, &udqp, &gdqp);
>  		if (error)
>  			return error;
>  	}
> @@ -561,8 +563,8 @@ xfs_setattr_nonsize(
>  		 * while we didn't have the inode locked, inode's dquot(s)
>  		 * would have changed also.
>  		 */
> -		iuid = ip->i_d.di_uid;
> -		igid = ip->i_d.di_gid;
> +		iuid = make_kuid(&init_user_ns, ip->i_d.di_uid);
> +		igid = make_kgid(&init_user_ns, ip->i_d.di_gid);
>  		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
>  		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
>  
> @@ -571,8 +573,8 @@ xfs_setattr_nonsize(
>  		 * going to change.
>  		 */
>  		if (XFS_IS_QUOTA_RUNNING(mp) &&
> -		    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
> -		     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
> +		    ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
> +		     (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
>  			ASSERT(tp);
>  			error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
>  						capable(CAP_FOWNER) ?
> @@ -602,17 +604,17 @@ xfs_setattr_nonsize(
>  		 * Change the ownerships and register quota modifications
>  		 * in the transaction.
>  		 */
> -		if (iuid != uid) {
> +		if (!uid_eq(iuid, uid)) {
>  			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
>  				ASSERT(mask & ATTR_UID);
>  				ASSERT(udqp);
>  				olddquot1 = xfs_qm_vop_chown(tp, ip,
>  							&ip->i_udquot, udqp);
>  			}
> -			ip->i_d.di_uid = uid;
> +			ip->i_d.di_uid = from_kuid(&init_user_ns, uid);
>  			inode->i_uid = uid;
>  		}
> -		if (igid != gid) {
> +		if (!gid_eq(igid, gid)) {
>  			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
>  				ASSERT(!XFS_IS_PQUOTA_ON(mp));
>  				ASSERT(mask & ATTR_GID);
> @@ -620,7 +622,7 @@ xfs_setattr_nonsize(
>  				olddquot2 = xfs_qm_vop_chown(tp, ip,
>  							&ip->i_gdquot, gdqp);
>  			}
> -			ip->i_d.di_gid = gid;
> +			ip->i_d.di_gid = from_kgid(&init_user_ns, gid);
>  			inode->i_gid = gid;
>  		}
>  	}
> @@ -1172,8 +1174,8 @@ xfs_setup_inode(
>  
>  	inode->i_mode	= ip->i_d.di_mode;
>  	set_nlink(inode, ip->i_d.di_nlink);
> -	inode->i_uid	= ip->i_d.di_uid;
> -	inode->i_gid	= ip->i_d.di_gid;
> +	inode->i_uid	= make_kuid(&init_user_ns, ip->i_d.di_uid);
> +	inode->i_gid	= make_kgid(&init_user_ns, ip->i_d.di_gid);
>  
>  	switch (inode->i_mode & S_IFMT) {
>  	case S_IFBLK:
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index b75c9bb..94a2a8f 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
>  int
>  xfs_qm_vop_dqalloc(
>  	struct xfs_inode	*ip,
> -	uid_t			uid,
> -	gid_t			gid,
> +	__uint32_t		di_uid,
> +	__uint32_t		di_gid,
>  	prid_t			prid,
>  	uint			flags,
>  	struct xfs_dquot	**O_udqpp,
> @@ -1670,7 +1670,7 @@ xfs_qm_vop_dqalloc(
>  	xfs_ilock(ip, lockflags);
>  
>  	if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
> -		gid = ip->i_d.di_gid;
> +		di_gid = ip->i_d.di_gid;
>  
>  	/*
>  	 * Attach the dquot(s) to this inode, doing a dquot allocation
> @@ -1686,7 +1686,7 @@ xfs_qm_vop_dqalloc(
>  
>  	uq = gq = NULL;
>  	if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
> -		if (ip->i_d.di_uid != uid) {
> +		if (ip->i_d.di_uid != di_uid) {
>  			/*
>  			 * What we need is the dquot that has this uid, and
>  			 * if we send the inode to dqget, the uid of the inode
> @@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
>  			 * holding ilock.
>  			 */
>  			xfs_iunlock(ip, lockflags);
> -			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
> +			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)di_uid,
>  						 XFS_DQ_USER,
>  						 XFS_QMOPT_DQALLOC |
>  						 XFS_QMOPT_DOWARN,
> @@ -1721,9 +1721,9 @@ xfs_qm_vop_dqalloc(
>  		}
>  	}
>  	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
> -		if (ip->i_d.di_gid != gid) {
> +		if (ip->i_d.di_gid != di_gid) {
>  			xfs_iunlock(ip, lockflags);
> -			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
> +			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)di_gid,
>  						 XFS_DQ_GROUP,
>  						 XFS_QMOPT_DQALLOC |
>  						 XFS_QMOPT_DOWARN,
> @@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
>  			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
>  
>  	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
> -	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
> +	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
>  		delblksudq = udqp;
>  		/*
>  		 * If there are delayed allocation blocks, then we have to
> diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
> index c38068f..0464d77 100644
> --- a/fs/xfs/xfs_quota.h
> +++ b/fs/xfs/xfs_quota.h
> @@ -320,8 +320,8 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
>  		struct xfs_mount *, struct xfs_dquot *,
>  		struct xfs_dquot *, long, long, uint);
>  
> -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
> -		struct xfs_dquot **, struct xfs_dquot **);
> +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, __uint32_t, __uint32_t,
> +		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **);
>  extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
>  		struct xfs_dquot *, struct xfs_dquot *);
>  extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
> @@ -341,8 +341,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
>  
>  #else
>  static inline int
> -xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
> -		uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
> +xfs_qm_vop_dqalloc(struct xfs_inode *ip, __uint32_t uid, __uint32_t gid,
> +		prid_t prid, uint flags, struct xfs_dquot **udqp,
> +		struct xfs_dquot **gdqp)
>  {
>  	*udqp = NULL;
>  	*gdqp = NULL;
> diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
> index 195a403..3f7cfb3 100644
> --- a/fs/xfs/xfs_symlink.c
> +++ b/fs/xfs/xfs_symlink.c
> @@ -384,7 +384,9 @@ xfs_symlink(
>  	/*
>  	 * Make sure that we have allocated dquot(s) on disk.
>  	 */
> -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
> +	error = xfs_qm_vop_dqalloc(dp,
> +			from_kuid(&init_user_ns, current_fsuid()),
> +			from_kgid(&init_user_ns, current_fsgid()), prid,
>  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
>  	if (error)
>  		goto std_return;
> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
> index 0176bb2..37e9d4a 100644
> --- a/fs/xfs/xfs_vnodeops.c
> +++ b/fs/xfs/xfs_vnodeops.c
> @@ -515,7 +515,9 @@ xfs_create(
>  	/*
>  	 * Make sure that we have allocated dquot(s) on disk.
>  	 */
> -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
> +	error = xfs_qm_vop_dqalloc(dp,
> +			from_kuid(&init_user_ns, current_fsuid()),
> +			from_kgid(&init_user_ns, current_fsgid()), prid,
>  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
>  	if (error)
>  		return error;
> diff --git a/init/Kconfig b/init/Kconfig
> index 9d3a788..fe29801 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1065,7 +1065,6 @@ config IPC_NS
>  
>  config USER_NS
>  	bool "User namespace"
> -	depends on UIDGID_CONVERTED
>  	select UIDGID_STRICT_TYPE_CHECKS
>  
>  	default n
> @@ -1099,20 +1098,8 @@ config NET_NS
>  
>  endif # NAMESPACES
>  
> -config UIDGID_CONVERTED
> -	# True if all of the selected software conmponents are known
> -	# to have uid_t and gid_t converted to kuid_t and kgid_t
> -	# where appropriate and are otherwise safe to use with
> -	# the user namespace.
> -	bool
> -	default y
> -
> -	# Filesystems
> -	depends on XFS_FS = n
> -
>  config UIDGID_STRICT_TYPE_CHECKS
>  	bool "Require conversions between uid/gids and their internal representation"
> -	depends on UIDGID_CONVERTED
>  	default n
>  	help
>  	 While the nececessary conversions are being added to all subsystems this option allows

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-19 15:09 [PATCH] userns: Convert xfs to use kuid/kgid where appropriate Dwight Engen
  2013-06-19 20:35 ` Eric W. Biederman
@ 2013-06-20  0:13 ` Dave Chinner
  2013-06-20 13:54   ` Dwight Engen
  1 sibling, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2013-06-20  0:13 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, xfs

On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> Use uint32 from init_user_ns for xfs internal uid/gid representation in
> acl, xfs_icdinode. Conversion of kuid/gid is done at the vfs boundary,
> other user visible xfs specific interfaces (bulkstat, eofblocks filter)
> expect uint32 init_user_ns uid/gid values.

It's minimal, but I'm not sure it's complete. I'll comment on that
in response to Eric's comments...

> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
....
> --- a/fs/xfs/xfs_fs.h
> +++ b/fs/xfs/xfs_fs.h
> @@ -347,8 +347,8 @@ typedef struct xfs_error_injection {
>  struct xfs_eofblocks {
>  	__u32		eof_version;
>  	__u32		eof_flags;
> -	uid_t		eof_uid;
> -	gid_t		eof_gid;
> +	__u32		eof_uid;
> +	__u32		eof_gid;
>  	prid_t		eof_prid;
>  	__u32		pad32;
>  	__u64		eof_min_file_size;

The patch doesn't do namespace conversion of these uid/gids, but I'm
not sure it should...

> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 96e344e..70ba410 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -617,7 +617,7 @@ restart:
>  
>  /*
>   * Background scanning to trim post-EOF preallocated space. This is queued
> - * based on the 'background_prealloc_discard_period' tunable (5m by default).
> + * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
>   */
>  STATIC void
>  xfs_queue_eofblocks(
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 7f7be5f..8049976 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1268,8 +1268,8 @@ xfs_ialloc(
>  	ip->i_d.di_onlink = 0;
>  	ip->i_d.di_nlink = nlink;
>  	ASSERT(ip->i_d.di_nlink == nlink);
> -	ip->i_d.di_uid = current_fsuid();
> -	ip->i_d.di_gid = current_fsgid();
> +	ip->i_d.di_uid = from_kuid(&init_user_ns, current_fsuid());
> +	ip->i_d.di_gid = from_kgid(&init_user_ns, current_fsgid());

Why are all new inodes created in the init_user_ns? Shouldn't this be
current_user_ns()?

Same question throughout - why do you use init_user_ns for all these
UID conversions, when the whole point is to have awareness of
different namespaces?

>  	xfs_set_projid(ip, prid);
>  	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
>  
> @@ -1308,7 +1308,7 @@ xfs_ialloc(
>  	 */
>  	if ((irix_sgid_inherit) &&
>  	    (ip->i_d.di_mode & S_ISGID) &&
> -	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
> +	    (!in_group_p(make_kgid(&init_user_ns, ip->i_d.di_gid)))) {
>  		ip->i_d.di_mode &= ~S_ISGID;
>  	}
>  
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 5e99968..daa6127 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -981,7 +981,7 @@ xfs_ioctl_setattr(
>  	 * to the file owner ID, except in cases where the
>  	 * CAP_FSETID capability is applicable.
>  	 */
> -	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
> +	if (!inode_owner_or_capable(&ip->i_vnode)) {

				    VFS_I(ip)

>  		code = XFS_ERROR(EPERM);
>  		goto error_return;
>  	}
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index ca9ecaa..bf96cf8 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -420,8 +420,8 @@ xfs_vn_getattr(
>  	stat->dev = inode->i_sb->s_dev;
>  	stat->mode = ip->i_d.di_mode;
>  	stat->nlink = ip->i_d.di_nlink;
> -	stat->uid = ip->i_d.di_uid;
> -	stat->gid = ip->i_d.di_gid;
> +	stat->uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
> +	stat->gid = make_kgid(&init_user_ns, ip->i_d.di_gid);

Why not:

	stat->uid = inode->i_uid;
	stat->gid = inode->i_gid;

>  	stat->ino = ip->i_ino;
>  	stat->atime = inode->i_atime;
>  	stat->mtime = inode->i_mtime;
> @@ -488,8 +488,8 @@ xfs_setattr_nonsize(
>  	int			mask = iattr->ia_valid;
>  	xfs_trans_t		*tp;
>  	int			error;
> -	uid_t			uid = 0, iuid = 0;
> -	gid_t			gid = 0, igid = 0;
> +	kuid_t			uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
> +	kgid_t			gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
>  	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
>  	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
>  
> @@ -522,13 +522,13 @@ xfs_setattr_nonsize(
>  			uid = iattr->ia_uid;
>  			qflags |= XFS_QMOPT_UQUOTA;
>  		} else {
> -			uid = ip->i_d.di_uid;
> +			uid = make_kuid(&init_user_ns, ip->i_d.di_uid);

			uid = VFS_I(ip)->i_uid;
>  		}
>  		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
>  			gid = iattr->ia_gid;
>  			qflags |= XFS_QMOPT_GQUOTA;
>  		}  else {
> -			gid = ip->i_d.di_gid;
> +			gid = make_kgid(&init_user_ns, ip->i_d.di_gid);

			gid = VFS_I(ip)->i_gid;
>  		}
.....
> @@ -561,8 +563,8 @@ xfs_setattr_nonsize(
>  		 * while we didn't have the inode locked, inode's dquot(s)
>  		 * would have changed also.
>  		 */
> -		iuid = ip->i_d.di_uid;
> -		igid = ip->i_d.di_gid;
> +		iuid = make_kuid(&init_user_ns, ip->i_d.di_uid);
> +		igid = make_kgid(&init_user_ns, ip->i_d.di_gid);
>  		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
>  		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;

Same again - you can just use VFS_I(ip)->i_uid/VFS_I(ip)->i_gid
here.

> @@ -1172,8 +1174,8 @@ xfs_setup_inode(
>  
>  	inode->i_mode	= ip->i_d.di_mode;
>  	set_nlink(inode, ip->i_d.di_nlink);
> -	inode->i_uid	= ip->i_d.di_uid;
> -	inode->i_gid	= ip->i_d.di_gid;
> +	inode->i_uid	= make_kuid(&init_user_ns, ip->i_d.di_uid);
> +	inode->i_gid	= make_kgid(&init_user_ns, ip->i_d.di_gid);

current name space?

>  
>  	switch (inode->i_mode & S_IFMT) {
>  	case S_IFBLK:
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index b75c9bb..94a2a8f 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
>  int
>  xfs_qm_vop_dqalloc(
>  	struct xfs_inode	*ip,
> -	uid_t			uid,
> -	gid_t			gid,
> +	__uint32_t		di_uid,
> +	__uint32_t		di_gid,

xfs_dqid_t

And there's no need to rename the variables - that just causes
unnecessary churn, and the fact it is a dquot ID is documented by
the type.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-19 20:35 ` Eric W. Biederman
@ 2013-06-20  1:41   ` Dave Chinner
  2013-06-20 13:54     ` Dwight Engen
  0 siblings, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2013-06-20  1:41 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Eric W. Biederman, Dwight Engen, xfs

On Wed, Jun 19, 2013 at 01:35:30PM -0700, Eric W. Biederman wrote:
> 
> I am copying my gmail address so I have a chance of seeing replies from
> Dave Chinner.  So far the only way I have been able to read his replies
> has been to read mailing lists.  Which has not been conducive to having
> this code discussed properly.  Hopefully copying my gmail address will
> allow us to have a reasonable and timely conversation.
> 
> 
> Dwight Engen <dwight.engen@oracle.com> writes:
> 
> > Use uint32 from init_user_ns for xfs internal uid/gid representation in
> > acl, xfs_icdinode. 
> 
> From my review of the code earlier that just isn't safe.  It allows all
> kinds of things to slip through.

Such as?

> > Conversion of kuid/gid is done at the vfs boundary,
> > other user visible xfs specific interfaces (bulkstat, eofblocks filter)
> > expect uint32 init_user_ns uid/gid values.
> 
> From my earlier review of the code conversion at the vfs boundary is
> not safe.    

So you've claimed.

> First off kuid_t and kgid_t are not vfs concepts, they are linux
> kernel concepts, and xfs is in the linux kernel.  What makes this
> relevant is not all filesystem accesses are through the vfs so all of
> the necessary conversions for security and a consistent user experience
> can be had by only performing conversions at the user/kernel boundary.

Right. Boundaries and consistent conversion across them are
important. They are especially important in filesystems for
converting to/from on-disk and kernel-native formats. Blindly
pushing kernel structures down as far as you can make them reach
ignores important boundaries.

You might want to hit XFS with a big hammer but the right solution
is far more nuanced than that.

> In particular by being sloppy and not pushing kuid_t/kgid_t further down
> you did not handle all of the conversions needed at the user/kernel
> boundary in XFS_IOC_FREE_EOFBLOCKS.

The kuid_t/kgid_t is actually pushed down this far - it's in the
struct inode - the code currently uses the on-disk XFS uid/gid,
not the struct inode's kuid_t/kgid_t. That's easily fixable.

Indeed, that's where most of the work needs to be done in XFS -
using VFS_I(ip)->i_uid instead of ip->i_d.di_uid in places where we
aren't dealing with reading or modifying the on-disk structures of
XFS. Once that is done, we will have driven kuid_t/kgid_t as far
down as the in-memory/on-disk format boundary allows, and we'll end
up catching all the sorts of problems you are worried about.

> Which can be called by an
> unprivileged user.

That's an oversight. Needs fixing.

> I honestly don't think avoiding the push down of kuid_t and kgid_t to
> all of the xfs in-core data structures is safe.  Even if the initial
> patch is safe I expect there will be silent breakage when the next ioctl
> that bypasses the vfs is added.

This problem isn't isolated to XFS - ioctls are added to ext4,
btrfs, gfs2, etc in every release and they all face the same
problems.  Hence trying to paint it as an XFS problem is really
missing the mark....

I'd really like to see some regressions tests in xfstests that we
can use to confirm that filesystems have implemented namespaces
properly (e.g. quota set/get/report tests). That would go a long way
to ensuring that people don't inadvertently break this. I'm sure you have some
test scripts for testing all the changes you made, so sharing them
would help us a lot.

[ Hmmmm - the quota netlink warning interface - it just takes a
warning for a specific kqid and emits it to all listeners on the
netlink interface. There's no namespace awareness there, so what
stops quota warning messages from one namespace being heard in all
other namespaces? It's not network namespace aware.... ]

As it is, I'm far more concerned about the security problems
existing ioctls and interfaces pose for user namespaces. i.e. that
CAP_SYS_ADMIN in any namespace can use bulkstat and then the
fs/fhandle.c interfaces to find and access any file in the
filesystem regardless of namespace restrictions. You can guess
filehandles pretty trivially, so you don't need bulkstat and you
don't need XFS for this to be a problem....

Further, bulkstat is used by backup utilities, so I think it needs
the unmodified uid/gid/prid to be passed out, and the restore
program needs a way to push them all back in unmodified. How would
you propose we go about doing this? Alternatively, could you tell us
how a sub-namespace level backup/restore program is supposed to
handle/detect/avoid being invoked from inside a restricted,
non-global namespace?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20  0:13 ` Dave Chinner
@ 2013-06-20 13:54   ` Dwight Engen
  2013-06-20 15:27     ` Brian Foster
  2013-06-20 22:03     ` Dave Chinner
  0 siblings, 2 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-20 13:54 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Eric W. Biederman, xfs

On Thu, 20 Jun 2013 10:13:41 +1000
Dave Chinner <david@fromorbit.com> wrote:

> On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> > Use uint32 from init_user_ns for xfs internal uid/gid
> > representation in acl, xfs_icdinode. Conversion of kuid/gid is done
> > at the vfs boundary, other user visible xfs specific interfaces
> > (bulkstat, eofblocks filter) expect uint32 init_user_ns uid/gid
> > values.
> 
> It's minimal, but I'm not sure it's complete. I'll comment on that
> in response to Eric's comments...
> 
> > Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> ....
> > --- a/fs/xfs/xfs_fs.h
> > +++ b/fs/xfs/xfs_fs.h
> > @@ -347,8 +347,8 @@ typedef struct xfs_error_injection {
> >  struct xfs_eofblocks {
> >  	__u32		eof_version;
> >  	__u32		eof_flags;
> > -	uid_t		eof_uid;
> > -	gid_t		eof_gid;
> > +	__u32		eof_uid;
> > +	__u32		eof_gid;
> >  	prid_t		eof_prid;
> >  	__u32		pad32;
> >  	__u64		eof_min_file_size;
> 
> The patch doesn't do namespace conversion of these uid/gids, but I'm
> not sure it should...

The ids are only advisory, if the caller doesn't specify
XFS_EOF_FLAGS_?ID blocks from any inode in the fs can be reclaimed
regardless of id. Because of this, I think at a minimum we should
change XFS_IOC_FREE_EOFBLOCKS to require capable(CAP_SYS_ADMIN), which
somewhat implies init_user_ns based ids.

To make this really userns aware, I think we could:
  - leave the fields as uid_t
  - make XFS_IOC_FREE_EOFBLOCKS require nsown_capable(CAP_SYS_ADMIN)
  - check kuid_has_mapping(current_user_ns()) for each
    inode. This would be a change in behavior when called
    from !init_user_ns, limiting the scope of inodes the ioctl could
    affect.
  - change xfs_inode_match_id() to use uid_eq(VFS_I(ip)->i_uid,
    make_kuid(current_user_ns(), eofb->eof_uid))

Does that sound reasonable?

> > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > index 96e344e..70ba410 100644
> > --- a/fs/xfs/xfs_icache.c
> > +++ b/fs/xfs/xfs_icache.c
> > @@ -617,7 +617,7 @@ restart:
> >  
> >  /*
> >   * Background scanning to trim post-EOF preallocated space. This
> > is queued
> > - * based on the 'background_prealloc_discard_period' tunable (5m
> > by default).
> > + * based on the 'speculative_prealloc_lifetime' tunable (5m by
> > default). */
> >  STATIC void
> >  xfs_queue_eofblocks(
> > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > index 7f7be5f..8049976 100644
> > --- a/fs/xfs/xfs_inode.c
> > +++ b/fs/xfs/xfs_inode.c
> > @@ -1268,8 +1268,8 @@ xfs_ialloc(
> >  	ip->i_d.di_onlink = 0;
> >  	ip->i_d.di_nlink = nlink;
> >  	ASSERT(ip->i_d.di_nlink == nlink);
> > -	ip->i_d.di_uid = current_fsuid();
> > -	ip->i_d.di_gid = current_fsgid();
> > +	ip->i_d.di_uid = from_kuid(&init_user_ns, current_fsuid());
> > +	ip->i_d.di_gid = from_kgid(&init_user_ns, current_fsgid());
> 
> > Why are all new inodes created in the init_user_ns? Shouldn't this be
> current_user_ns()?

current_fsuid() is the kuid_t from whatever userns current is in,
which we are converting to a flat uint32 since i_d is the on disk inode.
This field is then used in xfs_setup_inode() to populate
VFS_I(ip)->i_uid. Most other filesystems would use inode_init_owner(),
but xfs does not (I assume because it wants to handle SGID itself based
on XFS_INHERIT_GID and irix_sgid_inherit).

> Same question throughout - why do you use init_user_ns for all these
> UID conversions, when the whole point is to have awareness of
> different namespaces?

Yep, there are instances you point out below where we can just use
inode->i_uid instead of converting back from the flat value, so I'll fix
those up.

> >  	xfs_set_projid(ip, prid);
> >  	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
> >  
> > @@ -1308,7 +1308,7 @@ xfs_ialloc(
> >  	 */
> >  	if ((irix_sgid_inherit) &&
> >  	    (ip->i_d.di_mode & S_ISGID) &&
> > -	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
> > +	    (!in_group_p(make_kgid(&init_user_ns,
> > ip->i_d.di_gid)))) { ip->i_d.di_mode &= ~S_ISGID;
> >  	}
> >  
> > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > index 5e99968..daa6127 100644
> > --- a/fs/xfs/xfs_ioctl.c
> > +++ b/fs/xfs/xfs_ioctl.c
> > @@ -981,7 +981,7 @@ xfs_ioctl_setattr(
> >  	 * to the file owner ID, except in cases where the
> >  	 * CAP_FSETID capability is applicable.
> >  	 */
> > -	if (current_fsuid() != ip->i_d.di_uid
> > && !capable(CAP_FOWNER)) {
> > +	if (!inode_owner_or_capable(&ip->i_vnode)) {
> 
> 				    VFS_I(ip)
> 
> >  		code = XFS_ERROR(EPERM);
> >  		goto error_return;
> >  	}
> > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> > index ca9ecaa..bf96cf8 100644
> > --- a/fs/xfs/xfs_iops.c
> > +++ b/fs/xfs/xfs_iops.c
> > @@ -420,8 +420,8 @@ xfs_vn_getattr(
> >  	stat->dev = inode->i_sb->s_dev;
> >  	stat->mode = ip->i_d.di_mode;
> >  	stat->nlink = ip->i_d.di_nlink;
> > -	stat->uid = ip->i_d.di_uid;
> > -	stat->gid = ip->i_d.di_gid;
> > +	stat->uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
> > +	stat->gid = make_kgid(&init_user_ns, ip->i_d.di_gid);
> 
> Why not:
> 
> 	stat->uid = inode->i_uid;
> 	stat->gid = inode->i_gid;
> 
> >  	stat->ino = ip->i_ino;
> >  	stat->atime = inode->i_atime;
> >  	stat->mtime = inode->i_mtime;
> > @@ -488,8 +488,8 @@ xfs_setattr_nonsize(
> >  	int			mask = iattr->ia_valid;
> >  	xfs_trans_t		*tp;
> >  	int			error;
> > -	uid_t			uid = 0, iuid = 0;
> > -	gid_t			gid = 0, igid = 0;
> > +	kuid_t			uid = GLOBAL_ROOT_UID, iuid
> > = GLOBAL_ROOT_UID;
> > +	kgid_t			gid = GLOBAL_ROOT_GID, igid
> > = GLOBAL_ROOT_GID; struct xfs_dquot	*udqp = NULL, *gdqp =
> > NULL; struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
> >  
> > @@ -522,13 +522,13 @@ xfs_setattr_nonsize(
> >  			uid = iattr->ia_uid;
> >  			qflags |= XFS_QMOPT_UQUOTA;
> >  		} else {
> > -			uid = ip->i_d.di_uid;
> > +			uid = make_kuid(&init_user_ns,
> > ip->i_d.di_uid);
> 
> 			uid = VFS_I(ip)->i_uid;
> >  		}
> >  		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
> >  			gid = iattr->ia_gid;
> >  			qflags |= XFS_QMOPT_GQUOTA;
> >  		}  else {
> > -			gid = ip->i_d.di_gid;
> > +			gid = make_kgid(&init_user_ns,
> > ip->i_d.di_gid);
> 
> 			gid = VFS_I(ip)->i_gid;
> >  		}
> .....
> > @@ -561,8 +563,8 @@ xfs_setattr_nonsize(
> >  		 * while we didn't have the inode locked, inode's
> > dquot(s)
> >  		 * would have changed also.
> >  		 */
> > -		iuid = ip->i_d.di_uid;
> > -		igid = ip->i_d.di_gid;
> > +		iuid = make_kuid(&init_user_ns, ip->i_d.di_uid);
> > +		igid = make_kgid(&init_user_ns, ip->i_d.di_gid);
> >  		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
> >  		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
> 
> Same again - you can just use VFS_I(ip)->i_uid/VFS_I(ip)->i_gid
> here.
> 
> > @@ -1172,8 +1174,8 @@ xfs_setup_inode(
> >  
> >  	inode->i_mode	= ip->i_d.di_mode;
> >  	set_nlink(inode, ip->i_d.di_nlink);
> > -	inode->i_uid	= ip->i_d.di_uid;
> > -	inode->i_gid	= ip->i_d.di_gid;
> > +	inode->i_uid	= make_kuid(&init_user_ns,
> > ip->i_d.di_uid);
> > +	inode->i_gid	= make_kgid(&init_user_ns,
> > ip->i_d.di_gid);
> 
> current name space?

I believe that is what this is doing, but I think it will be more
proper to do it the same as other filesystems:

i_uid_write(inode, ip->i_d.di_uid);
i_gid_write(inode, ip->i_d.di_gid);

> >  
> >  	switch (inode->i_mode & S_IFMT) {
> >  	case S_IFBLK:
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index b75c9bb..94a2a8f 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> > @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
> >  int
> >  xfs_qm_vop_dqalloc(
> >  	struct xfs_inode	*ip,
> > -	uid_t			uid,
> > -	gid_t			gid,
> > +	__uint32_t		di_uid,
> > +	__uint32_t		di_gid,
> 
> xfs_dqid_t
> 
> And there's no need to rename the variables - that just causes
> unnecessary churn, and the fact it is a dquot ID is documented by
> the type.

Yep using the type will be clearer, thanks.

> Cheers,
> 
> Dave.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20  1:41   ` Dave Chinner
@ 2013-06-20 13:54     ` Dwight Engen
  2013-06-20 21:10       ` Dave Chinner
  0 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-20 13:54 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Eric W. Biederman, xfs

On Thu, 20 Jun 2013 11:41:33 +1000
Dave Chinner <david@fromorbit.com> wrote:

> On Wed, Jun 19, 2013 at 01:35:30PM -0700, Eric W. Biederman wrote:
> > 
> > I am copying my gmail address so I have a chance of seeing replies
> > from Dave Chinner.  So far the only way I have been able to read his
> > replies has been to read mailing lists.  Which has not been
> > conducive to having this code discussed properly.  Hopefully
> > copying my gmail address will allow us to have a reasonable and
> > timely conversation.
> > 
> > 
> > Dwight Engen <dwight.engen@oracle.com> writes:
> > 
> > > Use uint32 from init_user_ns for xfs internal uid/gid
> > > representation in acl, xfs_icdinode. 
> > 
> > From my review of the code earlier that just isn't safe.  It allows
> > all kinds of things to slip through.
> 
> Such as?

Maybe saying "at the vfs boundary" is misleading, I guess I don't see
how this is all that different from what you did in the other
filesystems. Using ext4 as the example the conversions are done between:
 struct inode <-> struct ext4_inode
 struct posix_acl <-> ext4_acl_entry

which in xfs is analogous to
 struct inode <-> struct xfs_icdinode
 struct posix_acl <-> struct xfs_acl_entry

which is where I did the conversions.

> > > Conversion of kuid/gid is done at the vfs boundary,
> > > other user visible xfs specific interfaces (bulkstat, eofblocks
> > > filter) expect uint32 init_user_ns uid/gid values.
> > 
> > From my earlier review of the code conversion at the vfs boundary is
> > not safe.    
> 
> So you've claimed.
> 
> > First off kuid_t and kgid_t are not vfs concepts, they are linux
> > kernel concepts, and xfs is in the linux kernel.  What makes this
> > relevant is not all filesystem accesses are through the vfs so all
> > of the necessary conversions for security and a consistent user
> > experience can be had by only performing conversions at the
> > user/kernel boundary.
> 
> Right. Boundaries and consistent conversion across them are
> important. They are especially important in filesystems for
> converting to/from on-disk and kernel-native formats. Blindly
> pushing kernel structures down as far as you can make them reach
> ignores important boundaries.
> 
> You might want to hit XFS with a big hammer but the right solution
> is far more nuanced than that.
> 
> > In particular by being sloppy and not pushing kuid_t/kgid_t further
> > down you did not handle all of the conversions needed at the
> > user/kernel boundary in XFS_IOC_FREE_EOFBLOCKS.

See other reply for possible way to do this. I think the larger issue
is bulkstat, which I'm not sure should be converted or not.

> The kuid_t/kgid_t is actually pushed down this far - it's in the
> struct inode - the code currently uses the on-disk XFS uid/gid,
> not the struct inode's kuid_t/kgid_t. That's easily fixable.

Yep, I'll go through the code and switch to the inode where possible.

> Indeed, that's where most of the work needs to be done in XFS -
> using VFS_I(ip)->i_uid instead of ip->i_d.di_uid in places where we
> aren't dealing with reading or modifying the on-disk structures of
> XFS. Once that is done, we will have driven kuid_t/kgid_t as far
> down as the in-memory/on-disk format boundary allows, and we'll end
> up catching all the sorts of problems you are worried about.
> 
> > Which can be called by an
> > unprivileged user.
> 
> That's an oversight. Needs fixing.
> 
> > I honestly don't think avoiding the push down of kuid_t and kgid_t
> > to all of the xfs in-core data structures is safe.  Even if the
> > initial patch is safe I expect there will be silent breakage when
> > the next ioctl that bypasses the vfs is added.
> 
> This problem isn't isolated to XFS - ioctls are added to ext4,
> btrfs, gfs2, etc in every release and they all face the same
> problems.  Hence trying to paint it as an XFS problem is really
> missing the mark....
> 
> I'd really like to see some regressions tests in xfstests that we
> can use to confirm that filesystems have implemented namespaces
> properly (e.g. quota set/get/report tests). That would go a long way
> to ensuring that people don't inadvertently break this. I'm sure you have some
> test scripts for testing all the changes you made, so sharing them
> would help us a lot.
> 
> [ Hmmmm - the quota netlink warning interface - it just takes a
> warning for a specific kqid and emits it to all listeners on the
> netlink interface. There's no namespace awareness there, so what
> stops quota warning messages from one namespace being heard in all
> other namespaces? It's not network namespace aware.... ]
> 
> As it is, I'm far more concerned about the security problems
> existing ioctls and interfaces pose for user namespaces. i.e. that
> CAP_SYS_ADMIN in any namespace can use bulkstat and then the
> fs/fhandle.c interfaces to find and access any file in the
> filesystem regardless of namespace restrictions. You can guess
> filehandles pretty trivially, so you don't need bulkstat and you
> don't need XFS for this to be a problem....
> 
> Further, bulkstat is used by backup utilities, so I think it needs
> the unmodified uid/gid/prid to be passed out, and the restore
> program needs a way to push them all back in unmodified. How would
> you propose we go about doing this? Alternatively, could you tell us
> how a sub-namespace level backup/restore program is supposed to
> handle/detect/avoid being invoked from inside a restricted,
> non-global namespace?
> 
> Cheers,
> 
> Dave.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 13:54   ` Dwight Engen
@ 2013-06-20 15:27     ` Brian Foster
  2013-06-20 17:39       ` Dwight Engen
  2013-06-20 22:03     ` Dave Chinner
  1 sibling, 1 reply; 46+ messages in thread
From: Brian Foster @ 2013-06-20 15:27 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, xfs

On 06/20/2013 09:54 AM, Dwight Engen wrote:
> On Thu, 20 Jun 2013 10:13:41 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
>> On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
>>> Use uint32 from init_user_ns for xfs internal uid/gid
>>> representation in acl, xfs_icdinode. Conversion of kuid/gid is done
>>> at the vfs boundary, other user visible xfs specific interfaces
>>> (bulkstat, eofblocks filter) expect uint32 init_user_ns uid/gid
>>> values.
>>
>> It's minimal, but I'm not sure it's complete. I'll comment on that
>> in response to Eric's comments...
>>
>>> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
>> ....
>>> --- a/fs/xfs/xfs_fs.h
>>> +++ b/fs/xfs/xfs_fs.h
>>> @@ -347,8 +347,8 @@ typedef struct xfs_error_injection {
>>>  struct xfs_eofblocks {
>>>  	__u32		eof_version;
>>>  	__u32		eof_flags;
>>> -	uid_t		eof_uid;
>>> -	gid_t		eof_gid;
>>> +	__u32		eof_uid;
>>> +	__u32		eof_gid;
>>>  	prid_t		eof_prid;
>>>  	__u32		pad32;
>>>  	__u64		eof_min_file_size;
>>
>> The patch doesn't do namespace conversion of these uid/gids, but I'm
>> not sure it should...
> 
> The ids are only advisory, if the caller doesn't specify
> XFS_EOF_FLAGS_?ID blocks from any inode in the fs can be reclaimed
> regardless of id. Because of this, I think at a minimum we should
> change XFS_IOC_FREE_EOFBLOCKS to require capable(CAP_SYS_ADMIN), which
> somewhat implies init_user_ns based ids.
> 
> To make this really userns aware, I think we could:
>   - leave the fields as uid_t
>   - make XFS_IOC_FREE_EOFBLOCKS require nsown_capable(CAP_SYS_ADMIN)
>   - check kuid_has_mapping(current_user_ns()) for each
>     inode. This would be a change in behavior when called
>     from !init_user_ns, limiting the scope of inodes the ioctl could
>     affect.
>   - change xfs_inode_match_id() to use uid_eq(VFS_I(ip)->i_uid,
>     make_kuid(current_user_ns(), eofb->eof_uid))
> 
> Does that sound reasonable?
> 

Hi Dwight,

If I understand correctly, the proposition is to turn
XFS_IOC_FREE_EOFBLOCKS into administrator-only functionality and run ns
conversions on the inode uid/gid and associated eofb values for the ID
filtering functionality.

The latter sounds reasonable to me, though I'm not so sure about the
CAP_SYS_ADMIN bit. For example, I think we'd expect a regular user to be
able to run an eofblocks scan against files covered under his quota.

Perhaps the right thing to do here is to restrict global (and project
quota?) scans to CAP_SYS_ADMIN and uid/gid based scans to processes with
the appropriate permissions (i.e., CAP_SYS_ADMIN, matching uid/gid or
CAP_FOWNER). Thoughts?

Brian

>>> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
>>> index 96e344e..70ba410 100644
>>> --- a/fs/xfs/xfs_icache.c
>>> +++ b/fs/xfs/xfs_icache.c
>>> @@ -617,7 +617,7 @@ restart:
>>>  
>>>  /*
>>>   * Background scanning to trim post-EOF preallocated space. This
>>> is queued
>>> - * based on the 'background_prealloc_discard_period' tunable (5m
>>> by default).
>>> + * based on the 'speculative_prealloc_lifetime' tunable (5m by
>>> default). */
>>>  STATIC void
>>>  xfs_queue_eofblocks(
>>> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
>>> index 7f7be5f..8049976 100644
>>> --- a/fs/xfs/xfs_inode.c
>>> +++ b/fs/xfs/xfs_inode.c
>>> @@ -1268,8 +1268,8 @@ xfs_ialloc(
>>>  	ip->i_d.di_onlink = 0;
>>>  	ip->i_d.di_nlink = nlink;
>>>  	ASSERT(ip->i_d.di_nlink == nlink);
>>> -	ip->i_d.di_uid = current_fsuid();
>>> -	ip->i_d.di_gid = current_fsgid();
>>> +	ip->i_d.di_uid = from_kuid(&init_user_ns, current_fsuid());
>>> +	ip->i_d.di_gid = from_kgid(&init_user_ns, current_fsgid());
>>
>> Why are all new inodes created in the init_user_ns? Shouldn't this be
>> current_user_ns()?
> 
> current_fsuid() is the kuid_t from whatever userns current is in,
> which we are converting to a flat uint32 since i_d is the on disk inode.
> This field is then used in xfs_setup_inode() to populate
> VFS_I(ip)->i_uid. Most other filesystems would use inode_init_owner(),
> but xfs does not (I assume because it wants to handle SGID itself based
> on XFS_INHERIT_GID and irix_sgid_inherit).
> 
>> Same question throughout - why do you use init_user_ns for all these
>> UID conversions, when the whole point is to have awareness of
>> different namespaces?
> 
> Yep, there are instances you point out below where we can just use
> inode->i_uid instead of converting back from the flat value, so I'll fix
> those up.
> 
>>>  	xfs_set_projid(ip, prid);
>>>  	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
>>>  
>>> @@ -1308,7 +1308,7 @@ xfs_ialloc(
>>>  	 */
>>>  	if ((irix_sgid_inherit) &&
>>>  	    (ip->i_d.di_mode & S_ISGID) &&
>>> -	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
>>> +	    (!in_group_p(make_kgid(&init_user_ns,
>>> ip->i_d.di_gid)))) { ip->i_d.di_mode &= ~S_ISGID;
>>>  	}
>>>  
>>> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
>>> index 5e99968..daa6127 100644
>>> --- a/fs/xfs/xfs_ioctl.c
>>> +++ b/fs/xfs/xfs_ioctl.c
>>> @@ -981,7 +981,7 @@ xfs_ioctl_setattr(
>>>  	 * to the file owner ID, except in cases where the
>>>  	 * CAP_FSETID capability is applicable.
>>>  	 */
>>> -	if (current_fsuid() != ip->i_d.di_uid
>>> && !capable(CAP_FOWNER)) {
>>> +	if (!inode_owner_or_capable(&ip->i_vnode)) {
>>
>> 				    VFS_I(ip)
>>
>>>  		code = XFS_ERROR(EPERM);
>>>  		goto error_return;
>>>  	}
>>> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
>>> index ca9ecaa..bf96cf8 100644
>>> --- a/fs/xfs/xfs_iops.c
>>> +++ b/fs/xfs/xfs_iops.c
>>> @@ -420,8 +420,8 @@ xfs_vn_getattr(
>>>  	stat->dev = inode->i_sb->s_dev;
>>>  	stat->mode = ip->i_d.di_mode;
>>>  	stat->nlink = ip->i_d.di_nlink;
>>> -	stat->uid = ip->i_d.di_uid;
>>> -	stat->gid = ip->i_d.di_gid;
>>> +	stat->uid = make_kuid(&init_user_ns, ip->i_d.di_uid);
>>> +	stat->gid = make_kgid(&init_user_ns, ip->i_d.di_gid);
>>
>> Why not:
>>
>> 	stat->uid = inode->i_uid;
>> 	stat->gid = inode->i_gid;
>>
>>>  	stat->ino = ip->i_ino;
>>>  	stat->atime = inode->i_atime;
>>>  	stat->mtime = inode->i_mtime;
>>> @@ -488,8 +488,8 @@ xfs_setattr_nonsize(
>>>  	int			mask = iattr->ia_valid;
>>>  	xfs_trans_t		*tp;
>>>  	int			error;
>>> -	uid_t			uid = 0, iuid = 0;
>>> -	gid_t			gid = 0, igid = 0;
>>> +	kuid_t			uid = GLOBAL_ROOT_UID, iuid
>>> = GLOBAL_ROOT_UID;
>>> +	kgid_t			gid = GLOBAL_ROOT_GID, igid
>>> = GLOBAL_ROOT_GID; struct xfs_dquot	*udqp = NULL, *gdqp =
>>> NULL; struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
>>>  
>>> @@ -522,13 +522,13 @@ xfs_setattr_nonsize(
>>>  			uid = iattr->ia_uid;
>>>  			qflags |= XFS_QMOPT_UQUOTA;
>>>  		} else {
>>> -			uid = ip->i_d.di_uid;
>>> +			uid = make_kuid(&init_user_ns,
>>> ip->i_d.di_uid);
>>
>> 			uid = VFS_I(ip)->i_uid;
>>>  		}
>>>  		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
>>>  			gid = iattr->ia_gid;
>>>  			qflags |= XFS_QMOPT_GQUOTA;
>>>  		}  else {
>>> -			gid = ip->i_d.di_gid;
>>> +			gid = make_kgid(&init_user_ns,
>>> ip->i_d.di_gid);
>>
>> 			gid = VFS_I(ip)->i_gid;
>>>  		}
>> .....
>>> @@ -561,8 +563,8 @@ xfs_setattr_nonsize(
>>>  		 * while we didn't have the inode locked, inode's
>>> dquot(s)
>>>  		 * would have changed also.
>>>  		 */
>>> -		iuid = ip->i_d.di_uid;
>>> -		igid = ip->i_d.di_gid;
>>> +		iuid = make_kuid(&init_user_ns, ip->i_d.di_uid);
>>> +		igid = make_kgid(&init_user_ns, ip->i_d.di_gid);
>>>  		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
>>>  		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
>>
>> Same again - you can just use VFS_I(ip)->i_uid/VFS_I(ip)->i_gid
>> here.
>>
>>> @@ -1172,8 +1174,8 @@ xfs_setup_inode(
>>>  
>>>  	inode->i_mode	= ip->i_d.di_mode;
>>>  	set_nlink(inode, ip->i_d.di_nlink);
>>> -	inode->i_uid	= ip->i_d.di_uid;
>>> -	inode->i_gid	= ip->i_d.di_gid;
>>> +	inode->i_uid	= make_kuid(&init_user_ns,
>>> ip->i_d.di_uid);
>>> +	inode->i_gid	= make_kgid(&init_user_ns,
>>> ip->i_d.di_gid);
>>
>> current name space?
> 
> I believe that is what this is doing, but I think it will be more
> proper to do it the same as other filesystems:
> 
> i_uid_write(inode, ip->i_d.di_uid);
> i_gid_write(inode, ip->i_d.di_gid);
> 
>>>  
>>>  	switch (inode->i_mode & S_IFMT) {
>>>  	case S_IFBLK:
>>> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
>>> index b75c9bb..94a2a8f 100644
>>> --- a/fs/xfs/xfs_qm.c
>>> +++ b/fs/xfs/xfs_qm.c
>>> @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
>>>  int
>>>  xfs_qm_vop_dqalloc(
>>>  	struct xfs_inode	*ip,
>>> -	uid_t			uid,
>>> -	gid_t			gid,
>>> +	__uint32_t		di_uid,
>>> +	__uint32_t		di_gid,
>>
>> xfs_dqid_t
>>
>> And there's no need to rename the variables - that just causes
>> unnecessary churn, and the fact it is a dquot ID is documented by
>> the type.
> 
> Yep using the type will be clearer, thanks.
> 
>> Cheers,
>>
>> Dave.
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 15:27     ` Brian Foster
@ 2013-06-20 17:39       ` Dwight Engen
  2013-06-20 19:12         ` Brian Foster
  0 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-20 17:39 UTC (permalink / raw)
  To: Brian Foster; +Cc: Eric W. Biederman, xfs

On Thu, 20 Jun 2013 11:27:04 -0400
Brian Foster <bfoster@redhat.com> wrote:

> On 06/20/2013 09:54 AM, Dwight Engen wrote:
> > On Thu, 20 Jun 2013 10:13:41 +1000
> > Dave Chinner <david@fromorbit.com> wrote:
> > 
> >> On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> >>> Use uint32 from init_user_ns for xfs internal uid/gid
> >>> representation in acl, xfs_icdinode. Conversion of kuid/gid is
> >>> done at the vfs boundary, other user visible xfs specific
> >>> interfaces (bulkstat, eofblocks filter) expect uint32
> >>> init_user_ns uid/gid values.
> >>
> >> It's minimal, but I'm not sure it's complete. I'll comment on that
> >> in response to Eric's comments...
> >>
> >>> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> >> ....
> >>> --- a/fs/xfs/xfs_fs.h
> >>> +++ b/fs/xfs/xfs_fs.h
> >>> @@ -347,8 +347,8 @@ typedef struct xfs_error_injection {
> >>>  struct xfs_eofblocks {
> >>>  	__u32		eof_version;
> >>>  	__u32		eof_flags;
> >>> -	uid_t		eof_uid;
> >>> -	gid_t		eof_gid;
> >>> +	__u32		eof_uid;
> >>> +	__u32		eof_gid;
> >>>  	prid_t		eof_prid;
> >>>  	__u32		pad32;
> >>>  	__u64		eof_min_file_size;
> >>
> >> The patch doesn't do namespace conversion of these uid/gids, but
> >> I'm not sure it should...
> > 
> > The ids are only advisory, if the caller doesn't specify
> > XFS_EOF_FLAGS_?ID blocks from any inode in the fs can be reclaimed
> > regardless of id. Because of this, I think at a minimum we should
> > change XFS_IOC_FREE_EOFBLOCKS to require capable(CAP_SYS_ADMIN),
> > which somewhat implies init_user_ns based ids.
> > 
> > To make this really userns aware, I think we could:
> >   - leave the fields as uid_t
> >   - make XFS_IOC_FREE_EOFBLOCKS require nsown_capable(CAP_SYS_ADMIN)
> >   - check kuid_has_mapping(current_user_ns()) for each
> >     inode. This would be a change in behavior when called
> >     from !init_user_ns, limiting the scope of inodes the ioctl could
> >     affect.
> >   - change xfs_inode_match_id() to use uid_eq(VFS_I(ip)->i_uid,
> >     make_kuid(current_user_ns(), eofb->eof_uid))
> > 
> > Does that sound reasonable?
> > 
> 
> Hi Dwight,
> 
> If I understand correctly, the proposition is to turn
> XFS_IOC_FREE_EOFBLOCKS into administrator-only functionality and run
> ns conversions on the inode uid/gid and associated eofb values for
> the ID filtering functionality.

Hi Brian, yeah that was the proposal :) I think there are really two
issues here. One is that the uid_t/gid_t may come from a userns so we
should be aware of that. Currently the ids passed in are used for
*filtering* so a malicious user can't do anything more than they
already can by not passing ids at all, but we should fix this so only
the intended files are affected. Second is that currently the ioctl
allows an unprivileged user to affect another user (as Eric pointed
out):

> I am a little dubious about XFS_IOC_FREE_EOFBLOCKS allowing any
> user to affect any other user.  Your changes just seem to make
> it guaranteed that when called from a user namespace the wrong
> user will be affected.

I don't think the nsown_capable() check I proposed is enough to take care
of this. Do you agree that if the caller is going to affect other
users, they should be CAP_SYS_ADMIN (or maybe CAP_FOWNER) in
init_user_ns?

> The latter sounds reasonable to me, though I'm not so sure about the
> CAP_SYS_ADMIN bit. For example, I think we'd expect a regular user to
> be able to run an eofblocks scan against files covered under his
> quota.
>
> Perhaps the right thing to do here is to restrict global (and project
> quota?) scans to CAP_SYS_ADMIN and uid/gid based scans to processes
> with the appropriate permissions (i.e., CAP_SYS_ADMIN, matching
> uid/gid or CAP_FOWNER). Thoughts?

That sounds good to me. Maybe for a regular user the appropriate
permission check (at the top of xfs_inode_free_eofblocks()) could be
something like:

	if (!capable(CAP_SYS_ADMIN) &&
	    !uid_eq(VFS_I(ip)->i_uid, current_fsuid()) &&
	    !in_group_p(VFS_I(ip)->i_gid))
		return 0;

This has the drawback that the caller won't know if they supplied a
uid/gid in eofblocks that won't actually get cleared, so maybe we
want to validate a uid/gid in eofblocks after it's copy_from_user()ed
in instead? Also, I'm not sure if this is the same as "under his quota"
and how it plays with project quotas.

> Brian

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 17:39       ` Dwight Engen
@ 2013-06-20 19:12         ` Brian Foster
  2013-06-20 22:12           ` Dave Chinner
  2013-06-20 22:45           ` Eric W. Biederman
  0 siblings, 2 replies; 46+ messages in thread
From: Brian Foster @ 2013-06-20 19:12 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, xfs

On 06/20/2013 01:39 PM, Dwight Engen wrote:
> On Thu, 20 Jun 2013 11:27:04 -0400
> Brian Foster <bfoster@redhat.com> wrote:
> 
>> On 06/20/2013 09:54 AM, Dwight Engen wrote:
>>> On Thu, 20 Jun 2013 10:13:41 +1000
>>> Dave Chinner <david@fromorbit.com> wrote:
>>>
>>>> On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
...
>>
>> Hi Dwight,
>>
>> If I understand correctly, the proposition is to turn
>> XFS_EOF_FREE_EOFBLOCKS into administrator only functionality and run
>> ns conversions on the inode uid/gid and associated eofb values for
>> the ID filtering functionality.
> 
> Hi Brian, yeah that was the proposal :) I think there are really two
> issues here. One is that the uid_t/gid_t may come from a userns so we
> should be aware of that. Currently the ids passed in are used for
> *filtering* so a malicious user can't do anything more than they
> already can by not passing ids at all, but we should fix this so only
> the intended files are affected. Second is that currently the ioctl
> allows an unprivileged user to affect another user (as Eric pointed
> out):
> 
>> I am a little dubious about XFS_IOC_FREE_EOFBLOCKS allowing any
>> user to affect any other user.  Your changes just seem to make
>> it guaranteed that when called from a user namespace the wrong
>> user will be affected.
> 
> I don't think the nsown_capability() I proposed is enough to take care
> of this. Do you agree that if the caller is going to affect other
> users, they should be CAP_SYS_ADMIN (or maybe CAP_FOWNER) in
> init_user_ns?
> 

Yeah, that's what I was getting at below by restricting "global" scans
to admin privilege.

>> The latter sounds reasonable to me, though I'm not so sure about the
>> CAP_SYS_ADMIN bit. For example, I think we'd expect a regular user to
>> be able to run an eofblocks scan against files covered under his
>> quota.
>>
>> Perhaps the right thing to do here is to restrict global (and project
>> quota?) scans to CAP_SYS_ADMIN and uid/gid based scans to processes
>> with the appropriate permissions (i.e., CAP_SYS_ADMIN, matching
>> uid/gid or CAP_FOWNER). Thoughts?
> 
> That sounds good to me. Maybe for a regular user the appropriate
> permission check (at the top of xfs_inode_free_eofblocks()) could be
> something like:
> 

I think the various capability/permission checks should be in the ioctl
code. xfs_icache_free_eofblocks() and below are internal interfaces
where these checks are probably not relevant. I actually have code lying
around that creates an internal structure for that code, similar but
separate from the xfs_eofblocks structure.

> 	if (!capable(CAP_SYS_ADMIN) &&
> 	    !uid_eq(VFS_I(ip)->i_uid, current_fsuid()) &&
> 	    !in_group_p(VFS_I(ip)->i_gid))
> 		return 0;
> 

This is a little confusing (and pardon me, I'm a bit new to the
namespace work). What might be a bit more clear is to do the capability
checks against the EOFBLOCKS command flags in xfs_file_ioctl() and
return an appropriate error if permission is not granted for the
requested type of scan (i.e., a regular user doing a global or non-id
matching scan). Then restrict the changes in xfs_icache_free_eofblocks()
to just dealing with the namespace conversions.

This would still allow use cases such as the pending code I have that
invokes an eofblocks scan on write() failure due to EDQUOT/ENOSPC in the
case of project or user/group quotas. I suspect adding the namespace
conversion stuff wouldn't break the typical user/group quota case, but
we'd still require the ability to run a project quota scan from a
particular user context. I think the combined check you have above would
break that.

Brian

> This has the drawback that the caller won't know if they supplied a
> uid/gid in eofblocks that won't actually get cleared, so maybe we
> want to validate a uid/gid in eofblocks after it's copy_from_user()ed
> in instead? Also, I'm not sure if this is the same as "under his quota"
> and how it plays with project quotas.
> 
>> Brian

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 13:54     ` Dwight Engen
@ 2013-06-20 21:10       ` Dave Chinner
  0 siblings, 0 replies; 46+ messages in thread
From: Dave Chinner @ 2013-06-20 21:10 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, xfs

On Thu, Jun 20, 2013 at 09:54:19AM -0400, Dwight Engen wrote:
> On Thu, 20 Jun 2013 11:41:33 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > On Wed, Jun 19, 2013 at 01:35:30PM -0700, Eric W. Biederman wrote:
> > > 
> > > I am copying my gmail address so I have a chance of seeing replies
> > > > from Dave Chinner.  So far the only way I have been able to read his
> > > > replies has been to read mailing lists, which has not been
> > > > conducive to having this code discussed properly.  Hopefully
> > > copying my gmail address will allow us to have a reasonable and
> > > timely conversation.
> > > 
> > > 
> > > Dwight Engen <dwight.engen@oracle.com> writes:
> > > 
> > > > Use uint32 from init_user_ns for xfs internal uid/gid
> > > > representation in acl, xfs_icdinode. 
> > > 
> > > From my review of the code earlier that just isn't safe.  It allows
> > > all kinds of things to slip through.
> > 
> > Such as?
> 
> Maybe saying "at the vfs boundary" is misleading, I guess I don't see
> how this is all that different from what you did in the other
> filesystems. Using ext4 as the example the conversions are done between:
>  struct inode <-> struct ext4_inode
>  struct posix_acl <-> ext4_acle_entry
> 
> which in xfs is analogous to
>  struct inode <-> struct xfs_icdinode
>  struct posix acl <-> struct xfs_acl_entry
> 
> which is where I did the conversions.

Yup, that's where they should occur for XFS.

> > The kuid_t/kgid_t is actually pushed down this far - it's in the
> > struct inode - the code currently uses the on-disk XFS uid/gid,
> > not the struct inode's kuid_t/kgid_t. That's easily fixable.
> 
> Yep, I'll go through the code and switch to the inode where possible.

Cool. We'll need to be careful, though - there are some code paths
that XFS inodes can pass through where the VFS_I(ip) hasn't been
initialised. Let me worry about this during review, though ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 13:54   ` Dwight Engen
  2013-06-20 15:27     ` Brian Foster
@ 2013-06-20 22:03     ` Dave Chinner
  2013-06-21 15:14       ` Dwight Engen
  1 sibling, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2013-06-20 22:03 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, xfs

On Thu, Jun 20, 2013 at 09:54:10AM -0400, Dwight Engen wrote:
> On Thu, 20 Jun 2013 10:13:41 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> > > Use uint32 from init_user_ns for xfs internal uid/gid
> > > representation in acl, xfs_icdinode. Conversion of kuid/gid is done
> > > at the vfs boundary, other user visible xfs specific interfaces
> > > (bulkstat, eofblocks filter) expect uint32 init_user_ns uid/gid
> > > values.
> > 
> > It's minimal, but I'm not sure it's complete. I'll comment on that
> > in response to Eric's comments...
...
> > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > index 7f7be5f..8049976 100644
> > > --- a/fs/xfs/xfs_inode.c
> > > +++ b/fs/xfs/xfs_inode.c
> > > @@ -1268,8 +1268,8 @@ xfs_ialloc(
> > >  	ip->i_d.di_onlink = 0;
> > >  	ip->i_d.di_nlink = nlink;
> > >  	ASSERT(ip->i_d.di_nlink == nlink);
> > > -	ip->i_d.di_uid = current_fsuid();
> > > -	ip->i_d.di_gid = current_fsgid();
> > > +	ip->i_d.di_uid = from_kuid(&init_user_ns, current_fsuid());
> > > +	ip->i_d.di_gid = from_kgid(&init_user_ns, current_fsgid());
> > 
> > > Why are all new inodes created in the init_user_ns? Shouldn't this be
> > current_user_ns()?
> 
> current_fsuid() is the kuid_t from whatever userns current is in,

Yes.

> which we are converting to a flat uint32 since i_d is the on disk inode.

So, init_user_ns is actually the target namespace of the conversion,
not the namespace we are converting *from*?

<sigh>

I *knew* this was going to happen when I first saw this code:

http://www.spinics.net/lists/netdev/msg209217.html

"For those of us that have to look at it once every few months,
following the same conventions as all the other code in the kernel
(i.e. kqid_to_id()) tells me everything I need to know without
having to go through the process of looking up the unusual
from_kqid() function and then from_kuid() to find out what it is
actually doing...."

Here I am, several months later and trying to work out what the damn
conversion from_kgid() and make_kuid() are supposed to be doing and
trying to work out if it is correct or not...

> This field is then used in xfs_setup_inode() to populate
> VFS_I(ip)->i_uid.

But that then uses make_kuid(&init_user_ns, ip->i_d.di_uid) which
according to the documentation creates a kuid in the init_user_ns,
not in the current user namespace.

So if we then do uid_eq(current_fsuid(), VFS_I(ip)->i_uid) on the
newly created and initialised inode they are different, yes? If they
are different, then this code is not correct....

> Most other filesystems would use inode_init_owner(),
> but xfs does not (I assume because it wants to handle SGID itself based
> on XFS_INHERIT_GID and irix_sgid_inherit).

No, XFS doesn't use inode_init_owner() because we are initialising
the on disk XFS inode here, not the VFS struct inode...

> > > @@ -1172,8 +1174,8 @@ xfs_setup_inode(
> > >  
> > >  	inode->i_mode	= ip->i_d.di_mode;
> > >  	set_nlink(inode, ip->i_d.di_nlink);
> > > -	inode->i_uid	= ip->i_d.di_uid;
> > > -	inode->i_gid	= ip->i_d.di_gid;
> > > +	inode->i_uid	= make_kuid(&init_user_ns,
> > > ip->i_d.di_uid);
> > > +	inode->i_gid	= make_kgid(&init_user_ns,
> > > ip->i_d.di_gid);
> > 
> > current name space?
> 
> I believe that is what this is doing, but I think it will be more
> proper to do it the same as other filesystems:
> 
> i_uid_write(inode, ip->i_d.di_uid);
> i_gid_write(inode, ip->i_d.di_gid);

Sure, but that's still creating the uids/gids in the init_user_ns,
so it doesn't solve my confusion about *why* this is being done.
There's no documentation as to how this stuff is supposed to work,
so I can't find out for myself. I'm not one for cargo-cult
copy-n-paste development - I like to understand why something is
done before copying it...

So, to prevent me from wondering what it is doing in another 6
months time, can you add a set of helper functions that are named:

xfs_kuid_to_disk()
xfs_kuid_from_disk()
xfs_kgid_to_disk()
xfs_kgid_from_disk()

and document why we are using the namespaces that are being used,
and then use them where we convert to/from the different inode
structures?

FWIW, what happens when ip->i_d.di_gid doesn't have a mapping in the
current namespace, and make_kuid/make_kgid return
INVALID_UID/INVALID_GID? Is this going to happen, and if it does
what do we need to do about it? That will need to be added to the
comments, too.

At least if we get this done, XFS people will be able to tell at a
glance that the XFS code is doing the right thing w.r.t. namespace
conversion...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 19:12         ` Brian Foster
@ 2013-06-20 22:12           ` Dave Chinner
  2013-06-20 22:45           ` Eric W. Biederman
  1 sibling, 0 replies; 46+ messages in thread
From: Dave Chinner @ 2013-06-20 22:12 UTC (permalink / raw)
  To: Brian Foster; +Cc: Eric W. Biederman, Dwight Engen, xfs

On Thu, Jun 20, 2013 at 03:12:16PM -0400, Brian Foster wrote:
> On 06/20/2013 01:39 PM, Dwight Engen wrote:
> > On Thu, 20 Jun 2013 11:27:04 -0400
> > Brian Foster <bfoster@redhat.com> wrote:
> > 
> >> On 06/20/2013 09:54 AM, Dwight Engen wrote:
> >>> On Thu, 20 Jun 2013 10:13:41 +1000
> >>> Dave Chinner <david@fromorbit.com> wrote:
> >>>
> >>>> On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> ...
> >>
> >> Hi Dwight,
> >>
> >> If I understand correctly, the proposition is to turn
> >> XFS_EOF_FREE_EOFBLOCKS into administrator only functionality and run
> >> ns conversions on the inode uid/gid and associated eofb values for
> >> the ID filtering functionality.
> > 
> > Hi Brian, yeah that was the proposal :) I think there are really two
> > issues here. One is that the uid_t/gid_t may come from a userns so we
> > should be aware of that. Currently the ids passed in are used for
> > *filtering* so a malicious user can't do anything more than they
> > already can by not passing ids at all, but we should fix this so only
> > the intended files are affected. Second is that currently the ioctl
> > allows an unprivileged user to affect another user (as Eric pointed
> > out):
> > 
> >> I am a little dubious about XFS_IOC_FREE_EOFBLOCKS allowing any
> >> user to affect any other user.  Your changes just seem to make
> >> it guaranteed that when called from a user namespace the wrong
> >> user will be affected.
> > 
> > I don't think the nsown_capability() I proposed is enough to take care
> > of this. Do you agree that if the caller is going to affect other
> > users, they should be CAP_SYS_ADMIN (or maybe CAP_FOWNER) in
> > init_user_ns?
> > 
> 
> Yeah, that's what I was getting at below by restricting "global" scans
> to admin privilege.

Project quota scans are global scans, so when they are initiated by a
user through ioctls they should always be restricted to CAP_SYS_ADMIN.

> >> The latter sounds reasonable to me, though I'm not so sure about the
> >> CAP_SYS_ADMIN bit. For example, I think we'd expect a regular user to
> >> be able to run an eofblocks scan against files covered under his
> >> quota.
> >>
> >> Perhaps the right thing to do here is to restrict global (and project
> >> quota?) scans to CAP_SYS_ADMIN and uid/gid based scans to processes
> >> with the appropriate permissions (i.e., CAP_SYS_ADMIN, matching
> >> uid/gid or CAP_FOWNER). Thoughts?
> > 
> > That sounds good to me. Maybe for a regular user the appropriate
> > permission check (at the top of xfs_inode_free_eofblocks()) could be
> > something like:
> > 
> 
> I think the various capability/permission checks should be in the ioctl
> code.

Yes, the cap/perm checks should be done before anything else in
the ioctl.

> This would still allow use cases such as the pending code I have that
> invokes an eofblocks scan on write() failure due to EDQUOT/ENOSPC in the
> case of project or user/group quotas.

Right, we have to ensure this can occur without namespace
restriction, because ENOSPC is not something that is bound by user
namespaces.

> I suspect adding the namespace
> conversion stuff wouldn't break the typical user/group quota case, but

For EDQUOT, no, but for a global ENOSPC scan I think it could cause
problems.

> we'd still require the ability to run a project quota scan from a
> particular user context.  I think the combined check you have
> above would break that.

Yup, that still needs to work, as does the background scanner which
should not be subject to any restrictions at all ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 19:12         ` Brian Foster
  2013-06-20 22:12           ` Dave Chinner
@ 2013-06-20 22:45           ` Eric W. Biederman
  2013-06-20 23:35             ` Dave Chinner
  1 sibling, 1 reply; 46+ messages in thread
From: Eric W. Biederman @ 2013-06-20 22:45 UTC (permalink / raw)
  To: Brian Foster; +Cc: Eric W. Biederman, Dwight Engen, xfs

Brian Foster <bfoster@redhat.com> writes:

> On 06/20/2013 01:39 PM, Dwight Engen wrote:
>> On Thu, 20 Jun 2013 11:27:04 -0400
>> Brian Foster <bfoster@redhat.com> wrote:
>> 
>>> On 06/20/2013 09:54 AM, Dwight Engen wrote:
>>>> On Thu, 20 Jun 2013 10:13:41 +1000
>>>> Dave Chinner <david@fromorbit.com> wrote:
>>>>
>>>>> On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> ...
>>>
>>> Hi Dwight,
>>>
>>> If I understand correctly, the proposition is to turn
>>> XFS_EOF_FREE_EOFBLOCKS into administrator only functionality and run
>>> ns conversions on the inode uid/gid and associated eofb values for
>>> the ID filtering functionality.
>> 
>> Hi Brian, yeah that was the proposal :) I think there are really two
>> issues here. One is that the uid_t/gid_t may come from a userns so we
>> should be aware of that. Currently the ids passed in are used for
>> *filtering* so a malicious user can't do anything more than they
>> already can by not passing ids at all, but we should fix this so only
>> the intended files are affected. Second is that currently the ioctl
>> allows an unprivileged user to affect another user (as Eric pointed
>> out):
>> 
>>> I am a little dubious about XFS_IOC_FREE_EOFBLOCKS allowing any
>>> user to affect any other user.  Your changes just seem to make
>>> it guaranteed that when called from a user namespace the wrong
>>> user will be affected.
>> 
>> I don't think the nsown_capability() I proposed is enough to take care
>> of this. Do you agree that if the caller is going to affect other
>> users, they should be CAP_SYS_ADMIN (or maybe CAP_FOWNER) in
>> init_user_ns?
>> 
>
> Yeah, that's what I was getting at below by restricting "global" scans
> to admin privilege.
>
>>> The latter sounds reasonable to me, though I'm not so sure about the
>>> CAP_SYS_ADMIN bit. For example, I think we'd expect a regular user to
>>> be able to run an eofblocks scan against files covered under his
>>> quota.
>>>
>>> Perhaps the right thing to do here is to restrict global (and project
>>> quota?) scans to CAP_SYS_ADMIN and uid/gid based scans to processes
>>> with the appropriate permissions (i.e., CAP_SYS_ADMIN, matching
>>> uid/gid or CAP_FOWNER). Thoughts?
>> 
>> That sounds good to me. Maybe for a regular user the appropriate
>> permission check (at the top of xfs_inode_free_eofblocks()) could be
>> something like:
>> 
>
> I think the various capability/permission checks should be in the ioctl
> code. xfs_icache_free_eofblocks() and below are internal interfaces
> where these checks are probably not relevant. I actually have code lying
> around that creates an internal structure for that code, similar but
> separate from the xfs_eofblocks structure.
>
>> 	if (!capable(CAP_SYS_ADMIN) &&
>> 	    !uid_eq(VFS_I(ip)->i_uid, current_fsuid()) &&
>> 	    !in_group_p(VFS_I(ip)->i_gid))
>> 		return 0;
>> 


> This is a little confusing (and pardon me, I'm a bit new to the
> namespace work). 

Depending I think if we had a per inode check I would just use
inode_owner_or_capable().

That said I think it makes sense to figure out what the permission
checks should be without taking the user namespace into account first.
Generalizing and relaxing them in safe ways from that point is not too
difficult.

> What might be a bit more clear is to do the capability
> checks against the EOFBLOCKS command flags in xfs_file_ioctl() and
> return an appropriate error if permission is not granted for the
> requested type of scan (i.e., a regular user doing a global or non-id
> matching scan). Then restrict the changes in xfs_icache_free_eofblocks()
> to just dealing with the namespace conversions.

> This would still allow use cases such as the pending code I have that
> invokes an eofblocks scan on write() failure due to EDQUOT/ENOSPC in the
> case of project or user/group quotas. I suspect adding the namespace
> conversion stuff wouldn't break the typical user/group quota case, but
> we'd still require the ability to run a project quota scan from a
> particular user context. I think the combined check you have above would
> break that.

The general solution is to capture the credentials (struct cred) of the
write and perform the security checks against a passed in cred instead
of current_cred().

I have a question about the project quota.  Is it intended that any
user can set a project quota on any file?  Unless I am misreading
xfs_ioctl_setattr that is what it allows.

My narrow focus concern on this is that if the user is in a user
namespace these ids need to be mapped before we look at them or else
we will do the wrong thing.

Eric

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 22:45           ` Eric W. Biederman
@ 2013-06-20 23:35             ` Dave Chinner
  0 siblings, 0 replies; 46+ messages in thread
From: Dave Chinner @ 2013-06-20 23:35 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: Eric W. Biederman, Brian Foster, Dwight Engen, xfs

On Thu, Jun 20, 2013 at 03:45:43PM -0700, Eric W. Biederman wrote:
> I have a question about the project quota.  Is it intended that any
> user can set a project quota on any file?  Unless I am misreading
> xfs_ioctl_setattr that is what it allows.

Only on files they own. There is this check in xfs_ioctl_setattr():

        /*
         * CAP_FOWNER overrides the following restrictions:
         *
         * The user ID of the calling process must be equal
         * to the file owner ID, except in cases where the
         * CAP_FSETID capability is applicable.
         */
        if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
                code = XFS_ERROR(EPERM);
                goto error_return;
        }


> My narrow focus concern on this is that if the user is in a user
> namespace these ids need to be mapped before we look at them or else
> we will do the wrong thing.

The user IDs need to be mapped, yes, but do we want to map project
IDs? Project IDs are the property of the underlying filesystem, not
that of a user namespace. Users can change what project their files
are associated with, but they cannot change their UID or GID....

I can see reasons for wanting the same project quota id to be shared
across multiple namespaces. e.g.  setting up a directory tree quota
for a specific set of namespaces where you don't care about
individual namespace space usage but you want the group as a whole
to be limited.

Indeed, the use of project quotas as an external management tool for
limiting the filesystem space a namespace container can actually
consume makes an interesting argument for preventing access to
project quotas from any namespace other than the init_user_ns.

So, rather than saying "it must be mapped", how about we start by
thinking about how we want project quotas to be used in containerised
namespace configurations and work from there....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-20 22:03     ` Dave Chinner
@ 2013-06-21 15:14       ` Dwight Engen
  2013-06-24  0:33         ` Dave Chinner
  0 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-21 15:14 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Serge Hallyn, Eric W. Biederman, xfs

On Fri, 21 Jun 2013 08:03:11 +1000
Dave Chinner <david@fromorbit.com> wrote:

> On Thu, Jun 20, 2013 at 09:54:10AM -0400, Dwight Engen wrote:
> > On Thu, 20 Jun 2013 10:13:41 +1000
> > Dave Chinner <david@fromorbit.com> wrote:
> > 
> > > On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> > > > Use uint32 from init_user_ns for xfs internal uid/gid
> > > > representation in acl, xfs_icdinode. Conversion of kuid/gid is
> > > > done at the vfs boundary, other user visible xfs specific
> > > > interfaces (bulkstat, eofblocks filter) expect uint32
> > > > init_user_ns uid/gid values.
> > > 
> > > It's minimal, but I'm not sure it's complete. I'll comment on that
> > > in response to Eric's comments...
> ...
> > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > index 7f7be5f..8049976 100644
> > > > --- a/fs/xfs/xfs_inode.c
> > > > +++ b/fs/xfs/xfs_inode.c
> > > > @@ -1268,8 +1268,8 @@ xfs_ialloc(
> > > >  	ip->i_d.di_onlink = 0;
> > > >  	ip->i_d.di_nlink = nlink;
> > > >  	ASSERT(ip->i_d.di_nlink == nlink);
> > > > -	ip->i_d.di_uid = current_fsuid();
> > > > -	ip->i_d.di_gid = current_fsgid();
> > > > +	ip->i_d.di_uid = from_kuid(&init_user_ns,
> > > > current_fsuid());
> > > > +	ip->i_d.di_gid = from_kgid(&init_user_ns,
> > > > current_fsgid());
> > > 
> > > Why all new inodes be created in the init_user_ns? Shouldn't this
> > > be current_user_ns()?
> > 
> > current_fsuid() is the kuid_t from whatever userns current is in,
> 
> Yes.
> 
> > which we are converting to a flat uint32 since i_d is the on disk
> > inode.

I was incorrect here. current_fsuid() is the kuid_t from the
perspective of init_user_ns, so the value won't be different, just the
type.

> So, init_user_ns is actually the target namespace of the conversion,
> not the namespace we are converting *from*?
> 
> <sigh>
> 
> I *knew* this was going to happen when I first saw this code:
> 
> http://www.spinics.net/lists/netdev/msg209217.html
> 
> "For those of us that have to look at it once every few months,
> following the same conventions as all the other code in the kernel
> (i.e. kqid_to_id()) tells me everything I need to know without
> having to go through the process of looking up the unusual
> from_kqid() function and then from_kuid() to find out what it is
> actually doing...."
> 
> Here I am, several months later and trying to work out what the damn
> conversion from_kgid() and make_kuid() are supposed to be doing and
> trying to work out if it is correct or not...
> 
> > This field is then used in xfs_setup_inode() to populate
> > VFS_I(ip)->i_uid.
> 
> But that then uses make_kuid(&init_user_ns, ip->i_d.di_uid) which
> according to the documentation creates a kuid in the init_user_ns,
> not in the current user namespace.

Right, inside the kernel uids are all interpreted within
init_user_ns context and are only converted to a different value if the
caller is in some other userns at the kernel/user boundary (ie.
cp_compat_stat()). Maybe it is more clear to say that init_user_ns is
the "flat 32bit uid space" and thus has a mapping for every uid. So
the value in init_user_ns will actually be equal to the di_uid value.

> So if we then do uid_eq(current_fsuid(), VFS_I(ip)->i_uid) on the
> newly created and initialised inode they are different, yes? If they
> are different, then this code is not correct....

No, they should be equal because xfs_setup_inode() maps it back, see below.

> > Most other filesystems would use inode_init_owner(),
> > but xfs does not (I assume because it wants to handle SGID itself
> > based on XFS_INHERIT_GID and irix_sgid_inherit).
> 
> No, XFS doesn't use inode_init_owner() because we are initialising
> the on disk XFS inode here, not the VFS struct inode...

Right, but I was referring to xfs_setup_inode() which is setting up the
VFS inode. It is called in both the from disk and new inode paths, and
sets the VFS inode uid based on the value in the disk inode. I was
trying to show that in the new inode path the VFS inode uid is being
initialized the same as on other filesystems (ie from current_fsuid()),
but with an extra step just because of "conversion" to and back from
init_user_ns. The call sequence in xfs:

  xfs_ialloc():
    current_fsuid() -> from_kuid() -> di_uid
  xfs_setup_inode():
    di_uid -> make_kuid() -> inode.i_uid == current_fsuid()

Sorry, I think my first explanation wasn't clear, hopefully this is.

> > > > @@ -1172,8 +1174,8 @@ xfs_setup_inode(
> > > >  
> > > >  	inode->i_mode	= ip->i_d.di_mode;
> > > >  	set_nlink(inode, ip->i_d.di_nlink);
> > > > -	inode->i_uid	= ip->i_d.di_uid;
> > > > -	inode->i_gid	= ip->i_d.di_gid;
> > > > +	inode->i_uid	= make_kuid(&init_user_ns,
> > > > ip->i_d.di_uid);
> > > > +	inode->i_gid	= make_kgid(&init_user_ns,
> > > > ip->i_d.di_gid);
> > > 
> > > current name space?
> > 
> > I believe that is what this is doing, but I think it will be more
> > proper to do it the same as other filesystems:
> > 
> > i_uid_write(inode, ip->i_d.di_uid);
> > i_gid_write(inode, ip->i_d.di_gid);
> 
> Sure, but that's still creating the uids/gids in the init_user_ns,
> so it doesn't solve my confusion about *why* this is being done.
> There's no documentation as to how this stuff is supposed to work,
> so I can't find out for myself. I'm not one for cargo-cult
> copy-n-paste development - I like to understand why something is
> done before copying it...
> 
> So, to prevent me from wondering what it is doing in another 6
> months time, can you add a set of helper functions that are named:
> 
> xfs_kuid_to_disk()
> xfs_kuid_from_disk()
> xfs_kgid_to_disk()
> xfs_kgid_from_disk()
> 
> and document why we are using the namespaces that are being used,
> and then use them where we convert to/from the different inode
> structures?

Sure I can take a shot at that, and I'm guessing you would prefer to use

inode->i_uid = xfs_kuid_from_disk(ip->i_d.di_uid);

over

i_uid_write(inode, ip->i_d.di_uid);

> FWIW, what happens when ip->i_d.di_gid doesn't have a mapping in the
> current namespace, and make_kuid/make_kgid return
> INVALID_UID/INVALID_GID? Is this is going to happen, and if it does
> what do we need to do about it? That will need to be added to the
> comments, too.

I don't think it will happen here because init_user_ns has a mapping
for all values. Where it can happen is when there is a conversion to
some subset userns, that doesn't have a mapping for the value.
 
> At least if we get this done, XFS people will be able to tell at a
> glance that the XFs code is doing the right thing w.r.t namespace
> conversion...
> 
> Cheers,
> 
> Dave.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-21 15:14       ` Dwight Engen
@ 2013-06-24  0:33         ` Dave Chinner
  2013-06-24 13:10           ` [PATCH v2 RFC] " Dwight Engen
  0 siblings, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2013-06-24  0:33 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Serge Hallyn, Eric W. Biederman, xfs

On Fri, Jun 21, 2013 at 11:14:20AM -0400, Dwight Engen wrote:
> On Fri, 21 Jun 2013 08:03:11 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > On Thu, Jun 20, 2013 at 09:54:10AM -0400, Dwight Engen wrote:
> > > On Thu, 20 Jun 2013 10:13:41 +1000
> > > Dave Chinner <david@fromorbit.com> wrote:
> > > 
> > > > On Wed, Jun 19, 2013 at 11:09:48AM -0400, Dwight Engen wrote:
> > > > > Use uint32 from init_user_ns for xfs internal uid/gid
> > > > > representation in acl, xfs_icdinode. Conversion of kuid/gid is
> > > > > done at the vfs boundary, other user visible xfs specific
> > > > > interfaces (bulkstat, eofblocks filter) expect uint32
> > > > > init_user_ns uid/gid values.
> > > > 
> > > > It's minimal, but I'm not sure it's complete. I'll comment on that
> > > > in response to Eric's comments...
> > ...
> > > > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> > > > > index 7f7be5f..8049976 100644
> > > > > --- a/fs/xfs/xfs_inode.c
> > > > > +++ b/fs/xfs/xfs_inode.c
> > > > > @@ -1268,8 +1268,8 @@ xfs_ialloc(
> > > > >  	ip->i_d.di_onlink = 0;
> > > > >  	ip->i_d.di_nlink = nlink;
> > > > >  	ASSERT(ip->i_d.di_nlink == nlink);
> > > > > -	ip->i_d.di_uid = current_fsuid();
> > > > > -	ip->i_d.di_gid = current_fsgid();
> > > > > +	ip->i_d.di_uid = from_kuid(&init_user_ns,
> > > > > current_fsuid());
> > > > > +	ip->i_d.di_gid = from_kgid(&init_user_ns,
> > > > > current_fsgid());
> > > > 
> > > > Why all new inodes be created in the init_user_ns? Shouldn't this
> > > > be current_user_ns()?
> > > 
> > > current_fsuid() is the kuid_t from whatever userns current is in,
> > 
> > Yes.
> > 
> > > which we are converting to a flat uint32 since i_d is the on disk
> > > inode.
> 
> I was incorrect here. current_fsuid() is the kuid_t from the
> perspective of init_user_ns, so the value won't be different, just the
> type.

I guess I'm not the only one that is easily confused by this convoluted,
badly documented namespace conversion crap either....

> > So, init_user_ns is actually the target namespace of the conversion,
> > not the namespace we are converting *from*?
> > 
> > <sigh>
> > 
> > I *knew* this was going to happen when I first saw this code:
> > 
> > http://www.spinics.net/lists/netdev/msg209217.html
> > 
> > "For those of us that have to look at it once every few months,
> > following the same conventions as all the other code in the kernel
> > (i.e. kqid_to_id()) tells me everything I need to know without
> > having to go through the process of looking up the unusual
> > from_kqid() function and then from_kuid() to find out what it is
> > actually doing...."
> > 
> > Here I am, several months later and trying to work out what the damn
> > conversion from_kgid() and make_kuid() are supposed to be doing and
> > trying to work out if it is correct or not...
> > 
> > > This field is then used in xfs_setup_inode() to populate
> > > VFS_I(ip)->i_uid.
> > 
> > But that then uses make_kuid(&init_user_ns, ip->i_d.di_uid) which
> > according to the documentation creates a kuid in the init_user_ns,
> > not in the current user namespace.
> 
> Right, inside the kernel uids are all interpreted within
> init_user_ns context and are only converted to a different value if the
> caller is in some other userns at the kernel/user boundary (ie.
> cp_compat_stat()). Maybe it is more clear to say that init_user_ns is
> the "flat 32bit uid space" and thus has a mapping for every uid. So
> the value in init_user_ns will actually be equal to the di_uid value.

If that's the case, then why the hell do filesystems need to know
*anything* about namespaces and conversions? It should be
*completely* hidden from filesystem code with simple wrappers rather
than open coding init_user_ns conversions everywhere....

Indeed, for updating or getting the uid/gid from the struct inode
there are these helpers:

i_uid_read/i_uid_write
i_gid_read/i_gid_write

but there's nothing generic for the filesystems to use that just
take a kuid_t/kgid_t and return a raw value or vice versa. Nice.

> > So if we then do uid_eq(current_fsuid(), VFS_I(ip)->i_uid) on the
> > newly created and initialised inode they are different, yes? If they
> > are different, then this code is not correct....
> 
> No they should equal because xfs_setup_inode() maps it back, see below.
> 
> > > Most other filesystems would use inode_init_owner(),
> > > but xfs does not (I assume because it wants to handle SGID itself
> > > based on XFS_INHERIT_GID and irix_sgid_inherit).
> > 
> > No, XFS doesn't use inode_init_owner() because we are initialising
> > the on disk XFS inode here, not the VFS struct inode...
> 
> Right, but I was referring to xfs_setup_inode() which is setting up the
> VFS inode. It is called in both the from disk and new inode paths, and
> sets the VFS inode uid based on the value in the disk inode.

So perhaps your comment was in the wrong place? ;)

> I was
> trying to show that in the new inode path the VFS inode uid is being
> initialized the same as on other filesystems (ie from current_fsuid()),
> but with an extra step just because of "conversion" to and back from
> init_user_ns. The call sequence in xfs:
> 
>   xfs_ialloc():
>     current_fsuid() -> from_kuid() -> di_uid

	ip->i_d.di_uid = xfs_kuid_to_disk(current_fsuid());

>   xfs_setup_inode():
>     di_uid -> make_kuid() -> inode.i_uid == current_fsuid()

	inode->i_uid = xfs_kuid_from_disk(ip->i_d.di_uid);


> Sorry, I think my first explanation wasn't clear, hopefully this is.

Well, it's taken me another hour of battling through the
undocumented crap that is this namespace code starting from
setuid(), but I understand the conversions being done now. I dislike
the implementation even more now, and I'm still wondering what the
hell we are supposed to do with bulkstat, filehandles and stuff like
xfs_fsr, backups, etc...

> > So, to prevent me from wondering what it is doing in another 6
> > months time, can you add a set of helper functions that are named:
> > 
> > xfs_kuid_to_disk()
> > xfs_kuid_from_disk()
> > xfs_kgid_to_disk()
> > xfs_kgid_from_disk()
> > 
> > and document why we are using the namespaces that are being used,
> > and then use them where we convert to/from the different inode
> > structures?
> 
> Sure I can take a shot at that, and I'm guessing you would prefer to use
> 
> inode->i_uid = xfs_kuid_from_disk(ip->i_d.di_uid);
> 
> over
> 
> i_uid_write(inode, ip->i_d.di_uid);

Yes, because i_uid_write() is not a generic helper function, and we
convert to/from disk format in many places where we aren't actually
reading/writing the struct inode. i.e. I don't see any reason at all
for having the variable init_user_ns anywhere in the XFS code
except in those wrapper functions....

> > FWIW, what happens when ip->i_d.di_gid doesn't have a mapping in the
> > current namespace, and make_kuid/make_kgid return
> > INVALID_UID/INVALID_GID? Is this is going to happen, and if it does
> > what do we need to do about it? That will need to be added to the
> > comments, too.
> 
> I don't think it will happen here because init_user_ns has a mapping
> for all values. Where it can happen is when there is a conversion to
> some subset userns, that doesn't have a mapping for the value.

Yup, understood.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-24  0:33         ` Dave Chinner
@ 2013-06-24 13:10           ` Dwight Engen
  2013-06-25 16:46             ` Brian Foster
                               ` (3 more replies)
  0 siblings, 4 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-24 13:10 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Hi Dave, all. Here is a v2 patch that I believe addresses the previous
comments, but I expect there to be more :) I think there are a few
more issues to sort out before this is ready, and I want to add some
tests to xfstests also.

I added permission checks for eofblocks in the ioctl code, but
I don't think they are enough. Just because an unprivileged
caller is in a group doesn't mean he can write to a file of that group,
and I don't know how we can check for that till we get the inode in
hand. Brian, if you or anyone else could comment on how this should work
for the regular user and write() ENOSPC cases that'd be great.

The xfs code now uses inode->i_uid where possible instead of di_uid.
The remaining uses of di_uid are where the inode is being setup,
conversion to/from disk endianness, in dealing with quotas, and bulkstat.

We do need to decide on the di_uid that comes back from bulkstat.
Right now it is returning on disk (== init_user_ns) uids. It looks to
me like xfsrestore is using the normal vfs routines (chown, fchown,
lchown) when restoring so that won't line up if the xfsrestore is run
in !init_user_ns. We could possibly convert to userns values
before returning them from the kernel, but I doubt that will work
well with the xfs quotas. Should we just require that callers of bulkstat
be in init_user_ns? Thoughts?


--

Use uint32 from init_user_ns for xfs internal uid/gid representation in
acl, xfs_icdinode, xfs_dqid_t. Conversion of kuid/gid is done for these
structures and for the eofblocks filter. Other user visible xfs specific
interfaces (bulkstat) expect uint32 init_user_ns uid/gid values.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_acl.c      | 20 ++++++++++++++++----
 fs/xfs/xfs_fs.h       |  2 +-
 fs/xfs/xfs_icache.c   |  6 +++---
 fs/xfs/xfs_inode.c    |  6 +++---
 fs/xfs/xfs_ioctl.c    | 37 ++++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_iops.c     | 38 ++++++++++++++++++++------------------
 fs/xfs/xfs_linux.h    | 35 +++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_qm.c       | 10 +++++-----
 fs/xfs/xfs_quota.h    |  9 +++++----
 fs/xfs/xfs_symlink.c  |  4 +++-
 fs/xfs/xfs_vnodeops.c |  4 +++-
 init/Kconfig          | 15 +--------------
 12 files changed, 129 insertions(+), 57 deletions(-)

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 306d883..b497ca2 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -68,14 +68,15 @@ xfs_acl_from_disk(
 
 		switch (acl_e->e_tag) {
 		case ACL_USER:
+			acl_e->e_uid = xfs_kuid_from_disk(be32_to_cpu(ace->ae_id));
+			break;
 		case ACL_GROUP:
-			acl_e->e_id = be32_to_cpu(ace->ae_id);
+			acl_e->e_gid = xfs_kgid_from_disk(be32_to_cpu(ace->ae_id));
 			break;
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			acl_e->e_id = ACL_UNDEFINED_ID;
 			break;
 		default:
 			goto fail;
@@ -101,7 +102,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
 		acl_e = &acl->a_entries[i];
 
 		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
-		ace->ae_id = cpu_to_be32(acl_e->e_id);
+		switch(acl_e->e_tag) {
+		case ACL_USER:
+			ace->ae_id = cpu_to_be32(xfs_kuid_to_disk(acl_e->e_uid));
+			break;
+		case ACL_GROUP:
+			ace->ae_id = cpu_to_be32(xfs_kgid_to_disk(acl_e->e_gid));
+			break;
+		default:
+			ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+			break;
+		}
+
 		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
 	}
 }
@@ -360,7 +372,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 		return -EINVAL;
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return value ? -EACCES : 0;
-	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (!value)
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d046955..6bc3da4 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -344,7 +344,7 @@ typedef struct xfs_error_injection {
  * Speculative preallocation trimming.
  */
 #define XFS_EOFBLOCKS_VERSION		1
-struct xfs_eofblocks {
+struct xfs_ueofblocks {
 	__u32		eof_version;
 	__u32		eof_flags;
 	uid_t		eof_uid;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e..2c35b13 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -617,7 +617,7 @@ restart:
 
 /*
  * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
  */
 STATIC void
 xfs_queue_eofblocks(
@@ -1202,11 +1202,11 @@ xfs_inode_match_id(
 	struct xfs_eofblocks	*eofb)
 {
 	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
-	    ip->i_d.di_uid != eofb->eof_uid)
+	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
 		return 0;
 
 	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
-	    ip->i_d.di_gid != eofb->eof_gid)
+	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
 		return 0;
 
 	if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7f7be5f..2dc9e66 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1268,8 +1268,8 @@ xfs_ialloc(
 	ip->i_d.di_onlink = 0;
 	ip->i_d.di_nlink = nlink;
 	ASSERT(ip->i_d.di_nlink == nlink);
-	ip->i_d.di_uid = current_fsuid();
-	ip->i_d.di_gid = current_fsgid();
+	ip->i_d.di_uid = xfs_kuid_to_disk(current_fsuid());
+	ip->i_d.di_gid = xfs_kgid_to_disk(current_fsgid());
 	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
@@ -1308,7 +1308,7 @@ xfs_ialloc(
 	 */
 	if ((irix_sgid_inherit) &&
 	    (ip->i_d.di_mode & S_ISGID) &&
-	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
+	    (!in_group_p(xfs_kgid_from_disk(ip->i_d.di_gid)))) {
 		ip->i_d.di_mode &= ~S_ISGID;
 	}
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e99968..d6e64d9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -981,7 +981,7 @@ xfs_ioctl_setattr(
 	 * to the file owner ID, except in cases where the
 	 * CAP_FSETID capability is applicable.
 	 */
-	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+	if (!inode_owner_or_capable(VFS_I(ip))) {
 		code = XFS_ERROR(EPERM);
 		goto error_return;
 	}
@@ -1610,7 +1610,8 @@ xfs_file_ioctl(
 		return -error;
 
 	case XFS_IOC_FREE_EOFBLOCKS: {
-		struct xfs_eofblocks eofb;
+		struct xfs_ueofblocks eofb;
+		struct xfs_eofblocks keofb;
 
 		if (copy_from_user(&eofb, arg, sizeof(eofb)))
 			return -XFS_ERROR(EFAULT);
@@ -1625,7 +1626,37 @@ xfs_file_ioctl(
 		    memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
 			return -XFS_ERROR(EINVAL);
 
-		error = xfs_icache_free_eofblocks(mp, &eofb);
+		keofb.eof_version = eofb.eof_version;
+		keofb.eof_flags = eofb.eof_flags;
+		keofb.eof_prid = eofb.eof_prid;
+		keofb.eof_min_file_size = eofb.eof_min_file_size;
+
+		if (eofb.eof_flags & XFS_EOF_FLAGS_UID) {
+			keofb.eof_uid = make_kuid(current_user_ns(), eofb.eof_uid);
+			if (!uid_valid(keofb.eof_uid))
+				return -XFS_ERROR(EINVAL);
+		}
+
+		if (eofb.eof_flags & XFS_EOF_FLAGS_GID) {
+			keofb.eof_gid = make_kgid(current_user_ns(), eofb.eof_gid);
+			if (!gid_valid(keofb.eof_gid))
+				return -XFS_ERROR(EINVAL);
+		}
+
+		if (!capable(CAP_SYS_ADMIN)) {
+			if (!(eofb.eof_flags & (XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_GID)))
+				return -XFS_ERROR(EPERM);
+
+			if ((eofb.eof_flags & XFS_EOF_FLAGS_UID) &&
+			    !uid_eq(current_fsuid(), keofb.eof_uid))
+				return -XFS_ERROR(EPERM);
+
+			if ((eofb.eof_flags & XFS_EOF_FLAGS_GID) &&
+			    !in_group_p(keofb.eof_gid))
+				return -XFS_ERROR(EPERM);
+		}
+
+		error = xfs_icache_free_eofblocks(mp, &keofb);
 		return -error;
 	}
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca9ecaa..5beabf4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -420,8 +420,8 @@ xfs_vn_getattr(
 	stat->dev = inode->i_sb->s_dev;
 	stat->mode = ip->i_d.di_mode;
 	stat->nlink = ip->i_d.di_nlink;
-	stat->uid = ip->i_d.di_uid;
-	stat->gid = ip->i_d.di_gid;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
 	stat->ino = ip->i_ino;
 	stat->atime = inode->i_atime;
 	stat->mtime = inode->i_mtime;
@@ -488,8 +488,8 @@ xfs_setattr_nonsize(
 	int			mask = iattr->ia_valid;
 	xfs_trans_t		*tp;
 	int			error;
-	uid_t			uid = 0, iuid = 0;
-	gid_t			gid = 0, igid = 0;
+	kuid_t			uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+	kgid_t			gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
 	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
 	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
 
@@ -522,13 +522,13 @@ xfs_setattr_nonsize(
 			uid = iattr->ia_uid;
 			qflags |= XFS_QMOPT_UQUOTA;
 		} else {
-			uid = ip->i_d.di_uid;
+			uid = inode->i_uid;
 		}
 		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
 			gid = iattr->ia_gid;
 			qflags |= XFS_QMOPT_GQUOTA;
 		}  else {
-			gid = ip->i_d.di_gid;
+			gid = inode->i_gid;
 		}
 
 		/*
@@ -538,8 +538,10 @@ xfs_setattr_nonsize(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
-					 qflags, &udqp, &gdqp);
+		error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_disk(uid),
+					   xfs_kgid_to_disk(gid),
+					   xfs_get_projid(ip),
+					   qflags, &udqp, &gdqp);
 		if (error)
 			return error;
 	}
@@ -561,8 +563,8 @@ xfs_setattr_nonsize(
 		 * while we didn't have the inode locked, inode's dquot(s)
 		 * would have changed also.
 		 */
-		iuid = ip->i_d.di_uid;
-		igid = ip->i_d.di_gid;
+		iuid = inode->i_uid;
+		igid = inode->i_gid;
 		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
 		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 
@@ -571,8 +573,8 @@ xfs_setattr_nonsize(
 		 * going to change.
 		 */
 		if (XFS_IS_QUOTA_RUNNING(mp) &&
-		    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-		     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+		    ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+		     (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
 			ASSERT(tp);
 			error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
@@ -602,17 +604,17 @@ xfs_setattr_nonsize(
 		 * Change the ownerships and register quota modifications
 		 * in the transaction.
 		 */
-		if (iuid != uid) {
+		if (!uid_eq(iuid, uid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 				ASSERT(mask & ATTR_UID);
 				ASSERT(udqp);
 				olddquot1 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_udquot, udqp);
 			}
-			ip->i_d.di_uid = uid;
+			ip->i_d.di_uid = xfs_kuid_to_disk(uid);
 			inode->i_uid = uid;
 		}
-		if (igid != gid) {
+		if (!gid_eq(igid, gid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 				ASSERT(!XFS_IS_PQUOTA_ON(mp));
 				ASSERT(mask & ATTR_GID);
@@ -620,7 +622,7 @@ xfs_setattr_nonsize(
 				olddquot2 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
-			ip->i_d.di_gid = gid;
+			ip->i_d.di_gid = xfs_kgid_to_disk(gid);
 			inode->i_gid = gid;
 		}
 	}
@@ -1172,8 +1174,8 @@ xfs_setup_inode(
 
 	inode->i_mode	= ip->i_d.di_mode;
 	set_nlink(inode, ip->i_d.di_nlink);
-	inode->i_uid	= ip->i_d.di_uid;
-	inode->i_gid	= ip->i_d.di_gid;
+	inode->i_uid    = xfs_kuid_from_disk(ip->i_d.di_uid);
+	inode->i_gid    = xfs_kgid_from_disk(ip->i_d.di_gid);
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFBLK:
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 800f896..80326da 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -159,6 +159,41 @@
 #define MAX(a,b)	(max(a,b))
 #define howmany(x, y)	(((x)+((y)-1))/(y))
 
+/* Kernel uid/gid conversion. These are used to convert to/from the on disk
+ * uid/gid types to the kuid_t/kgid_t types that the kernel uses internally.
+ * The conversion here is type only, the value will remain the same since we
+ * are converting to the init_user_ns. The uid is later mapped to a particular
+ * user namespace value when crossing the kernel/user boundary.
+ */
+static inline __uint32_t xfs_kuid_to_disk(kuid_t uid)
+{
+	return from_kuid(&init_user_ns, uid);
+}
+
+static inline kuid_t xfs_kuid_from_disk(__uint32_t uid)
+{
+	return make_kuid(&init_user_ns, uid);
+}
+
+static inline __uint32_t xfs_kgid_to_disk(kgid_t gid)
+{
+	return from_kgid(&init_user_ns, gid);
+}
+
+static inline kgid_t xfs_kgid_from_disk(__uint32_t gid)
+{
+	return make_kgid(&init_user_ns, gid);
+}
+
+struct xfs_eofblocks {
+	__u32		eof_version;
+	__u32		eof_flags;
+	kuid_t		eof_uid;
+	kgid_t		eof_gid;
+	prid_t		eof_prid;
+	__u64		eof_min_file_size;
+};
+
 /*
  * Various platform dependent calls that don't fit anywhere else
  */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb..57e2c18 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
 int
 xfs_qm_vop_dqalloc(
 	struct xfs_inode	*ip,
-	uid_t			uid,
-	gid_t			gid,
+	xfs_dqid_t		uid,
+	xfs_dqid_t		gid,
 	prid_t			prid,
 	uint			flags,
 	struct xfs_dquot	**O_udqpp,
@@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
 			 * holding ilock.
 			 */
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+			if ((error = xfs_qm_dqget(mp, NULL, uid,
 						 XFS_DQ_USER,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
@@ -1723,7 +1723,7 @@ xfs_qm_vop_dqalloc(
 	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
 		if (ip->i_d.di_gid != gid) {
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+			if ((error = xfs_qm_dqget(mp, NULL, gid,
 						 XFS_DQ_GROUP,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
@@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
 			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
 	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
 		delblksudq = udqp;
 		/*
 		 * If there are delayed allocation blocks, then we have to
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f..5f0bfe8 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -320,8 +320,8 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
 		struct xfs_mount *, struct xfs_dquot *,
 		struct xfs_dquot *, long, long, uint);
 
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-		struct xfs_dquot **, struct xfs_dquot **);
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **);
 extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
 		struct xfs_dquot *, struct xfs_dquot *);
 extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -341,8 +341,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
 #else
 static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
-		uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+		prid_t prid, uint flags, struct xfs_dquot **udqp,
+		struct xfs_dquot **gdqp)
 {
 	*udqp = NULL;
 	*gdqp = NULL;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 195a403..c50306e 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -384,7 +384,9 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp,
+			xfs_kuid_to_disk(current_fsuid()),
+			xfs_kgid_to_disk(current_fsgid()), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 0176bb2..94f4f9f6 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -515,7 +515,9 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp,
+			xfs_kuid_to_disk(current_fsuid()),
+			xfs_kgid_to_disk(current_fsgid()), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		return error;
diff --git a/init/Kconfig b/init/Kconfig
index 9d3a788..8083ffd 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1065,7 +1065,6 @@ config IPC_NS
 
 config USER_NS
 	bool "User namespace"
-	depends on UIDGID_CONVERTED
 	select UIDGID_STRICT_TYPE_CHECKS
 
 	default n
@@ -1099,21 +1098,9 @@ config NET_NS
 
 endif # NAMESPACES
 
-config UIDGID_CONVERTED
-	# True if all of the selected software conmponents are known
-	# to have uid_t and gid_t converted to kuid_t and kgid_t
-	# where appropriate and are otherwise safe to use with
-	# the user namespace.
-	bool
-	default y
-
-	# Filesystems
-	depends on XFS_FS = n
-
 config UIDGID_STRICT_TYPE_CHECKS
 	bool "Require conversions between uid/gids and their internal representation"
-	depends on UIDGID_CONVERTED
-	default n
+	default y
 	help
 	 While the nececessary conversions are being added to all subsystems this option allows
 	 the code to continue to build for unconverted subsystems.
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-24 13:10           ` [PATCH v2 RFC] " Dwight Engen
@ 2013-06-25 16:46             ` Brian Foster
  2013-06-25 20:08               ` Dwight Engen
  2013-06-26  2:09             ` Dave Chinner
                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 46+ messages in thread
From: Brian Foster @ 2013-06-25 16:46 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On 06/24/2013 09:10 AM, Dwight Engen wrote:
> Hi Dave, all. Here is a v2 patch that I believe addresses the previous
> comments, but I expect there to be more :) I think there are a few
> more issues to sort out before this is ready, and I want to add some
> tests to xfstests also.
> 
> I added permission checks for eofblocks in the ioctl code, but
> I don't think they are enough. Just because an unprivileged
> caller is in a group doesn't mean he can write to a file of that group,
> and I don't know how we can check for that till we get the inode in
> hand. Brian, if you or anyone else could comment on how this should work
> for the regular user and write() ENOSPC cases that'd be great.
> 

Hi Dwight,

Fair point with regard to the group. On one hand, we aren't exactly
writing to the file, and from the perspective of the fs, we'd like a
user covered under a group quota to have the ability to manage inodes
covered under said quota. On the other hand, from a
userspace perspective it could imply a strange
incarnation/interpretation(/abuse) of file permissions. I'll have to
think about this some more.

Before getting too far into the code, could we break this down into
smaller, independent patches? This is starting to include logically
independent changes, and at first glance, it appears even the eofblocks
stuff could be broken down into multiple patches (i.e., the introduction
of a new structure should be its own patch, etc.). But more on that to
follow...

> The xfs code now uses inode->i_uid where possible instead of di_uid.
> The remaining uses of di_uid are where the inode is being setup,
> conversion to/from disk endianess, in dealing with quotas, and bulkstat.
> 
> We do need to decide on the di_uid that comes back from bulkstat.
> Right now it is returning on disk (== init_user_ns) uids. It looks to
> me like xfsrestore is using the normal vfs routines (chown, fchown,
> lchown) when restoring so that won't line up if the xfsrestore is run
> in !init_user_ns. We could possibly convert to userns values
> before returning them from the kernel, but I doubt that will work
> well with the xfs quotas. Should we just require that callers of bulkstat
> be in init_user_ns? Thoughts?
> 
> 
> --
> 
> Use uint32 from init_user_ns for xfs internal uid/gid representation in
> acl, xfs_icdinode, xfs_dqid_t. Conversion of kuid/gid is done for these
> structures and for the eofblocks filter. Other user visible xfs specific
> interfaces (bulkstat) expect uint32 init_user_ns uid/gid values.
> 
> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> ---
>  fs/xfs/xfs_acl.c      | 20 ++++++++++++++++----
>  fs/xfs/xfs_fs.h       |  2 +-
>  fs/xfs/xfs_icache.c   |  6 +++---
>  fs/xfs/xfs_inode.c    |  6 +++---
>  fs/xfs/xfs_ioctl.c    | 37 ++++++++++++++++++++++++++++++++++---
>  fs/xfs/xfs_iops.c     | 38 ++++++++++++++++++++------------------
>  fs/xfs/xfs_linux.h    | 35 +++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_qm.c       | 10 +++++-----
>  fs/xfs/xfs_quota.h    |  9 +++++----
>  fs/xfs/xfs_symlink.c  |  4 +++-
>  fs/xfs/xfs_vnodeops.c |  4 +++-
>  init/Kconfig          | 15 +--------------
>  12 files changed, 129 insertions(+), 57 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
> index d046955..6bc3da4 100644
> --- a/fs/xfs/xfs_fs.h
> +++ b/fs/xfs/xfs_fs.h
> @@ -344,7 +344,7 @@ typedef struct xfs_error_injection {
>   * Speculative preallocation trimming.
>   */
>  #define XFS_EOFBLOCKS_VERSION		1
> -struct xfs_eofblocks {
> +struct xfs_ueofblocks {

We probably don't want to go and change the name of the structure
exported to userspace. This was intentionally named with the
understanding that it's user facing. If anything, I'd say leave this and
use a different name for the internal version.

FWIW, I already have some patches that create an internal version of
xfs_eofblocks for a separate purpose (adding a new internal only field).
As is, it might not completely meet the use case here, but I'm wondering
if we should combine some efforts here...

>  	__u32		eof_version;
>  	__u32		eof_flags;
>  	uid_t		eof_uid;
> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> index 96e344e..2c35b13 100644
> --- a/fs/xfs/xfs_icache.c
> +++ b/fs/xfs/xfs_icache.c
> @@ -617,7 +617,7 @@ restart:
>  
>  /*
>   * Background scanning to trim post-EOF preallocated space. This is queued
> - * based on the 'background_prealloc_discard_period' tunable (5m by default).
> + * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
>   */
>  STATIC void
>  xfs_queue_eofblocks(
> @@ -1202,11 +1202,11 @@ xfs_inode_match_id(
>  	struct xfs_eofblocks	*eofb)
>  {
>  	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
> -	    ip->i_d.di_uid != eofb->eof_uid)
> +	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
>  		return 0;
>  
>  	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
> -	    ip->i_d.di_gid != eofb->eof_gid)
> +	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))

More of a question... since we're originally comparing against the
on-disk values, is the separate internal structure strictly necessary
for making eofblocks userns aware?

>  		return 0;
>  
>  	if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
...
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 5e99968..d6e64d9 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
...
> @@ -1610,7 +1610,8 @@ xfs_file_ioctl(
>  		return -error;
>  
>  	case XFS_IOC_FREE_EOFBLOCKS: {
> -		struct xfs_eofblocks eofb;
> +		struct xfs_ueofblocks eofb;
> +		struct xfs_eofblocks keofb;
>  
>  		if (copy_from_user(&eofb, arg, sizeof(eofb)))
>  			return -XFS_ERROR(EFAULT);
> @@ -1625,7 +1626,37 @@ xfs_file_ioctl(
>  		    memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
>  			return -XFS_ERROR(EINVAL);
>  
> -		error = xfs_icache_free_eofblocks(mp, &eofb);
> +		keofb.eof_version = eofb.eof_version;
> +		keofb.eof_flags = eofb.eof_flags;
> +		keofb.eof_prid = eofb.eof_prid;
> +		keofb.eof_min_file_size = eofb.eof_min_file_size;
> +

A conversion helper would be nice here. I'd probably just convert all of
the fields and then reduce the following EINVAL bits to pure
permission/sanity checks.

> +		if (eofb.eof_flags & XFS_EOF_FLAGS_UID) {
> +			keofb.eof_uid = make_kuid(current_user_ns(), eofb.eof_uid);
> +			if (!uid_valid(keofb.eof_uid))
> +				return -XFS_ERROR(EINVAL);
> +		}
> +
> +		if (eofb.eof_flags & XFS_EOF_FLAGS_GID) {
> +			keofb.eof_gid = make_kgid(current_user_ns(), eofb.eof_gid);
> +			if (!gid_valid(keofb.eof_gid))
> +				return -XFS_ERROR(EINVAL);
> +		}
> +
> +		if (!capable(CAP_SYS_ADMIN)) {
> +			if (!(eofb.eof_flags & (XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_GID)))
> +				return -XFS_ERROR(EPERM);
> +
> +			if ((eofb.eof_flags & XFS_EOF_FLAGS_UID) &&
> +			    !uid_eq(current_fsuid(), keofb.eof_uid))
> +				return -XFS_ERROR(EPERM);
> +
> +			if ((eofb.eof_flags & XFS_EOF_FLAGS_GID) &&
> +			    !in_group_p(keofb.eof_gid))
> +				return -XFS_ERROR(EPERM);
> +		}

A comment above this hunk to describe what we decide to enforce here and
why will probably be helpful. ;) Otherwise, the checks seem reasonable
to me, notwithstanding the open question you called out in the commit
log description. FWIW, this particular EPERM check should probably stand
on its own as an independent patch.

> +
> +		error = xfs_icache_free_eofblocks(mp, &keofb);
>  		return -error;
>  	}
>  
...
> diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> index 800f896..80326da 100644
> --- a/fs/xfs/xfs_linux.h
> +++ b/fs/xfs/xfs_linux.h
> @@ -159,6 +159,41 @@
>  #define MAX(a,b)	(max(a,b))
>  #define howmany(x, y)	(((x)+((y)-1))/(y))
>  
...
> +
> +struct xfs_eofblocks {
> +	__u32		eof_version;
> +	__u32		eof_flags;
> +	kuid_t		eof_uid;
> +	kgid_t		eof_gid;
> +	prid_t		eof_prid;
> +	__u64		eof_min_file_size;
> +};
> +

This should probably go into xfs_icache.h along with the aforementioned
conversion helper.

As I mentioned previously, I have some code around that creates an
internal version of the eofblocks structure. The main differences are
the name (xfs_eofblocks_internal) and I did the conversion down in
xfs_icache.c since I wasn't changing anything that affected the ioctl().

I can throw it up on the list for reference or if it's of any use as a
base for this work...

Brian

>  /*
>   * Various platform dependent calls that don't fit anywhere else
>   */
> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> index b75c9bb..57e2c18 100644
> --- a/fs/xfs/xfs_qm.c
> +++ b/fs/xfs/xfs_qm.c
> @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
>  int
>  xfs_qm_vop_dqalloc(
>  	struct xfs_inode	*ip,
> -	uid_t			uid,
> -	gid_t			gid,
> +	xfs_dqid_t		uid,
> +	xfs_dqid_t		gid,
>  	prid_t			prid,
>  	uint			flags,
>  	struct xfs_dquot	**O_udqpp,
> @@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
>  			 * holding ilock.
>  			 */
>  			xfs_iunlock(ip, lockflags);
> -			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
> +			if ((error = xfs_qm_dqget(mp, NULL, uid,
>  						 XFS_DQ_USER,
>  						 XFS_QMOPT_DQALLOC |
>  						 XFS_QMOPT_DOWARN,
> @@ -1723,7 +1723,7 @@ xfs_qm_vop_dqalloc(
>  	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
>  		if (ip->i_d.di_gid != gid) {
>  			xfs_iunlock(ip, lockflags);
> -			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
> +			if ((error = xfs_qm_dqget(mp, NULL, gid,
>  						 XFS_DQ_GROUP,
>  						 XFS_QMOPT_DQALLOC |
>  						 XFS_QMOPT_DOWARN,
> @@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
>  			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
>  
>  	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
> -	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
> +	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
>  		delblksudq = udqp;
>  		/*
>  		 * If there are delayed allocation blocks, then we have to
> diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
> index c38068f..5f0bfe8 100644
> --- a/fs/xfs/xfs_quota.h
> +++ b/fs/xfs/xfs_quota.h
> @@ -320,8 +320,8 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
>  		struct xfs_mount *, struct xfs_dquot *,
>  		struct xfs_dquot *, long, long, uint);
>  
> -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
> -		struct xfs_dquot **, struct xfs_dquot **);
> +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
> +		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **);
>  extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
>  		struct xfs_dquot *, struct xfs_dquot *);
>  extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
> @@ -341,8 +341,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
>  
>  #else
>  static inline int
> -xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
> -		uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
> +xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
> +		prid_t prid, uint flags, struct xfs_dquot **udqp,
> +		struct xfs_dquot **gdqp)
>  {
>  	*udqp = NULL;
>  	*gdqp = NULL;
> diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
> index 195a403..c50306e 100644
> --- a/fs/xfs/xfs_symlink.c
> +++ b/fs/xfs/xfs_symlink.c
> @@ -384,7 +384,9 @@ xfs_symlink(
>  	/*
>  	 * Make sure that we have allocated dquot(s) on disk.
>  	 */
> -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
> +	error = xfs_qm_vop_dqalloc(dp,
> +			xfs_kuid_to_disk(current_fsuid()),
> +			xfs_kgid_to_disk(current_fsgid()), prid,
>  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
>  	if (error)
>  		goto std_return;
> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
> index 0176bb2..94f4f9f6 100644
> --- a/fs/xfs/xfs_vnodeops.c
> +++ b/fs/xfs/xfs_vnodeops.c
> @@ -515,7 +515,9 @@ xfs_create(
>  	/*
>  	 * Make sure that we have allocated dquot(s) on disk.
>  	 */
> -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
> +	error = xfs_qm_vop_dqalloc(dp,
> +			xfs_kuid_to_disk(current_fsuid()),
> +			xfs_kgid_to_disk(current_fsgid()), prid,
>  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
>  	if (error)
>  		return error;
> diff --git a/init/Kconfig b/init/Kconfig
> index 9d3a788..8083ffd 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1065,7 +1065,6 @@ config IPC_NS
>  
>  config USER_NS
>  	bool "User namespace"
> -	depends on UIDGID_CONVERTED
>  	select UIDGID_STRICT_TYPE_CHECKS
>  
>  	default n
> @@ -1099,21 +1098,9 @@ config NET_NS
>  
>  endif # NAMESPACES
>  
> -config UIDGID_CONVERTED
> -	# True if all of the selected software conmponents are known
> -	# to have uid_t and gid_t converted to kuid_t and kgid_t
> -	# where appropriate and are otherwise safe to use with
> -	# the user namespace.
> -	bool
> -	default y
> -
> -	# Filesystems
> -	depends on XFS_FS = n
> -
>  config UIDGID_STRICT_TYPE_CHECKS
>  	bool "Require conversions between uid/gids and their internal representation"
> -	depends on UIDGID_CONVERTED
> -	default n
> +	default y
>  	help
>  	 While the nececessary conversions are being added to all subsystems this option allows
>  	 the code to continue to build for unconverted subsystems.
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-25 16:46             ` Brian Foster
@ 2013-06-25 20:08               ` Dwight Engen
  2013-06-25 21:04                 ` Brian Foster
  0 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-25 20:08 UTC (permalink / raw)
  To: Brian Foster; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On Tue, 25 Jun 2013 12:46:19 -0400
Brian Foster <bfoster@redhat.com> wrote:

> On 06/24/2013 09:10 AM, Dwight Engen wrote:
> > Hi Dave, all. Here is a v2 patch that I believe addresses the
> > previous comments, but I expect there to be more :) I think there
> > are a few more issues to sort out before this is ready, and I want
> > to add some tests to xfstests also.
> > 
> > I added permission checks for eofblocks in the ioctl code, but
> > I don't think they are enough. Just because an unprivileged
> > caller is in a group doesn't mean he can write to a file of that
> > group, and I don't know how we can check for that till we get the
> > inode in hand. Brian, if you or anyone else could comment on how
> > this should work for the regular user and write() ENOSPC cases
> > that'd be great.
> > 
> 
> Hi Dwight,
> 
> Fair point with regard to the group. On one hand, we aren't exactly
> writing to the file and from the perspective of the fs, we'd like the
> ability for a user covered under a group quota to have the ability to
> manage inodes covered under said quota. On the other hand, from a
> userspace perspective it could imply a strange
> incarnation/interpretation(/abuse) of file permissions. I'll have to
> think about this some more.
> 
> Before getting too far into the code, could we break this down into
> smaller, independent patches? This is starting to include logically
> independent changes, and on first glance, it appears even the
> eofblocks stuff could be broken down into multiple patches (i.e., the
> introduction of a new structure should be its own patch, etc.). But
> more on that to follow...

Sure, I'll split out the eofblocks changes and try to break that up
itself into conversion vs. use so hopefully it fits in with what you're
doing.
 
> > The xfs code now uses inode->i_uid where possible instead of di_uid.
> > The remaining uses of di_uid are where the inode is being setup,
> > conversion to/from disk endianess, in dealing with quotas, and
> > bulkstat.
> > 
> > We do need to decide on the di_uid that comes back from bulkstat.
> > Right now it is returning on disk (== init_user_ns) uids. It looks
> > to me like xfsrestore is using the normal vfs routines (chown,
> > fchown, lchown) when restoring so that won't line up if the
> > xfsrestore is run in !init_user_ns. We could possibly convert to
> > userns values before returning them from the kernel, but I doubt
> > that will work well with the xfs quotas. Should we just require
> > that callers of bulkstat be in init_user_ns? Thoughts?
> > 
> > 
> > --
> > 
> > Use uint32 from init_user_ns for xfs internal uid/gid
> > representation in acl, xfs_icdinode, xfs_dqid_t. Conversion of
> > kuid/gid is done for these structures and for the eofblocks filter.
> > Other user visible xfs specific interfaces (bulkstat) expect uint32
> > init_user_ns uid/gid values.
> > 
> > Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> > ---
> >  fs/xfs/xfs_acl.c      | 20 ++++++++++++++++----
> >  fs/xfs/xfs_fs.h       |  2 +-
> >  fs/xfs/xfs_icache.c   |  6 +++---
> >  fs/xfs/xfs_inode.c    |  6 +++---
> >  fs/xfs/xfs_ioctl.c    | 37 ++++++++++++++++++++++++++++++++++---
> >  fs/xfs/xfs_iops.c     | 38 ++++++++++++++++++++------------------
> >  fs/xfs/xfs_linux.h    | 35 +++++++++++++++++++++++++++++++++++
> >  fs/xfs/xfs_qm.c       | 10 +++++-----
> >  fs/xfs/xfs_quota.h    |  9 +++++----
> >  fs/xfs/xfs_symlink.c  |  4 +++-
> >  fs/xfs/xfs_vnodeops.c |  4 +++-
> >  init/Kconfig          | 15 +--------------
> >  12 files changed, 129 insertions(+), 57 deletions(-)
> > 
> ...
> > diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
> > index d046955..6bc3da4 100644
> > --- a/fs/xfs/xfs_fs.h
> > +++ b/fs/xfs/xfs_fs.h
> > @@ -344,7 +344,7 @@ typedef struct xfs_error_injection {
> >   * Speculative preallocation trimming.
> >   */
> >  #define XFS_EOFBLOCKS_VERSION		1
> > -struct xfs_eofblocks {
> > +struct xfs_ueofblocks {
> 
> We probably don't want to go and change the name of the structure
> exported to userspace. This was intentionally named with the
> understanding that it's user facing. If anything, I'd say leave this
> and use a different name for the internal version.

Ahh, okay, sounds good. I didn't realize xfs_fs.h was exported to
userspace. I changed it this way so we wouldn't have to change the type
that's used in xfs_icache.c but there aren't that many uses there and I
agree we don't want to change the userspace interface.

> FWIW, I already have some patches that create an internal version of
> xfs_eofblocks for a separate purpose (adding a new internal only
> field). As is, it might not completely meet the use case here, but
> I'm wondering if we should combine some efforts here...

Yeah, that sounds good. If you want to post your conversion routine
that'd be great, otherwise I'll just change to use your name
(xfs_eofblocks_internal from below).

> >  	__u32		eof_version;
> >  	__u32		eof_flags;
> >  	uid_t		eof_uid;
> > diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
> > index 96e344e..2c35b13 100644
> > --- a/fs/xfs/xfs_icache.c
> > +++ b/fs/xfs/xfs_icache.c
> > @@ -617,7 +617,7 @@ restart:
> >  
> >  /*
> >   * Background scanning to trim post-EOF preallocated space. This
> > is queued
> > - * based on the 'background_prealloc_discard_period' tunable (5m
> > by default).
> > + * based on the 'speculative_prealloc_lifetime' tunable (5m by
> > default). */
> >  STATIC void
> >  xfs_queue_eofblocks(
> > @@ -1202,11 +1202,11 @@ xfs_inode_match_id(
> >  	struct xfs_eofblocks	*eofb)
> >  {
> >  	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
> > -	    ip->i_d.di_uid != eofb->eof_uid)
> > +	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
> >  		return 0;
> >  
> >  	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
> > -	    ip->i_d.di_gid != eofb->eof_gid)
> > +	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
> 
> More of a question... since we're originally comparing against the
> on-disk values, is the separate internal structure strictly necessary
> for making eofblocks userns aware?

Not sure I fully understand your question. We were comparing on-disk
uid/gid values to unconverted eof values because xfs didn't support the
eof ioctl callers passing in ids from a userns. I believe part
of the idea of userns is that i_uid is an opaque type, hence the use of
_eq() comparators and why we have to convert eof_[ug]id if we want to
compare them to i_uid as opposed to di_uid.

> >  		return 0;
> >  
> >  	if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
> ...
> > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > index 5e99968..d6e64d9 100644
> > --- a/fs/xfs/xfs_ioctl.c
> > +++ b/fs/xfs/xfs_ioctl.c
> ...
> > @@ -1610,7 +1610,8 @@ xfs_file_ioctl(
> >  		return -error;
> >  
> >  	case XFS_IOC_FREE_EOFBLOCKS: {
> > -		struct xfs_eofblocks eofb;
> > +		struct xfs_ueofblocks eofb;
> > +		struct xfs_eofblocks keofb;
> >  
> >  		if (copy_from_user(&eofb, arg, sizeof(eofb)))
> >  			return -XFS_ERROR(EFAULT);
> > @@ -1625,7 +1626,37 @@ xfs_file_ioctl(
> >  		    memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
> >  			return -XFS_ERROR(EINVAL);
> >  
> > -		error = xfs_icache_free_eofblocks(mp, &eofb);
> > +		keofb.eof_version = eofb.eof_version;
> > +		keofb.eof_flags = eofb.eof_flags;
> > +		keofb.eof_prid = eofb.eof_prid;
> > +		keofb.eof_min_file_size = eofb.eof_min_file_size;
> > +
> 
> A conversion helper would be nice here. I'd probably just convert all
> of the fields and then reduce the following EINVAL bits to pure
> permission/sanity checks.

Yep, sounds like I should split out conversion vs. permission
checks.

> > +		if (eofb.eof_flags & XFS_EOF_FLAGS_UID) {
> > +			keofb.eof_uid =
> > make_kuid(current_user_ns(), eofb.eof_uid);
> > +			if (!uid_valid(keofb.eof_uid))
> > +				return -XFS_ERROR(EINVAL);
> > +		}
> > +
> > +		if (eofb.eof_flags & XFS_EOF_FLAGS_GID) {
> > +			keofb.eof_gid =
> > make_kgid(current_user_ns(), eofb.eof_gid);
> > +			if (!gid_valid(keofb.eof_gid))
> > +				return -XFS_ERROR(EINVAL);
> > +		}
> > +
> > +		if (!capable(CAP_SYS_ADMIN)) {
> > +			if (!(eofb.eof_flags & (XFS_EOF_FLAGS_UID
> > | XFS_EOF_FLAGS_GID)))
> > +				return -XFS_ERROR(EPERM);
> > +
> > +			if ((eofb.eof_flags & XFS_EOF_FLAGS_UID) &&
> > +			    !uid_eq(current_fsuid(),
> > keofb.eof_uid))
> > +				return -XFS_ERROR(EPERM);
> > +
> > +			if ((eofb.eof_flags & XFS_EOF_FLAGS_GID) &&
> > +			    !in_group_p(keofb.eof_gid))
> > +				return -XFS_ERROR(EPERM);
> > +		}
> 
> A comment above this hunk to describe what we decide to enforce here
> and why will probably be helpful. ;) Otherwise, the checks seem
> reasonable to me, notwithstanding the open question you called out in
> the commit log description. FWIW, this particular EPERM check should
> probably stand on its own as an independent patch.

Yep. I'll add a comment and do the split outs while we're mulling over
whether these checks are enough.
 
> > +
> > +		error = xfs_icache_free_eofblocks(mp, &keofb);
> >  		return -error;
> >  	}
> >  
> ...
> > diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> > index 800f896..80326da 100644
> > --- a/fs/xfs/xfs_linux.h
> > +++ b/fs/xfs/xfs_linux.h
> > @@ -159,6 +159,41 @@
> >  #define MAX(a,b)	(max(a,b))
> >  #define howmany(x, y)	(((x)+((y)-1))/(y))
> >  
> ...
> > +
> > +struct xfs_eofblocks {
> > +	__u32		eof_version;
> > +	__u32		eof_flags;
> > +	kuid_t		eof_uid;
> > +	kgid_t		eof_gid;
> > +	prid_t		eof_prid;
> > +	__u64		eof_min_file_size;
> > +};
> > +
> 
> This should probably go into xfs_icache.h along with the
> aforementioned conversion helper.
> 
> As I mentioned previously, I have some code around that creates an
> internal version of the eofblocks structure. The main differences are
> the name (xfs_eofblocks_internal) and I did the conversion down in
> xfs_icache.c since I wasn't changing anything that affected the
> ioctl().
> 
> I can throw it up on the list for reference or if it's of any use as a
> base for this work...

If you have time to put it up that'd be great, but no biggie if not; I'll
write a conversion routine. Thanks for looking at this.

> Brian
> 
> >  /*
> >   * Various platform dependent calls that don't fit anywhere else
> >   */
> > diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
> > index b75c9bb..57e2c18 100644
> > --- a/fs/xfs/xfs_qm.c
> > +++ b/fs/xfs/xfs_qm.c
> > @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
> >  int
> >  xfs_qm_vop_dqalloc(
> >  	struct xfs_inode	*ip,
> > -	uid_t			uid,
> > -	gid_t			gid,
> > +	xfs_dqid_t		uid,
> > +	xfs_dqid_t		gid,
> >  	prid_t			prid,
> >  	uint			flags,
> >  	struct xfs_dquot	**O_udqpp,
> > @@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
> >  			 * holding ilock.
> >  			 */
> >  			xfs_iunlock(ip, lockflags);
> > -			if ((error = xfs_qm_dqget(mp, NULL,
> > (xfs_dqid_t) uid,
> > +			if ((error = xfs_qm_dqget(mp, NULL, uid,
> >  						 XFS_DQ_USER,
> >  						 XFS_QMOPT_DQALLOC
> > | XFS_QMOPT_DOWARN,
> > @@ -1723,7 +1723,7 @@ xfs_qm_vop_dqalloc(
> >  	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
> >  		if (ip->i_d.di_gid != gid) {
> >  			xfs_iunlock(ip, lockflags);
> > -			if ((error = xfs_qm_dqget(mp, NULL,
> > (xfs_dqid_t)gid,
> > +			if ((error = xfs_qm_dqget(mp, NULL, gid,
> >  						 XFS_DQ_GROUP,
> >  						 XFS_QMOPT_DQALLOC
> > | XFS_QMOPT_DOWARN,
> > @@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
> >  			XFS_QMOPT_RES_RTBLKS :
> > XFS_QMOPT_RES_REGBLKS; 
> >  	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
> > -	    ip->i_d.di_uid !=
> > (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
> > +	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
> >  		delblksudq = udqp;
> >  		/*
> >  		 * If there are delayed allocation blocks, then we
> > have to diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
> > index c38068f..5f0bfe8 100644
> > --- a/fs/xfs/xfs_quota.h
> > +++ b/fs/xfs/xfs_quota.h
> > @@ -320,8 +320,8 @@ extern int
> > xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct
> > xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, long, long,
> > uint); 
> > -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t,
> > prid_t, uint,
> > -		struct xfs_dquot **, struct xfs_dquot **);
> > +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t,
> > xfs_dqid_t,
> > +		prid_t, uint, struct xfs_dquot **, struct
> > xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct
> > xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct
> > xfs_dquot *); extern int xfs_qm_vop_rename_dqattach(struct
> > xfs_inode **); @@ -341,8 +341,9 @@ extern void
> > xfs_qm_unmount_quotas(struct xfs_mount *); 
> >  #else
> >  static inline int
> > -xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid,
> > prid_t prid,
> > -		uint flags, struct xfs_dquot **udqp, struct
> > xfs_dquot **gdqp) +xfs_qm_vop_dqalloc(struct xfs_inode *ip,
> > xfs_dqid_t uid, xfs_dqid_t gid,
> > +		prid_t prid, uint flags, struct xfs_dquot **udqp,
> > +		struct xfs_dquot **gdqp)
> >  {
> >  	*udqp = NULL;
> >  	*gdqp = NULL;
> > diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
> > index 195a403..c50306e 100644
> > --- a/fs/xfs/xfs_symlink.c
> > +++ b/fs/xfs/xfs_symlink.c
> > @@ -384,7 +384,9 @@ xfs_symlink(
> >  	/*
> >  	 * Make sure that we have allocated dquot(s) on disk.
> >  	 */
> > -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(),
> > current_fsgid(), prid,
> > +	error = xfs_qm_vop_dqalloc(dp,
> > +			xfs_kuid_to_disk(current_fsuid()),
> > +			xfs_kgid_to_disk(current_fsgid()), prid,
> >  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
> > &udqp, &gdqp); if (error)
> >  		goto std_return;
> > diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
> > index 0176bb2..94f4f9f6 100644
> > --- a/fs/xfs/xfs_vnodeops.c
> > +++ b/fs/xfs/xfs_vnodeops.c
> > @@ -515,7 +515,9 @@ xfs_create(
> >  	/*
> >  	 * Make sure that we have allocated dquot(s) on disk.
> >  	 */
> > -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(),
> > current_fsgid(), prid,
> > +	error = xfs_qm_vop_dqalloc(dp,
> > +			xfs_kuid_to_disk(current_fsuid()),
> > +			xfs_kgid_to_disk(current_fsgid()), prid,
> >  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
> > &udqp, &gdqp); if (error)
> >  		return error;
> > diff --git a/init/Kconfig b/init/Kconfig
> > index 9d3a788..8083ffd 100644
> > --- a/init/Kconfig
> > +++ b/init/Kconfig
> > @@ -1065,7 +1065,6 @@ config IPC_NS
> >  
> >  config USER_NS
> >  	bool "User namespace"
> > -	depends on UIDGID_CONVERTED
> >  	select UIDGID_STRICT_TYPE_CHECKS
> >  
> >  	default n
> > @@ -1099,21 +1098,9 @@ config NET_NS
> >  
> >  endif # NAMESPACES
> >  
> > -config UIDGID_CONVERTED
> > -	# True if all of the selected software conmponents are
> > known
> > -	# to have uid_t and gid_t converted to kuid_t and kgid_t
> > -	# where appropriate and are otherwise safe to use with
> > -	# the user namespace.
> > -	bool
> > -	default y
> > -
> > -	# Filesystems
> > -	depends on XFS_FS = n
> > -
> >  config UIDGID_STRICT_TYPE_CHECKS
> >  	bool "Require conversions between uid/gids and their
> > internal representation"
> > -	depends on UIDGID_CONVERTED
> > -	default n
> > +	default y
> >  	help
> >  	 While the nececessary conversions are being added to all
> > subsystems this option allows the code to continue to build for
> > unconverted subsystems.
> > 
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-25 20:08               ` Dwight Engen
@ 2013-06-25 21:04                 ` Brian Foster
  0 siblings, 0 replies; 46+ messages in thread
From: Brian Foster @ 2013-06-25 21:04 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On 06/25/2013 04:08 PM, Dwight Engen wrote:
> On Tue, 25 Jun 2013 12:46:19 -0400
> Brian Foster <bfoster@redhat.com> wrote:
> 
>> On 06/24/2013 09:10 AM, Dwight Engen wrote:
...
>>> diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
>>> index 96e344e..2c35b13 100644
>>> --- a/fs/xfs/xfs_icache.c
>>> +++ b/fs/xfs/xfs_icache.c
>>> @@ -617,7 +617,7 @@ restart:
>>>  
>>>  /*
>>>   * Background scanning to trim post-EOF preallocated space. This
>>> is queued
>>> - * based on the 'background_prealloc_discard_period' tunable (5m
>>> by default).
>>> + * based on the 'speculative_prealloc_lifetime' tunable (5m by
>>> default). */
>>>  STATIC void
>>>  xfs_queue_eofblocks(
>>> @@ -1202,11 +1202,11 @@ xfs_inode_match_id(
>>>  	struct xfs_eofblocks	*eofb)
>>>  {
>>>  	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
>>> -	    ip->i_d.di_uid != eofb->eof_uid)
>>> +	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
>>>  		return 0;
>>>  
>>>  	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
>>> -	    ip->i_d.di_gid != eofb->eof_gid)
>>> +	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
>>
>> More of a question... since we're originally comparing against the
>> on-disk values, is the separate internal structure strictly necessary
>> for making eofblocks userns aware?
> 
> Not sure I fully understand your question, we were comparing on-disk
> uid/gid values to unconverted eof values because xfs didn't support the
> eof ioctl callers passing in ids from a userns. I believe part
> of the idea of userns is that i_uid is an opaque type, hence the use of
> _eq() comparators and why we have to convert eof_[ug]id if we want to
> compare them to i_uid as opposed to di_uid.
> 

That latter point was my question, why does this code want/need to
compare to the i_uid as opposed to di_uid. It seems like technically you
could push the conversion up and not have to change much else.

It's not terribly important since this code is moving into the separate
xfs_eofblocks structure anyway. I'm not against it I guess, I just
wanted to be on the same page as to the intent of the change. I suppose
it makes sense if the idea is that core kernel code should be carrying
around kuid types in general.

...
>> This should probably go into xfs_icache.h along with the
>> aforementioned conversion helper.
>>
>> As I mentioned previously, I have some code around that creates an
>> internal version of the eofblocks structure. The main differences are
>> the name (xfs_eofblocks_internal) and I did the conversion down in
>> xfs_icache.c since I wasn't changing anything that affected the
>> ioctl().
>>
>> I can throw it up on the list for reference or if it's of any use as a
>> base for this work...
> 
> If you have time to put it up that'd be great, but no biggie if not I'll
> write a conversion routine. Thanks for looking at this.
> 

I'll forward it along momentarily...

Brian

>> Brian
>>
>>>  /*
>>>   * Various platform dependent calls that don't fit anywhere else
>>>   */
>>> diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
>>> index b75c9bb..57e2c18 100644
>>> --- a/fs/xfs/xfs_qm.c
>>> +++ b/fs/xfs/xfs_qm.c
>>> @@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
>>>  int
>>>  xfs_qm_vop_dqalloc(
>>>  	struct xfs_inode	*ip,
>>> -	uid_t			uid,
>>> -	gid_t			gid,
>>> +	xfs_dqid_t		uid,
>>> +	xfs_dqid_t		gid,
>>>  	prid_t			prid,
>>>  	uint			flags,
>>>  	struct xfs_dquot	**O_udqpp,
>>> @@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
>>>  			 * holding ilock.
>>>  			 */
>>>  			xfs_iunlock(ip, lockflags);
>>> -			if ((error = xfs_qm_dqget(mp, NULL,
>>> (xfs_dqid_t) uid,
>>> +			if ((error = xfs_qm_dqget(mp, NULL, uid,
>>>  						 XFS_DQ_USER,
>>>  						 XFS_QMOPT_DQALLOC
>>> | XFS_QMOPT_DOWARN,
>>> @@ -1723,7 +1723,7 @@ xfs_qm_vop_dqalloc(
>>>  	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
>>>  		if (ip->i_d.di_gid != gid) {
>>>  			xfs_iunlock(ip, lockflags);
>>> -			if ((error = xfs_qm_dqget(mp, NULL,
>>> (xfs_dqid_t)gid,
>>> +			if ((error = xfs_qm_dqget(mp, NULL, gid,
>>>  						 XFS_DQ_GROUP,
>>>  						 XFS_QMOPT_DQALLOC
>>> | XFS_QMOPT_DOWARN,
>>> @@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
>>>  			XFS_QMOPT_RES_RTBLKS :
>>> XFS_QMOPT_RES_REGBLKS; 
>>>  	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
>>> -	    ip->i_d.di_uid !=
>>> (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
>>> +	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
>>>  		delblksudq = udqp;
>>>  		/*
>>>  		 * If there are delayed allocation blocks, then we
>>> have to diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
>>> index c38068f..5f0bfe8 100644
>>> --- a/fs/xfs/xfs_quota.h
>>> +++ b/fs/xfs/xfs_quota.h
>>> @@ -320,8 +320,8 @@ extern int
>>> xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct
>>> xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, long, long,
>>> uint); 
>>> -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t,
>>> prid_t, uint,
>>> -		struct xfs_dquot **, struct xfs_dquot **);
>>> +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t,
>>> xfs_dqid_t,
>>> +		prid_t, uint, struct xfs_dquot **, struct
>>> xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct
>>> xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct
>>> xfs_dquot *); extern int xfs_qm_vop_rename_dqattach(struct
>>> xfs_inode **); @@ -341,8 +341,9 @@ extern void
>>> xfs_qm_unmount_quotas(struct xfs_mount *); 
>>>  #else
>>>  static inline int
>>> -xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid,
>>> prid_t prid,
>>> -		uint flags, struct xfs_dquot **udqp, struct
>>> xfs_dquot **gdqp) +xfs_qm_vop_dqalloc(struct xfs_inode *ip,
>>> xfs_dqid_t uid, xfs_dqid_t gid,
>>> +		prid_t prid, uint flags, struct xfs_dquot **udqp,
>>> +		struct xfs_dquot **gdqp)
>>>  {
>>>  	*udqp = NULL;
>>>  	*gdqp = NULL;
>>> diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
>>> index 195a403..c50306e 100644
>>> --- a/fs/xfs/xfs_symlink.c
>>> +++ b/fs/xfs/xfs_symlink.c
>>> @@ -384,7 +384,9 @@ xfs_symlink(
>>>  	/*
>>>  	 * Make sure that we have allocated dquot(s) on disk.
>>>  	 */
>>> -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(),
>>> current_fsgid(), prid,
>>> +	error = xfs_qm_vop_dqalloc(dp,
>>> +			xfs_kuid_to_disk(current_fsuid()),
>>> +			xfs_kgid_to_disk(current_fsgid()), prid,
>>>  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
>>> &udqp, &gdqp); if (error)
>>>  		goto std_return;
>>> diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
>>> index 0176bb2..94f4f9f6 100644
>>> --- a/fs/xfs/xfs_vnodeops.c
>>> +++ b/fs/xfs/xfs_vnodeops.c
>>> @@ -515,7 +515,9 @@ xfs_create(
>>>  	/*
>>>  	 * Make sure that we have allocated dquot(s) on disk.
>>>  	 */
>>> -	error = xfs_qm_vop_dqalloc(dp, current_fsuid(),
>>> current_fsgid(), prid,
>>> +	error = xfs_qm_vop_dqalloc(dp,
>>> +			xfs_kuid_to_disk(current_fsuid()),
>>> +			xfs_kgid_to_disk(current_fsgid()), prid,
>>>  			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
>>> &udqp, &gdqp); if (error)
>>>  		return error;
>>> diff --git a/init/Kconfig b/init/Kconfig
>>> index 9d3a788..8083ffd 100644
>>> --- a/init/Kconfig
>>> +++ b/init/Kconfig
>>> @@ -1065,7 +1065,6 @@ config IPC_NS
>>>  
>>>  config USER_NS
>>>  	bool "User namespace"
>>> -	depends on UIDGID_CONVERTED
>>>  	select UIDGID_STRICT_TYPE_CHECKS
>>>  
>>>  	default n
>>> @@ -1099,21 +1098,9 @@ config NET_NS
>>>  
>>>  endif # NAMESPACES
>>>  
>>> -config UIDGID_CONVERTED
>>> -	# True if all of the selected software conmponents are
>>> known
>>> -	# to have uid_t and gid_t converted to kuid_t and kgid_t
>>> -	# where appropriate and are otherwise safe to use with
>>> -	# the user namespace.
>>> -	bool
>>> -	default y
>>> -
>>> -	# Filesystems
>>> -	depends on XFS_FS = n
>>> -
>>>  config UIDGID_STRICT_TYPE_CHECKS
>>>  	bool "Require conversions between uid/gids and their
>>> internal representation"
>>> -	depends on UIDGID_CONVERTED
>>> -	default n
>>> +	default y
>>>  	help
>>>  	 While the nececessary conversions are being added to all
>>> subsystems this option allows the code to continue to build for
>>> unconverted subsystems.
>>>
>>
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-24 13:10           ` [PATCH v2 RFC] " Dwight Engen
  2013-06-25 16:46             ` Brian Foster
@ 2013-06-26  2:09             ` Dave Chinner
  2013-06-26 21:30               ` Dwight Engen
                                 ` (8 more replies)
  2013-07-06  4:44             ` [PATCH 1/1] export inode_capable Serge Hallyn
  2013-07-08 13:09             ` [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate Serge Hallyn
  3 siblings, 9 replies; 46+ messages in thread
From: Dave Chinner @ 2013-06-26  2:09 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> Hi Dave, all. Here is a v2 patch that I believe addresses the previous
> comments, but I expect there to be more :) I think there are a few
> more issues to sort out before this is ready, and I want to add some
> tests to xfstests also.
> 
> I added permission checks for eofblocks in the ioctl code, but
> I don't think they are enough. Just because an unprivileged
> caller is in a group doesn't mean he can write to a file of that group,
> and I don't know how we can check for that till we get the inode in
> hand. Brian, if you or anyone else could comment on how this should work
> for the regular user and write() ENOSPC cases that'd be great.
> 
> The xfs code now uses inode->i_uid where possible instead of di_uid.
> The remaining uses of di_uid are where the inode is being setup,
> conversion to/from disk endianess, in dealing with quotas, and bulkstat.

Hi Dwight,

I haven't looked at the code in detail, I'll just mention that I
agree with Brian that we need to split this up into more patches.
The conversions are subtle and easy to misunderstand, so small
patches are good.  I'd say at minimum separate patches are needed
for:

	- permissions check on eof blocks
	- splitting eof blocks structure (I made comments on that to
	  Brian's posted patch)
	- introduce XFS conversion helpers
	- implement VFS-XFS boundary conversion using helpers
	- XFS ioctl conversions (maybe multiple patches)
	- final patch to modify kconfig once everything is done

> We do need to decide on the di_uid that comes back from bulkstat.
> Right now it is returning on disk (== init_user_ns) uids. It looks to
> me like xfsrestore is using the normal vfs routines (chown, fchown,
> lchown) when restoring so that won't line up if the xfsrestore is run
> in !init_user_ns.

Right. This is a major problem because there's no guarantee that you
are running restore in the same namespace as dump was run in. If you
take the dump in the init_user_ns, then you can't restore it from
inside the namespace container, and vice versa.

> We could possibly convert to userns values
> before returning them from the kernel, but I doubt that will work
> well with the xfs quotas.

Well, quotas are already converted to/from userns specific values
before they get to XFS. Including project quotas, which I think at
this point is wrong. We have no test cases for it, though, so I
can't validate that it actually works as it is supposed to and that
we don't break it inadvertently in the future.

[ I'm still waiting on Eric to provide any information or scripts
for how he tested this all worked when he did the conversions.... ]

> Should we just require that callers of bulkstat
> be in init_user_ns? Thoughts?

This is one of the reasons why I want Eric to give us some idea of
how this is supposed to work - exactly how is backup and restore
supposed to be managed on a shared filesystem that is segmented up
into multiple namespace containers? We can talk about the
implementation all we like, but none of us have a clue to the policy
decisions that users will make that we need to support. Until we
have a clear idea on what policies we are supposed to be supporting,
the implementation will be ambiguous and compromised.

e.g. If users are responsible for it, then bulkstat needs to filter based
on the current namespace. If management is responsible (i.e.
init_user_ns does backup/restore of ns-specific subtrees), then
bulkstat cannot filter and needs to reject calls from outside the
init_user_ns().

But if we have to allow both configurations - which I think we have to
because both cases are valid choices for a hosting provider to give
users - then how are we supposed to implement/handle this?

The same goes for the file handle interfaces - it's perfectly valid
for a user to run a userspace NFS server (e.g. ganesha) inside a
namespace container, but that will allow that process to provide
unchecked remote access to the entire underlying filesystem, not
just the namespace being exported. i.e. fs/export.c needs to be made
namespace aware in some way so that there is a kernel wide policy
for handle to file translations in namespace containers....

---

FWIW, one comment on the wrappers now that I've quickly browsed the
code:

> @@ -68,14 +68,15 @@ xfs_acl_from_disk(
>  
>  		switch (acl_e->e_tag) {
>  		case ACL_USER:
> +			acl_e->e_uid = xfs_kuid_from_disk(be32_to_cpu(ace->ae_id));
> +			break;
>  		case ACL_GROUP:
> -			acl_e->e_id = be32_to_cpu(ace->ae_id);
> +			acl_e->e_gid = xfs_kgid_from_disk(be32_to_cpu(ace->ae_id));

I know I suggested it, but I have to say, that does look a little
weird. Normally the to/from disk routines do endian conversion
internally, so perhaps the conversion routines could be better
named.

These:

	acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
	acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));

I think read a whole lot better, and the endian conversion now makes
sense as that's converting the on-disk value first...

> @@ -101,7 +102,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
>  		acl_e = &acl->a_entries[i];
>  
>  		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
> -		ace->ae_id = cpu_to_be32(acl_e->e_id);
> +		switch(acl_e->e_tag) {
> +		case ACL_USER:
> +			ace->ae_id = cpu_to_be32(xfs_kuid_to_disk(acl_e->e_uid));
> +			break;
> +		case ACL_GROUP:
> +			ace->ae_id = cpu_to_be32(xfs_kgid_to_disk(acl_e->e_gid));

and:
			ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
			ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));

Sorry for asking you to redo this - sometimes an idea I have doesn't
quite work out the first time :/

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-26  2:09             ` Dave Chinner
@ 2013-06-26 21:30               ` Dwight Engen
  2013-06-26 22:44                 ` Dave Chinner
  2013-06-28 14:23               ` Dwight Engen
                                 ` (7 subsequent siblings)
  8 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-26 21:30 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

On Wed, 26 Jun 2013 12:09:24 +1000
Dave Chinner <david@fromorbit.com> wrote:

> On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> > Hi Dave, all. Here is a v2 patch that I believe addresses the
> > previous comments, but I expect there to be more :) I think there
> > are a few more issues to sort out before this is ready, and I want
> > to add some tests to xfstests also.
> > 
> > I added permission checks for eofblocks in the ioctl code, but
> > I don't think they are enough. Just because an unprivileged
> > caller is in a group doesn't mean he can write to a file of that
> > group, and I don't know how we can check for that till we get the
> > inode in hand. Brian, if you or anyone else could comment on how
> > this should work for the regular user and write() ENOSPC cases
> > that'd be great.
> > 
> > The xfs code now uses inode->i_uid where possible instead of di_uid.
> > The remaining uses of di_uid are where the inode is being setup,
> > conversion to/from disk endianess, in dealing with quotas, and
> > bulkstat.
> 
> Hi Dwight,
> 
> I haven't looked at the code in detail, I'll just mention that I
> agree with Brain that we need to split this up into more patches.
> The conversions are subtle and easy to misunderstand, so small
> patches are good.  I'd say at minimum separate patches are needed
> for:
> 
> 	- permissions check on eof blocks
> 	- splitting eof blocks structure (I made comments on that to
> 	  Brain's posted patch)
> 	- introduce XFS conversion helpers
> 	- implement VFS-XFS boundary conversion using helpers
> 	- XFS ioctl conversions (maybe multiple patches)
> 	- final patch to modify kconfig once everything is done

Sure, I'll split it up and integrate the comments from the other
(eof blocks) thread as well.
 
> > We do need to decide on the di_uid that comes back from bulkstat.
> > Right now it is returning on disk (== init_user_ns) uids. It looks
> > to me like xfsrestore is using the normal vfs routines (chown,
> > fchown, lchown) when restoring so that won't line up if the
> > xfsrestore is run in !init_user_ns.
> 
> Right. This is a major problem because there's no guarantee that you
> are running restore in the same namespace as dump was run in. If you
> take the dump in the init_user_ns, then you can't restore it from
> inside the namespace container, and vice versa.

Yep. In thinking just a bit about this, assuming we did convert the
ids that came back from bulkstat, how is this much different than today
where you take a backup on one machine and try to restore it on
another? It seems like today the onus is on the user to ensure the uids
will align correctly. AFAICS tar --numeric-owner would have the same
issue, and it looks like man xfsrestore is getting at a similar
thing in the Quotas section.

> > We could possibly convert to userns values
> > before returning them from the kernel, but I doubt that will work
> > well with the xfs quotas.
> 
> Well, quotas are already converted to/from userns specific values
> before they get to XFS. Including project quotas, which I think at
> this point is wrong. We have no test cases for it, though, so I
> can't validate that it actually works as it is supposed to and that
> we don't break it inadvertantly in the future.
> 
> [ I'm still waiting on Eric to provide any information or scripts
> for how he tested this all worked when he did the conversions.... ]
> 
> > Should we just require that callers of bulkstat
> > be in init_user_ns? Thoughts?
> 
> This is one of the reasons why I want Eric to give us some idea of
> how this is supposed to work - exactly how is backup and restore
> supposed to be managed on a shared filesystem that is segmented up
> into multiple namespace containers? We can talk about the
> implementation all we like, but none of us have a clue to the policy
> decisions that users will make that we need to support. Until we
> have a clear idea on what policies we are supposed to be supporting,
> the implementation will be ambiguous and compromised.
> 
> e.g. If users are responsible for it, then bulkstat needs to filter
> based on the current namespace. If management is responsible (i.e.
> init_user_ns does backup/restore of ns-specific subtrees), then
> bulkstat cannot filter and needs to reject calls from outside the
> init_user_ns().

Maybe we can have bulkstat always filter based on if the caller
kuid_has_mapping(current_user_ns(), inode->i_uid)? That way a caller
from init_user_ns can see them all, but callers from inside a userns
will get a subset of inodes returned?

This doesn't solve the save from one userns, restore from a different
one use case, not sure if that was the scenario you were getting at. To
allow for this use case I guess we could have an "id offset" argument
for xfsrestore that gets applied before chown() but that seems icky.

> But if we have to allow both configurations - which I think we have to
> because both cases are valid choices for a hosting provider to give
> users - then how are we supposed to implement/handle this?
> 
> The same goes for the file handle interfaces - it's perfectly valid
> for a user to run a userspace NFS server (e.g. ganesha) inside a
> namespace container, but that will allow that process to provide
> unchecked remote access to the entire underlying filesystem, not
> just the namespace being exported. i.e. fs/export.c needs to be made
> namespace aware in some way so that there is a kernel wide policy
> for handle to file translations in namespace containers....

I haven't looked at the handle stuff, I'll take a look and get
familiarized.

> ---
> 
> FWIW, one comment on the wrappers now that I've quickly browsed the
> code:
> 
> > @@ -68,14 +68,15 @@ xfs_acl_from_disk(
> >  
> >  		switch (acl_e->e_tag) {
> >  		case ACL_USER:
> > +			acl_e->e_uid =
> > xfs_kuid_from_disk(be32_to_cpu(ace->ae_id));
> > +			break;
> >  		case ACL_GROUP:
> > -			acl_e->e_id = be32_to_cpu(ace->ae_id);
> > +			acl_e->e_gid =
> > xfs_kgid_from_disk(be32_to_cpu(ace->ae_id));
> 
> I know I suggested it, but I have to say, that does look a little
> weird. Normally the to/from disk routines do endian conversion
> internally, so perhaps the conversion routines coul dbe better
> named.
> 
> These:
> 
> 	acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
> 	acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
> 
> I think read a whole lot better, and the endian conversion now makes
> sense as that's converting the on-disk value first...
> 
> > @@ -101,7 +102,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const
> > struct posix_acl *acl) acl_e = &acl->a_entries[i];
> >  
> >  		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
> > -		ace->ae_id = cpu_to_be32(acl_e->e_id);
> > +		switch(acl_e->e_tag) {
> > +		case ACL_USER:
> > +			ace->ae_id =
> > cpu_to_be32(xfs_kuid_to_disk(acl_e->e_uid));
> > +			break;
> > +		case ACL_GROUP:
> > +			ace->ae_id =
> > cpu_to_be32(xfs_kgid_to_disk(acl_e->e_gid));
> 
> and:
> 			ace->ae_id =
> cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); ace->ae_id =
> cpu_to_be32(xfs_kgid_to_gid(acl_e->e_uid));
> 
> Sorry for asking you to redo this - sometimes an idea I have doesn't
> quite work out the first time :/

Heh, no problem, I agree these names are even better. Makes me wonder
if the return type of xfs_k[ug]id_to_[ug]id should be [ug]id_t instead
of __uint32_t? I'll use these new names in a split out v3 series to
follow. Thanks.

> Cheers,
> 
> Dave.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-26 21:30               ` Dwight Engen
@ 2013-06-26 22:44                 ` Dave Chinner
  2013-06-27 13:02                   ` Serge Hallyn
  2013-06-27 20:57                   ` Ben Myers
  0 siblings, 2 replies; 46+ messages in thread
From: Dave Chinner @ 2013-06-26 22:44 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> On Wed, 26 Jun 2013 12:09:24 +1000
> Dave Chinner <david@fromorbit.com> wrote:
> 
> > On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> > > Hi Dave, all. Here is a v2 patch that I believe addresses the
> > > previous comments, but I expect there to be more :) I think there
> > > are a few more issues to sort out before this is ready, and I want
> > > to add some tests to xfstests also.
> > > 
> > > I added permission checks for eofblocks in the ioctl code, but
> > > I don't think they are enough. Just because an unprivileged
> > > caller is in a group doesn't mean he can write to a file of that
> > > group, and I don't know how we can check for that till we get the
> > > inode in hand. Brian, if you or anyone else could comment on how
> > > this should work for the regular user and write() ENOSPC cases
> > > that'd be great.
> > > 
> > > The xfs code now uses inode->i_uid where possible instead of di_uid.
> > > The remaining uses of di_uid are where the inode is being setup,
> > > conversion to/from disk endianess, in dealing with quotas, and
> > > bulkstat.
> > 
> > Hi Dwight,
> > 
> > I haven't looked at the code in detail, I'll just mention that I
> > agree with Brain that we need to split this up into more patches.
> > The conversions are subtle and easy to misunderstand, so small
> > patches are good.  I'd say at minimum separate patches are needed
> > for:
> > 
> > 	- permissions check on eof blocks
> > 	- splitting eof blocks structure (I made comments on that to
> > 	  Brain's posted patch)
> > 	- introduce XFS conversion helpers
> > 	- implement VFS-XFS boundary conversion using helpers
> > 	- XFS ioctl conversions (maybe multiple patches)
> > 	- final patch to modify kconfig once everything is done
> 
> Sure, I'll split it up and integrate the comments from the other
> (eof blocks) thread as well.
>  
> > > We do need to decide on the di_uid that comes back from bulkstat.
> > > Right now it is returning on disk (== init_user_ns) uids. It looks
> > > to me like xfsrestore is using the normal vfs routines (chown,
> > > fchown, lchown) when restoring so that won't line up if the
> > > xfsrestore is run in !init_user_ns.
> > 
> > Right. This is a major problem because there's no guarantee that you
> > are running restore in the same namespace as dump was run in. If you
> > take the dump in the init_user_ns, then you can't restore it from
> > inside the namespace container, and vice versa.
> 
> Yep. In thinking just a bit about this, assuming we did convert the
> ids that came back from bulkstat, how is this much different than today
> where you take a backup on one machine and try to restore it on
> another?

Yes, it's similar, but subtly different. One machine to another is
dumping from one filesystem and restoring to another. They are
physically separate. This case is dump/restore from/to the *same*
filesystem, just with a different namespace filter in place...

> It seems like today the onus is on the user to ensure the uids
> will align correctly. AFAICS tar --numeric-owner would have the same
> issue, and it looks like man xfsrestore is getting at a similar
> thing in the Quotas section.

Right, but we are restoring to the same filesystem, just in a
different namespace....

Hence the question of *policy* here. It's completely undefined -
we've been given no guidance on how this stuff is supposed to
behave, and there is no documentation to refer to that tells us what
should happen. Until we have a definitive "it should behave like X"
statement from an expert (like Eric), we don't have a solid basis
for implementing one behaviour or the other (or allowing both). This
forms part of the user ABI, so I want to know we're doing the right
thing for the right reasons before fixing it in stone...

> > > Should we just require that callers of bulkstat
> > > be in init_user_ns? Thoughts?
> > 
> > This is one of the reasons why I want Eric to give us some idea of
> > how this is supposed to work - exactly how is backup and restore
> > supposed to be managed on a shared filesystem that is segmented up
> > into multiple namespace containers? We can talk about the
> > implementation all we like, but none of us have a clue to the policy
> > decisions that users will make that we need to support. Until we
> > have a clear idea on what policies we are supposed to be supporting,
> > the implementation will be ambiguous and compromised.
> > 
> > e.g. If users are responsible for it, then bulkstat needs to filter
> > based on the current namespace. If management is responsible (i.e.
> > init_user_ns does backup/restore of ns-specific subtrees), then
> > bulkstat cannot filter and needs to reject calls from outside the
> > init_user_ns().
> 
> Maybe we can have bulkstat always filter based on if the caller
> kuid_has_mapping(current_user_ns(), inode->i_uid)? That way a caller
> from init_user_ns can see them all, but callers from inside a userns
> will get a subset of inodes returned?

We could do that, though it means bulkstat is going to be a *lot
slower* when called from within a user namespace environment. A
namespace might only have a few thousand files for backup, yet the
underlying filesystem might have tens of millions of inodes in it.
The bulkstat call now has to walk all of the inodes just to find the
few thousand that match the filter. And multiply that by the number
of namespaces all doing backups at 3am in the morning and you start
to get an idea of the scope of the problem....


> This doesn't solve the save from one uesrns, restore from a different
> one use case, not sure if that was the scenario you were getting at. To
> allow for this use case I guess we could have an "id offset" argument
> for xfsrestore that gets applied before chown() but that seems icky.

And it means that xfsrestore has to be made aware of namespaces,
which is something I'd prefer not to have to do.

....

> > cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); ace->ae_id =
> > cpu_to_be32(xfs_kgid_to_gid(acl_e->e_uid));
> > 
> > Sorry for asking you to redo this - sometimes an idea I have doesn't
> > quite work out the first time :/
> 
> Heh, no problem, I agree these names are even better. Makes me wonder
> if the return type of xfs_k[ug]id_to_[ug]id should be [ug]id_t instead
> of __uint32_t? I'll use these new names in a split out v3 series to
> follow. Thanks.

The type in the struct xfs_icdinode is __uint32_t, so I think it
should match that.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-26 22:44                 ` Dave Chinner
@ 2013-06-27 13:02                   ` Serge Hallyn
  2013-06-28  1:54                     ` Dave Chinner
  2013-06-27 20:57                   ` Ben Myers
  1 sibling, 1 reply; 46+ messages in thread
From: Serge Hallyn @ 2013-06-27 13:02 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Eric W. Biederman, Brian Foster, Dwight Engen, xfs

Quoting Dave Chinner (david@fromorbit.com):
> On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > On Wed, 26 Jun 2013 12:09:24 +1000
> > Dave Chinner <david@fromorbit.com> wrote:
> > > > We do need to decide on the di_uid that comes back from bulkstat.
> > > > Right now it is returning on disk (== init_user_ns) uids. It looks
> > > > to me like xfsrestore is using the normal vfs routines (chown,

I might not be helpful here, (as despite having used xfs for years
I've not used these features) but feel like I should try based on
what I see in the manpages.  Here is my understanding:

Assume you're a task in a child userns, where you have host uids
100000-110000 mapped to container uids 0-10000,

1. bulkstat is an xfs_ioctl command, right?  It should return the mapped
uids (0-10000).

2. xfsdump should store the uids as seen in the caller's namespace.  If
xfsdump is done from the container, the dump should show uids 0-10000.

3. xfsrestore should be run from the desired namespace.  If you did
xfsdump from the host ns, you should then xfsrestore from the host ns.
Then inside the container those uids (100000-110000) will be mapped
to your uids (0-10000).

4. If you xfsdump in this container, then xfsrestore in another
container where you have 200000-210000 mapped to 0-10000, the dump
image will have uids 0-10000.  The restored image will have container
uids 0-10000, while on the underlying host media it will be uids
200000-210000.

5. If you xfsdump in this container then xfsrestore on the host, then
the host uids 0-10000 will be used on the underlying media.  The
container would be unable to read these files as the uids do not map
into the container.

-serge

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-26 22:44                 ` Dave Chinner
  2013-06-27 13:02                   ` Serge Hallyn
@ 2013-06-27 20:57                   ` Ben Myers
  2013-06-28  1:46                     ` Dave Chinner
  1 sibling, 1 reply; 46+ messages in thread
From: Ben Myers @ 2013-06-27 20:57 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Eric W. Biederman, Brian Foster, Serge Hallyn, Dwight Engen, xfs

Hey,

On Thu, Jun 27, 2013 at 08:44:10AM +1000, Dave Chinner wrote:
> On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > On Wed, 26 Jun 2013 12:09:24 +1000
> > Dave Chinner <david@fromorbit.com> wrote:
> > > On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> > > > Should we just require that callers of bulkstat
> > > > be in init_user_ns? Thoughts?
> > > 
> > > This is one of the reasons why I want Eric to give us some idea of
> > > how this is supposed to work - exactly how is backup and restore
> > > supposed to be managed on a shared filesystem that is segmented up
> > > into multiple namespace containers? We can talk about the
> > > implementation all we like, but none of us have a clue to the policy
> > > decisions that users will make that we need to support. Until we
> > > have a clear idea on what policies we are supposed to be supporting,
> > > the implementation will be ambiguous and compromised.
> > > 
> > > e.g. If users are responsible for it, then bulkstat needs to filter
> > > based on the current namespace. If management is responsible (i.e.
> > > init_user_ns does backup/restore of ns-specific subtrees), then
> > > bulkstat cannot filter and needs to reject calls from outside the
> > > init_user_ns().
> > 
> > Maybe we can have bulkstat always filter based on if the caller
> > kuid_has_mapping(current_user_ns(), inode->i_uid)? That way a caller
> > from init_user_ns can see them all, but callers from inside a userns
> > will get a subset of inodes returned?
> 
> We could do that, though it means bulkstat is going to be a *lot
> slower* when called from within a user namespace environment. A
> namespace might only have a few thousand files for backup, yet the
> underlying filesystem might have tens of millions of inodes in it.
> The bulkstat call now has to walk all of the inodes just to find the
> few thousand that match the filter. And multiply that by the number
> of namespaces all doing backups at 3am in the morning and you start
> to get an idea of the scope of the problem....

Ugh.  That really doesn't map well onto bulkstat.  If we wanted bulkstat to
work well with namespaces, we might have to teach the filesystem a bit more
about them in order to create the required indices per namespace.  While a
filter might get the job done in a pinch, wouldn't you really rather have an
inobt?  ;)

To build that inobt you'd have to know whether a given directory was the root
of a new namespace.  Maybe implementable as some kind of flag, 'everything
below this dir is part of its own namespace, put it in this inobt'.  And then
you'd have to have a way for bulkstat to know to look there, e.g. if the caller
is not in init_user_ns and if the initial inode had the flag, use the inobt on
that initial inode for bulkstat instead of the regular inobts.  Crazy.  Could
be done.

Initially, requiring bulkstat callers to be in init_user_ns is ok.  It just
doesn't suit everyone's needs..

-Ben

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-27 20:57                   ` Ben Myers
@ 2013-06-28  1:46                     ` Dave Chinner
  2013-06-28 15:15                       ` Serge Hallyn
  0 siblings, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2013-06-28  1:46 UTC (permalink / raw)
  To: Ben Myers
  Cc: Eric W. Biederman, Brian Foster, Serge Hallyn, Dwight Engen, xfs

On Thu, Jun 27, 2013 at 03:57:58PM -0500, Ben Myers wrote:
> Hey,
> 
> On Thu, Jun 27, 2013 at 08:44:10AM +1000, Dave Chinner wrote:
> > On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > > On Wed, 26 Jun 2013 12:09:24 +1000
> > > Dave Chinner <david@fromorbit.com> wrote:
> > > > On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> > > > > Should we just require that callers of bulkstat
> > > > > be in init_user_ns? Thoughts?
> > > > 
> > > > This is one of the reasons why I want Eric to give us some idea of
> > > > how this is supposed to work - exactly how is backup and restore
> > > > supposed to be managed on a shared filesystem that is segmented up
> > > > into multiple namespace containers? We can talk about the
> > > > implementation all we like, but none of us have a clue to the policy
> > > > decisions that users will make that we need to support. Until we
> > > > have a clear idea on what policies we are supposed to be supporting,
> > > > the implementation will be ambiguous and compromised.
> > > > 
> > > > e.g. If users are responsible for it, then bulkstat needs to filter
> > > > based on the current namespace. If management is responsible (i.e.
> > > > init_user_ns does backup/restore of ns-specific subtrees), then
> > > > bulkstat cannot filter and needs to reject calls from outside the
> > > > init_user_ns().
> > > 
> > > Maybe we can have bulkstat always filter based on if the caller
> > > kuid_has_mapping(current_user_ns(), inode->i_uid)? That way a caller
> > > from init_user_ns can see them all, but callers from inside a userns
> > > will get a subset of inodes returned?
> > 
> > We could do that, though it means bulkstat is going to be a *lot
> > slower* when called from within a user namespace environment. A
> > namespace might only have a few thousand files for backup, yet the
> > underlying filesystem might have tens of millions of inodes in it.
> > The bulkstat call now has to walk all of the inodes just to find the
> > few thousand that match the filter. And multiply that by the number
> > of namespaces all doing backups at 3am in the morning and you start
> > to get an idea of the scope of the problem....
> 
> Ugh.  That really doesn't map well onto bulkstat.  If we wanted bulkstat to
> work well with namespaces, we might have to teach the filesystem a bit more
> about them in order to create the required indices per namespace.  While a
> filter might get the job done in a pinch, wouldn't you really rather have an
> inobt?  ;)

Absolutely not. :/

Filesystems can be bind mounted into multiple namespaces, you can
hard link across namespace boundaries, you can do all sorts of
things that result in inodes being shared between namespaces. You
can't have a per-namespace inobt when you can do this sort of thing
that the underlying filesystem may not even be aware of. Hell, you
can have the init namespace manipulate files for the user namespace,
and those manipulations aren't even aware they are happening inside
a namespace.

That doesn't even begin to touch on the major problems it introduces
into the on-disk format. e.g. how do you find, manage and validate
arbitrarily rooted allocated inode btrees? What AG do you put them
in? What happens when you have inodes in multiple AGs in a single
namespace? One tree per AG per namespace? What happens when you have
10000 namespaces and 1000 AGs?  How do we find the right inobt(s)
when we do an allocation - they aren't in the AGI anymore? How do we
walk them on mount after an unclean shutdown? How do we allocate and
remove trees? What the hell is repair supposed to do with
corrupt/lost inode btrees?

It's a rats nest, and it doesn't solve the basic problem of how
utilities that use bulkstat are supposed to behave.

> To build that inobt you'd have to know whether a given directory was the root
> of a new namespace.  Maybe implementable as some kind of flag, 'everything
> below this dir is part of its own namespace, put it in this inobt'.  And then
> you'd have to have a way for bulkstat to know to look there, e.g. if the caller
> is not in init_user_ns and if the initial inode had the flag, use the inobt on
> that initial inode for bulkstat instead of the regular inobts.  Crazy.  Could
> be done.

And I could fly to the moon, too. But like per-namespace inode
btrees, I don't see that ever happening either...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-27 13:02                   ` Serge Hallyn
@ 2013-06-28  1:54                     ` Dave Chinner
  2013-06-28 15:25                       ` Serge Hallyn
  0 siblings, 1 reply; 46+ messages in thread
From: Dave Chinner @ 2013-06-28  1:54 UTC (permalink / raw)
  To: Serge Hallyn; +Cc: Eric W. Biederman, Brian Foster, Dwight Engen, xfs

On Thu, Jun 27, 2013 at 08:02:05AM -0500, Serge Hallyn wrote:
> Quoting Dave Chinner (david@fromorbit.com):
> > On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > > On Wed, 26 Jun 2013 12:09:24 +1000
> > > Dave Chinner <david@fromorbit.com> wrote:
> > > > > We do need to decide on the di_uid that comes back from bulkstat.
> > > > > Right now it is returning on disk (== init_user_ns) uids. It looks
> > > > > to me like xfsrestore is using the normal vfs routines (chown,
> 
> I might not be helpful here, (as despite having used xfs for years
> I've not used these features) but feel like I should try based on
> what I see in the manpages.  Here is my understanding:
> 
> Assume you're a task in a child userns, where you have host uids
> 100000-110000 mapped to container uids 0-10000,
> 
> 1. bulkstat is an xfs_ioctl command, right?  It should return the mapped
> uids (0-10000).
> 
> 2. xfsdump should store the uids as seen in the caller's namespace.  If
> xfsdump is done from the container, the dump should show uids 0-10000.

So when run from within a namespace, it should filter and return
only inodes that match the uids/gids mapped into the namespace?
That can be done, it's just a rather inefficient use of bulkstat
(which is primarily there for efficiency reasons).

Here's a corner case. Say I download a tarball from somewhere that
has uids/gids inside it, and when I untar it it creates uids/gids
outside the namespace's mapped range of [0-10000]. What happens then?
What uids do we end up on disk, and how do we ensure that the
bulkstat filter still returns those inodes?

> 3. xfsrestore should be run from the desired namespace.  If you did
> xfsdump from the host ns, you should then xfsrestore from the host ns.
> Then inside the container those uids (100000-110000) will be mapped
> to your uids (0-10000).
> 
> 4. If you xfsdump in this container, then xfsrestore in another
> container where you have 200000-210000 mapped to 0-10000, the dump
> image will have uids 0-10000.  The restored image will have container
> uids 0-10000, while on the underlying host media it will be uids
> 200000-210000.
> 
> 5. If you xfsdump in this container then xfsrestore on the host, then
> the host uids 0-10000 will be used on the underlying media.  The
> container would be unable to read these files as the uids do not map
> into the container.

Yes, that follows from 1+2. We'll need some documentation in
the dump/restore man pages for this, and I'd suggest that the
namespace documentation/man pages get this sort of treatment, too.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-26  2:09             ` Dave Chinner
  2013-06-26 21:30               ` Dwight Engen
@ 2013-06-28 14:23               ` Dwight Engen
  2013-06-28 15:11               ` [PATCH v3 0/6] " Dwight Engen
                                 ` (6 subsequent siblings)
  8 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 14:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

On Wed, 26 Jun 2013 12:09:24 +1000
Dave Chinner <david@fromorbit.com> wrote:

> On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> > Hi Dave, all. Here is a v2 patch that I believe addresses the
> > previous comments, but I expect there to be more :) I think there
> > are a few more issues to sort out before this is ready, and I want
> > to add some tests to xfstests also.
> > 
> > I added permission checks for eofblocks in the ioctl code, but
> > I don't think they are enough. Just because an unprivileged
> > caller is in a group doesn't mean he can write to a file of that
> > group, and I don't know how we can check for that till we get the
> > inode in hand. Brian, if you or anyone else could comment on how
> > this should work for the regular user and write() ENOSPC cases
> > that'd be great.
> > 
> > The xfs code now uses inode->i_uid where possible instead of di_uid.
> > The remaining uses of di_uid are where the inode is being setup,
> > conversion to/from disk endianness, in dealing with quotas, and
> > bulkstat.
> 
> Hi Dwight,
> 
> I haven't looked at the code in detail, I'll just mention that I
> agree with Brian that we need to split this up into more patches.
> The conversions are subtle and easy to misunderstand, so small
> patches are good.  I'd say at minimum separate patches are needed
> for:
> 
> 	- permissions check on eof blocks
> 	- splitting eof blocks structure (I made comments on that to
> 	  Brian's posted patch)
> 	- introduce XFS conversion helpers
> 	- implement VFS-XFS boundary conversion using helpers
> 	- XFS ioctl conversions (maybe multiple patches)
> 	- final patch to modify kconfig once everything is done
> 
> > We do need to decide on the di_uid that comes back from bulkstat.
> > Right now it is returning on disk (== init_user_ns) uids. It looks
> > to me like xfsrestore is using the normal vfs routines (chown,
> > fchown, lchown) when restoring so that won't line up if the
> > xfsrestore is run in !init_user_ns.
> 
> Right. This is a major problem because there's no guarantee that you
> are running restore in the same namespace as dump was run in. If you
> take the dump in the init_user_ns, then you can't restore it from
> inside the namespace container, and vice versa.
> 
> > We could possibly convert to userns values
> > before returning them from the kernel, but I doubt that will work
> > well with the xfs quotas.
> 
> Well, quotas are already converted to/from userns specific values
> before they get to XFS. Including project quotas, which I think at
> this point is wrong. We have no test cases for it, though, so I
> can't validate that it actually works as it is supposed to and that
> we don't break it inadvertently in the future.
> 
> [ I'm still waiting on Eric to provide any information or scripts
> for how he tested this all worked when he did the conversions.... ]
> 
> > Should we just require that callers of bulkstat
> > be in init_user_ns? Thoughts?
> 
> This is one of the reasons why I want Eric to give us some idea of
> how this is supposed to work - exactly how is backup and restore
> supposed to be managed on a shared filesystem that is segmented up
> into multiple namespace containers? We can talk about the
> implementation all we like, but none of us have a clue to the policy
> decisions that users will make that we need to support. Until we
> have a clear idea on what policies we are supposed to be supporting,
> the implementation will be ambiguous and compromised.
> 
> e.g. If users are responsible for it, then bulkstat needs to filter
> based on the current namespace. If management is responsible (i.e.
> init_user_ns does backup/restore of ns-specific subtrees), then
> bulkstat cannot filter and needs to reject calls from outside the
> init_user_ns().

Hi, in thinking about this a bit more, I'm not sure why bulkstat should
be any different than stat(2). If you stat a file not covered by your
current_user_ns(), it doesn't "filter" and return ENOENT, it just
returns it as overflowuid (65534). Filtering bulkstat also has other
(performance) problems as you pointed out, and we cannot easily filter
XFS_IOC_FSINUMBERS anyway.

I don't think the user namespace is intended to subpartition the
set of inodes available from a filesystem, but only to remap the id
values. It then seems like the only policy that can reliably be
supported today is to backup/restore from init_user_ns, or if you want
to backup/restore in a sub user ns, you must know ahead of time that you
will not encounter ids outside your mapping in the fs/backup media.

> But if we have to allow both configurations - which I think we have to
> because both cases are valid choices for a hosting provider to give
> users - then how are we supposed to implement/handle this?
> 
> The same goes for the file handle interfaces - it's perfectly valid
> for a user to run a userspace NFS server (e.g. ganesha) inside a
> namespace container, but that will allow that process to provide
> unchecked remote access to the entire underlying filesystem, not
> just the namespace being exported. i.e. fs/export.c needs to be made
> namespace aware in some way so that there is a kernel wide policy
> for handle to file translations in namespace containers....

So I looked at open_by_handle_at(2) and it looks to me like it is no
different than regular open(2) wrt the user namespace checks. The code
paths become common at path_openat(). Note that there is no check for
kuid_has_mapping(current_user_ns(), inode->i_uid) so an inode doesn't
have to be covered by the current user ns in order to get it opened,
the caller just has to pass the inode_permission() checks. I guess I
don't see the "unchecked" part, could you elaborate on that?

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH v3 0/6] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-26  2:09             ` Dave Chinner
  2013-06-26 21:30               ` Dwight Engen
  2013-06-28 14:23               ` Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 15:11               ` [PATCH 1/6] create wrappers for converting kuid_t to/from uid_t Dwight Engen
                                 ` (5 subsequent siblings)
  8 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Hi guys,

Here is the split out patch set with new conversion wrapper names,
eofblocks conversion, etc. I did not include a patch to mark xfs as
UIDGID_CONVERTED yet since we still need to decide on bulkstat. Thanks!

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH 1/6] create wrappers for converting kuid_t to/from uid_t
  2013-06-26  2:09             ` Dave Chinner
                                 ` (2 preceding siblings ...)
  2013-06-28 15:11               ` [PATCH v3 0/6] " Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 15:11               ` [PATCH 2/6] convert kuid_t to/from uid_t in ACLs Dwight Engen
                                 ` (4 subsequent siblings)
  8 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_linux.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 800f896..761e4c0 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -159,6 +159,32 @@
 #define MAX(a,b)	(max(a,b))
 #define howmany(x, y)	(((x)+((y)-1))/(y))
 
+/* Kernel uid/gid conversion. These are used to convert to/from the on disk
+ * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
+ * The conversion here is type only, the value will remain the same since we
+ * are converting to the init_user_ns. The uid is later mapped to a particular
+ * user namespace value when crossing the kernel/user boundary.
+ */
+static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
+{
+	return from_kuid(&init_user_ns, uid);
+}
+
+static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
+{
+	return make_kuid(&init_user_ns, uid);
+}
+
+static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
+{
+	return from_kgid(&init_user_ns, gid);
+}
+
+static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
+{
+	return make_kgid(&init_user_ns, gid);
+}
+
 /*
  * Various platform dependent calls that don't fit anywhere else
  */
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 2/6] convert kuid_t to/from uid_t in ACLs
  2013-06-26  2:09             ` Dave Chinner
                                 ` (3 preceding siblings ...)
  2013-06-28 15:11               ` [PATCH 1/6] create wrappers for converting kuid_t to/from uid_t Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 15:11               ` [PATCH 3/6] ioctl: check for capabilities in the current user namespace Dwight Engen
                                 ` (3 subsequent siblings)
  8 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Change permission check for setting ACL to use inode_owner_or_capable()
which will additionally allow a CAP_FOWNER user in a user namespace to
be able to set an ACL on an inode covered by the user namespace mapping.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_acl.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 306d883..11a91d6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -68,14 +68,15 @@ xfs_acl_from_disk(
 
 		switch (acl_e->e_tag) {
 		case ACL_USER:
+			acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+			break;
 		case ACL_GROUP:
-			acl_e->e_id = be32_to_cpu(ace->ae_id);
+			acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
 			break;
 		case ACL_USER_OBJ:
 		case ACL_GROUP_OBJ:
 		case ACL_MASK:
 		case ACL_OTHER:
-			acl_e->e_id = ACL_UNDEFINED_ID;
 			break;
 		default:
 			goto fail;
@@ -101,7 +102,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
 		acl_e = &acl->a_entries[i];
 
 		ace->ae_tag = cpu_to_be32(acl_e->e_tag);
-		ace->ae_id = cpu_to_be32(acl_e->e_id);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+			break;
+		case ACL_GROUP:
+			ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+			break;
+		default:
+			ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
+			break;
+		}
+
 		ace->ae_perm = cpu_to_be16(acl_e->e_perm);
 	}
 }
@@ -360,7 +372,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 		return -EINVAL;
 	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
 		return value ? -EACCES : 0;
-	if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 
 	if (!value)
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 3/6] ioctl: check for capabilities in the current user namespace
  2013-06-26  2:09             ` Dave Chinner
                                 ` (4 preceding siblings ...)
  2013-06-28 15:11               ` [PATCH 2/6] convert kuid_t to/from uid_t in ACLs Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 15:11               ` [PATCH 4/6] convert kuid_t to/from uid_t for xfs internal structures Dwight Engen
                                 ` (2 subsequent siblings)
  8 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Use inode_capable() to check if SUID|SGID bits should be cleared to match
similar check in inode_change_ok().

The check for CAP_LINUX_IMMUTABLE was not modified since all other file
systems also check against init_user_ns rather than current_user_ns.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5e99968..bedf510 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -981,7 +981,7 @@ xfs_ioctl_setattr(
 	 * to the file owner ID, except in cases where the
 	 * CAP_FSETID capability is applicable.
 	 */
-	if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+	if (!inode_owner_or_capable(VFS_I(ip))) {
 		code = XFS_ERROR(EPERM);
 		goto error_return;
 	}
@@ -1103,7 +1103,7 @@ xfs_ioctl_setattr(
 		 * cleared upon successful return from chown()
 		 */
 		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-		    !capable(CAP_FSETID))
+		    !inode_capable(VFS_I(ip), CAP_FSETID))
 			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 
 		/*
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 4/6] convert kuid_t to/from uid_t for xfs internal structures
  2013-06-26  2:09             ` Dave Chinner
                                 ` (5 preceding siblings ...)
  2013-06-28 15:11               ` [PATCH 3/6] ioctl: check for capabilities in the current user namespace Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 15:11               ` [PATCH 5/6] create internal eofblocks structure with kuid_t types Dwight Engen
  2013-06-28 15:11               ` [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match Dwight Engen
  8 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Use uint32 from init_user_ns for xfs internal uid/gid
representation in xfs_icdinode, xfs_dqid_t.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_inode.c    |  6 +++---
 fs/xfs/xfs_iops.c     | 38 ++++++++++++++++++++------------------
 fs/xfs/xfs_qm.c       | 10 +++++-----
 fs/xfs/xfs_quota.h    |  9 +++++----
 fs/xfs/xfs_symlink.c  |  4 +++-
 fs/xfs/xfs_vnodeops.c |  4 +++-
 6 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9ecfe1e..e9acd2d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1288,8 +1288,8 @@ xfs_ialloc(
 	ip->i_d.di_onlink = 0;
 	ip->i_d.di_nlink = nlink;
 	ASSERT(ip->i_d.di_nlink == nlink);
-	ip->i_d.di_uid = current_fsuid();
-	ip->i_d.di_gid = current_fsgid();
+	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
+	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
 	xfs_set_projid(ip, prid);
 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 
@@ -1328,7 +1328,7 @@ xfs_ialloc(
 	 */
 	if ((irix_sgid_inherit) &&
 	    (ip->i_d.di_mode & S_ISGID) &&
-	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
+	    (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
 		ip->i_d.di_mode &= ~S_ISGID;
 	}
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c69bbc4..6f6fb17 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -420,8 +420,8 @@ xfs_vn_getattr(
 	stat->dev = inode->i_sb->s_dev;
 	stat->mode = ip->i_d.di_mode;
 	stat->nlink = ip->i_d.di_nlink;
-	stat->uid = ip->i_d.di_uid;
-	stat->gid = ip->i_d.di_gid;
+	stat->uid = inode->i_uid;
+	stat->gid = inode->i_gid;
 	stat->ino = ip->i_ino;
 	stat->atime = inode->i_atime;
 	stat->mtime = inode->i_mtime;
@@ -488,8 +488,8 @@ xfs_setattr_nonsize(
 	int			mask = iattr->ia_valid;
 	xfs_trans_t		*tp;
 	int			error;
-	uid_t			uid = 0, iuid = 0;
-	gid_t			gid = 0, igid = 0;
+	kuid_t			uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
+	kgid_t			gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
 	struct xfs_dquot	*udqp = NULL, *gdqp = NULL;
 	struct xfs_dquot	*olddquot1 = NULL, *olddquot2 = NULL;
 
@@ -522,13 +522,13 @@ xfs_setattr_nonsize(
 			uid = iattr->ia_uid;
 			qflags |= XFS_QMOPT_UQUOTA;
 		} else {
-			uid = ip->i_d.di_uid;
+			uid = inode->i_uid;
 		}
 		if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
 			gid = iattr->ia_gid;
 			qflags |= XFS_QMOPT_GQUOTA;
 		}  else {
-			gid = ip->i_d.di_gid;
+			gid = inode->i_gid;
 		}
 
 		/*
@@ -538,8 +538,10 @@ xfs_setattr_nonsize(
 		 */
 		ASSERT(udqp == NULL);
 		ASSERT(gdqp == NULL);
-		error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
-					 qflags, &udqp, &gdqp);
+		error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
+					   xfs_kgid_to_gid(gid),
+					   xfs_get_projid(ip),
+					   qflags, &udqp, &gdqp);
 		if (error)
 			return error;
 	}
@@ -561,8 +563,8 @@ xfs_setattr_nonsize(
 		 * while we didn't have the inode locked, inode's dquot(s)
 		 * would have changed also.
 		 */
-		iuid = ip->i_d.di_uid;
-		igid = ip->i_d.di_gid;
+		iuid = inode->i_uid;
+		igid = inode->i_gid;
 		gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
 		uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 
@@ -571,8 +573,8 @@ xfs_setattr_nonsize(
 		 * going to change.
 		 */
 		if (XFS_IS_QUOTA_RUNNING(mp) &&
-		    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-		     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+		    ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
+		     (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
 			ASSERT(tp);
 			error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 						capable(CAP_FOWNER) ?
@@ -602,17 +604,17 @@ xfs_setattr_nonsize(
 		 * Change the ownerships and register quota modifications
 		 * in the transaction.
 		 */
-		if (iuid != uid) {
+		if (!uid_eq(iuid, uid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 				ASSERT(mask & ATTR_UID);
 				ASSERT(udqp);
 				olddquot1 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_udquot, udqp);
 			}
-			ip->i_d.di_uid = uid;
+			ip->i_d.di_uid = xfs_kuid_to_uid(uid);
 			inode->i_uid = uid;
 		}
-		if (igid != gid) {
+		if (!gid_eq(igid, gid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 				ASSERT(!XFS_IS_PQUOTA_ON(mp));
 				ASSERT(mask & ATTR_GID);
@@ -620,7 +622,7 @@ xfs_setattr_nonsize(
 				olddquot2 = xfs_qm_vop_chown(tp, ip,
 							&ip->i_gdquot, gdqp);
 			}
-			ip->i_d.di_gid = gid;
+			ip->i_d.di_gid = xfs_kgid_to_gid(gid);
 			inode->i_gid = gid;
 		}
 	}
@@ -1173,8 +1175,8 @@ xfs_setup_inode(
 
 	inode->i_mode	= ip->i_d.di_mode;
 	set_nlink(inode, ip->i_d.di_nlink);
-	inode->i_uid	= ip->i_d.di_uid;
-	inode->i_gid	= ip->i_d.di_gid;
+	inode->i_uid    = xfs_uid_to_kuid(ip->i_d.di_uid);
+	inode->i_gid    = xfs_gid_to_kgid(ip->i_d.di_gid);
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFBLK:
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b75c9bb..57e2c18 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1651,8 +1651,8 @@ xfs_qm_write_sb_changes(
 int
 xfs_qm_vop_dqalloc(
 	struct xfs_inode	*ip,
-	uid_t			uid,
-	gid_t			gid,
+	xfs_dqid_t		uid,
+	xfs_dqid_t		gid,
 	prid_t			prid,
 	uint			flags,
 	struct xfs_dquot	**O_udqpp,
@@ -1697,7 +1697,7 @@ xfs_qm_vop_dqalloc(
 			 * holding ilock.
 			 */
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+			if ((error = xfs_qm_dqget(mp, NULL, uid,
 						 XFS_DQ_USER,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
@@ -1723,7 +1723,7 @@ xfs_qm_vop_dqalloc(
 	if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
 		if (ip->i_d.di_gid != gid) {
 			xfs_iunlock(ip, lockflags);
-			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+			if ((error = xfs_qm_dqget(mp, NULL, gid,
 						 XFS_DQ_GROUP,
 						 XFS_QMOPT_DQALLOC |
 						 XFS_QMOPT_DOWARN,
@@ -1842,7 +1842,7 @@ xfs_qm_vop_chown_reserve(
 			XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
 
 	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-	    ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+	    ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
 		delblksudq = udqp;
 		/*
 		 * If there are delayed allocation blocks, then we have to
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index c38068f..5f0bfe8 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -320,8 +320,8 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
 		struct xfs_mount *, struct xfs_dquot *,
 		struct xfs_dquot *, long, long, uint);
 
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-		struct xfs_dquot **, struct xfs_dquot **);
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+		prid_t, uint, struct xfs_dquot **, struct xfs_dquot **);
 extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
 		struct xfs_dquot *, struct xfs_dquot *);
 extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -341,8 +341,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
 
 #else
 static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
-		uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+		prid_t prid, uint flags, struct xfs_dquot **udqp,
+		struct xfs_dquot **gdqp)
 {
 	*udqp = NULL;
 	*gdqp = NULL;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 738c04b..75bb3ea 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -384,7 +384,9 @@ xfs_symlink(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp,
+			xfs_kuid_to_uid(current_fsuid()),
+			xfs_kgid_to_gid(current_fsgid()), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		goto std_return;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 42c0ef2..0262e1d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -506,7 +506,9 @@ xfs_create(
 	/*
 	 * Make sure that we have allocated dquot(s) on disk.
 	 */
-	error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
+	error = xfs_qm_vop_dqalloc(dp,
+			xfs_kuid_to_uid(current_fsuid()),
+			xfs_kgid_to_gid(current_fsgid()), prid,
 			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
 	if (error)
 		return error;
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 5/6] create internal eofblocks structure with kuid_t types
  2013-06-26  2:09             ` Dave Chinner
                                 ` (6 preceding siblings ...)
  2013-06-28 15:11               ` [PATCH 4/6] convert kuid_t to/from uid_t for xfs internal structures Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 18:09                 ` Brian Foster
  2013-06-28 15:11               ` [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match Dwight Engen
  8 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Have eofblocks ioctl convert uid_t to kuid_t into internal structure.
Update internal filter matching to compare ids with kuid_t types.

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_fs.h     |  2 +-
 fs/xfs/xfs_icache.c |  6 +++---
 fs/xfs/xfs_ioctl.c  | 34 ++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_linux.h  |  8 ++++++++
 4 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d046955..7eb4a5e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -344,7 +344,7 @@ typedef struct xfs_error_injection {
  * Speculative preallocation trimming.
  */
 #define XFS_EOFBLOCKS_VERSION		1
-struct xfs_eofblocks {
+struct xfs_fs_eofblocks {
 	__u32		eof_version;
 	__u32		eof_flags;
 	uid_t		eof_uid;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 96e344e..2c35b13 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -617,7 +617,7 @@ restart:
 
 /*
  * Background scanning to trim post-EOF preallocated space. This is queued
- * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
  */
 STATIC void
 xfs_queue_eofblocks(
@@ -1202,11 +1202,11 @@ xfs_inode_match_id(
 	struct xfs_eofblocks	*eofb)
 {
 	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
-	    ip->i_d.di_uid != eofb->eof_uid)
+	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
 		return 0;
 
 	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
-	    ip->i_d.di_gid != eofb->eof_gid)
+	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
 		return 0;
 
 	if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bedf510..487dca5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1328,6 +1328,31 @@ xfs_ioc_getbmapx(
 	return 0;
 }
 
+STATIC int
+xfs_fs_eofblocks_to_internal(
+	struct xfs_fs_eofblocks		*src,
+	struct xfs_eofblocks		*dst)
+{
+	dst->eof_flags = src->eof_flags;
+	dst->eof_prid = src->eof_prid;
+	dst->eof_min_file_size = src->eof_min_file_size;
+
+	if (src->eof_flags & XFS_EOF_FLAGS_UID) {
+		dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
+		if (!uid_valid(dst->eof_uid))
+			return XFS_ERROR(EINVAL);
+	}
+
+	if (src->eof_flags & XFS_EOF_FLAGS_GID) {
+		dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
+		if (!gid_valid(dst->eof_gid))
+			return XFS_ERROR(EINVAL);
+	}
+
+	return 0;
+}
+
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -1610,7 +1635,8 @@ xfs_file_ioctl(
 		return -error;
 
 	case XFS_IOC_FREE_EOFBLOCKS: {
-		struct xfs_eofblocks eofb;
+		struct xfs_fs_eofblocks eofb;
+		struct xfs_eofblocks keofb;
 
 		if (copy_from_user(&eofb, arg, sizeof(eofb)))
 			return -XFS_ERROR(EFAULT);
@@ -1625,7 +1651,11 @@ xfs_file_ioctl(
 		    memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
 			return -XFS_ERROR(EINVAL);
 
-		error = xfs_icache_free_eofblocks(mp, &eofb);
+		error = xfs_fs_eofblocks_to_internal(&eofb, &keofb);
+		if (error)
+			return -XFS_ERROR(error);
+
+		error = xfs_icache_free_eofblocks(mp, &keofb);
 		return -error;
 	}
 
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 761e4c0..3c2f403 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -185,6 +185,14 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
 	return make_kgid(&init_user_ns, gid);
 }
 
+struct xfs_eofblocks {
+	__u32		eof_flags;
+	kuid_t		eof_uid;
+	kgid_t		eof_gid;
+	prid_t		eof_prid;
+	__u64		eof_min_file_size;
+};
+
 /*
  * Various platform dependent calls that don't fit anywhere else
  */
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match
  2013-06-26  2:09             ` Dave Chinner
                                 ` (7 preceding siblings ...)
  2013-06-28 15:11               ` [PATCH 5/6] create internal eofblocks structure with kuid_t types Dwight Engen
@ 2013-06-28 15:11               ` Dwight Engen
  2013-06-28 18:50                 ` Brian Foster
  8 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 15:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Brian Foster, Serge Hallyn, Eric W. Biederman, xfs

Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
---
 fs/xfs/xfs_ioctl.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 487dca5..123314e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1655,6 +1655,23 @@ xfs_file_ioctl(
 		if (error)
 			return -XFS_ERROR(error);
 
+		/* non-privileged users should not be able to trim blocks on
+		 * objects they cannot write to, so require them to specify
+		 * either their own uid, or a group they are a member of
+		 */
+		if (!capable(CAP_SYS_ADMIN)) {
+			if (!(eofb.eof_flags & (XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_GID)))
+				return -XFS_ERROR(EPERM);
+
+			if ((eofb.eof_flags & XFS_EOF_FLAGS_UID) &&
+			    !uid_eq(current_fsuid(), keofb.eof_uid))
+				return -XFS_ERROR(EPERM);
+
+			if ((eofb.eof_flags & XFS_EOF_FLAGS_GID) &&
+			    !in_group_p(keofb.eof_gid))
+				return -XFS_ERROR(EPERM);
+		}
+
 		error = xfs_icache_free_eofblocks(mp, &keofb);
 		return -error;
 	}
-- 
1.8.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-28  1:46                     ` Dave Chinner
@ 2013-06-28 15:15                       ` Serge Hallyn
  0 siblings, 0 replies; 46+ messages in thread
From: Serge Hallyn @ 2013-06-28 15:15 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Eric W. Biederman, Ben Myers, Dwight Engen, Brian Foster, xfs

Quoting Dave Chinner (david@fromorbit.com):
> On Thu, Jun 27, 2013 at 03:57:58PM -0500, Ben Myers wrote:
> > Hey,
> > 
> > On Thu, Jun 27, 2013 at 08:44:10AM +1000, Dave Chinner wrote:
> > > On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > > > On Wed, 26 Jun 2013 12:09:24 +1000
> > > > Dave Chinner <david@fromorbit.com> wrote:
> > > > > On Mon, Jun 24, 2013 at 09:10:35AM -0400, Dwight Engen wrote:
> > > > > > Should we just require that callers of bulkstat
> > > > > > be in init_user_ns? Thoughts?
> > > > > 
> > > > > This is one of the reasons why I want Eric to give us some idea of
> > > > > how this is supposed to work - exactly how is backup and restore
> > > > > supposed to be managed on a shared filesystem that is segmented up
> > > > > into multiple namespace containers? We can talk about the
> > > > > implementation all we like, but none of us have a clue to the policy
> > > > > decisions that users will make that we need to support. Until we
> > > > > have a clear idea on what policies we are supposed to be supporting,
> > > > > the implementation will be ambiguous and compromised.
> > > > > 
> > > > > e.g. If users are responsible for it, then bulkstat needs to filter
> > > > > based on the current namespace. If management is responsible (i.e.
> > > > > init_user_ns does backup/restore of ns-specific subtrees), then
> > > > > bulkstat cannot filter and needs to reject calls from outside the
> > > > > init_user_ns().
> > > > 
> > > > Maybe we can have bulkstat always filter based on if the caller
> > > > kuid_has_mapping(current_user_ns(), inode->i_uid)? That way a caller
> > > > from init_user_ns can see them all, but callers from inside a userns
> > > > will get a subset of inodes returned?
> > > 
> > > We could do that, though it means bulkstat is going to be a *lot
> > > slower* when called from within a user namespace environment. A
> > > namespace might only have a few thousand files for backup, yet the
> > > underlying filesystem might have tens of millions of inodes in it.
> > > The bulkstat call now has to walk all of the inodes just to find the
> > > few thousand that match the filter. And multiply that by the number
> > > of namespaces all doing backups at 3am in the morning and you start
> > > to get an idea of the scope of the problem....
> > 
> > Ugh.  That really doesn't map well onto bulkstat.  If we wanted bulkstat to
> > work well with namespaces, we might have to teach the filesystem a bit more
> > about them in order to create the required indices per namespace.  While a
> > filter might get the job done in a pinch, wouldn't you really rather have an
> > inobt?  ;)

Eric's intent was to map uids at the kernel-user boundary.  At the
syscall.  Just as if you send credentials over a unix socket which
crosses pid and user ns, the ucred.pid, ucred.uid and ucred.gid get
converted to valid values for the recipient.

The filesystem is kernel code.  It should only ever deal with the
kuids.  If a process stat()s a file, it should see uids mapped into
its own userns, or overflowuid if not mapped.

I'm not saying I know what to do with bulkstat, but I'm pretty sure
that teaching the filesystem about namespaces would be wrong.

It sounds to me like only allowing bulkstat from the init_user_ns
makes the most sense.  Short term, and quite likely long term.

> Absolutely not. :/
> 
> Filesystems can be bind mounted into multiple namespaces, you can
> hard link across namespace boundaries, you can do all sorts of
> things that result in inodes being shared between namespaces. You
> can't have a per-namespace inobt when you can do this sort of thing
> that the underlying filesystem may not even be aware of. Hell, you
> can have the init namespace manipulate files for the user namespace,
> and those manipulations aren't even aware they are happening inside
> a namespace.

They're not happening in a namespace.  All the uids in the namespace
are mapped to uids in the init ns.  So if you're doing something in
the init_user_ns, whatever changes you make should simply be properly
reflected in the container.  If you end up chowning a file which
was previously owned by a userid mapped into the container, to a uid
which is not mapped into the container, then inside the container you'll
see the overflowuid after the chown.  Just as you would if you looked
at a host-owned file to begin with.

-serge

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-28  1:54                     ` Dave Chinner
@ 2013-06-28 15:25                       ` Serge Hallyn
  2013-06-28 16:16                         ` Dwight Engen
  0 siblings, 1 reply; 46+ messages in thread
From: Serge Hallyn @ 2013-06-28 15:25 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Eric W. Biederman, Brian Foster, Dwight Engen,
	Michael Kerrisk (man-pages),
	xfs

Quoting Dave Chinner (david@fromorbit.com):
> On Thu, Jun 27, 2013 at 08:02:05AM -0500, Serge Hallyn wrote:
> > Quoting Dave Chinner (david@fromorbit.com):
> > > On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > > > On Wed, 26 Jun 2013 12:09:24 +1000
> > > > Dave Chinner <david@fromorbit.com> wrote:
> > > > > > We do need to decide on the di_uid that comes back from bulkstat.
> > > > > > Right now it is returning on disk (== init_user_ns) uids. It looks
> > > > > > to me like xfsrestore is using the normal vfs routines (chown,
> > 
> > I might not be helpful here, (as despite having used xfs for years
> > I've not used these features) but feel like I should try based on
> > what I see in the manpages.  Here is my understanding:
> > 
> > Assume you're a task in a child userns, where you have host uids
> > 100000-110000 mapped to container uids 0-10000,
> > 
> > 1. bulkstat is an xfs_ioctl command, right?  It should return the mapped
> > uids (0-10000).
> > 
> > 2. xfsdump should store the uids as seen in the caller's namespace.  If
> > xfsdump is done from the container, the dump should show uids 0-10000.
> 
> So when run from within a namespace, it should filter and return
> only inodes that match the uids/gids mapped into the namespace?

I would think they should all be returned, with uid/gid being -1.

> That can be done, it's just a rather inefficient use of bulkstat
> (which is primarily there for efficiency reasons).
> 
> Here's a corner case. Say I download a tarball from somewhere that
> has uids/gids inside it, and when I untar it it creates uids/gids
> outside the namespaces mapped range of [0-10000]. What happens then?

The chown will fail, so they should belong to the fsuid/fsgid of the
calling task.

> What uids do we end up on disk, and how do we ensure that the
> bulkstat filter still returns those inodes?
> 
> > 3. xfsrestore should be run from the desired namespace.  If you did
> > xfsdump from the host ns, you should then xfsrestore from the host ns.
> > Then inside the container those uids (100000-110000) will be mapped
> > to your uids (0-10000).
> > 
> > 4. If you xfsdump in this container, then xfsrestore in another
> > container where you have 200000-210000 mapped to 0-10000, the dump
> > image will have uids 0-10000.  The restored image will have container
> > uids 0-10000, while on the underlying host media it will be uids
> > 200000-210000.
> > 
> > 5. If you xfsdump in this container then xfsrestore on the host, then
> > the host uids 0-10000 will be used on the underlying media.  The
> > container would be unable to read these files as the uids do not map
> > into the container.
> 
> Yes, that follows from 1+2. We'll need some documentation in
> the dump/restore man pages for this, and I'd suggest that the
> namespace documentation/man pages get this sort of treatment, too.

There is a user_namespaces(7) man page which Michael Kerrisk had been
working on with Eric back in March.  I don't see it at
http://man7.org/linux/man-pages/dir_section_7.html
though, so it may still be in development or in a staging tree.

-serge

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-28 15:25                       ` Serge Hallyn
@ 2013-06-28 16:16                         ` Dwight Engen
  0 siblings, 0 replies; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 16:16 UTC (permalink / raw)
  To: Serge Hallyn
  Cc: Eric W. Biederman, Brian Foster, Michael Kerrisk (man-pages), xfs

On Fri, 28 Jun 2013 10:25:52 -0500
Serge Hallyn <serge.hallyn@ubuntu.com> wrote:

> Quoting Dave Chinner (david@fromorbit.com):
> > On Thu, Jun 27, 2013 at 08:02:05AM -0500, Serge Hallyn wrote:
> > > Quoting Dave Chinner (david@fromorbit.com):
> > > > On Wed, Jun 26, 2013 at 05:30:17PM -0400, Dwight Engen wrote:
> > > > > On Wed, 26 Jun 2013 12:09:24 +1000
> > > > > Dave Chinner <david@fromorbit.com> wrote:
> > > > > > > We do need to decide on the di_uid that comes back from
> > > > > > > bulkstat. Right now it is returning on disk (==
> > > > > > > init_user_ns) uids. It looks to me like xfsrestore is
> > > > > > > using the normal vfs routines (chown,
> > > 
> > > I might not be helpful here, (as despite having used xfs for years
> > > I've not used these features) but feel like I should try based on
> > > what I see in the manpages.  Here is my understanding:
> > > 
> > > Assume you're a task in a child userns, where you have host uids
> > > 100000-110000 mapped to container uids 0-10000,
> > > 
> > > 1. bulkstat is an xfs_ioctl command, right?  It should return the
> > > mapped uids (0-10000).
> > > 
> > > 2. xfsdump should store the uids as seen in the caller's
> > > namespace.  If xfsdump is done from the container, the dump
> > > should show uids 0-10000.
> > 
> > So when run from within a namespace, it should filter and return
> > only inodes that match the uids/gids mapped into the namespace?
> 
> I would think they should all be returned, with uid/gid being -1.

I agree, so I think bulkstat should return the uids with
from_kuid_munged(current_user_ns(), VFS_I(ip)->i_uid), so it returns the
same values that stat(2) would. This would mean callers in
init_user_ns see the same values they do today. Callers inside
a userns will see mapped values, but note that they have to be
CAP_SYS_ADMIN in init_user_ns, which I wouldn't expect to normally be
the case.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 5/6] create internal eofblocks structure with kuid_t types
  2013-06-28 15:11               ` [PATCH 5/6] create internal eofblocks structure with kuid_t types Dwight Engen
@ 2013-06-28 18:09                 ` Brian Foster
  0 siblings, 0 replies; 46+ messages in thread
From: Brian Foster @ 2013-06-28 18:09 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On 06/28/2013 11:11 AM, Dwight Engen wrote:
> Have eofblocks ioctl convert uid_t to kuid_t into internal structure.
> Update internal filter matching to compare ids with kuid_t types.
> 
> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> ---
>  fs/xfs/xfs_fs.h     |  2 +-
>  fs/xfs/xfs_icache.c |  6 +++---
>  fs/xfs/xfs_ioctl.c  | 34 ++++++++++++++++++++++++++++++++--
>  fs/xfs/xfs_linux.h  |  8 ++++++++
>  4 files changed, 44 insertions(+), 6 deletions(-)
> 
...
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index bedf510..487dca5 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1328,6 +1328,31 @@ xfs_ioc_getbmapx(
>  	return 0;
>  }
>  
> +STATIC int
> +xfs_fs_eofblocks_to_internal(
> +	struct xfs_fs_eofblocks		*src,
> +	struct xfs_eofblocks		*dst)
> +{
> +	dst->eof_flags = src->eof_flags;
> +	dst->eof_prid = src->eof_prid;
> +	dst->eof_min_file_size = src->eof_min_file_size;
> +
> +	if (src->eof_flags & XFS_EOF_FLAGS_UID) {
> +		dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
> +		if (!uid_valid(dst->eof_uid))
> +			return XFS_ERROR(EINVAL);
> +	}
> +
> +	if (src->eof_flags & XFS_EOF_FLAGS_GID) {
> +		dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
> +		if (!gid_valid(dst->eof_gid))
> +			return XFS_ERROR(EINVAL);
> +	}
> +
> +	return 0;
> +}

Is there any harm in removing the policy from this function, storing
potentially invalid kuids in the xfs_eofblocks and letting the caller
determine whether an error should be returned? IOW, this function becomes:

inline void
xfs_fs_eofblocks_to_internal(
	struct xfs_fs_eofblocks		*src,
	struct xfs_eofblocks		*dst)
{
	dst->eof_flags = src->eof_flags;
	dst->eof_prid = src->eof_prid;
	dst->eof_min_file_size = src->eof_min_file_size;
	dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
	dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
}

... and xfs_file_ioctl() can check the XFS_EOF_FLAGS_UID/GID flags and
validity of the value to determine whether an error should be returned.

Also, I suspect xfs_icache.h might be a better home for this function.

> +
> +
>  /*
>   * Note: some of the ioctl's return positive numbers as a
>   * byte count indicating success, such as readlink_by_handle.
> @@ -1610,7 +1635,8 @@ xfs_file_ioctl(
>  		return -error;
>  
>  	case XFS_IOC_FREE_EOFBLOCKS: {
> -		struct xfs_eofblocks eofb;
> +		struct xfs_fs_eofblocks eofb;
> +		struct xfs_eofblocks keofb;
>  
>  		if (copy_from_user(&eofb, arg, sizeof(eofb)))
>  			return -XFS_ERROR(EFAULT);
> @@ -1625,7 +1651,11 @@ xfs_file_ioctl(
>  		    memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
>  			return -XFS_ERROR(EINVAL);
>  
> -		error = xfs_icache_free_eofblocks(mp, &eofb);
> +		error = xfs_fs_eofblocks_to_internal(&eofb, &keofb);
> +		if (error)
> +			return -XFS_ERROR(error);
> +
> +		error = xfs_icache_free_eofblocks(mp, &keofb);
>  		return -error;
>  	}
>  
> diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
> index 761e4c0..3c2f403 100644
> --- a/fs/xfs/xfs_linux.h
> +++ b/fs/xfs/xfs_linux.h
> @@ -185,6 +185,14 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
>  	return make_kgid(&init_user_ns, gid);
>  }
>  
> +struct xfs_eofblocks {
> +	__u32		eof_flags;
> +	kuid_t		eof_uid;
> +	kgid_t		eof_gid;
> +	prid_t		eof_prid;
> +	__u64		eof_min_file_size;
> +};
> +

xfs_icache.h?

Brian

>  /*
>   * Various platform dependent calls that don't fit anywhere else
>   */
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match
  2013-06-28 15:11               ` [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match Dwight Engen
@ 2013-06-28 18:50                 ` Brian Foster
  2013-06-28 20:28                   ` Dwight Engen
  0 siblings, 1 reply; 46+ messages in thread
From: Brian Foster @ 2013-06-28 18:50 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On 06/28/2013 11:11 AM, Dwight Engen wrote:
> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> ---
>  fs/xfs/xfs_ioctl.c | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 487dca5..123314e 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1655,6 +1655,23 @@ xfs_file_ioctl(
>  		if (error)
>  			return -XFS_ERROR(error);
>  
> +		/* non-privileged users should not be able to trim blocks on
> +		 * objects they cannot write to, so require them to specify
> +		 * either their own uid, or a group they are a member of
> +		 */
> +		if (!capable(CAP_SYS_ADMIN)) {
> +			if (!(eofb.eof_flags & (XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_GID)))
> +				return -XFS_ERROR(EPERM);
> +
> +			if ((eofb.eof_flags & XFS_EOF_FLAGS_UID) &&
> +			    !uid_eq(current_fsuid(), keofb.eof_uid))
> +				return -XFS_ERROR(EPERM);
> +
> +			if ((eofb.eof_flags & XFS_EOF_FLAGS_GID) &&
> +			    !in_group_p(keofb.eof_gid))
> +				return -XFS_ERROR(EPERM);
> +		}
> +

This looks reasonable to me.

In thinking more about the other aspect of group management (and I admit
I'm still waffling about this), it seems like we could go in a couple
directions:

- Now that we have a separate internal only eofblocks control, be more
flexible and provide an internal only flag (valid only for UID/GID
scans) to instruct the scan to do specific file permission checking
against the inodes. This would be set by xfs_file_ioctl() and do the
write permission enforcement for userspace originated scans. This would
also allow the future EDQUOT work to leave out said flag and do a group
wide scan regardless of the specific permissions of the calling context
(i.e., when the system decides all inodes under a group quota must be
trimmed).

The downsides here are the behavior might be a bit unclear and we'd have
to fork off the flags bits in a manner that's clear and maintainable. I
suppose one could also argue this is overkill for somewhat of a
secondary operation.

- Go the other direction, be less flexible and simply not allow
!capable(CAP_SYS_ADMIN) group scans just as we started doing for project
quotas. This is obviously very simple, but then we disallow regular
users from trimming groups of inodes they should full well have
permission to trim.

I think I'm leaning towards the former approach if it can be implemented
cleanly. Thoughts or ideas?

Brian

>  		error = xfs_icache_free_eofblocks(mp, &keofb);
>  		return -error;
>  	}
> 

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match
  2013-06-28 18:50                 ` Brian Foster
@ 2013-06-28 20:28                   ` Dwight Engen
  2013-06-28 21:39                     ` Brian Foster
  0 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 20:28 UTC (permalink / raw)
  To: Brian Foster; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On Fri, 28 Jun 2013 14:50:24 -0400
Brian Foster <bfoster@redhat.com> wrote:

> On 06/28/2013 11:11 AM, Dwight Engen wrote:
> > Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> > ---
> >  fs/xfs/xfs_ioctl.c | 17 +++++++++++++++++
> >  1 file changed, 17 insertions(+)
> > 
> > diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> > index 487dca5..123314e 100644
> > --- a/fs/xfs/xfs_ioctl.c
> > +++ b/fs/xfs/xfs_ioctl.c
> > @@ -1655,6 +1655,23 @@ xfs_file_ioctl(
> >  		if (error)
> >  			return -XFS_ERROR(error);
> >  
> > +		/* non-privileged users should not be able to trim
> > blocks on
> > +		 * objects they cannot write to, so require them
> > to specify
> > +		 * either their own uid, or a group they are a
> > member of
> > +		 */
> > +		if (!capable(CAP_SYS_ADMIN)) {
> > +			if (!(eofb.eof_flags & (XFS_EOF_FLAGS_UID
> > | XFS_EOF_FLAGS_GID)))
> > +				return -XFS_ERROR(EPERM);
> > +
> > +			if ((eofb.eof_flags & XFS_EOF_FLAGS_UID) &&
> > +			    !uid_eq(current_fsuid(),
> > keofb.eof_uid))
> > +				return -XFS_ERROR(EPERM);
> > +
> > +			if ((eofb.eof_flags & XFS_EOF_FLAGS_GID) &&
> > +			    !in_group_p(keofb.eof_gid))
> > +				return -XFS_ERROR(EPERM);
> > +		}
> > +
> 
> This looks reasonable to me.
> 
> In thinking more about the other aspect of group management (and I
> admit I'm still waffling about this), it seems like we could go in a
> couple directions:
> 
> - Now that we have a separate internal only eofblocks control, be more
> flexible and provide an internal only flag (valid only for UID/GID
> scans) to instruct the scan to do specific file permission checking
> against the inodes. This would be set by xfs_file_ioctl() and do the
> write permission enforcement for userspace originated scans. This
> would also allow the future EDQUOT work to leave out said flag and do
> a group wide scan regardless of the specific permissions of the
> calling context (i.e., when the system decides all inodes under a
> group quota must be trimmed).

I haven't seen your EDQUOT change, but your description made me wonder:
Are you going to kick off a scan for the type of quota exhausted?
Otherwise could a user abuse this by overrunning his user quota in
order to cause group inodes (that he may not have write access to) to be
reclaimed?

At any rate, yeah the only way I see to get the permissions checks
right is to set a flag up in ioctl because the checks need to be per
inode. I think you would like to avoid having the checks that low in
xfs_icache, but I don't see a way around that.

> The downsides here are the behavior might be a bit unclear and we'd
> have to fork off the flags bits in a manner that's clear and
> maintainable. I suppose one could also argue this is overkill for
> somewhat of a secondary operation.
>
> - Go the other direction, be less flexible and simply not allow
> !capable(CAP_SYS_ADMIN) group scans just as we started doing for
> project quotas. This is obviously very simple, but then we disallow
> regular users from trimming groups of inodes they should full well
> have permission to trim.

What about uid == self scans? Would we not allow those as well? The
check may be simpler than a group check but still would need to be per
inode, and thus still need the flags of option 1.

> I think I'm leaning towards the former approach if it can be
> implemented cleanly. Thoughts or ideas?

Well the former is certainly more functional and allows the trimming to
be under the users control. How useful is that though once it happens
automatically with your EDQUOT changes? If we only allowed the ioctl to
trigger global scans, we wouldn't need a conversion function nor
separate structure, but I don't know the use cases for uid/gid specific
scans from userspace so it's hard for me to judge the tradeoffs.

Maybe this permission stuff should be a separate change since it isn't
really related to user namespace stuff? I just happened to be in the
vicinity and am happy to help :)

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match
  2013-06-28 20:28                   ` Dwight Engen
@ 2013-06-28 21:39                     ` Brian Foster
  2013-06-28 23:22                       ` Dwight Engen
  0 siblings, 1 reply; 46+ messages in thread
From: Brian Foster @ 2013-06-28 21:39 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On 06/28/2013 04:28 PM, Dwight Engen wrote:
> On Fri, 28 Jun 2013 14:50:24 -0400
> Brian Foster <bfoster@redhat.com> wrote:
> 
>> On 06/28/2013 11:11 AM, Dwight Engen wrote:
>>> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
>>> ---
...
>>
>> In thinking more about the other aspect of group management (and I
>> admit I'm still waffling about this), it seems like we could go in a
>> couple directions:
>>
>> - Now that we have a separate internal only eofblocks control, be more
>> flexible and provide an internal only flag (valid only for UID/GID
>> scans) to instruct the scan to do specific file permission checking
>> against the inodes. This would be set by xfs_file_ioctl() and do the
>> write permission enforcement for userspace originated scans. This
>> would also allow the future EDQUOT work to leave out said flag and do
>> a group wide scan regardless of the specific permissions of the
>> calling context (i.e., when the system decides all inodes under a
>> group quota must be trimmed).
> 
> I haven't seen your EDQUOT change, but your description made me wonder:
> Are you going to kick off a scan for the type of quota exhausted?
> Otherwise could a user abuse this by overrunning his user quota in
> order to cause group inodes (that he may not have write access to) to be
> reclaimed?
> 

Yes, you could describe it that way. The current behavior is a global
inode flush followed by an ENOSPC/EDQUOT error, so I'm not sure we're
exposing much by attempting an eofblocks scan before running out of
space. It's a fairly minor optimization in the failure path.

Another way to look at it might be that the users/inodes have never
really reserved the extra space that's being trimmed here, so the
filesystem has every right to take it away if it deems it better to put
to use elsewhere (i.e., to put off an ENOSPC related failure).

> At any rate, yeah the only way I see to get the permissions checks
> right is to set a flag up in ioctl because the checks need to be per
> inode. I think you would like to avoid having the checks that low in
> xfs_icache, but I don't see a way around that.
> 

That's the reasoning behind the extra internal only flag. It's not
having the check that I'm against so much as the idea that this layer of
code should be unconditionally bound by the current userspace context.
But it might be reasonable to add control flags to effectively make that
so (i.e., XFS_EOF_FLAGS_ENFORCE_PERMS).

>> The downsides here are the behavior might be a bit unclear and we'd
>> have to fork off the flags bits in a manner that's clear and
>> maintainable. I suppose one could also argue this is overkill for
>> somewhat of a secondary operation.
>>
>> - Go the other direction, be less flexible and simply not allow
>> !capable(CAP_SYS_ADMIN) group scans just as we started doing for
>> project quotas. This is obviously very simple, but then we disallow
>> regular users from trimming groups of inodes they should full well
>> have permission to trim.
> 
> What about uid == self scans? Would we not allow those as well? The
> check may be simpler than a group check but still would need to be per
> inode, and thus still need the flags of option 1.
> 

I'm not following. Aren't we already enforcing this appropriately with
your current patch? My comments were intended to be taken as in addition
to this patch. i.e., with regard to your comments in the commit log here:

http://oss.sgi.com/archives/xfs/2013-06/msg00785.html

>> I think I'm leaning towards the former approach if it can be
>> implemented cleanly. Thoughts or ideas?
> 
> Well the former is certainly more functional and allows the trimming to
> be under the users control. How useful is that though once it happens
> automatically with your EDQUOT changes? If we only allowed the ioctl to
> trigger global scans, we wouldn't need a conversion function nor
> separate structure, but I don't know the use cases for uid/gid specific
> scans from userspace so its hard for me to judge the tradeoffs.
> 

To be honest, there aren't any real users of the eofblocks command from
userspace that I'm aware of at the moment. I added it originally for a
poc quota implementation I was working on for gluster, but the primary
use case for the scanning mechanism is to allow background clean up of
files such that post-eof speculative preallocation doesn't hang around
for too long.

> Maybe this permission stuff should be a separate change since it isn't
> really related to user namespace stuff? I just happened to be in the
> vicinity and am happy to help :)
> 

Sounds reasonable to me. :)

Brian

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match
  2013-06-28 21:39                     ` Brian Foster
@ 2013-06-28 23:22                       ` Dwight Engen
  2013-07-01 12:21                         ` Brian Foster
  0 siblings, 1 reply; 46+ messages in thread
From: Dwight Engen @ 2013-06-28 23:22 UTC (permalink / raw)
  To: Brian Foster; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On Fri, 28 Jun 2013 17:39:13 -0400
Brian Foster <bfoster@redhat.com> wrote:

> On 06/28/2013 04:28 PM, Dwight Engen wrote:
> > On Fri, 28 Jun 2013 14:50:24 -0400
> > Brian Foster <bfoster@redhat.com> wrote:
> > 
> >> On 06/28/2013 11:11 AM, Dwight Engen wrote:
> >>> Signed-off-by: Dwight Engen <dwight.engen@oracle.com>
> >>> ---
> ...
> >>
> >> In thinking more about the other aspect of group management (and I
> >> admit I'm still waffling about this), it seems like we could go in
> >> a couple directions:
> >>
> >> - Now that we have a separate internal only eofblocks control, be
> >> more flexible and provide an internal only flag (valid only for
> >> UID/GID scans) to instruct the scan to do specific file permission
> >> checking against the inodes. This would be set by xfs_file_ioctl()
> >> and do the write permission enforcement for userspace originated
> >> scans. This would also allow the future EDQUOT work to leave out
> >> said flag and do a group wide scan regardless of the specific
> >> permissions of the calling context (i.e., when the system decides
> >> all inodes under a group quota must be trimmed).
> > 
> > I haven't seen your EDQUOT change, but your description made me
> > wonder: Are you going to kick off a scan for the type of quota
> > exhausted? Otherwise could a user abuse this by overrunning his
> > user quota in order to cause group inodes (that he may not have
> > write to) to be reclaimed?
> > 
> 
> Yes, you could describe it that way. The current behavior is a global
> inode flush followed by an ENOSPC/EDQUOT error, so I'm not sure we're
> exposing much by attempting an eofblocks scan before running out of
> space. It's a fairly minor optimization in the failure path.
> 
> Another way to look at it might be that the users/inodes have never
> really reserved the extra space that's being trimmed here, so the
> filesystem has every right to take it away if it deems it better to
> put to use elsewhere (i.e., to put off an ENOSPC related failure).
> 
> > At any rate, yeah the only way I see to get the permissions checks
> > right is to set a flag up in ioctl because the checks need to be per
> > inode. I think you would like to avoid having the checks that low in
> > xfs_icache, but I don't see a way around that.
> > 
> 
> That's the reasoning behind the extra internal only flag. It's not
> having the check that I'm against so much as the idea that this layer
> of code should be unconditionally bound by the current userspace
> context. But it might be reasonable to add control flags to
> effectively make that so (i.e., XFS_EOF_FLAGS_ENFORCE_PERMS).

Right, clearly you've got a use case where it shouldn't be limited
while the ioctl caller could ensure the flag is on.

> >> The downsides here are the behavior might be a bit unclear and we'd
> >> have to fork off the flags bits in a manner that's clear and
> >> maintainable. I suppose one could also argue this is overkill for
> >> somewhat of a secondary operation.
> >>
> >> - Go the other direction, be less flexible and simply not allow
> >> !capable(CAP_SYS_ADMIN) group scans just as we started doing for
> >> project quotas. This is obviously very simple, but then we disallow
> >> regular users from trimming groups of inodes they should full well
> >> have permission to trim.
> > 
> > What about uid == self scans? Would we not allow those as well? The
> > check may be simpler than a group check but still would need to be
> > per inode, and thus still need the flags of option 1.
> > 
> 
> I'm not following. Aren't we already enforcing this appropriately with
> your current patch? My comments were intended to be taken as in
> addition to this patch. i.e., with regard to your comments in the
> commit log here:
> 
> http://oss.sgi.com/archives/xfs/2013-06/msg00785.html

Ahh yes, the code there should be fine since it requires
XFS_EOF_FLAGS_UID and that uid == self to be given. I misunderstood
your "less flexible" above to be in lieu of the checks in that patch and
it was only mentioning group scans so that is why I was wondering
if you meant to throw out uid scans as well.

> >> I think I'm leaning towards the former approach if it can be
> >> implemented cleanly. Thoughts or ideas?
> > 
> > Well the former is certainly more functional and allows the
> > trimming to be under the users control. How useful is that though
> > once it happens automatically with your EDQUOT changes? If we only
> > allowed the ioctl to trigger global scans, we wouldn't need a
> > conversion function nor separate structure, but I don't know the
> > use cases for uid/gid specific scans from userspace so its hard for
> > me to judge the tradeoffs.
> > 
> 
> To be honest, there aren't any real users of the eofblocks command
> from userspace that I'm aware of at the moment. I added it originally
> for a poc quota implementation I was working on for gluster, but the
> primary use case for the scanning mechanism is to allow background
> clean up of files such that post-eof speculative preallocation
> doesn't hang around for too long.

... and there are likely to be scenarios where waiting for the timer
would be too long?
 
> > Maybe this permission stuff should be a separate change since it
> > isn't really related to user namespace stuff? I just happened to be
> > in the vicinity and am happy to help :)
> > 
> 
> Sounds reasonable to me. :)

If you want me to code up either option, let me know.
 
> Brian

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match
  2013-06-28 23:22                       ` Dwight Engen
@ 2013-07-01 12:21                         ` Brian Foster
  0 siblings, 0 replies; 46+ messages in thread
From: Brian Foster @ 2013-07-01 12:21 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Serge Hallyn, xfs

On 06/28/2013 07:22 PM, Dwight Engen wrote:
> On Fri, 28 Jun 2013 17:39:13 -0400
> Brian Foster <bfoster@redhat.com> wrote:
> 
>> On 06/28/2013 04:28 PM, Dwight Engen wrote:
>>> On Fri, 28 Jun 2013 14:50:24 -0400
>>> Brian Foster <bfoster@redhat.com> wrote:
>>>
>>>> On 06/28/2013 11:11 AM, Dwight Engen wrote:
...
>>
>> To be honest, there aren't any real users of the eofblocks command
>> from userspace that I'm aware of at the moment. I added it originally
>> for a poc quota implementation I was working on for gluster, but the
>> primary use case for the scanning mechanism is to allow background
>> clean up of files such that post-eof speculative preallocation
>> doesn't hang around for too long.
> 
> ... and there are likely to be scenarios where waiting for the timer
> would be too long?
>  

Well one can adjust the timer via the /proc file if necessary. Our use
case was along the lines of ensuring prealloc was cleared up at certain
points where we wanted to track space usage (using a cluster of XFS
project quotas to represent a higher level quota), iirc. It's been a
while since I've looked at that code... ;)

>>> Maybe this permission stuff should be a separate change since it
>>> isn't really related to user namespace stuff? I just happened to be
>>> in the vicinity and am happy to help :)
>>>
>>
>> Sounds reasonable to me. :)
> 
> If you want me to code up either option, let me know.
>  

Feel free to. :) I agree that it's separate from the userns work.
I'll put it on my todo list if you don't get around to it.

Brian

>> Brian

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

* [PATCH 1/1] export inode_capable
  2013-06-24 13:10           ` [PATCH v2 RFC] " Dwight Engen
  2013-06-25 16:46             ` Brian Foster
  2013-06-26  2:09             ` Dave Chinner
@ 2013-07-06  4:44             ` Serge Hallyn
  2013-07-08 13:09             ` [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate Serge Hallyn
  3 siblings, 0 replies; 46+ messages in thread
From: Serge Hallyn @ 2013-07-06  4:44 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Brian Foster, xfs

Hi Dwight,

As I mentioned earlier, I did need this patch to build with xfs=m.  But
other than that, I've got a kernel built using your patchset (full src at
http://kernel.ubuntu.com/git?p=serge/ubuntu-saucy.git;a=summary) and
was able to start a user ns container with that kernel.  I'll use that
kernel for some more regular lxc testing next week.

Thanks!

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
---
 kernel/capability.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5..a04f86f 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -464,3 +464,5 @@ bool inode_capable(const struct inode *inode, int cap)
 
 	return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
 }
+
+EXPORT_SYMBOL_GPL(inode_capable);
-- 
1.7.9.5

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate
  2013-06-24 13:10           ` [PATCH v2 RFC] " Dwight Engen
                               ` (2 preceding siblings ...)
  2013-07-06  4:44             ` [PATCH 1/1] export inode_capable Serge Hallyn
@ 2013-07-08 13:09             ` Serge Hallyn
  3 siblings, 0 replies; 46+ messages in thread
From: Serge Hallyn @ 2013-07-08 13:09 UTC (permalink / raw)
  To: Dwight Engen; +Cc: Eric W. Biederman, Brian Foster, xfs

Created some userns containers on xfs and did some package building
and installation there.  All seemed fine, thanks!

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2013-07-08 13:06 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-19 15:09 [PATCH] userns: Convert xfs to use kuid/kgid where appropriate Dwight Engen
2013-06-19 20:35 ` Eric W. Biederman
2013-06-20  1:41   ` Dave Chinner
2013-06-20 13:54     ` Dwight Engen
2013-06-20 21:10       ` Dave Chinner
2013-06-20  0:13 ` Dave Chinner
2013-06-20 13:54   ` Dwight Engen
2013-06-20 15:27     ` Brian Foster
2013-06-20 17:39       ` Dwight Engen
2013-06-20 19:12         ` Brian Foster
2013-06-20 22:12           ` Dave Chinner
2013-06-20 22:45           ` Eric W. Biederman
2013-06-20 23:35             ` Dave Chinner
2013-06-20 22:03     ` Dave Chinner
2013-06-21 15:14       ` Dwight Engen
2013-06-24  0:33         ` Dave Chinner
2013-06-24 13:10           ` [PATCH v2 RFC] " Dwight Engen
2013-06-25 16:46             ` Brian Foster
2013-06-25 20:08               ` Dwight Engen
2013-06-25 21:04                 ` Brian Foster
2013-06-26  2:09             ` Dave Chinner
2013-06-26 21:30               ` Dwight Engen
2013-06-26 22:44                 ` Dave Chinner
2013-06-27 13:02                   ` Serge Hallyn
2013-06-28  1:54                     ` Dave Chinner
2013-06-28 15:25                       ` Serge Hallyn
2013-06-28 16:16                         ` Dwight Engen
2013-06-27 20:57                   ` Ben Myers
2013-06-28  1:46                     ` Dave Chinner
2013-06-28 15:15                       ` Serge Hallyn
2013-06-28 14:23               ` Dwight Engen
2013-06-28 15:11               ` [PATCH v3 0/6] " Dwight Engen
2013-06-28 15:11               ` [PATCH 1/6] create wrappers for converting kuid_t to/from uid_t Dwight Engen
2013-06-28 15:11               ` [PATCH 2/6] convert kuid_t to/from uid_t in ACLs Dwight Engen
2013-06-28 15:11               ` [PATCH 3/6] ioctl: check for capabilities in the current user namespace Dwight Engen
2013-06-28 15:11               ` [PATCH 4/6] convert kuid_t to/from uid_t for xfs internal structures Dwight Engen
2013-06-28 15:11               ` [PATCH 5/6] create internal eofblocks structure with kuid_t types Dwight Engen
2013-06-28 18:09                 ` Brian Foster
2013-06-28 15:11               ` [PATCH 6/6] ioctl eofblocks: require non-privileged users to specify uid/gid match Dwight Engen
2013-06-28 18:50                 ` Brian Foster
2013-06-28 20:28                   ` Dwight Engen
2013-06-28 21:39                     ` Brian Foster
2013-06-28 23:22                       ` Dwight Engen
2013-07-01 12:21                         ` Brian Foster
2013-07-06  4:44             ` [PATCH 1/1] export inode_capable Serge Hallyn
2013-07-08 13:09             ` [PATCH v2 RFC] userns: Convert xfs to use kuid/kgid where appropriate Serge Hallyn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.