All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
@ 2016-08-11 17:10 Christoph Hellwig
  2016-08-11 21:54 ` Peter Zijlstra
  2016-08-11 23:43 ` Dave Chinner
  0 siblings, 2 replies; 25+ messages in thread
From: Christoph Hellwig @ 2016-08-11 17:10 UTC (permalink / raw)
  To: xfs, peterz

This patch drops the XFS-own i_iolock and uses the VFS i_rwsem which
recently replaced i_mutex instead.  This means we only have to take
one lock instead of two in many fast path operations, and we can
also shrink the xfs_inode structure.  Thanks to the xfs_ilock family
there is very little churn as well.

There is one major issue with this change though:  lockdep currently
doesn't have a facility to assert a rw_semaphore is held exclusively,
which means we lose the nice ability to assert locking context in
XFS.

Peter, I think you mentioned this would be fairly easy to add to
lockdep and the rw_semaphore code.  Any chance to come up with a proof
of concept?

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_dir2_readdir.c |  2 -
 fs/xfs/xfs_file.c         | 95 ++++++++++++++++-------------------------------
 fs/xfs/xfs_icache.c       |  6 +--
 fs/xfs/xfs_inode.c        | 79 ++++++++++++++++-----------------------
 fs/xfs/xfs_inode.h        |  7 +---
 fs/xfs/xfs_ioctl.c        |  2 +-
 fs/xfs/xfs_iops.c         | 14 +++----
 fs/xfs/xfs_pnfs.c         |  7 +---
 fs/xfs/xfs_pnfs.h         |  4 +-
 fs/xfs/xfs_super.c        |  2 +-
 fs/xfs/xfs_symlink.c      |  7 ++--
 11 files changed, 83 insertions(+), 142 deletions(-)

diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f44f799..7662140 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -676,7 +676,6 @@ xfs_readdir(
 	args.dp = dp;
 	args.geo = dp->i_mount->m_dir_geo;
 
-	xfs_ilock(dp, XFS_IOLOCK_SHARED);
 	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
 		rval = xfs_dir2_sf_getdents(&args, ctx);
 	else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -685,7 +684,6 @@ xfs_readdir(
 		rval = xfs_dir2_block_getdents(&args, ctx);
 	else
 		rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 
 	return rval;
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ed95e5b..b604dc7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -47,40 +47,6 @@
 static const struct vm_operations_struct xfs_file_vm_ops;
 
 /*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	if (type & XFS_IOLOCK_EXCL)
-		inode_lock(VFS_I(ip));
-	xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	xfs_iunlock(ip, type);
-	if (type & XFS_IOLOCK_EXCL)
-		inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
-	struct xfs_inode	*ip,
-	int			type)
-{
-	xfs_ilock_demote(ip, type);
-	if (type & XFS_IOLOCK_EXCL)
-		inode_unlock(VFS_I(ip));
-}
-
-/*
  * Clear the specified ranges to zero through either the pagecache or DAX.
  * Holes and unwritten extents will be left as-is as they already are zeroed.
  */
@@ -279,10 +245,10 @@ xfs_file_dio_aio_read(
 	 * IO case of no page cache pages to proceeed concurrently without
 	 * serialisation.
 	 */
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (mapping->nrpages) {
-		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
 		/*
 		 * The generic dio code only flushes the range of the particular
@@ -298,7 +264,7 @@ xfs_file_dio_aio_read(
 		if (mapping->nrpages) {
 			ret = filemap_write_and_wait(mapping);
 			if (ret) {
-				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
 
@@ -311,7 +277,7 @@ xfs_file_dio_aio_read(
 			WARN_ON_ONCE(ret);
 			ret = 0;
 		}
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
 
 	data = *to;
@@ -321,7 +287,7 @@ xfs_file_dio_aio_read(
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
 	}
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
 	return ret;
@@ -344,13 +310,13 @@ xfs_file_dax_read(
 	if (!count)
 		return 0; /* skip atime */
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
 	if (ret > 0) {
 		iocb->ki_pos += ret;
 		iov_iter_advance(to, ret);
 	}
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	file_accessed(iocb->ki_filp);
 	return ret;
@@ -366,9 +332,9 @@ xfs_file_buffered_aio_read(
 
 	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = generic_file_read_iter(iocb, to);
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	return ret;
 }
@@ -429,9 +395,9 @@ xfs_file_splice_read(
 		goto out;
 	}
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
-	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 out:
 	if (ret > 0)
 		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
@@ -488,15 +454,18 @@ restart:
 	if (error <= 0)
 		return error;
 
-	error = xfs_break_layouts(inode, iolock, true);
+	error = xfs_break_layouts(inode, iolock);
 	if (error)
 		return error;
 
-	/* For changing security info in file_remove_privs() we need i_mutex */
+	/*
+	 * For changing security info in file_remove_privs() we need i_rwsem
+	 * exclusively.
+	 */
 	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
-		xfs_rw_iunlock(ip, *iolock);
+		xfs_iunlock(ip, *iolock);
 		*iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, *iolock);
+		xfs_ilock(ip, *iolock);
 		goto restart;
 	}
 	/*
@@ -521,9 +490,9 @@ restart:
 		spin_unlock(&ip->i_flags_lock);
 		if (!drained_dio) {
 			if (*iolock == XFS_IOLOCK_SHARED) {
-				xfs_rw_iunlock(ip, *iolock);
+				xfs_iunlock(ip, *iolock);
 				*iolock = XFS_IOLOCK_EXCL;
-				xfs_rw_ilock(ip, *iolock);
+				xfs_ilock(ip, *iolock);
 				iov_iter_reexpand(from, count);
 			}
 			/*
@@ -630,7 +599,7 @@ xfs_file_dio_aio_write(
 		iolock = XFS_IOLOCK_EXCL;
 	else
 		iolock = XFS_IOLOCK_SHARED;
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 
 	/*
 	 * Recheck if there are cached pages that need invalidate after we got
@@ -638,9 +607,9 @@ xfs_file_dio_aio_write(
 	 * we were waiting for the iolock.
 	 */
 	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
-		xfs_rw_iunlock(ip, iolock);
+		xfs_iunlock(ip, iolock);
 		iolock = XFS_IOLOCK_EXCL;
-		xfs_rw_ilock(ip, iolock);
+		xfs_ilock(ip, iolock);
 	}
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -673,7 +642,7 @@ xfs_file_dio_aio_write(
 	if (unaligned_io)
 		inode_dio_wait(inode);
 	else if (iolock == XFS_IOLOCK_EXCL) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
@@ -696,7 +665,7 @@ xfs_file_dio_aio_write(
 		iov_iter_advance(from, ret);
 	}
 out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 
 	/*
 	 * No fallback to buffered IO on errors for XFS, direct IO will either
@@ -730,7 +699,7 @@ xfs_file_dax_write(
 	} else {
 		iolock = XFS_IOLOCK_SHARED;
 	}
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
@@ -748,7 +717,7 @@ xfs_file_dax_write(
 	}
 
 	if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
-		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 		iolock = XFS_IOLOCK_SHARED;
 	}
 
@@ -762,7 +731,7 @@ xfs_file_dax_write(
 		iov_iter_advance(from, ret);
 	}
 out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -779,7 +748,7 @@ xfs_file_buffered_aio_write(
 	int			enospc = 0;
 	int			iolock = XFS_IOLOCK_EXCL;
 
-	xfs_rw_ilock(ip, iolock);
+	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
 	if (ret)
@@ -820,7 +789,7 @@ write_retry:
 
 	current->backing_dev_info = NULL;
 out:
-	xfs_rw_iunlock(ip, iolock);
+	xfs_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -886,7 +855,7 @@ xfs_file_fallocate(
 		return -EOPNOTSUPP;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock, false);
+	error = xfs_break_layouts(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index fb39a66..cc5e7b2 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -69,8 +69,6 @@ xfs_inode_alloc(
 	ASSERT(!xfs_isiflocked(ip));
 	ASSERT(ip->i_ino == 0);
 
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
 	ip->i_mount = mp;
@@ -387,8 +385,8 @@ xfs_iget_cache_hit(
 		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
 		inode->i_state = I_NEW;
 
-		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+		init_rwsem(&inode->i_rwsem);
 
 		spin_unlock(&ip->i_flags_lock);
 		spin_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e08eaea..e2b94bf 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -118,31 +118,31 @@ xfs_ilock_attr_map_shared(
 }
 
 /*
- * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
- * the i_lock.  This routine allows various combinations of the locks to be
- * obtained.
+ * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
+ * multi-reader locks: i_mmap_lock and the i_lock.  This routine allows
+ * various combinations of the locks to be obtained.
  *
  * The 3 locks should always be ordered so that the IO lock is obtained first,
  * the mmap lock second and the ilock last in order to prevent deadlock.
  *
  * Basic locking order:
  *
- * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
  *
  * mmap_sem locking order:
  *
- * i_iolock -> page lock -> mmap_sem
+ * i_rwsem -> page lock -> mmap_sem
  * mmap_sem -> i_mmap_lock -> page_lock
  *
  * The difference in mmap_sem locking order mean that we cannot hold the
  * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
  * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
  * in get_user_pages() to map the user pages into the kernel address space for
- * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
  * page faults already hold the mmap_sem.
  *
  * Hence to serialise fully against both syscall and mmap based IO, we need to
- * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
  * taken in places where we need to invalidate the page cache in a race
  * free manner (e.g. truncate, hole punch and other extent manipulation
  * functions).
@@ -167,10 +167,13 @@ xfs_ilock(
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		down_write_nested(&VFS_I(ip)->i_rwsem,
+				  XFS_IOLOCK_DEP(lock_flags));
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		down_read_nested(&VFS_I(ip)->i_rwsem,
+				 XFS_IOLOCK_DEP(lock_flags));
+	}
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
@@ -216,10 +219,10 @@ xfs_ilock_nowait(
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_iolock))
+		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
 			goto out;
 	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_iolock))
+		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
 			goto out;
 	}
 
@@ -247,9 +250,9 @@ out_undo_mmaplock:
 		mrunlock_shared(&ip->i_mmaplock);
 out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
+		up_write(&VFS_I(ip)->i_rwsem);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
+		up_read(&VFS_I(ip)->i_rwsem);
 out:
 	return 0;
 }
@@ -286,9 +289,9 @@ xfs_iunlock(
 	ASSERT(lock_flags != 0);
 
 	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
+		up_write(&VFS_I(ip)->i_rwsem);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
+		up_read(&VFS_I(ip)->i_rwsem);
 
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		mrunlock_excl(&ip->i_mmaplock);
@@ -321,7 +324,7 @@ xfs_ilock_demote(
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		mrdemote(&ip->i_mmaplock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrdemote(&ip->i_iolock);
+		downgrade_write(&VFS_I(ip)->i_rwsem);
 
 	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 }
@@ -345,9 +348,11 @@ xfs_isilocked(
 	}
 
 	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-		if (!(lock_flags & XFS_IOLOCK_SHARED))
-			return !!ip->i_iolock.mr_writer;
-		return rwsem_is_locked(&ip->i_iolock.mr_lock);
+		/*
+		 * XXX: we'd need something like a rwsem_is_write_locked
+		 * helper for the XFS_IOLOCK_EXCL case..
+		 */
+		return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
 	}
 
 	ASSERT(0);
@@ -397,11 +402,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
 
 	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
 		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
-		ASSERT(xfs_lockdep_subclass_ok(subclass +
-						XFS_IOLOCK_PARENT_VAL));
 		class += subclass << XFS_IOLOCK_SHIFT;
-		if (lock_mode & XFS_IOLOCK_PARENT)
-			class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
 	}
 
 	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
@@ -453,8 +454,6 @@ xfs_lock_inodes(
 			    XFS_ILOCK_EXCL));
 	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
 			      XFS_ILOCK_SHARED)));
-	ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
-		inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
 	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
 		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
 	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
@@ -689,7 +688,6 @@ xfs_lookup(
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
 
-	xfs_ilock(dp, XFS_IOLOCK_SHARED);
 	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 	if (error)
 		goto out_unlock;
@@ -698,14 +696,12 @@ xfs_lookup(
 	if (error)
 		goto out_free_name;
 
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 	return 0;
 
 out_free_name:
 	if (ci_name)
 		kmem_free(ci_name->name);
 out_unlock:
-	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
 	*ipp = NULL;
 	return error;
 }
@@ -1179,8 +1175,7 @@ xfs_create(
 	if (error)
 		goto out_release_inode;
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
-		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
 
 	xfs_defer_init(&dfops, &first_block);
@@ -1216,7 +1211,7 @@ xfs_create(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = false;
 
 	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1289,7 +1284,7 @@ xfs_create(
 	xfs_qm_dqrele(pdqp);
 
 	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
 
@@ -1430,11 +1425,10 @@ xfs_link(
 	if (error)
 		goto std_return;
 
-	xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 
 	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
 
 	/*
 	 * If we are using project inheritance, we only allow hard link
@@ -2528,10 +2522,9 @@ xfs_remove(
 		goto std_return;
 	}
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
 
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	/*
@@ -2912,12 +2905,6 @@ xfs_rename(
 	 * whether the target directory is the same as the source
 	 * directory, we can lock from 2 to 4 inodes.
 	 */
-	if (!new_parent)
-		xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-	else
-		xfs_lock_two_inodes(src_dp, target_dp,
-				    XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
 	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 
 	/*
@@ -2925,9 +2912,9 @@ xfs_rename(
 	 * we can rely on either trans_commit or trans_cancel to unlock
 	 * them.
 	 */
-	xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
 	if (new_parent)
-		xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
 	if (target_ip)
 		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e1a411e..dc8806f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -55,7 +55,6 @@ typedef struct xfs_inode {
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
-	mrlock_t		i_iolock;	/* inode IO lock */
 	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
@@ -316,7 +315,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  * IOLOCK values
  *
  * 0-3		subclass value
- * 4-7		PARENT subclass values
+ * 4-7		unused
  *
  * MMAPLOCK values
  *
@@ -331,10 +330,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
  * 
  */
 #define XFS_IOLOCK_SHIFT		16
-#define XFS_IOLOCK_PARENT_VAL		4
-#define XFS_IOLOCK_MAX_SUBCLASS		(XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_MAX_SUBCLASS		3
 #define XFS_IOLOCK_DEP_MASK		0x000f0000
-#define	XFS_IOLOCK_PARENT		(XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
 
 #define XFS_MMAPLOCK_SHIFT		20
 #define XFS_MMAPLOCK_NUMORDER		0
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 96a70fd..29e0471 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -639,7 +639,7 @@ xfs_ioc_space(
 		return error;
 
 	xfs_ilock(ip, iolock);
-	error = xfs_break_layouts(inode, &iolock, false);
+	error = xfs_break_layouts(inode, &iolock);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab820f8..36ac28c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -953,15 +953,13 @@ xfs_vn_setattr(
 	if (iattr->ia_valid & ATTR_SIZE) {
 		uint		iolock = XFS_IOLOCK_EXCL;
 
-		xfs_ilock(ip, iolock);
-		error = xfs_break_layouts(d_inode(dentry), &iolock, true);
-		if (!error) {
-			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-			iolock |= XFS_MMAPLOCK_EXCL;
+		error = xfs_break_layouts(d_inode(dentry), &iolock);
+		if (error)
+			return error;
 
-			error = xfs_setattr_size(ip, iattr);
-		}
-		xfs_iunlock(ip, iolock);
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+		error = xfs_setattr_size(ip, iattr);
+		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
 	}
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0f14b2e..94c8faa 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -32,8 +32,7 @@
 int
 xfs_break_layouts(
 	struct inode		*inode,
-	uint			*iolock,
-	bool			with_imutex)
+	uint			*iolock)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
@@ -42,12 +41,8 @@ xfs_break_layouts(
 
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 		xfs_iunlock(ip, *iolock);
-		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
-			inode_unlock(inode);
 		error = break_layout(inode, true);
 		*iolock = XFS_IOLOCK_EXCL;
-		if (with_imutex)
-			inode_lock(inode);
 		xfs_ilock(ip, *iolock);
 	}
 
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index e8339f7..b587cb9 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
 int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
 		struct iattr *iattr);
 
-int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
+int xfs_break_layouts(struct inode *inode, uint *iolock);
 #else
 static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
+xfs_break_layouts(struct inode *inode, uint *iolock)
 {
 	return 0;
 }
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 24ef83e..f32b95e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -939,7 +939,7 @@ xfs_fs_destroy_inode(
 
 	trace_xfs_destroy_inode(ip);
 
-	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 58142ae..f2cb45e 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -238,8 +238,7 @@ xfs_symlink(
 	if (error)
 		goto out_release_inode;
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
-		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
 
 	/*
@@ -287,7 +286,7 @@ xfs_symlink(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = false;
 
 	/*
@@ -412,7 +411,7 @@ out_release_inode:
 	xfs_qm_dqrele(pdqp);
 
 	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
 
-- 
2.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-11 17:10 [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead Christoph Hellwig
@ 2016-08-11 21:54 ` Peter Zijlstra
  2016-08-18 17:37   ` Christoph Hellwig
  2016-08-11 23:43 ` Dave Chinner
  1 sibling, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-08-11 21:54 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Thu, Aug 11, 2016 at 10:10:23AM -0700, Christoph Hellwig wrote:

> There is one major issue with this change though:  lockdep currently
> doesn't have a facility to assert a rw_semaphore is held exclusively,
> which means we lose the nice ability to assert locking context in
> XFS.
> 
> Peter, I think you mentioned this would be fairly easy to add to
> lockdep and the rw_semaphore code.  Any chance to come up with a proof
> of concept?

Sure, find below. Not been near a compiler.

---
 include/linux/lockdep.h  | 17 +++++++++++++++--
 kernel/locking/lockdep.c | 41 +++++++++++++++++++++++++----------------
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index eabe0138eb06..7f0098d3a7d7 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -338,9 +338,14 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
-#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)
+extern int _lock_is_held(struct lockdep_map *lock, int read);
 
-extern int lock_is_held(struct lockdep_map *lock);
+static inline int lock_is_held(struct lockdep_map *lock)
+{
+	return _lock_is_held(lock, -1);
+}
+
+#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)
 
 extern void lock_set_class(struct lockdep_map *lock, const char *name,
 			   struct lock_class_key *key, unsigned int subclass,
@@ -372,6 +377,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		WARN_ON(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
+#define lockdep_assert_held_exclusive(l)	do {		\
+		WARN_ON(debug_locks && !_lock_is_held(&(l)->dep_map, 0));\
+	} while (0)
+
+#define lockdep_assert_held_read(l)	do {			\
+		WARN_ON(debug_locks && !_lock_is_held(&(l)->dep_map, 1));\
+	} while (0)
+
 #define lockdep_assert_held_once(l)	do {				\
 		WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 589d763a49b3..abec578378e7 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
 	return 0;
 }
 
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(struct lockdep_map *lock, int read);
 
 /*
  * This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	}
 	chain_key = iterate_chain_key(chain_key, class_idx);
 
-	if (nest_lock && !__lock_is_held(nest_lock))
+	if (nest_lock && !__lock_is_held(nest_lock, -1))
 		return print_lock_nested_lock_not_held(curr, hlock, ip);
 
 	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3390,10 +3390,17 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	return 0;
 }
 
-static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock, int read)
 {
-	if (hlock->instance == lock)
-		return 1;
+	if (hlock->instance == lock) {
+		if (read == -1)
+			return 1;
+
+		if (hlock->read == read)
+			return 1;
+
+		return 0;
+	}
 
 	if (hlock->references) {
 		struct lock_class *class = lock->class_cache[0];
@@ -3420,6 +3427,8 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 
 		if (hlock->class_idx == class - lock_classes + 1)
 			return 1;
+
+		/* XXX do we want @read stuff for nested locks !? */
 	}
 
 	return 0;
@@ -3452,7 +3461,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3523,7 +3532,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3576,7 +3585,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 	return 1;
 }
 
-static int __lock_is_held(struct lockdep_map *lock)
+static int __lock_is_held(struct lockdep_map *lock, int read)
 {
 	struct task_struct *curr = current;
 	int i;
@@ -3584,7 +3593,7 @@ static int __lock_is_held(struct lockdep_map *lock)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, read))
 			return 1;
 	}
 
@@ -3603,7 +3612,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock)) {
+		if (match_held_lock(hlock, lock, -1)) {
 			/*
 			 * Grab 16bits of randomness; this is sufficient to not
 			 * be guessable and still allows some pin nesting in
@@ -3630,7 +3639,7 @@ static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock)) {
+		if (match_held_lock(hlock, lock, -1)) {
 			hlock->pin_count += cookie.val;
 			return;
 		}
@@ -3650,7 +3659,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock)) {
+		if (match_held_lock(hlock, lock, -1)) {
 			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
 				return;
 
@@ -3769,7 +3778,7 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
-int lock_is_held(struct lockdep_map *lock)
+int _lock_is_held(struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -3781,7 +3790,7 @@ int lock_is_held(struct lockdep_map *lock)
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
-	ret = __lock_is_held(lock);
+	ret = __lock_is_held(lock, read);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 
@@ -3908,7 +3917,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3961,7 +3970,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-11 17:10 [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead Christoph Hellwig
  2016-08-11 21:54 ` Peter Zijlstra
@ 2016-08-11 23:43 ` Dave Chinner
  2016-08-12  2:50   ` Christoph Hellwig
  1 sibling, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2016-08-11 23:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: peterz, xfs

On Thu, Aug 11, 2016 at 10:10:23AM -0700, Christoph Hellwig wrote:
> This patch drops the XFS-own i_iolock and uses the VFS i_rwsem which
> recently replaced i_mutex instead.  This means we only have to take
> one lock instead of two in many fast path operations, and we can
> also shrink the xfs_inode structure.  Thanks to the xfs_ilock family
> there is very little churn as well.
> 
> There is one major issue with this change though:  lockdep currently
> doesn't have a facility to assert a rw_semaphore is held exclusively,
> which means we lose the nice ability to assert locking context in
> XFS.
> 
> Peter, I think you mentioned this would be fairly easy to add to
> lockdep and the rw_semaphore code.  Any chance to come up with a proof
> of concept?

I saw prototype patches with a writer field in the rwsem to track
who holds it for optimistic spinning support. I was waiting for
that to land before completely removing the mrlock abstraction
from the XFS code, then changing the iolock to use the VFS inode.

Regardless, if the rwsem code can be made to check for exclusive or
shared locking, we can get rid of the mrlock abstraction. Can we do
that first, Christoph, then make this lock change?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-11 23:43 ` Dave Chinner
@ 2016-08-12  2:50   ` Christoph Hellwig
  2016-08-12  9:58     ` Dave Chinner
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-08-12  2:50 UTC (permalink / raw)
  To: Dave Chinner; +Cc: peterz, Christoph Hellwig, xfs

On Fri, Aug 12, 2016 at 09:43:35AM +1000, Dave Chinner wrote:
> Regardless, if the rwsem code can be made to check for exclusive or
> shared locking, we can get rid of the mrlock abstraction. Can we do
> that first, Christoph, then make this lock change?

I was going to do that next, but if you want the patch order switched
around I can do that as well.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-12  2:50   ` Christoph Hellwig
@ 2016-08-12  9:58     ` Dave Chinner
  2016-09-05 15:15       ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2016-08-12  9:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: peterz, xfs

On Fri, Aug 12, 2016 at 04:50:26AM +0200, Christoph Hellwig wrote:
> On Fri, Aug 12, 2016 at 09:43:35AM +1000, Dave Chinner wrote:
> > Regardless, if the rwsem code can be made to check for exclusive or
> > shared locking, we can get rid of the mrlock abstraction. Can we do
> > that first, Christoph, then make this lock change?
> 
> I was going to do that next, but if you want the patch order switched
> around I can do that as well.

Yeah, I'd prefer we remove the abstraction first, then switch to the
vfs inode lock.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-11 21:54 ` Peter Zijlstra
@ 2016-08-18 17:37   ` Christoph Hellwig
  2016-08-19 13:27     ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-08-18 17:37 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

On Thu, Aug 11, 2016 at 11:54:44PM +0200, Peter Zijlstra wrote:
> Sure, find below. Not been near a compiler.

It has now.  Below are the additions I need, and things seem to be
passing fine with that.

Note that to fit the existing XFS lock asserts I'm using
_lockdep_is_held directly instead of lockdep_assert_held_exclusive.
All the exports are there, but I'm just trying to make sure you're
not going to shout at me for that later :)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 7f0098d..4cef9f8 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -345,7 +345,8 @@ static inline int lock_is_held(struct lockdep_map *lock)
 	return _lock_is_held(lock, -1);
 }
 
-#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)
+#define lockdep_is_held(lock)		lock_is_held(&(lock)->dep_map)
+#define _lockdep_is_held(lock, r)	_lock_is_held(&(lock)->dep_map, (r))
 
 extern void lock_set_class(struct lockdep_map *lock, const char *name,
 			   struct lock_class_key *key, unsigned int subclass,
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index abec578..f39573b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3778,7 +3778,7 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
-int lock_is_held(struct lockdep_map *lock, int read)
+int _lock_is_held(struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -3796,7 +3796,7 @@ int lock_is_held(struct lockdep_map *lock, int read)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(_lock_is_held);
 
 struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
 {

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-18 17:37   ` Christoph Hellwig
@ 2016-08-19 13:27     ` Peter Zijlstra
  2016-08-20  6:37       ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-08-19 13:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Thu, Aug 18, 2016 at 07:37:07PM +0200, Christoph Hellwig wrote:
> On Thu, Aug 11, 2016 at 11:54:44PM +0200, Peter Zijlstra wrote:
> > Sure, find below. Not been near a compiler.
> 
> It has now.  Below are the additions I need, and things seem to be
> passing fine with that.
> 
> Note that to fit the existing XFS lock asserts I'm using
> _lockdep_is_held directly instead of lockdep_assert_held_exclusive.
> All the exports are there, but I'm just trying to make sure you're
> not going to shout at me for that later :)

Hurm, if you're going to directly use that maybe we should pick a better
name ;-)

Also, be sure to check the debug_locks variable, if that's cleared the
result of _lockdep_is_held() isn't reliable -- we stop tracking lock
state when there's an error.

Thanks!

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-19 13:27     ` Peter Zijlstra
@ 2016-08-20  6:37       ` Christoph Hellwig
  2016-08-22  8:34         ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-08-20  6:37 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

On Fri, Aug 19, 2016 at 03:27:36PM +0200, Peter Zijlstra wrote:
> Hurm, if you're going to directly use that maybe we should pick a better
> name ;-)

Fine with that.

> Also, be sure to check the debug_locks variable, if that's cleared the
> result of _lockdep_is_held() isn't reliable -- we stop tracking lock
> state when there's an error.

I already do.  But I'm wondering if we can't simply move the 
debug_locks check into lockdep_is_held?  It's already used directly
in a few places, and that would also solve the whole naming issue.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-20  6:37       ` Christoph Hellwig
@ 2016-08-22  8:34         ` Peter Zijlstra
  2016-09-05 15:12           ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-08-22  8:34 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Sat, Aug 20, 2016 at 08:37:23AM +0200, Christoph Hellwig wrote:
> On Fri, Aug 19, 2016 at 03:27:36PM +0200, Peter Zijlstra wrote:
> > Hurm, if you're going to directly use that maybe we should pick a better
> > name ;-)
> 
> Fine with that.
> 
> > Also, be sure to check the debug_locks variable, if that's cleared the
> > result of _lockdep_is_held() isn't reliable -- we stop tracking lock
> > state when there's an error.
> 
> I already do.  But I'm wondering if we can't simply move the 
> debug_locks check into lockdep_is_held?  It's already used directly
> in a few places, and that would also solve the whole naming issue.

Reason I didn't do that initially was that I used lock_is_held() for
both positive and negative tests (ie. assert a lock is held and a lock
is not held). Given that, you cannot pick a right return value when
!debug_locks.

Not sure we still do that, but I distinctly remember running into that
when I did as you suggest now. But that was many years ago.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-22  8:34         ` Peter Zijlstra
@ 2016-09-05 15:12           ` Christoph Hellwig
  2016-09-07  7:43             ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-09-05 15:12 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

Peter, this is the fixed up patch.  Can you write a proper changelog
and add your signoff?

---
 include/linux/lockdep.h  | 18 ++++++++++++++++--
 kernel/locking/lockdep.c | 43 ++++++++++++++++++++++++++-----------------
 2 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index eabe013..4cef9f8 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -338,9 +338,15 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
-#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)
+extern int _lock_is_held(struct lockdep_map *lock, int read);
 
-extern int lock_is_held(struct lockdep_map *lock);
+static inline int lock_is_held(struct lockdep_map *lock)
+{
+	return _lock_is_held(lock, -1);
+}
+
+#define lockdep_is_held(lock)		lock_is_held(&(lock)->dep_map)
+#define _lockdep_is_held(lock, r)	_lock_is_held(&(lock)->dep_map, (r))
 
 extern void lock_set_class(struct lockdep_map *lock, const char *name,
 			   struct lock_class_key *key, unsigned int subclass,
@@ -372,6 +378,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		WARN_ON(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
+#define lockdep_assert_held_exclusive(l)	do {		\
+		WARN_ON(debug_locks && !_lockdep_is_held(l, 0));\
+	} while (0)
+
+#define lockdep_assert_held_read(l)	do {			\
+		WARN_ON(debug_locks && !_lockdep_is_held(l, 1));\
+	} while (0)
+
 #define lockdep_assert_held_once(l)	do {				\
 		WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 589d763..f39573b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
 	return 0;
 }
 
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(struct lockdep_map *lock, int read);
 
 /*
  * This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	}
 	chain_key = iterate_chain_key(chain_key, class_idx);
 
-	if (nest_lock && !__lock_is_held(nest_lock))
+	if (nest_lock && !__lock_is_held(nest_lock, -1))
 		return print_lock_nested_lock_not_held(curr, hlock, ip);
 
 	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3390,10 +3390,17 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
 	return 0;
 }
 
-static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock, int read)
 {
-	if (hlock->instance == lock)
-		return 1;
+	if (hlock->instance == lock) {
+		if (read == -1)
+			return 1;
+
+		if (hlock->read == read)
+			return 1;
+
+		return 0;
+	}
 
 	if (hlock->references) {
 		struct lock_class *class = lock->class_cache[0];
@@ -3420,6 +3427,8 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 
 		if (hlock->class_idx == class - lock_classes + 1)
 			return 1;
+
+		/* XXX do we want @read stuff for nested locks !? */
 	}
 
 	return 0;
@@ -3452,7 +3461,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3523,7 +3532,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3576,7 +3585,7 @@ found_it:
 	return 1;
 }
 
-static int __lock_is_held(struct lockdep_map *lock)
+static int __lock_is_held(struct lockdep_map *lock, int read)
 {
 	struct task_struct *curr = current;
 	int i;
@@ -3584,7 +3593,7 @@ static int __lock_is_held(struct lockdep_map *lock)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, read))
 			return 1;
 	}
 
@@ -3603,7 +3612,7 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock)) {
+		if (match_held_lock(hlock, lock, -1)) {
 			/*
 			 * Grab 16bits of randomness; this is sufficient to not
 			 * be guessable and still allows some pin nesting in
@@ -3630,7 +3639,7 @@ static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock)) {
+		if (match_held_lock(hlock, lock, -1)) {
 			hlock->pin_count += cookie.val;
 			return;
 		}
@@ -3650,7 +3659,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock)) {
+		if (match_held_lock(hlock, lock, -1)) {
 			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
 				return;
 
@@ -3769,7 +3778,7 @@ void lock_release(struct lockdep_map *lock, int nested,
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
-int lock_is_held(struct lockdep_map *lock)
+int _lock_is_held(struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -3781,13 +3790,13 @@ int lock_is_held(struct lockdep_map *lock)
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
-	ret = __lock_is_held(lock);
+	ret = __lock_is_held(lock, read);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(_lock_is_held);
 
 struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
 {
@@ -3908,7 +3917,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
@@ -3961,7 +3970,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 		 */
 		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
 			break;
-		if (match_held_lock(hlock, lock))
+		if (match_held_lock(hlock, lock, -1))
 			goto found_it;
 		prev_hlock = hlock;
 	}
-- 
2.1.4

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-08-12  9:58     ` Dave Chinner
@ 2016-09-05 15:15       ` Christoph Hellwig
  2016-09-07 21:45         ` Dave Chinner
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-09-05 15:15 UTC (permalink / raw)
  To: Dave Chinner; +Cc: peterz, Christoph Hellwig, xfs

Hi Dave,

I looked into killing the mrlock and ran into an unexpected problem.

Currently mr_writer tracks that there is someone holding a write lock,
lockdep on the other hand checks if the calling thread has that lock.

While that generally is the right semantic, our hack to offload
btree splits to a work item offends lockdep.  E.g. this callstack
now asserts:

generic/256	[   32.729465] run fstests generic/256 at 2016-09-05 15:09:48
[   33.078511] XFS (vdc): Mounting V5 Filesystem
[   33.090875] XFS (vdc): Ending clean mount
[   59.158520] XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/xfs_trans_inode.c, line: 100
[   59.159559] ------------[ cut here ]------------
[   59.160034] kernel BUG at fs/xfs/xfs_message.c:113!
[   59.160367] invalid opcode: 0000 [#1] SMP
[   59.160633] Modules linked in:
[   59.160846] CPU: 3 PID: 7284 Comm: kworker/3:3 Not tainted 4.8.0-rc2+ #1149
[   59.161056] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
[   59.161056] Workqueue: xfsalloc xfs_btree_split_worker
[   59.161056] task: ffff880136d25ac0 task.stack: ffff8800bb864000
[   59.161056] RIP: 0010:[<ffffffff8159309d>]  [<ffffffff8159309d>] assfail+0x1d/0x20
[   59.161056] RSP: 0018:ffff8800bb867ba0  EFLAGS: 00010282
[   59.161056] RAX: 00000000ffffffea RBX: ffff8801339f3300 RCX: 0000000000000021
[   59.161056] RDX: ffff8800bb867ac8 RSI: 000000000000000a RDI: ffffffff82403b91
[   59.161056] RBP: ffff8800bb867ba0 R08: 0000000000000000 R09: 0000000000000000
[   59.161056] R10: 000000000000000a R11: f000000000000000 R12: 0000000000000001
[   59.161056] R13: ffff8801356aaaf8 R14: ffff8800bb867bd8 R15: ffff8801352d1d98
[   59.161056] FS:  0000000000000000(0000) GS:ffff88013fd80000(0000) knlGS:0000000000000000
[   59.161056] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   59.161056] CR2: 000000000061ee00 CR3: 00000000bb956000 CR4: 00000000000006e0
[   59.161056] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   59.161056] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   59.161056] Stack:
[   59.161056]  ffff8800bb867bc8 ffffffff815b467d ffff8801352d1d98 ffff8800bba0fadc
[   59.161056]  ffff8800bb867d10 ffff8800bb867c88 ffffffff81536c0d ffff8801356aaaf8
[   59.161056]  ffff88013ad64000 ffff8801370e3340 ffff8801373d5600 0000000000000000
[   59.161056] Call Trace:
[   59.161056]  [<ffffffff815b467d>] xfs_trans_log_inode+0x5d/0xd0
[   59.161056]  [<ffffffff81536c0d>] xfs_bmbt_alloc_block+0x15d/0x220
[   59.161056]  [<ffffffff8153d526>] __xfs_btree_split+0xb6/0xae0
[   59.161056]  [<ffffffff81e33907>] ? _raw_spin_unlock_irq+0x27/0x40
[   59.161056]  [<ffffffff8153dfc1>] xfs_btree_split_worker+0x71/0xb0
[   59.161056]  [<ffffffff810f58a1>] process_one_work+0x1c1/0x600
[   59.161056]  [<ffffffff810f581b>] ? process_one_work+0x13b/0x600
[   59.161056]  [<ffffffff810f5d44>] worker_thread+0x64/0x4a0
[   59.161056]  [<ffffffff810f5ce0>] ? process_one_work+0x600/0x600
[   59.161056]  [<ffffffff810fb951>] kthread+0xf1/0x110
[   59.161056]  [<ffffffff81e341ef>] ret_from_fork+0x1f/0x40
[   59.161056]  [<ffffffff810fb860>] ? kthread_create_on_node+0x200/0x200

While it previously did fine.  I fear there might be other locking
asserts in the code called from that work item as well.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-05 15:12           ` Christoph Hellwig
@ 2016-09-07  7:43             ` Peter Zijlstra
  2016-09-08  6:06               ` Ingo Molnar
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-09-07  7:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Ingo Molnar, xfs

On Mon, Sep 05, 2016 at 05:12:44PM +0200, Christoph Hellwig wrote:
> Peter, this is the fixed up patch.  Can you write a proper changelog
> and add your signoff?

Yes, sorry for the delay.

Did you get the workqueue thing sorted, where you rely on another task
holding the lock for you?

I simplified the implementation a little, since I noticed pushing the
@read argument all the way down to match_held_lock() didn't really make
all that much sense and caused a lot of churn.

I also added CONFIG_LOCKDEP=n stubs for the new lockdep_assert_held*()
macros (which you don't use :-).

Feel free to push this through the XFS tree where you add its first use.

---
Subject: locking/lockdep: Provide a type check for lock_is_held
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 5 Sep 2016 17:12:44 +0200

Christoph requested lockdep_assert_held() variants that distinguish
between held-for-read or held-for-write.

Provide:

  int lock_is_held_type(struct lockdep_map *lock, int read)

which takes the same argument as lock_acquire(.read) and matches it to
the held_lock instance.

Use of this function should be gated by the debug_locks variable. When
that is 0 the return value of the lock_is_held_type() function is
undefined. This is done to allow both negative and positive tests for
holding locks.

By default we provide (positive)
lockdep_assert_held{,_exclusive,_read}() macros.

Requested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/lockdep.h  |   23 +++++++++++++++++++++--
 kernel/locking/lockdep.c |   20 ++++++++++++--------
 2 files changed, 33 insertions(+), 10 deletions(-)

--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_
 extern void lock_release(struct lockdep_map *lock, int nested,
 			 unsigned long ip);
 
-#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)
+/*
+ * Same "read" as for lock_acquire(), except -1 means any.
+ */
+extern int lock_is_held_type(struct lockdep_map *lock, int read);
+
+static inline int lock_is_held(struct lockdep_map *lock)
+{
+	return lock_is_held_type(lock, -1);
+}
 
-extern int lock_is_held(struct lockdep_map *lock);
+#define lockdep_is_held(lock)		lock_is_held(&(lock)->dep_map)
+#define lockdep_is_held_type(lock, r)	lock_is_held_type(&(lock)->dep_map, (r))
 
 extern void lock_set_class(struct lockdep_map *lock, const char *name,
 			   struct lock_class_key *key, unsigned int subclass,
@@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockd
 		WARN_ON(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
+#define lockdep_assert_held_exclusive(l)	do {			\
+		WARN_ON(debug_locks && !lockdep_is_held_type(l, 0));	\
+	} while (0)
+
+#define lockdep_assert_held_read(l)	do {				\
+		WARN_ON(debug_locks && !lockdep_is_held_type(l, 1));	\
+	} while (0)
+
 #define lockdep_assert_held_once(l)	do {				\
 		WARN_ON_ONCE(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
@@ -429,6 +446,8 @@ struct lock_class_key { };
 #define lockdep_depth(tsk)	(0)
 
 #define lockdep_assert_held(l)			do { (void)(l); } while (0)
+#define lockdep_assert_held_exclusive(l)	do { (void)(l); } while (0)
+#define lockdep_assert_held_read(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
 
 #define lockdep_recursing(tsk)			(0)
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct t
 	return 0;
 }
 
-static int __lock_is_held(struct lockdep_map *lock);
+static int __lock_is_held(struct lockdep_map *lock, int read);
 
 /*
  * This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep
 	}
 	chain_key = iterate_chain_key(chain_key, class_idx);
 
-	if (nest_lock && !__lock_is_held(nest_lock))
+	if (nest_lock && !__lock_is_held(nest_lock, -1))
 		return print_lock_nested_lock_not_held(curr, hlock, ip);
 
 	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3576,7 +3576,7 @@ __lock_release(struct lockdep_map *lock,
 	return 1;
 }
 
-static int __lock_is_held(struct lockdep_map *lock)
+static int __lock_is_held(struct lockdep_map *lock, int read)
 {
 	struct task_struct *curr = current;
 	int i;
@@ -3584,8 +3584,12 @@ static int __lock_is_held(struct lockdep
 	for (i = 0; i < curr->lockdep_depth; i++) {
 		struct held_lock *hlock = curr->held_locks + i;
 
-		if (match_held_lock(hlock, lock))
-			return 1;
+		if (match_held_lock(hlock, lock)) {
+			if (read == -1 || hlock->read == read)
+				return 1;
+
+			return 0;
+		}
 	}
 
 	return 0;
@@ -3769,7 +3773,7 @@ void lock_release(struct lockdep_map *lo
 }
 EXPORT_SYMBOL_GPL(lock_release);
 
-int lock_is_held(struct lockdep_map *lock)
+int lock_is_held_type(struct lockdep_map *lock, int read)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -3781,13 +3785,13 @@ int lock_is_held(struct lockdep_map *loc
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
-	ret = __lock_is_held(lock);
+	ret = __lock_is_held(lock, read);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(lock_is_held);
+EXPORT_SYMBOL_GPL(lock_is_held_type);
 
 struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
 {

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-05 15:15       ` Christoph Hellwig
@ 2016-09-07 21:45         ` Dave Chinner
  2016-09-08  6:54           ` Peter Zijlstra
  2016-09-09  8:33           ` Christoph Hellwig
  0 siblings, 2 replies; 25+ messages in thread
From: Dave Chinner @ 2016-09-07 21:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: peterz, xfs

On Mon, Sep 05, 2016 at 05:15:29PM +0200, Christoph Hellwig wrote:
> Hi Dave,
> 
> I looked into killing the mrlock and ran into an unexpected problem.
> 
> Currently mr_writer tracks that there is someone holding a write lock,
> lockdep on the other hand checks if the calling thread has that lock.
> 
> While that generally is the right semantic, our hack to offload
> btree splits to a work item offends lockdep.  E.g. this callstack
> now asserts:

It's a semaphore, not a mutex. Semaphore locking is independent of
task context, the lock follows the object it protects, not the task
that took the lock. i.e. Lockdep is wrong to assume the "owner" of a
rw_sem will not change between lock and unlock.

> While it previously did fine.  I fear there might be other locking
> asserts in the code called from that work item as well.

Probably.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-07  7:43             ` Peter Zijlstra
@ 2016-09-08  6:06               ` Ingo Molnar
  0 siblings, 0 replies; 25+ messages in thread
From: Ingo Molnar @ 2016-09-08  6:06 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Sep 05, 2016 at 05:12:44PM +0200, Christoph Hellwig wrote:
> > Peter, this is the fixed up patch.  Can you write a proper changelog
> > and add your signoff?
> 
> Yes, sorry for the delay.
> 
> Did you get the workqueue thing sorted, where you rely on another task
> holding the lock for you?
> 
> I simplified the implementation a little, since I noticed pushing the
> @read argument all the way down to match_held_lock() didn't really make
> all that much sense and caused a lot of churn.
> 
> I also added CONFIG_LOCKDEP=n stubs for the new lockdep_assert_held*()
> macros (which you don't use :-).
> 
> Feel free to push this through the XFS tree where you add its first use.

I can also create a separate one-commit tree for it which you can pull into the 
XFS tree. This way there won't be conflicts between the XFS tree and the locking 
tree.

Thanks,

	Ingo

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-07 21:45         ` Dave Chinner
@ 2016-09-08  6:54           ` Peter Zijlstra
  2016-09-09  1:06             ` Dave Chinner
  2016-09-09  8:33           ` Christoph Hellwig
  1 sibling, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-09-08  6:54 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs

On Thu, Sep 08, 2016 at 07:45:36AM +1000, Dave Chinner wrote:
> On Mon, Sep 05, 2016 at 05:15:29PM +0200, Christoph Hellwig wrote:
> > Hi Dave,
> > 
> > I looked into killing the mrlock and ran into an unexpected problem.
> > 
> > Currently mr_writer tracks that there is someone holding a write lock,
> > lockdep on the other hand checks if the calling thread has that lock.
> > 
> > While that generally is the right semantic, our hack to offload
> > btree splits to a work item offends lockdep.  E.g. this callstack
> > now asserts:
> 
> It's a semaphore, not a mutex. Semaphore locking is independent of
> task context, the lock follows the object it protects, not the task
> that took the lock. i.e. Lockdep is wrong to assume the "owner" of a
> rw_sem will not change between lock and unlock.

We've added strict owner semantics to rwsem a long time ago.

If you want the actual semaphore semantics (which we greatly discourage,
because you cannot do validation on it) you should use
{down,up}_read_non_owner().

I'm not sure we've got write_non_owner() variants for this.

Turns out, there really are very few 'semaphore' users.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-08  6:54           ` Peter Zijlstra
@ 2016-09-09  1:06             ` Dave Chinner
  2016-09-09  8:21               ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2016-09-09  1:06 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

On Thu, Sep 08, 2016 at 08:54:42AM +0200, Peter Zijlstra wrote:
> On Thu, Sep 08, 2016 at 07:45:36AM +1000, Dave Chinner wrote:
> > On Mon, Sep 05, 2016 at 05:15:29PM +0200, Christoph Hellwig wrote:
> > > Hi Dave,
> > > 
> > > I looked into killing the mrlock and ran into an unexpected problem.
> > > 
> > > Currently mr_writer tracks that there is someone holding a write lock,
> > > lockdep on the other hand checks if the calling thread has that lock.
> > > 
> > > While that generally is the right semantic, our hack to offload
> > > btree splits to a work item offends lockdep.  E.g. this callstack
> > > now asserts:
> > 
> > It's a semaphore, not a mutex. Semaphore locking is independent of
> > task context, the lock follows the object it protects, not the task
> > that took the lock. i.e. Lockdep is wrong to assume the "owner" of a
> > rw_sem will not change between lock and unlock.
> 
> We've added strict owner semantics to rwsem a long time ago.

/me sighs.

History repeats.

We went through all this crap about semaphores and strict ownership
a few years ago when someone tried to make the struct semaphore
require strict ownership and XFS went all explodey. :/

> If you want the actual semaphore semantics (which we greatly discourage,
> because you cannot do validation on it)

What you actually means is "you cannot use /lockdep/ on it", not
"cannot do validation" on it.  Indeed, this discussion started from
wanting to remove the wrappers that implement XFS context
specific locking validation on top of rwsems.

Yes, lockdep has it's uses, but let's not ignore the fact that it
also has significant limitations, introduces serious annotation
complexity for non-trivial locking models (e.g. XFS inodes), and
many people use it as a crutch to avoid thinking about locking
models (i.e. "lockdep doesn't complain, so it must be good!").

From my perspective, lockdep is a very poor replacement for
architecting a robust locking model and then implementing code that
directly asserts that the correct locks are held at the correct
time. We trip over failed XFS locking asserts during development all
the time, but it's very, very rare to have lockdep tell us we have a
problem in XFS because we catch the problems long before lockdep
will notice them.

> you should use
> {down,up}_read_non_owner().
> I'm not sure we've got write_non_owner() variants for this.

For the case Christoph reported, it's the write side context of the
inode locks that is handed off to other threads. And no, we don't
have annotations for that.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  1:06             ` Dave Chinner
@ 2016-09-09  8:21               ` Peter Zijlstra
  2016-09-09  8:34                 ` Christoph Hellwig
  2016-09-11  0:17                 ` Dave Chinner
  0 siblings, 2 replies; 25+ messages in thread
From: Peter Zijlstra @ 2016-09-09  8:21 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, xfs

On Fri, Sep 09, 2016 at 11:06:01AM +1000, Dave Chinner wrote:
> On Thu, Sep 08, 2016 at 08:54:42AM +0200, Peter Zijlstra wrote:

> > We've added strict owner semantics to rwsem a long time ago.
> 
> /me sighs.
> 
> History repeats.

I doubt it, this was 2006 we're talking about.

> From my perspective, lockdep is a very poor replacement for
> architecting a robust locking model and then implementing code that
> directly asserts that the correct locks are held at the correct
> time.

Very few people architect locking :-/, but lockdep works as the assert,
and I've been encouraging people to use that instead of comments like:

  /* this function should be called with foo lock held */

Now, the problem is that lockdep also asserts the caller is the lock
owner, and not some other random thing.

And given the amount of locking fail caught by lockdep (still..) you
really cannot argue against it.

Sure its a pain at times, but so is pretty much everything.

> > you should use
> > {down,up}_read_non_owner().
> > I'm not sure we've got write_non_owner() variants for this.
> 
> For the case Christoph reported, it's the write side context of the
> inode locks that is handed off to other threads. And no, we don't
> have annotations for that.

So the xfs mrlock already uses rwsem, semantics have not changed. So the
case Christoph found should already conform to the owner semantics.

I've not looked at code, but if the worker is synchronized against the
critical section (it'd pretty much have to be to rely on its locking)
there's nothing wrong, it's just that the lockdep_assert_held() thingies
cannot work as is.

That is:

	task A				task B

	down_write(&rwsem);
	queue_work(&work);
					worker()
					  lockdep_assert_held(&rwsem);
	flush_work(&work);
	up_write(&rwsem);


Doesn't work. Explicitly so in fact.

Does the worker have a pointer back to taskA ? I could try something
like lockdep_assert_held_by(lock, task), just need to be careful,
the per-task lock state is just that, per-task, no serialization.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-07 21:45         ` Dave Chinner
  2016-09-08  6:54           ` Peter Zijlstra
@ 2016-09-09  8:33           ` Christoph Hellwig
  2016-09-09  8:44             ` Peter Zijlstra
  1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-09-09  8:33 UTC (permalink / raw)
  To: Dave Chinner; +Cc: peterz, Christoph Hellwig, xfs

On Thu, Sep 08, 2016 at 07:45:36AM +1000, Dave Chinner wrote:
> It's a semaphore, not a mutex. Semaphore locking is independent of
> task context, the lock follows the object it protects, not the task
> that took the lock. i.e. Lockdep is wrong to assume the "owner" of a
> rw_sem will not change between lock and unlock.

That's not the case - rw_semaphores had strict owner semantics for a
long time (although I wish we could get rid of that for a different
reason..).

The problem here is not that we have different tasks acquire and release
the lock - it's always the same.

The "problem" is that we hand off work to a different task in between
and that task asserts that the lock is held.  With the old mrlock hack
our islocked macros would return true as long as _someone_ holds the
lock, while lockdep is generally more strict and wants the current
process to hold the lock.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  8:21               ` Peter Zijlstra
@ 2016-09-09  8:34                 ` Christoph Hellwig
  2016-09-11  0:17                 ` Dave Chinner
  1 sibling, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2016-09-09  8:34 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

On Fri, Sep 09, 2016 at 10:21:39AM +0200, Peter Zijlstra wrote:
> Does the worker have a pointer back to taskA ? I could try something
> like lockdep_assert_held_by(lock, task), just need to be careful,
> the per-task lock state is just that, per-task, no serialization.

Currently it doesn't, but we could add one.  It would be rather hacky,
though.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  8:33           ` Christoph Hellwig
@ 2016-09-09  8:44             ` Peter Zijlstra
  2016-09-09  9:05               ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-09-09  8:44 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Fri, Sep 09, 2016 at 10:33:06AM +0200, Christoph Hellwig wrote:
> On Thu, Sep 08, 2016 at 07:45:36AM +1000, Dave Chinner wrote:
> > It's a semaphore, not a mutex. Semaphore locking is independent of
> > task context, the lock follows the object it protects, not the task
> > that took the lock. i.e. Lockdep is wrong to assume the "owner" of a
> > rw_sem will not change between lock and unlock.
> 
> That's not the case - rw_semaphores had strict owner semantics for a
> long time (although I wish we could get rid of that for a different
> reason..).

Do tell; note however that due to the strict write owner, we can do
things like the optimistic spinning which improved writer->writer
performance significantly.

Also note that !owner locks create problems for RT.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  8:44             ` Peter Zijlstra
@ 2016-09-09  9:05               ` Christoph Hellwig
  2016-09-09  9:51                 ` Peter Zijlstra
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2016-09-09  9:05 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

On Fri, Sep 09, 2016 at 10:44:50AM +0200, Peter Zijlstra wrote:
> Do tell; note however that due to the strict write owner, we can do
> things like the optimistic spinning which improved writer->writer
> performance significantly.
> 
> Also note that !owner locks create problems for RT.

You clearly missed out on our little Linux-RT conference in June :)

While I basically agree with you there is an important class of problems
that warrant a non-owner release, and that is I/O.  The easiest example
(which also happens to be somewhat relevant for this thread) is the
XFS iolock (or i_rwsem for that matter).  We need to hold this over
file writes, but for direct I/O those always go out to disk.  In the
AIO case there simply is no owner left once control is handed off
to the disk / controller and we'll only be able to unlock once we
get a completion.  Currently we work around that using i_dio_count
and a hashed waitqueue, but in many ways that solution is worse than
the problem (and I say that as the person that introduced it!).

We have many many similar issues all over the tree, and people are
"fixing" it using home grown lock primitives like the one above
or using bitlocks (see the recent thread on cross-release semantics
for those).  I think everyone would be better served by accepting
that this case exists and finding a place for it in the framework.
E.g. for RT trying to boost something that is fully under control
of hardware is pointless, but if we have a way to transfer a lock
from an owner to a hardware owned state we could at least boost
until that handoff happened.

None of that is really relevant for this thread though - no cross
release involved in the case that's tripping up lockdep here, just
lock "borrowing" using a synchronous work item.  The real root cause
here is the too small kernel stack that led to this workqueue.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  9:05               ` Christoph Hellwig
@ 2016-09-09  9:51                 ` Peter Zijlstra
  2016-09-10 16:20                   ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2016-09-09  9:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Fri, Sep 09, 2016 at 11:05:44AM +0200, Christoph Hellwig wrote:

> You clearly missed out on our little Linux-RT conference in June :)

Indeed.

> While I basically agree with you there is an important class of problems
> that warrant a non-owner release, and that is I/O.  The easiest example
> (which also happens to be somewhat relevant for this thread) is the
> XFS iolock (or i_rwsem for that matter).  We need to hold this over
> file writes, but for direct I/O those always go out to disk.  In the
> AIO case there simply is no owner left once control is handed off
> to the disk / controller and we'll only be able to unlock once we
> get a completion.  Currently we work around that using i_dio_count
> and a hashed waitqueue, but in many ways that solution is worse than
> the problem (and I say that as the person that introduced it!).

Right, the IO problem.

> We have many many similar issues all over the tree, and people are
> "fixing" it using home grown lock primitives like the one above
> or using bitlocks (see the recent thread on cross-release semantics
> for those). 

Completions and semaphores don't work? And yes, I need to look at that
cross-release muck, but as is that stuff sets my teeth on edge.

> I think everyone would be better served by accepting
> that this case exists and finding a place for it in the framework.
> E.g. for RT trying to boost something that is fully under control
> of hardware is pointless, but if we have a way to transfer a lock
> from an owner to a hardware owned state we could at least boost
> until that handoff happened.

Could be worse than pointless, could indicate borkage. But yes, once you
have that event you could propagate it up the PI chain and notify
things.

IO rarely is deterministic, so having RT tasks in a blocked-on chain
with it is fail. And yes, there's exceptions etc..

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  9:51                 ` Peter Zijlstra
@ 2016-09-10 16:20                   ` Christoph Hellwig
  0 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2016-09-10 16:20 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: xfs

On Fri, Sep 09, 2016 at 11:51:48AM +0200, Peter Zijlstra wrote:
> Completions and semaphores don't work? And yes, I need to look at that
> cross-release muck, but as is that stuff sets my teeth on edge.

Completions can be used as hacks for some of it - we have two or three
places where we do that in XFS.  Using semaphores doesn't seem very
popular.  Also I'd much prefer to have a proper lock instead of working
around it, most importantly to get good lockdep support.

And none of that addresses the fact that we're talking about a
shared/exclusive lock here.

> > I think everyone would be better served by accepting
> > that this case exists and finding a place for it in the framework.
> > E.g. for RT trying to boost something that is fully under control
> > of hardware is pointless, but if we have a way to transfer a lock
> > from an owner to a hardware owned state we could at least boost
> > until that handoff happened.
> 
> Could be worse than pointless, could indicate borkage.

Yes - pointless is still the best case.

> But yes, once you
> have that event you could propagate it up the PI chain and notify
> things.
> 
> IO rarely is deterministic, so having RT tasks in a blocked-on chain
> with it is fail. And yes, there's exceptions etc..

That's often true, but not always.  There is things like battery backed
DRAM which is very deterministic, and there is a lot of work going on
to provide relatively deterministic ways of using flash storage.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-09  8:21               ` Peter Zijlstra
  2016-09-09  8:34                 ` Christoph Hellwig
@ 2016-09-11  0:17                 ` Dave Chinner
  2016-09-13 19:42                   ` Peter Zijlstra
  1 sibling, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2016-09-11  0:17 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Christoph Hellwig, xfs

On Fri, Sep 09, 2016 at 10:21:39AM +0200, Peter Zijlstra wrote:
> On Fri, Sep 09, 2016 at 11:06:01AM +1000, Dave Chinner wrote:
> > On Thu, Sep 08, 2016 at 08:54:42AM +0200, Peter Zijlstra wrote:
> > From my perspective, lockdep is a very poor replacement for
> > architecting a robust locking model and then implementing code that
> > directly asserts that the correct locks are held at the correct
> > time.
> 
> Very few people architect locking :-/,

And so lockdep was written to help catch such situations. Instead of
encouraging people to be more careful about locking, it has become a
crutch that people lean on: "if lockdep is silent, my locking must
be good now"...

Law of unintended consequences, eh?

> but lockdep as the assert,
> and I've been encouraging people to use that instead of comments like:
> 
>   /* this function should be called with foo lock held */

Yup, that's been a standard practice in XFS since day zero.  I'm
glad to see that locking development practices are slowly catching
up with the state of the art from the early 90s. :P

> Now, the problem is that lockdep also asserts the caller is the lock
> owner, and not some other random thing.

Yeah, it does not handle the "lock follows object" model used by
multi-process asynchronous execution engines (which is pretty much
what the linux IO subsystem is). As XFS is likely to move more
towards async execution in future, the need for non-owner support in
semaphores is more important than ever...

> And given the amount of locking fail caught by lockdep (still..) you
> really cannot argue against it.

Sure, but I'm not saying "remove lockdep". What I'm saying is that
/lockdep does not define locking primitive behaviour/. Lockdep is
*only a debugging tool*.

IMO, it is wrong to restrict the semantics of a lock type because of
the limitations of a debugging tool. All that does is encourage
people to invent their own locking primitives that are hidden from
the debugging tool and are more likely to be broken than not. We've
done this several times in XFS, and we've even propagated such
frakenstein infrastructure into the VFS to save others the pain of
inventing their own non-owner exclusion mechanisms...

> > > you should use
> > > {down,up}_read_non_owner().
> > > I'm not sure we've got write_non_owner() variants for this.
> > 
> > For the case Christoph reported, it's the write side context of the
> > inode locks that is handed off to other threads. And no, we don't
> > have annotations for that.
> 
> So the xfs mrlock already uses rwsem, semantics have not changed. So the
> case Christoph found should already conform to the owner semantics.

Which, until there was "spin on owner" added to the rwsems, the
rwsem did not track ownership of the lock. i.e. prior to 2014
(commit 4fc828e "locking/rwsem: Support optimistic spinning"), the
rwsem contained:

struct rw_semaphore {
      long                    count;
      raw_spinlock_t          wait_lock;
      struct list_head        wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
      struct lockdep_map      dep_map;
#endif
};

i.e. historically the only "ownership dependency" a rwsem has ever
had was added by lockdep, which then required annotations to tell
lockdep the usage model of the lock. And even now the optimistic
spinning code doesn't actually use the owner for checking ownership
- the value is simply a special cookie to determine if the lock is
held in read or write mode...

> I've not looked at code, but if the worker is synchronized against the
> critical section (it'd pretty much have to be to rely on its locking)
> there's nothing wrong, it's just that the lockdep_assert_held() thingies
> cannot work as is.
> 
> That is:
> 
> 	task A				task B
> 
> 	down_write(&rwsem);
> 	queue_work(&work);
> 					worker()
> 					  lockdep_assert_held(&rwsem);
> 	flush_work(&work);
> 	up_write(&rwsem);
> 
> 
> Doesn't work. Explicitly so in fact.

Then lockdep behaviour is incorrect for rwsems. By definition,
semaphores have explicit non-owner semantics - a semaphore with
strict task ownership constraints is a mutex, not a semaphore....

> Does the worker have a pointer back to taskA ?

Nope. it runs the work while task A sleeps on a completion. When
it's done, task A is woken and it continues.

And, FWIW, we can take locks on objects (e.g. buffers) in task B
that are then released by task A, too.  Unsurprisingly, those
objects are also protected by semaphores, explicitly because of the
non-owner behaviour those object locks have...

> I could try something
> like lockdep_assert_held_by(lock, task), just need to be careful,
> the per-task lock state is just that, per-task, no serialization.

Add a lockdep init flag for rwsems to say "don't track owners" and
we'll set it on the rwsems we pass to other contexts. Then you can
make sure that lockdep asserts don't fire when we pass those locked
objects to other tasks and/or get locked/unlocked by different
tasks...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead
  2016-09-11  0:17                 ` Dave Chinner
@ 2016-09-13 19:42                   ` Peter Zijlstra
  0 siblings, 0 replies; 25+ messages in thread
From: Peter Zijlstra @ 2016-09-13 19:42 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Ingo Molnar, Thomas Gleixner, Christoph Hellwig, xfs

On Sun, Sep 11, 2016 at 10:17:08AM +1000, Dave Chinner wrote:
> On Fri, Sep 09, 2016 at 10:21:39AM +0200, Peter Zijlstra wrote:
> > On Fri, Sep 09, 2016 at 11:06:01AM +1000, Dave Chinner wrote:
> > > On Thu, Sep 08, 2016 at 08:54:42AM +0200, Peter Zijlstra wrote:
> > > From my perspective, lockdep is a very poor replacement for
> > > architecting a robust locking model and then implementing code that
> > > directly asserts that the correct locks are held at the correct
> > > time.
> > 
> > Very few people architect locking :-/,
> 
> And so lockdep was written to help catch such situations. Instead of
> encouraging people more careful about locking, it has become a
> crutch that people lean on: "if lockdep is silent, my locking must
> be good now"...

You know as well as anybody that with the number of people working on
the kernel you simply cannot 'encourage' to any level of practical
result.

Also, lockdep has caught, and I'm sure still does (although its mostly
invisible since people tend to not send out code with obvious lockdep
fail) locking fail in core kernel code, written by people who really do
know wth they're on about (mostly :-).

I understand you're frustrated, but really, you don't want to return to
the state of locking fail we had.

> Yup, that's been a standard practice in XFS since day zero.  I'm
> glad to see that locking development practices are slowly catching
> up with the state of the art from the early 90s. :P

Meh, asserts and testing invariants was well established in the 70s even
I think. They even had SMP back then :-)

> > Now, the problem is that lockdep also asserts the caller is the lock
> > owner, and not some other random thing.
> 
> Yeah, it does not handle the "lock follows object" model used by
> multi-process asynchronous execution engines (which is pretty much
> what the linux IO subsystem is). As XFS is likely to move more
> towards async execution in future, the need for non-owner support in
> semaphores is more important than ever...

So we'll have to talk about that, because at the same time there's the
push towards RT.

> > And given the amount of locking fail caught by lockdep (still..) you
> > really cannot argue against it.
> 
> Sure, but I'm not saying "remove lockdep". What I'm saying is that
> /lockdep does not define locking primitive behaviour/. Lockdep is
> *only a debugging tool*.

If you make it too easy to not use the tool, it'll not be used.

Similarly, you very much want to discourage async locking in general,
because its impossible to do PI on it. Also, it really is harder to get
right.

And yes, IO is the special case here, and I realize you're having pain.

> IMO, it is wrong to restrict the semantics of a lock type because of
> the limitations of a debugging tool. All that does is encourage
> people to invent their own locking primitives that are hidden from
> the debugging tool and are more likely to be broken than not. We've
> done this several times in XFS, and we've even propagated such
> frakenstein infrastructure into the VFS to save others the pain of
> inventing their own non-owner exclusion mechanisms...

I'm sure I don't know the details of what you're referring to. But
general infrastructure is better than many copies.

> > So the xfs mrlock already uses rwsem, semantics have not changed. So the
> > case Christoph found should already conform to the owner semantics.
> 
> Which, until there was "spin on owner" added to the rwsems, the
> rwsem did not track ownership of the lock. i.e. prior to 2014
> (commit 4fc828e "locking/rwsem: Support optimistic spinning"), the
> rwsem contained:
> 
> struct rw_semaphore {
>       long                    count;
>       raw_spinlock_t          wait_lock;
>       struct list_head        wait_list;
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
>       struct lockdep_map      dep_map;
> #endif
> };
> 
> i.e. historically the only "ownership dependency" a rwsem has ever
> had was added by lockdep, which then required annotations to tell
> lockdep the usage model of the lock. And even now the optimistic
> spinning code doesn't actually use the owner for checking ownership
> - the value is simply a special cookie to determine if the lock is
> held in read or write mode...

Not entirely accurate, the RT patches actually did the ownership thing
before lockdep (lockdep was born out of RT running into a giant heap of
locking fail).

RT needs ownership because of PI.

And I realize RT isn't in mainline and therefore is less visible, but
people are now spending moneys on getting this 'fixed', there's a
Linux Foundation project for it etc..

But yes, the spin-on-owner is an optimization that's clearly taking
advantage of actually having an owner.

> > I've not looked at code, but if the worker is synchronized against the
> > critical section (it'd pretty much have to be to rely on its locking)
> > there's nothing wrong, it's just that the lockdep_assert_held() thingies
> > cannot work as is.
> > 
> > That is:
> > 
> > 	task A				task B
> > 
> > 	down_write(&rwsem);
> > 	queue_work(&work);
> > 					worker()
> > 					  lockdep_assert_held(&rwsem);
> > 	flush_work(&work);
> > 	up_write(&rwsem);
> > 
> > 
> > Doesn't work. Explicitly so in fact.
> 
> Then lockdep behaviour is incorrect for rwsems. By definition,
> semaphores have explicit non-owner semantics - a semaphore with
> strict task ownership constraints is a mutex, not a semaphore....

I know, there's been a fair number of discussions on renaming the thing;
but we've never actually managed to come up with a 'better' name.

rw-mutex is equally silly, because MutEx stands for Mutual-Exclusion,
which is very much not the case for the reader part. rwlock is already
taken by the spinning variant.

Also, renaming the thing is a massive pain :-)

But really, at this point rwsem should not be considered a traditional
semaphore, whatever the name says.

Also, strictly speaking, a semaphore is a counter that blocks on going
0, the rwsem cannot be a counting sem in order to actually provide the
exclusive state (w) (nor for the 'unlimited' (r)). Therefore
they've never actually been proper semaphores to begin with and one can
argue the thing has been a misnomer from the very start.

> > Does the worker have a pointer back to taskA ?
> 
> Nope. it runs the work while task A sleeps on a completion. When
> it's done, task A is woken and it continues.

Sure, but that doesn't preclude getting the owner, but I understand this
is a special hack to get around stack size limits.

> And, FWIW, we can take locks on objects (e.g. buffers) in task B
> that are then released by task A, too.  Unsurprisingly, those
> objects are also protected by semaphores, explicitly because of the
> non-owner behaviour those object locks have...

In general, or in this specific case?

For this specific case, we could actually teach lockdep to transfer
owner back and forth since they're fully serialized.

> > I could try something
> > like lockdep_assert_held_by(lock, task), just need to be careful,
> > the per-task lock state is just that, per-task, no serialization.
> 
> Add a lockdep init flag for rwsems to say "don't track owners" and
> we'll set it on the rwsems we pass to other contexts. Then you can
> make sure that lockdep asserts don't fire when we pass those locked
> objects to other tasks and/or get locked/unlocked by different
> tasks...

With the spin-on-owner there's more than just lockdep that wants this
owner thing.


In any case, as Christoph already indicated, we can talk about solutions
for the IO case, but I would very much like to not introduce (more)
async locking primitives for generic use, such things subvert PI and
could tempt Joe random Dev to write 'creative' code.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-09-13 19:43 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-11 17:10 [PATCH, RFC] xfs: remove i_iolock and use i_rwsem in the VFS inode instead Christoph Hellwig
2016-08-11 21:54 ` Peter Zijlstra
2016-08-18 17:37   ` Christoph Hellwig
2016-08-19 13:27     ` Peter Zijlstra
2016-08-20  6:37       ` Christoph Hellwig
2016-08-22  8:34         ` Peter Zijlstra
2016-09-05 15:12           ` Christoph Hellwig
2016-09-07  7:43             ` Peter Zijlstra
2016-09-08  6:06               ` Ingo Molnar
2016-08-11 23:43 ` Dave Chinner
2016-08-12  2:50   ` Christoph Hellwig
2016-08-12  9:58     ` Dave Chinner
2016-09-05 15:15       ` Christoph Hellwig
2016-09-07 21:45         ` Dave Chinner
2016-09-08  6:54           ` Peter Zijlstra
2016-09-09  1:06             ` Dave Chinner
2016-09-09  8:21               ` Peter Zijlstra
2016-09-09  8:34                 ` Christoph Hellwig
2016-09-11  0:17                 ` Dave Chinner
2016-09-13 19:42                   ` Peter Zijlstra
2016-09-09  8:33           ` Christoph Hellwig
2016-09-09  8:44             ` Peter Zijlstra
2016-09-09  9:05               ` Christoph Hellwig
2016-09-09  9:51                 ` Peter Zijlstra
2016-09-10 16:20                   ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.