linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Gao Xiang <gaoxiang25@huawei.com>
To: linux-fsdevel <linux-fsdevel@vger.kernel.org>,
	LKML <linux-kernel@vger.kernel.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Stephen Rothwell <sfr@canb.auug.org.au>,
	Theodore Ts'o <tytso@mit.edu>, Pavel Machek <pavel@denx.de>,
	David Sterba <dsterba@suse.cz>,
	Amir Goldstein <amir73il@gmail.com>,
	Christoph Hellwig <hch@infradead.org>,
	"Darrick J . Wong" <darrick.wong@oracle.com>,
	"Dave Chinner" <david@fromorbit.com>,
	Jaegeuk Kim <jaegeuk@kernel.org>, Jan Kara <jack@suse.cz>,
	Richard Weinberger <richard@nod.at>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	<devel@driverdev.osuosl.org>, <linux-erofs@lists.ozlabs.org>,
	Chao Yu <yuchao0@huawei.com>, Miao Xie <miaoxie@huawei.com>,
	Li Guifu <bluce.liguifu@huawei.com>,
	Fang Wei <fangwei1@huawei.com>, Gao Xiang <gaoxiang25@huawei.com>
Subject: [PATCH v7 16/24] erofs: introduce workstation for decompression
Date: Tue, 13 Aug 2019 17:13:18 +0800	[thread overview]
Message-ID: <20190813091326.84652-17-gaoxiang25@huawei.com> (raw)
In-Reply-To: <20190813091326.84652-1-gaoxiang25@huawei.com>

This patch introduces another concept used by decompress
subsystem called 'workstation'. It can be seen as
a sparse array that stores pointers pointed to data
structures related to the corresponding physical clusters.

All lookups are protected by RCU read lock. Besides,
reference count and spin_lock are also introduced
to manage its lifetime and serialize all update
operations.

`workstation' is currently implemented on the in-kernel
radix tree approach for backward compatibility. With the
evolution of linux kernel, it will be migrated into
new XArray implementation in the future.

Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
---
 fs/erofs/internal.h |  80 +++++++++++++++++++++
 fs/erofs/super.c    |   4 ++
 fs/erofs/utils.c    | 166 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 248 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 6272793329cd..a287309dbd26 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -65,6 +65,9 @@ struct erofs_sb_info {
 	struct list_head list;
 	struct mutex umount_mutex;
 
+	/* the dedicated workstation for compression */
+	struct radix_tree_root workstn_tree;
+
 	unsigned int shrinker_run_no;
 #endif	/* CONFIG_EROFS_FS_ZIP */
 	u32 blocks;
@@ -150,6 +153,77 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+#if defined(CONFIG_SMP)
+static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
+						 int val)
+{
+	preempt_disable();
+	if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
+		preempt_enable();
+		return false;
+	}
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+					    int orig_val)
+{
+	/*
+	 * other observers should notice all modifications
+	 * in the freezing period.
+	 */
+	smp_mb();
+	atomic_set(&grp->refcount, orig_val);
+	preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+	return atomic_cond_read_relaxed(&grp->refcount,
+					VAL != EROFS_LOCKED_MAGIC);
+}
+#else
+static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
+						 int val)
+{
+	preempt_disable();
+	/* no need to spin on UP platforms, let's just disable preemption. */
+	if (val != atomic_read(&grp->refcount)) {
+		preempt_enable();
+		return false;
+	}
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
+					    int orig_val)
+{
+	preempt_enable();
+}
+
+static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
+{
+	int v = atomic_read(&grp->refcount);
+
+	/* workgroup is never freezed on uniprocessor systems */
+	DBG_BUGON(v == EROFS_LOCKED_MAGIC);
+	return v;
+}
+#endif	/* !CONFIG_SMP */
+#endif	/* !CONFIG_EROFS_FS_ZIP */
+
 /* we strictly follow PAGE_SIZE and no buffer head yet */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
@@ -409,6 +483,12 @@ extern const struct file_operations erofs_dir_fops;
 
 /* utils.c */
 #ifdef CONFIG_EROFS_FS_ZIP
+int erofs_workgroup_put(struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+					     pgoff_t index, bool *tag);
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp, bool tag);
+static inline void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) {}
 void erofs_shrinker_register(struct super_block *sb);
 void erofs_shrinker_unregister(struct super_block *sb);
 int __init erofs_init_shrinker(void);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 09992cc3b2fd..ea8d065068fa 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -338,6 +338,10 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		sb->s_flags &= ~SB_POSIXACL;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode))
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index cab7d77c4e59..628178261056 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -7,11 +7,173 @@
  * Created by Gao Xiang <gaoxiang25@huawei.com>
  */
 #include "internal.h"
+#include <linux/pagevec.h>
 
 #ifdef CONFIG_EROFS_FS_ZIP
 /* global shrink count (for all mounted EROFS instances) */
 static atomic_long_t erofs_global_shrink_cnt;
 
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+#define __erofs_workgroup_put(grp)	atomic_dec(&(grp)->refcount)
+
+static int erofs_workgroup_get(struct erofs_workgroup *grp)
+{
+	int o;
+
+repeat:
+	o = erofs_wait_on_workgroup_freezed(grp);
+	if (unlikely(o <= 0))
+		return -1;
+
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+
+	/* decrease refcount paired by erofs_workgroup_put */
+	if (unlikely(o == 1))
+		atomic_long_dec(&erofs_global_shrink_cnt);
+	return 0;
+}
+
+struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
+					     pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn_tree, index);
+	if (grp) {
+		*tag = xa_pointer_tag(grp);
+		grp = xa_untag_pointer(grp);
+
+		if (erofs_workgroup_get(grp)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		DBG_BUGON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp shouldn't be broken or used before */
+	if (unlikely(atomic_read(&grp->refcount) != 1)) {
+		DBG_BUGON(1);
+		return -EINVAL;
+	}
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	xa_lock(&sbi->workstn_tree);
+
+	grp = xa_tag_pointer(grp, tag);
+
+	/*
+	 * Bump up reference count before making this workgroup
+	 * visible to other users in order to avoid potential UAF
+	 * without serialized by workstn_lock.
+	 */
+	__erofs_workgroup_get(grp);
+
+	err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
+	if (unlikely(err))
+		/*
+		 * it's safe to decrease since the workgroup isn't visible
+		 * and refcount >= 2 (cannot be freezed).
+		 */
+		__erofs_workgroup_put(grp);
+
+	xa_unlock(&sbi->workstn_tree);
+	radix_tree_preload_end();
+	return err;
+}
+
+static void  __erofs_workgroup_free(struct erofs_workgroup *grp)
+{
+	atomic_long_dec(&erofs_global_shrink_cnt);
+	erofs_workgroup_free_rcu(grp);
+}
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count)
+		__erofs_workgroup_free(grp);
+	return count;
+}
+
+/* for nocache case, no customized reclaim path at all */
+static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
+					   struct erofs_workgroup *grp,
+					   bool cleanup)
+{
+	int cnt = atomic_read(&grp->refcount);
+
+	DBG_BUGON(cnt <= 0);
+	DBG_BUGON(cleanup && cnt != 1);
+
+	if (cnt > 1)
+		return false;
+
+	DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
+						     grp->index)) != grp);
+
+	/* (rarely) could be grabbed again when freeing */
+	erofs_workgroup_put(grp);
+	return true;
+}
+
+static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+					      unsigned long nr_shrink,
+					      bool cleanup)
+{
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned int freed = 0;
+
+	int i, found;
+repeat:
+	xa_lock(&sbi->workstn_tree);
+
+	found = radix_tree_gang_lookup(&sbi->workstn_tree,
+				       batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
+
+		first_index = grp->index + 1;
+
+		/* try to shrink each valid workgroup */
+		if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
+			continue;
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	xa_unlock(&sbi->workstn_tree);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
+}
+
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
 
@@ -35,7 +197,7 @@ void erofs_shrinker_unregister(struct super_block *sb)
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 
 	mutex_lock(&sbi->umount_mutex);
-	/* will add shrink final handler here */
+	erofs_shrink_workstation(sbi, ~0UL, true);
 
 	spin_lock(&erofs_sb_list_lock);
 	list_del(&sbi->list);
@@ -84,7 +246,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		spin_unlock(&erofs_sb_list_lock);
 		sbi->shrinker_run_no = run_no;
 
-		/* will add shrink handler here */
+		freed += erofs_shrink_workstation(sbi, nr, false);
 
 		spin_lock(&erofs_sb_list_lock);
 		/* Get the next list element before we move this one */
-- 
2.17.1


  parent reply	other threads:[~2019-08-13  9:15 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-13  9:13 [PATCH v7 00/24] erofs: promote erofs from staging Gao Xiang
2019-08-13  9:13 ` [PATCH v7 01/24] erofs: add on-disk layout Gao Xiang
2019-08-13  9:13 ` [PATCH v7 02/24] erofs: add erofs in-memory stuffs Gao Xiang
2019-08-13  9:13 ` [PATCH v7 03/24] erofs: add super block operations Gao Xiang
2019-08-13  9:13 ` [PATCH v7 04/24] erofs: add raw address_space operations Gao Xiang
2019-08-13  9:13 ` [PATCH v7 05/24] erofs: add inode operations Gao Xiang
2019-08-13  9:13 ` [PATCH v7 06/24] erofs: support special inode Gao Xiang
2019-08-13  9:13 ` [PATCH v7 07/24] erofs: add directory operations Gao Xiang
2019-08-13  9:13 ` [PATCH v7 08/24] erofs: add namei functions Gao Xiang
2019-08-13 11:48   ` Pavel Machek
2019-08-13 12:23     ` Gao Xiang
2019-08-13 12:41       ` Gao Xiang
2019-08-15  7:01       ` Pavel Machek
2019-08-15  7:25         ` Gao Xiang
2019-08-13  9:13 ` [PATCH v7 09/24] erofs: support tracepoint Gao Xiang
2019-08-13  9:13 ` [PATCH v7 10/24] erofs: update Kconfig and Makefile Gao Xiang
2019-08-13  9:13 ` [PATCH v7 11/24] erofs: introduce xattr & posixacl support Gao Xiang
2019-08-13  9:13 ` [PATCH v7 12/24] erofs: introduce tagged pointer Gao Xiang
2019-08-13  9:13 ` [PATCH v7 13/24] erofs: add compression indexes support Gao Xiang
2019-08-13  9:13 ` [PATCH v7 14/24] erofs: introduce superblock registration Gao Xiang
2019-08-13  9:13 ` [PATCH v7 15/24] erofs: introduce erofs shrinker Gao Xiang
2019-08-13  9:13 ` Gao Xiang [this message]
2019-08-13  9:13 ` [PATCH v7 17/24] erofs: introduce per-CPU buffers implementation Gao Xiang
2019-08-13  9:13 ` [PATCH v7 18/24] erofs: introduce pagevec for decompression subsystem Gao Xiang
2019-08-13  9:13 ` [PATCH v7 19/24] erofs: add erofs_allocpage() Gao Xiang
2019-08-13  9:13 ` [PATCH v7 20/24] erofs: introduce generic decompression backend Gao Xiang
2019-08-13  9:13 ` [PATCH v7 21/24] erofs: introduce LZ4 decompression inplace Gao Xiang
2019-08-13  9:13 ` [PATCH v7 22/24] erofs: introduce the decompression frontend Gao Xiang
2019-08-13  9:13 ` [PATCH v7 23/24] erofs: introduce cached decompression Gao Xiang
2019-08-13  9:13 ` [PATCH v7 24/24] erofs: add document Gao Xiang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190813091326.84652-17-gaoxiang25@huawei.com \
    --to=gaoxiang25@huawei.com \
    --cc=akpm@linux-foundation.org \
    --cc=amir73il@gmail.com \
    --cc=bluce.liguifu@huawei.com \
    --cc=darrick.wong@oracle.com \
    --cc=david@fromorbit.com \
    --cc=devel@driverdev.osuosl.org \
    --cc=dsterba@suse.cz \
    --cc=fangwei1@huawei.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=hch@infradead.org \
    --cc=jack@suse.cz \
    --cc=jaegeuk@kernel.org \
    --cc=linux-erofs@lists.ozlabs.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=miaoxie@huawei.com \
    --cc=pavel@denx.de \
    --cc=richard@nod.at \
    --cc=sfr@canb.auug.org.au \
    --cc=torvalds@linux-foundation.org \
    --cc=tytso@mit.edu \
    --cc=viro@zeniv.linux.org.uk \
    --cc=yuchao0@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).