stable.vger.kernel.org archive mirror
* [PATCH 01/10] mbcache: Don't reclaim used entries
       [not found] <20220614124146.21594-1-jack@suse.cz>
@ 2022-06-14 16:05 ` Jan Kara
  2022-06-16 14:22   ` Ritesh Harjani
  2022-06-14 16:05 ` [PATCH 02/10] mbcache: Add functions to delete entry if unused Jan Kara
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Jan Kara @ 2022-06-14 16:05 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ritesh Harjani, Jan Kara, stable

Do not reclaim entries that are currently used by somebody from the
shrinker. Firstly, these entries are likely useful. Secondly, we will need
to keep such entries to protect a pending increment of the xattr block
refcount.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..cfc28129fb6f 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -288,7 +288,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
 		entry = list_first_entry(&cache->c_list,
 					 struct mb_cache_entry, e_list);
-		if (entry->e_referenced) {
+		if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) {
 			entry->e_referenced = 0;
 			list_move_tail(&entry->e_list, &cache->c_list);
 			continue;
@@ -302,6 +302,14 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
 		spin_unlock(&cache->c_list_lock);
 		head = mb_cache_entry_head(cache, entry->e_key);
 		hlist_bl_lock(head);
+		/* Now a reliable check if the entry didn't get used... */
+		if (atomic_read(&entry->e_refcnt) > 2) {
+			hlist_bl_unlock(head);
+			spin_lock(&cache->c_list_lock);
+			list_add_tail(&entry->e_list, &cache->c_list);
+			cache->c_entry_count++;
+			continue;
+		}
 		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
 			hlist_bl_del_init(&entry->e_hash_list);
 			atomic_dec(&entry->e_refcnt);
-- 
2.35.3



* [PATCH 02/10] mbcache: Add functions to delete entry if unused
       [not found] <20220614124146.21594-1-jack@suse.cz>
  2022-06-14 16:05 ` [PATCH 01/10] mbcache: Don't reclaim used entries Jan Kara
@ 2022-06-14 16:05 ` Jan Kara
  2022-06-16 14:47   ` Ritesh Harjani
  2022-06-14 16:05 ` [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction Jan Kara
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 13+ messages in thread
From: Jan Kara @ 2022-06-14 16:05 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ritesh Harjani, Jan Kara, stable

Add a function mb_cache_entry_try_delete() to delete an mbcache entry if it
is unused, and also add a function to wait for an entry to become unused -
mb_cache_entry_wait_unused(). We do not share code between the two deleting
functions as one of them will go away soon.
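
The intended calling pattern on the ext4 side then looks roughly like the
sketch below, where cache, hash and value are placeholders for the
respective mbcache, xattr hash and block number / inode number (this is how
a later patch in this series, "ext4: Fix race when reusing xattr blocks",
ends up using the pair of functions):

	struct mb_cache_entry *oe;

	/* Try to remove the cache entry for (hash, value)... */
	while ((oe = mb_cache_entry_try_delete(cache, hash, value))) {
		/* ...somebody still uses it - wait until we are the last user... */
		mb_cache_entry_wait_unused(oe);
		/* ...drop the reference try_delete returned and retry. */
		mb_cache_entry_put(cache, oe);
	}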

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/mbcache.c            | 63 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/mbcache.h | 10 ++++++-
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index cfc28129fb6f..1ae66b2c75f4 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
 }
 EXPORT_SYMBOL(__mb_cache_entry_free);
 
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
 static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
 					   struct mb_cache_entry *entry,
 					   u32 key)
@@ -217,7 +230,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 }
 EXPORT_SYMBOL(mb_cache_entry_get);
 
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete - try to remove a cache entry
  * @cache - cache we work with
  * @key - key
  * @value - value
@@ -254,6 +267,54 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
 }
 EXPORT_SYMBOL(mb_cache_entry_delete);
 
+/* mb_cache_entry_try_delete - try to remove a cache entry
+ * @cache - cache we work with
+ * @key - key
+ * @value - value
+ *
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function returns the entry that we failed to delete because it has users.
+ */
+struct mb_cache_entry *mb_cache_entry_try_delete(struct mb_cache *cache,
+						 u32 key, u64 value)
+{
+	struct hlist_bl_node *node;
+	struct hlist_bl_head *head;
+	struct mb_cache_entry *entry;
+
+	head = mb_cache_entry_head(cache, key);
+	hlist_bl_lock(head);
+	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
+		if (entry->e_key == key && entry->e_value == value) {
+			if (atomic_read(&entry->e_refcnt) > 2) {
+				atomic_inc(&entry->e_refcnt);
+				hlist_bl_unlock(head);
+				return entry;
+			}
+			/* We keep hash list reference to keep entry alive */
+			hlist_bl_del_init(&entry->e_hash_list);
+			hlist_bl_unlock(head);
+			spin_lock(&cache->c_list_lock);
+			if (!list_empty(&entry->e_list)) {
+				list_del_init(&entry->e_list);
+				if (!WARN_ONCE(cache->c_entry_count == 0,
+		"mbcache: attempt to decrement c_entry_count past zero"))
+					cache->c_entry_count--;
+				atomic_dec(&entry->e_refcnt);
+			}
+			spin_unlock(&cache->c_list_lock);
+			mb_cache_entry_put(cache, entry);
+			return NULL;
+		}
+	}
+	hlist_bl_unlock(head);
+
+	return NULL;
+}
+EXPORT_SYMBOL(mb_cache_entry_try_delete);
+
 /* mb_cache_entry_touch - cache entry got used
  * @cache - cache the entry belongs to
  * @entry - entry that got used
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 20f1e3ff6013..1176fdfb8d53 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache);
 int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
 			  u64 value, bool reusable);
 void __mb_cache_entry_free(struct mb_cache_entry *entry);
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
 static inline int mb_cache_entry_put(struct mb_cache *cache,
 				     struct mb_cache_entry *entry)
 {
-	if (!atomic_dec_and_test(&entry->e_refcnt))
+	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
+
+	if (cnt > 0) {
+		if (cnt <= 3)
+			wake_up_var(&entry->e_refcnt);
 		return 0;
+	}
 	__mb_cache_entry_free(entry);
 	return 1;
 }
 
+struct mb_cache_entry *mb_cache_entry_try_delete(struct mb_cache *cache,
+						 u32 key, u64 value);
 void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
 struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
 					  u64 value);
-- 
2.35.3



* [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction
       [not found] <20220614124146.21594-1-jack@suse.cz>
  2022-06-14 16:05 ` [PATCH 01/10] mbcache: Don't reclaim used entries Jan Kara
  2022-06-14 16:05 ` [PATCH 02/10] mbcache: Add functions to delete entry if unused Jan Kara
@ 2022-06-14 16:05 ` Jan Kara
  2022-06-16 15:01   ` Ritesh Harjani
  2022-06-14 16:05 ` [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set() Jan Kara
  2022-06-14 16:05 ` [PATCH 05/10] ext4: Fix race when reusing xattr blocks Jan Kara
  4 siblings, 1 reply; 13+ messages in thread
From: Jan Kara @ 2022-06-14 16:05 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ritesh Harjani, Jan Kara, stable

Currently we remove the EA inode from mbcache as soon as its xattr refcount
drops to zero. However, there can be pending attempts to reuse the inode and
thus the refcount handling code has to handle the situation when the
refcount increases from zero anyway. So save some work and just keep the EA
inode in mbcache until it gets evicted. At that moment we are sure a
following iget() of the EA inode will fail anyway (or wait for the eviction
to finish and load things from the disk again), so removing the mbcache
entry at that moment is fine and simplifies the code a bit.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c |  2 ++
 fs/ext4/xattr.c | 24 ++++++++----------------
 fs/ext4/xattr.h |  1 +
 3 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3dce7d058985..7450ee734262 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -177,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
 
 	trace_ext4_evict_inode(inode);
 
+	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+		ext4_evict_ea_inode(inode);
 	if (inode->i_nlink) {
 		/*
 		 * When journalling data dirty buffers are tracked only in the
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 042325349098..7fc40fb1e6b3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -436,6 +436,14 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 	return err;
 }
 
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+	if (EA_INODE_CACHE(inode))
+		mb_cache_entry_delete(EA_INODE_CACHE(inode),
+			ext4_xattr_inode_get_hash(inode), inode->i_ino);
+}
+
 static int
 ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
 			       struct ext4_xattr_entry *entry, void *buffer,
@@ -976,10 +984,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
 static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 				       int ref_change)
 {
-	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
 	struct ext4_iloc iloc;
 	s64 ref_count;
-	u32 hash;
 	int ret;
 
 	inode_lock(ea_inode);
@@ -1002,14 +1008,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 
 			set_nlink(ea_inode, 1);
 			ext4_orphan_del(handle, ea_inode);
-
-			if (ea_inode_cache) {
-				hash = ext4_xattr_inode_get_hash(ea_inode);
-				mb_cache_entry_create(ea_inode_cache,
-						      GFP_NOFS, hash,
-						      ea_inode->i_ino,
-						      true /* reusable */);
-			}
 		}
 	} else {
 		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -1022,12 +1020,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
 
 			clear_nlink(ea_inode);
 			ext4_orphan_add(handle, ea_inode);
-
-			if (ea_inode_cache) {
-				hash = ext4_xattr_inode_get_hash(ea_inode);
-				mb_cache_entry_delete(ea_inode_cache, hash,
-						      ea_inode->i_ino);
-			}
 		}
 	}
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 77efb9a627ad..060b7a2f485c 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -178,6 +178,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
 
 extern const struct xattr_handler *ext4_xattr_handlers[];
 
-- 
2.35.3



* [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set()
       [not found] <20220614124146.21594-1-jack@suse.cz>
                   ` (2 preceding siblings ...)
  2022-06-14 16:05 ` [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction Jan Kara
@ 2022-06-14 16:05 ` Jan Kara
  2022-06-14 16:05 ` [PATCH 05/10] ext4: Fix race when reusing xattr blocks Jan Kara
  4 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2022-06-14 16:05 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ritesh Harjani, Jan Kara, stable

Remove an unnecessary else (and thus an indentation level) from a code block
in ext4_xattr_block_set(). It will also make the following code changes
easier. No functional changes.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/xattr.c | 79 ++++++++++++++++++++++++-------------------------
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7fc40fb1e6b3..aadfae53d055 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1850,6 +1850,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 #define header(x) ((struct ext4_xattr_header *)(x))
 
 	if (s->base) {
+		int offset = (char *)s->here - bs->bh->b_data;
+
 		BUFFER_TRACE(bs->bh, "get_write_access");
 		error = ext4_journal_get_write_access(handle, sb, bs->bh,
 						      EXT4_JTR_NONE);
@@ -1882,50 +1884,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (error)
 				goto cleanup;
 			goto inserted;
-		} else {
-			int offset = (char *)s->here - bs->bh->b_data;
+		}
+		unlock_buffer(bs->bh);
+		ea_bdebug(bs->bh, "cloning");
+		s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+		error = -ENOMEM;
+		if (s->base == NULL)
+			goto cleanup;
+		memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+		s->first = ENTRY(header(s->base)+1);
+		header(s->base)->h_refcount = cpu_to_le32(1);
+		s->here = ENTRY(s->base + offset);
+		s->end = s->base + bs->bh->b_size;
 
-			unlock_buffer(bs->bh);
-			ea_bdebug(bs->bh, "cloning");
-			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
-			error = -ENOMEM;
-			if (s->base == NULL)
+		/*
+		 * If existing entry points to an xattr inode, we need
+		 * to prevent ext4_xattr_set_entry() from decrementing
+		 * ref count on it because the reference belongs to the
+		 * original block. In this case, make the entry look
+		 * like it has an empty value.
+		 */
+		if (!s->not_found && s->here->e_value_inum) {
+			ea_ino = le32_to_cpu(s->here->e_value_inum);
+			error = ext4_xattr_inode_iget(inode, ea_ino,
+				      le32_to_cpu(s->here->e_hash),
+				      &tmp_inode);
+			if (error)
 				goto cleanup;
-			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
-			s->first = ENTRY(header(s->base)+1);
-			header(s->base)->h_refcount = cpu_to_le32(1);
-			s->here = ENTRY(s->base + offset);
-			s->end = s->base + bs->bh->b_size;
 
-			/*
-			 * If existing entry points to an xattr inode, we need
-			 * to prevent ext4_xattr_set_entry() from decrementing
-			 * ref count on it because the reference belongs to the
-			 * original block. In this case, make the entry look
-			 * like it has an empty value.
-			 */
-			if (!s->not_found && s->here->e_value_inum) {
-				ea_ino = le32_to_cpu(s->here->e_value_inum);
-				error = ext4_xattr_inode_iget(inode, ea_ino,
-					      le32_to_cpu(s->here->e_hash),
-					      &tmp_inode);
-				if (error)
-					goto cleanup;
-
-				if (!ext4_test_inode_state(tmp_inode,
-						EXT4_STATE_LUSTRE_EA_INODE)) {
-					/*
-					 * Defer quota free call for previous
-					 * inode until success is guaranteed.
-					 */
-					old_ea_inode_quota = le32_to_cpu(
-							s->here->e_value_size);
-				}
-				iput(tmp_inode);
-
-				s->here->e_value_inum = 0;
-				s->here->e_value_size = 0;
+			if (!ext4_test_inode_state(tmp_inode,
+					EXT4_STATE_LUSTRE_EA_INODE)) {
+				/*
+				 * Defer quota free call for previous
+				 * inode until success is guaranteed.
+				 */
+				old_ea_inode_quota = le32_to_cpu(
+						s->here->e_value_size);
 			}
+			iput(tmp_inode);
+
+			s->here->e_value_inum = 0;
+			s->here->e_value_size = 0;
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
-- 
2.35.3



* [PATCH 05/10] ext4: Fix race when reusing xattr blocks
       [not found] <20220614124146.21594-1-jack@suse.cz>
                   ` (3 preceding siblings ...)
  2022-06-14 16:05 ` [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set() Jan Kara
@ 2022-06-14 16:05 ` Jan Kara
  4 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2022-06-14 16:05 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ritesh Harjani, Jan Kara, stable

When ext4_xattr_block_set() decides to remove an xattr block, the following
race can happen:

CPU1                                    CPU2
ext4_xattr_block_set()                  ext4_xattr_release_block()
  new_bh = ext4_xattr_block_cache_find()

                                          lock_buffer(bh);
                                          ref = le32_to_cpu(BHDR(bh)->h_refcount);
                                          if (ref == 1) {
                                            ...
                                            mb_cache_entry_delete();
                                            unlock_buffer(bh);
                                            ext4_free_blocks();
                                              ...
                                              ext4_forget(..., bh, ...);
                                                jbd2_journal_revoke(..., bh);

  ext4_journal_get_write_access(..., new_bh, ...)
    do_get_write_access()
      jbd2_journal_cancel_revoke(..., new_bh);

Later the code in ext4_xattr_block_set() finds out the block got freed and
cancels reuse of the block, but the revoke stays canceled and so in case of
block reuse and journal replay the filesystem can get corrupted. If the race
works out slightly differently, we can also hit assertions in the jbd2 code.

Fix the problem by making sure that once a matching mbcache entry is found,
the code dropping the last xattr block reference (or trying to modify the
xattr block in place) waits until the mbcache entry reference is dropped.
This way the code trying to reuse the xattr block is protected from someone
trying to drop the last reference to the xattr block.

Reported-by: Ritesh Harjani <ritesh.list@gmail.com>
CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/xattr.c | 67 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index aadfae53d055..0c42c0e22cd2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -439,9 +439,16 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 /* Remove entry from mbcache when EA inode is getting evicted */
 void ext4_evict_ea_inode(struct inode *inode)
 {
-	if (EA_INODE_CACHE(inode))
-		mb_cache_entry_delete(EA_INODE_CACHE(inode),
-			ext4_xattr_inode_get_hash(inode), inode->i_ino);
+	struct mb_cache_entry *oe;
+
+	if (!EA_INODE_CACHE(inode))
+		return;
+	/* Wait for entry to get unused so that we can remove it */
+	while ((oe = mb_cache_entry_try_delete(EA_INODE_CACHE(inode),
+			ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+		mb_cache_entry_wait_unused(oe);
+		mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+	}
 }
 
 static int
@@ -1229,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 	if (error)
 		goto out;
 
+retry_ref:
 	lock_buffer(bh);
 	hash = le32_to_cpu(BHDR(bh)->h_hash);
 	ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1238,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		 * This must happen under buffer lock for
 		 * ext4_xattr_block_set() to reliably detect freed block
 		 */
-		if (ea_block_cache)
-			mb_cache_entry_delete(ea_block_cache, hash,
-					      bh->b_blocknr);
+		if (ea_block_cache) {
+			struct mb_cache_entry *oe;
+
+			oe = mb_cache_entry_try_delete(ea_block_cache, hash,
+						       bh->b_blocknr);
+			if (oe) {
+				unlock_buffer(bh);
+				mb_cache_entry_wait_unused(oe);
+				mb_cache_entry_put(ea_block_cache, oe);
+				goto retry_ref;
+			}
+		}
 		get_bh(bh);
 		unlock_buffer(bh);
 
@@ -1867,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			 * ext4_xattr_block_set() to reliably detect modified
 			 * block
 			 */
-			if (ea_block_cache)
-				mb_cache_entry_delete(ea_block_cache, hash,
-						      bs->bh->b_blocknr);
+			if (ea_block_cache) {
+				struct mb_cache_entry *oe;
+
+				oe = mb_cache_entry_try_delete(ea_block_cache,
+					hash, bs->bh->b_blocknr);
+				if (oe) {
+					/*
+					 * Xattr block is getting reused. Leave
+					 * it alone.
+					 */
+					mb_cache_entry_put(ea_block_cache, oe);
+					goto clone_block;
+				}
+			}
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s, handle, inode,
 						     true /* is_block */);
@@ -1885,6 +1913,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 				goto cleanup;
 			goto inserted;
 		}
+clone_block:
 		unlock_buffer(bs->bh);
 		ea_bdebug(bs->bh, "cloning");
 		s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
@@ -1991,18 +2020,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 				lock_buffer(new_bh);
 				/*
 				 * We have to be careful about races with
-				 * freeing, rehashing or adding references to
-				 * xattr block. Once we hold buffer lock xattr
-				 * block's state is stable so we can check
-				 * whether the block got freed / rehashed or
-				 * not.  Since we unhash mbcache entry under
-				 * buffer lock when freeing / rehashing xattr
-				 * block, checking whether entry is still
-				 * hashed is reliable. Same rules hold for
-				 * e_reusable handling.
+				 * adding references to xattr block. Once we
+				 * hold buffer lock xattr block's state is
+				 * stable so we can check the additional
+				 * reference fits.
 				 */
-				if (hlist_bl_unhashed(&ce->e_hash_list) ||
-				    !ce->e_reusable) {
+				ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+				if (ref > EXT4_XATTR_REFCOUNT_MAX) {
 					/*
 					 * Undo everything and check mbcache
 					 * again.
@@ -2017,9 +2041,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 					new_bh = NULL;
 					goto inserted;
 				}
-				ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
 				BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
-				if (ref >= EXT4_XATTR_REFCOUNT_MAX)
+				if (ref == EXT4_XATTR_REFCOUNT_MAX)
 					ce->e_reusable = 0;
 				ea_bdebug(new_bh, "reusing; refcount now=%d",
 					  ref);
-- 
2.35.3



* Re: [PATCH 01/10] mbcache: Don't reclaim used entries
  2022-06-14 16:05 ` [PATCH 01/10] mbcache: Don't reclaim used entries Jan Kara
@ 2022-06-16 14:22   ` Ritesh Harjani
  2022-06-16 17:25     ` Jan Kara
  0 siblings, 1 reply; 13+ messages in thread
From: Ritesh Harjani @ 2022-06-16 14:22 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ted Tso, linux-ext4, stable

On 22/06/14 06:05PM, Jan Kara wrote:
> Do not reclaim entries that are currently used by somebody from a
> shrinker. Firstly, these entries are likely useful. Secondly, we will
> need to keep such entries to protect pending increment of xattr block
> refcount.

Trying to review the patch series to the best of my knowledge, so kindly
excuse my silly queries along the way.

>
> CC: stable@vger.kernel.org
> Fixes: 82939d7999df ("ext4: convert to mbcache2")
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/mbcache.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index 97c54d3a2227..cfc28129fb6f 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -288,7 +288,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
>  	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
>  		entry = list_first_entry(&cache->c_list,
>  					 struct mb_cache_entry, e_list);
> -		if (entry->e_referenced) {
> +		if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) {

Sure, yes, the above "||" condition looks good.
I.e. if the refcnt is above 2, then we should move the entry to the tail of
the LRU, because that means that there is another user of this entry which
might have incremented the refcnt.
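
For reference, the refcount accounting I believe this check relies on (my
reading of the mbcache code, not something spelled out in the patch itself):

	/*
	 * An idle cached entry has e_refcnt == 2:
	 *   +1 held by the hash table list (e_hash_list)
	 *   +1 held by the LRU list (c_list)
	 * e_refcnt > 2 therefore means some external user currently holds
	 * a reference (e.g. obtained via mb_cache_entry_get()), so the
	 * shrinker should leave such an entry alone.
	 */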

>  			entry->e_referenced = 0;
>  			list_move_tail(&entry->e_list, &cache->c_list);
>  			continue;
> @@ -302,6 +302,14 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
>  		spin_unlock(&cache->c_list_lock);
>  		head = mb_cache_entry_head(cache, entry->e_key);
>  		hlist_bl_lock(head);
> +		/* Now a reliable check if the entry didn't get used... */

But I am not sure why this is more reliable? Anytime we add or remove the
entry, we always do the list operation first and then increment or decrement
the refcnt "atomically".

So could you please help in understanding why this will be more reliable?

-ritesh


> +		if (atomic_read(&entry->e_refcnt) > 2) {
> +			hlist_bl_unlock(head);
> +			spin_lock(&cache->c_list_lock);
> +			list_add_tail(&entry->e_list, &cache->c_list);
> +			cache->c_entry_count++;
> +			continue;
> +		}
>  		if (!hlist_bl_unhashed(&entry->e_hash_list)) {
>  			hlist_bl_del_init(&entry->e_hash_list);
>  			atomic_dec(&entry->e_refcnt);
> --
> 2.35.3
>


* Re: [PATCH 02/10] mbcache: Add functions to delete entry if unused
  2022-06-14 16:05 ` [PATCH 02/10] mbcache: Add functions to delete entry if unused Jan Kara
@ 2022-06-16 14:47   ` Ritesh Harjani
  2022-06-16 17:28     ` Jan Kara
  0 siblings, 1 reply; 13+ messages in thread
From: Ritesh Harjani @ 2022-06-16 14:47 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ted Tso, linux-ext4, stable

On 22/06/14 06:05PM, Jan Kara wrote:
> Add function mb_cache_entry_try_delete() to delete mbcache entry if it
> is unused and also add a function to wait for entry to become unused -
> mb_cache_entry_wait_unused(). We do not share code between the two
> deleting functions as one of them will go away soon.
>
> CC: stable@vger.kernel.org
> Fixes: 82939d7999df ("ext4: convert to mbcache2")
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/mbcache.c            | 63 ++++++++++++++++++++++++++++++++++++++++-
>  include/linux/mbcache.h | 10 ++++++-
>  2 files changed, 71 insertions(+), 2 deletions(-)
>
> diff --git a/fs/mbcache.c b/fs/mbcache.c
> index cfc28129fb6f..1ae66b2c75f4 100644
> --- a/fs/mbcache.c
> +++ b/fs/mbcache.c
> @@ -125,6 +125,19 @@ void __mb_cache_entry_free(struct mb_cache_entry *entry)
>  }
>  EXPORT_SYMBOL(__mb_cache_entry_free);
>
> +/*
> + * mb_cache_entry_wait_unused - wait to be the last user of the entry
> + *
> + * @entry - entry to work on
> + *
> + * Wait to be the last user of the entry.
> + */
> +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
> +{
> +	wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 3);
> +}
> +EXPORT_SYMBOL(mb_cache_entry_wait_unused);
> +
>  static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
>  					   struct mb_cache_entry *entry,
>  					   u32 key)
> @@ -217,7 +230,7 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  }
>  EXPORT_SYMBOL(mb_cache_entry_get);
>
> -/* mb_cache_entry_delete - remove a cache entry
> +/* mb_cache_entry_delete - try to remove a cache entry
>   * @cache - cache we work with
>   * @key - key
>   * @value - value
> @@ -254,6 +267,54 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
>  }
>  EXPORT_SYMBOL(mb_cache_entry_delete);
>
> +/* mb_cache_entry_try_delete - try to remove a cache entry
> + * @cache - cache we work with
> + * @key - key
> + * @value - value
> + *
> + * Remove entry from cache @cache with key @key and value @value. The removal
> + * happens only if the entry is unused. The function returns NULL in case the
> + * entry was successfully removed or there's no entry in cache. Otherwise the
> + * function returns the entry that we failed to delete because it has users.

"...Also increment its refcnt in case the entry is returned. So the caller
is responsible for calling mb_cache_entry_put() later."

Do you think such a comment should be added too? And should the API naming be
mb_cache_entry_try_delete_or_get()?

Looks like the e_refcnt increment is done quietly in case the entry is found,
whereas the API just says mb_cache_entry_try_delete().

Other than that, the code logic looks right to me.

-ritesh

> + */
> +struct mb_cache_entry *mb_cache_entry_try_delete(struct mb_cache *cache,
> +						 u32 key, u64 value)
> +{
> +	struct hlist_bl_node *node;
> +	struct hlist_bl_head *head;
> +	struct mb_cache_entry *entry;
> +
> +	head = mb_cache_entry_head(cache, key);
> +	hlist_bl_lock(head);
> +	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> +		if (entry->e_key == key && entry->e_value == value) {
> +			if (atomic_read(&entry->e_refcnt) > 2) {
> +				atomic_inc(&entry->e_refcnt);
> +				hlist_bl_unlock(head);
> +				return entry;
> +			}
> +			/* We keep hash list reference to keep entry alive */
> +			hlist_bl_del_init(&entry->e_hash_list);
> +			hlist_bl_unlock(head);
> +			spin_lock(&cache->c_list_lock);
> +			if (!list_empty(&entry->e_list)) {
> +				list_del_init(&entry->e_list);
> +				if (!WARN_ONCE(cache->c_entry_count == 0,
> +		"mbcache: attempt to decrement c_entry_count past zero"))
> +					cache->c_entry_count--;
> +				atomic_dec(&entry->e_refcnt);
> +			}
> +			spin_unlock(&cache->c_list_lock);
> +			mb_cache_entry_put(cache, entry);
> +			return NULL;
> +		}
> +	}
> +	hlist_bl_unlock(head);
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL(mb_cache_entry_try_delete);
> +
>  /* mb_cache_entry_touch - cache entry got used
>   * @cache - cache the entry belongs to
>   * @entry - entry that got used
> diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> index 20f1e3ff6013..1176fdfb8d53 100644
> --- a/include/linux/mbcache.h
> +++ b/include/linux/mbcache.h
> @@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache);
>  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
>  			  u64 value, bool reusable);
>  void __mb_cache_entry_free(struct mb_cache_entry *entry);
> +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
>  static inline int mb_cache_entry_put(struct mb_cache *cache,
>  				     struct mb_cache_entry *entry)
>  {
> -	if (!atomic_dec_and_test(&entry->e_refcnt))
> +	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
> +
> +	if (cnt > 0) {
> +		if (cnt <= 3)
> +			wake_up_var(&entry->e_refcnt);
>  		return 0;
> +	}
>  	__mb_cache_entry_free(entry);
>  	return 1;
>  }
>
> +struct mb_cache_entry *mb_cache_entry_try_delete(struct mb_cache *cache,
> +						 u32 key, u64 value);
>  void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
>  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
>  					  u64 value);
> --
> 2.35.3
>


* Re: [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction
  2022-06-14 16:05 ` [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction Jan Kara
@ 2022-06-16 15:01   ` Ritesh Harjani
  2022-06-16 17:30     ` Jan Kara
  0 siblings, 1 reply; 13+ messages in thread
From: Ritesh Harjani @ 2022-06-16 15:01 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ted Tso, linux-ext4, stable

On 22/06/14 06:05PM, Jan Kara wrote:
> Currently we remove EA inode from mbcache as soon as its xattr refcount
> drops to zero. However there can be pending attempts to reuse the inode
> and thus refcount handling code has to handle the situation when
> refcount increases from zero anyway. So save some work and just keep EA
> inode in mbcache until it is getting evicted. At that moment we are sure
> following iget() of EA inode will fail anyway (or wait for eviction to
> finish and load things from the disk again) and so removing mbcache
> entry at that moment is fine and simplifies the code a bit.
>
> CC: stable@vger.kernel.org
> Fixes: 82939d7999df ("ext4: convert to mbcache2")
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/inode.c |  2 ++
>  fs/ext4/xattr.c | 24 ++++++++----------------
>  fs/ext4/xattr.h |  1 +
>  3 files changed, 11 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3dce7d058985..7450ee734262 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -177,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
>
>  	trace_ext4_evict_inode(inode);
>
> +	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
> +		ext4_evict_ea_inode(inode);
>  	if (inode->i_nlink) {
>  		/*
>  		 * When journalling data dirty buffers are tracked only in the
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 042325349098..7fc40fb1e6b3 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -436,6 +436,14 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
>  	return err;
>  }
>
> +/* Remove entry from mbcache when EA inode is getting evicted */
> +void ext4_evict_ea_inode(struct inode *inode)
> +{
> +	if (EA_INODE_CACHE(inode))
> +		mb_cache_entry_delete(EA_INODE_CACHE(inode),
> +			ext4_xattr_inode_get_hash(inode), inode->i_ino);
> +}
> +
>  static int
>  ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
>  			       struct ext4_xattr_entry *entry, void *buffer,
> @@ -976,10 +984,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
>  static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
>  				       int ref_change)
>  {
> -	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
>  	struct ext4_iloc iloc;
>  	s64 ref_count;
> -	u32 hash;
>  	int ret;
>
>  	inode_lock(ea_inode);
> @@ -1002,14 +1008,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
>
>  			set_nlink(ea_inode, 1);
>  			ext4_orphan_del(handle, ea_inode);
> -
> -			if (ea_inode_cache) {
> -				hash = ext4_xattr_inode_get_hash(ea_inode);
> -				mb_cache_entry_create(ea_inode_cache,
> -						      GFP_NOFS, hash,
> -						      ea_inode->i_ino,
> -						      true /* reusable */);
> -			}

Ok, so if I understand this correctly, since we are not immediately removing
the ea_inode_cache entry when the refcount reaches 0, then when the refcount
goes from 0 back to 1, we need not create an mb_cache entry, right?
Is this because the entry never got deleted in the first place?

But what happens when the entry is created the very first time?
I might need to study the xattr code to understand how this condition is
taken care of.

-ritesh


>  		}
>  	} else {
>  		WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
> @@ -1022,12 +1020,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
>
>  			clear_nlink(ea_inode);
>  			ext4_orphan_add(handle, ea_inode);
> -
> -			if (ea_inode_cache) {
> -				hash = ext4_xattr_inode_get_hash(ea_inode);
> -				mb_cache_entry_delete(ea_inode_cache, hash,
> -						      ea_inode->i_ino);
> -			}
>  		}
>  	}
>
> diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
> index 77efb9a627ad..060b7a2f485c 100644
> --- a/fs/ext4/xattr.h
> +++ b/fs/ext4/xattr.h
> @@ -178,6 +178,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
>
>  extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
>  			    struct ext4_inode *raw_inode, handle_t *handle);
> +extern void ext4_evict_ea_inode(struct inode *inode);
>
>  extern const struct xattr_handler *ext4_xattr_handlers[];
>
> --
> 2.35.3
>


* Re: [PATCH 01/10] mbcache: Don't reclaim used entries
  2022-06-16 14:22   ` Ritesh Harjani
@ 2022-06-16 17:25     ` Jan Kara
  0 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2022-06-16 17:25 UTC (permalink / raw)
  To: Ritesh Harjani; +Cc: Jan Kara, Ted Tso, linux-ext4, stable

On Thu 16-06-22 19:52:12, Ritesh Harjani wrote:
> > CC: stable@vger.kernel.org
> > Fixes: 82939d7999df ("ext4: convert to mbcache2")
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/mbcache.c | 10 +++++++++-
> >  1 file changed, 9 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/mbcache.c b/fs/mbcache.c
> > index 97c54d3a2227..cfc28129fb6f 100644
> > --- a/fs/mbcache.c
> > +++ b/fs/mbcache.c
> > @@ -288,7 +288,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
> >  	while (nr_to_scan-- && !list_empty(&cache->c_list)) {
> >  		entry = list_first_entry(&cache->c_list,
> >  					 struct mb_cache_entry, e_list);
> > -		if (entry->e_referenced) {
> > +		if (entry->e_referenced || atomic_read(&entry->e_refcnt) > 2) {
> 
> Sure, yes, the above "||" condition looks good.
> i.e. if the refcnt is above 2, then we should move the entry to the tail of the LRU.
> Because that means that there is another user of this entry which might have
> incremented the refcnt.
> 
> >  			entry->e_referenced = 0;
> >  			list_move_tail(&entry->e_list, &cache->c_list);
> >  			continue;
> > @@ -302,6 +302,14 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
> >  		spin_unlock(&cache->c_list_lock);
> >  		head = mb_cache_entry_head(cache, entry->e_key);
> >  		hlist_bl_lock(head);
> > +		/* Now a reliable check if the entry didn't get used... */
> 
> But not sure why this is more reliable? Anytime we add or remove the entry,
> we first always do the list operation and then increment or decrement the
> refcnt "atomically".
> 
> So could you please help in understanding why this will be more reliable?

It is reliable in the sense that while we hold hlist_bl_lock() no new
references can be acquired (they are acquired only through the hash table
lookup) and so here we can "atomically" do "check the entry is unused and
remove it from the hash".
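
To illustrate with a sketch (roughly what the lookup and shrinker sides do,
not the exact code): new references are only handed out under the same hash
bucket lock, so once the shrinker holds that lock the refcount it reads
cannot grow behind its back:

	/* lookup side, e.g. __entry_find() or mb_cache_entry_get() */
	hlist_bl_lock(head);
	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
		if (entry->e_key == key) {
			atomic_inc(&entry->e_refcnt);	/* new user appears only here */
			break;
		}
	}
	hlist_bl_unlock(head);

	/*
	 * Shrinker side (the hunk above) holds the same lock, so checking
	 * the refcount and unhashing the entry happen atomically with
	 * respect to new lookups.
	 */
	hlist_bl_lock(head);
	if (atomic_read(&entry->e_refcnt) > 2) {
		/* Entry got used meanwhile - put it back on the LRU. */
	} else if (!hlist_bl_unhashed(&entry->e_hash_list)) {
		hlist_bl_del_init(&entry->e_hash_list);
		atomic_dec(&entry->e_refcnt);
	}
	hlist_bl_unlock(head);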

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH 02/10] mbcache: Add functions to delete entry if unused
  2022-06-16 14:47   ` Ritesh Harjani
@ 2022-06-16 17:28     ` Jan Kara
  0 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2022-06-16 17:28 UTC (permalink / raw)
  To: Ritesh Harjani; +Cc: Jan Kara, Ted Tso, linux-ext4, stable

On Thu 16-06-22 20:17:22, Ritesh Harjani wrote:
> On 22/06/14 06:05PM, Jan Kara wrote:
> > Add function mb_cache_entry_try_delete() to delete mbcache entry if it
> > is unused and also add a function to wait for entry to become unused -
> > mb_cache_entry_wait_unused(). We do not share code between the two
> > deleting functions as one of them will go away soon.
> >
> > CC: stable@vger.kernel.org
> > Fixes: 82939d7999df ("ext4: convert to mbcache2")
> > Signed-off-by: Jan Kara <jack@suse.cz>

...

> > +/* mb_cache_entry_try_delete - try to remove a cache entry
> > + * @cache - cache we work with
> > + * @key - key
> > + * @value - value
> > + *
> > + * Remove entry from cache @cache with key @key and value @value. The removal
> > + * happens only if the entry is unused. The function returns NULL in case the
> > + * entry was successfully removed or there's no entry in cache. Otherwise the
> > + * function returns the entry that we failed to delete because it has users.
> 
> "...Also increment its refcnt in case the entry is returned. So the
> caller is responsible for calling mb_cache_entry_put() later."

Definitely, I'll expand the comment.

> Do you think such a comment should be added too? And should the API naming be
> mb_cache_entry_try_delete_or_get()?
> 
> Looks like the e_refcnt increment is done quietly in case the entry is found,
> whereas the API just says mb_cache_entry_try_delete().

It's a bit of a long name, but I agree it describes the function better. OK,
let's rename.
 
> Other than that, all other code logic looks right to me.

Thanks for review!

								Honza


> > + */
> > +struct mb_cache_entry *mb_cache_entry_try_delete(struct mb_cache *cache,
> > +						 u32 key, u64 value)
> > +{
> > +	struct hlist_bl_node *node;
> > +	struct hlist_bl_head *head;
> > +	struct mb_cache_entry *entry;
> > +
> > +	head = mb_cache_entry_head(cache, key);
> > +	hlist_bl_lock(head);
> > +	hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
> > +		if (entry->e_key == key && entry->e_value == value) {
> > +			if (atomic_read(&entry->e_refcnt) > 2) {
> > +				atomic_inc(&entry->e_refcnt);
> > +				hlist_bl_unlock(head);
> > +				return entry;
> > +			}
> > +			/* We keep hash list reference to keep entry alive */
> > +			hlist_bl_del_init(&entry->e_hash_list);
> > +			hlist_bl_unlock(head);
> > +			spin_lock(&cache->c_list_lock);
> > +			if (!list_empty(&entry->e_list)) {
> > +				list_del_init(&entry->e_list);
> > +				if (!WARN_ONCE(cache->c_entry_count == 0,
> > +		"mbcache: attempt to decrement c_entry_count past zero"))
> > +					cache->c_entry_count--;
> > +				atomic_dec(&entry->e_refcnt);
> > +			}
> > +			spin_unlock(&cache->c_list_lock);
> > +			mb_cache_entry_put(cache, entry);
> > +			return NULL;
> > +		}
> > +	}
> > +	hlist_bl_unlock(head);
> > +
> > +	return NULL;
> > +}
> > +EXPORT_SYMBOL(mb_cache_entry_try_delete);
> > +
> >  /* mb_cache_entry_touch - cache entry got used
> >   * @cache - cache the entry belongs to
> >   * @entry - entry that got used
> > diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
> > index 20f1e3ff6013..1176fdfb8d53 100644
> > --- a/include/linux/mbcache.h
> > +++ b/include/linux/mbcache.h
> > @@ -30,15 +30,23 @@ void mb_cache_destroy(struct mb_cache *cache);
> >  int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
> >  			  u64 value, bool reusable);
> >  void __mb_cache_entry_free(struct mb_cache_entry *entry);
> > +void mb_cache_entry_wait_unused(struct mb_cache_entry *entry);
> >  static inline int mb_cache_entry_put(struct mb_cache *cache,
> >  				     struct mb_cache_entry *entry)
> >  {
> > -	if (!atomic_dec_and_test(&entry->e_refcnt))
> > +	unsigned int cnt = atomic_dec_return(&entry->e_refcnt);
> > +
> > +	if (cnt > 0) {
> > +		if (cnt <= 3)
> > +			wake_up_var(&entry->e_refcnt);
> >  		return 0;
> > +	}
> >  	__mb_cache_entry_free(entry);
> >  	return 1;
> >  }
> >
> > +struct mb_cache_entry *mb_cache_entry_try_delete(struct mb_cache *cache,
> > +						 u32 key, u64 value);
> >  void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value);
> >  struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
> >  					  u64 value);
> > --
> > 2.35.3
> >
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction
  2022-06-16 15:01   ` Ritesh Harjani
@ 2022-06-16 17:30     ` Jan Kara
  0 siblings, 0 replies; 13+ messages in thread
From: Jan Kara @ 2022-06-16 17:30 UTC (permalink / raw)
  To: Ritesh Harjani; +Cc: Jan Kara, Ted Tso, linux-ext4, stable

On Thu 16-06-22 20:31:18, Ritesh Harjani wrote:
> On 22/06/14 06:05PM, Jan Kara wrote:
> > Currently we remove EA inode from mbcache as soon as its xattr refcount
> > drops to zero. However there can be pending attempts to reuse the inode
> > and thus refcount handling code has to handle the situation when
> > refcount increases from zero anyway. So save some work and just keep EA
> > inode in mbcache until it is getting evicted. At that moment we are sure
> > following iget() of EA inode will fail anyway (or wait for eviction to
> > finish and load things from the disk again) and so removing mbcache
> > entry at that moment is fine and simplifies the code a bit.
> >
> > CC: stable@vger.kernel.org
> > Fixes: 82939d7999df ("ext4: convert to mbcache2")
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/ext4/inode.c |  2 ++
> >  fs/ext4/xattr.c | 24 ++++++++----------------
> >  fs/ext4/xattr.h |  1 +
> >  3 files changed, 11 insertions(+), 16 deletions(-)
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 3dce7d058985..7450ee734262 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -177,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
> >
> >  	trace_ext4_evict_inode(inode);
> >
> > +	if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
> > +		ext4_evict_ea_inode(inode);
> >  	if (inode->i_nlink) {
> >  		/*
> >  		 * When journalling data dirty buffers are tracked only in the
> > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> > index 042325349098..7fc40fb1e6b3 100644
> > --- a/fs/ext4/xattr.c
> > +++ b/fs/ext4/xattr.c
> > @@ -436,6 +436,14 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
> >  	return err;
> >  }
> >
> > +/* Remove entry from mbcache when EA inode is getting evicted */
> > +void ext4_evict_ea_inode(struct inode *inode)
> > +{
> > +	if (EA_INODE_CACHE(inode))
> > +		mb_cache_entry_delete(EA_INODE_CACHE(inode),
> > +			ext4_xattr_inode_get_hash(inode), inode->i_ino);
> > +}
> > +
> >  static int
> >  ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
> >  			       struct ext4_xattr_entry *entry, void *buffer,
> > @@ -976,10 +984,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
> >  static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> >  				       int ref_change)
> >  {
> > -	struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
> >  	struct ext4_iloc iloc;
> >  	s64 ref_count;
> > -	u32 hash;
> >  	int ret;
> >
> >  	inode_lock(ea_inode);
> > @@ -1002,14 +1008,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
> >
> >  			set_nlink(ea_inode, 1);
> >  			ext4_orphan_del(handle, ea_inode);
> > -
> > -			if (ea_inode_cache) {
> > -				hash = ext4_xattr_inode_get_hash(ea_inode);
> > -				mb_cache_entry_create(ea_inode_cache,
> > -						      GFP_NOFS, hash,
> > -						      ea_inode->i_ino,
> > -						      true /* reusable */);
> > -			}
> 
> Ok, so if I understand this correctly, since we are not immediately removing
> the ea_inode_cache entry when the refcount reaches 0, then when the refcount
> goes from 0 back to 1, we need not create an mb_cache entry, right?
> Is this because the entry never got deleted in the first place?

Correct.

> But what happens when the entry is created the very first time?
> I might need to study the xattr code to understand how this condition is
> taken care of.

There are other places that take care of creating the entry in that case.
E.g. ext4_xattr_inode_get() or ext4_xattr_inode_lookup_create().

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set()
  2022-07-12 10:54 ` [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set() Jan Kara
@ 2022-07-14 12:19   ` Ritesh Harjani
  0 siblings, 0 replies; 13+ messages in thread
From: Ritesh Harjani @ 2022-07-14 12:19 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ted Tso, linux-ext4, stable

On 22/07/12 12:54PM, Jan Kara wrote:
> Remove unnecessary else (and thus indentation level) from a code block
> in ext4_xattr_block_set(). It will also make following code changes
> easier. No functional changes.

The patch looks good to me. Just a note: while applying this patch on the
ext4-dev tree, I found a minor conflict with the patch below.

ext4: use kmemdup() to replace kmalloc + memcpy

Replace kmalloc + memcpy with kmemdup()

-ritesh

>
> CC: stable@vger.kernel.org
> Fixes: 82939d7999df ("ext4: convert to mbcache2")
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/xattr.c | 79 ++++++++++++++++++++++++-------------------------
>  1 file changed, 39 insertions(+), 40 deletions(-)
>
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 7fc40fb1e6b3..aadfae53d055 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1850,6 +1850,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  #define header(x) ((struct ext4_xattr_header *)(x))
>
>  	if (s->base) {
> +		int offset = (char *)s->here - bs->bh->b_data;
> +
>  		BUFFER_TRACE(bs->bh, "get_write_access");
>  		error = ext4_journal_get_write_access(handle, sb, bs->bh,
>  						      EXT4_JTR_NONE);
> @@ -1882,50 +1884,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
>  			if (error)
>  				goto cleanup;
>  			goto inserted;
> -		} else {
> -			int offset = (char *)s->here - bs->bh->b_data;
> +		}
> +		unlock_buffer(bs->bh);
> +		ea_bdebug(bs->bh, "cloning");
> +		s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
> +		error = -ENOMEM;
> +		if (s->base == NULL)
> +			goto cleanup;
> +		memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
> +		s->first = ENTRY(header(s->base)+1);
> +		header(s->base)->h_refcount = cpu_to_le32(1);
> +		s->here = ENTRY(s->base + offset);
> +		s->end = s->base + bs->bh->b_size;
>
> -			unlock_buffer(bs->bh);
> -			ea_bdebug(bs->bh, "cloning");
> -			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
> -			error = -ENOMEM;
> -			if (s->base == NULL)
> +		/*
> +		 * If existing entry points to an xattr inode, we need
> +		 * to prevent ext4_xattr_set_entry() from decrementing
> +		 * ref count on it because the reference belongs to the
> +		 * original block. In this case, make the entry look
> +		 * like it has an empty value.
> +		 */
> +		if (!s->not_found && s->here->e_value_inum) {
> +			ea_ino = le32_to_cpu(s->here->e_value_inum);
> +			error = ext4_xattr_inode_iget(inode, ea_ino,
> +				      le32_to_cpu(s->here->e_hash),
> +				      &tmp_inode);
> +			if (error)
>  				goto cleanup;
> -			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
> -			s->first = ENTRY(header(s->base)+1);
> -			header(s->base)->h_refcount = cpu_to_le32(1);
> -			s->here = ENTRY(s->base + offset);
> -			s->end = s->base + bs->bh->b_size;
>
> -			/*
> -			 * If existing entry points to an xattr inode, we need
> -			 * to prevent ext4_xattr_set_entry() from decrementing
> -			 * ref count on it because the reference belongs to the
> -			 * original block. In this case, make the entry look
> -			 * like it has an empty value.
> -			 */
> -			if (!s->not_found && s->here->e_value_inum) {
> -				ea_ino = le32_to_cpu(s->here->e_value_inum);
> -				error = ext4_xattr_inode_iget(inode, ea_ino,
> -					      le32_to_cpu(s->here->e_hash),
> -					      &tmp_inode);
> -				if (error)
> -					goto cleanup;
> -
> -				if (!ext4_test_inode_state(tmp_inode,
> -						EXT4_STATE_LUSTRE_EA_INODE)) {
> -					/*
> -					 * Defer quota free call for previous
> -					 * inode until success is guaranteed.
> -					 */
> -					old_ea_inode_quota = le32_to_cpu(
> -							s->here->e_value_size);
> -				}
> -				iput(tmp_inode);
> -
> -				s->here->e_value_inum = 0;
> -				s->here->e_value_size = 0;
> +			if (!ext4_test_inode_state(tmp_inode,
> +					EXT4_STATE_LUSTRE_EA_INODE)) {
> +				/*
> +				 * Defer quota free call for previous
> +				 * inode until success is guaranteed.
> +				 */
> +				old_ea_inode_quota = le32_to_cpu(
> +						s->here->e_value_size);
>  			}
> +			iput(tmp_inode);
> +
> +			s->here->e_value_inum = 0;
> +			s->here->e_value_size = 0;
>  		}
>  	} else {
>  		/* Allocate a buffer where we construct the new block. */
> --
> 2.35.3
>


* [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set()
       [not found] <20220712104519.29887-1-jack@suse.cz>
@ 2022-07-12 10:54 ` Jan Kara
  2022-07-14 12:19   ` Ritesh Harjani
  0 siblings, 1 reply; 13+ messages in thread
From: Jan Kara @ 2022-07-12 10:54 UTC (permalink / raw)
  To: Ted Tso; +Cc: linux-ext4, Ritesh Harjani, Jan Kara, stable

Remove an unnecessary else (and thus an indentation level) from a code block
in ext4_xattr_block_set(). It will also make the following code changes
easier. No functional changes.

CC: stable@vger.kernel.org
Fixes: 82939d7999df ("ext4: convert to mbcache2")
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/xattr.c | 79 ++++++++++++++++++++++++-------------------------
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7fc40fb1e6b3..aadfae53d055 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1850,6 +1850,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 #define header(x) ((struct ext4_xattr_header *)(x))
 
 	if (s->base) {
+		int offset = (char *)s->here - bs->bh->b_data;
+
 		BUFFER_TRACE(bs->bh, "get_write_access");
 		error = ext4_journal_get_write_access(handle, sb, bs->bh,
 						      EXT4_JTR_NONE);
@@ -1882,50 +1884,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (error)
 				goto cleanup;
 			goto inserted;
-		} else {
-			int offset = (char *)s->here - bs->bh->b_data;
+		}
+		unlock_buffer(bs->bh);
+		ea_bdebug(bs->bh, "cloning");
+		s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+		error = -ENOMEM;
+		if (s->base == NULL)
+			goto cleanup;
+		memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+		s->first = ENTRY(header(s->base)+1);
+		header(s->base)->h_refcount = cpu_to_le32(1);
+		s->here = ENTRY(s->base + offset);
+		s->end = s->base + bs->bh->b_size;
 
-			unlock_buffer(bs->bh);
-			ea_bdebug(bs->bh, "cloning");
-			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
-			error = -ENOMEM;
-			if (s->base == NULL)
+		/*
+		 * If existing entry points to an xattr inode, we need
+		 * to prevent ext4_xattr_set_entry() from decrementing
+		 * ref count on it because the reference belongs to the
+		 * original block. In this case, make the entry look
+		 * like it has an empty value.
+		 */
+		if (!s->not_found && s->here->e_value_inum) {
+			ea_ino = le32_to_cpu(s->here->e_value_inum);
+			error = ext4_xattr_inode_iget(inode, ea_ino,
+				      le32_to_cpu(s->here->e_hash),
+				      &tmp_inode);
+			if (error)
 				goto cleanup;
-			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
-			s->first = ENTRY(header(s->base)+1);
-			header(s->base)->h_refcount = cpu_to_le32(1);
-			s->here = ENTRY(s->base + offset);
-			s->end = s->base + bs->bh->b_size;
 
-			/*
-			 * If existing entry points to an xattr inode, we need
-			 * to prevent ext4_xattr_set_entry() from decrementing
-			 * ref count on it because the reference belongs to the
-			 * original block. In this case, make the entry look
-			 * like it has an empty value.
-			 */
-			if (!s->not_found && s->here->e_value_inum) {
-				ea_ino = le32_to_cpu(s->here->e_value_inum);
-				error = ext4_xattr_inode_iget(inode, ea_ino,
-					      le32_to_cpu(s->here->e_hash),
-					      &tmp_inode);
-				if (error)
-					goto cleanup;
-
-				if (!ext4_test_inode_state(tmp_inode,
-						EXT4_STATE_LUSTRE_EA_INODE)) {
-					/*
-					 * Defer quota free call for previous
-					 * inode until success is guaranteed.
-					 */
-					old_ea_inode_quota = le32_to_cpu(
-							s->here->e_value_size);
-				}
-				iput(tmp_inode);
-
-				s->here->e_value_inum = 0;
-				s->here->e_value_size = 0;
+			if (!ext4_test_inode_state(tmp_inode,
+					EXT4_STATE_LUSTRE_EA_INODE)) {
+				/*
+				 * Defer quota free call for previous
+				 * inode until success is guaranteed.
+				 */
+				old_ea_inode_quota = le32_to_cpu(
+						s->here->e_value_size);
 			}
+			iput(tmp_inode);
+
+			s->here->e_value_inum = 0;
+			s->here->e_value_size = 0;
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
-- 
2.35.3



Thread overview: 13+ messages
     [not found] <20220614124146.21594-1-jack@suse.cz>
2022-06-14 16:05 ` [PATCH 01/10] mbcache: Don't reclaim used entries Jan Kara
2022-06-16 14:22   ` Ritesh Harjani
2022-06-16 17:25     ` Jan Kara
2022-06-14 16:05 ` [PATCH 02/10] mbcache: Add functions to delete entry if unused Jan Kara
2022-06-16 14:47   ` Ritesh Harjani
2022-06-16 17:28     ` Jan Kara
2022-06-14 16:05 ` [PATCH 03/10] ext4: Remove EA inode entry from mbcache on inode eviction Jan Kara
2022-06-16 15:01   ` Ritesh Harjani
2022-06-16 17:30     ` Jan Kara
2022-06-14 16:05 ` [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set() Jan Kara
2022-06-14 16:05 ` [PATCH 05/10] ext4: Fix race when reusing xattr blocks Jan Kara
     [not found] <20220712104519.29887-1-jack@suse.cz>
2022-07-12 10:54 ` [PATCH 04/10] ext4: Unindent codeblock in ext4_xattr_block_set() Jan Kara
2022-07-14 12:19   ` Ritesh Harjani
